pseries: SLOF PCI flag day
authorBenjamin Herrenschmidt <benh@kernel.crashing.org>
Wed, 11 Jan 2012 19:46:28 +0000 (19:46 +0000)
committerAlexander Graf <agraf@suse.de>
Sat, 21 Jan 2012 04:17:02 +0000 (05:17 +0100)
Currently on the pseries machine the SLOF firmware is used normally,
but we bypass it when -kernel is specified.  Having these two

different boot paths can cause some confusion.

In particular at present we need to "probe" the (emulated) PCI bus and
produce device tree nodes for the PCI devices in qemu, for the -kernel
case.  In the SLOF case, it takes the device tree from qemu adds some
stuff to it then passes it on to the kernel.

It's been decided that a better approach is to always boot through
SLOF, even when using -kernel.  WIth this approach we can leave PCI
probing and device node creation to SLOF in all cases which removes a
bunch of code in qemu, and avoids iterating the PCI devices from the
machine specific init code which we're not supposed to do.

This patch changes qemu to always boot through SLOF, and not to create
PCI nodes.  Simultaneously it updates the included version of SLOF
(submodule and binary image) to one which supports (and requires) the
new approach.

The new SLOF version also includes a number of unrelated enhancements:
support for booting from virtio-pci devices and e1000, greatly
improved FCode support and many bugfixes.  It also makes SLOF ready to
be used even when specifying a kernel on the qemu command line.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexander Graf <agraf@suse.de>
hw/spapr.c
hw/spapr_pci.c
pc-bios/README
pc-bios/slof.bin
roms/SLOF

index 0e1f80dfdc26afb08b6d7904cbc01869fedae7a3..b01137181337e99cd0c6d0b380494bf28bc9da91 100644 (file)
 
 #include <libfdt.h>
 
-#define KERNEL_LOAD_ADDR        0x00000000
-#define INITRD_LOAD_ADDR        0x02800000
+/* SLOF memory layout:
+ *
+ * SLOF raw image loaded at 0, copies its romfs right below the flat
+ * device-tree, then position SLOF itself 31M below that
+ *
+ * So we set FW_OVERHEAD to 40MB which should account for all of that
+ * and more
+ *
+ * We load our kernel at 4M, leaving space for SLOF initial image
+ */
 #define FDT_MAX_SIZE            0x10000
 #define RTAS_MAX_SIZE           0x10000
 #define FW_MAX_SIZE             0x400000
 #define FW_FILE_NAME            "slof.bin"
+#define FW_OVERHEAD             0x2800000
+#define KERNEL_LOAD_ADDR        FW_MAX_SIZE
 
-#define MIN_RMA_SLOF           128UL
+#define MIN_RMA_SLOF            128UL
 
 #define TIMEBASE_FREQ           512000000ULL
 
 #define MAX_CPUS                256
-#define XICS_IRQS              1024
+#define XICS_IRQS               1024
 
 #define SPAPR_PCI_BUID          0x800000020000001ULL
 #define SPAPR_PCI_MEM_WIN_ADDR  (0x10000000000ULL + 0xA0000000)
@@ -139,6 +149,7 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
                                    target_phys_addr_t rma_size,
                                    target_phys_addr_t initrd_base,
                                    target_phys_addr_t initrd_size,
+                                   target_phys_addr_t kernel_size,
                                    const char *boot_device,
                                    const char *kernel_cmdline,
                                    long hash_shift)
@@ -176,6 +187,12 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
     fdt = g_malloc0(FDT_MAX_SIZE);
     _FDT((fdt_create(fdt, FDT_MAX_SIZE)));
 
+    if (kernel_size) {
+        _FDT((fdt_add_reservemap_entry(fdt, KERNEL_LOAD_ADDR, kernel_size)));
+    }
+    if (initrd_size) {
+        _FDT((fdt_add_reservemap_entry(fdt, initrd_base, initrd_size)));
+    }
     _FDT((fdt_finish_reservemap(fdt)));
 
     /* Root node */
@@ -197,15 +214,13 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
                        &start_prop, sizeof(start_prop))));
     _FDT((fdt_property(fdt, "linux,initrd-end",
                        &end_prop, sizeof(end_prop))));
-    _FDT((fdt_property_string(fdt, "qemu,boot-device", boot_device)));
+    if (kernel_size) {
+        uint64_t kprop[2] = { cpu_to_be64(KERNEL_LOAD_ADDR),
+                              cpu_to_be64(kernel_size) };
 
-    /*
-     * Because we don't always invoke any firmware, we can't rely on
-     * that to do BAR allocation.  Long term, we should probably do
-     * that ourselves, but for now, this setting (plus advertising the
-     * current BARs as 0) causes sufficiently recent kernels to to the
-     * BAR assignment themselves */
-    _FDT((fdt_property_cell(fdt, "linux,pci-probe-only", 0)));
+        _FDT((fdt_property(fdt, "qemu,boot-kernel", &kprop, sizeof(kprop))));
+    }
+    _FDT((fdt_property_string(fdt, "qemu,boot-device", boot_device)));
 
     _FDT((fdt_end_node(fdt)));
 
@@ -445,6 +460,12 @@ static void spapr_finalize_fdt(sPAPREnvironment *spapr,
 
     _FDT((fdt_pack(fdt)));
 
+    if (fdt_totalsize(fdt) > FDT_MAX_SIZE) {
+        hw_error("FDT too big ! 0x%x bytes (max is 0x%x)\n",
+                 fdt_totalsize(fdt), FDT_MAX_SIZE);
+        exit(1);
+    }
+
     cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
 
     g_free(fdt);
@@ -494,8 +515,9 @@ static void ppc_spapr_init(ram_addr_t ram_size,
     MemoryRegion *sysmem = get_system_memory();
     MemoryRegion *ram = g_new(MemoryRegion, 1);
     target_phys_addr_t rma_alloc_size, rma_size;
-    uint32_t initrd_base;
-    long kernel_size, initrd_size, fw_size;
+    uint32_t initrd_base = 0;
+    long kernel_size = 0, initrd_size = 0;
+    long load_limit, rtas_limit, fw_size;
     long pteg_shift = 17;
     char *filename;
 
@@ -517,11 +539,13 @@ static void ppc_spapr_init(ram_addr_t ram_size,
         rma_size = ram_size;
     }
 
-    /* We place the device tree just below either the top of the RMA,
+    /* We place the device tree and RTAS just below either the top of the RMA,
      * or just below 2GB, whichever is lowere, so that it can be
      * processed with 32-bit real mode code if necessary */
-    spapr->fdt_addr = MIN(rma_size, 0x80000000) - FDT_MAX_SIZE;
-    spapr->rtas_addr = spapr->fdt_addr - RTAS_MAX_SIZE;
+    rtas_limit = MIN(rma_size, 0x80000000);
+    spapr->rtas_addr = rtas_limit - RTAS_MAX_SIZE;
+    spapr->fdt_addr = spapr->rtas_addr - FDT_MAX_SIZE;
+    load_limit = spapr->fdt_addr - FW_OVERHEAD;
 
     /* init CPUs */
     if (cpu_model == NULL) {
@@ -577,13 +601,19 @@ static void ppc_spapr_init(ram_addr_t ram_size,
 
     filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, "spapr-rtas.bin");
     spapr->rtas_size = load_image_targphys(filename, spapr->rtas_addr,
-                                           ram_size - spapr->rtas_addr);
+                                           rtas_limit - spapr->rtas_addr);
     if (spapr->rtas_size < 0) {
         hw_error("qemu: could not load LPAR rtas '%s'\n", filename);
         exit(1);
     }
+    if (spapr->rtas_size > RTAS_MAX_SIZE) {
+        hw_error("RTAS too big ! 0x%lx bytes (max is 0x%x)\n",
+                 spapr->rtas_size, RTAS_MAX_SIZE);
+        exit(1);
+    }
     g_free(filename);
 
+
     /* Set up Interrupt Controller */
     spapr->icp = xics_system_init(XICS_IRQS);
     spapr->next_irq = 16;
@@ -622,6 +652,20 @@ static void ppc_spapr_init(ram_addr_t ram_size,
         spapr_vscsi_create(spapr->vio_bus, 0x2000 + i);
     }
 
+    if (rma_size < (MIN_RMA_SLOF << 20)) {
+        fprintf(stderr, "qemu: pSeries SLOF firmware requires >= "
+                "%ldM guest RMA (Real Mode Area memory)\n", MIN_RMA_SLOF);
+        exit(1);
+    }
+
+    fprintf(stderr, "sPAPR memory map:\n");
+    fprintf(stderr, "RTAS                 : 0x%08lx..%08lx\n",
+            (unsigned long)spapr->rtas_addr,
+            (unsigned long)(spapr->rtas_addr + spapr->rtas_size - 1));
+    fprintf(stderr, "FDT                  : 0x%08lx..%08lx\n",
+            (unsigned long)spapr->fdt_addr,
+            (unsigned long)(spapr->fdt_addr + FDT_MAX_SIZE - 1));
+
     if (kernel_filename) {
         uint64_t lowaddr = 0;
 
@@ -630,57 +674,60 @@ static void ppc_spapr_init(ram_addr_t ram_size,
         if (kernel_size < 0) {
             kernel_size = load_image_targphys(kernel_filename,
                                               KERNEL_LOAD_ADDR,
-                                              ram_size - KERNEL_LOAD_ADDR);
+                                              load_limit - KERNEL_LOAD_ADDR);
         }
         if (kernel_size < 0) {
             fprintf(stderr, "qemu: could not load kernel '%s'\n",
                     kernel_filename);
             exit(1);
         }
+        fprintf(stderr, "Kernel               : 0x%08x..%08lx\n",
+                KERNEL_LOAD_ADDR, KERNEL_LOAD_ADDR + kernel_size - 1);
 
         /* load initrd */
         if (initrd_filename) {
-            initrd_base = INITRD_LOAD_ADDR;
+            /* Try to locate the initrd in the gap between the kernel
+             * and the firmware. Add a bit of space just in case
+             */
+            initrd_base = (KERNEL_LOAD_ADDR + kernel_size + 0x1ffff) & ~0xffff;
             initrd_size = load_image_targphys(initrd_filename, initrd_base,
-                                              ram_size - initrd_base);
+                                              load_limit - initrd_base);
             if (initrd_size < 0) {
                 fprintf(stderr, "qemu: could not load initial ram disk '%s'\n",
                         initrd_filename);
                 exit(1);
             }
+            fprintf(stderr, "Ramdisk              : 0x%08lx..%08lx\n",
+                    (long)initrd_base, (long)(initrd_base + initrd_size - 1));
         } else {
             initrd_base = 0;
             initrd_size = 0;
         }
+    }
 
-        spapr->entry_point = KERNEL_LOAD_ADDR;
-    } else {
-        if (rma_size < (MIN_RMA_SLOF << 20)) {
-            fprintf(stderr, "qemu: pSeries SLOF firmware requires >= "
-                    "%ldM guest RMA (Real Mode Area memory)\n", MIN_RMA_SLOF);
-            exit(1);
-        }
-        filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, FW_FILE_NAME);
-        fw_size = load_image_targphys(filename, 0, FW_MAX_SIZE);
-        if (fw_size < 0) {
-            hw_error("qemu: could not load LPAR rtas '%s'\n", filename);
-            exit(1);
-        }
-        g_free(filename);
-        spapr->entry_point = 0x100;
-        initrd_base = 0;
-        initrd_size = 0;
-
-        /* SLOF will startup the secondary CPUs using RTAS,
-           rather than expecting a kexec() style entry */
-        for (env = first_cpu; env != NULL; env = env->next_cpu) {
-            env->halted = 1;
-        }
+    filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, FW_FILE_NAME);
+    fw_size = load_image_targphys(filename, 0, FW_MAX_SIZE);
+    if (fw_size < 0) {
+        hw_error("qemu: could not load LPAR rtas '%s'\n", filename);
+        exit(1);
+    }
+    g_free(filename);
+    fprintf(stderr, "Firmware load        : 0x%08x..%08lx\n",
+            0, fw_size);
+    fprintf(stderr, "Firmware runtime     : 0x%08lx..%08lx\n",
+            load_limit, (unsigned long)spapr->fdt_addr);
+
+    spapr->entry_point = 0x100;
+
+    /* SLOF will startup the secondary CPUs using RTAS */
+    for (env = first_cpu; env != NULL; env = env->next_cpu) {
+        env->halted = 1;
     }
 
     /* Prepare the device tree */
     spapr->fdt_skel = spapr_create_fdt_skel(cpu_model, rma_size,
                                             initrd_base, initrd_size,
+                                            kernel_size,
                                             boot_device, kernel_cmdline,
                                             pteg_shift + 7);
     assert(spapr->fdt_skel != NULL);
index f3f9246ee0f23a23d070f9931314340de12ab685..cf3762829288d0812eba948489b9575db1560693 100644 (file)
@@ -324,31 +324,13 @@ void spapr_create_phb(sPAPREnvironment *spapr,
 #define b_fff(x)        b_x((x), 8, 3)  /* function number */
 #define b_rrrrrrrr(x)   b_x((x), 0, 8)  /* register number */
 
-static uint32_t regtype_to_ss(uint8_t type)
-{
-    if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
-        return 3;
-    }
-    if (type == PCI_BASE_ADDRESS_SPACE_IO) {
-        return 1;
-    }
-    return 2;
-}
-
 int spapr_populate_pci_devices(sPAPRPHBState *phb,
                                uint32_t xics_phandle,
                                void *fdt)
 {
     PCIBus *bus = phb->host_state.bus;
-    int bus_off, node_off = 0, devid, fn, i, n, devices;
-    DeviceState *qdev;
+    int bus_off, i;
     char nodename[256];
-    struct {
-        uint32_t hi;
-        uint64_t addr;
-        uint64_t size;
-    } __attribute__((packed)) reg[PCI_NUM_REGIONS + 1],
-          assigned_addresses[PCI_NUM_REGIONS];
     uint32_t bus_range[] = { cpu_to_be32(0), cpu_to_be32(0xff) };
     struct {
         uint32_t hi;
@@ -369,7 +351,7 @@ int spapr_populate_pci_devices(sPAPRPHBState *phb,
     };
     uint64_t bus_reg[] = { cpu_to_be64(phb->buid), 0 };
     uint32_t interrupt_map_mask[] = {
-        cpu_to_be32(b_ddddd(-1)|b_fff(-1)), 0x0, 0x0, 0x0};
+        cpu_to_be32(b_ddddd(-1)|b_fff(0)), 0x0, 0x0, 0x0};
     uint32_t interrupt_map[bus->nirq][7];
 
     /* Start populating the FDT */
@@ -397,118 +379,26 @@ int spapr_populate_pci_devices(sPAPRPHBState *phb,
     _FDT(fdt_setprop(fdt, bus_off, "bus-range", &bus_range, sizeof(bus_range)));
     _FDT(fdt_setprop(fdt, bus_off, "ranges", &ranges, sizeof(ranges)));
     _FDT(fdt_setprop(fdt, bus_off, "reg", &bus_reg, sizeof(bus_reg)));
-    _FDT(fdt_setprop(fdt, bus_off, "interrupt-map-mask",
-                     &interrupt_map_mask, sizeof(interrupt_map_mask)));
     _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pci-config-space-type", 0x1));
 
-    /* Populate PCI devices and allocate IRQs */
-    devices = 0;
-    QTAILQ_FOREACH(qdev, &bus->qbus.children, sibling) {
-        PCIDevice *dev = DO_UPCAST(PCIDevice, qdev, qdev);
-        int irq_index = pci_spapr_map_irq(dev, 0);
-        uint32_t *irqmap = interrupt_map[devices];
-        uint8_t *config = dev->config;
-
-        devid = dev->devfn >> 3;
-        fn = dev->devfn & 7;
-
-        sprintf(nodename, "pci@%u,%u", devid, fn);
-
-        /* Allocate interrupt from the map */
-        if (devid > bus->nirq)  {
-            printf("Unexpected behaviour in spapr_populate_pci_devices,"
-                    "wrong devid %u\n", devid);
-            exit(-1);
-        }
-        irqmap[0] = cpu_to_be32(b_ddddd(devid)|b_fff(fn));
+    /* Build the interrupt-map, this must matches what is done
+     * in pci_spapr_map_irq
+     */
+    _FDT(fdt_setprop(fdt, bus_off, "interrupt-map-mask",
+                     &interrupt_map_mask, sizeof(interrupt_map_mask)));
+    for (i = 0; i < 7; i++) {
+        uint32_t *irqmap = interrupt_map[i];
+        irqmap[0] = cpu_to_be32(b_ddddd(i)|b_fff(0));
         irqmap[1] = 0;
         irqmap[2] = 0;
         irqmap[3] = 0;
         irqmap[4] = cpu_to_be32(xics_phandle);
-        irqmap[5] = cpu_to_be32(phb->lsi_table[irq_index].dt_irq);
+        irqmap[5] = cpu_to_be32(phb->lsi_table[i % SPAPR_PCI_NUM_LSI].dt_irq);
         irqmap[6] = cpu_to_be32(0x8);
-
-        /* Add node to FDT */
-        node_off = fdt_add_subnode(fdt, bus_off, nodename);
-        if (node_off < 0) {
-            return node_off;
-        }
-
-        _FDT(fdt_setprop_cell(fdt, node_off, "vendor-id",
-                              pci_get_word(&config[PCI_VENDOR_ID])));
-        _FDT(fdt_setprop_cell(fdt, node_off, "device-id",
-                              pci_get_word(&config[PCI_DEVICE_ID])));
-        _FDT(fdt_setprop_cell(fdt, node_off, "revision-id",
-                              pci_get_byte(&config[PCI_REVISION_ID])));
-        _FDT(fdt_setprop_cell(fdt, node_off, "class-code",
-                              pci_get_long(&config[PCI_CLASS_REVISION]) >> 8));
-        _FDT(fdt_setprop_cell(fdt, node_off, "subsystem-id",
-                              pci_get_word(&config[PCI_SUBSYSTEM_ID])));
-        _FDT(fdt_setprop_cell(fdt, node_off, "subsystem-vendor-id",
-                              pci_get_word(&config[PCI_SUBSYSTEM_VENDOR_ID])));
-
-        /* Config space region comes first */
-        reg[0].hi = cpu_to_be32(
-            b_n(0) |
-            b_p(0) |
-            b_t(0) |
-            b_ss(0/*config*/) |
-            b_bbbbbbbb(0) |
-            b_ddddd(devid) |
-            b_fff(fn));
-        reg[0].addr = 0;
-        reg[0].size = 0;
-
-        n = 0;
-        for (i = 0; i < ARRAY_SIZE(bars); ++i) {
-            if (0 == dev->io_regions[i].size) {
-                continue;
-            }
-
-            reg[n+1].hi = cpu_to_be32(
-                b_n(0) |
-                b_p(0) |
-                b_t(0) |
-                b_ss(regtype_to_ss(dev->io_regions[i].type)) |
-                b_bbbbbbbb(0) |
-                b_ddddd(devid) |
-                b_fff(fn) |
-                b_rrrrrrrr(bars[i]));
-            reg[n+1].addr = 0;
-            reg[n+1].size = cpu_to_be64(dev->io_regions[i].size);
-
-            assigned_addresses[n].hi = cpu_to_be32(
-                b_n(1) |
-                b_p(0) |
-                b_t(0) |
-                b_ss(regtype_to_ss(dev->io_regions[i].type)) |
-                b_bbbbbbbb(0) |
-                b_ddddd(devid) |
-                b_fff(fn) |
-                b_rrrrrrrr(bars[i]));
-
-            /*
-             * Writing zeroes to assigned_addresses causes the guest kernel to
-             * reassign BARs
-             */
-            assigned_addresses[n].addr = cpu_to_be64(dev->io_regions[i].addr);
-            assigned_addresses[n].size = reg[n+1].size;
-
-            ++n;
-        }
-        _FDT(fdt_setprop(fdt, node_off, "reg", reg, sizeof(reg[0])*(n+1)));
-        _FDT(fdt_setprop(fdt, node_off, "assigned-addresses",
-                         assigned_addresses,
-                         sizeof(assigned_addresses[0])*(n)));
-        _FDT(fdt_setprop_cell(fdt, node_off, "interrupts",
-                              pci_get_byte(&config[PCI_INTERRUPT_PIN])));
-
-        ++devices;
     }
-
     /* Write interrupt map */
     _FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map,
-                     devices * sizeof(interrupt_map[0])));
+                     7 * sizeof(interrupt_map[0])));
 
     return 0;
 }
index 1cebbbc89a29d291992e13bb85a29de94d286b63..5dce355f565955f84d086811cb5820e56722a8fe 100644 (file)
@@ -17,7 +17,7 @@
 - SLOF (Slimline Open Firmware) is a free IEEE 1275 Open Firmware
   implementation for certain IBM POWER hardware.  The sources are at
   https://github.com/dgibson/SLOF, and the image currently in qemu is
-  built from git tag qemu-slof-20111013.
+  built from git tag qemu-slof-20120111.1.
 
 - sgabios (the Serial Graphics Adapter option ROM) provides a means for
   legacy x86 software to communicate with an attached serial console as
index 1e84582cc695ff6e5973e045ef31ca951ca15c79..8554f54b5773f0771eec28f6c3bc0dc17e063a47 100644 (file)
Binary files a/pc-bios/slof.bin and b/pc-bios/slof.bin differ
index 32e3430c018ceb8413cb808477449d1968c42497..ab062ff3b37c39649f2b0d94ed607adc6f6b3c7d 160000 (submodule)
--- a/roms/SLOF
+++ b/roms/SLOF
@@ -1 +1 @@
-Subproject commit 32e3430c018ceb8413cb808477449d1968c42497
+Subproject commit ab062ff3b37c39649f2b0d94ed607adc6f6b3c7d