[PATCH] Have x86_64 use add_active_range() and free_area_init_nodes
author Mel Gorman <mel@csn.ul.ie>
Wed, 27 Sep 2006 08:49:52 +0000 (01:49 -0700)
committer Linus Torvalds <torvalds@g5.osdl.org>
Wed, 27 Sep 2006 15:26:11 +0000 (08:26 -0700)
Size zones and holes in an architecture independent manner for x86_64.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
arch/x86_64/Kconfig
arch/x86_64/kernel/e820.c
arch/x86_64/kernel/setup.c
arch/x86_64/mm/init.c
arch/x86_64/mm/k8topology.c
arch/x86_64/mm/numa.c
arch/x86_64/mm/srat.c
include/asm-x86_64/e820.h
include/asm-x86_64/proto.h

index efe249e..326aff7 100644 (file)
@@ -85,6 +85,9 @@ config ARCH_MAY_HAVE_PC_FDC
        bool
        default y
 
+config ARCH_POPULATES_NODE_MAP
+       def_bool y
+
 config DMI
        bool
        default y
index c0af382..b3f0908 100644 (file)
@@ -162,59 +162,14 @@ unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsi
        return -1UL;            
 } 
 
-/* 
- * Free bootmem based on the e820 table for a node.
- */
-void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
-{
-       int i;
-       for (i = 0; i < e820.nr_map; i++) {
-               struct e820entry *ei = &e820.map[i]; 
-               unsigned long last, addr;
-
-               if (ei->type != E820_RAM || 
-                   ei->addr+ei->size <= start || 
-                   ei->addr >= end)
-                       continue;
-
-               addr = round_up(ei->addr, PAGE_SIZE);
-               if (addr < start) 
-                       addr = start;
-
-               last = round_down(ei->addr + ei->size, PAGE_SIZE); 
-               if (last >= end)
-                       last = end; 
-
-               if (last > addr && last-addr >= PAGE_SIZE)
-                       free_bootmem_node(pgdat, addr, last-addr);
-       }
-}
-
 /*
  * Find the highest page frame number we have available
  */
 unsigned long __init e820_end_of_ram(void)
 {
-       int i;
        unsigned long end_pfn = 0;
+       end_pfn = find_max_pfn_with_active_regions();
        
-       for (i = 0; i < e820.nr_map; i++) {
-               struct e820entry *ei = &e820.map[i]; 
-               unsigned long start, end;
-
-               start = round_up(ei->addr, PAGE_SIZE); 
-               end = round_down(ei->addr + ei->size, PAGE_SIZE); 
-               if (start >= end)
-                       continue;
-               if (ei->type == E820_RAM) { 
-               if (end > end_pfn<<PAGE_SHIFT)
-                       end_pfn = end>>PAGE_SHIFT;
-               } else { 
-                       if (end > end_pfn_map<<PAGE_SHIFT) 
-                               end_pfn_map = end>>PAGE_SHIFT;
-               } 
-       }
-
        if (end_pfn > end_pfn_map) 
                end_pfn_map = end_pfn;
        if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
@@ -224,43 +179,10 @@ unsigned long __init e820_end_of_ram(void)
        if (end_pfn > end_pfn_map) 
                end_pfn = end_pfn_map; 
 
+       printk("end_pfn_map = %lu\n", end_pfn_map);
        return end_pfn; 
 }
 
-/* 
- * Compute how much memory is missing in a range.
- * Unlike the other functions in this file the arguments are in page numbers.
- */
-unsigned long __init
-e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
-{
-       unsigned long ram = 0;
-       unsigned long start = start_pfn << PAGE_SHIFT;
-       unsigned long end = end_pfn << PAGE_SHIFT;
-       int i;
-       for (i = 0; i < e820.nr_map; i++) {
-               struct e820entry *ei = &e820.map[i];
-               unsigned long last, addr;
-
-               if (ei->type != E820_RAM ||
-                   ei->addr+ei->size <= start ||
-                   ei->addr >= end)
-                       continue;
-
-               addr = round_up(ei->addr, PAGE_SIZE);
-               if (addr < start)
-                       addr = start;
-
-               last = round_down(ei->addr + ei->size, PAGE_SIZE);
-               if (last >= end)
-                       last = end;
-
-               if (last > addr)
-                       ram += last - addr;
-       }
-       return ((end - start) - ram) >> PAGE_SHIFT;
-}
-
 /*
  * Mark e820 reserved areas as busy for the resource manager.
  */
@@ -342,6 +264,63 @@ void __init e820_mark_nosave_regions(void)
        }
 }
 
+/*
+ * Walk the e820 map and register its usable RAM as active regions of a node.
+ *
+ * @nid:       node id handed to add_active_range() for every range found
+ * @start_pfn: first page frame number of the node
+ * @end_pfn:   page frame number just past the end of the node
+ *
+ * Every E820_RAM entry is shrunk to whole pages, clipped to
+ * [start_pfn, end_pfn) and to end_user_pfn, then registered.  As a side
+ * effect end_pfn_map is raised to cover non-RAM entries that end above it.
+ */
+void __init
+e820_register_active_regions(int nid, unsigned long start_pfn,
+                                                       unsigned long end_pfn)
+{
+       int i;
+       unsigned long ei_startpfn, ei_endpfn;
+       for (i = 0; i < e820.nr_map; i++) {
+               struct e820entry *ei = &e820.map[i];
+               ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
+               ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE)
+                                                               >> PAGE_SHIFT;
+
+               /*
+                * Skip map entries that do not contain a whole page: after
+                * rounding inwards they have start == end or start > end.
+                * Letting start == end through would register an empty range.
+                */
+               if (ei_startpfn >= ei_endpfn)
+                       continue;
+
+               /* Check if end_pfn_map should be updated */
+               if (ei->type != E820_RAM && ei_endpfn > end_pfn_map)
+                       end_pfn_map = ei_endpfn;
+
+               /* Only RAM that intersects the node is registered */
+               if (ei->type != E820_RAM ||
+                               ei_endpfn <= start_pfn ||
+                               ei_startpfn >= end_pfn)
+                       continue;
+
+               /* Clip the entry to the node boundaries */
+               if (ei_startpfn < start_pfn)
+                       ei_startpfn = start_pfn;
+               if (ei_endpfn > end_pfn)
+                       ei_endpfn = end_pfn;
+
+               /* Obey end_user_pfn to save on memmap */
+               if (ei_startpfn >= end_user_pfn)
+                       continue;
+               if (ei_endpfn > end_user_pfn)
+                       ei_endpfn = end_user_pfn;
+
+               add_active_range(nid, ei_startpfn, ei_endpfn);
+       }
+}
+
 /* 
  * Add a memory region to the kernel e820 map.
  */ 
index f98e48c..0b00bb2 100644 (file)
@@ -292,7 +292,8 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
        if (bootmap == -1L)
                panic("Cannot find bootmem map of size %ld\n",bootmap_size);
        bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
-       e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT);
+       e820_register_active_regions(0, start_pfn, end_pfn);
+       free_bootmem_with_active_regions(0, end_pfn);
        reserve_bootmem(bootmap, bootmap_size);
 } 
 #endif
@@ -384,6 +385,7 @@ void __init setup_arch(char **cmdline_p)
 
        finish_e820_parsing();
 
+       e820_register_active_regions(0, 0, -1UL);
        /*
         * partially used pages are not usable - thus
         * we are rounding upwards:
@@ -414,6 +416,9 @@ void __init setup_arch(char **cmdline_p)
        max_pfn = end_pfn;
        high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
 
+       /* Remove active ranges so rediscovery with NUMA-awareness happens */
+       remove_all_active_ranges();
+
 #ifdef CONFIG_ACPI_NUMA
        /*
         * Parse SRAT to discover nodes.
index 1e4669f..4792839 100644 (file)
@@ -403,69 +403,15 @@ void __cpuinit zap_low_mappings(int cpu)
        __flush_tlb_all();
 }
 
-/* Compute zone sizes for the DMA and DMA32 zones in a node. */
-__init void
-size_zones(unsigned long *z, unsigned long *h,
-          unsigned long start_pfn, unsigned long end_pfn)
-{
-       int i;
-       unsigned long w;
-
-       for (i = 0; i < MAX_NR_ZONES; i++)
-               z[i] = 0;
-
-       if (start_pfn < MAX_DMA_PFN)
-               z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
-       if (start_pfn < MAX_DMA32_PFN) {
-               unsigned long dma32_pfn = MAX_DMA32_PFN;
-               if (dma32_pfn > end_pfn)
-                       dma32_pfn = end_pfn;
-               z[ZONE_DMA32] = dma32_pfn - start_pfn;
-       }
-       z[ZONE_NORMAL] = end_pfn - start_pfn;
-
-       /* Remove lower zones from higher ones. */
-       w = 0;
-       for (i = 0; i < MAX_NR_ZONES; i++) {
-               if (z[i])
-                       z[i] -= w;
-               w += z[i];
-       }
-
-       /* Compute holes */
-       w = start_pfn;
-       for (i = 0; i < MAX_NR_ZONES; i++) {
-               unsigned long s = w;
-               w += z[i];
-               h[i] = e820_hole_size(s, w);
-       }
-
-       /* Add the space pace needed for mem_map to the holes too. */
-       for (i = 0; i < MAX_NR_ZONES; i++)
-               h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
-
-       /* The 16MB DMA zone has the kernel and other misc mappings.
-          Account them too */
-       if (h[ZONE_DMA]) {
-               h[ZONE_DMA] += dma_reserve;
-               if (h[ZONE_DMA] >= z[ZONE_DMA]) {
-                       printk(KERN_WARNING
-                               "Kernel too large and filling up ZONE_DMA?\n");
-                       h[ZONE_DMA] = z[ZONE_DMA];
-               }
-       }
-}
-
 #ifndef CONFIG_NUMA
 void __init paging_init(void)
 {
-       unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
-
+       unsigned long max_zone_pfns[MAX_NR_ZONES] = {MAX_DMA_PFN,
+                                                       MAX_DMA32_PFN,
+                                                       end_pfn};
        memory_present(0, 0, end_pfn);
        sparse_init();
-       size_zones(zones, holes, 0, end_pfn);
-       free_area_init_node(0, NODE_DATA(0), zones,
-                           __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
+       free_area_init_nodes(max_zone_pfns);
 }
 #endif
 
@@ -608,7 +554,8 @@ void __init mem_init(void)
 #else
        totalram_pages = free_all_bootmem();
 #endif
-       reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);
+       reservedpages = end_pfn - totalram_pages -
+                                       absent_pages_in_range(0, end_pfn);
 
        after_bootmem = 1;
 
index 5cf594f..b5b8dba 100644 (file)
@@ -149,6 +149,9 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
                
                nodes[nodeid].start = base; 
                nodes[nodeid].end = limit;
+               e820_register_active_regions(nodeid,
+                               nodes[nodeid].start >> PAGE_SHIFT,
+                               nodes[nodeid].end >> PAGE_SHIFT);
 
                prevbase = base;
 
index 322bf45..829a008 100644 (file)
@@ -161,7 +161,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
                                         bootmap_start >> PAGE_SHIFT, 
                                         start_pfn, end_pfn); 
 
-       e820_bootmem_free(NODE_DATA(nodeid), start, end);
+       free_bootmem_with_active_regions(nodeid, end);
 
        reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 
        reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
@@ -175,13 +175,11 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
 void __init setup_node_zones(int nodeid)
 { 
        unsigned long start_pfn, end_pfn, memmapsize, limit;
-       unsigned long zones[MAX_NR_ZONES];
-       unsigned long holes[MAX_NR_ZONES];
 
        start_pfn = node_start_pfn(nodeid);
        end_pfn = node_end_pfn(nodeid);
 
-       Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
+       Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
                nodeid, start_pfn, end_pfn);
 
        /* Try to allocate mem_map at end to not fill up precious <4GB
@@ -195,10 +193,6 @@ void __init setup_node_zones(int nodeid)
                                round_down(limit - memmapsize, PAGE_SIZE), 
                                limit);
 #endif
-
-       size_zones(zones, holes, start_pfn, end_pfn);
-       free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
-                           start_pfn, holes);
 } 
 
 void __init numa_init_array(void)
@@ -259,8 +253,11 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
                printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
                return -1;
        }
-       for_each_online_node(i)
+       for_each_online_node(i) {
+               e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
+                                               nodes[i].end >> PAGE_SHIFT);
                setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+       }
        numa_init_array();
        return 0;
 }
@@ -299,6 +296,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
        for (i = 0; i < NR_CPUS; i++)
                numa_set_node(i, 0);
        node_to_cpumask[0] = cpumask_of_cpu(0);
+       e820_register_active_regions(0, start_pfn, end_pfn);
        setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
 }
 
@@ -340,12 +338,17 @@ static void __init arch_sparse_init(void)
 void __init paging_init(void)
 { 
        int i;
+       unsigned long max_zone_pfns[MAX_NR_ZONES] = { MAX_DMA_PFN,
+               MAX_DMA32_PFN,
+               end_pfn};
 
        arch_sparse_init();
 
        for_each_online_node(i) {
                setup_node_zones(i); 
        }
+
+       free_area_init_nodes(max_zone_pfns);
 } 
 
 static __init int numa_setup(char *opt)
index ca10701..7b50bb1 100644 (file)
@@ -93,6 +93,7 @@ static __init void bad_srat(void)
                apicid_to_node[i] = NUMA_NO_NODE;
        for (i = 0; i < MAX_NUMNODES; i++)
                nodes_add[i].start = nodes[i].end = 0;
+       remove_all_active_ranges();
 }
 
 static __init inline int srat_disabled(void)
@@ -175,7 +176,7 @@ static int hotadd_enough_memory(struct bootnode *nd)
 
        if (mem < 0)
                return 0;
-       allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE;
+       allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE;
        allowed = (allowed / 100) * hotadd_percent;
        if (allocated + mem > allowed) {
                unsigned long range;
@@ -225,7 +226,7 @@ static int reserve_hotadd(int node, unsigned long start, unsigned long end)
        }
 
        /* This check might be a bit too strict, but I'm keeping it for now. */
-       if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) {
+       if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
                printk(KERN_ERR "SRAT: Hotplug area has existing memory\n");
                return -1;
        }
@@ -319,6 +320,8 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 
        printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
               nd->start, nd->end);
+       e820_register_active_regions(node, nd->start >> PAGE_SHIFT,
+                                               nd->end >> PAGE_SHIFT);
 
 #ifdef RESERVE_HOTADD
        if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
@@ -343,13 +346,13 @@ static int nodes_cover_memory(void)
                unsigned long s = nodes[i].start >> PAGE_SHIFT;
                unsigned long e = nodes[i].end >> PAGE_SHIFT;
                pxmram += e - s;
-               pxmram -= e820_hole_size(s, e);
+               pxmram -= absent_pages_in_range(s, e);
                pxmram -= nodes_add[i].end - nodes_add[i].start;
                if ((long)pxmram < 0)
                        pxmram = 0;
        }
 
-       e820ram = end_pfn - e820_hole_size(0, end_pfn);
+       e820ram = end_pfn - absent_pages_in_range(0, end_pfn);
        /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
        if ((long)(e820ram - pxmram) >= 1*1024*1024) {
                printk(KERN_ERR
index e15d3c8..fa20867 100644 (file)
@@ -47,10 +47,9 @@ extern void e820_print_map(char *who);
 extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type);
 extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type);
 
-extern void e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end);
 extern void e820_setup_gap(void);
-extern unsigned long e820_hole_size(unsigned long start_pfn,
-                                   unsigned long end_pfn);
+extern void e820_register_active_regions(int nid,
+                               unsigned long start_pfn, unsigned long end_pfn);
 
 extern void finish_e820_parsing(void);
 
index b73d0c7..c28fc2d 100644 (file)
@@ -24,8 +24,6 @@ extern void mtrr_bp_init(void);
 #define mtrr_bp_init() do {} while (0)
 #endif
 extern void init_memory_mapping(unsigned long start, unsigned long end);
-extern void size_zones(unsigned long *z, unsigned long *h,
-                       unsigned long start_pfn, unsigned long end_pfn);
 
 extern void system_call(void); 
 extern int kernel_syscall(void);