diff -urNp acpi/arch/ia64/config.in disc/arch/ia64/config.in --- acpi/arch/ia64/config.in Mon Jul 29 15:23:18 2002 +++ disc/arch/ia64/config.in Fri Aug 9 12:59:01 2002 @@ -72,6 +72,10 @@ if [ "$CONFIG_MCKINLEY" = "y" ]; then fi if [ "$CONFIG_IA64_GENERIC" = "y" -o "$CONFIG_IA64_DIG" = "y" -o "$CONFIG_IA64_HP_ZX1" = "y" ]; then + bool ' Enable NUMA support' CONFIG_NUMA + if [ "$CONFIG_NUMA" = "y" ]; then + define_bool CONFIG_DISCONTIGMEM y + fi bool ' Enable IA-64 Machine Check Abort' CONFIG_IA64_MCA define_bool CONFIG_PM y fi diff -urNp acpi/arch/ia64/kernel/setup.c disc/arch/ia64/kernel/setup.c --- acpi/arch/ia64/kernel/setup.c Fri Aug 9 12:39:54 2002 +++ disc/arch/ia64/kernel/setup.c Fri Aug 9 12:59:01 2002 @@ -32,6 +32,7 @@ #include #include +#include #include #include #include @@ -109,15 +110,53 @@ find_max_pfn (unsigned long start, unsig #define IGNORE_PFN0 1 /* XXX fix me: ignore pfn 0 until TLB miss handler is updated... */ + +#ifdef CONFIG_DISCONTIGMEM /* - * Free available memory based on the primitive map created from - * the boot parameters. This routine does not assume the incoming - * segments are sorted. + * efi_memmap_walk() knows nothing about layout of memory across nodes. Find + * out to which node a block of memory belongs. Ignore memory that we cannot + * identify, and split blocks that run across multiple nodes. + * + * Take this opportunity to round the start address up and the end address + * down to page boundaries. 
*/ -static int -free_available_memory (unsigned long start, unsigned long end, void *arg) +void +call_pernode_memory (unsigned long start, unsigned long end, void *arg) +{ + unsigned long rs, re; + void (*func)(unsigned long, unsigned long, int, int); + int i; + + start = PAGE_ALIGN(start); + end &= PAGE_MASK; + if (start >= end) + return; + + func = arg; + + for (i = 0; i < num_memblks; i++) { + rs = MAX(start, node_memblk[i].start_paddr); + re = MIN(end, node_memblk[i].start_paddr+node_memblk[i].size); + + if (rs < re) + (*func)(rs, re-rs, node_memblk[i].nid, + node_memblk[i].bank); + } +} +#endif /* CONFIG_DISCONTIGMEM */ + +/* + * Filter incoming memory segments based on the primitive map created from + * the boot parameters. Segments contained in the map are removed from the + * memory ranges. A caller-specified function is called with the memory + * ranges that remain after filtering. + * This routine does not assume the incoming segments are sorted. + */ +int +filter_rsvd_memory (unsigned long start, unsigned long end, void *arg) { unsigned long range_start, range_end, prev_start; + void (*func)(unsigned long, unsigned long); int i; #if IGNORE_PFN0 @@ -131,13 +170,18 @@ free_available_memory (unsigned long sta * lowest possible address(walker uses virtual) */ prev_start = PAGE_OFFSET; + func = arg; for (i = 0; i < num_rsvd_regions; ++i) { range_start = MAX(start, prev_start); range_end = MIN(end, rsvd_region[i].start); if (range_start < range_end) - free_bootmem(__pa(range_start), range_end - range_start); +#ifdef CONFIG_DISCONTIGMEM + call_pernode_memory(__pa(range_start), __pa(range_end), func); +#else + (*func)(__pa(range_start), range_end - range_start); +#endif /* nothing more available in this segment */ if (range_end == end) return 0; @@ -149,6 +193,8 @@ free_available_memory (unsigned long sta } +#ifndef CONFIG_DISCONTIGMEM + /* * Find a place to put the bootmap and return its starting address in bootmap_start. * This address must be page-aligned. 
@@ -173,11 +219,9 @@ find_bootmap_location (unsigned long sta range_start = MAX(start, free_start); range_end = MIN(end, rsvd_region[i].start & PAGE_MASK); - if (range_end <= range_start) continue; /* skip over empty range */ - - if (range_end - range_start >= needed) { + if (range_end > range_start && range_end - range_start >= needed) { bootmap_start = __pa(range_start); - return 1; /* done */ + return -1; /* done */ } /* nothing more available in this segment */ @@ -187,6 +231,7 @@ find_bootmap_location (unsigned long sta } return 0; } +#endif /* CONFIG_DISCONTIGMEM */ static void sort_regions (struct rsvd_region *rsvd_region, int max) @@ -255,6 +300,13 @@ find_memory (void) max_pfn = 0; efi_memmap_walk(find_max_pfn, &max_pfn); +#ifdef CONFIG_DISCONTIGMEM + { + extern void discontig_mem_init(void); + bootmap_start = bootmap_size = 0; /* stops gcc warnings */ + discontig_mem_init() ; + } +#else /* how many bytes to cover all the pages */ bootmap_size = bootmem_bootmap_pages(max_pfn) << PAGE_SHIFT; @@ -267,8 +319,9 @@ find_memory (void) bootmap_size = init_bootmem(bootmap_start >> PAGE_SHIFT, max_pfn); /* Free all available memory, then mark bootmem-map as being in use. */ - efi_memmap_walk(free_available_memory, 0); + efi_memmap_walk(filter_rsvd_memory, free_bootmem); reserve_bootmem(bootmap_start, bootmap_size); +#endif /* CONFIG_DISCONTIGMEM */ #ifdef CONFIG_BLK_DEV_INITRD if (ia64_boot_param->initrd_start) { @@ -446,6 +499,8 @@ show_cpuinfo (struct seq_file *m, void * c->itc_freq / 1000000, c->itc_freq % 1000000, lpj*HZ/500000, (lpj*HZ/5000) % 100); return 0; +#undef lpj +#undef cpu } static void * @@ -533,6 +588,7 @@ identify_cpu (struct cpuinfo_ia64 *c) c->unimpl_pa_mask = ~((1L<<63) | ((1L << phys_addr_size) - 1)); } + /* * cpu_init() initializes state that is per-CPU. This function acts * as a 'CPU state barrier', nothing should get across. 
@@ -546,7 +602,14 @@ cpu_init (void) unsigned int max_ctx; struct cpuinfo_ia64 *my_cpu_data; #ifdef CONFIG_NUMA - int cpu, order; + int cpu, order=0; + struct page *pg; + struct cpuinfo_ia64 *old_cpu_data=NULL; +#ifdef CONFIG_IA64_DIG + /* FIXME */ + static struct cpuinfo_ia64 dig_numa_cpu_data[NR_CPUS]; +#endif + /* * If NUMA is configured, the cpu_data array is not preallocated. The boot cpu @@ -556,27 +619,39 @@ cpu_init (void) * before the cpus are actually started. */ if (!boot_cpu_data) { - my_cpu_data = alloc_bootmem_pages_node(NODE_DATA(numa_node_id()), + my_cpu_data = alloc_bootmem_pages_node(BOOT_NODE_DATA(boot_get_local_cnodeid()), sizeof(struct cpuinfo_ia64)); boot_cpu_data = my_cpu_data; my_cpu_data->cpu_data[0] = my_cpu_data; for (cpu = 1; cpu < NR_CPUS; ++cpu) my_cpu_data->cpu_data[cpu] - = alloc_bootmem_pages_node(NODE_DATA(numa_node_id()), + = alloc_bootmem_pages_node(BOOT_NODE_DATA(boot_get_local_cnodeid()), sizeof(struct cpuinfo_ia64)); for (cpu = 1; cpu < NR_CPUS; ++cpu) memcpy(my_cpu_data->cpu_data[cpu]->cpu_data, my_cpu_data->cpu_data, sizeof(my_cpu_data->cpu_data)); } else { +#ifdef CONFIG_IA64_DIG + my_cpu_data = &dig_numa_cpu_data[smp_processor_id()]; +#else + int nid = boot_get_local_cnodeid(); order = get_order(sizeof(struct cpuinfo_ia64)); - my_cpu_data = page_address(alloc_pages_node(numa_node_id(), GFP_KERNEL, order)); + pg = __alloc_pages(GFP_KERNEL, order, + BOOT_NODE_DATA(nid)->node_zonelists + + (GFP_KERNEL & GFP_ZONEMASK)); + my_cpu_data = page_address(pg); +#endif memcpy(my_cpu_data, boot_cpu_data->cpu_data[smp_processor_id()], sizeof(struct cpuinfo_ia64)); - __free_pages(virt_to_page(boot_cpu_data->cpu_data[smp_processor_id()]), - order); + + /* Cant call __free_pages until cpu_data is set up. 
*/ + old_cpu_data = boot_cpu_data->cpu_data[smp_processor_id()]; for (cpu = 0; cpu < NR_CPUS; ++cpu) boot_cpu_data->cpu_data[cpu]->cpu_data[smp_processor_id()] = my_cpu_data; } + my_cpu_data->node_data = get_node_data_ptr(); + my_cpu_data->cnodeid = boot_get_local_cnodeid(); + my_cpu_data->node_data->active_cpu_count++; #else my_cpu_data = cpu_data(smp_processor_id()); #endif @@ -667,4 +742,9 @@ cpu_init (void) local_cpu_data->phys_stacked_size_p8 = num_phys_stacked*8 + 8; platform_cpu_init(); + +#ifdef CONFIG_NUMA + if (old_cpu_data) + __free_pages(virt_to_page(old_cpu_data), order); +#endif } diff -urNp acpi/arch/ia64/mm/Makefile disc/arch/ia64/mm/Makefile --- acpi/arch/ia64/mm/Makefile Fri Aug 9 12:39:54 2002 +++ disc/arch/ia64/mm/Makefile Fri Aug 9 12:59:01 2002 @@ -11,5 +11,6 @@ O_TARGET := mm.o obj-y := init.o fault.o tlb.o extable.o obj-$(CONFIG_NUMA) += numa.o +obj-$(CONFIG_DISCONTIGMEM) += discontig.o include $(TOPDIR)/Rules.make diff -urNp acpi/arch/ia64/mm/discontig.c disc/arch/ia64/mm/discontig.c --- acpi/arch/ia64/mm/discontig.c Thu Jan 1 01:00:00 1970 +++ disc/arch/ia64/mm/discontig.c Fri Aug 9 13:32:09 2002 @@ -0,0 +1,404 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All rights reserved. + * Copyright (c) 2001 Intel Corp. + * Copyright (c) 2001 Tony Luck + */ + +/* + * Platform initialization for Discontig Memory + */ + +#include +#include +#include +#include +#include +#include + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +/* + * Round an address upward to the next multiple of chunk size. + */ +#define CHUNKROUNDUP(n) (((n)+PLAT_CHUNKSIZE-1) & ~(PLAT_CHUNKSIZE-1)) + +/* + * (Yes, I know this is a very ugly hack!! This hack should be fixed, but the + * only fix that looks reasonable requires changes to the interface to the bootmem + * allocator. These changes, while not unreasonable, ripple across multiple platforms. + * The following hack is used until we find a better solution). 
+ * + * Ideally, we would define boot_pg_data as + * pg_data_t boot_pg_data[PLAT_MAX_COMPACT_NODES] + * + * However, this adds an unacceptably large static data structure (8MB on SGI + * platforms) to the kernel. + * + * Pointers to boot_pg_data structures are used in the interface to the bootmem + * allocator. There is only 1 field (bdata) in the boot_pg_data that + * is ever used. Bdata is used by the bootmem allocator to locate the bootmem_data_t + * structure for the node. + * + * The following structure is used during boot as the pg_data structure for all + * nodes. By casting boot_pg_data[n] to the pg_data_t structure for node n, we can + * compress the space required for the pg_data array. + * + */ +static long boot_pg_data[8*PLAT_MAX_COMPACT_NODES+sizeof(pg_data_t)] __initdata; + +static pg_data_t *pg_data_ptr[PLAT_MAX_COMPACT_NODES] __initdata; +static ia64_node_data_t *node_data[PLAT_MAX_COMPACT_NODES] __initdata; +static bootmem_data_t bdata[PLAT_MAX_COMPACT_NODES][PLAT_CLUMPS_PER_NODE+1] __initdata; + +struct page *invalid_mem_map; /* value returned by virt_to_page for bad addresses */ + +#ifdef CONFIG_IA64_SGI_SN +static cnodeid_t phys_node_map[PLAT_MAX_NODE_NUMBER] __initdata; +#endif + +extern int filter_rsvd_memory (unsigned long start, unsigned long end, void *arg); + +/* + * Return the compact node number of this cpu. Used prior to + * setting up the cpu_data area. + * Note - not fast, intended for boot use only!! 
+ */ +int __init +boot_get_local_cnodeid(void) +{ + int i; + + for (i=0; ibdata; + epfn = CHUNKROUNDUP(pstart + length) >> PAGE_SHIFT; + cstart = pstart & ~(PLAT_CLUMPSIZE - 1); + + while (bdp->node_low_pfn) { + /* if this is part of a block that we have already seen */ + if (cstart == bdp->node_boot_start) { + bdp->node_low_pfn = MAX(bdp->node_low_pfn, epfn); + break; + } + /* if this block immediately follows a block that is >= 95% full */ + if ((cstart>>PAGE_SHIFT) < bdp->node_low_pfn || + (cstart>>PAGE_SHIFT) - bdp->node_low_pfn < (PLAT_CLUMPSIZE>>PAGE_SHIFT)/20) { + bdp->node_low_pfn = epfn; + break; + } + if (i++ == PLAT_CLUMPS_PER_NODE) { + printk("Lost %ld bytes at %lx\n", length, pstart); + return 0; + } + bdp++; + } + if (bdp->node_low_pfn == 0) { + bdp->node_boot_start = cstart; + bdp->node_low_pfn = epfn; + } + + min_low_pfn = MIN(min_low_pfn, bdp->node_boot_start>>PAGE_SHIFT); + max_low_pfn = MAX(max_low_pfn, bdp->node_low_pfn); + + return 0; +} + +/* + * Find space on each node for the bootmem map. + * + * Called by efi_memmap_walk to find boot memory on each node. Note that + * only blocks that are free are passed to this routine (currently filtered by + * filter_rsvd_memory). 
+ */ +static int __init +find_bootmap_space(unsigned long pstart, unsigned long length, int cnodeid) +{ + unsigned long mapsize, pages, epfn; + bootmem_data_t *bdp; + + epfn = (pstart + length) >> PAGE_SHIFT; + bdp = &pg_data_ptr[cnodeid]->bdata[0]; + while (bdp->node_low_pfn) { + if (pstart >= bdp->node_boot_start && epfn <= bdp->node_low_pfn) + break; + bdp++; + } + if (bdp->node_low_pfn == 0) + return 0; + + if (!bdp->node_bootmem_map) { + pages = bdp->node_low_pfn - (bdp->node_boot_start>>PAGE_SHIFT); + mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT; + if (length > mapsize) { + init_bootmem_node( + BOOT_NODE_DATA(cnodeid), + pstart>>PAGE_SHIFT, + bdp->node_boot_start>>PAGE_SHIFT, + bdp->node_low_pfn); + } + + } + + return 0; +} + + +/* + * Free available memory to the bootmem allocator. + * + * Note that only blocks that are free are passed to this routine (currently + * filtered by filter_rsvd_memory). + * + */ +static int __init +discontig_free_bootmem_node(unsigned long pstart, unsigned long length, int cnodeid) +{ + free_bootmem_node(BOOT_NODE_DATA(cnodeid), pstart, length); + + return 0; +} + + +/* + * Reserve the space used by the bootmem maps. + */ +static void __init +discontig_reserve_bootmem(void) +{ + int cnodeid; + unsigned long mapbase, mapsize, pages; + bootmem_data_t *bdp; + + for (cnodeid=0; cnodeid < numnodes; cnodeid++) { + bdp = BOOT_NODE_DATA(cnodeid)->bdata; + + while (bdp->node_low_pfn) { + pages = bdp->node_low_pfn - (bdp->node_boot_start>>PAGE_SHIFT); + mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT; + mapbase = __pa(bdp->node_bootmem_map); + reserve_bootmem_node(BOOT_NODE_DATA(cnodeid), + mapbase, mapsize); + bdp++; + } + } +} + + + +/* + * Allocate per node tables. + * - the pg_data structure is allocated on each node. This minimizes offnode + * memory references + * - the node data is allocated & initialized. 
Portions of this structure are read-only (after + * boot) and contain node-local pointers to useful data structures located on + * other nodes. + * + * We also switch to using the "real" pg_data structures at this point. Earlier in boot, we + * use a different structure. The only use for pg_data prior to this point in boot is to get + * the pointer to the bdata for the node. + */ +static void __init +allocate_pernode_structures(void) +{ + pg_data_t *pgdat=0, *new_pgdat_list=0; + int pxm, cnodeid, mycnodeid; + + mycnodeid = boot_get_local_cnodeid(); + for (cnodeid=numnodes-1; cnodeid>=0 ; cnodeid--) { + node_data[cnodeid] = alloc_bootmem_node(BOOT_NODE_DATA(cnodeid), sizeof (ia64_node_data_t)); + + /* + * On SN platforms, we need to make sure that the pg_data structures dont alias + * to the same L1/2/3/4 cache lines. First attempt at making structures node local had + * 3x degradation for a simple page fault. The degradation was caused by aliasing. + * The pgdata structures on all nodes were located at the same node offset. + * + * To prevent aliasing, we use the "goal" parameter of the boot memory allocator + * to specify the lowest acceptable address for allocations. New allocations must + * start after the end of the physical page of the previous allocation, adjusted + * for node numbers. + * + * We could also make a change in the bootmem allocator but that requires changes + * to base linux. Preliminary experiments didnt show any performance advantage + * in changing the bootmem allocator. + * + * Not sure if this problem is SN specific or not..... 
+ * ZZZ - I dont like this method - find a better way ZZZ + */ + pgdat = __alloc_bootmem_node(BOOT_NODE_DATA(cnodeid), sizeof(pg_data_t), SMP_CACHE_BYTES, + PLAT_BOOTMEM_ALLOC_GOAL(cnodeid, pgdat+1)); + pgdat->bdata = &(bdata[cnodeid][0]); + pg_data_ptr[cnodeid] = pgdat; + pgdat->node_next = new_pgdat_list; + new_pgdat_list = pgdat; + + } + + memcpy(node_data[mycnodeid]->pg_data_ptrs, pg_data_ptr, sizeof(pg_data_ptr)); + memcpy(node_data[mycnodeid]->node_data_ptrs, node_data, sizeof(node_data)); + + pgdat_list = new_pgdat_list; + +#ifdef CONFIG_IA64_SGI_SN + { + memset(phys_node_map, -1, sizeof(phys_node_map)); + for (i=0; iphysical_node_map, phys_node_data, sizeof(phys_node_data)); + } +#endif +} + + + +/* + * Called early in boot to setup the boot memory allocator, and to + * allocate the node-local pg_data & node-directory data structures.. + */ +void __init +discontig_mem_init(void) +{ + int cnodeid; + + for (cnodeid=0; cnodeidbdata = &bdata[cnodeid][0]; + } + + min_low_pfn = -1; + max_low_pfn = 0; + + + efi_memmap_walk(filter_rsvd_memory, build_maps); + efi_memmap_walk(filter_rsvd_memory, find_bootmap_space); + efi_memmap_walk(filter_rsvd_memory, discontig_free_bootmem_node); + discontig_reserve_bootmem(); + allocate_pernode_structures(); +} + + + +/* + * Initialize the paging system. + * - determine sizes of each node + * - initialize the paging system for the node + * - build the nodedir for the node. This contains pointers to + * the per-bank mem_map entries. + * - fix the page struct "virtual" pointers. These are bank specific + * values that the paging system doesnt understand. 
+ * - replicate the nodedir structure to other nodes + */ + +void __init +discontig_paging_init(void) +{ + int i, cnodeid, mycnodeid; + unsigned long max_dma, zones_size[MAX_NR_ZONES]; + unsigned long kaddr, ekaddr, npages; + struct page *page; + bootmem_data_t *bdp; + + max_mapnr = 0; + max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT; + + + mycnodeid = boot_get_local_cnodeid(); + for (cnodeid = 0; cnodeid < numnodes; cnodeid++) { + long pfn, startpfn; + + memset(zones_size, 0, sizeof(zones_size)); + + startpfn = -1; + bdp = BOOT_NODE_DATA(cnodeid)->bdata; + while (bdp->node_low_pfn) { + pfn = bdp->node_boot_start >> PAGE_SHIFT; + if (startpfn == -1) + startpfn = pfn; + if (pfn > max_dma) + zones_size[ZONE_NORMAL] += (bdp->node_low_pfn - pfn); + else if (bdp->node_low_pfn < max_dma) + zones_size[ZONE_DMA] += (bdp->node_low_pfn - pfn); + else { + zones_size[ZONE_DMA] += (max_dma - pfn); + zones_size[ZONE_NORMAL] += (bdp->node_low_pfn - max_dma); + } + bdp++; + } + + free_area_init_node(cnodeid, NODE_DATA(cnodeid), NULL, zones_size, startpfn<node_mem_map; + + bdp = BOOT_NODE_DATA(cnodeid)->bdata; + + while (bdp->node_low_pfn) { + kaddr = (unsigned long)__va(bdp->node_boot_start); + ekaddr = (unsigned long)__va(bdp->node_low_pfn << PAGE_SHIFT); + while (kaddr < ekaddr) { + node_data[mycnodeid]->clump_mem_map_base[PLAT_CLUMP_MEM_MAP_INDEX(kaddr)] = page; + npages = PLAT_CLUMPSIZE/PAGE_SIZE; + if (kaddr + (npages< ekaddr) + npages = (ekaddr - kaddr) >> PAGE_SHIFT; + for (i = 0; i < npages; i++, page++, kaddr += PAGE_SIZE) + page->virtual = (void*)kaddr; + } + bdp++; + } + max_mapnr = MAX(max_mapnr, page - mem_map); + } + + /* + * Finish setting up the node data for this node, then copy it to the other nodes. 
+ */ + for (cnodeid=0; cnodeid < numnodes; cnodeid++) + if (mycnodeid != cnodeid) { + memcpy(node_data[cnodeid], node_data[mycnodeid], sizeof(ia64_node_data_t)); + node_data[cnodeid]->cnodeid = cnodeid; + node_data[cnodeid]->active_cpu_count = 0; + } + + invalid_mem_map = mem_map + max_mapnr; +} diff -urNp acpi/arch/ia64/mm/init.c disc/arch/ia64/mm/init.c --- acpi/arch/ia64/mm/init.c Mon Jul 29 15:23:19 2002 +++ disc/arch/ia64/mm/init.c Fri Aug 9 12:59:01 2002 @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include @@ -186,6 +188,53 @@ si_meminfo (struct sysinfo *val) return; } +#ifdef CONFIG_DISCONTIGMEM +void +show_mem(void) +{ + int i, nid; + int reserved, cached, slab, free, active; + pg_data_t *pgdat; + zone_t *zone; + + printk("%4s %7s %7s %9s %7s %7s %7s\n", "node","total", + "free","reserved","swap","slab","active"); + + /* + * Iterate over each node's pg_data_t and look at its pages + */ + for(nid = 0; nid < numnodes; nid++) { + pgdat = NODE_DATA(nid); + reserved = cached = slab = free = active = 0; + + /* + * Get info about each page in the node + */ + printk("%4d ", pgdat->node_id); + for(i = 0; i < pgdat->node_size; i++) { + if (PageReserved(pgdat->node_mem_map + i)) + reserved++; + if (PageSwapCache(pgdat->node_mem_map + i)) + cached++; + if (PageSlab(pgdat->node_mem_map + i)) + slab++; + if (PageActive(pgdat->node_mem_map + i)) + active++; + } + for (zone = pgdat->node_zones; + zone < pgdat->node_zones + MAX_NR_ZONES; zone++) + free += zone->free_pages; + + printk("%7ld ", pgdat->node_size); + printk("%7d ", free); + printk("%9d ", reserved); + printk("%7d ", cached); + printk("%7d ", slab); + printk("%7d\n", active); + } + show_buffers(); +} +#else /* !CONFIG_DISCONTIGMEM */ void show_mem(void) { @@ -195,32 +244,6 @@ show_mem(void) printk("Mem-info:\n"); show_free_areas(); -#ifdef CONFIG_DISCONTIGMEM - { - pg_data_t *pgdat = pgdat_list; - - printk("Free swap: %6dkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); - do { - printk("Node 
ID: %d\n", pgdat->node_id); - for(i = 0; i < pgdat->node_size; i++) { - if (PageReserved(pgdat->node_mem_map+i)) - reserved++; - else if (PageSwapCache(pgdat->node_mem_map+i)) - cached++; - else if (page_count(pgdat->node_mem_map + i)) - shared += page_count(pgdat->node_mem_map + i) - 1; - } - printk("\t%d pages of RAM\n", pgdat->node_size); - printk("\t%d reserved pages\n", reserved); - printk("\t%d pages shared\n", shared); - printk("\t%d pages swap cached\n", cached); - pgdat = pgdat->node_next; - } while (pgdat); - printk("Total of %ld pages in page table cache\n", pgtable_cache_size); - show_buffers(); - printk("%d free buffer pages\n", nr_free_buffer_pages()); - } -#else /* !CONFIG_DISCONTIGMEM */ printk("Free swap: %6dkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); i = max_mapnr; while (i-- > 0) { @@ -240,8 +263,8 @@ show_mem(void) printk("%d pages swap cached\n", cached); printk("%ld pages in page table cache\n", pgtable_cache_size); show_buffers(); -#endif /* !CONFIG_DISCONTIGMEM */ } +#endif /* !CONFIG_DISCONTIGMEM */ /* * This is like put_dirty_page() but installs a clean page with PAGE_GATE protection @@ -515,7 +538,10 @@ count_pages (u64 start, u64 end, void *a void paging_init (void) { - unsigned long max_dma, zones_size[MAX_NR_ZONES]; +#ifndef CONFIG_DISCONTIGMEM + unsigned long max_dma; +#endif + unsigned long zones_size[MAX_NR_ZONES]; /* initialize mem_map[] */ @@ -524,6 +550,12 @@ paging_init (void) num_physpages = 0; efi_memmap_walk(count_pages, &num_physpages); +#ifdef CONFIG_DISCONTIGMEM + { + extern void discontig_paging_init(void); + discontig_paging_init(); + } +#else max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT; #ifdef CONFIG_VIRTUAL_MEM_MAP @@ -568,18 +600,17 @@ paging_init (void) } free_area_init(zones_size); #endif /* !CONFIG_VIRTUAL_MEM_MAP */ +#endif /* CONFIG_DISCONTIGMEM */ } static int count_reserved_pages (u64 start, u64 end, void *arg) { - unsigned long num_reserved = 0; unsigned long *count = arg; - struct page *pg; - - 
for (pg = virt_to_page(start); pg < virt_to_page(end); ++pg) - if (PageReserved(pg)) - ++num_reserved; + unsigned long num_reserved = 0; + for (; start < end; start += PAGE_SIZE) + if (PageReserved(virt_to_page(start))) + ++num_reserved; *count += num_reserved; return 0; } @@ -603,7 +634,9 @@ mem_init (void) if (!mem_map) BUG(); +#ifndef CONFIG_DISCONTIGMEM max_mapnr = max_low_pfn; +#endif high_memory = __va(max_low_pfn * PAGE_SIZE); totalram_pages += free_all_bootmem(); @@ -633,6 +666,8 @@ mem_init (void) if (num_pgt_pages > pgt_cache_water[1]) pgt_cache_water[1] = num_pgt_pages; + show_mem() ; + /* install the gate page in the global page table: */ put_gate_page(virt_to_page(__start_gate_section), GATE_ADDR); diff -urNp acpi/include/asm-ia64/acpi.h disc/include/asm-ia64/acpi.h --- acpi/include/asm-ia64/acpi.h Fri Aug 9 13:30:19 2002 +++ disc/include/asm-ia64/acpi.h Fri Aug 9 12:39:54 2002 @@ -100,13 +100,6 @@ const char *acpi_get_sysname (void); int acpi_request_vector (u32 int_type); int acpi_get_prt (struct pci_vector_struct **vectors, int *count); int acpi_get_interrupt_model(int *type); - -#ifdef CONFIG_ACPI_NUMA -/* Proximity bitmap length; _PXM is at most 255 (8 bit)*/ -#define MAX_PXM_DOMAINS (256) -extern int pxm_to_nid_map[MAX_PXM_DOMAINS]; -extern int nid_to_pxm_map[NR_NODES]; -#endif #endif /*__KERNEL__*/ #endif /*_ASM_ACPI_H*/ diff -urNp acpi/include/asm-ia64/mmzone.h disc/include/asm-ia64/mmzone.h --- acpi/include/asm-ia64/mmzone.h Thu Jan 1 01:00:00 1970 +++ disc/include/asm-ia64/mmzone.h Fri Aug 9 12:59:06 2002 @@ -0,0 +1,219 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 2000 Silicon Graphics, Inc. All rights reserved. 
+ */ +#ifndef _ASM_IA64_SN_MMZONE_H +#define _ASM_IA64_SN_MMZONE_H + +#include +#include + +#if defined(CONFIG_IA64_SGI_SN1) +#include +#elif defined(CONFIG_IA64_SGI_SN2) +#include +#elif defined(CONFIG_IA64_DIG) +#include +#else +#error "Unknown architecture" +#endif + +#include + +/* + * General Concepts: + * + * - Nodes are numbered several ways: + * + * compact node numbers - compact node numbers are a dense numbering of + * all the nodes in the system. An N node system will have compact + * nodes numbered 0 .. N-1. There is no significance to the node + * numbers. The compact node number assigned to a specific physical + * node may vary from boot to boot. The boot node is not necessarily + * node 0. + * + * physical node numbers - Physical node numbers may not be dense + * nor do they necessarily start with 0. The exact significance of + * a physical node number is platform specific. + * + * proximity domain numbers - these numbers are assigned by ACPI. + * Each platform must provide a platform specific function + * for mapping proximity node numbers to physical node numbers. + * + * Most of the code in the kernel uses compact node numbers to identify nodes. + * + * + * - Memory is conceptually divided into chunks. A chunk is either + * completely present, or else the kernel assumes it is completely + * absent. Each node consists of a number of possibly discontiguous chunks. + * + * - A contiguous group of memory chunks that reside on the same node + * are referred to as a clump. Note that a clump may be partially present. + * (Note, on some hardware implementations, a clump is the same as a memory + * bank or a DIMM). + * + * - a node consists of multiple clumps of memory. From a NUMA perspective, + * accesses to all clumps on the node have the same latency. Except for zone issues, + * the clumps are treated as equivalent for allocation/performance purposes. + * + * - each node has a single contiguous mem_map array. 
The array contains page struct + * entries for every page on the node. There are no "holes" in the mem_map array. + * The node data area (see below) has pointers to the start of the mem_map entries + * for each clump on the node. + * + * - associated with each node is a pg_data_t structure. This structure contains the + * information used by the linux memory allocator for managing the memory on the + * node. The pg_data_t structure for a node is located on the node. + * + * - to minimize offnode memory references, a "node directory" is maintained on each + * node. This directory replicates frequently used read-only data structures that + * are used in macro evaluation. Examples include the addresses of the + * pernode pg_data structures for each node. + * + * - the MAP_NR function has been modified to be "clump aware" & uses the clump_mem_map_base + * array in the node data area for generating MAP_NR numbers. + * + * - the node data area contains an array of pointers to the mem_map entries for each clump + * of memory. The array is indexed by a platform specific function. + * + * - each cpu has a pointer to its node data area contained in its cpu_data structure. + * + * - each platform is responsible for defining the following constants & functions: + * + * PLAT_BOOTMEM_ALLOC_GOAL(cnode,kaddr) - Calculate a "goal" value to be passed + * to __alloc_bootmem_node for allocating structures on nodes so that + * they don't alias to the same line in the cache as the previously + * allocated structure. You can return 0 if your platform doesn't have + * this problem. + * (Note: need better solution but works for now ZZZ). + * + * PLAT_CHUNKSIZE - defines the size of the platform memory chunk. + * + * PLAT_CHUNKNUM(kaddr) - takes a kaddr & returns its chunk number + * + * PLAT_CLUMP_MEM_MAP_INDEX(kaddr) - Given a kaddr, find the index into the + * clump_mem_map_base array of the page struct entry for the first page + * of the clump. 
+ * + * PLAT_CLUMP_OFFSET(kaddr) - find the byte offset of a kaddr within the clump that + * contains it. + * + * PLAT_CLUMPSIZE - defines the size in bytes of the smallest clump supported on the platform. + * + * PLAT_CLUMPS_PER_NODE - maximum number of clumps per node + * + * PLAT_MAXCLUMPS - maximum number of clumps on all nodes combined + * + * PLAT_MAX_COMPACT_NODES - maximum number of nodes in a system. (do not confuse this + * with the maximum node number. Nodes can be sparsely numbered). + * + * PLAT_MAX_NODE_NUMBER - maximum physical node number plus 1 + * + * PLAT_MAX_PHYS_MEMORY - maximum physical memory address + * + * PLAT_PXM_TO_PHYS_NODE_NUMBER(pxm) - convert a proximity_domain number (from ACPI) + * into a physical node number + * + * PLAT_VALID_MEM_KADDR(kaddr) - tests a kaddr to see if it potentially represents a + * valid physical memory address. Return 1 if potentially valid, 0 otherwise. + * (This function generally tests to see if any invalid bits are set in + * the address). + * + * + * - each platform is responsible for defining the following typedefs: + * + * cnodeid_t - compact node number + * + */ + + +extern struct page *invalid_mem_map; /* value returned by virt_to_page for bad addresses */ + + + +/* + * Chunk related macros + * Note: It is not clear if VALIDCHUNK is really needed. It is currently used + * ONLY in kern_addr_valid. The non-NUMA variant of this always + * returns 1. + * ZZZ Fixme???? + */ +#define VALIDCHUNK(cnum) 1 + + + +/* + * Given a kaddr, find the base mem_map address for the start of the mem_map + * entries for the clump containing the kaddr. + */ +#define CLUMP_MEM_MAP_BASE(kaddr) local_node_data->clump_mem_map_base[PLAT_CLUMP_MEM_MAP_INDEX(kaddr)] + + + +/* + * Given a kaddr, this macro returns the relative map number + * within the clump. + */ +#define CLUMP_MAP_NR(kaddr) (PLAT_CLUMP_OFFSET(kaddr) >> PAGE_SHIFT) + + + +/* + * Finally.... This is the MAP_NR function for the platform. 
+ */ +#define MAP_NR_DISCONTIG(kaddr) ({long _kmns=(long)(kaddr); \ + CLUMP_MAP_NR(_kmns) + \ + CLUMP_MEM_MAP_BASE(_kmns) - mem_map;}) + +/* + * Given a pte, this macro returns a pointer to the page struct for the pte. + */ +#define pte_page(pte) virt_to_page(PAGE_OFFSET | (pte_val(pte)&_PFN_MASK)) + + + +/* + * Determine if a kaddr is a valid memory address of memory that + * actually exists. + * + * The check consists of 2 parts: + * - verify that the address is a region 7 address & does not + * contain any bits that preclude it from being a valid platform + * memory address + * - verify that the chunk actually exists. + * + * Note that IO addresses are NOT considered valid addresses. + * + * Note, many platforms can simply check if kaddr exceeds a specific size. + * (However, this wont work on SGI platforms since IO space is embedded + * within the range of valid memory addresses & nodes have holes in the + * address range between clumps). + */ +#define kern_addr_valid(kaddr) ({long _kav=(long)(kaddr); \ + PLAT_VALID_MEM_KADDR(_kav) && VALIDCHUNK(PLAT_CHUNKNUM(_kav));}) + + +/* + * Given a kaddr, return a pointer to the page struct for the page. + * If the kaddr does not represent RAM memory that potentially exists, return + * a pointer to the page struct for max_mapnr. IO addresses will + * return the page for max_mapnr. Addresses in unpopulated RAM banks may + * return undefined results OR may panic the system. + * + */ +#define virt_to_page(kaddr) ({long _kvtp=(long)(kaddr); \ + (PLAT_VALID_MEM_KADDR(_kvtp)) \ + ? CLUMP_MEM_MAP_BASE(_kvtp) + CLUMP_MAP_NR(_kvtp) \ + : invalid_mem_map;}) + +/* + * Given a page struct entry, return the physical address that the page struct represents. 
+ * Since IA64 has all memory in the DMA zone, the following works: + */ +#define page_to_phys(page) __pa(page_address(page)) + + +#endif /* _ASM_IA64_SN_MMZONE_H */ diff -urNp acpi/include/asm-ia64/mmzone_dig_numa.h disc/include/asm-ia64/mmzone_dig_numa.h --- acpi/include/asm-ia64/mmzone_dig_numa.h Thu Jan 1 01:00:00 1970 +++ disc/include/asm-ia64/mmzone_dig_numa.h Fri Aug 9 12:59:12 2002 @@ -0,0 +1,71 @@ +#ifndef _ASM_IA64_MMZONE_DIG_NUMA_H +#define _ASM_IA64_MMZONE_DIG_NUMA_H +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + */ + +typedef short cnodeid_t; + +/* + * Platform definitions for DIG platform with contiguous memory. + */ + +#define PLAT_MAX_NODE_NUMBER 8 /* Maximum node number +1 */ +#define PLAT_MAX_COMPACT_NODES 8 /* Maximum number of nodes in SSI */ + +#define PLAT_MAX_PHYS_MEMORY (1UL << 40) /* 1 TB */ + +/* + * Clump definitions. + * Current settings for DIG: 512MB/clump, 16GB/node. + */ +#define PLAT_CLUMPS_PER_NODE 32 +#define PLAT_CLUMP_OFFSET(addr) ((unsigned long)(addr) & (PLAT_CLUMPSIZE-1)) +#define DIG_CLUMPSHIFT 29 +#define PLAT_CLUMPSIZE (1UL << DIG_CLUMPSHIFT) +#define PLAT_MAXCLUMPS (PLAT_CLUMPS_PER_NODE*PLAT_MAX_COMPACT_NODES) + +/* + * PLAT_VALID_MEM_KADDR returns a boolean to indicate if a kaddr is + * potentially a valid cacheable identity mapped RAM memory address. + * Note that the RAM may or may not actually be present!! + */ +#define PLAT_VALID_MEM_KADDR(kaddr) 1 + +/* + * Memory is conceptually divided into chunks. A chunk is either + * completely present, or else the kernel assumes it is completely + * absent. Each node consists of a number of possibly discontiguous chunks. + * As we expect more or less contiguous memory on DIG platforms, we set + * the CHUNKSIZE equal to the CLUMPSIZE. 
+ */ +#define PLAT_CHUNKSIZE (PLAT_CLUMPSIZE) +#define PLAT_CHUNKNUM(addr) \ + (((addr) & (PLAT_MAX_PHYS_MEMORY-1)) >> DIG_CLUMPSHIFT) + +/* + * Given a kaddr, compute the index of the clump that contains it; the index + * selects that clump's entry in the clump_mem_map_base[] array. + */ +#define PLAT_CLUMP_MEM_MAP_INDEX(kaddr) \ + (((unsigned long)(kaddr) & (PLAT_MAX_PHYS_MEMORY-1)) >> DIG_CLUMPSHIFT) + +/* + * Calculate a "goal" value to be passed to __alloc_bootmem_node for allocating + * structures on nodes so that they don't alias to the same line in the cache as + * the previously allocated structure. + * This macro takes an address of the end of previous allocation, rounds it to + * a page boundary & changes the node number. + */ +#define PLAT_BOOTMEM_ALLOC_GOAL(cnode,kaddr) 0 /* not used yet */ + +/* + * Convert a proximity domain number (from the ACPI tables) into a physical + * node number. + */ + +#define PLAT_PXM_TO_PHYS_NODE_NUMBER(pxm) (pxm) + +#endif /* _ASM_IA64_MMZONE_DIG_NUMA_H */ diff -urNp acpi/include/asm-ia64/nodedata.h disc/include/asm-ia64/nodedata.h --- acpi/include/asm-ia64/nodedata.h Thu Jan 1 01:00:00 1970 +++ disc/include/asm-ia64/nodedata.h Fri Aug 9 12:59:01 2002 @@ -0,0 +1,85 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 2000 Silicon Graphics, Inc. All rights reserved. + */ + + +#ifndef _ASM_IA64_NODEDATA_H +#define _ASM_IA64_NODEDATA_H + + +#include + + +/* + * Node Data. One of these structures is located on each node of a NUMA system. + */ + +struct pglist_data; + +typedef struct ia64_node_data_s { + cnodeid_t cnodeid; + short active_cpu_count; + + /* + * The fields are read-only (after boot). They contain pointers to various structures + * located on other nodes. This data is replicated on each node in order to reduce + * off-node references. 
+ */ + struct pglist_data *pg_data_ptrs[PLAT_MAX_COMPACT_NODES]; +#ifdef CONFIG_IA64_SGI_SN + cnodeid_t physical_node_map[PLAT_MAX_NODE_NUMBER]; +#endif + struct page *clump_mem_map_base[PLAT_MAXCLUMPS]; + struct ia64_node_data_s *node_data_ptrs[PLAT_MAX_COMPACT_NODES]; +} ia64_node_data_t; + + +/* + * Return a pointer to the node_data structure for the executing cpu. + */ +#define local_node_data (local_cpu_data->node_data) + + +/* + * Return a pointer to the node_data structure for the specified cnodeid. + */ +#define node_data(cnodeid) (local_node_data->node_data_ptrs[cnodeid]) + +/* + * Get a pointer to the node_data for the current cpu. + * (boot time only) + */ +ia64_node_data_t* get_node_data_ptr(void); + + +/* + * Given a compact node id, return a pointer to the pg_data_t for the node. + * The following 2 macros are similar. + * + * NODE_DATA - should be used in all code not related to system + * initialization. It uses pernode data structures to minimize + * offnode memory references. However, these structures are not + * present during boot. This macro can be used once cpu_init + * completes. + * + * BOOT_NODE_DATA - should be used during system initialization + * prior to freeing __initdata. It does not depend on the percpu + * area being present. + * + * NOTE: The names of these macros are misleading but are difficult to change + * since they are used in generic linux & on other architectures. + * We should consider changing these names in 2.5.x. 
+ */ +#define NODE_DATA(nid) (local_node_data->pg_data_ptrs[nid]) +#define BOOT_NODE_DATA(nid) boot_get_pg_data_ptr((long)(nid)) + +struct pglist_data; +extern struct pglist_data * __init boot_get_pg_data_ptr(long); + +#endif /* _ASM_IA64_NODEDATA_H */ + + diff -urNp acpi/include/asm-ia64/page.h disc/include/asm-ia64/page.h --- acpi/include/asm-ia64/page.h Mon Jul 29 15:23:26 2002 +++ disc/include/asm-ia64/page.h Fri Aug 9 12:59:01 2002 @@ -56,12 +56,7 @@ extern void copy_page (void *to, void *f # include # define virt_to_page(kaddr) (mem_map + platform_map_nr(kaddr)) # define page_to_phys(page) ((page - mem_map) << PAGE_SHIFT) -#elif defined (CONFIG_IA64_SGI_SN1) -# ifndef CONFIG_DISCONTIGMEM -# define virt_to_page(kaddr) (mem_map + MAP_NR_DENSE(kaddr)) -# define page_to_phys(page) XXX fix me -# endif -#else +#elif !defined (CONFIG_DISCONTIGMEM) # define virt_to_page(kaddr) (mem_map + MAP_NR_DENSE(kaddr)) # define page_to_phys(page) ((page - mem_map) << PAGE_SHIFT) #endif diff -urNp acpi/include/asm-ia64/pgtable.h disc/include/asm-ia64/pgtable.h --- acpi/include/asm-ia64/pgtable.h Mon Jul 29 15:23:26 2002 +++ disc/include/asm-ia64/pgtable.h Fri Aug 9 12:59:01 2002 @@ -221,6 +221,15 @@ ia64_phys_addr_valid (unsigned long addr * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. 
*/ +#ifdef CONFIG_DISCONTIGMEM +#define mk_pte(page,pgprot) \ +({ \ + pte_t __pte; \ + \ + pte_val(__pte) = (unsigned long)page_address(page) - PAGE_OFFSET + pgprot_val(pgprot); \ + __pte; \ +}) +#else #define mk_pte(page,pgprot) \ ({ \ pte_t __pte; \ @@ -228,6 +237,7 @@ ia64_phys_addr_valid (unsigned long addr pte_val(__pte) = ((page - mem_map) << PAGE_SHIFT) | pgprot_val(pgprot); \ __pte; \ }) +#endif /* This takes a physical page address that is used by the remapping functions */ #define mk_pte_phys(physpage, pgprot) \ diff -urNp acpi/include/asm-ia64/processor.h disc/include/asm-ia64/processor.h --- acpi/include/asm-ia64/processor.h Mon Jul 29 15:23:26 2002 +++ disc/include/asm-ia64/processor.h Fri Aug 9 12:59:01 2002 @@ -87,6 +87,10 @@ #include #include #include +#ifdef CONFIG_NUMA +#include +#endif + /* like above but expressed as bitfields for more efficient access: */ struct ia64_psr { @@ -187,8 +191,8 @@ struct cpuinfo_ia64 { } ipi; #endif #ifdef CONFIG_NUMA - void *node_directory; - int numa_node_id; + ia64_node_data_t *node_data; + int cnodeid; struct cpuinfo_ia64 *cpu_data[NR_CPUS]; #endif /* Platform specific word. MUST BE LAST IN STRUCT */ @@ -213,7 +217,9 @@ struct cpuinfo_ia64 { */ #ifdef CONFIG_NUMA # define cpu_data(cpu) local_cpu_data->cpu_data[cpu] -# define numa_node_id() (local_cpu_data->numa_node_id) +# define local_cnodeid() (local_cpu_data->cnodeid) +# define numa_node_id() local_cnodeid() /* obsolete */ + extern int boot_get_local_cnodeid(void); /* early boot only */ #else extern struct cpuinfo_ia64 _cpu_data[NR_CPUS]; # define cpu_data(cpu) (&_cpu_data[cpu]) diff -urNp acpi/mm/bootmem.c disc/mm/bootmem.c --- acpi/mm/bootmem.c Mon Jul 29 15:23:28 2002 +++ disc/mm/bootmem.c Fri Aug 9 12:59:01 2002 @@ -19,6 +19,9 @@ #include #include +#define in_bdata(b, a, s) (((a) >= (b)->node_boot_start) && \ + ((((a) + (s) + PAGE_SIZE - 1) / PAGE_SIZE) <= (b)->node_low_pfn)) + /* * Access to this subsystem has to be serialized externally. 
(this is * true for the boot process anyway) @@ -47,13 +50,22 @@ static unsigned long __init init_bootmem bootmem_data_t *bdata = pgdat->bdata; unsigned long mapsize = ((end - start)+7)/8; - pgdat->node_next = pgdat_list; - pgdat_list = pgdat; + /* find the right bank */ + while (bdata->node_low_pfn && end != bdata->node_low_pfn) + bdata++; + + if (bdata->node_low_pfn == 0) { BUG(); return 0; } + +#ifndef CONFIG_DISCONTIGMEM + /* first bank? add this pgdat to the list of all pgdats */ + if (bdata == pgdat->bdata) { + pgdat->node_next = pgdat_list; + pgdat_list = pgdat; + } +#endif mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL); bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); - bdata->node_boot_start = (start << PAGE_SHIFT); - bdata->node_low_pfn = end; /* * Initially all pages are reserved - setup_arch() has to @@ -249,41 +261,42 @@ found: static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) { + struct page *page = pgdat->node_mem_map, *bpage; bootmem_data_t *bdata = pgdat->bdata; unsigned long i, count, total = 0; - struct page *page; unsigned long idx; - if (!bdata->node_bootmem_map) BUG(); + while (bdata->node_low_pfn) { + if (!bdata->node_bootmem_map) BUG(); - count = 0; - idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); - for (i = find_first_zero_bit(bdata->node_bootmem_map, idx); - i < idx; - i = find_next_zero_bit(bdata->node_bootmem_map, idx, i + 1)) - { - page = pgdat->node_mem_map + i; - count++; - ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); - } - total += count; + count = 0; + idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); + for (i = 0; i < idx; i++, page++) { + if (!test_bit(i, bdata->node_bootmem_map)) { + count++; + ClearPageReserved(page); + set_page_count(page, 1); + __free_page(page); + } + } + total += count; - /* - * Now free the allocator bitmap itself, it's not - * needed anymore: - */ - page = 
virt_to_page(bdata->node_bootmem_map); - count = 0; - for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { - count++; - ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); + /* + * Now free the allocator bitmap itself, it's not + * needed anymore: + */ + bpage = virt_to_page(bdata->node_bootmem_map); + count = bootmem_bootmap_pages(bdata->node_low_pfn - + (bdata->node_boot_start >> PAGE_SHIFT)); + for (i = 0; i < count; i++,bpage++) { + ClearPageReserved(bpage); + set_page_count(bpage, 1); + __free_page(bpage); + } + total += count; + bdata->node_bootmem_map = NULL; + bdata++; } - total += count; - bdata->node_bootmem_map = NULL; return total; } @@ -295,12 +308,34 @@ unsigned long __init init_bootmem_node ( void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { - reserve_bootmem_core(pgdat->bdata, physaddr, size); + bootmem_data_t *bdata = pgdat->bdata; + + /* find the right bank */ + while (bdata->node_low_pfn && !in_bdata(bdata, physaddr, size)) + bdata++; + + if (bdata->node_low_pfn == 0) { + BUG(); + return; + } + + reserve_bootmem_core(bdata, physaddr, size); } void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { - return(free_bootmem_core(pgdat->bdata, physaddr, size)); + bootmem_data_t *bdata = pgdat->bdata; + + /* find the right bank */ + while (bdata->node_low_pfn && !in_bdata(bdata, physaddr, size)) + bdata++; + + if (bdata->node_low_pfn == 0) { + BUG(); + return; + } + + return(free_bootmem_core(bdata, physaddr, size)); } unsigned long __init free_all_bootmem_node (pg_data_t *pgdat) @@ -312,33 +347,86 @@ unsigned long __init init_bootmem (unsig { max_low_pfn = pages; min_low_pfn = start; + contig_page_data.bdata->node_boot_start = 0; + contig_page_data.bdata->node_low_pfn = pages; return(init_bootmem_core(&contig_page_data, start, 0, pages)); } void __init reserve_bootmem (unsigned 
long addr, unsigned long size) { - reserve_bootmem_core(contig_page_data.bdata, addr, size); + pg_data_t *pgdat = pgdat_list; + bootmem_data_t *bdata; + + while (pgdat) { + bdata = pgdat->bdata; + while (bdata->node_low_pfn != 0 && !in_bdata(bdata, addr, size)) + bdata++; + if (bdata->node_low_pfn) + return(reserve_bootmem_core(bdata, addr, size)); + pgdat = pgdat->node_next; + } + BUG(); } void __init free_bootmem (unsigned long addr, unsigned long size) { - return(free_bootmem_core(contig_page_data.bdata, addr, size)); + pg_data_t *pgdat = pgdat_list; + bootmem_data_t *bdata; + + while (pgdat) { + bdata = pgdat->bdata; + while (bdata->node_low_pfn != 0 && !in_bdata(bdata, addr, size)) + bdata++; + if (bdata->node_low_pfn) + return(free_bootmem_core(bdata, addr, size)); + pgdat = pgdat->node_next; + } + BUG(); } unsigned long __init free_all_bootmem (void) { - return(free_all_bootmem_core(&contig_page_data)); + pg_data_t *pgdat = pgdat_list; + unsigned long ret = 0; + + while (pgdat) { + ret += free_all_bootmem_core(pgdat); + pgdat = pgdat->node_next; + } + return(ret); } void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal) { pg_data_t *pgdat = pgdat_list; + bootmem_data_t *bdata; void *ptr; + if (goal) { + while (pgdat) { + bdata = pgdat->bdata; + while (bdata->node_low_pfn != 0 && !in_bdata(bdata, goal, size)) + bdata++; + if (bdata->node_low_pfn) { + ptr = __alloc_bootmem_core(bdata, size, align, goal); + if (ptr) + return ptr; + break; + } + pgdat = pgdat->node_next; + } + goal = 0; + } + + pgdat = pgdat_list; while (pgdat) { - if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, - align, goal))) - return(ptr); + bdata = pgdat->bdata; + while (bdata->node_low_pfn) { + ptr = __alloc_bootmem_core(bdata, size, align, goal); + if (ptr) + return(ptr); + bdata++; + } pgdat = pgdat->node_next; } /* @@ -352,10 +440,26 @@ void * __init __alloc_bootmem (unsigned void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long 
size, unsigned long align, unsigned long goal) { void *ptr; + bootmem_data_t *bdata = pgdat->bdata; - ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal); - if (ptr) - return (ptr); + if (goal) { + while (bdata->node_low_pfn != 0 && !in_bdata(bdata, goal, size)) + bdata++; + if (bdata->node_low_pfn) { + ptr = __alloc_bootmem_core(bdata, size, align, goal); + if (ptr) + return ptr; + } + goal = 0; + } + + bdata = pgdat->bdata; + while (bdata->node_low_pfn) { + ptr = __alloc_bootmem_core(bdata, size, align, goal); + if (ptr) + return (ptr); + bdata++; + } /* * Whoops, we cannot satisfy the allocation request.