i386: move mm
author  Thomas Gleixner <tglx@linutronix.de>
Thu, 11 Oct 2007 09:16:47 +0000 (11:16 +0200)
committer  Thomas Gleixner <tglx@linutronix.de>
Thu, 11 Oct 2007 09:16:47 +0000 (11:16 +0200)
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
29 files changed:
arch/i386/Makefile
arch/i386/mm/Makefile [deleted file]
arch/i386/mm/Makefile_32 [deleted file]
arch/i386/mm/boot_ioremap_32.c [deleted file]
arch/i386/mm/discontig_32.c [deleted file]
arch/i386/mm/extable_32.c [deleted file]
arch/i386/mm/fault_32.c [deleted file]
arch/i386/mm/highmem_32.c [deleted file]
arch/i386/mm/hugetlbpage.c [deleted file]
arch/i386/mm/init_32.c [deleted file]
arch/i386/mm/ioremap_32.c [deleted file]
arch/i386/mm/mmap_32.c [deleted file]
arch/i386/mm/pageattr_32.c [deleted file]
arch/i386/mm/pgtable_32.c [deleted file]
arch/x86/mm/Makefile [new file with mode: 0644]
arch/x86/mm/Makefile_32 [new file with mode: 0644]
arch/x86/mm/boot_ioremap_32.c [new file with mode: 0644]
arch/x86/mm/discontig_32.c [new file with mode: 0644]
arch/x86/mm/extable_32.c [new file with mode: 0644]
arch/x86/mm/fault_32.c [new file with mode: 0644]
arch/x86/mm/highmem_32.c [new file with mode: 0644]
arch/x86/mm/hugetlbpage.c [new file with mode: 0644]
arch/x86/mm/init_32.c [new file with mode: 0644]
arch/x86/mm/ioremap_32.c [new file with mode: 0644]
arch/x86/mm/mmap_32.c [new file with mode: 0644]
arch/x86/mm/pageattr_32.c [new file with mode: 0644]
arch/x86/mm/pgtable_32.c [new file with mode: 0644]
arch/x86_64/mm/Makefile
arch/x86_64/mm/Makefile_64

diff --git a/arch/i386/Makefile b/arch/i386/Makefile
index 776d8dc..cbdc14f 100644 (file)
@@ -103,7 +103,7 @@ head-y := arch/i386/kernel/head_32.o arch/i386/kernel/init_task_32.o
 
 libs-y                                         += arch/x86/lib/
 core-y                                 += arch/i386/kernel/ \
-                                          arch/i386/mm/ \
+                                          arch/x86/mm/ \
                                           $(mcore-y)/ \
                                           arch/x86/crypto/
 drivers-$(CONFIG_MATH_EMULATION)       += arch/x86/math-emu/
diff --git a/arch/i386/mm/Makefile b/arch/i386/mm/Makefile
deleted file mode 100644 (file)
index 4042f85..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-ifeq ($(CONFIG_X86_32),y)
-include ${srctree}/arch/i386/mm/Makefile_32
-else
-include ${srctree}/arch/x86_64/mm/Makefile_64
-endif
diff --git a/arch/i386/mm/Makefile_32 b/arch/i386/mm/Makefile_32
deleted file mode 100644 (file)
index 362b4ad..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-#
-# Makefile for the linux i386-specific parts of the memory manager.
-#
-
-obj-y  := init_32.o pgtable_32.o fault_32.o ioremap_32.o extable_32.o pageattr_32.o mmap_32.o
-
-obj-$(CONFIG_NUMA) += discontig_32.o
-obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_HIGHMEM) += highmem_32.o
-obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap_32.o
diff --git a/arch/i386/mm/boot_ioremap_32.c b/arch/i386/mm/boot_ioremap_32.c
deleted file mode 100644 (file)
index 4de95a1..0000000
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * arch/i386/mm/boot_ioremap.c
- * 
- * Re-map functions for early boot-time before paging_init() when the 
- * boot-time pagetables are still in use
- *
- * Written by Dave Hansen <haveblue@us.ibm.com>
- */
-
-
-/*
- * We need to use the 2-level pagetable functions, but CONFIG_X86_PAE
- * keeps that from happening.  If anyone has a better way, I'm listening.
- *
- * boot_pte_t is defined only if this all works correctly
- */
-
-#undef CONFIG_X86_PAE
-#undef CONFIG_PARAVIRT
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-#include <linux/init.h>
-#include <linux/stddef.h>
-
-/* 
- * I'm cheating here.  It is known that the two boot PTE pages are 
- * allocated next to each other.  I'm pretending that they're just
- * one big array. 
- */
-
-#define BOOT_PTE_PTRS (PTRS_PER_PTE*2)
-
-static unsigned long boot_pte_index(unsigned long vaddr) 
-{
-       return __pa(vaddr) >> PAGE_SHIFT;
-}
-
-static inline boot_pte_t* boot_vaddr_to_pte(void *address)
-{
-       boot_pte_t* boot_pg = (boot_pte_t*)pg0;
-       return &boot_pg[boot_pte_index((unsigned long)address)];
-}
-
-/*
- * This is only for a caller who is clever enough to page-align
- * phys_addr and virtual_source, and who also has a preference
- * about which virtual address from which to steal ptes
- */
-static void __boot_ioremap(unsigned long phys_addr, unsigned long nrpages, 
-                   void* virtual_source)
-{
-       boot_pte_t* pte;
-       int i;
-       char *vaddr = virtual_source;
-
-       pte = boot_vaddr_to_pte(virtual_source);
-       for (i=0; i < nrpages; i++, phys_addr += PAGE_SIZE, pte++) {
-               set_pte(pte, pfn_pte(phys_addr>>PAGE_SHIFT, PAGE_KERNEL));
-               __flush_tlb_one(&vaddr[i*PAGE_SIZE]);
-       }
-}
-
-/* the virtual space we're going to remap comes from this array */
-#define BOOT_IOREMAP_PAGES 4
-#define BOOT_IOREMAP_SIZE (BOOT_IOREMAP_PAGES*PAGE_SIZE)
-static __initdata char boot_ioremap_space[BOOT_IOREMAP_SIZE]
-                      __attribute__ ((aligned (PAGE_SIZE)));
-
-/*
- * This only applies to things which need to ioremap before paging_init();
- * bt_ioremap() and plain ioremap() are both useless at this point.
- * 
- * When used, we're still using the boot-time pagetables, which only
- * have 2 PTE pages mapping the first 8MB
- *
- * There is no unmap.  The boot-time PTE pages aren't used after boot.
- * If you really want the space back, just remap it yourself.
- * boot_ioremap(&ioremap_space-PAGE_OFFSET, BOOT_IOREMAP_SIZE)
- */
-__init void* boot_ioremap(unsigned long phys_addr, unsigned long size)
-{
-       unsigned long last_addr, offset;
-       unsigned int nrpages;
-       
-       last_addr = phys_addr + size - 1;
-
-       /* page align the requested address */
-       offset = phys_addr & ~PAGE_MASK;
-       phys_addr &= PAGE_MASK;
-       size = PAGE_ALIGN(last_addr) - phys_addr;
-       
-       nrpages = size >> PAGE_SHIFT;
-       if (nrpages > BOOT_IOREMAP_PAGES)
-               return NULL;
-       
-       __boot_ioremap(phys_addr, nrpages, boot_ioremap_space);
-
-       return &boot_ioremap_space[offset];
-}
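
The header comments above describe the constraint this helper works under: only the two boot-time PTE pages mapping the first 8MB are in use, at most BOOT_IOREMAP_PAGES (4) pages can be remapped, and there is no unmap. A minimal, hypothetical caller is sketched below; early_read_firmware_table() and its arguments are invented for illustration and are not part of this commit.

/*
 * Hypothetical usage sketch, not part of this commit: map a small
 * firmware table before paging_init() and parse it in place.
 * boot_ioremap() is the function defined in the file above.
 */
static int __init early_read_firmware_table(unsigned long table_phys,
					    unsigned long table_len)
{
	void *table;

	/* boot_ioremap() returns NULL if more than 4 pages would be needed */
	table = boot_ioremap(table_phys, table_len);
	if (!table)
		return -1;

	/* ... parse the table through 'table'; there is no boot_iounmap() ... */
	return 0;
}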
diff --git a/arch/i386/mm/discontig_32.c b/arch/i386/mm/discontig_32.c
deleted file mode 100644 (file)
index 860e912..0000000
+++ /dev/null
@@ -1,431 +0,0 @@
-/*
- * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation
- * August 2002: added remote node KVA remap - Martin J. Bligh 
- *
- * Copyright (C) 2002, IBM Corp.
- *
- * All rights reserved.          
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <linux/mm.h>
-#include <linux/bootmem.h>
-#include <linux/mmzone.h>
-#include <linux/highmem.h>
-#include <linux/initrd.h>
-#include <linux/nodemask.h>
-#include <linux/module.h>
-#include <linux/kexec.h>
-#include <linux/pfn.h>
-#include <linux/swap.h>
-
-#include <asm/e820.h>
-#include <asm/setup.h>
-#include <asm/mmzone.h>
-#include <bios_ebda.h>
-
-struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
-EXPORT_SYMBOL(node_data);
-bootmem_data_t node0_bdata;
-
-/*
- * numa interface - we expect the numa architecture specific code to have
- *                  populated the following initialisation.
- *
- * 1) node_online_map  - the map of all nodes configured (online) in the system
- * 2) node_start_pfn   - the starting page frame number for a node
- * 3) node_end_pfn     - the ending page frame number for a node
- */
-unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
-unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
-
-
-#ifdef CONFIG_DISCONTIGMEM
-/*
- * 4) physnode_map     - the mapping between a pfn and owning node
- * physnode_map keeps track of the physical memory layout of a generic
- * numa node on a 256Mb break (each element of the array will
- * represent 256Mb of memory and will be marked by the node id.  so,
- * if the first gig is on node 0, and the second gig is on node 1
- * physnode_map will contain:
- *
- *     physnode_map[0-3] = 0;
- *     physnode_map[4-7] = 1;
- *     physnode_map[8- ] = -1;
- */
-s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
-EXPORT_SYMBOL(physnode_map);
-
-void memory_present(int nid, unsigned long start, unsigned long end)
-{
-       unsigned long pfn;
-
-       printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n",
-                       nid, start, end);
-       printk(KERN_DEBUG "  Setting physnode_map array to node %d for pfns:\n", nid);
-       printk(KERN_DEBUG "  ");
-       for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
-               physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
-               printk("%ld ", pfn);
-       }
-       printk("\n");
-}
-
-unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
-                                             unsigned long end_pfn)
-{
-       unsigned long nr_pages = end_pfn - start_pfn;
-
-       if (!nr_pages)
-               return 0;
-
-       return (nr_pages + 1) * sizeof(struct page);
-}
-#endif
-
-extern unsigned long find_max_low_pfn(void);
-extern void add_one_highpage_init(struct page *, int, int);
-extern unsigned long highend_pfn, highstart_pfn;
-
-#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
-
-unsigned long node_remap_start_pfn[MAX_NUMNODES];
-unsigned long node_remap_size[MAX_NUMNODES];
-unsigned long node_remap_offset[MAX_NUMNODES];
-void *node_remap_start_vaddr[MAX_NUMNODES];
-void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-
-void *node_remap_end_vaddr[MAX_NUMNODES];
-void *node_remap_alloc_vaddr[MAX_NUMNODES];
-static unsigned long kva_start_pfn;
-static unsigned long kva_pages;
-/*
- * FLAT - support for basic PC memory model with discontig enabled, essentially
- *        a single node with all available processors in it with a flat
- *        memory map.
- */
-int __init get_memcfg_numa_flat(void)
-{
-       printk("NUMA - single node, flat memory mode\n");
-
-       /* Run the memory configuration and find the top of memory. */
-       find_max_pfn();
-       node_start_pfn[0] = 0;
-       node_end_pfn[0] = max_pfn;
-       memory_present(0, 0, max_pfn);
-
-        /* Indicate there is one node available. */
-       nodes_clear(node_online_map);
-       node_set_online(0);
-       return 1;
-}
-
-/*
- * Find the highest page frame number we have available for the node
- */
-static void __init find_max_pfn_node(int nid)
-{
-       if (node_end_pfn[nid] > max_pfn)
-               node_end_pfn[nid] = max_pfn;
-       /*
-        * if a user has given mem=XXXX, then we need to make sure 
-        * that the node _starts_ before that, too, not just ends
-        */
-       if (node_start_pfn[nid] > max_pfn)
-               node_start_pfn[nid] = max_pfn;
-       BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]);
-}
-
-/* 
- * Allocate memory for the pg_data_t for this node via a crude pre-bootmem
- * method.  For node zero take this from the bottom of memory, for
- * subsequent nodes place them at node_remap_start_vaddr which contains
- * node local data in physically node local memory.  See setup_memory()
- * for details.
- */
-static void __init allocate_pgdat(int nid)
-{
-       if (nid && node_has_online_mem(nid))
-               NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
-       else {
-               NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
-               min_low_pfn += PFN_UP(sizeof(pg_data_t));
-       }
-}
-
-void *alloc_remap(int nid, unsigned long size)
-{
-       void *allocation = node_remap_alloc_vaddr[nid];
-
-       size = ALIGN(size, L1_CACHE_BYTES);
-
-       if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
-               return 0;
-
-       node_remap_alloc_vaddr[nid] += size;
-       memset(allocation, 0, size);
-
-       return allocation;
-}
-
-void __init remap_numa_kva(void)
-{
-       void *vaddr;
-       unsigned long pfn;
-       int node;
-
-       for_each_online_node(node) {
-               for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
-                       vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
-                       set_pmd_pfn((ulong) vaddr, 
-                               node_remap_start_pfn[node] + pfn, 
-                               PAGE_KERNEL_LARGE);
-               }
-       }
-}
-
-static unsigned long calculate_numa_remap_pages(void)
-{
-       int nid;
-       unsigned long size, reserve_pages = 0;
-       unsigned long pfn;
-
-       for_each_online_node(nid) {
-               unsigned old_end_pfn = node_end_pfn[nid];
-
-               /*
-                * The acpi/srat node info can show hot-add memory zones
-                * where memory could be added but not currently present.
-                */
-               if (node_start_pfn[nid] > max_pfn)
-                       continue;
-               if (node_end_pfn[nid] > max_pfn)
-                       node_end_pfn[nid] = max_pfn;
-
-               /* ensure the remap includes space for the pgdat. */
-               size = node_remap_size[nid] + sizeof(pg_data_t);
-
-               /* convert size to large (pmd size) pages, rounding up */
-               size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
-               /* now the roundup is correct, convert to PAGE_SIZE pages */
-               size = size * PTRS_PER_PTE;
-
-               /*
-                * Validate the region we are allocating only contains valid
-                * pages.
-                */
-               for (pfn = node_end_pfn[nid] - size;
-                    pfn < node_end_pfn[nid]; pfn++)
-                       if (!page_is_ram(pfn))
-                               break;
-
-               if (pfn != node_end_pfn[nid])
-                       size = 0;
-
-               printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
-                               size, nid);
-               node_remap_size[nid] = size;
-               node_remap_offset[nid] = reserve_pages;
-               reserve_pages += size;
-               printk("Shrinking node %d from %ld pages to %ld pages\n",
-                       nid, node_end_pfn[nid], node_end_pfn[nid] - size);
-
-               if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) {
-                       /*
-                        * Align node_end_pfn[] and node_remap_start_pfn[] to
-                        * pmd boundary. remap_numa_kva will barf otherwise.
-                        */
-                       printk("Shrinking node %d further by %ld pages for proper alignment\n",
-                               nid, node_end_pfn[nid] & (PTRS_PER_PTE-1));
-                       size +=  node_end_pfn[nid] & (PTRS_PER_PTE-1);
-               }
-
-               node_end_pfn[nid] -= size;
-               node_remap_start_pfn[nid] = node_end_pfn[nid];
-               shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]);
-       }
-       printk("Reserving total of %ld pages for numa KVA remap\n",
-                       reserve_pages);
-       return reserve_pages;
-}
-
-extern void setup_bootmem_allocator(void);
-unsigned long __init setup_memory(void)
-{
-       int nid;
-       unsigned long system_start_pfn, system_max_low_pfn;
-
-       /*
-        * When mapping a NUMA machine we allocate the node_mem_map arrays
-        * from node local memory.  They are then mapped directly into KVA
-        * between zone normal and vmalloc space.  Calculate the size of
-        * this space and use it to adjust the boundary between ZONE_NORMAL
-        * and ZONE_HIGHMEM.
-        */
-       find_max_pfn();
-       get_memcfg_numa();
-
-       kva_pages = calculate_numa_remap_pages();
-
-       /* partially used pages are not usable - thus round upwards */
-       system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
-
-       kva_start_pfn = find_max_low_pfn() - kva_pages;
-
-#ifdef CONFIG_BLK_DEV_INITRD
-       /* Numa kva area is below the initrd */
-       if (LOADER_TYPE && INITRD_START)
-               kva_start_pfn = PFN_DOWN(INITRD_START)  - kva_pages;
-#endif
-       kva_start_pfn -= kva_start_pfn & (PTRS_PER_PTE-1);
-
-       system_max_low_pfn = max_low_pfn = find_max_low_pfn();
-       printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
-               kva_start_pfn, max_low_pfn);
-       printk("max_pfn = %ld\n", max_pfn);
-#ifdef CONFIG_HIGHMEM
-       highstart_pfn = highend_pfn = max_pfn;
-       if (max_pfn > system_max_low_pfn)
-               highstart_pfn = system_max_low_pfn;
-       printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
-              pages_to_mb(highend_pfn - highstart_pfn));
-       num_physpages = highend_pfn;
-       high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
-#else
-       num_physpages = system_max_low_pfn;
-       high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1;
-#endif
-       printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
-                       pages_to_mb(system_max_low_pfn));
-       printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", 
-                       min_low_pfn, max_low_pfn, highstart_pfn);
-
-       printk("Low memory ends at vaddr %08lx\n",
-                       (ulong) pfn_to_kaddr(max_low_pfn));
-       for_each_online_node(nid) {
-               node_remap_start_vaddr[nid] = pfn_to_kaddr(
-                               kva_start_pfn + node_remap_offset[nid]);
-               /* Init the node remap allocator */
-               node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
-                       (node_remap_size[nid] * PAGE_SIZE);
-               node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
-                       ALIGN(sizeof(pg_data_t), PAGE_SIZE);
-
-               allocate_pgdat(nid);
-               printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
-                       (ulong) node_remap_start_vaddr[nid],
-                       (ulong) pfn_to_kaddr(highstart_pfn
-                          + node_remap_offset[nid] + node_remap_size[nid]));
-       }
-       printk("High memory starts at vaddr %08lx\n",
-                       (ulong) pfn_to_kaddr(highstart_pfn));
-       for_each_online_node(nid)
-               find_max_pfn_node(nid);
-
-       memset(NODE_DATA(0), 0, sizeof(struct pglist_data));
-       NODE_DATA(0)->bdata = &node0_bdata;
-       setup_bootmem_allocator();
-       return max_low_pfn;
-}
-
-void __init numa_kva_reserve(void)
-{
-       reserve_bootmem(PFN_PHYS(kva_start_pfn),PFN_PHYS(kva_pages));
-}
-
-void __init zone_sizes_init(void)
-{
-       int nid;
-       unsigned long max_zone_pfns[MAX_NR_ZONES];
-       memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-       max_zone_pfns[ZONE_DMA] =
-               virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-       max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-#ifdef CONFIG_HIGHMEM
-       max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
-#endif
-
-       /* If SRAT has not registered memory, register it now */
-       if (find_max_pfn_with_active_regions() == 0) {
-               for_each_online_node(nid) {
-                       if (node_has_online_mem(nid))
-                               add_active_range(nid, node_start_pfn[nid],
-                                                       node_end_pfn[nid]);
-               }
-       }
-
-       free_area_init_nodes(max_zone_pfns);
-       return;
-}
-
-void __init set_highmem_pages_init(int bad_ppro) 
-{
-#ifdef CONFIG_HIGHMEM
-       struct zone *zone;
-       struct page *page;
-
-       for_each_zone(zone) {
-               unsigned long node_pfn, zone_start_pfn, zone_end_pfn;
-
-               if (!is_highmem(zone))
-                       continue;
-
-               zone_start_pfn = zone->zone_start_pfn;
-               zone_end_pfn = zone_start_pfn + zone->spanned_pages;
-
-               printk("Initializing %s for node %d (%08lx:%08lx)\n",
-                               zone->name, zone_to_nid(zone),
-                               zone_start_pfn, zone_end_pfn);
-
-               for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) {
-                       if (!pfn_valid(node_pfn))
-                               continue;
-                       page = pfn_to_page(node_pfn);
-                       add_one_highpage_init(page, node_pfn, bad_ppro);
-               }
-       }
-       totalram_pages += totalhigh_pages;
-#endif
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-int paddr_to_nid(u64 addr)
-{
-       int nid;
-       unsigned long pfn = PFN_DOWN(addr);
-
-       for_each_node(nid)
-               if (node_start_pfn[nid] <= pfn &&
-                   pfn < node_end_pfn[nid])
-                       return nid;
-
-       return -1;
-}
-
-/*
- * This function is used to ask node id BEFORE memmap and mem_section's
- * initialization (pfn_to_nid() can't be used yet).
- * If _PXM is not defined on ACPI's DSDT, node id must be found by this.
- */
-int memory_add_physaddr_to_nid(u64 addr)
-{
-       int nid = paddr_to_nid(addr);
-       return (nid >= 0) ? nid : 0;
-}
-
-EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
-#endif
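
The physnode_map comment near the top of this file documents a 256MB-granular pfn-to-node table. The lookup it implies is a single array index, roughly as sketched below; pfn_to_nid_sketch() is an invented name for illustration only, while physnode_map and PAGES_PER_ELEMENT are the symbols the file above relies on (the latter coming from <asm/mmzone.h>).

/*
 * Illustrative sketch only, not part of this commit: the pfn-to-node
 * lookup implied by the 256MB-granular physnode_map documented above.
 */
static inline int pfn_to_nid_sketch(unsigned long pfn)
{
	int nid = physnode_map[pfn / PAGES_PER_ELEMENT];

	/* -1 marks ranges no node has registered; fall back to node 0 */
	return nid < 0 ? 0 : nid;
}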
diff --git a/arch/i386/mm/extable_32.c b/arch/i386/mm/extable_32.c
deleted file mode 100644 (file)
index 0ce4f22..0000000
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * linux/arch/i386/mm/extable.c
- */
-
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <asm/uaccess.h>
-
-int fixup_exception(struct pt_regs *regs)
-{
-       const struct exception_table_entry *fixup;
-
-#ifdef CONFIG_PNPBIOS
-       if (unlikely(SEGMENT_IS_PNP_CODE(regs->xcs)))
-       {
-               extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
-               extern u32 pnp_bios_is_utter_crap;
-               pnp_bios_is_utter_crap = 1;
-               printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
-               __asm__ volatile(
-                       "movl %0, %%esp\n\t"
-                       "jmp *%1\n\t"
-                       : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
-               panic("do_trap: can't hit this");
-       }
-#endif
-
-       fixup = search_exception_tables(regs->eip);
-       if (fixup) {
-               regs->eip = fixup->fixup;
-               return 1;
-       }
-
-       return 0;
-}
diff --git a/arch/i386/mm/fault_32.c b/arch/i386/mm/fault_32.c
deleted file mode 100644 (file)
index fcb38e7..0000000
+++ /dev/null
@@ -1,657 +0,0 @@
-/*
- *  linux/arch/i386/mm/fault.c
- *
- *  Copyright (C) 1995  Linus Torvalds
- */
-
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/tty.h>
-#include <linux/vt_kern.h>             /* For unblank_screen() */
-#include <linux/highmem.h>
-#include <linux/bootmem.h>             /* for max_low_pfn */
-#include <linux/vmalloc.h>
-#include <linux/module.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-#include <linux/kdebug.h>
-
-#include <asm/system.h>
-#include <asm/desc.h>
-#include <asm/segment.h>
-
-extern void die(const char *,struct pt_regs *,long);
-
-static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
-
-int register_page_fault_notifier(struct notifier_block *nb)
-{
-       vmalloc_sync_all();
-       return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
-}
-EXPORT_SYMBOL_GPL(register_page_fault_notifier);
-
-int unregister_page_fault_notifier(struct notifier_block *nb)
-{
-       return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
-}
-EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
-
-static inline int notify_page_fault(struct pt_regs *regs, long err)
-{
-       struct die_args args = {
-               .regs = regs,
-               .str = "page fault",
-               .err = err,
-               .trapnr = 14,
-               .signr = SIGSEGV
-       };
-       return atomic_notifier_call_chain(&notify_page_fault_chain,
-                                         DIE_PAGE_FAULT, &args);
-}
-
-/*
- * Return EIP plus the CS segment base.  The segment limit is also
- * adjusted, clamped to the kernel/user address space (whichever is
- * appropriate), and returned in *eip_limit.
- *
- * The segment is checked, because it might have been changed by another
- * task between the original faulting instruction and here.
- *
- * If CS is no longer a valid code segment, or if EIP is beyond the
- * limit, or if it is a kernel address when CS is not a kernel segment,
- * then the returned value will be greater than *eip_limit.
- * 
- * This is slow, but is very rarely executed.
- */
-static inline unsigned long get_segment_eip(struct pt_regs *regs,
-                                           unsigned long *eip_limit)
-{
-       unsigned long eip = regs->eip;
-       unsigned seg = regs->xcs & 0xffff;
-       u32 seg_ar, seg_limit, base, *desc;
-
-       /* Unlikely, but must come before segment checks. */
-       if (unlikely(regs->eflags & VM_MASK)) {
-               base = seg << 4;
-               *eip_limit = base + 0xffff;
-               return base + (eip & 0xffff);
-       }
-
-       /* The standard kernel/user address space limit. */
-       *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
-       
-       /* By far the most common cases. */
-       if (likely(SEGMENT_IS_FLAT_CODE(seg)))
-               return eip;
-
-       /* Check the segment exists, is within the current LDT/GDT size,
-          that kernel/user (ring 0..3) has the appropriate privilege,
-          that it's a code segment, and get the limit. */
-       __asm__ ("larl %3,%0; lsll %3,%1"
-                : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
-       if ((~seg_ar & 0x9800) || eip > seg_limit) {
-               *eip_limit = 0;
-               return 1;        /* So that returned eip > *eip_limit. */
-       }
-
-       /* Get the GDT/LDT descriptor base. 
-          When you look for races in this code remember that
-          LDT and other horrors are only used in user space. */
-       if (seg & (1<<2)) {
-               /* Must lock the LDT while reading it. */
-               down(&current->mm->context.sem);
-               desc = current->mm->context.ldt;
-               desc = (void *)desc + (seg & ~7);
-       } else {
-               /* Must disable preemption while reading the GDT. */
-               desc = (u32 *)get_cpu_gdt_table(get_cpu());
-               desc = (void *)desc + (seg & ~7);
-       }
-
-       /* Decode the code segment base from the descriptor */
-       base = get_desc_base((unsigned long *)desc);
-
-       if (seg & (1<<2)) { 
-               up(&current->mm->context.sem);
-       } else
-               put_cpu();
-
-       /* Adjust EIP and segment limit, and clamp at the kernel limit.
-          It's legitimate for segments to wrap at 0xffffffff. */
-       seg_limit += base;
-       if (seg_limit < *eip_limit && seg_limit >= base)
-               *eip_limit = seg_limit;
-       return eip + base;
-}
-
-/* 
- * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
- * Check that here and ignore it.
- */
-static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
-{ 
-       unsigned long limit;
-       unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
-       int scan_more = 1;
-       int prefetch = 0; 
-       int i;
-
-       for (i = 0; scan_more && i < 15; i++) { 
-               unsigned char opcode;
-               unsigned char instr_hi;
-               unsigned char instr_lo;
-
-               if (instr > (unsigned char *)limit)
-                       break;
-               if (probe_kernel_address(instr, opcode))
-                       break; 
-
-               instr_hi = opcode & 0xf0; 
-               instr_lo = opcode & 0x0f; 
-               instr++;
-
-               switch (instr_hi) { 
-               case 0x20:
-               case 0x30:
-                       /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
-                       scan_more = ((instr_lo & 7) == 0x6);
-                       break;
-                       
-               case 0x60:
-                       /* 0x64 thru 0x67 are valid prefixes in all modes. */
-                       scan_more = (instr_lo & 0xC) == 0x4;
-                       break;          
-               case 0xF0:
-                       /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
-                       scan_more = !instr_lo || (instr_lo>>1) == 1;
-                       break;                  
-               case 0x00:
-                       /* Prefetch instruction is 0x0F0D or 0x0F18 */
-                       scan_more = 0;
-                       if (instr > (unsigned char *)limit)
-                               break;
-                       if (probe_kernel_address(instr, opcode))
-                               break;
-                       prefetch = (instr_lo == 0xF) &&
-                               (opcode == 0x0D || opcode == 0x18);
-                       break;                  
-               default:
-                       scan_more = 0;
-                       break;
-               } 
-       }
-       return prefetch;
-}
-
-static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
-                             unsigned long error_code)
-{
-       if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-                    boot_cpu_data.x86 >= 6)) {
-               /* Catch an obscure case of prefetch inside an NX page. */
-               if (nx_enabled && (error_code & 16))
-                       return 0;
-               return __is_prefetch(regs, addr);
-       }
-       return 0;
-} 
-
-static noinline void force_sig_info_fault(int si_signo, int si_code,
-       unsigned long address, struct task_struct *tsk)
-{
-       siginfo_t info;
-
-       info.si_signo = si_signo;
-       info.si_errno = 0;
-       info.si_code = si_code;
-       info.si_addr = (void __user *)address;
-       force_sig_info(si_signo, &info, tsk);
-}
-
-fastcall void do_invalid_op(struct pt_regs *, unsigned long);
-
-static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
-{
-       unsigned index = pgd_index(address);
-       pgd_t *pgd_k;
-       pud_t *pud, *pud_k;
-       pmd_t *pmd, *pmd_k;
-
-       pgd += index;
-       pgd_k = init_mm.pgd + index;
-
-       if (!pgd_present(*pgd_k))
-               return NULL;
-
-       /*
-        * set_pgd(pgd, *pgd_k); here would be useless on PAE
-        * and redundant with the set_pmd() on non-PAE. As would
-        * set_pud.
-        */
-
-       pud = pud_offset(pgd, address);
-       pud_k = pud_offset(pgd_k, address);
-       if (!pud_present(*pud_k))
-               return NULL;
-
-       pmd = pmd_offset(pud, address);
-       pmd_k = pmd_offset(pud_k, address);
-       if (!pmd_present(*pmd_k))
-               return NULL;
-       if (!pmd_present(*pmd)) {
-               set_pmd(pmd, *pmd_k);
-               arch_flush_lazy_mmu_mode();
-       } else
-               BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
-       return pmd_k;
-}
-
-/*
- * Handle a fault on the vmalloc or module mapping area
- *
- * This assumes no large pages in there.
- */
-static inline int vmalloc_fault(unsigned long address)
-{
-       unsigned long pgd_paddr;
-       pmd_t *pmd_k;
-       pte_t *pte_k;
-       /*
-        * Synchronize this task's top level page-table
-        * with the 'reference' page table.
-        *
-        * Do _not_ use "current" here. We might be inside
-        * an interrupt in the middle of a task switch..
-        */
-       pgd_paddr = read_cr3();
-       pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
-       if (!pmd_k)
-               return -1;
-       pte_k = pte_offset_kernel(pmd_k, address);
-       if (!pte_present(*pte_k))
-               return -1;
-       return 0;
-}
-
-int show_unhandled_signals = 1;
-
-/*
- * This routine handles page faults.  It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
- *
- * error_code:
- *     bit 0 == 0 means no page found, 1 means protection fault
- *     bit 1 == 0 means read, 1 means write
- *     bit 2 == 0 means kernel, 1 means user-mode
- *     bit 3 == 1 means use of reserved bit detected
- *     bit 4 == 1 means fault was an instruction fetch
- */
-fastcall void __kprobes do_page_fault(struct pt_regs *regs,
-                                     unsigned long error_code)
-{
-       struct task_struct *tsk;
-       struct mm_struct *mm;
-       struct vm_area_struct * vma;
-       unsigned long address;
-       int write, si_code;
-       int fault;
-
-       /* get the address */
-        address = read_cr2();
-
-       tsk = current;
-
-       si_code = SEGV_MAPERR;
-
-       /*
-        * We fault-in kernel-space virtual memory on-demand. The
-        * 'reference' page table is init_mm.pgd.
-        *
-        * NOTE! We MUST NOT take any locks for this case. We may
-        * be in an interrupt or a critical region, and should
-        * only copy the information from the master page table,
-        * nothing more.
-        *
-        * This verifies that the fault happens in kernel space
-        * (error_code & 4) == 0, and that the fault was not a
-        * protection error (error_code & 9) == 0.
-        */
-       if (unlikely(address >= TASK_SIZE)) {
-               if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
-                       return;
-               if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
-                       return;
-               /*
-                * Don't take the mm semaphore here. If we fixup a prefetch
-                * fault we could otherwise deadlock.
-                */
-               goto bad_area_nosemaphore;
-       }
-
-       if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
-               return;
-
-       /* It's safe to allow irq's after cr2 has been saved and the vmalloc
-          fault has been handled. */
-       if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
-               local_irq_enable();
-
-       mm = tsk->mm;
-
-       /*
-        * If we're in an interrupt, have no user context or are running in an
-        * atomic region then we must not take the fault..
-        */
-       if (in_atomic() || !mm)
-               goto bad_area_nosemaphore;
-
-       /* When running in the kernel we expect faults to occur only to
-        * addresses in user space.  All other faults represent errors in the
-        * kernel and should generate an OOPS.  Unfortunately, in the case of an
-        * erroneous fault occurring in a code path which already holds mmap_sem
-        * we will deadlock attempting to validate the fault against the
-        * address space.  Luckily the kernel only validly references user
-        * space from well defined areas of code, which are listed in the
-        * exceptions table.
-        *
-        * As the vast majority of faults will be valid we will only perform
-        * the source reference check when there is a possibility of a deadlock.
-        * Attempt to lock the address space, if we cannot we then validate the
-        * source.  If this is invalid we can skip the address space check,
-        * thus avoiding the deadlock.
-        */
-       if (!down_read_trylock(&mm->mmap_sem)) {
-               if ((error_code & 4) == 0 &&
-                   !search_exception_tables(regs->eip))
-                       goto bad_area_nosemaphore;
-               down_read(&mm->mmap_sem);
-       }
-
-       vma = find_vma(mm, address);
-       if (!vma)
-               goto bad_area;
-       if (vma->vm_start <= address)
-               goto good_area;
-       if (!(vma->vm_flags & VM_GROWSDOWN))
-               goto bad_area;
-       if (error_code & 4) {
-               /*
-                * Accessing the stack below %esp is always a bug.
-                * The large cushion allows instructions like enter
-                * and pusha to work.  ("enter $65535,$31" pushes
-                * 32 pointers and then decrements %esp by 65535.)
-                */
-               if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
-                       goto bad_area;
-       }
-       if (expand_stack(vma, address))
-               goto bad_area;
-/*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
- */
-good_area:
-       si_code = SEGV_ACCERR;
-       write = 0;
-       switch (error_code & 3) {
-               default:        /* 3: write, present */
-                               /* fall through */
-               case 2:         /* write, not present */
-                       if (!(vma->vm_flags & VM_WRITE))
-                               goto bad_area;
-                       write++;
-                       break;
-               case 1:         /* read, present */
-                       goto bad_area;
-               case 0:         /* read, not present */
-                       if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
-                               goto bad_area;
-       }
-
- survive:
-       /*
-        * If for any reason at all we couldn't handle the fault,
-        * make sure we exit gracefully rather than endlessly redo
-        * the fault.
-        */
-       fault = handle_mm_fault(mm, vma, address, write);
-       if (unlikely(fault & VM_FAULT_ERROR)) {
-               if (fault & VM_FAULT_OOM)
-                       goto out_of_memory;
-               else if (fault & VM_FAULT_SIGBUS)
-                       goto do_sigbus;
-               BUG();
-       }
-       if (fault & VM_FAULT_MAJOR)
-               tsk->maj_flt++;
-       else
-               tsk->min_flt++;
-
-       /*
-        * Did it hit the DOS screen memory VA from vm86 mode?
-        */
-       if (regs->eflags & VM_MASK) {
-               unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
-               if (bit < 32)
-                       tsk->thread.screen_bitmap |= 1 << bit;
-       }
-       up_read(&mm->mmap_sem);
-       return;
-
-/*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
- */
-bad_area:
-       up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
-       /* User mode accesses just cause a SIGSEGV */
-       if (error_code & 4) {
-               /*
-                * It's possible to have interrupts off here.
-                */
-               local_irq_enable();
-
-               /* 
-                * Valid to do another page fault here because this one came 
-                * from user space.
-                */
-               if (is_prefetch(regs, address, error_code))
-                       return;
-
-               if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
-                   printk_ratelimit()) {
-                       printk("%s%s[%d]: segfault at %08lx eip %08lx "
-                           "esp %08lx error %lx\n",
-                           tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
-                           tsk->comm, tsk->pid, address, regs->eip,
-                           regs->esp, error_code);
-               }
-               tsk->thread.cr2 = address;
-               /* Kernel addresses are always protection faults */
-               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
-               tsk->thread.trap_no = 14;
-               force_sig_info_fault(SIGSEGV, si_code, address, tsk);
-               return;
-       }
-
-#ifdef CONFIG_X86_F00F_BUG
-       /*
-        * Pentium F0 0F C7 C8 bug workaround.
-        */
-       if (boot_cpu_data.f00f_bug) {
-               unsigned long nr;
-               
-               nr = (address - idt_descr.address) >> 3;
-
-               if (nr == 6) {
-                       do_invalid_op(regs, 0);
-                       return;
-               }
-       }
-#endif
-
-no_context:
-       /* Are we prepared to handle this kernel fault?  */
-       if (fixup_exception(regs))
-               return;
-
-       /* 
-        * Valid to do another page fault here, because if this fault
-        * had been triggered by is_prefetch fixup_exception would have 
-        * handled it.
-        */
-       if (is_prefetch(regs, address, error_code))
-               return;
-
-/*
- * Oops. The kernel tried to access some bad page. We'll have to
- * terminate things with extreme prejudice.
- */
-
-       bust_spinlocks(1);
-
-       if (oops_may_print()) {
-               __typeof__(pte_val(__pte(0))) page;
-
-#ifdef CONFIG_X86_PAE
-               if (error_code & 16) {
-                       pte_t *pte = lookup_address(address);
-
-                       if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
-                               printk(KERN_CRIT "kernel tried to execute "
-                                       "NX-protected page - exploit attempt? "
-                                       "(uid: %d)\n", current->uid);
-               }
-#endif
-               if (address < PAGE_SIZE)
-                       printk(KERN_ALERT "BUG: unable to handle kernel NULL "
-                                       "pointer dereference");
-               else
-                       printk(KERN_ALERT "BUG: unable to handle kernel paging"
-                                       " request");
-               printk(" at virtual address %08lx\n",address);
-               printk(KERN_ALERT " printing eip:\n");
-               printk("%08lx\n", regs->eip);
-
-               page = read_cr3();
-               page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
-#ifdef CONFIG_X86_PAE
-               printk(KERN_ALERT "*pdpt = %016Lx\n", page);
-               if ((page >> PAGE_SHIFT) < max_low_pfn
-                   && page & _PAGE_PRESENT) {
-                       page &= PAGE_MASK;
-                       page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
-                                                                & (PTRS_PER_PMD - 1)];
-                       printk(KERN_ALERT "*pde = %016Lx\n", page);
-                       page &= ~_PAGE_NX;
-               }
-#else
-               printk(KERN_ALERT "*pde = %08lx\n", page);
-#endif
-
-               /*
-                * We must not directly access the pte in the highpte
-                * case if the page table is located in highmem.
-                * And let's rather not kmap-atomic the pte, just in case
-                * it's allocated already.
-                */
-               if ((page >> PAGE_SHIFT) < max_low_pfn
-                   && (page & _PAGE_PRESENT)) {
-                       page &= PAGE_MASK;
-                       page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
-                                                                & (PTRS_PER_PTE - 1)];
-                       printk(KERN_ALERT "*pte = %0*Lx\n", sizeof(page)*2, (u64)page);
-               }
-       }
-
-       tsk->thread.cr2 = address;
-       tsk->thread.trap_no = 14;
-       tsk->thread.error_code = error_code;
-       die("Oops", regs, error_code);
-       bust_spinlocks(0);
-       do_exit(SIGKILL);
-
-/*
- * We ran out of memory, or some other thing happened to us that made
- * us unable to handle the page fault gracefully.
- */
-out_of_memory:
-       up_read(&mm->mmap_sem);
-       if (is_init(tsk)) {
-               yield();
-               down_read(&mm->mmap_sem);
-               goto survive;
-       }
-       printk("VM: killing process %s\n", tsk->comm);
-       if (error_code & 4)
-               do_exit(SIGKILL);
-       goto no_context;
-
-do_sigbus:
-       up_read(&mm->mmap_sem);
-
-       /* Kernel mode? Handle exceptions or die */
-       if (!(error_code & 4))
-               goto no_context;
-
-       /* User space => ok to do another page fault */
-       if (is_prefetch(regs, address, error_code))
-               return;
-
-       tsk->thread.cr2 = address;
-       tsk->thread.error_code = error_code;
-       tsk->thread.trap_no = 14;
-       force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
-}
-
-void vmalloc_sync_all(void)
-{
-       /*
-        * Note that races in the updates of insync and start aren't
-        * problematic: insync can only get set bits added, and updates to
-        * start are only improving performance (without affecting correctness
-        * if undone).
-        */
-       static DECLARE_BITMAP(insync, PTRS_PER_PGD);
-       static unsigned long start = TASK_SIZE;
-       unsigned long address;
-
-       if (SHARED_KERNEL_PMD)
-               return;
-
-       BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
-       for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
-               if (!test_bit(pgd_index(address), insync)) {
-                       unsigned long flags;
-                       struct page *page;
-
-                       spin_lock_irqsave(&pgd_lock, flags);
-                       for (page = pgd_list; page; page =
-                                       (struct page *)page->index)
-                               if (!vmalloc_sync_one(page_address(page),
-                                                               address)) {
-                                       BUG_ON(page != pgd_list);
-                                       break;
-                               }
-                       spin_unlock_irqrestore(&pgd_lock, flags);
-                       if (!page)
-                               set_bit(pgd_index(address), insync);
-               }
-               if (address == start && test_bit(pgd_index(address), insync))
-                       start = address + PGDIR_SIZE;
-       }
-}
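
The comment above do_page_fault() documents the hardware error_code bit layout (present/write/user/reserved-bit/instruction-fetch). The short, hypothetical decoder below exists purely to illustrate those bits; print_fault_error_code() is an invented name and is not part of this commit.

/*
 * Hypothetical sketch, not part of this commit: decode the page-fault
 * error_code bits exactly as documented above do_page_fault().
 */
static void print_fault_error_code(unsigned long error_code)
{
	printk(KERN_DEBUG "page fault: %s, %s, %s mode%s%s\n",
	       (error_code & 1) ? "protection fault" : "no page found",
	       (error_code & 2) ? "write" : "read",
	       (error_code & 4) ? "user" : "kernel",
	       (error_code & 8) ? ", reserved bit set" : "",
	       (error_code & 16) ? ", instruction fetch" : "");
}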
diff --git a/arch/i386/mm/highmem_32.c b/arch/i386/mm/highmem_32.c
deleted file mode 100644 (file)
index 1c3bf95..0000000
+++ /dev/null
@@ -1,113 +0,0 @@
-#include <linux/highmem.h>
-#include <linux/module.h>
-
-void *kmap(struct page *page)
-{
-       might_sleep();
-       if (!PageHighMem(page))
-               return page_address(page);
-       return kmap_high(page);
-}
-
-void kunmap(struct page *page)
-{
-       if (in_interrupt())
-               BUG();
-       if (!PageHighMem(page))
-               return;
-       kunmap_high(page);
-}
-
-/*
- * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
- * no global lock is needed and because the kmap code must perform a global TLB
- * invalidation when the kmap pool wraps.
- *
- * However, when holding an atomic kmap it is not legal to sleep, so atomic
- * kmaps are appropriate for short, tight code paths only.
- */
-void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
-{
-       enum fixed_addresses idx;
-       unsigned long vaddr;
-
-       /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
-       pagefault_disable();
-
-       if (!PageHighMem(page))
-               return page_address(page);
-
-       idx = type + KM_TYPE_NR*smp_processor_id();
-       vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-       BUG_ON(!pte_none(*(kmap_pte-idx)));
-       set_pte(kmap_pte-idx, mk_pte(page, prot));
-       arch_flush_lazy_mmu_mode();
-
-       return (void *)vaddr;
-}
-
-void *kmap_atomic(struct page *page, enum km_type type)
-{
-       return kmap_atomic_prot(page, type, kmap_prot);
-}
-
-void kunmap_atomic(void *kvaddr, enum km_type type)
-{
-       unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
-       enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
-
-       /*
-        * Force other mappings to Oops if they'll try to access this pte
-        * without first remapping it.  Keeping stale mappings around is a bad idea
-        * also, in case the page changes cacheability attributes or becomes
-        * a protected page in a hypervisor.
-        */
-       if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
-               kpte_clear_flush(kmap_pte-idx, vaddr);
-       else {
-#ifdef CONFIG_DEBUG_HIGHMEM
-               BUG_ON(vaddr < PAGE_OFFSET);
-               BUG_ON(vaddr >= (unsigned long)high_memory);
-#endif
-       }
-
-       arch_flush_lazy_mmu_mode();
-       pagefault_enable();
-}
-
-/* This is the same as kmap_atomic() but can map memory that doesn't
- * have a struct page associated with it.
- */
-void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
-{
-       enum fixed_addresses idx;
-       unsigned long vaddr;
-
-       pagefault_disable();
-
-       idx = type + KM_TYPE_NR*smp_processor_id();
-       vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-       set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
-       arch_flush_lazy_mmu_mode();
-
-       return (void*) vaddr;
-}
-
-struct page *kmap_atomic_to_page(void *ptr)
-{
-       unsigned long idx, vaddr = (unsigned long)ptr;
-       pte_t *pte;
-
-       if (vaddr < FIXADDR_START)
-               return virt_to_page(ptr);
-
-       idx = virt_to_fix(vaddr);
-       pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
-       return pte_page(*pte);
-}
-
-EXPORT_SYMBOL(kmap);
-EXPORT_SYMBOL(kunmap);
-EXPORT_SYMBOL(kmap_atomic);
-EXPORT_SYMBOL(kunmap_atomic);
-EXPORT_SYMBOL(kmap_atomic_to_page);
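
The comment above kmap_atomic_prot() explains why atomic kmaps are limited to short, non-sleeping code paths. A minimal usage sketch follows, assuming the 2007-era km_type API shown in this file; clear_highpage_sketch() is an invented name (the kernel has its own clear_highpage()) and KM_USER0 is one of the per-CPU kmap slots.

/*
 * Minimal usage sketch, not part of this commit: zero a possibly-highmem
 * page from a short, non-sleeping path using the functions above.
 */
static void clear_highpage_sketch(struct page *page)
{
	void *kaddr = kmap_atomic(page, KM_USER0);

	memset(kaddr, 0, PAGE_SIZE);
	kunmap_atomic(kaddr, KM_USER0);	/* must pair with the same slot */
}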
diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
deleted file mode 100644 (file)
index 6c06d9c..0000000
+++ /dev/null
@@ -1,391 +0,0 @@
-/*
- * IA-32 Huge TLB Page Support for Kernel.
- *
- * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
- */
-
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/hugetlb.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/err.h>
-#include <linux/sysctl.h>
-#include <asm/mman.h>
-#include <asm/tlb.h>
-#include <asm/tlbflush.h>
-
-static unsigned long page_table_shareable(struct vm_area_struct *svma,
-                               struct vm_area_struct *vma,
-                               unsigned long addr, pgoff_t idx)
-{
-       unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
-                               svma->vm_start;
-       unsigned long sbase = saddr & PUD_MASK;
-       unsigned long s_end = sbase + PUD_SIZE;
-
-       /*
-        * match the virtual addresses, permission and the alignment of the
-        * page table page.
-        */
-       if (pmd_index(addr) != pmd_index(saddr) ||
-           vma->vm_flags != svma->vm_flags ||
-           sbase < svma->vm_start || svma->vm_end < s_end)
-               return 0;
-
-       return saddr;
-}
-
-static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
-{
-       unsigned long base = addr & PUD_MASK;
-       unsigned long end = base + PUD_SIZE;
-
-       /*
-        * check on proper vm_flags and page table alignment
-        */
-       if (vma->vm_flags & VM_MAYSHARE &&
-           vma->vm_start <= base && end <= vma->vm_end)
-               return 1;
-       return 0;
-}
-
-/*
- * search for a shareable pmd page for hugetlb.
- */
-static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
-{
-       struct vm_area_struct *vma = find_vma(mm, addr);
-       struct address_space *mapping = vma->vm_file->f_mapping;
-       pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
-                       vma->vm_pgoff;
-       struct prio_tree_iter iter;
-       struct vm_area_struct *svma;
-       unsigned long saddr;
-       pte_t *spte = NULL;
-
-       if (!vma_shareable(vma, addr))
-               return;
-
-       spin_lock(&mapping->i_mmap_lock);
-       vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
-               if (svma == vma)
-                       continue;
-
-               saddr = page_table_shareable(svma, vma, addr, idx);
-               if (saddr) {
-                       spte = huge_pte_offset(svma->vm_mm, saddr);
-                       if (spte) {
-                               get_page(virt_to_page(spte));
-                               break;
-                       }
-               }
-       }
-
-       if (!spte)
-               goto out;
-
-       spin_lock(&mm->page_table_lock);
-       if (pud_none(*pud))
-               pud_populate(mm, pud, (unsigned long) spte & PAGE_MASK);
-       else
-               put_page(virt_to_page(spte));
-       spin_unlock(&mm->page_table_lock);
-out:
-       spin_unlock(&mapping->i_mmap_lock);
-}
-
-/*
- * unmap huge page backed by shared pte.
- *
- * Hugetlb pte page is ref counted at the time of mapping.  If pte is shared
- * indicated by page_count > 1, unmap is achieved by clearing pud and
- * decrementing the ref count. If count == 1, the pte page is not shared.
- *
- * called with vma->vm_mm->page_table_lock held.
- *
- * returns: 1 successfully unmapped a shared pte page
- *         0 the underlying pte page is not shared, or it is the last user
- */
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
-       pgd_t *pgd = pgd_offset(mm, *addr);
-       pud_t *pud = pud_offset(pgd, *addr);
-
-       BUG_ON(page_count(virt_to_page(ptep)) == 0);
-       if (page_count(virt_to_page(ptep)) == 1)
-               return 0;
-
-       pud_clear(pud);
-       put_page(virt_to_page(ptep));
-       *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
-       return 1;
-}
-
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
-{
-       pgd_t *pgd;
-       pud_t *pud;
-       pte_t *pte = NULL;
-
-       pgd = pgd_offset(mm, addr);
-       pud = pud_alloc(mm, pgd, addr);
-       if (pud) {
-               if (pud_none(*pud))
-                       huge_pmd_share(mm, addr, pud);
-               pte = (pte_t *) pmd_alloc(mm, pud, addr);
-       }
-       BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
-
-       return pte;
-}
-
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
-{
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd = NULL;
-
-       pgd = pgd_offset(mm, addr);
-       if (pgd_present(*pgd)) {
-               pud = pud_offset(pgd, addr);
-               if (pud_present(*pud))
-                       pmd = pmd_offset(pud, addr);
-       }
-       return (pte_t *) pmd;
-}
-
-#if 0  /* This is just for testing */
-struct page *
-follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
-{
-       unsigned long start = address;
-       int length = 1;
-       int nr;
-       struct page *page;
-       struct vm_area_struct *vma;
-
-       vma = find_vma(mm, addr);
-       if (!vma || !is_vm_hugetlb_page(vma))
-               return ERR_PTR(-EINVAL);
-
-       pte = huge_pte_offset(mm, address);
-
-       /* hugetlb should be locked, and hence, prefaulted */
-       WARN_ON(!pte || pte_none(*pte));
-
-       page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
-
-       WARN_ON(!PageCompound(page));
-
-       return page;
-}
-
-int pmd_huge(pmd_t pmd)
-{
-       return 0;
-}
-
-struct page *
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-               pmd_t *pmd, int write)
-{
-       return NULL;
-}
-
-#else
-
-struct page *
-follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
-{
-       return ERR_PTR(-EINVAL);
-}
-
-int pmd_huge(pmd_t pmd)
-{
-       return !!(pmd_val(pmd) & _PAGE_PSE);
-}
-
-struct page *
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-               pmd_t *pmd, int write)
-{
-       struct page *page;
-
-       page = pte_page(*(pte_t *)pmd);
-       if (page)
-               page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT);
-       return page;
-}
-#endif
-
-/* x86_64 also uses this file */
-
-#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
-static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
-               unsigned long addr, unsigned long len,
-               unsigned long pgoff, unsigned long flags)
-{
-       struct mm_struct *mm = current->mm;
-       struct vm_area_struct *vma;
-       unsigned long start_addr;
-
-       if (len > mm->cached_hole_size) {
-               start_addr = mm->free_area_cache;
-       } else {
-               start_addr = TASK_UNMAPPED_BASE;
-               mm->cached_hole_size = 0;
-       }
-
-full_search:
-       addr = ALIGN(start_addr, HPAGE_SIZE);
-
-       for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
-               /* At this point:  (!vma || addr < vma->vm_end). */
-               if (TASK_SIZE - len < addr) {
-                       /*
-                        * Start a new search - just in case we missed
-                        * some holes.
-                        */
-                       if (start_addr != TASK_UNMAPPED_BASE) {
-                               start_addr = TASK_UNMAPPED_BASE;
-                               mm->cached_hole_size = 0;
-                               goto full_search;
-                       }
-                       return -ENOMEM;
-               }
-               if (!vma || addr + len <= vma->vm_start) {
-                       mm->free_area_cache = addr + len;
-                       return addr;
-               }
-               if (addr + mm->cached_hole_size < vma->vm_start)
-                       mm->cached_hole_size = vma->vm_start - addr;
-               addr = ALIGN(vma->vm_end, HPAGE_SIZE);
-       }
-}
-
-static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
-               unsigned long addr0, unsigned long len,
-               unsigned long pgoff, unsigned long flags)
-{
-       struct mm_struct *mm = current->mm;
-       struct vm_area_struct *vma, *prev_vma;
-       unsigned long base = mm->mmap_base, addr = addr0;
-       unsigned long largest_hole = mm->cached_hole_size;
-       int first_time = 1;
-
-       /* don't allow allocations above current base */
-       if (mm->free_area_cache > base)
-               mm->free_area_cache = base;
-
-       if (len <= largest_hole) {
-               largest_hole = 0;
-               mm->free_area_cache  = base;
-       }
-try_again:
-       /* make sure it can fit in the remaining address space */
-       if (mm->free_area_cache < len)
-               goto fail;
-
-       /* either no address requested or can't fit in requested address hole */
-       addr = (mm->free_area_cache - len) & HPAGE_MASK;
-       do {
-               /*
-                * Lookup failure means no vma is above this address,
-                * i.e. return with success:
-                */
-               if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
-                       return addr;
-
-               /*
-                * new region fits between prev_vma->vm_end and
-                * vma->vm_start, use it:
-                */
-               if (addr + len <= vma->vm_start &&
-                           (!prev_vma || (addr >= prev_vma->vm_end))) {
-                       /* remember the address as a hint for next time */
-                       mm->cached_hole_size = largest_hole;
-                       return (mm->free_area_cache = addr);
-               } else {
-                       /* pull free_area_cache down to the first hole */
-                       if (mm->free_area_cache == vma->vm_end) {
-                               mm->free_area_cache = vma->vm_start;
-                               mm->cached_hole_size = largest_hole;
-                       }
-               }
-
-               /* remember the largest hole we saw so far */
-               if (addr + largest_hole < vma->vm_start)
-                       largest_hole = vma->vm_start - addr;
-
-               /* try just below the current vma->vm_start */
-               addr = (vma->vm_start - len) & HPAGE_MASK;
-       } while (len <= vma->vm_start);
-
-fail:
-       /*
-        * if hint left us with no space for the requested
-        * mapping then try again:
-        */
-       if (first_time) {
-               mm->free_area_cache = base;
-               largest_hole = 0;
-               first_time = 0;
-               goto try_again;
-       }
-       /*
-        * A failed mmap() very likely causes application failure,
-        * so fall back to the bottom-up function here. This scenario
-        * can happen with large stack limits and large mmap()
-        * allocations.
-        */
-       mm->free_area_cache = TASK_UNMAPPED_BASE;
-       mm->cached_hole_size = ~0UL;
-       addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
-                       len, pgoff, flags);
-
-       /*
-        * Restore the topdown base:
-        */
-       mm->free_area_cache = base;
-       mm->cached_hole_size = ~0UL;
-
-       return addr;
-}
-
-unsigned long
-hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
-               unsigned long len, unsigned long pgoff, unsigned long flags)
-{
-       struct mm_struct *mm = current->mm;
-       struct vm_area_struct *vma;
-
-       if (len & ~HPAGE_MASK)
-               return -EINVAL;
-       if (len > TASK_SIZE)
-               return -ENOMEM;
-
-       if (flags & MAP_FIXED) {
-               if (prepare_hugepage_range(addr, len))
-                       return -EINVAL;
-               return addr;
-       }
-
-       if (addr) {
-               addr = ALIGN(addr, HPAGE_SIZE);
-               vma = find_vma(mm, addr);
-               if (TASK_SIZE - len >= addr &&
-                   (!vma || addr + len <= vma->vm_start))
-                       return addr;
-       }
-       if (mm->get_unmapped_area == arch_get_unmapped_area)
-               return hugetlb_get_unmapped_area_bottomup(file, addr, len,
-                               pgoff, flags);
-       else
-               return hugetlb_get_unmapped_area_topdown(file, addr, len,
-                               pgoff, flags);
-}
-
-#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
-
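The bottom-up search in hugetlb_get_unmapped_area_bottomup() above only ever
probes huge-page-aligned candidates, via ALIGN(addr, HPAGE_SIZE). A minimal
user-space sketch of that rounding, assuming the non-PAE i386 huge page size
of 4 MiB and made-up sample addresses:

/* Sketch of the kernel-style ALIGN() used by the bottom-up search:
 * every candidate address is rounded up to the next huge-page boundary. */
#include <stdio.h>

#define HPAGE_SIZE   (4UL << 20)                    /* assumed: non-PAE 4 MiB */
#define ALIGN(x, a)  (((x) + (a) - 1) & ~((a) - 1)) /* power-of-two align */

int main(void)
{
	unsigned long addrs[] = { 0x40000000UL, 0x40001000UL, 0x407fffffUL };
	unsigned long i;

	for (i = 0; i < sizeof(addrs) / sizeof(addrs[0]); i++)
		printf("0x%08lx -> 0x%08lx\n", addrs[i], ALIGN(addrs[i], HPAGE_SIZE));
	return 0;
}
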
diff --git a/arch/i386/mm/init_32.c b/arch/i386/mm/init_32.c
deleted file mode 100644 (file)
index 730a5b1..0000000
+++ /dev/null
@@ -1,858 +0,0 @@
-/*
- *  linux/arch/i386/mm/init.c
- *
- *  Copyright (C) 1995  Linus Torvalds
- *
- *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
- */
-
-#include <linux/module.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/hugetlb.h>
-#include <linux/swap.h>
-#include <linux/smp.h>
-#include <linux/init.h>
-#include <linux/highmem.h>
-#include <linux/pagemap.h>
-#include <linux/pfn.h>
-#include <linux/poison.h>
-#include <linux/bootmem.h>
-#include <linux/slab.h>
-#include <linux/proc_fs.h>
-#include <linux/efi.h>
-#include <linux/memory_hotplug.h>
-#include <linux/initrd.h>
-#include <linux/cpumask.h>
-
-#include <asm/processor.h>
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/dma.h>
-#include <asm/fixmap.h>
-#include <asm/e820.h>
-#include <asm/apic.h>
-#include <asm/tlb.h>
-#include <asm/tlbflush.h>
-#include <asm/sections.h>
-#include <asm/paravirt.h>
-
-unsigned int __VMALLOC_RESERVE = 128 << 20;
-
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-unsigned long highstart_pfn, highend_pfn;
-
-static int noinline do_test_wp_bit(void);
-
-/*
- * Creates a middle page table and puts a pointer to it in the
- * given global directory entry. This only returns the pgd entry
- * in non-PAE compilation mode, since the middle layer is folded.
- */
-static pmd_t * __init one_md_table_init(pgd_t *pgd)
-{
-       pud_t *pud;
-       pmd_t *pmd_table;
-               
-#ifdef CONFIG_X86_PAE
-       if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
-               pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
-
-               paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
-               set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
-               pud = pud_offset(pgd, 0);
-               if (pmd_table != pmd_offset(pud, 0))
-                       BUG();
-       }
-#endif
-       pud = pud_offset(pgd, 0);
-       pmd_table = pmd_offset(pud, 0);
-       return pmd_table;
-}
-
-/*
- * Create a page table and place a pointer to it in a middle page
- * directory entry.
- */
-static pte_t * __init one_page_table_init(pmd_t *pmd)
-{
-       if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
-               pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
-
-               paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
-               set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
-               BUG_ON(page_table != pte_offset_kernel(pmd, 0));
-       }
-       
-       return pte_offset_kernel(pmd, 0);
-}
-
-/*
- * This function initializes a certain range of kernel virtual memory
- * with new bootmem page tables, wherever page tables are missing in
- * the given range.
- */
-
-/*
- * NOTE: The pagetables are allocated contiguously in physical memory,
- * so we can cache the place of the first one and move around without
- * checking the pgd every time.
- */
-static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
-{
-       pgd_t *pgd;
-       pmd_t *pmd;
-       int pgd_idx, pmd_idx;
-       unsigned long vaddr;
-
-       vaddr = start;
-       pgd_idx = pgd_index(vaddr);
-       pmd_idx = pmd_index(vaddr);
-       pgd = pgd_base + pgd_idx;
-
-       for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
-               pmd = one_md_table_init(pgd);
-               pmd = pmd + pmd_index(vaddr);
-               for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
-                       one_page_table_init(pmd);
-
-                       vaddr += PMD_SIZE;
-               }
-               pmd_idx = 0;
-       }
-}
-
-static inline int is_kernel_text(unsigned long addr)
-{
-       if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
-               return 1;
-       return 0;
-}
-
-/*
- * This maps the physical memory to kernel virtual address space, a total 
- * of max_low_pfn pages, by creating page tables starting from address 
- * PAGE_OFFSET.
- */
-static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
-{
-       unsigned long pfn;
-       pgd_t *pgd;
-       pmd_t *pmd;
-       pte_t *pte;
-       int pgd_idx, pmd_idx, pte_ofs;
-
-       pgd_idx = pgd_index(PAGE_OFFSET);
-       pgd = pgd_base + pgd_idx;
-       pfn = 0;
-
-       for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
-               pmd = one_md_table_init(pgd);
-               if (pfn >= max_low_pfn)
-                       continue;
-               for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
-                       unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
-
-                       /* Map with big pages if possible, otherwise create normal page tables. */
-                       if (cpu_has_pse) {
-                               unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
-                               if (is_kernel_text(address) || is_kernel_text(address2))
-                                       set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
-                               else
-                                       set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
-
-                               pfn += PTRS_PER_PTE;
-                       } else {
-                               pte = one_page_table_init(pmd);
-
-                               for (pte_ofs = 0;
-                                    pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
-                                    pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
-                                       if (is_kernel_text(address))
-                                               set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
-                                       else
-                                               set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
-                               }
-                       }
-               }
-       }
-}
-
-static inline int page_kills_ppro(unsigned long pagenr)
-{
-       if (pagenr >= 0x70000 && pagenr <= 0x7003F)
-               return 1;
-       return 0;
-}
-
-int page_is_ram(unsigned long pagenr)
-{
-       int i;
-       unsigned long addr, end;
-
-       if (efi_enabled) {
-               efi_memory_desc_t *md;
-               void *p;
-
-               for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
-                       md = p;
-                       if (!is_available_memory(md))
-                               continue;
-                       addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
-                       end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
-
-                       if ((pagenr >= addr) && (pagenr < end))
-                               return 1;
-               }
-               return 0;
-       }
-
-       for (i = 0; i < e820.nr_map; i++) {
-
-               if (e820.map[i].type != E820_RAM)       /* not usable memory */
-                       continue;
-               /*
-                *      !!!FIXME!!! Some BIOSen report areas as RAM that
- *      are not. Notably the 640K->1MB area. We need a sanity
-                *      check here.
-                */
-               addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
-               end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
-               if  ((pagenr >= addr) && (pagenr < end))
-                       return 1;
-       }
-       return 0;
-}
-
-#ifdef CONFIG_HIGHMEM
-pte_t *kmap_pte;
-pgprot_t kmap_prot;
-
-#define kmap_get_fixmap_pte(vaddr)                                     \
-       pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
-
-static void __init kmap_init(void)
-{
-       unsigned long kmap_vstart;
-
-       /* cache the first kmap pte */
-       kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
-       kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
-
-       kmap_prot = PAGE_KERNEL;
-}
-
-static void __init permanent_kmaps_init(pgd_t *pgd_base)
-{
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
-       unsigned long vaddr;
-
-       vaddr = PKMAP_BASE;
-       page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
-
-       pgd = swapper_pg_dir + pgd_index(vaddr);
-       pud = pud_offset(pgd, vaddr);
-       pmd = pmd_offset(pud, vaddr);
-       pte = pte_offset_kernel(pmd, vaddr);
-       pkmap_page_table = pte; 
-}
-
-static void __meminit free_new_highpage(struct page *page)
-{
-       init_page_count(page);
-       __free_page(page);
-       totalhigh_pages++;
-}
-
-void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
-{
-       if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
-               ClearPageReserved(page);
-               free_new_highpage(page);
-       } else
-               SetPageReserved(page);
-}
-
-static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn)
-{
-       free_new_highpage(page);
-       totalram_pages++;
-#ifdef CONFIG_FLATMEM
-       max_mapnr = max(pfn, max_mapnr);
-#endif
-       num_physpages++;
-       return 0;
-}
-
-/*
- * Not currently handling the NUMA case.
- * Assume a single node, and that all memory added
- * dynamically and onlined here is in HIGHMEM.
- */
-void __meminit online_page(struct page *page)
-{
-       ClearPageReserved(page);
-       add_one_highpage_hotplug(page, page_to_pfn(page));
-}
-
-
-#ifdef CONFIG_NUMA
-extern void set_highmem_pages_init(int);
-#else
-static void __init set_highmem_pages_init(int bad_ppro)
-{
-       int pfn;
-       for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
-               add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
-       totalram_pages += totalhigh_pages;
-}
-#endif /* CONFIG_NUMA */
-
-#else
-#define kmap_init() do { } while (0)
-#define permanent_kmaps_init(pgd_base) do { } while (0)
-#define set_highmem_pages_init(bad_ppro) do { } while (0)
-#endif /* CONFIG_HIGHMEM */
-
-unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
-EXPORT_SYMBOL(__PAGE_KERNEL);
-unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
-
-#ifdef CONFIG_NUMA
-extern void __init remap_numa_kva(void);
-#else
-#define remap_numa_kva() do {} while (0)
-#endif
-
-void __init native_pagetable_setup_start(pgd_t *base)
-{
-#ifdef CONFIG_X86_PAE
-       int i;
-
-       /*
-        * Init entries of the first-level page table to the
-        * zero page, if they haven't already been set up.
-        *
-        * In a normal native boot, we'll be running on a
-        * pagetable rooted in swapper_pg_dir, but not in PAE
-        * mode, so this will end up clobbering the mappings
-        * for the lower 24Mbytes of the address space,
-        * without affecting the kernel address space.
-        */
-       for (i = 0; i < USER_PTRS_PER_PGD; i++)
-               set_pgd(&base[i],
-                       __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
-
-       /* Make sure kernel address space is empty so that a pagetable
-          will be allocated for it. */
-       memset(&base[USER_PTRS_PER_PGD], 0,
-              KERNEL_PGD_PTRS * sizeof(pgd_t));
-#else
-       paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT);
-#endif
-}
-
-void __init native_pagetable_setup_done(pgd_t *base)
-{
-#ifdef CONFIG_X86_PAE
-       /*
-        * Add low memory identity-mappings - SMP needs it when
-        * starting up on an AP from real-mode. In the non-PAE
-        * case we already have these mappings through head.S.
-        * All user-space mappings are explicitly cleared after
-        * SMP startup.
-        */
-       set_pgd(&base[0], base[USER_PTRS_PER_PGD]);
-#endif
-}
-
-/*
- * Build a proper pagetable for the kernel mappings.  Up until this
- * point, we've been running on some set of pagetables constructed by
- * the boot process.
- *
- * If we're booting on native hardware, this will be a pagetable
- * constructed in arch/i386/kernel/head.S, and not running in PAE mode
- * (even if we'll end up running in PAE).  The root of the pagetable
- * will be swapper_pg_dir.
- *
- * If we're booting paravirtualized under a hypervisor, then there are
- * more options: we may already be running PAE, and the pagetable may
- * or may not be based in swapper_pg_dir.  In any case,
- * paravirt_pagetable_setup_start() will set up swapper_pg_dir
- * appropriately for the rest of the initialization to work.
- *
- * In general, pagetable_init() assumes that the pagetable may already
- * be partially populated, and so it avoids stomping on any existing
- * mappings.
- */
-static void __init pagetable_init (void)
-{
-       unsigned long vaddr, end;
-       pgd_t *pgd_base = swapper_pg_dir;
-
-       paravirt_pagetable_setup_start(pgd_base);
-
-       /* Enable PSE if available */
-       if (cpu_has_pse)
-               set_in_cr4(X86_CR4_PSE);
-
-       /* Enable PGE if available */
-       if (cpu_has_pge) {
-               set_in_cr4(X86_CR4_PGE);
-               __PAGE_KERNEL |= _PAGE_GLOBAL;
-               __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
-       }
-
-       kernel_physical_mapping_init(pgd_base);
-       remap_numa_kva();
-
-       /*
-        * Fixed mappings, only the page table structure has to be
-        * created - mappings will be set by set_fixmap():
-        */
-       vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
-       end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
-       page_table_range_init(vaddr, end, pgd_base);
-
-       permanent_kmaps_init(pgd_base);
-
-       paravirt_pagetable_setup_done(pgd_base);
-}
-
-#if defined(CONFIG_HIBERNATION) || defined(CONFIG_ACPI)
-/*
- * Swap suspend & friends need this for resume because things like the intel-agp
- * driver might have split up a kernel 4MB mapping.
- */
-char __nosavedata swsusp_pg_dir[PAGE_SIZE]
-       __attribute__ ((aligned (PAGE_SIZE)));
-
-static inline void save_pg_dir(void)
-{
-       memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
-}
-#else
-static inline void save_pg_dir(void)
-{
-}
-#endif
-
-void zap_low_mappings (void)
-{
-       int i;
-
-       save_pg_dir();
-
-       /*
-        * Zap initial low-memory mappings.
-        *
-        * Note that "pgd_clear()" doesn't do it for
-        * us, because pgd_clear() is a no-op on i386.
-        */
-       for (i = 0; i < USER_PTRS_PER_PGD; i++)
-#ifdef CONFIG_X86_PAE
-               set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
-#else
-               set_pgd(swapper_pg_dir+i, __pgd(0));
-#endif
-       flush_tlb_all();
-}
-
-int nx_enabled = 0;
-
-#ifdef CONFIG_X86_PAE
-
-static int disable_nx __initdata = 0;
-u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
-EXPORT_SYMBOL_GPL(__supported_pte_mask);
-
-/*
- * noexec = on|off
- *
- * Control non executable mappings.
- *
- * on      Enable
- * off     Disable
- */
-static int __init noexec_setup(char *str)
-{
-       if (!str || !strcmp(str, "on")) {
-               if (cpu_has_nx) {
-                       __supported_pte_mask |= _PAGE_NX;
-                       disable_nx = 0;
-               }
-       } else if (!strcmp(str,"off")) {
-               disable_nx = 1;
-               __supported_pte_mask &= ~_PAGE_NX;
-       } else
-               return -EINVAL;
-
-       return 0;
-}
-early_param("noexec", noexec_setup);
-
-static void __init set_nx(void)
-{
-       unsigned int v[4], l, h;
-
-       if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
-               cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
-               if ((v[3] & (1 << 20)) && !disable_nx) {
-                       rdmsr(MSR_EFER, l, h);
-                       l |= EFER_NX;
-                       wrmsr(MSR_EFER, l, h);
-                       nx_enabled = 1;
-                       __supported_pte_mask |= _PAGE_NX;
-               }
-       }
-}
-
-/*
- * Enables/disables executability of a given kernel page and
- * returns the previous setting.
- */
-int __init set_kernel_exec(unsigned long vaddr, int enable)
-{
-       pte_t *pte;
-       int ret = 1;
-
-       if (!nx_enabled)
-               goto out;
-
-       pte = lookup_address(vaddr);
-       BUG_ON(!pte);
-
-       if (!pte_exec_kernel(*pte))
-               ret = 0;
-
-       if (enable)
-               pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
-       else
-               pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
-       pte_update_defer(&init_mm, vaddr, pte);
-       __flush_tlb_all();
-out:
-       return ret;
-}
-
-#endif
-
-/*
- * paging_init() sets up the page tables - note that the first 8MB are
- * already mapped by head.S.
- *
- * This routine also unmaps the page at virtual kernel address 0, so
- * that we can trap those pesky NULL-reference errors in the kernel.
- */
-void __init paging_init(void)
-{
-#ifdef CONFIG_X86_PAE
-       set_nx();
-       if (nx_enabled)
-               printk("NX (Execute Disable) protection: active\n");
-#endif
-
-       pagetable_init();
-
-       load_cr3(swapper_pg_dir);
-
-#ifdef CONFIG_X86_PAE
-       /*
-        * We will bail out later - printk doesn't work right now so
-        * the user would just see a hanging kernel.
-        */
-       if (cpu_has_pae)
-               set_in_cr4(X86_CR4_PAE);
-#endif
-       __flush_tlb_all();
-
-       kmap_init();
-}
-
-/*
- * Test if the WP bit works in supervisor mode. It isn't supported on 386's
- * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
- * used to involve black magic jumps to work around some nasty CPU bugs,
- * but fortunately the switch to using exceptions got rid of all that.
- */
-
-static void __init test_wp_bit(void)
-{
-       printk("Checking if this processor honours the WP bit even in supervisor mode... ");
-
-       /* Any page-aligned address will do, the test is non-destructive */
-       __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
-       boot_cpu_data.wp_works_ok = do_test_wp_bit();
-       clear_fixmap(FIX_WP_TEST);
-
-       if (!boot_cpu_data.wp_works_ok) {
-               printk("No.\n");
-#ifdef CONFIG_X86_WP_WORKS_OK
-               panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
-#endif
-       } else {
-               printk("Ok.\n");
-       }
-}
-
-static struct kcore_list kcore_mem, kcore_vmalloc; 
-
-void __init mem_init(void)
-{
-       extern int ppro_with_ram_bug(void);
-       int codesize, reservedpages, datasize, initsize;
-       int tmp;
-       int bad_ppro;
-
-#ifdef CONFIG_FLATMEM
-       BUG_ON(!mem_map);
-#endif
-       
-       bad_ppro = ppro_with_ram_bug();
-
-#ifdef CONFIG_HIGHMEM
-       /* check that fixmap and pkmap do not overlap */
-       if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
-               printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
-               printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
-                               PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
-               BUG();
-       }
-#endif
-       /* this will put all low memory onto the freelists */
-       totalram_pages += free_all_bootmem();
-
-       reservedpages = 0;
-       for (tmp = 0; tmp < max_low_pfn; tmp++)
-               /*
-                * Only count reserved RAM pages
-                */
-               if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
-                       reservedpages++;
-
-       set_highmem_pages_init(bad_ppro);
-
-       codesize =  (unsigned long) &_etext - (unsigned long) &_text;
-       datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
-       initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
-
-       kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
-       kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
-                  VMALLOC_END-VMALLOC_START);
-
-       printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
-               (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
-               num_physpages << (PAGE_SHIFT-10),
-               codesize >> 10,
-               reservedpages << (PAGE_SHIFT-10),
-               datasize >> 10,
-               initsize >> 10,
-               (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
-              );
-
-#if 1 /* double-sanity-check paranoia */
-       printk("virtual kernel memory layout:\n"
-              "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
-#ifdef CONFIG_HIGHMEM
-              "    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
-#endif
-              "    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
-              "    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
-              "      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"
-              "      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"
-              "      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
-              FIXADDR_START, FIXADDR_TOP,
-              (FIXADDR_TOP - FIXADDR_START) >> 10,
-
-#ifdef CONFIG_HIGHMEM
-              PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
-              (LAST_PKMAP*PAGE_SIZE) >> 10,
-#endif
-
-              VMALLOC_START, VMALLOC_END,
-              (VMALLOC_END - VMALLOC_START) >> 20,
-
-              (unsigned long)__va(0), (unsigned long)high_memory,
-              ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
-
-              (unsigned long)&__init_begin, (unsigned long)&__init_end,
-              ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
-
-              (unsigned long)&_etext, (unsigned long)&_edata,
-              ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
-
-              (unsigned long)&_text, (unsigned long)&_etext,
-              ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
-
-#ifdef CONFIG_HIGHMEM
-       BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
-       BUG_ON(VMALLOC_END                     > PKMAP_BASE);
-#endif
-       BUG_ON(VMALLOC_START                   > VMALLOC_END);
-       BUG_ON((unsigned long)high_memory      > VMALLOC_START);
-#endif /* double-sanity-check paranoia */
-
-#ifdef CONFIG_X86_PAE
-       if (!cpu_has_pae)
-               panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
-#endif
-       if (boot_cpu_data.wp_works_ok < 0)
-               test_wp_bit();
-
-       /*
-        * Subtle. SMP is doing its boot stuff late (because it has to
-        * fork idle threads) - but it also needs low mappings for the
-        * protected-mode entry to work. We zap these entries only after
-        * the WP-bit has been tested.
-        */
-#ifndef CONFIG_SMP
-       zap_low_mappings();
-#endif
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size)
-{
-       struct pglist_data *pgdata = NODE_DATA(nid);
-       struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
-       unsigned long start_pfn = start >> PAGE_SHIFT;
-       unsigned long nr_pages = size >> PAGE_SHIFT;
-
-       return __add_pages(zone, start_pfn, nr_pages);
-}
-
-int remove_memory(u64 start, u64 size)
-{
-       return -EINVAL;
-}
-EXPORT_SYMBOL_GPL(remove_memory);
-#endif
-
-struct kmem_cache *pmd_cache;
-
-void __init pgtable_cache_init(void)
-{
-       size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t);
-
-       if (PTRS_PER_PMD > 1) {
-               pmd_cache = kmem_cache_create("pmd",
-                                       PTRS_PER_PMD*sizeof(pmd_t),
-                                       PTRS_PER_PMD*sizeof(pmd_t),
-                                       SLAB_PANIC,
-                                       pmd_ctor);
-               if (!SHARED_KERNEL_PMD) {
-                       /* If we're in PAE mode and have a non-shared
-                          kernel pmd, then the pgd size must be a
-                          page size.  This is because the pgd_list
-                          links through the page structure, so there
-                          can only be one pgd per page for this to
-                          work. */
-                       pgd_size = PAGE_SIZE;
-               }
-       }
-}
-
-/*
- * This function cannot be __init, since exceptions don't work in that
- * section.  Put this after the callers, so that it cannot be inlined.
- */
-static int noinline do_test_wp_bit(void)
-{
-       char tmp_reg;
-       int flag;
-
-       __asm__ __volatile__(
-               "       movb %0,%1      \n"
-               "1:     movb %1,%0      \n"
-               "       xorl %2,%2      \n"
-               "2:                     \n"
-               ".section __ex_table,\"a\"\n"
-               "       .align 4        \n"
-               "       .long 1b,2b     \n"
-               ".previous              \n"
-               :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
-                "=q" (tmp_reg),
-                "=r" (flag)
-               :"2" (1)
-               :"memory");
-       
-       return flag;
-}
-
-#ifdef CONFIG_DEBUG_RODATA
-
-void mark_rodata_ro(void)
-{
-       unsigned long start = PFN_ALIGN(_text);
-       unsigned long size = PFN_ALIGN(_etext) - start;
-
-#ifndef CONFIG_KPROBES
-#ifdef CONFIG_HOTPLUG_CPU
-       /* It must still be possible to apply SMP alternatives. */
-       if (num_possible_cpus() <= 1)
-#endif
-       {
-               change_page_attr(virt_to_page(start),
-                                size >> PAGE_SHIFT, PAGE_KERNEL_RX);
-               printk("Write protecting the kernel text: %luk\n", size >> 10);
-       }
-#endif
-       start += size;
-       size = (unsigned long)__end_rodata - start;
-       change_page_attr(virt_to_page(start),
-                        size >> PAGE_SHIFT, PAGE_KERNEL_RO);
-       printk("Write protecting the kernel read-only data: %luk\n",
-              size >> 10);
-
-       /*
-        * change_page_attr() requires a global_flush_tlb() call after it.
-        * We do this after the printk so that if something went wrong in the
-        * change, the printk gets out at least to give a better debug hint
-        * of who is the culprit.
-        */
-       global_flush_tlb();
-}
-#endif
-
-void free_init_pages(char *what, unsigned long begin, unsigned long end)
-{
-       unsigned long addr;
-
-       for (addr = begin; addr < end; addr += PAGE_SIZE) {
-               ClearPageReserved(virt_to_page(addr));
-               init_page_count(virt_to_page(addr));
-               memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
-               free_page(addr);
-               totalram_pages++;
-       }
-       printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
-}
-
-void free_initmem(void)
-{
-       free_init_pages("unused kernel memory",
-                       (unsigned long)(&__init_begin),
-                       (unsigned long)(&__init_end));
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-       free_init_pages("initrd memory", start, end);
-}
-#endif
-
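kernel_physical_mapping_init() above maps lowmem with one PMD entry per
PTRS_PER_PTE small pages when PSE is available, and falls back to per-page
PTEs otherwise. A rough stand-alone sketch of that granularity, assuming the
non-PAE constants (4 KiB pages, 1024 PTEs per table) and an illustrative
896 MiB of lowmem:

/* Sketch: how many PMD-level (PSE) mappings cover a given amount of lowmem.
 * The constants are assumptions for the demo, not taken from the patch. */
#include <stdio.h>

#define PAGE_SIZE     4096UL
#define PTRS_PER_PTE  1024UL
#define PMD_COVERAGE  (PAGE_SIZE * PTRS_PER_PTE)   /* 4 MiB per PMD entry */

int main(void)
{
	unsigned long lowmem = 896UL << 20;            /* typical i386 lowmem */
	unsigned long max_low_pfn = lowmem / PAGE_SIZE;
	unsigned long pmds = (lowmem + PMD_COVERAGE - 1) / PMD_COVERAGE;

	printf("max_low_pfn=%lu -> %lu large-page (PSE) mappings\n",
	       max_low_pfn, pmds);
	return 0;
}
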
diff --git a/arch/i386/mm/ioremap_32.c b/arch/i386/mm/ioremap_32.c
deleted file mode 100644 (file)
index 0b27831..0000000
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * arch/i386/mm/ioremap.c
- *
- * Re-map IO memory to kernel address space so that we can access it.
- * This is needed for high PCI addresses that aren't mapped in the
- * 640k-1MB IO memory area on PC's
- *
- * (C) Copyright 1995 1996 Linus Torvalds
- */
-
-#include <linux/vmalloc.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/io.h>
-#include <asm/fixmap.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-#include <asm/pgtable.h>
-
-#define ISA_START_ADDRESS      0xa0000
-#define ISA_END_ADDRESS                0x100000
-
-/*
- * Generic mapping function (not visible outside):
- */
-
-/*
- * Remap an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access high addresses
- * directly.
- *
- * NOTE! We need to allow non-page-aligned mappings too: we will obviously
- * have to convert them into an offset in a page-aligned mapping, but the
- * caller shouldn't need to know that small detail.
- */
-void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
-{
-       void __iomem * addr;
-       struct vm_struct * area;
-       unsigned long offset, last_addr;
-       pgprot_t prot;
-
-       /* Don't allow wraparound or zero size */
-       last_addr = phys_addr + size - 1;
-       if (!size || last_addr < phys_addr)
-               return NULL;
-
-       /*
-        * Don't remap the low PCI/ISA area, it's always mapped..
-        */
-       if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
-               return (void __iomem *) phys_to_virt(phys_addr);
-
-       /*
-        * Don't allow anybody to remap normal RAM that we're using..
-        */
-       if (phys_addr <= virt_to_phys(high_memory - 1)) {
-               char *t_addr, *t_end;
-               struct page *page;
-
-               t_addr = __va(phys_addr);
-               t_end = t_addr + (size - 1);
-          
-               for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
-                       if(!PageReserved(page))
-                               return NULL;
-       }
-
-       prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY
-                       | _PAGE_ACCESSED | flags);
-
-       /*
-        * Mappings have to be page-aligned
-        */
-       offset = phys_addr & ~PAGE_MASK;
-       phys_addr &= PAGE_MASK;
-       size = PAGE_ALIGN(last_addr+1) - phys_addr;
-
-       /*
-        * Ok, go for it..
-        */
-       area = get_vm_area(size, VM_IOREMAP | (flags << 20));
-       if (!area)
-               return NULL;
-       area->phys_addr = phys_addr;
-       addr = (void __iomem *) area->addr;
-       if (ioremap_page_range((unsigned long) addr,
-                       (unsigned long) addr + size, phys_addr, prot)) {
-               vunmap((void __force *) addr);
-               return NULL;
-       }
-       return (void __iomem *) (offset + (char __iomem *)addr);
-}
-EXPORT_SYMBOL(__ioremap);
-
-/**
- * ioremap_nocache     -   map bus memory into CPU space
- * @phys_addr: bus address of the memory
- * @size:      size of the resource to map
- *
- * ioremap_nocache performs a platform specific sequence of operations to
- * make bus memory CPU accessible via the readb/readw/readl/writeb/
- * writew/writel functions and the other mmio helpers. The returned
- * address is not guaranteed to be usable directly as a virtual
- * address. 
- *
- * This version of ioremap ensures that the memory is marked uncacheable
- * on the CPU as well as honouring existing caching rules from things like
- * the PCI bus. Note that there are other caches and buffers on many
- * busses. In particular, driver authors should read up on PCI writes.
- *
- * It's useful if some control registers are in such an area and
- * write combining or read caching is not desirable.
- * 
- * Must be freed with iounmap.
- */
-
-void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
-{
-       unsigned long last_addr;
-       void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
-       if (!p) 
-               return p; 
-
-       /* Guaranteed to be > phys_addr, as per __ioremap() */
-       last_addr = phys_addr + size - 1;
-
-       if (last_addr < virt_to_phys(high_memory) - 1) {
-               struct page *ppage = virt_to_page(__va(phys_addr));             
-               unsigned long npages;
-
-               phys_addr &= PAGE_MASK;
-
-               /* This might overflow and become zero.. */
-               last_addr = PAGE_ALIGN(last_addr);
-
-               /* .. but that's ok, because modulo-2**n arithmetic will make
-               * the page-aligned "last - first" come out right.
-               */
-               npages = (last_addr - phys_addr) >> PAGE_SHIFT;
-
-               if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { 
-                       iounmap(p); 
-                       p = NULL;
-               }
-               global_flush_tlb();
-       }
-
-       return p;                                       
-}
-EXPORT_SYMBOL(ioremap_nocache);
-
-/**
- * iounmap - Free an IO remapping
- * @addr: virtual address from ioremap_*
- *
- * Caller must ensure there is only one unmapping for the same pointer.
- */
-void iounmap(volatile void __iomem *addr)
-{
-       struct vm_struct *p, *o;
-
-       if ((void __force *)addr <= high_memory)
-               return;
-
-       /*
-        * __ioremap special-cases the PCI/ISA range by not instantiating a
-        * vm_area and by simply returning an address into the kernel mapping
-        * of ISA space.   So handle that here.
-        */
-       if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
-                       addr < phys_to_virt(ISA_END_ADDRESS))
-               return;
-
-       addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
-
-       /* Use the vm area unlocked, assuming the caller
-          ensures there isn't another iounmap for the same address
-          in parallel. Reuse of the virtual address is prevented by
-          leaving it in the global lists until we're done with it.
-          cpa takes care of the direct mappings. */
-       read_lock(&vmlist_lock);
-       for (p = vmlist; p; p = p->next) {
-               if (p->addr == addr)
-                       break;
-       }
-       read_unlock(&vmlist_lock);
-
-       if (!p) {
-               printk("iounmap: bad address %p\n", addr);
-               dump_stack();
-               return;
-       }
-
-       /* Reset the direct mapping. Can block */
-       if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) {
-               change_page_attr(virt_to_page(__va(p->phys_addr)),
-                                get_vm_area_size(p) >> PAGE_SHIFT,
-                                PAGE_KERNEL);
-               global_flush_tlb();
-       } 
-
-       /* Finally remove it */
-       o = remove_vm_area((void *)addr);
-       BUG_ON(p != o || o == NULL);
-       kfree(p); 
-}
-EXPORT_SYMBOL(iounmap);
-
-void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
-{
-       unsigned long offset, last_addr;
-       unsigned int nrpages;
-       enum fixed_addresses idx;
-
-       /* Don't allow wraparound or zero size */
-       last_addr = phys_addr + size - 1;
-       if (!size || last_addr < phys_addr)
-               return NULL;
-
-       /*
-        * Don't remap the low PCI/ISA area, it's always mapped..
-        */
-       if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
-               return phys_to_virt(phys_addr);
-
-       /*
-        * Mappings have to be page-aligned
-        */
-       offset = phys_addr & ~PAGE_MASK;
-       phys_addr &= PAGE_MASK;
-       size = PAGE_ALIGN(last_addr) - phys_addr;
-
-       /*
-        * Mappings have to fit in the FIX_BTMAP area.
-        */
-       nrpages = size >> PAGE_SHIFT;
-       if (nrpages > NR_FIX_BTMAPS)
-               return NULL;
-
-       /*
-        * Ok, go for it..
-        */
-       idx = FIX_BTMAP_BEGIN;
-       while (nrpages > 0) {
-               set_fixmap(idx, phys_addr);
-               phys_addr += PAGE_SIZE;
-               --idx;
-               --nrpages;
-       }
-       return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
-}
-
-void __init bt_iounmap(void *addr, unsigned long size)
-{
-       unsigned long virt_addr;
-       unsigned long offset;
-       unsigned int nrpages;
-       enum fixed_addresses idx;
-
-       virt_addr = (unsigned long)addr;
-       if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
-               return;
-       offset = virt_addr & ~PAGE_MASK;
-       nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
-
-       idx = FIX_BTMAP_BEGIN;
-       while (nrpages > 0) {
-               clear_fixmap(idx);
-               --idx;
-               --nrpages;
-       }
-}
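For context, a hedged sketch of how a driver would typically consume the
interface above: map a region uncached with ioremap_nocache(), access it
through the mmio helpers, and release it with iounmap(). The physical address
and register offset below are hypothetical:

/* Hypothetical driver fragment; the BAR address and offset are made up. */
#include <linux/io.h>
#include <linux/errno.h>

static void __iomem *regs;

static int example_map(void)
{
	regs = ioremap_nocache(0xfebf0000UL, 0x1000);  /* hypothetical BAR */
	if (!regs)
		return -ENOMEM;
	writel(0x1, regs + 0x04);                      /* hypothetical register */
	return 0;
}

static void example_unmap(void)
{
	iounmap(regs);
}
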
diff --git a/arch/i386/mm/mmap_32.c b/arch/i386/mm/mmap_32.c
deleted file mode 100644 (file)
index 552e084..0000000
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- *  linux/arch/i386/mm/mmap.c
- *
- *  flexible mmap layout support
- *
- * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- *
- * Started by Ingo Molnar <mingo@elte.hu>
- */
-
-#include <linux/personality.h>
-#include <linux/mm.h>
-#include <linux/random.h>
-#include <linux/sched.h>
-
-/*
- * Top of mmap area (just below the process stack).
- *
- * Leave at least a ~128 MB hole.
- */
-#define MIN_GAP (128*1024*1024)
-#define MAX_GAP (TASK_SIZE/6*5)
-
-static inline unsigned long mmap_base(struct mm_struct *mm)
-{
-       unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
-       unsigned long random_factor = 0;
-
-       if (current->flags & PF_RANDOMIZE)
-               random_factor = get_random_int() % (1024*1024);
-
-       if (gap < MIN_GAP)
-               gap = MIN_GAP;
-       else if (gap > MAX_GAP)
-               gap = MAX_GAP;
-
-       return PAGE_ALIGN(TASK_SIZE - gap - random_factor);
-}
-
-/*
- * This function, called very early during the creation of a new
- * process VM image, sets up which VM layout function to use:
- */
-void arch_pick_mmap_layout(struct mm_struct *mm)
-{
-       /*
-        * Fall back to the standard layout if the personality
-        * bit is set, or if the expected stack growth is unlimited:
-        */
-       if (sysctl_legacy_va_layout ||
-                       (current->personality & ADDR_COMPAT_LAYOUT) ||
-                       current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) {
-               mm->mmap_base = TASK_UNMAPPED_BASE;
-               mm->get_unmapped_area = arch_get_unmapped_area;
-               mm->unmap_area = arch_unmap_area;
-       } else {
-               mm->mmap_base = mmap_base(mm);
-               mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-               mm->unmap_area = arch_unmap_area_topdown;
-       }
-}
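The clamp performed by mmap_base() above is easiest to see with numbers: the
stack rlimit is bounded to [MIN_GAP, MAX_GAP] and subtracted from TASK_SIZE.
A stand-alone sketch assuming the i386 defaults (3 GiB TASK_SIZE, 128 MiB
minimum gap) and example rlimit values, with randomization omitted:

/* Sketch of the mmap_base() clamp; inputs are illustrative only. */
#include <stdio.h>

#define PAGE_SIZE      4096UL
#define PAGE_ALIGN(x)  (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define TASK_SIZE      0xC0000000UL
#define MIN_GAP        (128UL * 1024 * 1024)
#define MAX_GAP        (TASK_SIZE / 6 * 5)

static unsigned long mmap_base(unsigned long stack_rlimit)
{
	unsigned long gap = stack_rlimit;

	if (gap < MIN_GAP)
		gap = MIN_GAP;
	else if (gap > MAX_GAP)
		gap = MAX_GAP;
	return PAGE_ALIGN(TASK_SIZE - gap);   /* random factor omitted */
}

int main(void)
{
	printf("8 MiB stack rlimit -> base 0x%08lx\n", mmap_base(8UL << 20));
	printf("huge stack rlimit  -> base 0x%08lx\n", mmap_base(~0UL));
	return 0;
}
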
diff --git a/arch/i386/mm/pageattr_32.c b/arch/i386/mm/pageattr_32.c
deleted file mode 100644 (file)
index 4241a74..0000000
+++ /dev/null
@@ -1,278 +0,0 @@
-/* 
- * Copyright 2002 Andi Kleen, SuSE Labs. 
- * Thanks to Ben LaHaise for precious feedback.
- */ 
-
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <asm/uaccess.h>
-#include <asm/processor.h>
-#include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
-#include <asm/sections.h>
-
-static DEFINE_SPINLOCK(cpa_lock);
-static struct list_head df_list = LIST_HEAD_INIT(df_list);
-
-
-pte_t *lookup_address(unsigned long address) 
-{ 
-       pgd_t *pgd = pgd_offset_k(address);
-       pud_t *pud;
-       pmd_t *pmd;
-       if (pgd_none(*pgd))
-               return NULL;
-       pud = pud_offset(pgd, address);
-       if (pud_none(*pud))
-               return NULL;
-       pmd = pmd_offset(pud, address);
-       if (pmd_none(*pmd))
-               return NULL;
-       if (pmd_large(*pmd))
-               return (pte_t *)pmd;
-        return pte_offset_kernel(pmd, address);
-} 
-
-static struct page *split_large_page(unsigned long address, pgprot_t prot,
-                                       pgprot_t ref_prot)
-{ 
-       int i; 
-       unsigned long addr;
-       struct page *base;
-       pte_t *pbase;
-
-       spin_unlock_irq(&cpa_lock);
-       base = alloc_pages(GFP_KERNEL, 0);
-       spin_lock_irq(&cpa_lock);
-       if (!base) 
-               return NULL;
-
-       /*
-        * page_private is used to track the number of entries in
-        * the page table page that have non standard attributes.
-        */
-       SetPagePrivate(base);
-       page_private(base) = 0;
-
-       address = __pa(address);
-       addr = address & LARGE_PAGE_MASK; 
-       pbase = (pte_t *)page_address(base);
-       paravirt_alloc_pt(&init_mm, page_to_pfn(base));
-       for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
-               set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
-                                          addr == address ? prot : ref_prot));
-       }
-       return base;
-} 
-
-static void cache_flush_page(struct page *p)
-{ 
-       unsigned long adr = (unsigned long)page_address(p);
-       int i;
-       for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
-               asm volatile("clflush (%0)" :: "r" (adr + i));
-}
-
-static void flush_kernel_map(void *arg)
-{
-       struct list_head *lh = (struct list_head *)arg;
-       struct page *p;
-
-       /* High level code is not ready for clflush yet */
-       if (0 && cpu_has_clflush) {
-               list_for_each_entry (p, lh, lru)
-                       cache_flush_page(p);
-       } else if (boot_cpu_data.x86_model >= 4)
-               wbinvd();
-
-       /* Flush everything to work around errata in early Athlons regarding
-        * large page flushing. 
-        */
-       __flush_tlb_all();      
-}
-
-static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) 
-{ 
-       struct page *page;
-       unsigned long flags;
-
-       set_pte_atomic(kpte, pte);      /* change init_mm */
-       if (SHARED_KERNEL_PMD)
-               return;
-
-       spin_lock_irqsave(&pgd_lock, flags);
-       for (page = pgd_list; page; page = (struct page *)page->index) {
-               pgd_t *pgd;
-               pud_t *pud;
-               pmd_t *pmd;
-               pgd = (pgd_t *)page_address(page) + pgd_index(address);
-               pud = pud_offset(pgd, address);
-               pmd = pmd_offset(pud, address);
-               set_pte_atomic((pte_t *)pmd, pte);
-       }
-       spin_unlock_irqrestore(&pgd_lock, flags);
-}
-
-/* 
- * No more special protections in this 2/4MB area - revert to a
- * large page again. 
- */
-static inline void revert_page(struct page *kpte_page, unsigned long address)
-{
-       pgprot_t ref_prot;
-       pte_t *linear;
-
-       ref_prot =
-       ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
-               ? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE;
-
-       linear = (pte_t *)
-               pmd_offset(pud_offset(pgd_offset_k(address), address), address);
-       set_pmd_pte(linear,  address,
-                   pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
-                           ref_prot));
-}
-
-static inline void save_page(struct page *kpte_page)
-{
-       if (!test_and_set_bit(PG_arch_1, &kpte_page->flags))
-               list_add(&kpte_page->lru, &df_list);
-}
-
-static int
-__change_page_attr(struct page *page, pgprot_t prot)
-{ 
-       pte_t *kpte; 
-       unsigned long address;
-       struct page *kpte_page;
-
-       BUG_ON(PageHighMem(page));
-       address = (unsigned long)page_address(page);
-
-       kpte = lookup_address(address);
-       if (!kpte)
-               return -EINVAL;
-       kpte_page = virt_to_page(kpte);
-       BUG_ON(PageLRU(kpte_page));
-       BUG_ON(PageCompound(kpte_page));
-
-       if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { 
-               if (!pte_huge(*kpte)) {
-                       set_pte_atomic(kpte, mk_pte(page, prot)); 
-               } else {
-                       pgprot_t ref_prot;
-                       struct page *split;
-
-                       ref_prot =
-                       ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
-                               ? PAGE_KERNEL_EXEC : PAGE_KERNEL;
-                       split = split_large_page(address, prot, ref_prot);
-                       if (!split)
-                               return -ENOMEM;
-                       set_pmd_pte(kpte,address,mk_pte(split, ref_prot));
-                       kpte_page = split;
-               }
-               page_private(kpte_page)++;
-       } else if (!pte_huge(*kpte)) {
-               set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
-               BUG_ON(page_private(kpte_page) == 0);
-               page_private(kpte_page)--;
-       } else
-               BUG();
-
-       /*
-        * If the pte was reserved, it means it was created at boot
-        * time (not via split_large_page) and in turn we must not
-        * replace it with a largepage.
-        */
-
-       save_page(kpte_page);
-       if (!PageReserved(kpte_page)) {
-               if (cpu_has_pse && (page_private(kpte_page) == 0)) {
-                       paravirt_release_pt(page_to_pfn(kpte_page));
-                       revert_page(kpte_page, address);
-               }
-       }
-       return 0;
-} 
-
-static inline void flush_map(struct list_head *l)
-{
-       on_each_cpu(flush_kernel_map, l, 1, 1);
-}
-
-/*
- * Change the page attributes of a page in the linear mapping.
- *
- * This should be used when a page is mapped with a different caching policy
- * than write-back somewhere - some CPUs do not like it when mappings with
- * different caching policies exist. This changes the page attributes of
- * the kernel's linear mapping too.
- * 
- * The caller needs to ensure that there are no conflicting mappings elsewhere.
- * This function only deals with the kernel linear map.
- * 
- * Caller must call global_flush_tlb() after this.
- */
-int change_page_attr(struct page *page, int numpages, pgprot_t prot)
-{
-       int err = 0; 
-       int i; 
-       unsigned long flags;
-
-       spin_lock_irqsave(&cpa_lock, flags);
-       for (i = 0; i < numpages; i++, page++) { 
-               err = __change_page_attr(page, prot);
-               if (err) 
-                       break; 
-       }       
-       spin_unlock_irqrestore(&cpa_lock, flags);
-       return err;
-}
-
-void global_flush_tlb(void)
-{
-       struct list_head l;
-       struct page *pg, *next;
-
-       BUG_ON(irqs_disabled());
-
-       spin_lock_irq(&cpa_lock);
-       list_replace_init(&df_list, &l);
-       spin_unlock_irq(&cpa_lock);
-       flush_map(&l);
-       list_for_each_entry_safe(pg, next, &l, lru) {
-               list_del(&pg->lru);
-               clear_bit(PG_arch_1, &pg->flags);
-               if (PageReserved(pg) || !cpu_has_pse || page_private(pg) != 0)
-                       continue;
-               ClearPagePrivate(pg);
-               __free_page(pg);
-       }
-}
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
-void kernel_map_pages(struct page *page, int numpages, int enable)
-{
-       if (PageHighMem(page))
-               return;
-       if (!enable)
-               debug_check_no_locks_freed(page_address(page),
-                                          numpages * PAGE_SIZE);
-
-       /* the return value is ignored - the calls cannot fail,
-        * large pages are disabled at boot time.
-        */
-       change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
-       /* We should perform an IPI and flush all TLBs,
-        * but that can deadlock, so flush only the current CPU.
-        */
-       __flush_tlb_all();
-}
-#endif
-
-EXPORT_SYMBOL(change_page_attr);
-EXPORT_SYMBOL(global_flush_tlb);
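change_page_attr() above requires every caller to follow up with
global_flush_tlb(), as its comment states. A hedged kernel-side sketch of that
calling pattern, with the page and protection chosen purely for illustration:

/* Hypothetical caller: make one kernel page read-only, then flush. */
#include <linux/mm.h>
#include <asm/cacheflush.h>
#include <asm/pgtable.h>

static int example_protect(unsigned long kaddr)
{
	int err = change_page_attr(virt_to_page(kaddr), 1, PAGE_KERNEL_RO);

	/* Required by the API: flush after changing attributes. */
	global_flush_tlb();
	return err;
}
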
diff --git a/arch/i386/mm/pgtable_32.c b/arch/i386/mm/pgtable_32.c
deleted file mode 100644 (file)
index 01437c4..0000000
+++ /dev/null
@@ -1,373 +0,0 @@
-/*
- *  linux/arch/i386/mm/pgtable.c
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/mm.h>
-#include <linux/swap.h>
-#include <linux/smp.h>
-#include <linux/highmem.h>
-#include <linux/slab.h>
-#include <linux/pagemap.h>
-#include <linux/spinlock.h>
-#include <linux/module.h>
-#include <linux/quicklist.h>
-
-#include <asm/system.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
-#include <asm/fixmap.h>
-#include <asm/e820.h>
-#include <asm/tlb.h>
-#include <asm/tlbflush.h>
-
-void show_mem(void)
-{
-       int total = 0, reserved = 0;
-       int shared = 0, cached = 0;
-       int highmem = 0;
-       struct page *page;
-       pg_data_t *pgdat;
-       unsigned long i;
-       unsigned long flags;
-
-       printk(KERN_INFO "Mem-info:\n");
-       show_free_areas();
-       printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
-       for_each_online_pgdat(pgdat) {
-               pgdat_resize_lock(pgdat, &flags);
-               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
-                       page = pgdat_page_nr(pgdat, i);
-                       total++;
-                       if (PageHighMem(page))
-                               highmem++;
-                       if (PageReserved(page))
-                               reserved++;
-                       else if (PageSwapCache(page))
-                               cached++;
-                       else if (page_count(page))
-                               shared += page_count(page) - 1;
-               }
-               pgdat_resize_unlock(pgdat, &flags);
-       }
-       printk(KERN_INFO "%d pages of RAM\n", total);
-       printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
-       printk(KERN_INFO "%d reserved pages\n", reserved);
-       printk(KERN_INFO "%d pages shared\n", shared);
-       printk(KERN_INFO "%d pages swap cached\n", cached);
-
-       printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
-       printk(KERN_INFO "%lu pages writeback\n",
-                                       global_page_state(NR_WRITEBACK));
-       printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
-       printk(KERN_INFO "%lu pages slab\n",
-               global_page_state(NR_SLAB_RECLAIMABLE) +
-               global_page_state(NR_SLAB_UNRECLAIMABLE));
-       printk(KERN_INFO "%lu pages pagetables\n",
-                                       global_page_state(NR_PAGETABLE));
-}
-
-/*
- * Associate a virtual page frame with a given physical page frame 
- * and protection flags for that frame.
- */ 
-static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
-{
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
-
-       pgd = swapper_pg_dir + pgd_index(vaddr);
-       if (pgd_none(*pgd)) {
-               BUG();
-               return;
-       }
-       pud = pud_offset(pgd, vaddr);
-       if (pud_none(*pud)) {
-               BUG();
-               return;
-       }
-       pmd = pmd_offset(pud, vaddr);
-       if (pmd_none(*pmd)) {
-               BUG();
-               return;
-       }
-       pte = pte_offset_kernel(pmd, vaddr);
-       if (pgprot_val(flags))
-               /* <pfn,flags> stored as-is, to permit clearing entries */
-               set_pte(pte, pfn_pte(pfn, flags));
-       else
-               pte_clear(&init_mm, vaddr, pte);
-
-       /*
-        * It's enough to flush this one mapping.
-        * (PGE mappings get flushed as well)
-        */
-       __flush_tlb_one(vaddr);
-}
-
-/*
- * Associate a large virtual page frame with a given physical page frame 
- * and protection flags for that frame. pfn is for the base of the page,
- * vaddr is what the page gets mapped to - both must be properly aligned. 
- * The pmd must already be instantiated. Assumes PAE mode.
- */ 
-void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
-{
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-
-       if (vaddr & (PMD_SIZE-1)) {             /* vaddr is misaligned */
-               printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
-               return; /* BUG(); */
-       }
-       if (pfn & (PTRS_PER_PTE-1)) {           /* pfn is misaligned */
-               printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
-               return; /* BUG(); */
-       }
-       pgd = swapper_pg_dir + pgd_index(vaddr);
-       if (pgd_none(*pgd)) {
-               printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
-               return; /* BUG(); */
-       }
-       pud = pud_offset(pgd, vaddr);
-       pmd = pmd_offset(pud, vaddr);
-       set_pmd(pmd, pfn_pmd(pfn, flags));
-       /*
-        * It's enough to flush this one mapping.
-        * (PGE mappings get flushed as well)
-        */
-       __flush_tlb_one(vaddr);
-}
-
-static int fixmaps;
-unsigned long __FIXADDR_TOP = 0xfffff000;
-EXPORT_SYMBOL(__FIXADDR_TOP);
-
-void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
-{
-       unsigned long address = __fix_to_virt(idx);
-
-       if (idx >= __end_of_fixed_addresses) {
-               BUG();
-               return;
-       }
-       set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
-       fixmaps++;
-}
-
-/**
- * reserve_top_address - reserves a hole in the top of kernel address space
- * @reserve - size of hole to reserve
- *
- * Can be used to relocate the fixmap area and poke a hole in the top
- * of kernel address space to make room for a hypervisor.
- */
-void reserve_top_address(unsigned long reserve)
-{
-       BUG_ON(fixmaps > 0);
-       printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
-              (int)-reserve);
-       __FIXADDR_TOP = -reserve - PAGE_SIZE;
-       __VMALLOC_RESERVE += reserve;
-}
-
-pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
-{
-       return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
-}
-
-struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
-{
-       struct page *pte;
-
-#ifdef CONFIG_HIGHPTE
-       pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
-#else
-       pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
-#endif
-       return pte;
-}
-
-void pmd_ctor(void *pmd, struct kmem_cache *cache, unsigned long flags)
-{
-       memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
-}
-
-/*
- * List of all pgd's needed for non-PAE so it can invalidate entries
- * in both cached and uncached pgd's; not needed for PAE since the
- * kernel pmd is shared. If PAE were not to share the pmd a similar
- * tactic would be needed. This is essentially codepath-based locking
- * against pageattr.c; it is the unique case in which a valid change
- * of kernel pagetables can't be lazily synchronized by vmalloc faults.
- * vmalloc faults work because attached pagetables are never freed.
- * -- wli
- */
-DEFINE_SPINLOCK(pgd_lock);
-struct page *pgd_list;
-
-static inline void pgd_list_add(pgd_t *pgd)
-{
-       struct page *page = virt_to_page(pgd);
-       page->index = (unsigned long)pgd_list;
-       if (pgd_list)
-               set_page_private(pgd_list, (unsigned long)&page->index);
-       pgd_list = page;
-       set_page_private(page, (unsigned long)&pgd_list);
-}
-
-static inline void pgd_list_del(pgd_t *pgd)
-{
-       struct page *next, **pprev, *page = virt_to_page(pgd);
-       next = (struct page *)page->index;
-       pprev = (struct page **)page_private(page);
-       *pprev = next;
-       if (next)
-               set_page_private(next, (unsigned long)pprev);
-}
-
-
-
-#if (PTRS_PER_PMD == 1)
-/* Non-PAE pgd constructor */
-static void pgd_ctor(void *pgd)
-{
-       unsigned long flags;
-
-       /* !PAE, no pagetable sharing */
-       memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
-
-       spin_lock_irqsave(&pgd_lock, flags);
-
-       /* must happen under lock */
-       clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
-                       swapper_pg_dir + USER_PTRS_PER_PGD,
-                       KERNEL_PGD_PTRS);
-       paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
-                               __pa(swapper_pg_dir) >> PAGE_SHIFT,
-                               USER_PTRS_PER_PGD,
-                               KERNEL_PGD_PTRS);
-       pgd_list_add(pgd);
-       spin_unlock_irqrestore(&pgd_lock, flags);
-}
-#else  /* PTRS_PER_PMD > 1 */
-/* PAE pgd constructor */
-static void pgd_ctor(void *pgd)
-{
-       /* PAE, kernel PMD may be shared */
-
-       if (SHARED_KERNEL_PMD) {
-               clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
-                               swapper_pg_dir + USER_PTRS_PER_PGD,
-                               KERNEL_PGD_PTRS);
-       } else {
-               unsigned long flags;
-
-               memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
-               spin_lock_irqsave(&pgd_lock, flags);
-               pgd_list_add(pgd);
-               spin_unlock_irqrestore(&pgd_lock, flags);
-       }
-}
-#endif /* PTRS_PER_PMD */
-
-static void pgd_dtor(void *pgd)
-{
-       unsigned long flags; /* can be called from interrupt context */
-
-       if (SHARED_KERNEL_PMD)
-               return;
-
-       paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
-       spin_lock_irqsave(&pgd_lock, flags);
-       pgd_list_del(pgd);
-       spin_unlock_irqrestore(&pgd_lock, flags);
-}
-
-#define UNSHARED_PTRS_PER_PGD                          \
-       (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
-
-/* If we allocate a pmd for part of the kernel address space, then
-   make sure it's initialized with the appropriate kernel mappings.
-   Otherwise use a cached zeroed pmd.  */
-static pmd_t *pmd_cache_alloc(int idx)
-{
-       pmd_t *pmd;
-
-       if (idx >= USER_PTRS_PER_PGD) {
-               pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
-
-               if (pmd)
-                       memcpy(pmd,
-                              (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
-                              sizeof(pmd_t) * PTRS_PER_PMD);
-       } else
-               pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
-
-       return pmd;
-}
-
-static void pmd_cache_free(pmd_t *pmd, int idx)
-{
-       if (idx >= USER_PTRS_PER_PGD)
-               free_page((unsigned long)pmd);
-       else
-               kmem_cache_free(pmd_cache, pmd);
-}
-
-pgd_t *pgd_alloc(struct mm_struct *mm)
-{
-       int i;
-       pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
-
-       if (PTRS_PER_PMD == 1 || !pgd)
-               return pgd;
-
-       for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
-               pmd_t *pmd = pmd_cache_alloc(i);
-
-               if (!pmd)
-                       goto out_oom;
-
-               paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
-               set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
-       }
-       return pgd;
-
-out_oom:
-       for (i--; i >= 0; i--) {
-               pgd_t pgdent = pgd[i];
-               void* pmd = (void *)__va(pgd_val(pgdent)-1);
-               paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
-               pmd_cache_free(pmd, i);
-       }
-       quicklist_free(0, pgd_dtor, pgd);
-       return NULL;
-}
-
-void pgd_free(pgd_t *pgd)
-{
-       int i;
-
-       /* in the PAE case user pgd entries are overwritten before usage */
-       if (PTRS_PER_PMD > 1)
-               for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
-                       pgd_t pgdent = pgd[i];
-                       void* pmd = (void *)__va(pgd_val(pgdent)-1);
-                       paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
-                       pmd_cache_free(pmd, i);
-               }
-       /* in the non-PAE case, free_pgtables() clears user pgd entries */
-       quicklist_free(0, pgd_dtor, pgd);
-}
-
-void check_pgt_cache(void)
-{
-       quicklist_trim(0, pgd_dtor, 25, 16);
-}
-
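
A minimal sketch, not part of this patch: the pgd list above is an intrusive list threaded through each pgd's struct page, with page->index acting as the "next" pointer and page_private() recording the address of whichever pointer currently points at that page, so pgd_list_del() can unlink in O(1) without walking the list. A hypothetical debug walk (caller would hold pgd_lock) that checks this invariant:

#include <linux/mm.h>

extern struct page *pgd_list;

/* hypothetical consistency check of the pgd_list invariants */
static void example_check_pgd_list(void)
{
	struct page **pprev = &pgd_list;
	struct page *p;

	for (p = pgd_list; p; p = (struct page *)p->index) {
		/* each node's private field must point back at the slot
		 * (head pointer or previous node's index) that references it */
		BUG_ON((struct page **)page_private(p) != pprev);
		pprev = (struct page **)&p->index;
	}
}
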
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
new file mode 100644 (file)
index 0000000..7317648
--- /dev/null
@@ -0,0 +1,5 @@
+ifeq ($(CONFIG_X86_32),y)
+include ${srctree}/arch/x86/mm/Makefile_32
+else
+include ${srctree}/arch/x86_64/mm/Makefile_64
+endif
diff --git a/arch/x86/mm/Makefile_32 b/arch/x86/mm/Makefile_32
new file mode 100644 (file)
index 0000000..362b4ad
--- /dev/null
@@ -0,0 +1,10 @@
+#
+# Makefile for the linux i386-specific parts of the memory manager.
+#
+
+obj-y  := init_32.o pgtable_32.o fault_32.o ioremap_32.o extable_32.o pageattr_32.o mmap_32.o
+
+obj-$(CONFIG_NUMA) += discontig_32.o
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+obj-$(CONFIG_HIGHMEM) += highmem_32.o
+obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap_32.o
diff --git a/arch/x86/mm/boot_ioremap_32.c b/arch/x86/mm/boot_ioremap_32.c
new file mode 100644 (file)
index 0000000..4de95a1
--- /dev/null
@@ -0,0 +1,100 @@
+/*
+ * arch/i386/mm/boot_ioremap.c
+ * 
+ * Re-map functions for early boot-time before paging_init() when the 
+ * boot-time pagetables are still in use
+ *
+ * Written by Dave Hansen <haveblue@us.ibm.com>
+ */
+
+
+/*
+ * We need to use the 2-level pagetable functions, but CONFIG_X86_PAE
+ * keeps that from happening.  If anyone has a better way, I'm listening.
+ *
+ * boot_pte_t is defined only if this all works correctly
+ */
+
+#undef CONFIG_X86_PAE
+#undef CONFIG_PARAVIRT
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <linux/init.h>
+#include <linux/stddef.h>
+
+/* 
+ * I'm cheating here.  It is known that the two boot PTE pages are 
+ * allocated next to each other.  I'm pretending that they're just
+ * one big array. 
+ */
+
+#define BOOT_PTE_PTRS (PTRS_PER_PTE*2)
+
+static unsigned long boot_pte_index(unsigned long vaddr) 
+{
+       return __pa(vaddr) >> PAGE_SHIFT;
+}
+
+static inline boot_pte_t* boot_vaddr_to_pte(void *address)
+{
+       boot_pte_t* boot_pg = (boot_pte_t*)pg0;
+       return &boot_pg[boot_pte_index((unsigned long)address)];
+}
+
+/*
+ * This is only for a caller who is clever enough to page-align
+ * phys_addr and virtual_source, and who also has a preference
+ * about which virtual address from which to steal ptes
+ */
+static void __boot_ioremap(unsigned long phys_addr, unsigned long nrpages, 
+                   void* virtual_source)
+{
+       boot_pte_t* pte;
+       int i;
+       char *vaddr = virtual_source;
+
+       pte = boot_vaddr_to_pte(virtual_source);
+       for (i=0; i < nrpages; i++, phys_addr += PAGE_SIZE, pte++) {
+               set_pte(pte, pfn_pte(phys_addr>>PAGE_SHIFT, PAGE_KERNEL));
+               __flush_tlb_one(&vaddr[i*PAGE_SIZE]);
+       }
+}
+
+/* the virtual space we're going to remap comes from this array */
+#define BOOT_IOREMAP_PAGES 4
+#define BOOT_IOREMAP_SIZE (BOOT_IOREMAP_PAGES*PAGE_SIZE)
+static __initdata char boot_ioremap_space[BOOT_IOREMAP_SIZE]
+                      __attribute__ ((aligned (PAGE_SIZE)));
+
+/*
+ * This only applies to things which need to ioremap before paging_init();
+ * bt_ioremap() and plain ioremap() are both useless at this point.
+ * 
+ * When used, we're still using the boot-time pagetables, which only
+ * have 2 PTE pages mapping the first 8MB
+ *
+ * There is no unmap.  The boot-time PTE pages aren't used after boot.
+ * If you really want the space back, just remap it yourself.
+ * boot_ioremap(&ioremap_space-PAGE_OFFSET, BOOT_IOREMAP_SIZE)
+ */
+__init void* boot_ioremap(unsigned long phys_addr, unsigned long size)
+{
+       unsigned long last_addr, offset;
+       unsigned int nrpages;
+       
+       last_addr = phys_addr + size - 1;
+
+       /* page align the requested address */
+       offset = phys_addr & ~PAGE_MASK;
+       phys_addr &= PAGE_MASK;
+       size = PAGE_ALIGN(last_addr) - phys_addr;
+       
+       nrpages = size >> PAGE_SHIFT;
+       if (nrpages > BOOT_IOREMAP_PAGES)
+               return NULL;
+       
+       __boot_ioremap(phys_addr, nrpages, boot_ioremap_space);
+
+       return &boot_ioremap_space[offset];
+}
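
A minimal usage sketch, not part of this patch: boot_ioremap() above can map at most BOOT_IOREMAP_PAGES (4) pages out of boot_ioremap_space before paging_init(). The physical address and the function name below are hypothetical.

#include <linux/init.h>
#include <asm/page.h>

extern void *boot_ioremap(unsigned long phys_addr, unsigned long size);

static int __init example_peek_firmware_table(void)
{
	/* map two pages of a (made-up) firmware table; requests larger
	 * than BOOT_IOREMAP_SIZE make boot_ioremap() return NULL */
	void *tbl = boot_ioremap(0xfec00000UL, 2 * PAGE_SIZE);

	if (!tbl)
		return -1;

	/* read the table here; there is no unmap - the boot-time
	 * pagetables are simply discarded after paging_init() */
	return 0;
}
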
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
new file mode 100644 (file)
index 0000000..860e912
--- /dev/null
@@ -0,0 +1,431 @@
+/*
+ * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation
+ * August 2002: added remote node KVA remap - Martin J. Bligh 
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.          
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <linux/mmzone.h>
+#include <linux/highmem.h>
+#include <linux/initrd.h>
+#include <linux/nodemask.h>
+#include <linux/module.h>
+#include <linux/kexec.h>
+#include <linux/pfn.h>
+#include <linux/swap.h>
+
+#include <asm/e820.h>
+#include <asm/setup.h>
+#include <asm/mmzone.h>
+#include <bios_ebda.h>
+
+struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
+EXPORT_SYMBOL(node_data);
+bootmem_data_t node0_bdata;
+
+/*
+ * numa interface - we expect the numa architecture specific code to have
+ *                  populated the following during initialisation:
+ *
+ * 1) node_online_map  - the map of all nodes configured (online) in the system
+ * 2) node_start_pfn   - the starting page frame number for a node
+ * 3) node_end_pfn     - the ending page frame number for a node
+ */
+unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
+unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
+
+
+#ifdef CONFIG_DISCONTIGMEM
+/*
+ * 4) physnode_map     - the mapping between a pfn and owning node
+ * physnode_map keeps track of the physical memory layout of a generic
+ * numa node on a 256Mb break (each element of the array will
+ * represent 256Mb of memory and will be marked by the node id), so
+ * if the first gig is on node 0 and the second gig is on node 1,
+ * physnode_map will contain:
+ *
+ *     physnode_map[0-3] = 0;
+ *     physnode_map[4-7] = 1;
+ *     physnode_map[8- ] = -1;
+ */
+s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
+EXPORT_SYMBOL(physnode_map);
+
+void memory_present(int nid, unsigned long start, unsigned long end)
+{
+       unsigned long pfn;
+
+       printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n",
+                       nid, start, end);
+       printk(KERN_DEBUG "  Setting physnode_map array to node %d for pfns:\n", nid);
+       printk(KERN_DEBUG "  ");
+       for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
+               physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
+               printk("%ld ", pfn);
+       }
+       printk("\n");
+}
+
+unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
+                                             unsigned long end_pfn)
+{
+       unsigned long nr_pages = end_pfn - start_pfn;
+
+       if (!nr_pages)
+               return 0;
+
+       return (nr_pages + 1) * sizeof(struct page);
+}
+#endif
+
+extern unsigned long find_max_low_pfn(void);
+extern void add_one_highpage_init(struct page *, int, int);
+extern unsigned long highend_pfn, highstart_pfn;
+
+#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
+
+unsigned long node_remap_start_pfn[MAX_NUMNODES];
+unsigned long node_remap_size[MAX_NUMNODES];
+unsigned long node_remap_offset[MAX_NUMNODES];
+void *node_remap_start_vaddr[MAX_NUMNODES];
+void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
+
+void *node_remap_end_vaddr[MAX_NUMNODES];
+void *node_remap_alloc_vaddr[MAX_NUMNODES];
+static unsigned long kva_start_pfn;
+static unsigned long kva_pages;
+/*
+ * FLAT - support for basic PC memory model with discontig enabled, essentially
+ *        a single node containing all available processors, with a flat
+ *        memory map.
+ */
+int __init get_memcfg_numa_flat(void)
+{
+       printk("NUMA - single node, flat memory mode\n");
+
+       /* Run the memory configuration and find the top of memory. */
+       find_max_pfn();
+       node_start_pfn[0] = 0;
+       node_end_pfn[0] = max_pfn;
+       memory_present(0, 0, max_pfn);
+
+        /* Indicate there is one node available. */
+       nodes_clear(node_online_map);
+       node_set_online(0);
+       return 1;
+}
+
+/*
+ * Find the highest page frame number we have available for the node
+ */
+static void __init find_max_pfn_node(int nid)
+{
+       if (node_end_pfn[nid] > max_pfn)
+               node_end_pfn[nid] = max_pfn;
+       /*
+        * if a user has given mem=XXXX, then we need to make sure 
+        * that the node _starts_ before that, too, not just ends
+        */
+       if (node_start_pfn[nid] > max_pfn)
+               node_start_pfn[nid] = max_pfn;
+       BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]);
+}
+
+/* 
+ * Allocate memory for the pg_data_t for this node via a crude pre-bootmem
+ * method.  For node zero take this from the bottom of memory; for
+ * subsequent nodes place them at node_remap_start_vaddr, which contains
+ * node local data in physically node local memory.  See setup_memory()
+ * for details.
+ */
+static void __init allocate_pgdat(int nid)
+{
+       if (nid && node_has_online_mem(nid))
+               NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
+       else {
+               NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
+               min_low_pfn += PFN_UP(sizeof(pg_data_t));
+       }
+}
+
+void *alloc_remap(int nid, unsigned long size)
+{
+       void *allocation = node_remap_alloc_vaddr[nid];
+
+       size = ALIGN(size, L1_CACHE_BYTES);
+
+       if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
+               return 0;
+
+       node_remap_alloc_vaddr[nid] += size;
+       memset(allocation, 0, size);
+
+       return allocation;
+}
+
+void __init remap_numa_kva(void)
+{
+       void *vaddr;
+       unsigned long pfn;
+       int node;
+
+       for_each_online_node(node) {
+               for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
+                       vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
+                       set_pmd_pfn((ulong) vaddr, 
+                               node_remap_start_pfn[node] + pfn, 
+                               PAGE_KERNEL_LARGE);
+               }
+       }
+}
+
+static unsigned long calculate_numa_remap_pages(void)
+{
+       int nid;
+       unsigned long size, reserve_pages = 0;
+       unsigned long pfn;
+
+       for_each_online_node(nid) {
+               unsigned old_end_pfn = node_end_pfn[nid];
+
+               /*
+                * The acpi/srat node info can show hot-add memory zones
+                * where memory could be added but not currently present.
+                */
+               if (node_start_pfn[nid] > max_pfn)
+                       continue;
+               if (node_end_pfn[nid] > max_pfn)
+                       node_end_pfn[nid] = max_pfn;
+
+               /* ensure the remap includes space for the pgdat. */
+               size = node_remap_size[nid] + sizeof(pg_data_t);
+
+               /* convert size to large (pmd size) pages, rounding up */
+               size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
+               /* now the roundup is correct, convert to PAGE_SIZE pages */
+               size = size * PTRS_PER_PTE;
+
+               /*
+                * Validate the region we are allocating only contains valid
+                * pages.
+                */
+               for (pfn = node_end_pfn[nid] - size;
+                    pfn < node_end_pfn[nid]; pfn++)
+                       if (!page_is_ram(pfn))
+                               break;
+
+               if (pfn != node_end_pfn[nid])
+                       size = 0;
+
+               printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
+                               size, nid);
+               node_remap_size[nid] = size;
+               node_remap_offset[nid] = reserve_pages;
+               reserve_pages += size;
+               printk("Shrinking node %d from %ld pages to %ld pages\n",
+                       nid, node_end_pfn[nid], node_end_pfn[nid] - size);
+
+               if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) {
+                       /*
+                        * Align node_end_pfn[] and node_remap_start_pfn[] to
+                        * pmd boundary. remap_numa_kva will barf otherwise.
+                        */
+                       printk("Shrinking node %d further by %ld pages for proper alignment\n",
+                               nid, node_end_pfn[nid] & (PTRS_PER_PTE-1));
+                       size +=  node_end_pfn[nid] & (PTRS_PER_PTE-1);
+               }
+
+               node_end_pfn[nid] -= size;
+               node_remap_start_pfn[nid] = node_end_pfn[nid];
+               shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]);
+       }
+       printk("Reserving total of %ld pages for numa KVA remap\n",
+                       reserve_pages);
+       return reserve_pages;
+}
+
+extern void setup_bootmem_allocator(void);
+unsigned long __init setup_memory(void)
+{
+       int nid;
+       unsigned long system_start_pfn, system_max_low_pfn;
+
+       /*
+        * When mapping a NUMA machine we allocate the node_mem_map arrays
+        * from node local memory.  They are then mapped directly into KVA
+        * between zone normal and vmalloc space.  Calculate the size of
+        * this space and use it to adjust the boundary between ZONE_NORMAL
+        * and ZONE_HIGHMEM.
+        */
+       find_max_pfn();
+       get_memcfg_numa();
+
+       kva_pages = calculate_numa_remap_pages();
+
+       /* partially used pages are not usable - thus round upwards */
+       system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
+
+       kva_start_pfn = find_max_low_pfn() - kva_pages;
+
+#ifdef CONFIG_BLK_DEV_INITRD
+       /* Numa kva area is below the initrd */
+       if (LOADER_TYPE && INITRD_START)
+               kva_start_pfn = PFN_DOWN(INITRD_START)  - kva_pages;
+#endif
+       kva_start_pfn -= kva_start_pfn & (PTRS_PER_PTE-1);
+
+       system_max_low_pfn = max_low_pfn = find_max_low_pfn();
+       printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
+               kva_start_pfn, max_low_pfn);
+       printk("max_pfn = %ld\n", max_pfn);
+#ifdef CONFIG_HIGHMEM
+       highstart_pfn = highend_pfn = max_pfn;
+       if (max_pfn > system_max_low_pfn)
+               highstart_pfn = system_max_low_pfn;
+       printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
+              pages_to_mb(highend_pfn - highstart_pfn));
+       num_physpages = highend_pfn;
+       high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
+#else
+       num_physpages = system_max_low_pfn;
+       high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1;
+#endif
+       printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
+                       pages_to_mb(system_max_low_pfn));
+       printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", 
+                       min_low_pfn, max_low_pfn, highstart_pfn);
+
+       printk("Low memory ends at vaddr %08lx\n",
+                       (ulong) pfn_to_kaddr(max_low_pfn));
+       for_each_online_node(nid) {
+               node_remap_start_vaddr[nid] = pfn_to_kaddr(
+                               kva_start_pfn + node_remap_offset[nid]);
+               /* Init the node remap allocator */
+               node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
+                       (node_remap_size[nid] * PAGE_SIZE);
+               node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
+                       ALIGN(sizeof(pg_data_t), PAGE_SIZE);
+
+               allocate_pgdat(nid);
+               printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
+                       (ulong) node_remap_start_vaddr[nid],
+                       (ulong) pfn_to_kaddr(highstart_pfn
+                          + node_remap_offset[nid] + node_remap_size[nid]));
+       }
+       printk("High memory starts at vaddr %08lx\n",
+                       (ulong) pfn_to_kaddr(highstart_pfn));
+       for_each_online_node(nid)
+               find_max_pfn_node(nid);
+
+       memset(NODE_DATA(0), 0, sizeof(struct pglist_data));
+       NODE_DATA(0)->bdata = &node0_bdata;
+       setup_bootmem_allocator();
+       return max_low_pfn;
+}
+
+void __init numa_kva_reserve(void)
+{
+       reserve_bootmem(PFN_PHYS(kva_start_pfn),PFN_PHYS(kva_pages));
+}
+
+void __init zone_sizes_init(void)
+{
+       int nid;
+       unsigned long max_zone_pfns[MAX_NR_ZONES];
+       memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+       max_zone_pfns[ZONE_DMA] =
+               virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+       max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+#ifdef CONFIG_HIGHMEM
+       max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
+#endif
+
+       /* If SRAT has not registered memory, register it now */
+       if (find_max_pfn_with_active_regions() == 0) {
+               for_each_online_node(nid) {
+                       if (node_has_online_mem(nid))
+                               add_active_range(nid, node_start_pfn[nid],
+                                                       node_end_pfn[nid]);
+               }
+       }
+
+       free_area_init_nodes(max_zone_pfns);
+       return;
+}
+
+void __init set_highmem_pages_init(int bad_ppro) 
+{
+#ifdef CONFIG_HIGHMEM
+       struct zone *zone;
+       struct page *page;
+
+       for_each_zone(zone) {
+               unsigned long node_pfn, zone_start_pfn, zone_end_pfn;
+
+               if (!is_highmem(zone))
+                       continue;
+
+               zone_start_pfn = zone->zone_start_pfn;
+               zone_end_pfn = zone_start_pfn + zone->spanned_pages;
+
+               printk("Initializing %s for node %d (%08lx:%08lx)\n",
+                               zone->name, zone_to_nid(zone),
+                               zone_start_pfn, zone_end_pfn);
+
+               for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) {
+                       if (!pfn_valid(node_pfn))
+                               continue;
+                       page = pfn_to_page(node_pfn);
+                       add_one_highpage_init(page, node_pfn, bad_ppro);
+               }
+       }
+       totalram_pages += totalhigh_pages;
+#endif
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int paddr_to_nid(u64 addr)
+{
+       int nid;
+       unsigned long pfn = PFN_DOWN(addr);
+
+       for_each_node(nid)
+               if (node_start_pfn[nid] <= pfn &&
+                   pfn < node_end_pfn[nid])
+                       return nid;
+
+       return -1;
+}
+
+/*
+ * This function is used to ask for a node id BEFORE memmap and mem_section
+ * initialization (pfn_to_nid() can't be used yet).
+ * If _PXM is not defined in ACPI's DSDT, the node id must be found this way.
+ */
+int memory_add_physaddr_to_nid(u64 addr)
+{
+       int nid = paddr_to_nid(addr);
+       return (nid >= 0) ? nid : 0;
+}
+
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+#endif
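
A minimal sketch, not part of this patch, of how the 256Mb-granular physnode_map documented earlier in this file resolves a pfn to its node; the real pfn_to_nid() in the mmzone headers boils down to the same division. The helper name is hypothetical.

#include <asm/mmzone.h>

/* hypothetical mirror of pfn_to_nid(): each physnode_map element
 * covers PAGES_PER_ELEMENT pages, i.e. 256Mb of physical memory */
static inline int example_pfn_to_nid(unsigned long pfn)
{
	return (int)physnode_map[pfn / PAGES_PER_ELEMENT];
}
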
diff --git a/arch/x86/mm/extable_32.c b/arch/x86/mm/extable_32.c
new file mode 100644 (file)
index 0000000..0ce4f22
--- /dev/null
@@ -0,0 +1,35 @@
+/*
+ * linux/arch/i386/mm/extable.c
+ */
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <asm/uaccess.h>
+
+int fixup_exception(struct pt_regs *regs)
+{
+       const struct exception_table_entry *fixup;
+
+#ifdef CONFIG_PNPBIOS
+       if (unlikely(SEGMENT_IS_PNP_CODE(regs->xcs)))
+       {
+               extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
+               extern u32 pnp_bios_is_utter_crap;
+               pnp_bios_is_utter_crap = 1;
+               printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
+               __asm__ volatile(
+                       "movl %0, %%esp\n\t"
+                       "jmp *%1\n\t"
+                       : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
+               panic("do_trap: can't hit this");
+       }
+#endif
+
+       fixup = search_exception_tables(regs->eip);
+       if (fixup) {
+               regs->eip = fixup->fixup;
+               return 1;
+       }
+
+       return 0;
+}
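
A hedged sketch, not part of this patch: the __ex_table entries that fixup_exception() above searches are emitted by inline assembly at each potentially-faulting instruction, in the style of the i386 __get_user_asm() pattern. The function below is hypothetical and only illustrates where the (insn, fixup) pairs come from.

#include <linux/errno.h>
#include <asm/uaccess.h>

/* hypothetical single-byte user read: label 1 is the faulting
 * instruction, label 3 is the fixup that fixup_exception() reaches
 * by rewriting regs->eip */
static int example_read_user_byte(const unsigned char __user *addr,
				  unsigned char *val)
{
	int err;
	unsigned char v;

	__asm__ __volatile__(
		"1:	movb %2,%b1\n"
		"	xorl %0,%0\n"
		"2:\n"
		".section .fixup,\"ax\"\n"
		"3:	movl %3,%0\n"
		"	jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b,3b\n"
		".previous"
		: "=r" (err), "=q" (v)
		: "m" (*addr), "i" (-EFAULT));

	*val = v;
	return err;	/* 0 on success, -EFAULT if the load faulted */
}
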
diff --git a/arch/x86/mm/fault_32.c b/arch/x86/mm/fault_32.c
new file mode 100644 (file)
index 0000000..fcb38e7
--- /dev/null
@@ -0,0 +1,657 @@
+/*
+ *  linux/arch/i386/mm/fault.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ */
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/tty.h>
+#include <linux/vt_kern.h>             /* For unblank_screen() */
+#include <linux/highmem.h>
+#include <linux/bootmem.h>             /* for max_low_pfn */
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/kdebug.h>
+
+#include <asm/system.h>
+#include <asm/desc.h>
+#include <asm/segment.h>
+
+extern void die(const char *,struct pt_regs *,long);
+
+static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
+
+int register_page_fault_notifier(struct notifier_block *nb)
+{
+       vmalloc_sync_all();
+       return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
+}
+EXPORT_SYMBOL_GPL(register_page_fault_notifier);
+
+int unregister_page_fault_notifier(struct notifier_block *nb)
+{
+       return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
+
+static inline int notify_page_fault(struct pt_regs *regs, long err)
+{
+       struct die_args args = {
+               .regs = regs,
+               .str = "page fault",
+               .err = err,
+               .trapnr = 14,
+               .signr = SIGSEGV
+       };
+       return atomic_notifier_call_chain(&notify_page_fault_chain,
+                                         DIE_PAGE_FAULT, &args);
+}
+
+/*
+ * Return EIP plus the CS segment base.  The segment limit is also
+ * adjusted, clamped to the kernel/user address space (whichever is
+ * appropriate), and returned in *eip_limit.
+ *
+ * The segment is checked, because it might have been changed by another
+ * task between the original faulting instruction and here.
+ *
+ * If CS is no longer a valid code segment, or if EIP is beyond the
+ * limit, or if it is a kernel address when CS is not a kernel segment,
+ * then the returned value will be greater than *eip_limit.
+ * 
+ * This is slow, but is very rarely executed.
+ */
+static inline unsigned long get_segment_eip(struct pt_regs *regs,
+                                           unsigned long *eip_limit)
+{
+       unsigned long eip = regs->eip;
+       unsigned seg = regs->xcs & 0xffff;
+       u32 seg_ar, seg_limit, base, *desc;
+
+       /* Unlikely, but must come before segment checks. */
+       if (unlikely(regs->eflags & VM_MASK)) {
+               base = seg << 4;
+               *eip_limit = base + 0xffff;
+               return base + (eip & 0xffff);
+       }
+
+       /* The standard kernel/user address space limit. */
+       *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
+       
+       /* By far the most common cases. */
+       if (likely(SEGMENT_IS_FLAT_CODE(seg)))
+               return eip;
+
+       /* Check the segment exists, is within the current LDT/GDT size,
+          that kernel/user (ring 0..3) has the appropriate privilege,
+          that it's a code segment, and get the limit. */
+       __asm__ ("larl %3,%0; lsll %3,%1"
+                : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
+       if ((~seg_ar & 0x9800) || eip > seg_limit) {
+               *eip_limit = 0;
+               return 1;        /* So that returned eip > *eip_limit. */
+       }
+
+       /* Get the GDT/LDT descriptor base. 
+          When you look for races in this code remember that
+          LDT and other horrors are only used in user space. */
+       if (seg & (1<<2)) {
+               /* Must lock the LDT while reading it. */
+               down(&current->mm->context.sem);
+               desc = current->mm->context.ldt;
+               desc = (void *)desc + (seg & ~7);
+       } else {
+               /* Must disable preemption while reading the GDT. */
+               desc = (u32 *)get_cpu_gdt_table(get_cpu());
+               desc = (void *)desc + (seg & ~7);
+       }
+
+       /* Decode the code segment base from the descriptor */
+       base = get_desc_base((unsigned long *)desc);
+
+       if (seg & (1<<2)) { 
+               up(&current->mm->context.sem);
+       } else
+               put_cpu();
+
+       /* Adjust EIP and segment limit, and clamp at the kernel limit.
+          It's legitimate for segments to wrap at 0xffffffff. */
+       seg_limit += base;
+       if (seg_limit < *eip_limit && seg_limit >= base)
+               *eip_limit = seg_limit;
+       return eip + base;
+}
+
+/* 
+ * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
+ * Check that here and ignore it.
+ */
+static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
+{ 
+       unsigned long limit;
+       unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
+       int scan_more = 1;
+       int prefetch = 0; 
+       int i;
+
+       for (i = 0; scan_more && i < 15; i++) { 
+               unsigned char opcode;
+               unsigned char instr_hi;
+               unsigned char instr_lo;
+
+               if (instr > (unsigned char *)limit)
+                       break;
+               if (probe_kernel_address(instr, opcode))
+                       break; 
+
+               instr_hi = opcode & 0xf0; 
+               instr_lo = opcode & 0x0f; 
+               instr++;
+
+               switch (instr_hi) { 
+               case 0x20:
+               case 0x30:
+                       /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
+                       scan_more = ((instr_lo & 7) == 0x6);
+                       break;
+                       
+               case 0x60:
+                       /* 0x64 thru 0x67 are valid prefixes in all modes. */
+                       scan_more = (instr_lo & 0xC) == 0x4;
+                       break;          
+               case 0xF0:
+                       /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
+                       scan_more = !instr_lo || (instr_lo>>1) == 1;
+                       break;                  
+               case 0x00:
+                       /* Prefetch instruction is 0x0F0D or 0x0F18 */
+                       scan_more = 0;
+                       if (instr > (unsigned char *)limit)
+                               break;
+                       if (probe_kernel_address(instr, opcode))
+                               break;
+                       prefetch = (instr_lo == 0xF) &&
+                               (opcode == 0x0D || opcode == 0x18);
+                       break;                  
+               default:
+                       scan_more = 0;
+                       break;
+               } 
+       }
+       return prefetch;
+}
+
+static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
+                             unsigned long error_code)
+{
+       if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+                    boot_cpu_data.x86 >= 6)) {
+               /* Catch an obscure case of prefetch inside an NX page. */
+               if (nx_enabled && (error_code & 16))
+                       return 0;
+               return __is_prefetch(regs, addr);
+       }
+       return 0;
+} 
+
+static noinline void force_sig_info_fault(int si_signo, int si_code,
+       unsigned long address, struct task_struct *tsk)
+{
+       siginfo_t info;
+
+       info.si_signo = si_signo;
+       info.si_errno = 0;
+       info.si_code = si_code;
+       info.si_addr = (void __user *)address;
+       force_sig_info(si_signo, &info, tsk);
+}
+
+fastcall void do_invalid_op(struct pt_regs *, unsigned long);
+
+static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
+{
+       unsigned index = pgd_index(address);
+       pgd_t *pgd_k;
+       pud_t *pud, *pud_k;
+       pmd_t *pmd, *pmd_k;
+
+       pgd += index;
+       pgd_k = init_mm.pgd + index;
+
+       if (!pgd_present(*pgd_k))
+               return NULL;
+
+       /*
+        * set_pgd(pgd, *pgd_k); here would be useless on PAE
+        * and redundant with the set_pmd() on non-PAE. As would
+        * set_pud.
+        */
+
+       pud = pud_offset(pgd, address);
+       pud_k = pud_offset(pgd_k, address);
+       if (!pud_present(*pud_k))
+               return NULL;
+
+       pmd = pmd_offset(pud, address);
+       pmd_k = pmd_offset(pud_k, address);
+       if (!pmd_present(*pmd_k))
+               return NULL;
+       if (!pmd_present(*pmd)) {
+               set_pmd(pmd, *pmd_k);
+               arch_flush_lazy_mmu_mode();
+       } else
+               BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
+       return pmd_k;
+}
+
+/*
+ * Handle a fault on the vmalloc or module mapping area
+ *
+ * This assumes no large pages in there.
+ */
+static inline int vmalloc_fault(unsigned long address)
+{
+       unsigned long pgd_paddr;
+       pmd_t *pmd_k;
+       pte_t *pte_k;
+       /*
+        * Synchronize this task's top level page-table
+        * with the 'reference' page table.
+        *
+        * Do _not_ use "current" here. We might be inside
+        * an interrupt in the middle of a task switch..
+        */
+       pgd_paddr = read_cr3();
+       pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
+       if (!pmd_k)
+               return -1;
+       pte_k = pte_offset_kernel(pmd_k, address);
+       if (!pte_present(*pte_k))
+               return -1;
+       return 0;
+}
+
+int show_unhandled_signals = 1;
+
+/*
+ * This routine handles page faults.  It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ *
+ * error_code:
+ *     bit 0 == 0 means no page found, 1 means protection fault
+ *     bit 1 == 0 means read, 1 means write
+ *     bit 2 == 0 means kernel, 1 means user-mode
+ *     bit 3 == 1 means use of reserved bit detected
+ *     bit 4 == 1 means fault was an instruction fetch
+ */
+fastcall void __kprobes do_page_fault(struct pt_regs *regs,
+                                     unsigned long error_code)
+{
+       struct task_struct *tsk;
+       struct mm_struct *mm;
+       struct vm_area_struct * vma;
+       unsigned long address;
+       int write, si_code;
+       int fault;
+
+       /* get the address */
+        address = read_cr2();
+
+       tsk = current;
+
+       si_code = SEGV_MAPERR;
+
+       /*
+        * We fault-in kernel-space virtual memory on-demand. The
+        * 'reference' page table is init_mm.pgd.
+        *
+        * NOTE! We MUST NOT take any locks for this case. We may
+        * be in an interrupt or a critical region, and should
+        * only copy the information from the master page table,
+        * nothing more.
+        *
+        * This verifies that the fault happens in kernel space
+        * (error_code & 4) == 0, and that the fault was not a
+        * protection error (error_code & 9) == 0.
+        */
+       if (unlikely(address >= TASK_SIZE)) {
+               if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
+                       return;
+               if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
+                       return;
+               /*
+                * Don't take the mm semaphore here. If we fixup a prefetch
+                * fault we could otherwise deadlock.
+                */
+               goto bad_area_nosemaphore;
+       }
+
+       if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
+               return;
+
+       /* It's safe to allow irq's after cr2 has been saved and the vmalloc
+          fault has been handled. */
+       if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
+               local_irq_enable();
+
+       mm = tsk->mm;
+
+       /*
+        * If we're in an interrupt, have no user context or are running in an
+        * atomic region then we must not take the fault..
+        */
+       if (in_atomic() || !mm)
+               goto bad_area_nosemaphore;
+
+       /* When running in the kernel we expect faults to occur only to
+        * addresses in user space.  All other faults represent errors in the
+        * kernel and should generate an OOPS.  Unfortunately, in the case of an
+        * erroneous fault occurring in a code path which already holds mmap_sem
+        * we will deadlock attempting to validate the fault against the
+        * address space.  Luckily the kernel only validly references user
+        * space from well defined areas of code, which are listed in the
+        * exceptions table.
+        *
+        * As the vast majority of faults will be valid we will only perform
+        * the source reference check when there is a possibility of a deadlock.
+        * Attempt to lock the address space, if we cannot we then validate the
+        * source.  If this is invalid we can skip the address space check,
+        * thus avoiding the deadlock.
+        */
+       if (!down_read_trylock(&mm->mmap_sem)) {
+               if ((error_code & 4) == 0 &&
+                   !search_exception_tables(regs->eip))
+                       goto bad_area_nosemaphore;
+               down_read(&mm->mmap_sem);
+       }
+
+       vma = find_vma(mm, address);
+       if (!vma)
+               goto bad_area;
+       if (vma->vm_start <= address)
+               goto good_area;
+       if (!(vma->vm_flags & VM_GROWSDOWN))
+               goto bad_area;
+       if (error_code & 4) {
+               /*
+                * Accessing the stack below %esp is always a bug.
+                * The large cushion allows instructions like enter
+                * and pusha to work.  ("enter $65535,$31" pushes
+                * 32 pointers and then decrements %esp by 65535.)
+                */
+               if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
+                       goto bad_area;
+       }
+       if (expand_stack(vma, address))
+               goto bad_area;
+/*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+good_area:
+       si_code = SEGV_ACCERR;
+       write = 0;
+       switch (error_code & 3) {
+               default:        /* 3: write, present */
+                               /* fall through */
+               case 2:         /* write, not present */
+                       if (!(vma->vm_flags & VM_WRITE))
+                               goto bad_area;
+                       write++;
+                       break;
+               case 1:         /* read, present */
+                       goto bad_area;
+               case 0:         /* read, not present */
+                       if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+                               goto bad_area;
+       }
+
+ survive:
+       /*
+        * If for any reason at all we couldn't handle the fault,
+        * make sure we exit gracefully rather than endlessly redo
+        * the fault.
+        */
+       fault = handle_mm_fault(mm, vma, address, write);
+       if (unlikely(fault & VM_FAULT_ERROR)) {
+               if (fault & VM_FAULT_OOM)
+                       goto out_of_memory;
+               else if (fault & VM_FAULT_SIGBUS)
+                       goto do_sigbus;
+               BUG();
+       }
+       if (fault & VM_FAULT_MAJOR)
+               tsk->maj_flt++;
+       else
+               tsk->min_flt++;
+
+       /*
+        * Did it hit the DOS screen memory VA from vm86 mode?
+        */
+       if (regs->eflags & VM_MASK) {
+               unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
+               if (bit < 32)
+                       tsk->thread.screen_bitmap |= 1 << bit;
+       }
+       up_read(&mm->mmap_sem);
+       return;
+
+/*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+bad_area:
+       up_read(&mm->mmap_sem);
+
+bad_area_nosemaphore:
+       /* User mode accesses just cause a SIGSEGV */
+       if (error_code & 4) {
+               /*
+                * It's possible to have interrupts off here.
+                */
+               local_irq_enable();
+
+               /* 
+                * Valid to do another page fault here because this one came 
+                * from user space.
+                */
+               if (is_prefetch(regs, address, error_code))
+                       return;
+
+               if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+                   printk_ratelimit()) {
+                       printk("%s%s[%d]: segfault at %08lx eip %08lx "
+                           "esp %08lx error %lx\n",
+                           tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
+                           tsk->comm, tsk->pid, address, regs->eip,
+                           regs->esp, error_code);
+               }
+               tsk->thread.cr2 = address;
+               /* Kernel addresses are always protection faults */
+               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
+               tsk->thread.trap_no = 14;
+               force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+               return;
+       }
+
+#ifdef CONFIG_X86_F00F_BUG
+       /*
+        * Pentium F0 0F C7 C8 bug workaround.
+        */
+       if (boot_cpu_data.f00f_bug) {
+               unsigned long nr;
+               
+               nr = (address - idt_descr.address) >> 3;
+
+               if (nr == 6) {
+                       do_invalid_op(regs, 0);
+                       return;
+               }
+       }
+#endif
+
+no_context:
+       /* Are we prepared to handle this kernel fault?  */
+       if (fixup_exception(regs))
+               return;
+
+       /* 
+        * Valid to do another page fault here, because if this fault
+        * had been triggered by is_prefetch fixup_exception would have 
+        * handled it.
+        */
+       if (is_prefetch(regs, address, error_code))
+               return;
+
+/*
+ * Oops. The kernel tried to access some bad page. We'll have to
+ * terminate things with extreme prejudice.
+ */
+
+       bust_spinlocks(1);
+
+       if (oops_may_print()) {
+               __typeof__(pte_val(__pte(0))) page;
+
+#ifdef CONFIG_X86_PAE
+               if (error_code & 16) {
+                       pte_t *pte = lookup_address(address);
+
+                       if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
+                               printk(KERN_CRIT "kernel tried to execute "
+                                       "NX-protected page - exploit attempt? "
+                                       "(uid: %d)\n", current->uid);
+               }
+#endif
+               if (address < PAGE_SIZE)
+                       printk(KERN_ALERT "BUG: unable to handle kernel NULL "
+                                       "pointer dereference");
+               else
+                       printk(KERN_ALERT "BUG: unable to handle kernel paging"
+                                       " request");
+               printk(" at virtual address %08lx\n",address);
+               printk(KERN_ALERT " printing eip:\n");
+               printk("%08lx\n", regs->eip);
+
+               page = read_cr3();
+               page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
+#ifdef CONFIG_X86_PAE
+               printk(KERN_ALERT "*pdpt = %016Lx\n", page);
+               if ((page >> PAGE_SHIFT) < max_low_pfn
+                   && page & _PAGE_PRESENT) {
+                       page &= PAGE_MASK;
+                       page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
+                                                                & (PTRS_PER_PMD - 1)];
+                       printk(KERN_ALERT "*pde = %016Lx\n", page);
+                       page &= ~_PAGE_NX;
+               }
+#else
+               printk(KERN_ALERT "*pde = %08lx\n", page);
+#endif
+
+               /*
+                * We must not directly access the pte in the highpte
+                * case if the page table is located in highmem.
+                * And let's rather not kmap-atomic the pte, just in case
+                * it's allocated already.
+                */
+               if ((page >> PAGE_SHIFT) < max_low_pfn
+                   && (page & _PAGE_PRESENT)) {
+                       page &= PAGE_MASK;
+                       page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
+                                                                & (PTRS_PER_PTE - 1)];
+                       printk(KERN_ALERT "*pte = %0*Lx\n", sizeof(page)*2, (u64)page);
+               }
+       }
+
+       tsk->thread.cr2 = address;
+       tsk->thread.trap_no = 14;
+       tsk->thread.error_code = error_code;
+       die("Oops", regs, error_code);
+       bust_spinlocks(0);
+       do_exit(SIGKILL);
+
+/*
+ * We ran out of memory, or some other thing happened to us that made
+ * us unable to handle the page fault gracefully.
+ */
+out_of_memory:
+       up_read(&mm->mmap_sem);
+       if (is_init(tsk)) {
+               yield();
+               down_read(&mm->mmap_sem);
+               goto survive;
+       }
+       printk("VM: killing process %s\n", tsk->comm);
+       if (error_code & 4)
+               do_exit(SIGKILL);
+       goto no_context;
+
+do_sigbus:
+       up_read(&mm->mmap_sem);
+
+       /* Kernel mode? Handle exceptions or die */
+       if (!(error_code & 4))
+               goto no_context;
+
+       /* User space => ok to do another page fault */
+       if (is_prefetch(regs, address, error_code))
+               return;
+
+       tsk->thread.cr2 = address;
+       tsk->thread.error_code = error_code;
+       tsk->thread.trap_no = 14;
+       force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
+}
+
+void vmalloc_sync_all(void)
+{
+       /*
+        * Note that races in the updates of insync and start aren't
+        * problematic: insync can only get set bits added, and updates to
+        * start are only improving performance (without affecting correctness
+        * if undone).
+        */
+       static DECLARE_BITMAP(insync, PTRS_PER_PGD);
+       static unsigned long start = TASK_SIZE;
+       unsigned long address;
+
+       if (SHARED_KERNEL_PMD)
+               return;
+
+       BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
+       for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
+               if (!test_bit(pgd_index(address), insync)) {
+                       unsigned long flags;
+                       struct page *page;
+
+                       spin_lock_irqsave(&pgd_lock, flags);
+                       for (page = pgd_list; page; page =
+                                       (struct page *)page->index)
+                               if (!vmalloc_sync_one(page_address(page),
+                                                               address)) {
+                                       BUG_ON(page != pgd_list);
+                                       break;
+                               }
+                       spin_unlock_irqrestore(&pgd_lock, flags);
+                       if (!page)
+                               set_bit(pgd_index(address), insync);
+               }
+               if (address == start && test_bit(pgd_index(address), insync))
+                       start = address + PGDIR_SIZE;
+       }
+}
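
A minimal sketch, not part of this patch: the error_code bit layout described in the comment above do_page_fault(), pulled out into named tests. The helper is hypothetical; the fault handler itself open-codes these masks.

/* hypothetical decoder for the page fault error code bits */
static inline void example_decode_fault_error(unsigned long error_code,
					      int *prot, int *write,
					      int *user, int *rsvd,
					      int *fetch)
{
	*prot  = !!(error_code & 1);	/* 0: page not present, 1: protection fault */
	*write = !!(error_code & 2);	/* 0: read access,      1: write access */
	*user  = !!(error_code & 4);	/* 0: kernel mode,      1: user mode */
	*rsvd  = !!(error_code & 8);	/* reserved bit set in a page table entry */
	*fetch = !!(error_code & 16);	/* fault was an instruction fetch (NX) */
}
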
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
new file mode 100644 (file)
index 0000000..1c3bf95
--- /dev/null
@@ -0,0 +1,113 @@
+#include <linux/highmem.h>
+#include <linux/module.h>
+
+void *kmap(struct page *page)
+{
+       might_sleep();
+       if (!PageHighMem(page))
+               return page_address(page);
+       return kmap_high(page);
+}
+
+void kunmap(struct page *page)
+{
+       if (in_interrupt())
+               BUG();
+       if (!PageHighMem(page))
+               return;
+       kunmap_high(page);
+}
+
+/*
+ * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
+ * no global lock is needed and because the kmap code must perform a global TLB
+ * invalidation when the kmap pool wraps.
+ *
+ * However, when holding an atomic kmap it is not legal to sleep, so atomic
+ * kmaps are appropriate for short, tight code paths only.
+ */
+void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
+{
+       enum fixed_addresses idx;
+       unsigned long vaddr;
+
+       /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+       pagefault_disable();
+
+       if (!PageHighMem(page))
+               return page_address(page);
+
+       idx = type + KM_TYPE_NR*smp_processor_id();
+       vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+       BUG_ON(!pte_none(*(kmap_pte-idx)));
+       set_pte(kmap_pte-idx, mk_pte(page, prot));
+       arch_flush_lazy_mmu_mode();
+
+       return (void *)vaddr;
+}
+
+void *kmap_atomic(struct page *page, enum km_type type)
+{
+       return kmap_atomic_prot(page, type, kmap_prot);
+}
+
+void kunmap_atomic(void *kvaddr, enum km_type type)
+{
+       unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
+       enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
+
+       /*
+        * Force other mappings to Oops if they'll try to access this pte
+        * without first remapping it.  Keeping stale mappings around is a bad idea
+        * also, in case the page changes cacheability attributes or becomes
+        * a protected page in a hypervisor.
+        */
+       if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
+               kpte_clear_flush(kmap_pte-idx, vaddr);
+       else {
+#ifdef CONFIG_DEBUG_HIGHMEM
+               BUG_ON(vaddr < PAGE_OFFSET);
+               BUG_ON(vaddr >= (unsigned long)high_memory);
+#endif
+       }
+
+       arch_flush_lazy_mmu_mode();
+       pagefault_enable();
+}
+
+/* This is the same as kmap_atomic() but can map memory that doesn't
+ * have a struct page associated with it.
+ */
+void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
+{
+       enum fixed_addresses idx;
+       unsigned long vaddr;
+
+       pagefault_disable();
+
+       idx = type + KM_TYPE_NR*smp_processor_id();
+       vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+       set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
+       arch_flush_lazy_mmu_mode();
+
+       return (void*) vaddr;
+}
+
+struct page *kmap_atomic_to_page(void *ptr)
+{
+       unsigned long idx, vaddr = (unsigned long)ptr;
+       pte_t *pte;
+
+       if (vaddr < FIXADDR_START)
+               return virt_to_page(ptr);
+
+       idx = virt_to_fix(vaddr);
+       pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
+       return pte_page(*pte);
+}
+
+EXPORT_SYMBOL(kmap);
+EXPORT_SYMBOL(kunmap);
+EXPORT_SYMBOL(kmap_atomic);
+EXPORT_SYMBOL(kunmap_atomic);
+EXPORT_SYMBOL(kmap_atomic_to_page);
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
new file mode 100644 (file)
index 0000000..6c06d9c
--- /dev/null
@@ -0,0 +1,391 @@
+/*
+ * IA-32 Huge TLB Page Support for Kernel.
+ *
+ * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
+ */
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/sysctl.h>
+#include <asm/mman.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+
+static unsigned long page_table_shareable(struct vm_area_struct *svma,
+                               struct vm_area_struct *vma,
+                               unsigned long addr, pgoff_t idx)
+{
+       unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
+                               svma->vm_start;
+       unsigned long sbase = saddr & PUD_MASK;
+       unsigned long s_end = sbase + PUD_SIZE;
+
+       /*
+        * match the virtual addresses, permission and the alignment of the
+        * page table page.
+        */
+       if (pmd_index(addr) != pmd_index(saddr) ||
+           vma->vm_flags != svma->vm_flags ||
+           sbase < svma->vm_start || svma->vm_end < s_end)
+               return 0;
+
+       return saddr;
+}
+
+static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
+{
+       unsigned long base = addr & PUD_MASK;
+       unsigned long end = base + PUD_SIZE;
+
+       /*
+        * check on proper vm_flags and page table alignment
+        */
+       if (vma->vm_flags & VM_MAYSHARE &&
+           vma->vm_start <= base && end <= vma->vm_end)
+               return 1;
+       return 0;
+}
+
+/*
+ * search for a shareable pmd page for hugetlb.
+ */
+static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+       struct vm_area_struct *vma = find_vma(mm, addr);
+       struct address_space *mapping = vma->vm_file->f_mapping;
+       pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+                       vma->vm_pgoff;
+       struct prio_tree_iter iter;
+       struct vm_area_struct *svma;
+       unsigned long saddr;
+       pte_t *spte = NULL;
+
+       if (!vma_shareable(vma, addr))
+               return;
+
+       spin_lock(&mapping->i_mmap_lock);
+       vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
+               if (svma == vma)
+                       continue;
+
+               saddr = page_table_shareable(svma, vma, addr, idx);
+               if (saddr) {
+                       spte = huge_pte_offset(svma->vm_mm, saddr);
+                       if (spte) {
+                               get_page(virt_to_page(spte));
+                               break;
+                       }
+               }
+       }
+
+       if (!spte)
+               goto out;
+
+       spin_lock(&mm->page_table_lock);
+       if (pud_none(*pud))
+               pud_populate(mm, pud, (unsigned long) spte & PAGE_MASK);
+       else
+               put_page(virt_to_page(spte));
+       spin_unlock(&mm->page_table_lock);
+out:
+       spin_unlock(&mapping->i_mmap_lock);
+}
+
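Concretely (this matters mostly on x86_64, which also builds this file): two VMAs of the same hugetlbfs file can share a pmd page only when each fully covers the same PUD_SIZE-aligned region (1 GB on x86_64) with identical vm_flags; page_table_shareable() and vma_shareable() encode exactly those checks, and the extra get_page() keeps the shared pmd page alive until huge_pmd_unshare() below drops it.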
+/*
+ * unmap huge page backed by shared pte.
+ *
+ * The hugetlb pte page is ref counted at the time of mapping.  If the pte is
+ * shared (indicated by page_count > 1), unmap is achieved by clearing the pud
+ * and decrementing the ref count.  If count == 1, the pte page is not shared.
+ *
+ * called with vma->vm_mm->page_table_lock held.
+ *
+ * returns: 1 successfully unmapped a shared pte page
+ *         0 the underlying pte page is not shared, or it is the last user
+ */
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+       pgd_t *pgd = pgd_offset(mm, *addr);
+       pud_t *pud = pud_offset(pgd, *addr);
+
+       BUG_ON(page_count(virt_to_page(ptep)) == 0);
+       if (page_count(virt_to_page(ptep)) == 1)
+               return 0;
+
+       pud_clear(pud);
+       put_page(virt_to_page(ptep));
+       *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+       return 1;
+}
+
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pte_t *pte = NULL;
+
+       pgd = pgd_offset(mm, addr);
+       pud = pud_alloc(mm, pgd, addr);
+       if (pud) {
+               if (pud_none(*pud))
+                       huge_pmd_share(mm, addr, pud);
+               pte = (pte_t *) pmd_alloc(mm, pud, addr);
+       }
+       BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
+
+       return pte;
+}
+
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd = NULL;
+
+       pgd = pgd_offset(mm, addr);
+       if (pgd_present(*pgd)) {
+               pud = pud_offset(pgd, addr);
+               if (pud_present(*pud))
+                       pmd = pmd_offset(pud, addr);
+       }
+       return (pte_t *) pmd;
+}
+
+#if 0  /* This is just for testing */
+struct page *
+follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
+{
+       unsigned long vpfn = address / PAGE_SIZE;
+       struct page *page;
+       struct vm_area_struct *vma;
+       pte_t *pte;
+
+       vma = find_vma(mm, address);
+       if (!vma || !is_vm_hugetlb_page(vma))
+               return ERR_PTR(-EINVAL);
+
+       pte = huge_pte_offset(mm, address);
+
+       /* hugetlb should be locked, and hence, prefaulted */
+       WARN_ON(!pte || pte_none(*pte));
+
+       page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+
+       WARN_ON(!PageCompound(page));
+
+       return page;
+}
+
+int pmd_huge(pmd_t pmd)
+{
+       return 0;
+}
+
+struct page *
+follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+               pmd_t *pmd, int write)
+{
+       return NULL;
+}
+
+#else
+
+struct page *
+follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
+{
+       return ERR_PTR(-EINVAL);
+}
+
+int pmd_huge(pmd_t pmd)
+{
+       return !!(pmd_val(pmd) & _PAGE_PSE);
+}
+
+struct page *
+follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+               pmd_t *pmd, int write)
+{
+       struct page *page;
+
+       page = pte_page(*(pte_t *)pmd);
+       if (page)
+               page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT);
+       return page;
+}
+#endif
+
+/* x86_64 also uses this file */
+
+#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
+static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
+               unsigned long addr, unsigned long len,
+               unsigned long pgoff, unsigned long flags)
+{
+       struct mm_struct *mm = current->mm;
+       struct vm_area_struct *vma;
+       unsigned long start_addr;
+
+       if (len > mm->cached_hole_size) {
+               start_addr = mm->free_area_cache;
+       } else {
+               start_addr = TASK_UNMAPPED_BASE;
+               mm->cached_hole_size = 0;
+       }
+
+full_search:
+       addr = ALIGN(start_addr, HPAGE_SIZE);
+
+       for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+               /* At this point:  (!vma || addr < vma->vm_end). */
+               if (TASK_SIZE - len < addr) {
+                       /*
+                        * Start a new search - just in case we missed
+                        * some holes.
+                        */
+                       if (start_addr != TASK_UNMAPPED_BASE) {
+                               start_addr = TASK_UNMAPPED_BASE;
+                               mm->cached_hole_size = 0;
+                               goto full_search;
+                       }
+                       return -ENOMEM;
+               }
+               if (!vma || addr + len <= vma->vm_start) {
+                       mm->free_area_cache = addr + len;
+                       return addr;
+               }
+               if (addr + mm->cached_hole_size < vma->vm_start)
+                       mm->cached_hole_size = vma->vm_start - addr;
+               addr = ALIGN(vma->vm_end, HPAGE_SIZE);
+       }
+}
+
+static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
+               unsigned long addr0, unsigned long len,
+               unsigned long pgoff, unsigned long flags)
+{
+       struct mm_struct *mm = current->mm;
+       struct vm_area_struct *vma, *prev_vma;
+       unsigned long base = mm->mmap_base, addr = addr0;
+       unsigned long largest_hole = mm->cached_hole_size;
+       int first_time = 1;
+
+       /* don't allow allocations above current base */
+       if (mm->free_area_cache > base)
+               mm->free_area_cache = base;
+
+       if (len <= largest_hole) {
+               largest_hole = 0;
+               mm->free_area_cache  = base;
+       }
+try_again:
+       /* make sure it can fit in the remaining address space */
+       if (mm->free_area_cache < len)
+               goto fail;
+
+       /* either no address requested or can't fit in requested address hole */
+       addr = (mm->free_area_cache - len) & HPAGE_MASK;
+       do {
+               /*
+                * Lookup failure means no vma is above this address,
+                * i.e. return with success:
+                */
+               if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
+                       return addr;
+
+               /*
+                * new region fits between prev_vma->vm_end and
+                * vma->vm_start, use it:
+                */
+               if (addr + len <= vma->vm_start &&
+                           (!prev_vma || (addr >= prev_vma->vm_end))) {
+                       /* remember the address as a hint for next time */
+                       mm->cached_hole_size = largest_hole;
+                       return (mm->free_area_cache = addr);
+               } else {
+                       /* pull free_area_cache down to the first hole */
+                       if (mm->free_area_cache == vma->vm_end) {
+                               mm->free_area_cache = vma->vm_start;
+                               mm->cached_hole_size = largest_hole;
+                       }
+               }
+
+               /* remember the largest hole we saw so far */
+               if (addr + largest_hole < vma->vm_start)
+                       largest_hole = vma->vm_start - addr;
+
+               /* try just below the current vma->vm_start */
+               addr = (vma->vm_start - len) & HPAGE_MASK;
+       } while (len <= vma->vm_start);
+
+fail:
+       /*
+        * if hint left us with no space for the requested
+        * mapping then try again:
+        */
+       if (first_time) {
+               mm->free_area_cache = base;
+               largest_hole = 0;
+               first_time = 0;
+               goto try_again;
+       }
+       /*
+        * A failed mmap() very likely causes application failure,
+        * so fall back to the bottom-up function here. This scenario
+        * can happen with large stack limits and large mmap()
+        * allocations.
+        */
+       mm->free_area_cache = TASK_UNMAPPED_BASE;
+       mm->cached_hole_size = ~0UL;
+       addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
+                       len, pgoff, flags);
+
+       /*
+        * Restore the topdown base:
+        */
+       mm->free_area_cache = base;
+       mm->cached_hole_size = ~0UL;
+
+       return addr;
+}
+
+unsigned long
+hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+               unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+       struct mm_struct *mm = current->mm;
+       struct vm_area_struct *vma;
+
+       if (len & ~HPAGE_MASK)
+               return -EINVAL;
+       if (len > TASK_SIZE)
+               return -ENOMEM;
+
+       if (flags & MAP_FIXED) {
+               if (prepare_hugepage_range(addr, len))
+                       return -EINVAL;
+               return addr;
+       }
+
+       if (addr) {
+               addr = ALIGN(addr, HPAGE_SIZE);
+               vma = find_vma(mm, addr);
+               if (TASK_SIZE - len >= addr &&
+                   (!vma || addr + len <= vma->vm_start))
+                       return addr;
+       }
+       if (mm->get_unmapped_area == arch_get_unmapped_area)
+               return hugetlb_get_unmapped_area_bottomup(file, addr, len,
+                               pgoff, flags);
+       else
+               return hugetlb_get_unmapped_area_topdown(file, addr, len,
+                               pgoff, flags);
+}
+
+#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
+
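For context, a hedged user-space sketch of a mapping that ends up in hugetlb_get_unmapped_area() above; the hugetlbfs mount point /dev/hugepages and the 4 MB length are illustrative assumptions:

#include <fcntl.h>
#include <stddef.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 4UL << 20;		/* one 4 MB huge page on non-PAE i386 */
	void *p;
	int fd;

	fd = open("/dev/hugepages/example", O_CREAT | O_RDWR, 0600);
	if (fd < 0)
		return 1;
	/* The kernel picks an HPAGE_SIZE-aligned address, bottom-up or top-down. */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	munmap(p, len);
	close(fd);
	return 0;
}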
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
new file mode 100644 (file)
index 0000000..730a5b1
--- /dev/null
@@ -0,0 +1,858 @@
+/*
+ *  linux/arch/i386/mm/init.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *
+ *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ */
+
+#include <linux/module.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/pfn.h>
+#include <linux/poison.h>
+#include <linux/bootmem.h>
+#include <linux/slab.h>
+#include <linux/proc_fs.h>
+#include <linux/efi.h>
+#include <linux/memory_hotplug.h>
+#include <linux/initrd.h>
+#include <linux/cpumask.h>
+
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/dma.h>
+#include <asm/fixmap.h>
+#include <asm/e820.h>
+#include <asm/apic.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+#include <asm/paravirt.h>
+
+unsigned int __VMALLOC_RESERVE = 128 << 20;
+
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+unsigned long highstart_pfn, highend_pfn;
+
+static int noinline do_test_wp_bit(void);
+
+/*
+ * Creates a middle page table and puts a pointer to it in the
+ * given global directory entry.  In non-PAE compilation mode this simply
+ * returns the existing (folded) pgd entry, since there is no separate middle layer.
+ */
+static pmd_t * __init one_md_table_init(pgd_t *pgd)
+{
+       pud_t *pud;
+       pmd_t *pmd_table;
+               
+#ifdef CONFIG_X86_PAE
+       if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
+               pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+
+               paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
+               set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
+               pud = pud_offset(pgd, 0);
+               if (pmd_table != pmd_offset(pud, 0))
+                       BUG();
+       }
+#endif
+       pud = pud_offset(pgd, 0);
+       pmd_table = pmd_offset(pud, 0);
+       return pmd_table;
+}
+
+/*
+ * Create a page table and place a pointer to it in a middle page
+ * directory entry.
+ */
+static pte_t * __init one_page_table_init(pmd_t *pmd)
+{
+       if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
+               pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+
+               paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
+               set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
+               BUG_ON(page_table != pte_offset_kernel(pmd, 0));
+       }
+       
+       return pte_offset_kernel(pmd, 0);
+}
+
+/*
+ * This function initializes a certain range of kernel virtual memory 
+ * with new bootmem page tables, everywhere page tables are missing in
+ * the given range.
+ */
+
+/*
+ * NOTE: The pagetables are allocated contiguously in physical memory,
+ * so we can cache the location of the first one and move around without
+ * checking the pgd every time.
+ */
+static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
+{
+       pgd_t *pgd;
+       pmd_t *pmd;
+       int pgd_idx, pmd_idx;
+       unsigned long vaddr;
+
+       vaddr = start;
+       pgd_idx = pgd_index(vaddr);
+       pmd_idx = pmd_index(vaddr);
+       pgd = pgd_base + pgd_idx;
+
+       for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
+               pmd = one_md_table_init(pgd);
+               pmd = pmd + pmd_index(vaddr);
+               for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
+                       one_page_table_init(pmd);
+
+                       vaddr += PMD_SIZE;
+               }
+               pmd_idx = 0;
+       }
+}
+
+static inline int is_kernel_text(unsigned long addr)
+{
+       if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
+               return 1;
+       return 0;
+}
+
+/*
+ * This maps the physical memory to kernel virtual address space, a total 
+ * of max_low_pfn pages, by creating page tables starting from address 
+ * PAGE_OFFSET.
+ */
+static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
+{
+       unsigned long pfn;
+       pgd_t *pgd;
+       pmd_t *pmd;
+       pte_t *pte;
+       int pgd_idx, pmd_idx, pte_ofs;
+
+       pgd_idx = pgd_index(PAGE_OFFSET);
+       pgd = pgd_base + pgd_idx;
+       pfn = 0;
+
+       for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
+               pmd = one_md_table_init(pgd);
+               if (pfn >= max_low_pfn)
+                       continue;
+               for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
+                       unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
+
+                       /* Map with big pages if possible, otherwise create normal page tables. */
+                       if (cpu_has_pse) {
+                               unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
+                               if (is_kernel_text(address) || is_kernel_text(address2))
+                                       set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
+                               else
+                                       set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
+
+                               pfn += PTRS_PER_PTE;
+                       } else {
+                               pte = one_page_table_init(pmd);
+
+                               for (pte_ofs = 0;
+                                    pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
+                                    pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
+                                       if (is_kernel_text(address))
+                                               set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
+                                       else
+                                               set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
+                               }
+                       }
+               }
+       }
+}
+
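For orientation, a worked example (the figures assume the default i386 split, which this patch does not change): with cpu_has_pse set and a non-PAE build, one pmd entry maps PTRS_PER_PTE * PAGE_SIZE = 1024 * 4 KB = 4 MB, so the usual 896 MB of lowmem is covered by 224 large-page pmd entries; under PAE a large page is 512 * 4 KB = 2 MB and the same lowmem needs 448 entries. Without PSE the loop instead fills individual 4 KB ptes.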
+static inline int page_kills_ppro(unsigned long pagenr)
+{
+       if (pagenr >= 0x70000 && pagenr <= 0x7003F)
+               return 1;
+       return 0;
+}
+
+int page_is_ram(unsigned long pagenr)
+{
+       int i;
+       unsigned long addr, end;
+
+       if (efi_enabled) {
+               efi_memory_desc_t *md;
+               void *p;
+
+               for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+                       md = p;
+                       if (!is_available_memory(md))
+                               continue;
+                       addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
+                       end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
+
+                       if ((pagenr >= addr) && (pagenr < end))
+                               return 1;
+               }
+               return 0;
+       }
+
+       for (i = 0; i < e820.nr_map; i++) {
+
+               if (e820.map[i].type != E820_RAM)       /* not usable memory */
+                       continue;
+               /*
+                *      !!!FIXME!!! Some BIOSen report areas as RAM that
+                *      are not. Notably the 640->1Mb area. We need a sanity
+                *      check here.
+                */
+               addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
+               end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
+               if  ((pagenr >= addr) && (pagenr < end))
+                       return 1;
+       }
+       return 0;
+}
+
+#ifdef CONFIG_HIGHMEM
+pte_t *kmap_pte;
+pgprot_t kmap_prot;
+
+#define kmap_get_fixmap_pte(vaddr)                                     \
+       pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
+
+static void __init kmap_init(void)
+{
+       unsigned long kmap_vstart;
+
+       /* cache the first kmap pte */
+       kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
+       kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
+
+       kmap_prot = PAGE_KERNEL;
+}
+
+static void __init permanent_kmaps_init(pgd_t *pgd_base)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+       unsigned long vaddr;
+
+       vaddr = PKMAP_BASE;
+       page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
+
+       pgd = swapper_pg_dir + pgd_index(vaddr);
+       pud = pud_offset(pgd, vaddr);
+       pmd = pmd_offset(pud, vaddr);
+       pte = pte_offset_kernel(pmd, vaddr);
+       pkmap_page_table = pte; 
+}
+
+static void __meminit free_new_highpage(struct page *page)
+{
+       init_page_count(page);
+       __free_page(page);
+       totalhigh_pages++;
+}
+
+void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
+{
+       if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
+               ClearPageReserved(page);
+               free_new_highpage(page);
+       } else
+               SetPageReserved(page);
+}
+
+static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn)
+{
+       free_new_highpage(page);
+       totalram_pages++;
+#ifdef CONFIG_FLATMEM
+       max_mapnr = max(pfn, max_mapnr);
+#endif
+       num_physpages++;
+       return 0;
+}
+
+/*
+ * Not currently handling the NUMA case.
+ * Assume a single node: all memory that has been
+ * added dynamically and is onlined here is in
+ * HIGHMEM.
+ */
+void __meminit online_page(struct page *page)
+{
+       ClearPageReserved(page);
+       add_one_highpage_hotplug(page, page_to_pfn(page));
+}
+
+
+#ifdef CONFIG_NUMA
+extern void set_highmem_pages_init(int);
+#else
+static void __init set_highmem_pages_init(int bad_ppro)
+{
+       int pfn;
+       for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
+               add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
+       totalram_pages += totalhigh_pages;
+}
+#endif /* CONFIG_NUMA */
+
+#else
+#define kmap_init() do { } while (0)
+#define permanent_kmaps_init(pgd_base) do { } while (0)
+#define set_highmem_pages_init(bad_ppro) do { } while (0)
+#endif /* CONFIG_HIGHMEM */
+
+unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
+EXPORT_SYMBOL(__PAGE_KERNEL);
+unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
+
+#ifdef CONFIG_NUMA
+extern void __init remap_numa_kva(void);
+#else
+#define remap_numa_kva() do {} while (0)
+#endif
+
+void __init native_pagetable_setup_start(pgd_t *base)
+{
+#ifdef CONFIG_X86_PAE
+       int i;
+
+       /*
+        * Init entries of the first-level page table to the
+        * zero page, if they haven't already been set up.
+        *
+        * In a normal native boot, we'll be running on a
+        * pagetable rooted in swapper_pg_dir, but not in PAE
+        * mode, so this will end up clobbering the mappings
+        * for the lower 24Mbytes of the address space,
+        * without affecting the kernel address space.
+        */
+       for (i = 0; i < USER_PTRS_PER_PGD; i++)
+               set_pgd(&base[i],
+                       __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
+
+       /* Make sure kernel address space is empty so that a pagetable
+          will be allocated for it. */
+       memset(&base[USER_PTRS_PER_PGD], 0,
+              KERNEL_PGD_PTRS * sizeof(pgd_t));
+#else
+       paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT);
+#endif
+}
+
+void __init native_pagetable_setup_done(pgd_t *base)
+{
+#ifdef CONFIG_X86_PAE
+       /*
+        * Add low memory identity-mappings - SMP needs it when
+        * starting up on an AP from real-mode. In the non-PAE
+        * case we already have these mappings through head.S.
+        * All user-space mappings are explicitly cleared after
+        * SMP startup.
+        */
+       set_pgd(&base[0], base[USER_PTRS_PER_PGD]);
+#endif
+}
+
+/*
+ * Build a proper pagetable for the kernel mappings.  Up until this
+ * point, we've been running on some set of pagetables constructed by
+ * the boot process.
+ *
+ * If we're booting on native hardware, this will be a pagetable
+ * constructed in arch/i386/kernel/head.S, and not running in PAE mode
+ * (even if we'll end up running in PAE).  The root of the pagetable
+ * will be swapper_pg_dir.
+ *
+ * If we're booting paravirtualized under a hypervisor, then there are
+ * more options: we may already be running PAE, and the pagetable may
+ * or may not be based in swapper_pg_dir.  In any case,
+ * paravirt_pagetable_setup_start() will set up swapper_pg_dir
+ * appropriately for the rest of the initialization to work.
+ *
+ * In general, pagetable_init() assumes that the pagetable may already
+ * be partially populated, and so it avoids stomping on any existing
+ * mappings.
+ */
+static void __init pagetable_init (void)
+{
+       unsigned long vaddr, end;
+       pgd_t *pgd_base = swapper_pg_dir;
+
+       paravirt_pagetable_setup_start(pgd_base);
+
+       /* Enable PSE if available */
+       if (cpu_has_pse)
+               set_in_cr4(X86_CR4_PSE);
+
+       /* Enable PGE if available */
+       if (cpu_has_pge) {
+               set_in_cr4(X86_CR4_PGE);
+               __PAGE_KERNEL |= _PAGE_GLOBAL;
+               __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
+       }
+
+       kernel_physical_mapping_init(pgd_base);
+       remap_numa_kva();
+
+       /*
+        * Fixed mappings, only the page table structure has to be
+        * created - mappings will be set by set_fixmap():
+        */
+       vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
+       end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
+       page_table_range_init(vaddr, end, pgd_base);
+
+       permanent_kmaps_init(pgd_base);
+
+       paravirt_pagetable_setup_done(pgd_base);
+}
+
+#if defined(CONFIG_HIBERNATION) || defined(CONFIG_ACPI)
+/*
+ * Swap suspend & friends need this for resume because things like the intel-agp
+ * driver might have split up a kernel 4MB mapping.
+ */
+char __nosavedata swsusp_pg_dir[PAGE_SIZE]
+       __attribute__ ((aligned (PAGE_SIZE)));
+
+static inline void save_pg_dir(void)
+{
+       memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
+}
+#else
+static inline void save_pg_dir(void)
+{
+}
+#endif
+
+void zap_low_mappings (void)
+{
+       int i;
+
+       save_pg_dir();
+
+       /*
+        * Zap initial low-memory mappings.
+        *
+        * Note that "pgd_clear()" doesn't do it for
+        * us, because pgd_clear() is a no-op on i386.
+        */
+       for (i = 0; i < USER_PTRS_PER_PGD; i++)
+#ifdef CONFIG_X86_PAE
+               set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
+#else
+               set_pgd(swapper_pg_dir+i, __pgd(0));
+#endif
+       flush_tlb_all();
+}
+
+int nx_enabled = 0;
+
+#ifdef CONFIG_X86_PAE
+
+static int disable_nx __initdata = 0;
+u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+
+/*
+ * noexec = on|off
+ *
+ * Control non executable mappings.
+ *
+ * on      Enable
+ * off     Disable
+ */
+static int __init noexec_setup(char *str)
+{
+       if (!str || !strcmp(str, "on")) {
+               if (cpu_has_nx) {
+                       __supported_pte_mask |= _PAGE_NX;
+                       disable_nx = 0;
+               }
+       } else if (!strcmp(str,"off")) {
+               disable_nx = 1;
+               __supported_pte_mask &= ~_PAGE_NX;
+       } else
+               return -EINVAL;
+
+       return 0;
+}
+early_param("noexec", noexec_setup);
+
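For example, booting with noexec=off on the kernel command line forces NX off even on capable CPUs, while the default (or an explicit noexec=on) lets set_nx() below enable it whenever the CPUID and EFER checks succeed.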
+static void __init set_nx(void)
+{
+       unsigned int v[4], l, h;
+
+       if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
+               cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
+               if ((v[3] & (1 << 20)) && !disable_nx) {
+                       rdmsr(MSR_EFER, l, h);
+                       l |= EFER_NX;
+                       wrmsr(MSR_EFER, l, h);
+                       nx_enabled = 1;
+                       __supported_pte_mask |= _PAGE_NX;
+               }
+       }
+}
+
+/*
+ * Enables/disables executability of a given kernel page and
+ * returns the previous setting.
+ */
+int __init set_kernel_exec(unsigned long vaddr, int enable)
+{
+       pte_t *pte;
+       int ret = 1;
+
+       if (!nx_enabled)
+               goto out;
+
+       pte = lookup_address(vaddr);
+       BUG_ON(!pte);
+
+       if (!pte_exec_kernel(*pte))
+               ret = 0;
+
+       if (enable)
+               pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
+       else
+               pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
+       pte_update_defer(&init_mm, vaddr, pte);
+       __flush_tlb_all();
+out:
+       return ret;
+}
+
+#endif
+
+/*
+ * paging_init() sets up the page tables - note that the first 8MB are
+ * already mapped by head.S.
+ *
+ * This routine also unmaps the page at virtual kernel address 0, so
+ * that we can trap those pesky NULL-reference errors in the kernel.
+ */
+void __init paging_init(void)
+{
+#ifdef CONFIG_X86_PAE
+       set_nx();
+       if (nx_enabled)
+               printk("NX (Execute Disable) protection: active\n");
+#endif
+
+       pagetable_init();
+
+       load_cr3(swapper_pg_dir);
+
+#ifdef CONFIG_X86_PAE
+       /*
+        * We will bail out later - printk doesn't work right now so
+        * the user would just see a hanging kernel.
+        */
+       if (cpu_has_pae)
+               set_in_cr4(X86_CR4_PAE);
+#endif
+       __flush_tlb_all();
+
+       kmap_init();
+}
+
+/*
+ * Test if the WP bit works in supervisor mode. It isn't supported on 386's
+ * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
+ * used to involve black magic jumps to work around some nasty CPU bugs,
+ * but fortunately the switch to using exceptions got rid of all that.
+ */
+
+static void __init test_wp_bit(void)
+{
+       printk("Checking if this processor honours the WP bit even in supervisor mode... ");
+
+       /* Any page-aligned address will do, the test is non-destructive */
+       __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
+       boot_cpu_data.wp_works_ok = do_test_wp_bit();
+       clear_fixmap(FIX_WP_TEST);
+
+       if (!boot_cpu_data.wp_works_ok) {
+               printk("No.\n");
+#ifdef CONFIG_X86_WP_WORKS_OK
+               panic("This kernel doesn't support CPUs with broken WP. Recompile it for a 386!");
+#endif
+       } else {
+               printk("Ok.\n");
+       }
+}
+
+static struct kcore_list kcore_mem, kcore_vmalloc; 
+
+void __init mem_init(void)
+{
+       extern int ppro_with_ram_bug(void);
+       int codesize, reservedpages, datasize, initsize;
+       int tmp;
+       int bad_ppro;
+
+#ifdef CONFIG_FLATMEM
+       BUG_ON(!mem_map);
+#endif
+       
+       bad_ppro = ppro_with_ram_bug();
+
+#ifdef CONFIG_HIGHMEM
+       /* check that fixmap and pkmap do not overlap */
+       if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
+               printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
+               printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
+                               PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
+               BUG();
+       }
+#endif
+       /* this will put all low memory onto the freelists */
+       totalram_pages += free_all_bootmem();
+
+       reservedpages = 0;
+       for (tmp = 0; tmp < max_low_pfn; tmp++)
+               /*
+                * Only count reserved RAM pages
+                */
+               if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
+                       reservedpages++;
+
+       set_highmem_pages_init(bad_ppro);
+
+       codesize =  (unsigned long) &_etext - (unsigned long) &_text;
+       datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
+       initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
+
+       kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
+       kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
+                  VMALLOC_END-VMALLOC_START);
+
+       printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
+               (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
+               num_physpages << (PAGE_SHIFT-10),
+               codesize >> 10,
+               reservedpages << (PAGE_SHIFT-10),
+               datasize >> 10,
+               initsize >> 10,
+               (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
+              );
+
+#if 1 /* double-sanity-check paranoia */
+       printk("virtual kernel memory layout:\n"
+              "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+#ifdef CONFIG_HIGHMEM
+              "    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+#endif
+              "    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
+              "    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
+              "      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+              "      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+              "      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
+              FIXADDR_START, FIXADDR_TOP,
+              (FIXADDR_TOP - FIXADDR_START) >> 10,
+
+#ifdef CONFIG_HIGHMEM
+              PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
+              (LAST_PKMAP*PAGE_SIZE) >> 10,
+#endif
+
+              VMALLOC_START, VMALLOC_END,
+              (VMALLOC_END - VMALLOC_START) >> 20,
+
+              (unsigned long)__va(0), (unsigned long)high_memory,
+              ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
+
+              (unsigned long)&__init_begin, (unsigned long)&__init_end,
+              ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
+
+              (unsigned long)&_etext, (unsigned long)&_edata,
+              ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
+
+              (unsigned long)&_text, (unsigned long)&_etext,
+              ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
+
+#ifdef CONFIG_HIGHMEM
+       BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
+       BUG_ON(VMALLOC_END                     > PKMAP_BASE);
+#endif
+       BUG_ON(VMALLOC_START                   > VMALLOC_END);
+       BUG_ON((unsigned long)high_memory      > VMALLOC_START);
+#endif /* double-sanity-check paranoia */
+
+#ifdef CONFIG_X86_PAE
+       if (!cpu_has_pae)
+               panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
+#endif
+       if (boot_cpu_data.wp_works_ok < 0)
+               test_wp_bit();
+
+       /*
+        * Subtle. SMP is doing its boot stuff late (because it has to
+        * fork idle threads) - but it also needs low mappings for the
+        * protected-mode entry to work. We zap these entries only after
+        * the WP-bit has been tested.
+        */
+#ifndef CONFIG_SMP
+       zap_low_mappings();
+#endif
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int arch_add_memory(int nid, u64 start, u64 size)
+{
+       struct pglist_data *pgdata = NODE_DATA(nid);
+       struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
+       unsigned long start_pfn = start >> PAGE_SHIFT;
+       unsigned long nr_pages = size >> PAGE_SHIFT;
+
+       return __add_pages(zone, start_pfn, nr_pages);
+}
+
+int remove_memory(u64 start, u64 size)
+{
+       return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(remove_memory);
+#endif
+
+struct kmem_cache *pmd_cache;
+
+void __init pgtable_cache_init(void)
+{
+       size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t);
+
+       if (PTRS_PER_PMD > 1) {
+               pmd_cache = kmem_cache_create("pmd",
+                                       PTRS_PER_PMD*sizeof(pmd_t),
+                                       PTRS_PER_PMD*sizeof(pmd_t),
+                                       SLAB_PANIC,
+                                       pmd_ctor);
+               if (!SHARED_KERNEL_PMD) {
+                       /* If we're in PAE mode and have a non-shared
+                          kernel pmd, then the pgd size must be a
+                          page size.  This is because the pgd_list
+                          links through the page structure, so there
+                          can only be one pgd per page for this to
+                          work. */
+                       pgd_size = PAGE_SIZE;
+               }
+       }
+}
+
+/*
+ * This function cannot be __init, since exceptions don't work in that
+ * section.  Put this after the callers, so that it cannot be inlined.
+ */
+static int noinline do_test_wp_bit(void)
+{
+       char tmp_reg;
+       int flag;
+
+       __asm__ __volatile__(
+               "       movb %0,%1      \n"
+               "1:     movb %1,%0      \n"
+               "       xorl %2,%2      \n"
+               "2:                     \n"
+               ".section __ex_table,\"a\"\n"
+               "       .align 4        \n"
+               "       .long 1b,2b     \n"
+               ".previous              \n"
+               :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
+                "=q" (tmp_reg),
+                "=r" (flag)
+               :"2" (1)
+               :"memory");
+       
+       return flag;
+}
+
+#ifdef CONFIG_DEBUG_RODATA
+
+void mark_rodata_ro(void)
+{
+       unsigned long start = PFN_ALIGN(_text);
+       unsigned long size = PFN_ALIGN(_etext) - start;
+
+#ifndef CONFIG_KPROBES
+#ifdef CONFIG_HOTPLUG_CPU
+       /* It must still be possible to apply SMP alternatives. */
+       if (num_possible_cpus() <= 1)
+#endif
+       {
+               change_page_attr(virt_to_page(start),
+                                size >> PAGE_SHIFT, PAGE_KERNEL_RX);
+               printk("Write protecting the kernel text: %luk\n", size >> 10);
+       }
+#endif
+       start += size;
+       size = (unsigned long)__end_rodata - start;
+       change_page_attr(virt_to_page(start),
+                        size >> PAGE_SHIFT, PAGE_KERNEL_RO);
+       printk("Write protecting the kernel read-only data: %luk\n",
+              size >> 10);
+
+       /*
+        * change_page_attr() requires a global_flush_tlb() call after it.
+        * We do this after the printk so that if something went wrong in the
+        * change, the printk gets out at least to give a better debug hint
+        * of who is the culprit.
+        */
+       global_flush_tlb();
+}
+#endif
+
+void free_init_pages(char *what, unsigned long begin, unsigned long end)
+{
+       unsigned long addr;
+
+       for (addr = begin; addr < end; addr += PAGE_SIZE) {
+               ClearPageReserved(virt_to_page(addr));
+               init_page_count(virt_to_page(addr));
+               memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
+               free_page(addr);
+               totalram_pages++;
+       }
+       printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
+}
+
+void free_initmem(void)
+{
+       free_init_pages("unused kernel memory",
+                       (unsigned long)(&__init_begin),
+                       (unsigned long)(&__init_end));
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void free_initrd_mem(unsigned long start, unsigned long end)
+{
+       free_init_pages("initrd memory", start, end);
+}
+#endif
+
diff --git a/arch/x86/mm/ioremap_32.c b/arch/x86/mm/ioremap_32.c
new file mode 100644 (file)
index 0000000..0b27831
--- /dev/null
@@ -0,0 +1,274 @@
+/*
+ * arch/i386/mm/ioremap.c
+ *
+ * Re-map IO memory to kernel address space so that we can access it.
+ * This is needed for high PCI addresses that aren't mapped in the
+ * 640k-1MB IO memory area on PC's
+ *
+ * (C) Copyright 1995 1996 Linus Torvalds
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/io.h>
+#include <asm/fixmap.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/pgtable.h>
+
+#define ISA_START_ADDRESS      0xa0000
+#define ISA_END_ADDRESS                0x100000
+
+/*
+ * Generic mapping function (not visible outside):
+ */
+
+/*
+ * Remap an arbitrary physical address space into the kernel virtual
+ * address space. Needed when the kernel wants to access high addresses
+ * directly.
+ *
+ * NOTE! We need to allow non-page-aligned mappings too: we will obviously
+ * have to convert them into an offset in a page-aligned mapping, but the
+ * caller shouldn't need to know that small detail.
+ */
+void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
+{
+       void __iomem * addr;
+       struct vm_struct * area;
+       unsigned long offset, last_addr;
+       pgprot_t prot;
+
+       /* Don't allow wraparound or zero size */
+       last_addr = phys_addr + size - 1;
+       if (!size || last_addr < phys_addr)
+               return NULL;
+
+       /*
+        * Don't remap the low PCI/ISA area, it's always mapped..
+        */
+       if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
+               return (void __iomem *) phys_to_virt(phys_addr);
+
+       /*
+        * Don't allow anybody to remap normal RAM that we're using..
+        */
+       if (phys_addr <= virt_to_phys(high_memory - 1)) {
+               char *t_addr, *t_end;
+               struct page *page;
+
+               t_addr = __va(phys_addr);
+               t_end = t_addr + (size - 1);
+          
+               for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
+                       if(!PageReserved(page))
+                               return NULL;
+       }
+
+       prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY
+                       | _PAGE_ACCESSED | flags);
+
+       /*
+        * Mappings have to be page-aligned
+        */
+       offset = phys_addr & ~PAGE_MASK;
+       phys_addr &= PAGE_MASK;
+       size = PAGE_ALIGN(last_addr+1) - phys_addr;
+
+       /*
+        * Ok, go for it..
+        */
+       area = get_vm_area(size, VM_IOREMAP | (flags << 20));
+       if (!area)
+               return NULL;
+       area->phys_addr = phys_addr;
+       addr = (void __iomem *) area->addr;
+       if (ioremap_page_range((unsigned long) addr,
+                       (unsigned long) addr + size, phys_addr, prot)) {
+               vunmap((void __force *) addr);
+               return NULL;
+       }
+       return (void __iomem *) (offset + (char __iomem *)addr);
+}
+EXPORT_SYMBOL(__ioremap);
+
+/**
+ * ioremap_nocache     -   map bus memory into CPU space
+ * @phys_addr: bus address of the memory
+ * @size:      size of the resource to map
+ *
+ * ioremap_nocache performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address. 
+ *
+ * This version of ioremap ensures that the memory is marked uncachable
+ * on the CPU as well as honouring existing caching rules from things like
+ * the PCI bus. Note that there are other caches and buffers on many 
+ * busses. In particular, driver authors should read up on PCI writes.
+ *
+ * It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable:
+ * 
+ * Must be freed with iounmap.
+ */
+
+void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
+{
+       unsigned long last_addr;
+       void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
+       if (!p) 
+               return p; 
+
+       /* Guaranteed to be > phys_addr, as per __ioremap() */
+       last_addr = phys_addr + size - 1;
+
+       if (last_addr < virt_to_phys(high_memory) - 1) {
+               struct page *ppage = virt_to_page(__va(phys_addr));             
+               unsigned long npages;
+
+               phys_addr &= PAGE_MASK;
+
+               /* This might overflow and become zero.. */
+               last_addr = PAGE_ALIGN(last_addr);
+
+               /* .. but that's ok, because modulo-2**n arithmetic will make
+               * the page-aligned "last - first" come out right.
+               */
+               npages = (last_addr - phys_addr) >> PAGE_SHIFT;
+
+               if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { 
+                       iounmap(p); 
+                       p = NULL;
+               }
+               global_flush_tlb();
+       }
+
+       return p;                                       
+}
+EXPORT_SYMBOL(ioremap_nocache);
+
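For orientation, a minimal driver-side sketch of the ioremap_nocache()/iounmap() pairing above; this is not part of the patch, and the physical address and register offset are invented for illustration:

#include <linux/io.h>
#include <linux/types.h>

static u32 read_example_register(void)
{
	void __iomem *regs = ioremap_nocache(0xfebf0000UL, 0x1000);
	u32 val;

	if (!regs)
		return 0;
	val = readl(regs + 0x10);	/* hypothetical status register */
	iounmap(regs);
	return val;
}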
+/**
+ * iounmap - Free a IO remapping
+ * @addr: virtual address from ioremap_*
+ *
+ * Caller must ensure there is only one unmapping for the same pointer.
+ */
+void iounmap(volatile void __iomem *addr)
+{
+       struct vm_struct *p, *o;
+
+       if ((void __force *)addr <= high_memory)
+               return;
+
+       /*
+        * __ioremap special-cases the PCI/ISA range by not instantiating a
+        * vm_area and by simply returning an address into the kernel mapping
+        * of ISA space.   So handle that here.
+        */
+       if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
+                       addr < phys_to_virt(ISA_END_ADDRESS))
+               return;
+
+       addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
+
+       /* Use the vm area unlocked, assuming the caller
+          ensures there isn't another iounmap for the same address
+          in parallel. Reuse of the virtual address is prevented by
+          leaving it in the global lists until we're done with it.
+          cpa takes care of the direct mappings. */
+       read_lock(&vmlist_lock);
+       for (p = vmlist; p; p = p->next) {
+               if (p->addr == addr)
+                       break;
+       }
+       read_unlock(&vmlist_lock);
+
+       if (!p) {
+               printk("iounmap: bad address %p\n", addr);
+               dump_stack();
+               return;
+       }
+
+       /* Reset the direct mapping. Can block */
+       if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) {
+               change_page_attr(virt_to_page(__va(p->phys_addr)),
+                                get_vm_area_size(p) >> PAGE_SHIFT,
+                                PAGE_KERNEL);
+               global_flush_tlb();
+       } 
+
+       /* Finally remove it */
+       o = remove_vm_area((void *)addr);
+       BUG_ON(p != o || o == NULL);
+       kfree(p); 
+}
+EXPORT_SYMBOL(iounmap);
+
+void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
+{
+       unsigned long offset, last_addr;
+       unsigned int nrpages;
+       enum fixed_addresses idx;
+
+       /* Don't allow wraparound or zero size */
+       last_addr = phys_addr + size - 1;
+       if (!size || last_addr < phys_addr)
+               return NULL;
+
+       /*
+        * Don't remap the low PCI/ISA area, it's always mapped..
+        */
+       if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
+               return phys_to_virt(phys_addr);
+
+       /*
+        * Mappings have to be page-aligned
+        */
+       offset = phys_addr & ~PAGE_MASK;
+       phys_addr &= PAGE_MASK;
+       size = PAGE_ALIGN(last_addr) - phys_addr;
+
+       /*
+        * Mappings have to fit in the FIX_BTMAP area.
+        */
+       nrpages = size >> PAGE_SHIFT;
+       if (nrpages > NR_FIX_BTMAPS)
+               return NULL;
+
+       /*
+        * Ok, go for it..
+        */
+       idx = FIX_BTMAP_BEGIN;
+       while (nrpages > 0) {
+               set_fixmap(idx, phys_addr);
+               phys_addr += PAGE_SIZE;
+               --idx;
+               --nrpages;
+       }
+       return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
+}
+
+void __init bt_iounmap(void *addr, unsigned long size)
+{
+       unsigned long virt_addr;
+       unsigned long offset;
+       unsigned int nrpages;
+       enum fixed_addresses idx;
+
+       virt_addr = (unsigned long)addr;
+       if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
+               return;
+       offset = virt_addr & ~PAGE_MASK;
+       nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
+
+       idx = FIX_BTMAP_BEGIN;
+       while (nrpages > 0) {
+               clear_fixmap(idx);
+               --idx;
+               --nrpages;
+       }
+}
diff --git a/arch/x86/mm/mmap_32.c b/arch/x86/mm/mmap_32.c
new file mode 100644 (file)
index 0000000..552e084
--- /dev/null
@@ -0,0 +1,77 @@
+/*
+ *  linux/arch/i386/mm/mmap.c
+ *
+ *  flexible mmap layout support
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *
+ * Started by Ingo Molnar <mingo@elte.hu>
+ */
+
+#include <linux/personality.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+
+/*
+ * Top of mmap area (just below the process stack).
+ *
+ * Leave an at least ~128 MB hole.
+ */
+#define MIN_GAP (128*1024*1024)
+#define MAX_GAP (TASK_SIZE/6*5)
+
+static inline unsigned long mmap_base(struct mm_struct *mm)
+{
+       unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
+       unsigned long random_factor = 0;
+
+       if (current->flags & PF_RANDOMIZE)
+               random_factor = get_random_int() % (1024*1024);
+
+       if (gap < MIN_GAP)
+               gap = MIN_GAP;
+       else if (gap > MAX_GAP)
+               gap = MAX_GAP;
+
+       return PAGE_ALIGN(TASK_SIZE - gap - random_factor);
+}
+
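As a concrete example (assuming the usual 3 GB TASK_SIZE on i386, not something this patch changes): with an 8 MB RLIMIT_STACK the gap is clamped up to MIN_GAP = 128 MB, so without randomization mmap_base comes out at PAGE_ALIGN(0xC0000000 - 0x08000000) = 0xB8000000; with PF_RANDOMIZE set, up to about 1 MB more is subtracted before page alignment.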
+/*
+ * This function, called very early during the creation of a new
+ * process VM image, sets up which VM layout function to use:
+ */
+void arch_pick_mmap_layout(struct mm_struct *mm)
+{
+       /*
+        * Fall back to the standard layout if the personality
+        * bit is set, or if the expected stack growth is unlimited:
+        */
+       if (sysctl_legacy_va_layout ||
+                       (current->personality & ADDR_COMPAT_LAYOUT) ||
+                       current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) {
+               mm->mmap_base = TASK_UNMAPPED_BASE;
+               mm->get_unmapped_area = arch_get_unmapped_area;
+               mm->unmap_area = arch_unmap_area;
+       } else {
+               mm->mmap_base = mmap_base(mm);
+               mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+               mm->unmap_area = arch_unmap_area_topdown;
+       }
+}
diff --git a/arch/x86/mm/pageattr_32.c b/arch/x86/mm/pageattr_32.c
new file mode 100644 (file)
index 0000000..4241a74
--- /dev/null
@@ -0,0 +1,278 @@
+/* 
+ * Copyright 2002 Andi Kleen, SuSE Labs. 
+ * Thanks to Ben LaHaise for precious feedback.
+ */ 
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
+#include <asm/sections.h>
+
+static DEFINE_SPINLOCK(cpa_lock);
+static struct list_head df_list = LIST_HEAD_INIT(df_list);
+
+
+pte_t *lookup_address(unsigned long address) 
+{ 
+       pgd_t *pgd = pgd_offset_k(address);
+       pud_t *pud;
+       pmd_t *pmd;
+       if (pgd_none(*pgd))
+               return NULL;
+       pud = pud_offset(pgd, address);
+       if (pud_none(*pud))
+               return NULL;
+       pmd = pmd_offset(pud, address);
+       if (pmd_none(*pmd))
+               return NULL;
+       if (pmd_large(*pmd))
+               return (pte_t *)pmd;
+        return pte_offset_kernel(pmd, address);
+} 
+
+static struct page *split_large_page(unsigned long address, pgprot_t prot,
+                                       pgprot_t ref_prot)
+{ 
+       int i; 
+       unsigned long addr;
+       struct page *base;
+       pte_t *pbase;
+
+       spin_unlock_irq(&cpa_lock);
+       base = alloc_pages(GFP_KERNEL, 0);
+       spin_lock_irq(&cpa_lock);
+       if (!base) 
+               return NULL;
+
+       /*
+        * page_private is used to track the number of entries in
+        * the page table page that have non standard attributes.
+        */
+       SetPagePrivate(base);
+       page_private(base) = 0;
+
+       address = __pa(address);
+       addr = address & LARGE_PAGE_MASK; 
+       pbase = (pte_t *)page_address(base);
+       paravirt_alloc_pt(&init_mm, page_to_pfn(base));
+       for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
+               set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
+                                          addr == address ? prot : ref_prot));
+       }
+       return base;
+} 
+
+static void cache_flush_page(struct page *p)
+{ 
+       unsigned long adr = (unsigned long)page_address(p);
+       int i;
+       for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
+               asm volatile("clflush (%0)" :: "r" (adr + i));
+}
+
+static void flush_kernel_map(void *arg)
+{
+       struct list_head *lh = (struct list_head *)arg;
+       struct page *p;
+
+       /* High level code is not ready for clflush yet */
+       if (0 && cpu_has_clflush) {
+               list_for_each_entry (p, lh, lru)
+                       cache_flush_page(p);
+       } else if (boot_cpu_data.x86_model >= 4)
+               wbinvd();
+
+       /* Flush all to work around errata in early Athlons regarding
+        * large page flushing.
+        */
+       __flush_tlb_all();      
+}
+
+static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) 
+{ 
+       struct page *page;
+       unsigned long flags;
+
+       set_pte_atomic(kpte, pte);      /* change init_mm */
+       if (SHARED_KERNEL_PMD)
+               return;
+
+       spin_lock_irqsave(&pgd_lock, flags);
+       for (page = pgd_list; page; page = (struct page *)page->index) {
+               pgd_t *pgd;
+               pud_t *pud;
+               pmd_t *pmd;
+               pgd = (pgd_t *)page_address(page) + pgd_index(address);
+               pud = pud_offset(pgd, address);
+               pmd = pmd_offset(pud, address);
+               set_pte_atomic((pte_t *)pmd, pte);
+       }
+       spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+/* 
+ * No more special protections in this 2/4MB area - revert to a
+ * large page again. 
+ */
+static inline void revert_page(struct page *kpte_page, unsigned long address)
+{
+       pgprot_t ref_prot;
+       pte_t *linear;
+
+       ref_prot =
+       ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
+               ? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE;
+
+       linear = (pte_t *)
+               pmd_offset(pud_offset(pgd_offset_k(address), address), address);
+       set_pmd_pte(linear,  address,
+                   pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
+                           ref_prot));
+}
+
+static inline void save_page(struct page *kpte_page)
+{
+       if (!test_and_set_bit(PG_arch_1, &kpte_page->flags))
+               list_add(&kpte_page->lru, &df_list);
+}
+
+static int
+__change_page_attr(struct page *page, pgprot_t prot)
+{ 
+       pte_t *kpte; 
+       unsigned long address;
+       struct page *kpte_page;
+
+       BUG_ON(PageHighMem(page));
+       address = (unsigned long)page_address(page);
+
+       kpte = lookup_address(address);
+       if (!kpte)
+               return -EINVAL;
+       kpte_page = virt_to_page(kpte);
+       BUG_ON(PageLRU(kpte_page));
+       BUG_ON(PageCompound(kpte_page));
+
+       if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { 
+               if (!pte_huge(*kpte)) {
+                       set_pte_atomic(kpte, mk_pte(page, prot)); 
+               } else {
+                       pgprot_t ref_prot;
+                       struct page *split;
+
+                       ref_prot =
+                       ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
+                               ? PAGE_KERNEL_EXEC : PAGE_KERNEL;
+                       split = split_large_page(address, prot, ref_prot);
+                       if (!split)
+                               return -ENOMEM;
+                       set_pmd_pte(kpte, address, mk_pte(split, ref_prot));
+                       kpte_page = split;
+               }
+               page_private(kpte_page)++;
+       } else if (!pte_huge(*kpte)) {
+               set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
+               BUG_ON(page_private(kpte_page) == 0);
+               page_private(kpte_page)--;
+       } else
+               BUG();
+
+       /*
+        * If the pte was reserved, it means it was created at boot
+        * time (not via split_large_page) and in turn we must not
+        * replace it with a large page.
+        */
+
+       save_page(kpte_page);
+       if (!PageReserved(kpte_page)) {
+               if (cpu_has_pse && (page_private(kpte_page) == 0)) {
+                       paravirt_release_pt(page_to_pfn(kpte_page));
+                       revert_page(kpte_page, address);
+               }
+       }
+       return 0;
+} 
+
+static inline void flush_map(struct list_head *l)
+{
+       on_each_cpu(flush_kernel_map, l, 1, 1);
+}
+
+/*
+ * Change the page attributes of a page in the linear mapping.
+ *
+ * This should be used when a page is mapped with a different caching policy
+ * than write-back somewhere - some CPUs do not like it when mappings with
+ * different caching policies exist. This changes the page attributes of the
+ * in-kernel linear mapping too.
+ * 
+ * The caller needs to ensure that there are no conflicting mappings elsewhere.
+ * This function only deals with the kernel linear map.
+ * 
+ * Caller must call global_flush_tlb() after this.
+ */
+int change_page_attr(struct page *page, int numpages, pgprot_t prot)
+{
+       int err = 0; 
+       int i; 
+       unsigned long flags;
+
+       spin_lock_irqsave(&cpa_lock, flags);
+       for (i = 0; i < numpages; i++, page++) { 
+               err = __change_page_attr(page, prot);
+               if (err) 
+                       break; 
+       }       
+       spin_unlock_irqrestore(&cpa_lock, flags);
+       return err;
+}
+
+void global_flush_tlb(void)
+{
+       struct list_head l;
+       struct page *pg, *next;
+
+       BUG_ON(irqs_disabled());
+
+       spin_lock_irq(&cpa_lock);
+       list_replace_init(&df_list, &l);
+       spin_unlock_irq(&cpa_lock);
+       flush_map(&l);
+       list_for_each_entry_safe(pg, next, &l, lru) {
+               list_del(&pg->lru);
+               clear_bit(PG_arch_1, &pg->flags);
+               if (PageReserved(pg) || !cpu_has_pse || page_private(pg) != 0)
+                       continue;
+               ClearPagePrivate(pg);
+               __free_page(pg);
+       }
+}
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+void kernel_map_pages(struct page *page, int numpages, int enable)
+{
+       if (PageHighMem(page))
+               return;
+       if (!enable)
+               debug_check_no_locks_freed(page_address(page),
+                                          numpages * PAGE_SIZE);
+
+       /* The return value is ignored - the calls cannot fail, since
+        * large pages are disabled at boot time.
+        */
+       change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
+       /* We should perform an IPI and flush all TLBs, but that can
+        * deadlock, so flush only the current CPU.
+        */
+       __flush_tlb_all();
+}
+#endif
+
+EXPORT_SYMBOL(change_page_attr);
+EXPORT_SYMBOL(global_flush_tlb);
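
For illustration only, not part of this patch: the comment above change_page_attr() requires callers to change the attributes first and then issue a single global_flush_tlb(). A hypothetical caller that maps a contiguous run of pages uncached in the kernel linear map might look roughly like the sketch below; example_set_uncached() is invented for this note and assumes the usual change_page_attr()/global_flush_tlb() declarations from <asm/cacheflush.h> and the PAGE_KERNEL_NOCACHE protection from <asm/pgtable.h>.

    #include <linux/mm.h>
    #include <asm/cacheflush.h>     /* change_page_attr(), global_flush_tlb() assumed declared here */
    #include <asm/pgtable.h>        /* PAGE_KERNEL, PAGE_KERNEL_NOCACHE */

    /* Illustrative sketch only; not part of the moved file. */
    static int example_set_uncached(struct page *pages, int nr)
    {
            int err;

            /* change the linear-map attributes of each page */
            err = change_page_attr(pages, nr, PAGE_KERNEL_NOCACHE);
            if (err)
                    return err;

            /* one global flush after all attribute changes, per the comment above */
            global_flush_tlb();
            return 0;
    }

The reverse operation is symmetric: change_page_attr(pages, nr, PAGE_KERNEL) followed by another global_flush_tlb().
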
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
new file mode 100644 (file)
index 0000000..01437c4
--- /dev/null
@@ -0,0 +1,373 @@
+/*
+ *  linux/arch/x86/mm/pgtable_32.c
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/quicklist.h>
+
+#include <asm/system.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/fixmap.h>
+#include <asm/e820.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+
+void show_mem(void)
+{
+       int total = 0, reserved = 0;
+       int shared = 0, cached = 0;
+       int highmem = 0;
+       struct page *page;
+       pg_data_t *pgdat;
+       unsigned long i;
+       unsigned long flags;
+
+       printk(KERN_INFO "Mem-info:\n");
+       show_free_areas();
+       printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
+       for_each_online_pgdat(pgdat) {
+               pgdat_resize_lock(pgdat, &flags);
+               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
+                       page = pgdat_page_nr(pgdat, i);
+                       total++;
+                       if (PageHighMem(page))
+                               highmem++;
+                       if (PageReserved(page))
+                               reserved++;
+                       else if (PageSwapCache(page))
+                               cached++;
+                       else if (page_count(page))
+                               shared += page_count(page) - 1;
+               }
+               pgdat_resize_unlock(pgdat, &flags);
+       }
+       printk(KERN_INFO "%d pages of RAM\n", total);
+       printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
+       printk(KERN_INFO "%d reserved pages\n", reserved);
+       printk(KERN_INFO "%d pages shared\n", shared);
+       printk(KERN_INFO "%d pages swap cached\n", cached);
+
+       printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
+       printk(KERN_INFO "%lu pages writeback\n",
+                                       global_page_state(NR_WRITEBACK));
+       printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
+       printk(KERN_INFO "%lu pages slab\n",
+               global_page_state(NR_SLAB_RECLAIMABLE) +
+               global_page_state(NR_SLAB_UNRECLAIMABLE));
+       printk(KERN_INFO "%lu pages pagetables\n",
+                                       global_page_state(NR_PAGETABLE));
+}
+
+/*
+ * Associate a virtual page frame with a given physical page frame 
+ * and protection flags for that frame.
+ */ 
+static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       pgd = swapper_pg_dir + pgd_index(vaddr);
+       if (pgd_none(*pgd)) {
+               BUG();
+               return;
+       }
+       pud = pud_offset(pgd, vaddr);
+       if (pud_none(*pud)) {
+               BUG();
+               return;
+       }
+       pmd = pmd_offset(pud, vaddr);
+       if (pmd_none(*pmd)) {
+               BUG();
+               return;
+       }
+       pte = pte_offset_kernel(pmd, vaddr);
+       if (pgprot_val(flags))
+               /* <pfn,flags> stored as-is, to permit clearing entries */
+               set_pte(pte, pfn_pte(pfn, flags));
+       else
+               pte_clear(&init_mm, vaddr, pte);
+
+       /*
+        * It's enough to flush this one mapping.
+        * (PGE mappings get flushed as well)
+        */
+       __flush_tlb_one(vaddr);
+}
+
+/*
+ * Associate a large virtual page frame with a given physical page frame 
+ * and protection flags for that frame. pfn is for the base of the page,
+ * vaddr is what the page gets mapped to - both must be properly aligned. 
+ * The pmd must already be instantiated. Assumes PAE mode.
+ */ 
+void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+
+       if (vaddr & (PMD_SIZE-1)) {             /* vaddr is misaligned */
+               printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
+               return; /* BUG(); */
+       }
+       if (pfn & (PTRS_PER_PTE-1)) {           /* pfn is misaligned */
+               printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
+               return; /* BUG(); */
+       }
+       pgd = swapper_pg_dir + pgd_index(vaddr);
+       if (pgd_none(*pgd)) {
+               printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
+               return; /* BUG(); */
+       }
+       pud = pud_offset(pgd, vaddr);
+       pmd = pmd_offset(pud, vaddr);
+       set_pmd(pmd, pfn_pmd(pfn, flags));
+       /*
+        * It's enough to flush this one mapping.
+        * (PGE mappings get flushed as well)
+        */
+       __flush_tlb_one(vaddr);
+}
+
+static int fixmaps;
+unsigned long __FIXADDR_TOP = 0xfffff000;
+EXPORT_SYMBOL(__FIXADDR_TOP);
+
+void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
+{
+       unsigned long address = __fix_to_virt(idx);
+
+       if (idx >= __end_of_fixed_addresses) {
+               BUG();
+               return;
+       }
+       set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
+       fixmaps++;
+}
+
+/**
+ * reserve_top_address - reserves a hole in the top of kernel address space
+ * @reserve: size of hole to reserve
+ *
+ * Can be used to relocate the fixmap area and poke a hole in the top
+ * of kernel address space to make room for a hypervisor.
+ */
+void reserve_top_address(unsigned long reserve)
+{
+       BUG_ON(fixmaps > 0);
+       printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
+              (int)-reserve);
+       __FIXADDR_TOP = -reserve - PAGE_SIZE;
+       __VMALLOC_RESERVE += reserve;
+}
+
+pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+{
+       return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+}
+
+struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+       struct page *pte;
+
+#ifdef CONFIG_HIGHPTE
+       pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
+#else
+       pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+#endif
+       return pte;
+}
+
+void pmd_ctor(void *pmd, struct kmem_cache *cache, unsigned long flags)
+{
+       memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
+}
+
+/*
+ * List of all pgd's needed for non-PAE so it can invalidate entries
+ * in both cached and uncached pgd's; not needed for PAE since the
+ * kernel pmd is shared. If PAE were not to share the pmd a similar
+ * tactic would be needed. This is essentially codepath-based locking
+ * against pageattr.c; it is the unique case in which a valid change
+ * of kernel pagetables can't be lazily synchronized by vmalloc faults.
+ * vmalloc faults work because attached pagetables are never freed.
+ * -- wli
+ */
+DEFINE_SPINLOCK(pgd_lock);
+struct page *pgd_list;
+
+static inline void pgd_list_add(pgd_t *pgd)
+{
+       struct page *page = virt_to_page(pgd);
+       page->index = (unsigned long)pgd_list;
+       if (pgd_list)
+               set_page_private(pgd_list, (unsigned long)&page->index);
+       pgd_list = page;
+       set_page_private(page, (unsigned long)&pgd_list);
+}
+
+static inline void pgd_list_del(pgd_t *pgd)
+{
+       struct page *next, **pprev, *page = virt_to_page(pgd);
+       next = (struct page *)page->index;
+       pprev = (struct page **)page_private(page);
+       *pprev = next;
+       if (next)
+               set_page_private(next, (unsigned long)pprev);
+}
+
+
+
+#if (PTRS_PER_PMD == 1)
+/* Non-PAE pgd constructor */
+static void pgd_ctor(void *pgd)
+{
+       unsigned long flags;
+
+       /* !PAE, no pagetable sharing */
+       memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
+
+       spin_lock_irqsave(&pgd_lock, flags);
+
+       /* must happen under lock */
+       clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
+                       swapper_pg_dir + USER_PTRS_PER_PGD,
+                       KERNEL_PGD_PTRS);
+       paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
+                               __pa(swapper_pg_dir) >> PAGE_SHIFT,
+                               USER_PTRS_PER_PGD,
+                               KERNEL_PGD_PTRS);
+       pgd_list_add(pgd);
+       spin_unlock_irqrestore(&pgd_lock, flags);
+}
+#else  /* PTRS_PER_PMD > 1 */
+/* PAE pgd constructor */
+static void pgd_ctor(void *pgd)
+{
+       /* PAE, kernel PMD may be shared */
+
+       if (SHARED_KERNEL_PMD) {
+               clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
+                               swapper_pg_dir + USER_PTRS_PER_PGD,
+                               KERNEL_PGD_PTRS);
+       } else {
+               unsigned long flags;
+
+               memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
+               spin_lock_irqsave(&pgd_lock, flags);
+               pgd_list_add(pgd);
+               spin_unlock_irqrestore(&pgd_lock, flags);
+       }
+}
+#endif /* PTRS_PER_PMD */
+
+static void pgd_dtor(void *pgd)
+{
+       unsigned long flags; /* can be called from interrupt context */
+
+       if (SHARED_KERNEL_PMD)
+               return;
+
+       paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
+       spin_lock_irqsave(&pgd_lock, flags);
+       pgd_list_del(pgd);
+       spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+#define UNSHARED_PTRS_PER_PGD                          \
+       (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
+
+/* If we allocate a pmd for part of the kernel address space, then
+   make sure it is initialized with the appropriate kernel mappings.
+   Otherwise use a cached zeroed pmd.  */
+static pmd_t *pmd_cache_alloc(int idx)
+{
+       pmd_t *pmd;
+
+       if (idx >= USER_PTRS_PER_PGD) {
+               pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
+
+               if (pmd)
+                       memcpy(pmd,
+                              (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
+                              sizeof(pmd_t) * PTRS_PER_PMD);
+       } else
+               pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
+
+       return pmd;
+}
+
+static void pmd_cache_free(pmd_t *pmd, int idx)
+{
+       if (idx >= USER_PTRS_PER_PGD)
+               free_page((unsigned long)pmd);
+       else
+               kmem_cache_free(pmd_cache, pmd);
+}
+
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+       int i;
+       pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
+
+       if (PTRS_PER_PMD == 1 || !pgd)
+               return pgd;
+
+       for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
+               pmd_t *pmd = pmd_cache_alloc(i);
+
+               if (!pmd)
+                       goto out_oom;
+
+               paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
+               set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
+       }
+       return pgd;
+
+out_oom:
+       for (i--; i >= 0; i--) {
+               pgd_t pgdent = pgd[i];
+               void* pmd = (void *)__va(pgd_val(pgdent)-1);
+               paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
+               pmd_cache_free(pmd, i);
+       }
+       quicklist_free(0, pgd_dtor, pgd);
+       return NULL;
+}
+
+void pgd_free(pgd_t *pgd)
+{
+       int i;
+
+       /* in the PAE case user pgd entries are overwritten before usage */
+       if (PTRS_PER_PMD > 1)
+               for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
+                       pgd_t pgdent = pgd[i];
+                       void* pmd = (void *)__va(pgd_val(pgdent)-1);
+                       paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
+                       pmd_cache_free(pmd, i);
+               }
+       /* in the non-PAE case, free_pgtables() clears user pgd entries */
+       quicklist_free(0, pgd_dtor, pgd);
+}
+
+void check_pgt_cache(void)
+{
+       quicklist_trim(0, pgd_dtor, 25, 16);
+}
+
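
For illustration only, not part of this patch: reserve_top_address() above is intended to be called once, very early, by a hypervisor-aware guest, before the first fixmap entry is established (the BUG_ON(fixmaps > 0) enforces that ordering). A hypothetical paravirt backend that keeps the top 4MB of virtual address space for its hypervisor might do roughly the following; example_hypervisor_init() is invented for this note and assumes the reserve_top_address() declaration normally provided by <asm/fixmap.h>.

    #include <linux/init.h>
    #include <asm/fixmap.h>         /* reserve_top_address() declaration assumed here */

    /* Illustrative sketch only; not part of the moved file. */
    static void __init example_hypervisor_init(void)
    {
            /* leave a 4MB hole at the top of the kernel address space */
            reserve_top_address(4 * 1024 * 1024);

            /* fixmaps (and __set_fixmap() above) may only be used after this point */
    }
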
index 4042f85..7317648 100644 (file)
@@ -1,5 +1,5 @@
 ifeq ($(CONFIG_X86_32),y)
-include ${srctree}/arch/i386/mm/Makefile_32
+include ${srctree}/arch/x86/mm/Makefile_32
 else
 include ${srctree}/arch/x86_64/mm/Makefile_64
 endif
index 2e20988..5c2883c 100644 (file)
@@ -8,4 +8,4 @@ obj-$(CONFIG_NUMA) += numa_64.o
 obj-$(CONFIG_K8_NUMA) += k8topology_64.o
 obj-$(CONFIG_ACPI_NUMA) += srat_64.o
 
-hugetlbpage-y = ../../i386/mm/hugetlbpage.o
+hugetlbpage-y = ../../x86/mm/hugetlbpage.o