[PATCH] mm: fix a race condition under SMC + COW
[linux-2.6-microblaze.git] / mm / memory.c
index fa941b1..160f5b5 100644 (file)
@@ -49,6 +49,7 @@
 #include <linux/module.h>
 #include <linux/delayacct.h>
 #include <linux/init.h>
+#include <linux/writeback.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -1226,7 +1227,12 @@ out:
        return retval;
 }
 
-/*
+/**
+ * vm_insert_page - insert single page into user vma
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @page: source kernel page
+ *
  * This allows drivers to insert individual pages they've allocated
  * into a user vma.
  *
@@ -1318,7 +1324,16 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
        return 0;
 }
 
-/*  Note: this is only safe if the mm semaphore is held when called. */
+/**
+ * remap_pfn_range - remap kernel memory to userspace
+ * @vma: user vma to map to
+ * @addr: target user address to start at
+ * @pfn: page frame number of kernel physical memory
+ * @size: size of map area
+ * @prot: page protection flags for this mapping
+ *
+ *  Note: this is only safe if the mm semaphore is held when called.
+ */
 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
                    unsigned long pfn, unsigned long size, pgprot_t prot)
 {
@@ -1466,11 +1481,21 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                goto gotten;
 
        /*
-        * Only catch write-faults on shared writable pages, read-only
-        * shared pages can get COWed by get_user_pages(.write=1, .force=1).
+        * Take out anonymous pages first, anonymous shared vmas are
+        * not dirty accountable.
         */
-       if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+       if (PageAnon(old_page)) {
+               if (!TestSetPageLocked(old_page)) {
+                       reuse = can_share_swap_page(old_page);
+                       unlock_page(old_page);
+               }
+       } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                                        (VM_WRITE|VM_SHARED))) {
+               /*
+                * Only catch write-faults on shared writable pages,
+                * read-only shared pages can get COWed by
+                * get_user_pages(.write=1, .force=1).
+                */
                if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
                        /*
                         * Notify the address space that the page is about to
@@ -1502,9 +1527,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                dirty_page = old_page;
                get_page(dirty_page);
                reuse = 1;
-       } else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
-               reuse = can_share_swap_page(old_page);
-               unlock_page(old_page);
        }
 
        if (reuse) {
@@ -1555,7 +1577,14 @@ gotten:
                entry = mk_pte(new_page, vma->vm_page_prot);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                lazy_mmu_prot_update(entry);
-               ptep_establish(vma, address, page_table, entry);
+               /*
+                * Clear the pte entry and flush it first, before updating the
+                * pte with the new entry. This will avoid a race condition
+                * seen in the presence of one thread doing SMC and another
+                * thread doing COW.
+                */
+               ptep_clear_flush(vma, address, page_table);
+               set_pte_at(mm, address, page_table, entry);
                update_mmu_cache(vma, address, entry);
                lru_cache_add_active(new_page);
                page_add_new_anon_rmap(new_page, vma, address);
@@ -1571,7 +1600,7 @@ gotten:
 unlock:
        pte_unmap_unlock(page_table, ptl);
        if (dirty_page) {
-               set_page_dirty(dirty_page);
+               set_page_dirty_balance(dirty_page);
                put_page(dirty_page);
        }
        return ret;
@@ -1793,9 +1822,10 @@ void unmap_mapping_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
-/*
- * Handle all mappings that got truncated by a "truncate()"
- * system call.
+/**
+ * vmtruncate - unmap mappings "freed" by truncate() syscall
+ * @inode: inode of the file used
+ * @offset: file offset to start truncating
  *
  * NOTE! We have to be ready to update the memory sharing
  * between the file and the memory map for a potential last
@@ -1864,11 +1894,16 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 }
 EXPORT_UNUSED_SYMBOL(vmtruncate_range);  /*  June 2006  */
 
-/* 
+/**
+ * swapin_readahead - swap in pages in hope we need them soon
+ * @entry: swap entry of this memory
+ * @addr: address to start
+ * @vma: user vma this address belongs to
+ *
  * Primitive swap readahead code. We simply read an aligned block of
  * (1 << page_cluster) entries in the swap area. This method is chosen
  * because it doesn't cost us any seek time.  We also make sure to queue
- * the 'original' request together with the readahead ones...  
+ * the 'original' request together with the readahead ones...
  *
  * This has been extended to use the NUMA policies from the mm triggering
  * the readahead.
@@ -2218,7 +2253,7 @@ retry:
 unlock:
        pte_unmap_unlock(page_table, ptl);
        if (dirty_page) {
-               set_page_dirty(dirty_page);
+               set_page_dirty_balance(dirty_page);
                put_page(dirty_page);
        }
        return ret;
@@ -2227,6 +2262,54 @@ oom:
        return VM_FAULT_OOM;
 }
 
+/*
+ * do_no_pfn() tries to create a new page mapping for a page without
+ * a struct page backing it
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ *
+ * It is expected that the ->nopfn handler always returns the same pfn
+ * for a given virtual mapping.
+ *
+ * Mark this `noinline' to prevent it from bloating the main pagefault code.
+ */
+static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
+                    unsigned long address, pte_t *page_table, pmd_t *pmd,
+                    int write_access)
+{
+       spinlock_t *ptl;
+       pte_t entry;
+       unsigned long pfn;
+       int ret = VM_FAULT_MINOR;       /* never reassigned: the only non-error result */
+
+       pte_unmap(page_table);          /* remapped and locked further down, after ->nopfn */
+       BUG_ON(!(vma->vm_flags & VM_PFNMAP));
+       BUG_ON(is_cow_mapping(vma->vm_flags));
+
+       pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
+       if (pfn == NOPFN_OOM)
+               return VM_FAULT_OOM;
+       if (pfn == NOPFN_SIGBUS)
+               return VM_FAULT_SIGBUS;
+
+       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+
+       /* Only go through if we didn't race with anybody else... */
+       if (pte_none(*page_table)) {
+               entry = pfn_pte(pfn, vma->vm_page_prot);
+               if (write_access)
+                       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+               set_pte_at(mm, address, page_table, entry);
+       }
+       pte_unmap_unlock(page_table, ptl);
+       return ret;
+}
+
 /*
  * Fault of a previously existing named mapping. Repopulate the pte
  * from the encoded file_pte if possible. This enables swappable
@@ -2289,11 +2372,17 @@ static inline int handle_pte_fault(struct mm_struct *mm,
        old_entry = entry = *pte;
        if (!pte_present(entry)) {
                if (pte_none(entry)) {
-                       if (!vma->vm_ops || !vma->vm_ops->nopage)
-                               return do_anonymous_page(mm, vma, address,
-                                       pte, pmd, write_access);
-                       return do_no_page(mm, vma, address,
-                                       pte, pmd, write_access);
+                       if (vma->vm_ops) {
+                               if (vma->vm_ops->nopage)
+                                       return do_no_page(mm, vma, address,
+                                                         pte, pmd,
+                                                         write_access);
+                               if (unlikely(vma->vm_ops->nopfn))
+                                       return do_no_pfn(mm, vma, address, pte,
+                                                        pmd, write_access);
+                       }
+                       return do_anonymous_page(mm, vma, address,
+                                                pte, pmd, write_access);
                }
                if (pte_file(entry))
                        return do_file_page(mm, vma, address,
@@ -2522,3 +2611,56 @@ int in_gate_area_no_task(unsigned long addr)
 }
 
 #endif /* __HAVE_ARCH_GATE_AREA */
+
+/*
+ * Access another process' address space: copy up to @len bytes between the
+ * kernel-space buffer @buf and @addr in @tsk's mm, direction chosen by @write.
+ * Do not walk the page table directly, use get_user_pages.
+ */
+int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+{
+       struct mm_struct *mm;
+       struct vm_area_struct *vma;
+       struct page *page;
+       void *old_buf = buf;
+
+       mm = get_task_mm(tsk);
+       if (!mm)
+               return 0;       /* no mm obtained for the task: nothing transferred */
+
+       down_read(&mm->mmap_sem);
+       /* ignore errors, just check how much was successfully transferred */
+       while (len) {
+               int bytes, ret, offset;
+               void *maddr;
+
+               ret = get_user_pages(tsk, mm, addr, 1,
+                               write, 1, &page, &vma);
+               if (ret <= 0)
+                       break;
+
+               bytes = len;
+               offset = addr & (PAGE_SIZE-1);
+               if (bytes > PAGE_SIZE-offset)   /* one page per pass: stop at its end */
+                       bytes = PAGE_SIZE-offset;
+
+               maddr = kmap(page);
+               if (write) {
+                       copy_to_user_page(vma, page, addr,
+                                         maddr + offset, buf, bytes);
+                       set_page_dirty_lock(page);
+               } else {
+                       copy_from_user_page(vma, page, addr,
+                                           buf, maddr + offset, bytes);
+               }
+               kunmap(page);
+               page_cache_release(page);
+               len -= bytes;
+               buf += bytes;
+               addr += bytes;
+       }
+       up_read(&mm->mmap_sem);
+       mmput(mm);
+
+       return buf - old_buf;   /* bytes transferred; less than len if the loop broke early */
+}