x86/mm: Break out user address space handling
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 47bebfe..0d1f5d3 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -966,6 +966,7 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
                __bad_area(regs, error_code, address, vma, SEGV_ACCERR);
 }
 
+/* Handle faults in the kernel portion of the address space */
 static void
 do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
          u32 *pkey, unsigned int fault)
@@ -1032,7 +1033,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
        }
 }
 
-static int spurious_fault_check(unsigned long error_code, pte_t *pte)
+static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
 {
        if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
                return 0;
@@ -1071,7 +1072,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
  * (Optional Invalidation).
  */
 static noinline int
-spurious_fault(unsigned long error_code, unsigned long address)
+spurious_kernel_fault(unsigned long error_code, unsigned long address)
 {
        pgd_t *pgd;
        p4d_t *p4d;
@@ -1102,27 +1103,27 @@ spurious_fault(unsigned long error_code, unsigned long address)
                return 0;
 
        if (p4d_large(*p4d))
-               return spurious_fault_check(error_code, (pte_t *) p4d);
+               return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
 
        pud = pud_offset(p4d, address);
        if (!pud_present(*pud))
                return 0;
 
        if (pud_large(*pud))
-               return spurious_fault_check(error_code, (pte_t *) pud);
+               return spurious_kernel_fault_check(error_code, (pte_t *) pud);
 
        pmd = pmd_offset(pud, address);
        if (!pmd_present(*pmd))
                return 0;
 
        if (pmd_large(*pmd))
-               return spurious_fault_check(error_code, (pte_t *) pmd);
+               return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
 
        pte = pte_offset_kernel(pmd, address);
        if (!pte_present(*pte))
                return 0;
 
-       ret = spurious_fault_check(error_code, pte);
+       ret = spurious_kernel_fault_check(error_code, pte);
        if (!ret)
                return 0;
 
@@ -1130,12 +1131,12 @@ spurious_fault(unsigned long error_code, unsigned long address)
         * Make sure we have permissions in PMD.
         * If not, then there's a bug in the page tables:
         */
-       ret = spurious_fault_check(error_code, (pte_t *) pmd);
+       ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
        WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
 
        return ret;
 }
-NOKPROBE_SYMBOL(spurious_fault);
+NOKPROBE_SYMBOL(spurious_kernel_fault);
 
 int show_unhandled_signals = 1;
 
@@ -1203,31 +1204,16 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
 }
 
 /*
- * This routine handles page faults.  It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
+ * Called for all faults where 'address' is part of the kernel address
+ * space.  Might get called for faults that originate from *code* that
+ * ran in userspace or the kernel.
  */
-static noinline void
-__do_page_fault(struct pt_regs *regs, unsigned long error_code,
-               unsigned long address)
+static void
+do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
+                  unsigned long address)
 {
-       struct vm_area_struct *vma;
-       struct task_struct *tsk;
-       struct mm_struct *mm;
-       vm_fault_t fault, major = 0;
-       unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
-       u32 pkey;
-
-       tsk = current;
-       mm = tsk->mm;
-
-       prefetchw(&mm->mmap_sem);
-
-       if (unlikely(kmmio_fault(regs, address)))
-               return;
-
        /*
-        * We fault-in kernel-space virtual memory on-demand. The
+        * We can fault-in kernel-space virtual memory on-demand. The
         * 'reference' page table is init_mm.pgd.
         *
         * NOTE! We MUST NOT take any locks for this case. We may
@@ -1235,41 +1221,66 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
         * only copy the information from the master page table,
         * nothing more.
         *
-        * This verifies that the fault happens in kernel space
-        * (error_code & 4) == 0, and that the fault was not a
-        * protection error (error_code & 9) == 0.
+        * Before doing this on-demand faulting, ensure that the
+        * fault is not any of the following:
+        * 1. A fault on a PTE with a reserved bit set.
+        * 2. A fault caused by a user-mode access.  (Do not demand-
+        *    fault kernel memory due to user-mode accesses).
+        * 3. A fault caused by a page-level protection violation.
+        *    (A demand fault would be on a non-present page which
+        *     would have X86_PF_PROT==0).
         */
-       if (unlikely(fault_in_kernel_space(address))) {
-               if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
-                       if (vmalloc_fault(address) >= 0)
-                               return;
-               }
-
-               /* Can handle a stale RO->RW TLB: */
-               if (spurious_fault(error_code, address))
+       if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
+               if (vmalloc_fault(address) >= 0)
                        return;
+       }
 
-               /* kprobes don't want to hook the spurious faults: */
-               if (kprobes_fault(regs))
-                       return;
-               /*
-                * Don't take the mm semaphore here. If we fixup a prefetch
-                * fault we could otherwise deadlock:
-                */
-               bad_area_nosemaphore(regs, error_code, address, NULL);
+       /* Was the fault spurious, caused by lazy TLB invalidation? */
+       if (spurious_kernel_fault(hw_error_code, address))
+               return;
 
+       /* kprobes don't want to hook the spurious faults: */
+       if (kprobes_fault(regs))
                return;
-       }
+
+       /*
+        * Note, despite being a "bad area", there are quite a few
+        * acceptable reasons to get here, such as erratum fixups
+        * and handling kernel code that can fault, like get_user().
+        *
+        * Don't take the mm semaphore here. If we fixup a prefetch
+        * fault we could otherwise deadlock:
+        */
+       bad_area_nosemaphore(regs, hw_error_code, address, NULL);
+}
+NOKPROBE_SYMBOL(do_kern_addr_fault);
+
+/* Handle faults in the user portion of the address space */
+static inline
+void do_user_addr_fault(struct pt_regs *regs,
+                       unsigned long hw_error_code,
+                       unsigned long address)
+{
+       unsigned long sw_error_code;
+       struct vm_area_struct *vma;
+       struct task_struct *tsk;
+       struct mm_struct *mm;
+       vm_fault_t fault, major = 0;
+       unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+       u32 pkey;
+
+       tsk = current;
+       mm = tsk->mm;
 
        /* kprobes don't want to hook the spurious faults: */
        if (unlikely(kprobes_fault(regs)))
                return;
 
-       if (unlikely(error_code & X86_PF_RSVD))
-               pgtable_bad(regs, error_code, address);
+       if (unlikely(hw_error_code & X86_PF_RSVD))
+               pgtable_bad(regs, hw_error_code, address);
 
-       if (unlikely(smap_violation(error_code, regs))) {
-               bad_area_nosemaphore(regs, error_code, address, NULL);
+       if (unlikely(smap_violation(hw_error_code, regs))) {
+               bad_area_nosemaphore(regs, hw_error_code, address, NULL);
                return;
        }
 
@@ -1278,10 +1289,17 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
         * in a region with pagefaults disabled then we must not take the fault
         */
        if (unlikely(faulthandler_disabled() || !mm)) {
-               bad_area_nosemaphore(regs, error_code, address, NULL);
+               bad_area_nosemaphore(regs, hw_error_code, address, NULL);
                return;
        }
 
+       /*
+        * hw_error_code is literally the "page fault error code" passed to
+        * the kernel directly from the hardware.  But, we will shortly be
+        * modifying it in software, so give it a new name.
+        */
+       sw_error_code = hw_error_code;
+
        /*
         * It's safe to allow irq's after cr2 has been saved and the
         * vmalloc fault has been handled.
@@ -1291,7 +1309,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
         */
        if (user_mode(regs)) {
                local_irq_enable();
-               error_code |= X86_PF_USER;
+               /*
+                * Up to this point, X86_PF_USER set in hw_error_code
+                * indicated a user-mode access.  But, after this,
+                * X86_PF_USER in sw_error_code will indicate either
+                * that, *or* an implicit kernel(supervisor)-mode access
+                * which originated from user mode.
+                */
+               if (!(hw_error_code & X86_PF_USER)) {
+                       /*
+                        * The CPU was in user mode, but the CPU says
+                        * the fault was not a user-mode access.
+                        * Must be an implicit kernel-mode access,
+                        * which we do not expect to happen in the
+                        * user address space.
+                        */
+                       pr_warn_once("kernel-mode error from user-mode: %lx\n",
+                                       hw_error_code);
+
+                       sw_error_code |= X86_PF_USER;
+               }
                flags |= FAULT_FLAG_USER;
        } else {
                if (regs->flags & X86_EFLAGS_IF)
@@ -1300,9 +1337,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 
        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
-       if (error_code & X86_PF_WRITE)
+       if (sw_error_code & X86_PF_WRITE)
                flags |= FAULT_FLAG_WRITE;
-       if (error_code & X86_PF_INSTR)
+       if (sw_error_code & X86_PF_INSTR)
                flags |= FAULT_FLAG_INSTRUCTION;
 
        /*
@@ -1322,9 +1359,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
         * space check, thus avoiding the deadlock:
         */
        if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
-               if (!(error_code & X86_PF_USER) &&
+               if (!(sw_error_code & X86_PF_USER) &&
                    !search_exception_tables(regs->ip)) {
-                       bad_area_nosemaphore(regs, error_code, address, NULL);
+                       bad_area_nosemaphore(regs, sw_error_code, address, NULL);
                        return;
                }
 retry:
@@ -1340,16 +1377,16 @@ retry:
 
        vma = find_vma(mm, address);
        if (unlikely(!vma)) {
-               bad_area(regs, error_code, address);
+               bad_area(regs, sw_error_code, address);
                return;
        }
        if (likely(vma->vm_start <= address))
                goto good_area;
        if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
-               bad_area(regs, error_code, address);
+               bad_area(regs, sw_error_code, address);
                return;
        }
-       if (error_code & X86_PF_USER) {
+       if (sw_error_code & X86_PF_USER) {
                /*
                 * Accessing the stack below %sp is always a bug.
                 * The large cushion allows instructions like enter
@@ -1357,12 +1394,12 @@ retry:
                 * 32 pointers and then decrements %sp by 65535.)
                 */
                if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
-                       bad_area(regs, error_code, address);
+                       bad_area(regs, sw_error_code, address);
                        return;
                }
        }
        if (unlikely(expand_stack(vma, address))) {
-               bad_area(regs, error_code, address);
+               bad_area(regs, sw_error_code, address);
                return;
        }
 
@@ -1371,8 +1408,8 @@ retry:
         * we can handle it..
         */
 good_area:
-       if (unlikely(access_error(error_code, vma))) {
-               bad_area_access_error(regs, error_code, address, vma);
+       if (unlikely(access_error(sw_error_code, vma))) {
+               bad_area_access_error(regs, sw_error_code, address, vma);
                return;
        }
 
@@ -1414,13 +1451,13 @@ good_area:
                        return;
 
                /* Not returning to user mode? Handle exceptions or die: */
-               no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+               no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR);
                return;
        }
 
        up_read(&mm->mmap_sem);
        if (unlikely(fault & VM_FAULT_ERROR)) {
-               mm_fault_error(regs, error_code, address, &pkey, fault);
+               mm_fault_error(regs, sw_error_code, address, &pkey, fault);
                return;
        }
 
@@ -1438,6 +1475,28 @@ good_area:
 
        check_v8086_mode(regs, address, tsk);
 }
+NOKPROBE_SYMBOL(do_user_addr_fault);
+
+/*
+ * This routine handles page faults.  It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ */
+static noinline void
+__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
+               unsigned long address)
+{
+       prefetchw(&current->mm->mmap_sem);
+
+       if (unlikely(kmmio_fault(regs, address)))
+               return;
+
+       /* Was the fault on kernel-controlled part of the address space? */
+       if (unlikely(fault_in_kernel_space(address)))
+               do_kern_addr_fault(regs, hw_error_code, address);
+       else
+               do_user_addr_fault(regs, hw_error_code, address);
+}
 NOKPROBE_SYMBOL(__do_page_fault);
 
 static nokprobe_inline void
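
For reference when reading the hunks above: the X86_PF_* names are the hardware-defined page-fault error code bits (in kernels of this era they are mirrored as enum x86_pf_error_code, at the time in arch/x86/include/asm/traps.h). The sketch below is illustrative only and is not part of the patch:

	enum x86_pf_error_code {
		X86_PF_PROT	= 1 << 0,	/* 0: not-present page, 1: protection violation */
		X86_PF_WRITE	= 1 << 1,	/* 0: read access, 1: write access */
		X86_PF_USER	= 1 << 2,	/* 0: kernel-mode access, 1: user-mode access */
		X86_PF_RSVD	= 1 << 3,	/* reserved bit set in a page-table entry */
		X86_PF_INSTR	= 1 << 4,	/* fault was an instruction fetch */
		X86_PF_PK	= 1 << 5,	/* protection-keys violation */
	};

The numeric tests in the comment this patch removes map onto these bits: (error_code & 4) == 0 means X86_PF_USER is clear, and (error_code & 9) == 0 means X86_PF_PROT and X86_PF_RSVD are clear -- the same condition that do_kern_addr_fault() now spells out symbolically as !(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT)) before attempting a vmalloc demand fault.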