* only copy the information from the master page table,
* nothing more.
*
- * This verifies that the fault happens in kernel space
- * (error_code & 4) == 0, and that the fault was not a
- * protection error (error_code & 9) == 0.
+ * Before doing this on-demand faulting, ensure that the
+ * fault is not any of the following:
+ * 1. A fault on a PTE with a reserved bit set.
+ * 2. A fault caused by a user-mode access. (Do not demand-
+ * fault kernel memory due to user-mode accesses).
+ * 3. A fault caused by a page-level protection violation.
+ * (A demand fault would be on a non-present page which
+ * would have X86_PF_PROT==0).
*/
- if (unlikely(fault_in_kernel_space(address))) {
- if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
- if (vmalloc_fault(address) >= 0)
- return;
- }
-
- /* Can handle a stale RO->RW TLB: */
- if (spurious_fault(error_code, address))
+ if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
+ if (vmalloc_fault(address) >= 0)
return;
+ }
- /* kprobes don't want to hook the spurious faults: */
- if (kprobes_fault(regs))
- return;
- /*
- * Don't take the mm semaphore here. If we fixup a prefetch
- * fault we could otherwise deadlock:
- */
- bad_area_nosemaphore(regs, error_code, address);
+ /* Was the fault spurious, caused by lazy TLB invalidation? */
+ if (spurious_kernel_fault(hw_error_code, address))
+ return;
+ /* kprobes don't want to hook the spurious faults: */
+ if (kprobes_fault(regs))
return;
- }
+
+ /*
+ * Note, despite being a "bad area", there are quite a few
+ * acceptable reasons to get here, such as erratum fixups
+ * and handling kernel code that can fault, like get_user().
+ *
+ * Don't take the mm semaphore here. If we fixup a prefetch
+ * fault we could otherwise deadlock:
+ */
- bad_area_nosemaphore(regs, hw_error_code, address, NULL);
++ bad_area_nosemaphore(regs, hw_error_code, address);
+}
+NOKPROBE_SYMBOL(do_kern_addr_fault);
+
+/* Handle faults in the user portion of the address space */
+static inline
+void do_user_addr_fault(struct pt_regs *regs,
+ unsigned long hw_error_code,
+ unsigned long address)
+{
+ unsigned long sw_error_code;
+ struct vm_area_struct *vma;
+ struct task_struct *tsk;
+ struct mm_struct *mm;
+ vm_fault_t fault, major = 0;
+ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
- u32 pkey;
+
+ tsk = current;
+ mm = tsk->mm;
/* kprobes don't want to hook the spurious faults: */
if (unlikely(kprobes_fault(regs)))
return;
- if (unlikely(error_code & X86_PF_RSVD))
- pgtable_bad(regs, error_code, address);
+ /*
+ * Reserved bits are never expected to be set on
+ * entries in the user portion of the page tables.
+ */
+ if (unlikely(hw_error_code & X86_PF_RSVD))
+ pgtable_bad(regs, hw_error_code, address);
- if (unlikely(smap_violation(error_code, regs))) {
- bad_area_nosemaphore(regs, error_code, address);
+ /*
+ * Check for invalid kernel (supervisor) access to user
+ * pages in the user address space.
+ */
+ if (unlikely(smap_violation(hw_error_code, regs))) {
- bad_area_nosemaphore(regs, hw_error_code, address, NULL);
++ bad_area_nosemaphore(regs, hw_error_code, address);
return;
}
* in a region with pagefaults disabled then we must not take the fault
*/
if (unlikely(faulthandler_disabled() || !mm)) {
- bad_area_nosemaphore(regs, hw_error_code, address, NULL);
- bad_area_nosemaphore(regs, error_code, address);
++ bad_area_nosemaphore(regs, hw_error_code, address);
return;
}
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- if (error_code & X86_PF_WRITE)
+ if (sw_error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
- if (error_code & X86_PF_INSTR)
+ if (sw_error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
+#ifdef CONFIG_X86_64
+ /*
+ * Instruction fetch faults in the vsyscall page might need
+ * emulation. The vsyscall page is at a high address
+ * (>PAGE_OFFSET), but is considered to be part of the user
+ * address space.
+ *
+ * The vsyscall page does not have a "real" VMA, so do this
+ * emulation before we go searching for VMAs.
+ */
+ if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
+ if (emulate_vsyscall(regs, address))
+ return;
+ }
+#endif
+
/*
- * When running in the kernel we expect faults to occur only to
- * addresses in user space. All other faults represent errors in
- * the kernel and should generate an OOPS. Unfortunately, in the
- * case of an erroneous fault occurring in a code path which already
- * holds mmap_sem we will deadlock attempting to validate the fault
- * against the address space. Luckily the kernel only validly
- * references user space from well defined areas of code, which are
- * listed in the exceptions table.
+ * Kernel-mode access to the user address space should only occur
+ * on well-defined single instructions listed in the exception
+ * tables. But, an erroneous kernel fault occurring outside one of
+ * those areas which also holds mmap_sem might deadlock attempting
+ * to validate the fault against the address space.
*
- * As the vast majority of faults will be valid we will only perform
- * the source reference check when there is a possibility of a
- * deadlock. Attempt to lock the address space, if we cannot we then
- * validate the source. If this is invalid we can skip the address
- * space check, thus avoiding the deadlock:
+ * Only do the expensive exception table search when we might be at
+ * risk of a deadlock. This happens if we
+ * 1. Failed to acquire mmap_sem, and
+ * 2. The access did not originate in userspace. Note: either the
+ * hardware or earlier page fault code may set X86_PF_USER
+ * in sw_error_code.
*/
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
- if (!(error_code & X86_PF_USER) &&
+ if (!(sw_error_code & X86_PF_USER) &&
!search_exception_tables(regs->ip)) {
- bad_area_nosemaphore(regs, error_code, address);
+ /*
+ * Fault from code in kernel from
+ * which we do not expect faults.
+ */
- bad_area_nosemaphore(regs, sw_error_code, address, NULL);
++ bad_area_nosemaphore(regs, sw_error_code, address);
return;
}
retry:
up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
- mm_fault_error(regs, sw_error_code, address, &pkey, fault);
- mm_fault_error(regs, error_code, address, fault);
++ mm_fault_error(regs, sw_error_code, address, fault);
return;
}