Merge branch 'x86/pti' into x86/mm, to pick up dependencies
author Ingo Molnar <mingo@kernel.org>
Mon, 12 Mar 2018 11:10:03 +0000 (12:10 +0100)
committer Ingo Molnar <mingo@kernel.org>
Mon, 12 Mar 2018 11:10:03 +0000 (12:10 +0100)
Signed-off-by: Ingo Molnar <mingo@kernel.org>
12 files changed:
Makefile
arch/x86/Kconfig
arch/x86/entry/entry_64.S
arch/x86/include/asm/nospec-branch.h
arch/x86/include/asm/paravirt.h
arch/x86/include/asm/pgtable.h
arch/x86/include/asm/pgtable_32.h
arch/x86/include/asm/pgtable_64.h
arch/x86/kernel/head_64.S
arch/x86/kernel/setup.c
arch/x86/mm/fault.c
include/linux/compiler-gcc.h

diff --combined Makefile
+++ b/Makefile
@@@ -2,7 -2,7 +2,7 @@@
  VERSION = 4
  PATCHLEVEL = 16
  SUBLEVEL = 0
 -EXTRAVERSION = -rc1
 +EXTRAVERSION = -rc3
  NAME = Fearless Coyote
  
  # *DOCUMENTATION*
@@@ -489,6 -489,11 +489,11 @@@ KBUILD_CFLAGS += $(CLANG_TARGET) $(CLAN
  KBUILD_AFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC)
  endif
  
+ RETPOLINE_CFLAGS_GCC := -mindirect-branch=thunk-extern -mindirect-branch-register
+ RETPOLINE_CFLAGS_CLANG := -mretpoline-external-thunk
+ RETPOLINE_CFLAGS := $(call cc-option,$(RETPOLINE_CFLAGS_GCC),$(call cc-option,$(RETPOLINE_CFLAGS_CLANG)))
+ export RETPOLINE_CFLAGS
  ifeq ($(config-targets),1)
  # ===========================================================================
  # *config targets only - make sure prerequisites are updated, and descend
diff --combined arch/x86/Kconfig
@@@ -423,6 -423,12 +423,6 @@@ config X86_MPPARSE
          For old smp systems that do not have proper acpi support. Newer systems
          (esp with 64bit cpus) with acpi support, MADT and DSDT will override it
  
 -config X86_BIGSMP
 -      bool "Support for big SMP systems with more than 8 CPUs"
 -      depends on X86_32 && SMP
 -      ---help---
 -        This option is needed for the systems that have more than 8 CPUs
 -
  config GOLDFISH
         def_bool y
         depends on X86_GOLDFISH
  config RETPOLINE
        bool "Avoid speculative indirect branches in kernel"
        default y
+       select STACK_VALIDATION if HAVE_STACK_VALIDATION
        help
          Compile kernel with the retpoline compiler options to guard against
          kernel-to-user data leaks by avoiding speculative indirect
@@@ -454,12 -461,6 +455,12 @@@ config INTEL_RDT
          Say N if unsure.
  
  if X86_32
 +config X86_BIGSMP
 +      bool "Support for big SMP systems with more than 8 CPUs"
 +      depends on SMP
 +      ---help---
 +        This option is needed for the systems that have more than 8 CPUs
 +
  config X86_EXTENDED_PLATFORM
        bool "Support for extended (non-PC) x86 platforms"
        default y
@@@ -949,66 -950,25 +950,66 @@@ config MAXSMP
          Enable maximum number of CPUS and NUMA Nodes for this architecture.
          If unsure, say N.
  
 +#
 +# The maximum number of CPUs supported:
 +#
 +# The main config value is NR_CPUS, which defaults to NR_CPUS_DEFAULT,
 +# and which can be configured interactively in the
 +# [NR_CPUS_RANGE_BEGIN ... NR_CPUS_RANGE_END] range.
 +#
 +# The ranges are different on 32-bit and 64-bit kernels, depending on
 +# hardware capabilities and scalability features of the kernel.
 +#
 +# ( If MAXSMP is enabled we just use the highest possible value and disable
 +#   interactive configuration. )
 +#
 +
 +config NR_CPUS_RANGE_BEGIN
 +      int
 +      default NR_CPUS_RANGE_END if MAXSMP
 +      default    1 if !SMP
 +      default    2
 +
 +config NR_CPUS_RANGE_END
 +      int
 +      depends on X86_32
 +      default   64 if  SMP &&  X86_BIGSMP
 +      default    8 if  SMP && !X86_BIGSMP
 +      default    1 if !SMP
 +
 +config NR_CPUS_RANGE_END
 +      int
 +      depends on X86_64
 +      default 8192 if  SMP && ( MAXSMP ||  CPUMASK_OFFSTACK)
 +      default  512 if  SMP && (!MAXSMP && !CPUMASK_OFFSTACK)
 +      default    1 if !SMP
 +
 +config NR_CPUS_DEFAULT
 +      int
 +      depends on X86_32
 +      default   32 if  X86_BIGSMP
 +      default    8 if  SMP
 +      default    1 if !SMP
 +
 +config NR_CPUS_DEFAULT
 +      int
 +      depends on X86_64
 +      default 8192 if  MAXSMP
 +      default   64 if  SMP
 +      default    1 if !SMP
 +
  config NR_CPUS
        int "Maximum number of CPUs" if SMP && !MAXSMP
 -      range 2 8 if SMP && X86_32 && !X86_BIGSMP
 -      range 2 64 if SMP && X86_32 && X86_BIGSMP
 -      range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK && X86_64
 -      range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64
 -      default "1" if !SMP
 -      default "8192" if MAXSMP
 -      default "32" if SMP && X86_BIGSMP
 -      default "8" if SMP && X86_32
 -      default "64" if SMP
 +      range NR_CPUS_RANGE_BEGIN NR_CPUS_RANGE_END
 +      default NR_CPUS_DEFAULT
        ---help---
          This allows you to specify the maximum number of CPUs which this
          kernel will support.  If CPUMASK_OFFSTACK is enabled, the maximum
          supported value is 8192, otherwise the maximum value is 512.  The
          minimum value which makes sense is 2.
  
 -        This is purely to save memory - each supported CPU adds
 -        approximately eight kilobytes to the kernel image.
 +        This is purely to save memory: each supported CPU adds about 8KB
 +        to the kernel image.
  
  config SCHED_SMT
        bool "SMT (Hyperthreading) scheduler support"
@@@ -1404,7 -1364,7 +1405,7 @@@ config HIGHMEM4G
  
  config HIGHMEM64G
        bool "64GB"
 -      depends on !M486
 +      depends on !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !WINCHIP3D && !MK6
        select X86_PAE
        ---help---
          Select this if you have a 32-bit processor and more than 4
@@@ -1471,8 -1431,6 +1472,8 @@@ config X86_PAE
  
  config X86_5LEVEL
        bool "Enable 5-level page tables support"
 +      select DYNAMIC_MEMORY_LAYOUT
 +      select SPARSEMEM_VMEMMAP
        depends on X86_64
        ---help---
          5-level paging enables access to larger address space:
  
          It will be supported by future Intel CPUs.
  
 -        Note: a kernel with this option enabled can only be booted
 -        on machines that support the feature.
 +        A kernel with the option enabled can be booted on machines that
 +        support 4- or 5-level paging.
  
          See Documentation/x86/x86_64/5level-paging.txt for more
          information.
@@@ -2186,17 -2144,10 +2187,17 @@@ config PHYSICAL_ALIGN
  
          Don't change this unless you know what you are doing.
  
 +config DYNAMIC_MEMORY_LAYOUT
 +      bool
 +      ---help---
 +        This option makes base addresses of vmalloc and vmemmap as well as
 +        __PAGE_OFFSET movable during boot.
 +
  config RANDOMIZE_MEMORY
        bool "Randomize the kernel memory sections"
        depends on X86_64
        depends on RANDOMIZE_BASE
 +      select DYNAMIC_MEMORY_LAYOUT
        default RANDOMIZE_BASE
        ---help---
           Randomizes the base virtual address of kernel memory sections
@@@ -2315,7 -2266,7 +2316,7 @@@ choice
          it can be used to assist security vulnerability exploitation.
  
          This setting can be changed at boot time via the kernel command
-         line parameter vsyscall=[native|emulate|none].
+         line parameter vsyscall=[emulate|none].
  
          On a system with recent enough glibc (2.14 or newer) and no
          static binaries, you can say None without a performance penalty
  
          If unsure, select "Emulate".
  
-       config LEGACY_VSYSCALL_NATIVE
-               bool "Native"
-               help
-                 Actual executable code is located in the fixed vsyscall
-                 address mapping, implementing time() efficiently. Since
-                 this makes the mapping executable, it can be used during
-                 security vulnerability exploitation (traditionally as
-                 ROP gadgets). This configuration is not recommended.
        config LEGACY_VSYSCALL_EMULATE
                bool "Emulate"
                help
diff --combined arch/x86/entry/entry_64.S
@@@ -260,13 -260,8 +260,13 @@@ GLOBAL(entry_SYSCALL_64_after_hwframe)
         * Change top bits to match most significant bit (47th or 56th bit
         * depending on paging mode) in the address.
         */
 +#ifdef CONFIG_X86_5LEVEL
 +      ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
 +              "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
 +#else
        shl     $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
        sar     $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
 +#endif
  
        /* If this changed %rcx, it was not canonical */
        cmpq    %rcx, %r11
@@@ -369,8 -364,7 +369,7 @@@ ENTRY(__switch_to_asm)
         * exist, overwrite the RSB with entries which capture
         * speculative execution to prevent attack.
         */
-       /* Clobbers %rbx */
-       FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+       FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
  #endif
  
        /* restore callee-saved registers */
@@@ -454,9 -448,19 +453,19 @@@ END(irq_entries_start)
   *
   * The invariant is that, if irq_count != -1, then the IRQ stack is in use.
   */
- .macro ENTER_IRQ_STACK regs=1 old_rsp
+ .macro ENTER_IRQ_STACK regs=1 old_rsp save_ret=0
        DEBUG_ENTRY_ASSERT_IRQS_OFF
+       .if \save_ret
+       /*
+        * If save_ret is set, the original stack contains one additional
+        * entry -- the return address. Therefore, move the address one
+        * entry below %rsp to \old_rsp.
+        */
+       leaq    8(%rsp), \old_rsp
+       .else
        movq    %rsp, \old_rsp
+       .endif
  
        .if \regs
        UNWIND_HINT_REGS base=\old_rsp
        .if \regs
        UNWIND_HINT_REGS indirect=1
        .endif
+       .if \save_ret
+       /*
+        * Push the return address to the stack. This return address can
+        * be found at the "real" original RSP, which was offset by 8 at
+        * the beginning of this macro.
+        */
+       pushq   -8(\old_rsp)
+       .endif
  .endm
  
  /*
  .endm
  
  /*
-  * Interrupt entry/exit.
-  *
-  * Interrupt entry points save only callee clobbered registers in fast path.
+  * Interrupt entry helper function.
   *
-  * Entry runs with interrupts off.
+  * Entry runs with interrupts off. Stack layout at entry:
+  * +----------------------------------------+
+  * | regs->ss                               |
+  * | regs->rsp                              |
+  * | regs->eflags                           |
+  * | regs->cs                               |
+  * | regs->ip                               |
+  * +----------------------------------------+
+  * | regs->orig_ax = ~(interrupt number)    |
+  * +----------------------------------------+
+  * | return address                         |
+  * +----------------------------------------+
   */
- /* 0(%rsp): ~(interrupt number) */
-       .macro interrupt func
+ ENTRY(interrupt_entry)
+       UNWIND_HINT_FUNC
+       ASM_CLAC
        cld
  
-       testb   $3, CS-ORIG_RAX(%rsp)
+       testb   $3, CS-ORIG_RAX+8(%rsp)
        jz      1f
        SWAPGS
-       call    switch_to_thread_stack
+       /*
+        * Switch to the thread stack. The IRET frame and orig_ax are
+        * on the stack, as well as the return address. RDI..R12 are
+        * not (yet) on the stack and space has not (yet) been
+        * allocated for them.
+        */
+       pushq   %rdi
+       /* Need to switch before accessing the thread stack. */
+       SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+       movq    %rsp, %rdi
+       movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+        /*
+         * We have RDI, return address, and orig_ax on the stack on
+         * top of the IRET frame. That means offset=24
+         */
+       UNWIND_HINT_IRET_REGS base=%rdi offset=24
+       pushq   7*8(%rdi)               /* regs->ss */
+       pushq   6*8(%rdi)               /* regs->rsp */
+       pushq   5*8(%rdi)               /* regs->eflags */
+       pushq   4*8(%rdi)               /* regs->cs */
+       pushq   3*8(%rdi)               /* regs->ip */
+       pushq   2*8(%rdi)               /* regs->orig_ax */
+       pushq   8(%rdi)                 /* return address */
+       UNWIND_HINT_FUNC
+       movq    (%rdi), %rdi
  1:
  
-       PUSH_AND_CLEAR_REGS
-       ENCODE_FRAME_POINTER
+       PUSH_AND_CLEAR_REGS save_ret=1
+       ENCODE_FRAME_POINTER 8
  
-       testb   $3, CS(%rsp)
+       testb   $3, CS+8(%rsp)
        jz      1f
  
        /*
         *
         * We need to tell lockdep that IRQs are off.  We can't do this until
         * we fix gsbase, and we should do it before enter_from_user_mode
-        * (which can take locks).  Since TRACE_IRQS_OFF idempotent,
+        * (which can take locks).  Since TRACE_IRQS_OFF is idempotent,
         * the simplest way to handle it is to just call it twice if
         * we enter from user mode.  There's no reason to optimize this since
         * TRACE_IRQS_OFF is a no-op if lockdep is off.
        CALL_enter_from_user_mode
  
  1:
-       ENTER_IRQ_STACK old_rsp=%rdi
+       ENTER_IRQ_STACK old_rsp=%rdi save_ret=1
        /* We entered an interrupt context - irqs are off: */
        TRACE_IRQS_OFF
  
-       call    \func   /* rdi points to pt_regs */
-       .endm
+       ret
+ END(interrupt_entry)
+ /* Interrupt entry/exit. */
  
        /*
         * The interrupt stubs push (~vector+0x80) onto the stack and
         */
        .p2align CONFIG_X86_L1_CACHE_SHIFT
  common_interrupt:
-       ASM_CLAC
        addq    $-0x80, (%rsp)                  /* Adjust vector to [-256, -1] range */
-       interrupt do_IRQ
+       call    interrupt_entry
+       UNWIND_HINT_REGS indirect=1
+       call    do_IRQ  /* rdi points to pt_regs */
        /* 0(%rsp): old RSP */
  ret_from_intr:
        DISABLE_INTERRUPTS(CLBR_ANY)
@@@ -771,10 -826,11 +831,11 @@@ END(common_interrupt)
  .macro apicinterrupt3 num sym do_sym
  ENTRY(\sym)
        UNWIND_HINT_IRET_REGS
-       ASM_CLAC
        pushq   $~(\num)
  .Lcommon_\sym:
-       interrupt \do_sym
+       call    interrupt_entry
+       UNWIND_HINT_REGS indirect=1
+       call    \do_sym /* rdi points to pt_regs */
        jmp     ret_from_intr
  END(\sym)
  .endm
@@@ -837,34 -893,6 +898,6 @@@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
   */
  #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
  
- /*
-  * Switch to the thread stack.  This is called with the IRET frame and
-  * orig_ax on the stack.  (That is, RDI..R12 are not on the stack and
-  * space has not been allocated for them.)
-  */
- ENTRY(switch_to_thread_stack)
-       UNWIND_HINT_FUNC
-       pushq   %rdi
-       /* Need to switch before accessing the thread stack. */
-       SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
-       movq    %rsp, %rdi
-       movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
-       UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
-       pushq   7*8(%rdi)               /* regs->ss */
-       pushq   6*8(%rdi)               /* regs->rsp */
-       pushq   5*8(%rdi)               /* regs->eflags */
-       pushq   4*8(%rdi)               /* regs->cs */
-       pushq   3*8(%rdi)               /* regs->ip */
-       pushq   2*8(%rdi)               /* regs->orig_ax */
-       pushq   8(%rdi)                 /* return address */
-       UNWIND_HINT_FUNC
-       movq    (%rdi), %rdi
-       ret
- END(switch_to_thread_stack)
  .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
  ENTRY(\sym)
        UNWIND_HINT_IRET_REGS offset=\has_error_code*8
        pushq   $-1                             /* ORIG_RAX: no syscall to restart */
        .endif
  
-       /* Save all registers in pt_regs */
-       PUSH_AND_CLEAR_REGS
-       ENCODE_FRAME_POINTER
        .if \paranoid < 2
-       testb   $3, CS(%rsp)                    /* If coming from userspace, switch stacks */
+       testb   $3, CS-ORIG_RAX(%rsp)           /* If coming from userspace, switch stacks */
        jnz     .Lfrom_usermode_switch_stack_\@
        .endif
  
@@@ -1135,13 -1159,15 +1164,15 @@@ idtentry machine_check do_mce has_error_code=0 paranoid=1
  #endif
  
  /*
-  * Switch gs if needed.
+  * Save all registers in pt_regs, and switch gs if needed.
   * Use slow, but surefire "are we in kernel?" check.
   * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
   */
  ENTRY(paranoid_entry)
        UNWIND_HINT_FUNC
        cld
+       PUSH_AND_CLEAR_REGS save_ret=1
+       ENCODE_FRAME_POINTER 8
        movl    $1, %ebx
        movl    $MSR_GS_BASE, %ecx
        rdmsr
@@@ -1186,12 -1212,14 +1217,14 @@@ ENTRY(paranoid_exit)
  END(paranoid_exit)
  
  /*
-  * Switch gs if needed.
+  * Save all registers in pt_regs, and switch GS if needed.
   * Return: EBX=0: came from user mode; EBX=1: otherwise
   */
  ENTRY(error_entry)
-       UNWIND_HINT_REGS offset=8
+       UNWIND_HINT_FUNC
        cld
+       PUSH_AND_CLEAR_REGS save_ret=1
+       ENCODE_FRAME_POINTER 8
        testb   $3, CS+8(%rsp)
        jz      .Lerror_kernelspace
  
@@@ -1582,8 -1610,6 +1615,6 @@@ end_repeat_nmi
         * frame to point back to repeat_nmi.
         */
        pushq   $-1                             /* ORIG_RAX: no syscall to restart */
-       PUSH_AND_CLEAR_REGS
-       ENCODE_FRAME_POINTER
  
        /*
         * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
diff --combined arch/x86/include/asm/nospec-branch.h
@@@ -8,6 -8,50 +8,50 @@@
  #include <asm/cpufeatures.h>
  #include <asm/msr-index.h>
  
+ /*
+  * Fill the CPU return stack buffer.
+  *
+  * Each entry in the RSB, if used for a speculative 'ret', contains an
+  * infinite 'pause; lfence; jmp' loop to capture speculative execution.
+  *
+  * This is required in various cases for retpoline and IBRS-based
+  * mitigations for the Spectre variant 2 vulnerability. Sometimes to
+  * eliminate potentially bogus entries from the RSB, and sometimes
+  * purely to ensure that it doesn't get empty, which on some CPUs would
+  * allow predictions from other (unwanted!) sources to be used.
+  *
+  * We define a CPP macro such that it can be used from both .S files and
+  * inline assembly. It's possible to do a .macro and then include that
+  * from C via asm(".include <asm/nospec-branch.h>") but let's not go there.
+  */
+ #define RSB_CLEAR_LOOPS               32      /* To forcibly overwrite all entries */
+ #define RSB_FILL_LOOPS                16      /* To avoid underflow */
+ /*
+  * Google experimented with loop-unrolling and this turned out to be
+  * the optimal version - two calls, each with their own speculation
+  * trap should their return address end up getting used, in a loop.
+  */
+ #define __FILL_RETURN_BUFFER(reg, nr, sp)     \
+       mov     $(nr/2), reg;                   \
+ 771:                                          \
+       call    772f;                           \
+ 773:  /* speculation trap */                  \
+       pause;                                  \
+       lfence;                                 \
+       jmp     773b;                           \
+ 772:                                          \
+       call    774f;                           \
+ 775:  /* speculation trap */                  \
+       pause;                                  \
+       lfence;                                 \
+       jmp     775b;                           \
+ 774:                                          \
+       dec     reg;                            \
+       jnz     771b;                           \
+       add     $(BITS_PER_LONG/8) * nr, sp;
  #ifdef __ASSEMBLY__
  
  /*
        .popsection
  .endm
  
+ /*
+  * This should be used immediately before an indirect jump/call. It tells
+  * objtool the subsequent indirect jump/call is vouched safe for retpoline
+  * builds.
+  */
+ .macro ANNOTATE_RETPOLINE_SAFE
+       .Lannotate_\@:
+       .pushsection .discard.retpoline_safe
+       _ASM_PTR .Lannotate_\@
+       .popsection
+ .endm
  /*
   * These are the bare retpoline primitives for indirect jmp and call.
   * Do not use these directly; they only exist to make the ALTERNATIVE
  .macro JMP_NOSPEC reg:req
  #ifdef CONFIG_RETPOLINE
        ANNOTATE_NOSPEC_ALTERNATIVE
-       ALTERNATIVE_2 __stringify(jmp *\reg),                           \
+       ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *\reg),  \
                __stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE, \
-               __stringify(lfence; jmp *\reg), X86_FEATURE_RETPOLINE_AMD
+               __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *\reg), X86_FEATURE_RETPOLINE_AMD
  #else
        jmp     *\reg
  #endif
  .macro CALL_NOSPEC reg:req
  #ifdef CONFIG_RETPOLINE
        ANNOTATE_NOSPEC_ALTERNATIVE
-       ALTERNATIVE_2 __stringify(call *\reg),                          \
+       ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *\reg), \
                __stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\
-               __stringify(lfence; call *\reg), X86_FEATURE_RETPOLINE_AMD
+               __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *\reg), X86_FEATURE_RETPOLINE_AMD
  #else
        call    *\reg
  #endif
  .endm
  
- /* This clobbers the BX register */
- .macro FILL_RETURN_BUFFER nr:req ftr:req
+  /*
+   * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
+   * monstrosity above, manually.
+   */
+ .macro FILL_RETURN_BUFFER reg:req nr:req ftr:req
  #ifdef CONFIG_RETPOLINE
-       ALTERNATIVE "", "call __clear_rsb", \ftr
+       ANNOTATE_NOSPEC_ALTERNATIVE
+       ALTERNATIVE "jmp .Lskip_rsb_\@",                                \
+               __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP))    \
+               \ftr
+ .Lskip_rsb_\@:
  #endif
  .endm
  
        ".long 999b - .\n\t"                                    \
        ".popsection\n\t"
  
+ #define ANNOTATE_RETPOLINE_SAFE                                       \
+       "999:\n\t"                                              \
+       ".pushsection .discard.retpoline_safe\n\t"              \
+       _ASM_PTR " 999b\n\t"                                    \
+       ".popsection\n\t"
  #if defined(CONFIG_X86_64) && defined(RETPOLINE)
  
  /*
  # define CALL_NOSPEC                                          \
        ANNOTATE_NOSPEC_ALTERNATIVE                             \
        ALTERNATIVE(                                            \
+       ANNOTATE_RETPOLINE_SAFE                                 \
        "call *%[thunk_target]\n",                              \
        "call __x86_indirect_thunk_%V[thunk_target]\n",         \
        X86_FEATURE_RETPOLINE)
@@@ -156,62 -226,53 +226,90 @@@ extern char __indirect_thunk_end[]
  static inline void vmexit_fill_RSB(void)
  {
  #ifdef CONFIG_RETPOLINE
-       alternative_input("",
-                         "call __fill_rsb",
-                         X86_FEATURE_RETPOLINE,
-                         ASM_NO_INPUT_CLOBBER(_ASM_BX, "memory"));
+       unsigned long loops;
+       asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE
+                     ALTERNATIVE("jmp 910f",
+                                 __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)),
+                                 X86_FEATURE_RETPOLINE)
+                     "910:"
+                     : "=r" (loops), ASM_CALL_CONSTRAINT
+                     : : "memory" );
  #endif
  }
  
+ #define alternative_msr_write(_msr, _val, _feature)           \
+       asm volatile(ALTERNATIVE("",                            \
+                                "movl %[msr], %%ecx\n\t"       \
+                                "movl %[val], %%eax\n\t"       \
+                                "movl $0, %%edx\n\t"           \
+                                "wrmsr",                       \
+                                _feature)                      \
+                    : : [msr] "i" (_msr), [val] "i" (_val)     \
+                    : "eax", "ecx", "edx", "memory")
  static inline void indirect_branch_prediction_barrier(void)
  {
-       asm volatile(ALTERNATIVE("",
-                                "movl %[msr], %%ecx\n\t"
-                                "movl %[val], %%eax\n\t"
-                                "movl $0, %%edx\n\t"
-                                "wrmsr",
-                                X86_FEATURE_USE_IBPB)
-                    : : [msr] "i" (MSR_IA32_PRED_CMD),
-                        [val] "i" (PRED_CMD_IBPB)
-                    : "eax", "ecx", "edx", "memory");
+       alternative_msr_write(MSR_IA32_PRED_CMD, PRED_CMD_IBPB,
+                             X86_FEATURE_USE_IBPB);
  }
  
+ /*
+  * With retpoline, we must use IBRS to restrict branch prediction
+  * before calling into firmware.
+  *
+  * (Implemented as CPP macros due to header hell.)
+  */
+ #define firmware_restrict_branch_speculation_start()                  \
+ do {                                                                  \
+       preempt_disable();                                              \
+       alternative_msr_write(MSR_IA32_SPEC_CTRL, SPEC_CTRL_IBRS,       \
+                             X86_FEATURE_USE_IBRS_FW);                 \
+ } while (0)
+ #define firmware_restrict_branch_speculation_end()                    \
+ do {                                                                  \
+       alternative_msr_write(MSR_IA32_SPEC_CTRL, 0,                    \
+                             X86_FEATURE_USE_IBRS_FW);                 \
+       preempt_enable();                                               \
+ } while (0)
  #endif /* __ASSEMBLY__ */
 +
 +/*
 + * Below is used in the eBPF JIT compiler and emits the byte sequence
 + * for the following assembly:
 + *
 + * With retpolines configured:
 + *
 + *    callq do_rop
 + *  spec_trap:
 + *    pause
 + *    lfence
 + *    jmp spec_trap
 + *  do_rop:
 + *    mov %rax,(%rsp)
 + *    retq
 + *
 + * Without retpolines configured:
 + *
 + *    jmp *%rax
 + */
 +#ifdef CONFIG_RETPOLINE
 +# define RETPOLINE_RAX_BPF_JIT_SIZE   17
 +# define RETPOLINE_RAX_BPF_JIT()                              \
 +      EMIT1_off32(0xE8, 7);    /* callq do_rop */             \
 +      /* spec_trap: */                                        \
 +      EMIT2(0xF3, 0x90);       /* pause */                    \
 +      EMIT3(0x0F, 0xAE, 0xE8); /* lfence */                   \
 +      EMIT2(0xEB, 0xF9);       /* jmp spec_trap */            \
 +      /* do_rop: */                                           \
 +      EMIT4(0x48, 0x89, 0x04, 0x24); /* mov %rax,(%rsp) */    \
 +      EMIT1(0xC3);             /* retq */
 +#else
 +# define RETPOLINE_RAX_BPF_JIT_SIZE   2
 +# define RETPOLINE_RAX_BPF_JIT()                              \
 +      EMIT2(0xFF, 0xE0);       /* jmp *%rax */
 +#endif
 +
  #endif /* _ASM_X86_NOSPEC_BRANCH_H_ */
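
A minimal usage sketch for the firmware_restrict_branch_speculation_start()/_end() helpers introduced above. This is not part of the patch; do_firmware_call() is a hypothetical stand-in for a real EFI runtime service invocation:

static void protected_firmware_call(void)
{
	/* Enables IBRS for the firmware window; also disables preemption. */
	firmware_restrict_branch_speculation_start();

	do_firmware_call();	/* hypothetical firmware/EFI call */

	/* Drops IBRS again and re-enables preemption. */
	firmware_restrict_branch_speculation_end();
}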
diff --combined arch/x86/include/asm/paravirt.h
@@@ -7,6 -7,7 +7,7 @@@
  #ifdef CONFIG_PARAVIRT
  #include <asm/pgtable_types.h>
  #include <asm/asm.h>
+ #include <asm/nospec-branch.h>
  
  #include <asm/paravirt_types.h>
  
@@@ -567,22 -568,17 +568,22 @@@ static inline p4dval_t p4d_val(p4d_t p4d)
        return PVOP_CALLEE1(p4dval_t, pv_mmu_ops.p4d_val, p4d.p4d);
  }
  
 -static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
 +static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd)
  {
 -      pgdval_t val = native_pgd_val(pgd);
 -
 -      PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, val);
 +      PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, native_pgd_val(pgd));
  }
  
 -static inline void pgd_clear(pgd_t *pgdp)
 -{
 -      set_pgd(pgdp, __pgd(0));
 -}
 +#define set_pgd(pgdp, pgdval) do {                                    \
 +      if (pgtable_l5_enabled)                                         \
 +              __set_pgd(pgdp, pgdval);                                \
 +      else                                                            \
 +              set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd });     \
 +} while (0)
 +
 +#define pgd_clear(pgdp) do {                                          \
 +      if (pgtable_l5_enabled)                                         \
 +              set_pgd(pgdp, __pgd(0));                                \
 +} while (0)
  
  #endif  /* CONFIG_PGTABLE_LEVELS == 5 */
  
@@@ -884,23 -880,27 +885,27 @@@ extern void default_banner(void)
  
  #define INTERRUPT_RETURN                                              \
        PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE,       \
-                 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret))
+                 ANNOTATE_RETPOLINE_SAFE;                                      \
+                 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret);)
  
  #define DISABLE_INTERRUPTS(clobbers)                                  \
        PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
                  PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);            \
+                 ANNOTATE_RETPOLINE_SAFE;                                      \
                  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable);    \
                  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
  
  #define ENABLE_INTERRUPTS(clobbers)                                   \
        PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers,  \
                  PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);            \
+                 ANNOTATE_RETPOLINE_SAFE;                                      \
                  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable);     \
                  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
  
  #ifdef CONFIG_X86_32
  #define GET_CR0_INTO_EAX                              \
        push %ecx; push %edx;                           \
+       ANNOTATE_RETPOLINE_SAFE;                                \
        call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \
        pop %edx; pop %ecx
  #else /* !CONFIG_X86_32 */
   */
  #define SWAPGS                                                                \
        PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE,     \
-                 call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs)          \
+                 ANNOTATE_RETPOLINE_SAFE;                                      \
+                 call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs);         \
                 )
  
  #define GET_CR2_INTO_RAX                              \
-       call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2)
+       ANNOTATE_RETPOLINE_SAFE;                                \
+       call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2);
  
  #define USERGS_SYSRET64                                                       \
        PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64),       \
                  CLBR_NONE,                                            \
-                 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
+                 ANNOTATE_RETPOLINE_SAFE;                                      \
+                 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64);)
  
  #ifdef CONFIG_DEBUG_ENTRY
  #define SAVE_FLAGS(clobbers)                                        \
        PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \
                  PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);        \
+                 ANNOTATE_RETPOLINE_SAFE;                                  \
                  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl);    \
                  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
  #endif
diff --combined arch/x86/include/asm/pgtable.h
@@@ -65,7 -65,7 +65,7 @@@ extern pmdval_t early_pmd_flags
  
  #ifndef __PAGETABLE_P4D_FOLDED
  #define set_pgd(pgdp, pgd)            native_set_pgd(pgdp, pgd)
 -#define pgd_clear(pgd)                        native_pgd_clear(pgd)
 +#define pgd_clear(pgd)                        (pgtable_l5_enabled ? native_pgd_clear(pgd) : 0)
  #endif
  
  #ifndef set_p4d
@@@ -350,14 -350,14 +350,14 @@@ static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
  {
        pmdval_t v = native_pmd_val(pmd);
  
-       return __pmd(v | set);
+       return native_make_pmd(v | set);
  }
  
  static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
  {
        pmdval_t v = native_pmd_val(pmd);
  
-       return __pmd(v & ~clear);
+       return native_make_pmd(v & ~clear);
  }
  
  static inline pmd_t pmd_mkold(pmd_t pmd)
@@@ -409,14 -409,14 +409,14 @@@ static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
  {
        pudval_t v = native_pud_val(pud);
  
-       return __pud(v | set);
+       return native_make_pud(v | set);
  }
  
  static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
  {
        pudval_t v = native_pud_val(pud);
  
-       return __pud(v & ~clear);
+       return native_make_pud(v & ~clear);
  }
  
  static inline pud_t pud_mkold(pud_t pud)
@@@ -859,8 -859,6 +859,8 @@@ static inline unsigned long p4d_index(unsigned long address)
  #if CONFIG_PGTABLE_LEVELS > 4
  static inline int pgd_present(pgd_t pgd)
  {
 +      if (!pgtable_l5_enabled)
 +              return 1;
        return pgd_flags(pgd) & _PAGE_PRESENT;
  }
  
@@@ -878,8 -876,6 +878,8 @@@ static inline unsigned long pgd_page_vaddr(pgd_t pgd)
  /* to find an entry in a page-table-directory. */
  static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
  {
 +      if (!pgtable_l5_enabled)
 +              return (p4d_t *)pgd;
        return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address);
  }
  
@@@ -887,9 -883,6 +887,9 @@@ static inline int pgd_bad(pgd_t pgd)
  {
        unsigned long ignore_flags = _PAGE_USER;
  
 +      if (!pgtable_l5_enabled)
 +              return 0;
 +
        if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
                ignore_flags |= _PAGE_NX;
  
  
  static inline int pgd_none(pgd_t pgd)
  {
 +      if (!pgtable_l5_enabled)
 +              return 0;
        /*
         * There is no need to do a workaround for the KNL stray
         * A/D bit erratum here.  PGDs only point to page tables
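
A hedged sketch (not part of this patch) of why the pgtable_l5_enabled checks above keep generic walkers unchanged: with 5-level paging off, p4d_offset() returns the pgd slot itself and pgd_present()/pgd_bad()/pgd_none() become no-ops, so one walk covers both 4- and 5-level kernels. walk_to_pte() is an illustrative helper, not a kernel function:

static pte_t *walk_to_pte(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd = pgd_offset(mm, addr);	/* top level, folded or not */
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	if (pgd_none(*pgd) || pgd_bad(*pgd))	/* always "present" with 4 levels */
		return NULL;
	p4d = p4d_offset(pgd, addr);		/* == (p4d_t *)pgd when l5 is off */
	if (p4d_none(*p4d) || p4d_bad(*p4d))
		return NULL;
	pud = pud_offset(p4d, addr);
	if (pud_none(*pud) || pud_bad(*pud))
		return NULL;
	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd) || pmd_bad(*pmd))
		return NULL;
	return pte_offset_kernel(pmd, addr);
}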
diff --combined arch/x86/include/asm/pgtable_32.h
@@@ -32,9 -32,8 +32,10 @@@ extern pmd_t initial_pg_pmd[]
  static inline void pgtable_cache_init(void) { }
  static inline void check_pgt_cache(void) { }
  void paging_init(void);
+ void sync_initial_page_table(void);
  
 +static inline int pgd_large(pgd_t pgd) { return 0; }
 +
  /*
   * Define this if things work differently on an i386 and an i486:
   * it will (on an i486) warn about kernel memory accesses that are
diff --combined arch/x86/include/asm/pgtable_64.h
@@@ -28,6 -28,7 +28,7 @@@ extern pgd_t init_top_pgt[]
  #define swapper_pg_dir init_top_pgt
  
  extern void paging_init(void);
+ static inline void sync_initial_page_table(void) { }
  
  #define pte_ERROR(e)                                  \
        pr_err("%s:%d: bad pte %p(%016lx)\n",           \
@@@ -217,26 -218,29 +218,26 @@@ static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
  
  static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
  {
 -#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL)
 -      p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd);
 -#else
 -      *p4dp = p4d;
 -#endif
 +      pgd_t pgd;
 +
 +      if (pgtable_l5_enabled || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) {
 +              *p4dp = p4d;
 +              return;
 +      }
 +
 +      pgd = native_make_pgd(native_p4d_val(p4d));
 +      pgd = pti_set_user_pgd((pgd_t *)p4dp, pgd);
 +      *p4dp = native_make_p4d(native_pgd_val(pgd));
  }
  
  static inline void native_p4d_clear(p4d_t *p4d)
  {
 -#ifdef CONFIG_X86_5LEVEL
        native_set_p4d(p4d, native_make_p4d(0));
 -#else
 -      native_set_p4d(p4d, (p4d_t) { .pgd = native_make_pgd(0)});
 -#endif
  }
  
  static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
  {
 -#ifdef CONFIG_PAGE_TABLE_ISOLATION
        *pgdp = pti_set_user_pgd(pgdp, pgd);
 -#else
 -      *pgdp = pgd;
 -#endif
  }
  
  static inline void native_pgd_clear(pgd_t *pgd)
diff --combined arch/x86/kernel/head_64.S
@@@ -23,6 -23,7 +23,7 @@@
  #include <asm/nops.h>
  #include "../entry/calling.h"
  #include <asm/export.h>
+ #include <asm/nospec-branch.h>
  
  #ifdef CONFIG_PARAVIRT
  #include <asm/asm-offsets.h>
   *
   */
  
 +#define l4_index(x)   (((x) >> 39) & 511)
  #define pud_index(x)  (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
  
 -#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
 -PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
 -PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
 -#endif
 +L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4)
 +L4_START_KERNEL = l4_index(__START_KERNEL_map)
 +
  L3_START_KERNEL = pud_index(__START_KERNEL_map)
  
        .text
@@@ -124,10 -125,7 +125,10 @@@ ENTRY(secondary_startup_64)
        /* Enable PAE mode, PGE and LA57 */
        movl    $(X86_CR4_PAE | X86_CR4_PGE), %ecx
  #ifdef CONFIG_X86_5LEVEL
 +      testl   $1, __pgtable_l5_enabled(%rip)
 +      jz      1f
        orl     $X86_CR4_LA57, %ecx
 +1:
  #endif
        movq    %rcx, %cr4
  
  
        /* Ensure I am executing from virtual addresses */
        movq    $1f, %rax
+       ANNOTATE_RETPOLINE_SAFE
        jmp     *%rax
  1:
        UNWIND_HINT_EMPTY
@@@ -375,7 -374,12 +377,7 @@@ GLOBAL(name
  
        __INITDATA
  NEXT_PGD_PAGE(early_top_pgt)
 -      .fill   511,8,0
 -#ifdef CONFIG_X86_5LEVEL
 -      .quad   level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 -#else
 -      .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 -#endif
 +      .fill   512,8,0
        .fill   PTI_USER_PGD_FILL,8,0
  
  NEXT_PAGE(early_dynamic_pgts)
  #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
  NEXT_PGD_PAGE(init_top_pgt)
        .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 -      .org    init_top_pgt + PGD_PAGE_OFFSET*8, 0
 +      .org    init_top_pgt + L4_PAGE_OFFSET*8, 0
        .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 -      .org    init_top_pgt + PGD_START_KERNEL*8, 0
 +      .org    init_top_pgt + L4_START_KERNEL*8, 0
        /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
        .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
        .fill   PTI_USER_PGD_FILL,8,0
diff --combined arch/x86/kernel/setup.c
@@@ -189,7 -189,9 +189,7 @@@ struct ist_info ist_info
  #endif
  
  #else
 -struct cpuinfo_x86 boot_cpu_data __read_mostly = {
 -      .x86_phys_bits = MAX_PHYSMEM_BITS,
 -};
 +struct cpuinfo_x86 boot_cpu_data __read_mostly;
  EXPORT_SYMBOL(boot_cpu_data);
  #endif
  
@@@ -849,7 -851,6 +849,7 @@@ void __init setup_arch(char **cmdline_p)
        __flush_tlb_all();
  #else
        printk(KERN_INFO "Command line: %s\n", boot_command_line);
 +      boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS;
  #endif
  
        /*
  
        kasan_init();
  
- #ifdef CONFIG_X86_32
-       /* sync back kernel address range */
-       clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
-                       swapper_pg_dir     + KERNEL_PGD_BOUNDARY,
-                       KERNEL_PGD_PTRS);
        /*
-        * sync back low identity map too.  It is used for example
-        * in the 32-bit EFI stub.
+        * Sync back kernel address range.
+        *
+        * FIXME: Can the later sync in setup_cpu_entry_areas() replace
+        * this call?
         */
-       clone_pgd_range(initial_page_table,
-                       swapper_pg_dir     + KERNEL_PGD_BOUNDARY,
-                       min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
- #endif
+       sync_initial_page_table();
  
        tboot_probe();
  
diff --combined arch/x86/mm/fault.c
@@@ -439,7 -439,7 +439,7 @@@ static noinline int vmalloc_fault(unsigned long address)
        if (pgd_none(*pgd_ref))
                return -1;
  
 -      if (CONFIG_PGTABLE_LEVELS > 4) {
 +      if (pgtable_l5_enabled) {
                if (pgd_none(*pgd)) {
                        set_pgd(pgd, *pgd_ref);
                        arch_flush_lazy_mmu_mode();
        if (p4d_none(*p4d_ref))
                return -1;
  
 -      if (p4d_none(*p4d) && CONFIG_PGTABLE_LEVELS == 4) {
 +      if (p4d_none(*p4d) && !pgtable_l5_enabled) {
                set_p4d(p4d, *p4d_ref);
                arch_flush_lazy_mmu_mode();
        } else {
@@@ -1248,10 -1248,6 +1248,6 @@@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
        tsk = current;
        mm = tsk->mm;
  
-       /*
-        * Detect and handle instructions that would cause a page fault for
-        * both a tracked kernel page and a userspace page.
-        */
        prefetchw(&mm->mmap_sem);
  
        if (unlikely(kmmio_fault(regs, address)))
diff --combined include/linux/compiler-gcc.h
  #define __weak                __attribute__((weak))
  #define __alias(symbol)       __attribute__((alias(#symbol)))
  
+ #ifdef RETPOLINE
+ #define __noretpoline __attribute__((indirect_branch("keep")))
+ #endif
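
A hedged sketch of what __noretpoline is for (not part of this patch): it maps to GCC's indirect_branch("keep") function attribute, letting a single function opt out of the external retpoline thunks, e.g. code that must run before the thunks are reachable. The function and callback below are hypothetical:

static void __noretpoline early_indirect_call(void (*cb)(void))
{
	cb();	/* compiled as a plain indirect call, not a thunk call */
}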
  /*
   * it doesn't make sense on ARM (currently the only user of __naked)
   * to trace naked functions because then mcount is called without
  #endif
  #endif
  
 +/*
 + * calling noreturn functions, __builtin_unreachable() and __builtin_trap()
 + * confuse the stack allocation in gcc, leading to overly large stack
 + * frames, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82365
 + *
 + * Adding an empty inline assembly before it works around the problem
 + */
 +#define barrier_before_unreachable() asm volatile("")
 +
  /*
   * Mark a position in code as unreachable.  This can be used to
   * suppress control flow warnings after asm blocks that transfer
   * unreleased.  Really, we need to have autoconf for the kernel.
   */
  #define unreachable() \
 -      do { annotate_unreachable(); __builtin_unreachable(); } while (0)
 +      do {                                    \
 +              annotate_unreachable();         \
 +              barrier_before_unreachable();   \
 +              __builtin_unreachable();        \
 +      } while (0)
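
A hedged sketch of a typical unreachable() caller (not part of this patch): the switch covers every case, the empty asm from barrier_before_unreachable() keeps gcc (PR 82365) from over-sizing the caller's stack frame, and __builtin_unreachable() still suppresses the missing-return warning. classify() is an illustrative function:

static int classify(int x)
{
	switch (x & 1) {
	case 0:
		return 0;
	case 1:
		return 1;
	}
	unreachable();	/* every case returns above; tell the compiler so */
}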
  
  /* Mark a function definition as prohibited from being cloned. */
  #define __noclone     __attribute__((__noclone__, __optimize__("no-tracer")))