arch/x86/entry/entry_32.S

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 /*
   3  *  Copyright (C) 1991,1992  Linus Torvalds
   4  *
   5  * entry_32.S contains the system-call and low-level fault and trap handling routines.
   6  *
   7  * Stack layout while running C code:
   8  *      ptrace needs to have all registers on the stack.
   9  *      If the order here is changed, it needs to be
  10  *      updated in fork.c:copy_process(), signal.c:do_signal(),
  11  *      ptrace.c and ptrace.h
  12  *
  13  *       0(%esp) - %ebx
  14  *       4(%esp) - %ecx
  15  *       8(%esp) - %edx
  16  *       C(%esp) - %esi
  17  *      10(%esp) - %edi
  18  *      14(%esp) - %ebp
  19  *      18(%esp) - %eax
  20  *      1C(%esp) - %ds
  21  *      20(%esp) - %es
  22  *      24(%esp) - %fs
  23  *      28(%esp) - %gs          saved iff !CONFIG_X86_32_LAZY_GS
  24  *      2C(%esp) - orig_eax
  25  *      30(%esp) - %eip
  26  *      34(%esp) - %cs
  27  *      38(%esp) - %eflags
  28  *      3C(%esp) - %oldesp
  29  *      40(%esp) - %oldss
  30  */
  31
  32 #include <linux/linkage.h>
  33 #include <linux/err.h>
  34 #include <asm/thread_info.h>
  35 #include <asm/irqflags.h>
  36 #include <asm/errno.h>
  37 #include <asm/segment.h>
  38 #include <asm/smp.h>
  39 #include <asm/percpu.h>
  40 #include <asm/processor-flags.h>
  41 #include <asm/irq_vectors.h>
  42 #include <asm/cpufeatures.h>
  43 #include <asm/alternative-asm.h>
  44 #include <asm/asm.h>
  45 #include <asm/smap.h>
  46 #include <asm/frame.h>
  47 #include <asm/trapnr.h>
  48 #include <asm/nospec-branch.h>
  49
  50 #include "calling.h"
  51
  52         .section .entry.text, "ax"
  53
  54 #define PTI_SWITCH_MASK         (1 << PAGE_SHIFT)
  55
  56 /*
  57  * User gs save/restore
  58  *
  59  * %gs is used for userland TLS and kernel only uses it for stack
  60  * canary which is required to be at %gs:20 by gcc.  Read the comment
  61  * at the top of stackprotector.h for more info.
  62  *
  63  * Local labels 98 and 99 are used.
  64  */
  65 #ifdef CONFIG_X86_32_LAZY_GS
  66
  67  /* unfortunately push/pop can't be no-op */
  68 .macro PUSH_GS
  69         pushl   $0
  70 .endm
  71 .macro POP_GS pop=0
  72         addl    $(4 + \pop), %esp
  73 .endm
  74 .macro POP_GS_EX
  75 .endm
  76
  77  /* all the rest are no-op */
  78 .macro PTGS_TO_GS
  79 .endm
  80 .macro PTGS_TO_GS_EX
  81 .endm
  82 .macro GS_TO_REG reg
  83 .endm
  84 .macro REG_TO_PTGS reg
  85 .endm
  86 .macro SET_KERNEL_GS reg
  87 .endm
  88
  89 #else   /* CONFIG_X86_32_LAZY_GS */
  90
  91 .macro PUSH_GS
  92         pushl   %gs
  93 .endm
  94
  95 .macro POP_GS pop=0
  96 98:     popl    %gs
  97   .if \pop <> 0
  98         add     $\pop, %esp
  99   .endif
 100 .endm
 101 .macro POP_GS_EX
 102 .pushsection .fixup, "ax"
 103 99:     movl    $0, (%esp)
 104         jmp     98b
 105 .popsection
 106         _ASM_EXTABLE(98b, 99b)
 107 .endm
 108
 109 .macro PTGS_TO_GS
 110 98:     mov     PT_GS(%esp), %gs
 111 .endm
 112 .macro PTGS_TO_GS_EX
 113 .pushsection .fixup, "ax"
 114 99:     movl    $0, PT_GS(%esp)
 115         jmp     98b
 116 .popsection
 117         _ASM_EXTABLE(98b, 99b)
 118 .endm
 119
 120 .macro GS_TO_REG reg
 121         movl    %gs, \reg
 122 .endm
 123 .macro REG_TO_PTGS reg
 124         movl    \reg, PT_GS(%esp)
 125 .endm
 126 .macro SET_KERNEL_GS reg
 127         movl    $(__KERNEL_STACK_CANARY), \reg
 128         movl    \reg, %gs
 129 .endm
 130
 131 #endif /* CONFIG_X86_32_LAZY_GS */
 132
 133 /* Unconditionally switch to user cr3 */
 134 .macro SWITCH_TO_USER_CR3 scratch_reg:req
 135         ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
 136
 137         movl    %cr3, \scratch_reg
 138         orl     $PTI_SWITCH_MASK, \scratch_reg
 139         movl    \scratch_reg, %cr3
 140 .Lend_\@:
 141 .endm
 142
 143 .macro BUG_IF_WRONG_CR3 no_user_check=0
 144 #ifdef CONFIG_DEBUG_ENTRY
 145         ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
 146         .if \no_user_check == 0
 147         /* coming from usermode? */
 148         testl   $USER_SEGMENT_RPL_MASK, PT_CS(%esp)
 149         jz      .Lend_\@
 150         .endif
 151         /* On user-cr3? */
 152         movl    %cr3, %eax
 153         testl   $PTI_SWITCH_MASK, %eax
 154         jnz     .Lend_\@
 155         /* From userspace with kernel cr3 - BUG */
 156         ud2
 157 .Lend_\@:
 158 #endif
 159 .endm
 160
 161 /*
 162  * Switch to kernel cr3 if not already loaded and return current cr3 in
 163  * \scratch_reg
 164  */
 165 .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
 166         ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
 167         movl    %cr3, \scratch_reg
 168         /* Test if we are already on kernel CR3 */
 169         testl   $PTI_SWITCH_MASK, \scratch_reg
 170         jz      .Lend_\@
 171         andl    $(~PTI_SWITCH_MASK), \scratch_reg
 172         movl    \scratch_reg, %cr3
 173         /* Return original CR3 in \scratch_reg */
 174         orl     $PTI_SWITCH_MASK, \scratch_reg
 175 .Lend_\@:
 176 .endm
 177
 178 #define CS_FROM_ENTRY_STACK     (1 << 31)
 179 #define CS_FROM_USER_CR3        (1 << 30)
 180 #define CS_FROM_KERNEL          (1 << 29)
 181 #define CS_FROM_ESPFIX          (1 << 28)
 182
 183 .macro FIXUP_FRAME
 184         /*
 185          * The high bits of the CS dword (__csh) are used for CS_FROM_*.
 186          * Clear them in case hardware didn't do this for us.
 187          */
 188         andl    $0x0000ffff, 4*4(%esp)
 189
 190 #ifdef CONFIG_VM86
 191         testl   $X86_EFLAGS_VM, 5*4(%esp)
 192         jnz     .Lfrom_usermode_no_fixup_\@
 193 #endif
 194         testl   $USER_SEGMENT_RPL_MASK, 4*4(%esp)
 195         jnz     .Lfrom_usermode_no_fixup_\@
 196
 197         orl     $CS_FROM_KERNEL, 4*4(%esp)
 198
 199         /*
 200          * When we're here from kernel mode; the (exception) stack looks like:
 201          *
 202          *  6*4(%esp) - <previous context>
 203          *  5*4(%esp) - flags
 204          *  4*4(%esp) - cs
 205          *  3*4(%esp) - ip
 206          *  2*4(%esp) - orig_eax
 207          *  1*4(%esp) - gs / function
 208          *  0*4(%esp) - fs
 209          *
 210          * Lets build a 5 entry IRET frame after that, such that struct pt_regs
 211          * is complete and in particular regs->sp is correct. This gives us
 212          * the original 6 enties as gap:
 213          *
 214          * 14*4(%esp) - <previous context>
 215          * 13*4(%esp) - gap / flags
 216          * 12*4(%esp) - gap / cs
 217          * 11*4(%esp) - gap / ip
 218          * 10*4(%esp) - gap / orig_eax
 219          *  9*4(%esp) - gap / gs / function
 220          *  8*4(%esp) - gap / fs
 221          *  7*4(%esp) - ss
 222          *  6*4(%esp) - sp
 223          *  5*4(%esp) - flags
 224          *  4*4(%esp) - cs
 225          *  3*4(%esp) - ip
 226          *  2*4(%esp) - orig_eax
 227          *  1*4(%esp) - gs / function
 228          *  0*4(%esp) - fs
 229          */
 230
 231         pushl   %ss             # ss
 232         pushl   %esp            # sp (points at ss)
 233         addl    $7*4, (%esp)    # point sp back at the previous context
 234         pushl   7*4(%esp)       # flags
 235         pushl   7*4(%esp)       # cs
 236         pushl   7*4(%esp)       # ip
 237         pushl   7*4(%esp)       # orig_eax
 238         pushl   7*4(%esp)       # gs / function
 239         pushl   7*4(%esp)       # fs
 240 .Lfrom_usermode_no_fixup_\@:
 241 .endm
 242
 243 .macro IRET_FRAME
 244         /*
 245          * We're called with %ds, %es, %fs, and %gs from the interrupted
 246          * frame, so we shouldn't use them.  Also, we may be in ESPFIX
 247          * mode and therefore have a nonzero SS base and an offset ESP,
 248          * so any attempt to access the stack needs to use SS.  (except for
 249          * accesses through %esp, which automatically use SS.)
 250          */
 251         testl $CS_FROM_KERNEL, 1*4(%esp)
 252         jz .Lfinished_frame_\@
 253
 254         /*
 255          * Reconstruct the 3 entry IRET frame right after the (modified)
 256          * regs->sp without lowering %esp in between, such that an NMI in the
 257          * middle doesn't scribble our stack.
 258          */
 259         pushl   %eax
 260         pushl   %ecx
 261         movl    5*4(%esp), %eax         # (modified) regs->sp
 262
 263         movl    4*4(%esp), %ecx         # flags
 264         movl    %ecx, %ss:-1*4(%eax)
 265
 266         movl    3*4(%esp), %ecx         # cs
 267         andl    $0x0000ffff, %ecx
 268         movl    %ecx, %ss:-2*4(%eax)
 269
 270         movl    2*4(%esp), %ecx         # ip
 271         movl    %ecx, %ss:-3*4(%eax)
 272
 273         movl    1*4(%esp), %ecx         # eax
 274         movl    %ecx, %ss:-4*4(%eax)
 275
 276         popl    %ecx
 277         lea     -4*4(%eax), %esp
 278         popl    %eax
 279 .Lfinished_frame_\@:
 280 .endm
 281
 282 .macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 unwind_espfix=0
 283         cld
 284 .if \skip_gs == 0
 285         PUSH_GS
 286 .endif
 287         pushl   %fs
 288
 289         pushl   %eax
 290         movl    $(__KERNEL_PERCPU), %eax
 291         movl    %eax, %fs
 292 .if \unwind_espfix > 0
 293         UNWIND_ESPFIX_STACK
 294 .endif
 295         popl    %eax
 296
 297         FIXUP_FRAME
 298         pushl   %es
 299         pushl   %ds
 300         pushl   \pt_regs_ax
 301         pushl   %ebp
 302         pushl   %edi
 303         pushl   %esi
 304         pushl   %edx
 305         pushl   %ecx
 306         pushl   %ebx
 307         movl    $(__USER_DS), %edx
 308         movl    %edx, %ds
 309         movl    %edx, %es
 310 .if \skip_gs == 0
 311         SET_KERNEL_GS %edx
 312 .endif
 313         /* Switch to kernel stack if necessary */
 314 .if \switch_stacks > 0
 315         SWITCH_TO_KERNEL_STACK
 316 .endif
 317 .endm
 318
 319 .macro SAVE_ALL_NMI cr3_reg:req unwind_espfix=0
 320         SAVE_ALL unwind_espfix=\unwind_espfix
 321
 322         BUG_IF_WRONG_CR3
 323
 324         /*
 325          * Now switch the CR3 when PTI is enabled.
 326          *
 327          * We can enter with either user or kernel cr3, the code will
 328          * store the old cr3 in \cr3_reg and switches to the kernel cr3
 329          * if necessary.
 330          */
 331         SWITCH_TO_KERNEL_CR3 scratch_reg=\cr3_reg
 332
 333 .Lend_\@:
 334 .endm
 335
 336 .macro RESTORE_INT_REGS
 337         popl    %ebx
 338         popl    %ecx
 339         popl    %edx
 340         popl    %esi
 341         popl    %edi
 342         popl    %ebp
 343         popl    %eax
 344 .endm
 345
 346 .macro RESTORE_REGS pop=0
 347         RESTORE_INT_REGS
 348 1:      popl    %ds
 349 2:      popl    %es
 350 3:      popl    %fs
 351         POP_GS \pop
 352         IRET_FRAME
 353 .pushsection .fixup, "ax"
 354 4:      movl    $0, (%esp)
 355         jmp     1b
 356 5:      movl    $0, (%esp)
 357         jmp     2b
 358 6:      movl    $0, (%esp)
 359         jmp     3b
 360 .popsection
 361         _ASM_EXTABLE(1b, 4b)
 362         _ASM_EXTABLE(2b, 5b)
 363         _ASM_EXTABLE(3b, 6b)
 364         POP_GS_EX
 365 .endm
 366
 367 .macro RESTORE_ALL_NMI cr3_reg:req pop=0
 368         /*
 369          * Now switch the CR3 when PTI is enabled.
 370          *
 371          * We enter with kernel cr3 and switch the cr3 to the value
 372          * stored on \cr3_reg, which is either a user or a kernel cr3.
 373          */
 374         ALTERNATIVE "jmp .Lswitched_\@", "", X86_FEATURE_PTI
 375
 376         testl   $PTI_SWITCH_MASK, \cr3_reg
 377         jz      .Lswitched_\@
 378
 379         /* User cr3 in \cr3_reg - write it to hardware cr3 */
 380         movl    \cr3_reg, %cr3
 381
 382 .Lswitched_\@:
 383
 384         BUG_IF_WRONG_CR3
 385
 386         RESTORE_REGS pop=\pop
 387 .endm
 388
 389 .macro CHECK_AND_APPLY_ESPFIX
 390 #ifdef CONFIG_X86_ESPFIX32
 391 #define GDT_ESPFIX_OFFSET (GDT_ENTRY_ESPFIX_SS * 8)
 392 #define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + GDT_ESPFIX_OFFSET
 393
 394         ALTERNATIVE     "jmp .Lend_\@", "", X86_BUG_ESPFIX
 395
 396         movl    PT_EFLAGS(%esp), %eax           # mix EFLAGS, SS and CS
 397         /*
 398          * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
 399          * are returning to the kernel.
 400          * See comments in process.c:copy_thread() for details.
 401          */
 402         movb    PT_OLDSS(%esp), %ah
 403         movb    PT_CS(%esp), %al
 404         andl    $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
 405         cmpl    $((SEGMENT_LDT << 8) | USER_RPL), %eax
 406         jne     .Lend_\@        # returning to user-space with LDT SS
 407
 408         /*
 409          * Setup and switch to ESPFIX stack
 410          *
 411          * We're returning to userspace with a 16 bit stack. The CPU will not
 412          * restore the high word of ESP for us on executing iret... This is an
 413          * "official" bug of all the x86-compatible CPUs, which we can work
 414          * around to make dosemu and wine happy. We do this by preloading the
 415          * high word of ESP with the high word of the userspace ESP while
 416          * compensating for the offset by changing to the ESPFIX segment with
 417          * a base address that matches for the difference.
 418          */
 419         mov     %esp, %edx                      /* load kernel esp */
 420         mov     PT_OLDESP(%esp), %eax           /* load userspace esp */
 421         mov     %dx, %ax                        /* eax: new kernel esp */
 422         sub     %eax, %edx                      /* offset (low word is 0) */
 423         shr     $16, %edx
 424         mov     %dl, GDT_ESPFIX_SS + 4          /* bits 16..23 */
 425         mov     %dh, GDT_ESPFIX_SS + 7          /* bits 24..31 */
 426         pushl   $__ESPFIX_SS
 427         pushl   %eax                            /* new kernel esp */
 428         /*
 429          * Disable interrupts, but do not irqtrace this section: we
 430          * will soon execute iret and the tracer was already set to
 431          * the irqstate after the IRET:
 432          */
 433         DISABLE_INTERRUPTS(CLBR_ANY)
 434         lss     (%esp), %esp                    /* switch to espfix segment */
 435 .Lend_\@:
 436 #endif /* CONFIG_X86_ESPFIX32 */
 437 .endm
 438
 439 /*
 440  * Called with pt_regs fully populated and kernel segments loaded,
 441  * so we can access PER_CPU and use the integer registers.
 442  *
 443  * We need to be very careful here with the %esp switch, because an NMI
 444  * can happen everywhere. If the NMI handler finds itself on the
 445  * entry-stack, it will overwrite the task-stack and everything we
 446  * copied there. So allocate the stack-frame on the task-stack and
 447  * switch to it before we do any copying.
 448  */
 449
 450 .macro SWITCH_TO_KERNEL_STACK
 451
 452         BUG_IF_WRONG_CR3
 453
 454         SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
 455
 456         /*
 457          * %eax now contains the entry cr3 and we carry it forward in
 458          * that register for the time this macro runs
 459          */
 460
 461         /* Are we on the entry stack? Bail out if not! */
 462         movl    PER_CPU_VAR(cpu_entry_area), %ecx
 463         addl    $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
 464         subl    %esp, %ecx      /* ecx = (end of entry_stack) - esp */
 465         cmpl    $SIZEOF_entry_stack, %ecx
 466         jae     .Lend_\@
 467
 468         /* Load stack pointer into %esi and %edi */
 469         movl    %esp, %esi
 470         movl    %esi, %edi
 471
 472         /* Move %edi to the top of the entry stack */
 473         andl    $(MASK_entry_stack), %edi
 474         addl    $(SIZEOF_entry_stack), %edi
 475
 476         /* Load top of task-stack into %edi */
 477         movl    TSS_entry2task_stack(%edi), %edi
 478
 479         /* Special case - entry from kernel mode via entry stack */
 480 #ifdef CONFIG_VM86
 481         movl    PT_EFLAGS(%esp), %ecx           # mix EFLAGS and CS
 482         movb    PT_CS(%esp), %cl
 483         andl    $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %ecx
 484 #else
 485         movl    PT_CS(%esp), %ecx
 486         andl    $SEGMENT_RPL_MASK, %ecx
 487 #endif
 488         cmpl    $USER_RPL, %ecx
 489         jb      .Lentry_from_kernel_\@
 490
 491         /* Bytes to copy */
 492         movl    $PTREGS_SIZE, %ecx
 493
 494 #ifdef CONFIG_VM86
 495         testl   $X86_EFLAGS_VM, PT_EFLAGS(%esi)
 496         jz      .Lcopy_pt_regs_\@
 497
 498         /*
 499          * Stack-frame contains 4 additional segment registers when
 500          * coming from VM86 mode
 501          */
 502         addl    $(4 * 4), %ecx
 503
 504 #endif
 505 .Lcopy_pt_regs_\@:
 506
 507         /* Allocate frame on task-stack */
 508         subl    %ecx, %edi
 509
 510         /* Switch to task-stack */
 511         movl    %edi, %esp
 512
 513         /*
 514          * We are now on the task-stack and can safely copy over the
 515          * stack-frame
 516          */
 517         shrl    $2, %ecx
 518         cld
 519         rep movsl
 520
 521         jmp .Lend_\@
 522
 523 .Lentry_from_kernel_\@:
 524
 525         /*
 526          * This handles the case when we enter the kernel from
 527          * kernel-mode and %esp points to the entry-stack. When this
 528          * happens we need to switch to the task-stack to run C code,
 529          * but switch back to the entry-stack again when we approach
 530          * iret and return to the interrupted code-path. This usually
 531          * happens when we hit an exception while restoring user-space
 532          * segment registers on the way back to user-space or when the
 533          * sysenter handler runs with eflags.tf set.
 534          *
 535          * When we switch to the task-stack here, we can't trust the
 536          * contents of the entry-stack anymore, as the exception handler
 537          * might be scheduled out or moved to another CPU. Therefore we
 538          * copy the complete entry-stack to the task-stack and set a
 539          * marker in the iret-frame (bit 31 of the CS dword) to detect
 540          * what we've done on the iret path.
 541          *
 542          * On the iret path we copy everything back and switch to the
 543          * entry-stack, so that the interrupted kernel code-path
 544          * continues on the same stack it was interrupted with.
 545          *
 546          * Be aware that an NMI can happen anytime in this code.
 547          *
 548          * %esi: Entry-Stack pointer (same as %esp)
 549          * %edi: Top of the task stack
 550          * %eax: CR3 on kernel entry
 551          */
 552
 553         /* Calculate number of bytes on the entry stack in %ecx */
 554         movl    %esi, %ecx
 555
 556         /* %ecx to the top of entry-stack */
 557         andl    $(MASK_entry_stack), %ecx
 558         addl    $(SIZEOF_entry_stack), %ecx
 559
 560         /* Number of bytes on the entry stack to %ecx */
 561         sub     %esi, %ecx
 562
 563         /* Mark stackframe as coming from entry stack */
 564         orl     $CS_FROM_ENTRY_STACK, PT_CS(%esp)
 565
 566         /*
 567          * Test the cr3 used to enter the kernel and add a marker
 568          * so that we can switch back to it before iret.
 569          */
 570         testl   $PTI_SWITCH_MASK, %eax
 571         jz      .Lcopy_pt_regs_\@
 572         orl     $CS_FROM_USER_CR3, PT_CS(%esp)
 573
 574         /*
 575          * %esi and %edi are unchanged, %ecx contains the number of
 576          * bytes to copy. The code at .Lcopy_pt_regs_\@ will allocate
 577          * the stack-frame on task-stack and copy everything over
 578          */
 579         jmp .Lcopy_pt_regs_\@
 580
 581 .Lend_\@:
 582 .endm
 583
 584 /*
 585  * Switch back from the kernel stack to the entry stack.
 586  *
 587  * The %esp register must point to pt_regs on the task stack. It will
 588  * first calculate the size of the stack-frame to copy, depending on
 589  * whether we return to VM86 mode or not. With that it uses 'rep movsl'
 590  * to copy the contents of the stack over to the entry stack.
 591  *
 592  * We must be very careful here, as we can't trust the contents of the
 593  * task-stack once we switched to the entry-stack. When an NMI happens
 594  * while on the entry-stack, the NMI handler will switch back to the top
 595  * of the task stack, overwriting our stack-frame we are about to copy.
 596  * Therefore we switch the stack only after everything is copied over.
 597  */
 598 .macro SWITCH_TO_ENTRY_STACK
 599
 600         /* Bytes to copy */
 601         movl    $PTREGS_SIZE, %ecx
 602
 603 #ifdef CONFIG_VM86
 604         testl   $(X86_EFLAGS_VM), PT_EFLAGS(%esp)
 605         jz      .Lcopy_pt_regs_\@
 606
 607         /* Additional 4 registers to copy when returning to VM86 mode */
 608         addl    $(4 * 4), %ecx
 609
 610 .Lcopy_pt_regs_\@:
 611 #endif
 612
 613         /* Initialize source and destination for movsl */
 614         movl    PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
 615         subl    %ecx, %edi
 616         movl    %esp, %esi
 617
 618         /* Save future stack pointer in %ebx */
 619         movl    %edi, %ebx
 620
 621         /* Copy over the stack-frame */
 622         shrl    $2, %ecx
 623         cld
 624         rep movsl
 625
 626         /*
 627          * Switch to entry-stack - needs to happen after everything is
 628          * copied because the NMI handler will overwrite the task-stack
 629          * when on entry-stack
 630          */
 631         movl    %ebx, %esp
 632
 633 .Lend_\@:
 634 .endm
 635
 636 /*
 637  * This macro handles the case when we return to kernel-mode on the iret
 638  * path and have to switch back to the entry stack and/or user-cr3
 639  *
 640  * See the comments below the .Lentry_from_kernel_\@ label in the
 641  * SWITCH_TO_KERNEL_STACK macro for more details.
 642  */
 643 .macro PARANOID_EXIT_TO_KERNEL_MODE
 644
 645         /*
 646          * Test if we entered the kernel with the entry-stack. Most
 647          * likely we did not, because this code only runs on the
 648          * return-to-kernel path.
 649          */
 650         testl   $CS_FROM_ENTRY_STACK, PT_CS(%esp)
 651         jz      .Lend_\@
 652
 653         /* Unlikely slow-path */
 654
 655         /* Clear marker from stack-frame */
 656         andl    $(~CS_FROM_ENTRY_STACK), PT_CS(%esp)
 657
 658         /* Copy the remaining task-stack contents to entry-stack */
 659         movl    %esp, %esi
 660         movl    PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
 661
 662         /* Bytes on the task-stack to ecx */
 663         movl    PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %ecx
 664         subl    %esi, %ecx
 665
 666         /* Allocate stack-frame on entry-stack */
 667         subl    %ecx, %edi
 668
 669         /*
 670          * Save future stack-pointer, we must not switch until the
 671          * copy is done, otherwise the NMI handler could destroy the
 672          * contents of the task-stack we are about to copy.
 673          */
 674         movl    %edi, %ebx
 675
 676         /* Do the copy */
 677         shrl    $2, %ecx
 678         cld
 679         rep movsl
 680
 681         /* Safe to switch to entry-stack now */
 682         movl    %ebx, %esp
 683
 684         /*
 685          * We came from entry-stack and need to check if we also need to
 686          * switch back to user cr3.
 687          */
 688         testl   $CS_FROM_USER_CR3, PT_CS(%esp)
 689         jz      .Lend_\@
 690
 691         /* Clear marker from stack-frame */
 692         andl    $(~CS_FROM_USER_CR3), PT_CS(%esp)
 693
 694         SWITCH_TO_USER_CR3 scratch_reg=%eax
 695
 696 .Lend_\@:
 697 .endm
 698
 699 /**
 700  * idtentry - Macro to generate entry stubs for simple IDT entries
 701  * @vector:             Vector number
 702  * @asmsym:             ASM symbol for the entry point
 703  * @cfunc:              C function to be called
 704  * @has_error_code:     Hardware pushed error code on stack
 705  */
 706 .macro idtentry vector asmsym cfunc has_error_code:req
 707 SYM_CODE_START(\asmsym)
 708         ASM_CLAC
 709         cld
 710
 711         .if \has_error_code == 0
 712                 pushl   $0              /* Clear the error code */
 713         .endif
 714
 715         /* Push the C-function address into the GS slot */
 716         pushl   $\cfunc
 717         /* Invoke the common exception entry */
 718         jmp     handle_exception
 719 SYM_CODE_END(\asmsym)
 720 .endm
 721
 722 .macro idtentry_irq vector cfunc
 723         .p2align CONFIG_X86_L1_CACHE_SHIFT
 724 SYM_CODE_START_LOCAL(asm_\cfunc)
 725         ASM_CLAC
 726         SAVE_ALL switch_stacks=1
 727         ENCODE_FRAME_POINTER
 728         movl    %esp, %eax
 729         movl    PT_ORIG_EAX(%esp), %edx         /* get the vector from stack */
 730         movl    $-1, PT_ORIG_EAX(%esp)          /* no syscall to restart */
 731         call    \cfunc
 732         jmp     handle_exception_return
 733 SYM_CODE_END(asm_\cfunc)
 734 .endm
 735
 736 .macro idtentry_sysvec vector cfunc
 737         idtentry \vector asm_\cfunc \cfunc has_error_code=0
 738 .endm
 739
 740 /*
 741  * Include the defines which emit the idt entries which are shared
 742  * shared between 32 and 64 bit and emit the __irqentry_text_* markers
 743  * so the stacktrace boundary checks work.
 744  */
 745         .align 16
 746         .globl __irqentry_text_start
 747 __irqentry_text_start:
 748
 749 #include <asm/idtentry.h>
 750
 751         .align 16
 752         .globl __irqentry_text_end
 753 __irqentry_text_end:
 754
 755 /*
 756  * %eax: prev task
 757  * %edx: next task
 758  */
 759 .pushsection .text, "ax"
 760 SYM_CODE_START(__switch_to_asm)
 761         /*
 762          * Save callee-saved registers
 763          * This must match the order in struct inactive_task_frame
 764          */
 765         pushl   %ebp
 766         pushl   %ebx
 767         pushl   %edi
 768         pushl   %esi
 769         /*
 770          * Flags are saved to prevent AC leakage. This could go
 771          * away if objtool would have 32bit support to verify
 772          * the STAC/CLAC correctness.
 773          */
 774         pushfl
 775
 776         /* switch stack */
 777         movl    %esp, TASK_threadsp(%eax)
 778         movl    TASK_threadsp(%edx), %esp
 779
 780 #ifdef CONFIG_STACKPROTECTOR
 781         movl    TASK_stack_canary(%edx), %ebx
 782         movl    %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
 783 #endif
 784
 785 #ifdef CONFIG_RETPOLINE
 786         /*
 787          * When switching from a shallower to a deeper call stack
 788          * the RSB may either underflow or use entries populated
 789          * with userspace addresses. On CPUs where those concerns
 790          * exist, overwrite the RSB with entries which capture
 791          * speculative execution to prevent attack.
 792          */
 793         FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
 794 #endif
 795
 796         /* Restore flags or the incoming task to restore AC state. */
 797         popfl
 798         /* restore callee-saved registers */
 799         popl    %esi
 800         popl    %edi
 801         popl    %ebx
 802         popl    %ebp
 803
 804         jmp     __switch_to
 805 SYM_CODE_END(__switch_to_asm)
 806 .popsection
 807
 808 /*
 809  * The unwinder expects the last frame on the stack to always be at the same
 810  * offset from the end of the page, which allows it to validate the stack.
 811  * Calling schedule_tail() directly would break that convention because its an
 812  * asmlinkage function so its argument has to be pushed on the stack.  This
 813  * wrapper creates a proper "end of stack" frame header before the call.
 814  */
 815 .pushsection .text, "ax"
 816 SYM_FUNC_START(schedule_tail_wrapper)
 817         FRAME_BEGIN
 818
 819         pushl   %eax
 820         call    schedule_tail
 821         popl    %eax
 822
 823         FRAME_END
 824         ret
 825 SYM_FUNC_END(schedule_tail_wrapper)
 826 .popsection
 827
 828 /*
 829  * A newly forked process directly context switches into this address.
 830  *
 831  * eax: prev task we switched from
 832  * ebx: kernel thread func (NULL for user thread)
 833  * edi: kernel thread arg
 834  */
 835 .pushsection .text, "ax"
 836 SYM_CODE_START(ret_from_fork)
 837         call    schedule_tail_wrapper
 838
 839         testl   %ebx, %ebx
 840         jnz     1f              /* kernel threads are uncommon */
 841
 842 2:
 843         /* When we fork, we trace the syscall return in the child, too. */
 844         movl    %esp, %eax
 845         call    syscall_exit_to_user_mode
 846         jmp     .Lsyscall_32_done
 847
 848         /* kernel thread */
 849 1:      movl    %edi, %eax
 850         CALL_NOSPEC ebx
 851         /*
 852          * A kernel thread is allowed to return here after successfully
 853          * calling kernel_execve().  Exit to userspace to complete the execve()
 854          * syscall.
 855          */
 856         movl    $0, PT_EAX(%esp)
 857         jmp     2b
 858 SYM_CODE_END(ret_from_fork)
 859 .popsection
 860
 861 SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
 862 /*
 863  * All code from here through __end_SYSENTER_singlestep_region is subject
 864  * to being single-stepped if a user program sets TF and executes SYSENTER.
 865  * There is absolutely nothing that we can do to prevent this from happening
 866  * (thanks Intel!).  To keep our handling of this situation as simple as
 867  * possible, we handle TF just like AC and NT, except that our #DB handler
 868  * will ignore all of the single-step traps generated in this range.
 869  */
 870
 871 /*
 872  * 32-bit SYSENTER entry.
 873  *
 874  * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
 875  * if X86_FEATURE_SEP is available.  This is the preferred system call
 876  * entry on 32-bit systems.
 877  *
 878  * The SYSENTER instruction, in principle, should *only* occur in the
 879  * vDSO.  In practice, a small number of Android devices were shipped
 880  * with a copy of Bionic that inlined a SYSENTER instruction.  This
 881  * never happened in any of Google's Bionic versions -- it only happened
 882  * in a narrow range of Intel-provided versions.
 883  *
 884  * SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs.
 885  * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
 886  * SYSENTER does not save anything on the stack,
 887  * and does not save old EIP (!!!), ESP, or EFLAGS.
 888  *
 889  * To avoid losing track of EFLAGS.VM (and thus potentially corrupting
 890  * user and/or vm86 state), we explicitly disable the SYSENTER
 891  * instruction in vm86 mode by reprogramming the MSRs.
 892  *
 893  * Arguments:
 894  * eax  system call number
 895  * ebx  arg1
 896  * ecx  arg2
 897  * edx  arg3
 898  * esi  arg4
 899  * edi  arg5
 900  * ebp  user stack
 901  * 0(%ebp) arg6
 902  */
 903 SYM_FUNC_START(entry_SYSENTER_32)
 904         /*
 905          * On entry-stack with all userspace-regs live - save and
 906          * restore eflags and %eax to use it as scratch-reg for the cr3
 907          * switch.
 908          */
 909         pushfl
 910         pushl   %eax
 911         BUG_IF_WRONG_CR3 no_user_check=1
 912         SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
 913         popl    %eax
 914         popfl
 915
 916         /* Stack empty again, switch to task stack */
 917         movl    TSS_entry2task_stack(%esp), %esp
 918
 919 .Lsysenter_past_esp:
 920         pushl   $__USER_DS              /* pt_regs->ss */
 921         pushl   $0                      /* pt_regs->sp (placeholder) */
 922         pushfl                          /* pt_regs->flags (except IF = 0) */
 923         pushl   $__USER_CS              /* pt_regs->cs */
 924         pushl   $0                      /* pt_regs->ip = 0 (placeholder) */
 925         pushl   %eax                    /* pt_regs->orig_ax */
 926         SAVE_ALL pt_regs_ax=$-ENOSYS    /* save rest, stack already switched */
 927
 928         /*
 929          * SYSENTER doesn't filter flags, so we need to clear NT, AC
 930          * and TF ourselves.  To save a few cycles, we can check whether
 931          * either was set instead of doing an unconditional popfq.
 932          * This needs to happen before enabling interrupts so that
 933          * we don't get preempted with NT set.
 934          *
 935          * If TF is set, we will single-step all the way to here -- do_debug
 936          * will ignore all the traps.  (Yes, this is slow, but so is
 937          * single-stepping in general.  This allows us to avoid having
 938          * a more complicated code to handle the case where a user program
 939          * forces us to single-step through the SYSENTER entry code.)
 940          *
 941          * NB.: .Lsysenter_fix_flags is a label with the code under it moved
 942          * out-of-line as an optimization: NT is unlikely to be set in the
 943          * majority of the cases and instead of polluting the I$ unnecessarily,
 944          * we're keeping that code behind a branch which will predict as
 945          * not-taken and therefore its instructions won't be fetched.
 946          */
 947         testl   $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp)
 948         jnz     .Lsysenter_fix_flags
 949 .Lsysenter_flags_fixed:
 950
 951         movl    %esp, %eax
 952         call    do_SYSENTER_32
 953         testl   %eax, %eax
 954         jz      .Lsyscall_32_done
 955
 956         STACKLEAK_ERASE
 957
 958         /* Opportunistic SYSEXIT */
 959
 960         /*
 961          * Setup entry stack - we keep the pointer in %eax and do the
 962          * switch after almost all user-state is restored.
 963          */
 964
 965         /* Load entry stack pointer and allocate frame for eflags/eax */
 966         movl    PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %eax
 967         subl    $(2*4), %eax
 968
 969         /* Copy eflags and eax to entry stack */
 970         movl    PT_EFLAGS(%esp), %edi
 971         movl    PT_EAX(%esp), %esi
 972         movl    %edi, (%eax)
 973         movl    %esi, 4(%eax)
 974
 975         /* Restore user registers and segments */
 976         movl    PT_EIP(%esp), %edx      /* pt_regs->ip */
 977         movl    PT_OLDESP(%esp), %ecx   /* pt_regs->sp */
 978 1:      mov     PT_FS(%esp), %fs
 979         PTGS_TO_GS
 980
 981         popl    %ebx                    /* pt_regs->bx */
 982         addl    $2*4, %esp              /* skip pt_regs->cx and pt_regs->dx */
 983         popl    %esi                    /* pt_regs->si */
 984         popl    %edi                    /* pt_regs->di */
 985         popl    %ebp                    /* pt_regs->bp */
 986
 987         /* Switch to entry stack */
 988         movl    %eax, %esp
 989
 990         /* Now ready to switch the cr3 */
 991         SWITCH_TO_USER_CR3 scratch_reg=%eax
 992
 993         /*
 994          * Restore all flags except IF. (We restore IF separately because
 995          * STI gives a one-instruction window in which we won't be interrupted,
 996          * whereas POPF does not.)
 997          */
 998         btrl    $X86_EFLAGS_IF_BIT, (%esp)
 999         BUG_IF_WRONG_CR3 no_user_check=1
1000         popfl
1001         popl    %eax
1002
1003         /*
1004          * Return back to the vDSO, which will pop ecx and edx.
1005          * Don't bother with DS and ES (they already contain __USER_DS).
1006          */
1007         sti
1008         sysexit
1009
1010 .pushsection .fixup, "ax"
1011 2:      movl    $0, PT_FS(%esp)
1012         jmp     1b
1013 .popsection
1014         _ASM_EXTABLE(1b, 2b)
1015         PTGS_TO_GS_EX
1016
1017 .Lsysenter_fix_flags:
1018         pushl   $X86_EFLAGS_FIXED
1019         popfl
1020         jmp     .Lsysenter_flags_fixed
1021 SYM_ENTRY(__end_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
1022 SYM_FUNC_END(entry_SYSENTER_32)
1023
1024 /*
1025  * 32-bit legacy system call entry.
1026  *
1027  * 32-bit x86 Linux system calls traditionally used the INT $0x80
1028  * instruction.  INT $0x80 lands here.
1029  *
1030  * This entry point can be used by any 32-bit perform system calls.
1031  * Instances of INT $0x80 can be found inline in various programs and
1032  * libraries.  It is also used by the vDSO's __kernel_vsyscall
1033  * fallback for hardware that doesn't support a faster entry method.
1034  * Restarted 32-bit system calls also fall back to INT $0x80
1035  * regardless of what instruction was originally used to do the system
1036  * call.  (64-bit programs can use INT $0x80 as well, but they can
1037  * only run on 64-bit kernels and therefore land in
1038  * entry_INT80_compat.)
1039  *
1040  * This is considered a slow path.  It is not used by most libc
1041  * implementations on modern hardware except during process startup.
1042  *
1043  * Arguments:
1044  * eax  system call number
1045  * ebx  arg1
1046  * ecx  arg2
1047  * edx  arg3
1048  * esi  arg4
1049  * edi  arg5
1050  * ebp  arg6
1051  */
1052 SYM_FUNC_START(entry_INT80_32)
1053         ASM_CLAC
1054         pushl   %eax                    /* pt_regs->orig_ax */
1055
1056         SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1    /* save rest */
1057
1058         movl    %esp, %eax
1059         call    do_int80_syscall_32
1060 .Lsyscall_32_done:
1061         STACKLEAK_ERASE
1062
1063 restore_all_switch_stack:
1064         SWITCH_TO_ENTRY_STACK
1065         CHECK_AND_APPLY_ESPFIX
1066
1067         /* Switch back to user CR3 */
1068         SWITCH_TO_USER_CR3 scratch_reg=%eax
1069
1070         BUG_IF_WRONG_CR3
1071
1072         /* Restore user state */
1073         RESTORE_REGS pop=4                      # skip orig_eax/error_code
1074 .Lirq_return:
1075         /*
1076          * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
1077          * when returning from IPI handler and when returning from
1078          * scheduler to user-space.
1079          */
1080         INTERRUPT_RETURN
1081
1082 .section .fixup, "ax"
1083 SYM_CODE_START(asm_iret_error)
1084         pushl   $0                              # no error code
1085         pushl   $iret_error
1086
1087 #ifdef CONFIG_DEBUG_ENTRY
1088         /*
1089          * The stack-frame here is the one that iret faulted on, so its a
1090          * return-to-user frame. We are on kernel-cr3 because we come here from
1091          * the fixup code. This confuses the CR3 checker, so switch to user-cr3
1092          * as the checker expects it.
1093          */
1094         pushl   %eax
1095         SWITCH_TO_USER_CR3 scratch_reg=%eax
1096         popl    %eax
1097 #endif
1098
1099         jmp     handle_exception
1100 SYM_CODE_END(asm_iret_error)
1101 .previous
1102         _ASM_EXTABLE(.Lirq_return, asm_iret_error)
1103 SYM_FUNC_END(entry_INT80_32)
1104
1105 .macro FIXUP_ESPFIX_STACK
1106 /*
1107  * Switch back for ESPFIX stack to the normal zerobased stack
1108  *
1109  * We can't call C functions using the ESPFIX stack. This code reads
1110  * the high word of the segment base from the GDT and swiches to the
1111  * normal stack and adjusts ESP with the matching offset.
1112  *
1113  * We might be on user CR3 here, so percpu data is not mapped and we can't
1114  * access the GDT through the percpu segment.  Instead, use SGDT to find
1115  * the cpu_entry_area alias of the GDT.
1116  */
1117 #ifdef CONFIG_X86_ESPFIX32
1118         /* fixup the stack */
1119         pushl   %ecx
1120         subl    $2*4, %esp
1121         sgdt    (%esp)
1122         movl    2(%esp), %ecx                           /* GDT address */
1123         /*
1124          * Careful: ECX is a linear pointer, so we need to force base
1125          * zero.  %cs is the only known-linear segment we have right now.
1126          */
1127         mov     %cs:GDT_ESPFIX_OFFSET + 4(%ecx), %al    /* bits 16..23 */
1128         mov     %cs:GDT_ESPFIX_OFFSET + 7(%ecx), %ah    /* bits 24..31 */
1129         shl     $16, %eax
1130         addl    $2*4, %esp
1131         popl    %ecx
1132         addl    %esp, %eax                      /* the adjusted stack pointer */
1133         pushl   $__KERNEL_DS
1134         pushl   %eax
1135         lss     (%esp), %esp                    /* switch to the normal stack segment */
1136 #endif
1137 .endm
1138
1139 .macro UNWIND_ESPFIX_STACK
1140         /* It's safe to clobber %eax, all other regs need to be preserved */
1141 #ifdef CONFIG_X86_ESPFIX32
1142         movl    %ss, %eax
1143         /* see if on espfix stack */
1144         cmpw    $__ESPFIX_SS, %ax
1145         jne     .Lno_fixup_\@
1146         /* switch to normal stack */
1147         FIXUP_ESPFIX_STACK
1148 .Lno_fixup_\@:
1149 #endif
1150 .endm
1151
1152 SYM_CODE_START_LOCAL_NOALIGN(handle_exception)
1153         /* the function address is in %gs's slot on the stack */
1154         SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
1155         ENCODE_FRAME_POINTER
1156
1157         /* fixup %gs */
1158         GS_TO_REG %ecx
1159         movl    PT_GS(%esp), %edi               # get the function address
1160         REG_TO_PTGS %ecx
1161         SET_KERNEL_GS %ecx
1162
1163         /* fixup orig %eax */
1164         movl    PT_ORIG_EAX(%esp), %edx         # get the error code
1165         movl    $-1, PT_ORIG_EAX(%esp)          # no syscall to restart
1166
1167         movl    %esp, %eax                      # pt_regs pointer
1168         CALL_NOSPEC edi
1169
1170 handle_exception_return:
1171 #ifdef CONFIG_VM86
1172         movl    PT_EFLAGS(%esp), %eax           # mix EFLAGS and CS
1173         movb    PT_CS(%esp), %al
1174         andl    $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
1175 #else
1176         /*
1177          * We can be coming here from child spawned by kernel_thread().
1178          */
1179         movl    PT_CS(%esp), %eax
1180         andl    $SEGMENT_RPL_MASK, %eax
1181 #endif
1182         cmpl    $USER_RPL, %eax                 # returning to v8086 or userspace ?
1183         jnb     ret_to_user
1184
1185         PARANOID_EXIT_TO_KERNEL_MODE
1186         BUG_IF_WRONG_CR3
1187         RESTORE_REGS 4
1188         jmp     .Lirq_return
1189
1190 ret_to_user:
1191         movl    %esp, %eax
1192         jmp     restore_all_switch_stack
1193 SYM_CODE_END(handle_exception)
1194
1195 SYM_CODE_START(asm_exc_double_fault)
1196 1:
1197         /*
1198          * This is a task gate handler, not an interrupt gate handler.
1199          * The error code is on the stack, but the stack is otherwise
1200          * empty.  Interrupts are off.  Our state is sane with the following
1201          * exceptions:
1202          *
1203          *  - CR0.TS is set.  "TS" literally means "task switched".
1204          *  - EFLAGS.NT is set because we're a "nested task".
1205          *  - The doublefault TSS has back_link set and has been marked busy.
1206          *  - TR points to the doublefault TSS and the normal TSS is busy.
1207          *  - CR3 is the normal kernel PGD.  This would be delightful, except
1208          *    that the CPU didn't bother to save the old CR3 anywhere.  This
1209          *    would make it very awkward to return back to the context we came
1210          *    from.
1211          *
1212          * The rest of EFLAGS is sanitized for us, so we don't need to
1213          * worry about AC or DF.
1214          *
1215          * Don't even bother popping the error code.  It's always zero,
1216          * and ignoring it makes us a bit more robust against buggy
1217          * hypervisor task gate implementations.
1218          *
1219          * We will manually undo the task switch instead of doing a
1220          * task-switching IRET.
1221          */
1222
1223         clts                            /* clear CR0.TS */
1224         pushl   $X86_EFLAGS_FIXED
1225         popfl                           /* clear EFLAGS.NT */
1226
1227         call    doublefault_shim
1228
1229         /* We don't support returning, so we have no IRET here. */
1230 1:
1231         hlt
1232         jmp 1b
1233 SYM_CODE_END(asm_exc_double_fault)
1234
1235 /*
1236  * NMI is doubly nasty.  It can happen on the first instruction of
1237  * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
1238  * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32
1239  * switched stacks.  We handle both conditions by simply checking whether we
1240  * interrupted kernel code running on the SYSENTER stack.
1241  */
1242 SYM_CODE_START(asm_exc_nmi)
1243         ASM_CLAC
1244
1245 #ifdef CONFIG_X86_ESPFIX32
1246         /*
1247          * ESPFIX_SS is only ever set on the return to user path
1248          * after we've switched to the entry stack.
1249          */
1250         pushl   %eax
1251         movl    %ss, %eax
1252         cmpw    $__ESPFIX_SS, %ax
1253         popl    %eax
1254         je      .Lnmi_espfix_stack
1255 #endif
1256
1257         pushl   %eax                            # pt_regs->orig_ax
1258         SAVE_ALL_NMI cr3_reg=%edi
1259         ENCODE_FRAME_POINTER
1260         xorl    %edx, %edx                      # zero error code
1261         movl    %esp, %eax                      # pt_regs pointer
1262
1263         /* Are we currently on the SYSENTER stack? */
1264         movl    PER_CPU_VAR(cpu_entry_area), %ecx
1265         addl    $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
1266         subl    %eax, %ecx      /* ecx = (end of entry_stack) - esp */
1267         cmpl    $SIZEOF_entry_stack, %ecx
1268         jb      .Lnmi_from_sysenter_stack
1269
1270         /* Not on SYSENTER stack. */
1271         call    exc_nmi
1272         jmp     .Lnmi_return
1273
1274 .Lnmi_from_sysenter_stack:
1275         /*
1276          * We're on the SYSENTER stack.  Switch off.  No one (not even debug)
1277          * is using the thread stack right now, so it's safe for us to use it.
1278          */
1279         movl    %esp, %ebx
1280         movl    PER_CPU_VAR(cpu_current_top_of_stack), %esp
1281         call    exc_nmi
1282         movl    %ebx, %esp
1283
1284 .Lnmi_return:
1285 #ifdef CONFIG_X86_ESPFIX32
1286         testl   $CS_FROM_ESPFIX, PT_CS(%esp)
1287         jnz     .Lnmi_from_espfix
1288 #endif
1289
1290         CHECK_AND_APPLY_ESPFIX
1291         RESTORE_ALL_NMI cr3_reg=%edi pop=4
1292         jmp     .Lirq_return
1293
1294 #ifdef CONFIG_X86_ESPFIX32
1295 .Lnmi_espfix_stack:
1296         /*
1297          * Create the pointer to LSS back
1298          */
1299         pushl   %ss
1300         pushl   %esp
1301         addl    $4, (%esp)
1302
1303         /* Copy the (short) IRET frame */
1304         pushl   4*4(%esp)       # flags
1305         pushl   4*4(%esp)       # cs
1306         pushl   4*4(%esp)       # ip
1307
1308         pushl   %eax            # orig_ax
1309
1310         SAVE_ALL_NMI cr3_reg=%edi unwind_espfix=1
1311         ENCODE_FRAME_POINTER
1312
1313         /* clear CS_FROM_KERNEL, set CS_FROM_ESPFIX */
1314         xorl    $(CS_FROM_ESPFIX | CS_FROM_KERNEL), PT_CS(%esp)
1315
1316         xorl    %edx, %edx                      # zero error code
1317         movl    %esp, %eax                      # pt_regs pointer
1318         jmp     .Lnmi_from_sysenter_stack
1319
1320 .Lnmi_from_espfix:
1321         RESTORE_ALL_NMI cr3_reg=%edi
1322         /*
1323          * Because we cleared CS_FROM_KERNEL, IRET_FRAME 'forgot' to
1324          * fix up the gap and long frame:
1325          *
1326          *  3 - original frame  (exception)
1327          *  2 - ESPFIX block    (above)
1328          *  6 - gap             (FIXUP_FRAME)
1329          *  5 - long frame      (FIXUP_FRAME)
1330          *  1 - orig_ax
1331          */
1332         lss     (1+5+6)*4(%esp), %esp                   # back to espfix stack
1333         jmp     .Lirq_return
1334 #endif
1335 SYM_CODE_END(asm_exc_nmi)
1336
1337 .pushsection .text, "ax"
1338 SYM_CODE_START(rewind_stack_do_exit)
1339         /* Prevent any naive code from trying to unwind to our caller. */
1340         xorl    %ebp, %ebp
1341
1342         movl    PER_CPU_VAR(cpu_current_top_of_stack), %esi
1343         leal    -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp
1344
1345         call    do_exit
1346 1:      jmp 1b
1347 SYM_CODE_END(rewind_stack_do_exit)
1348 .popsection