arch/x86/crypto/aesni-intel_asm.S

   1 /* SPDX-License-Identifier: GPL-2.0-or-later */
   2 /*
   3  * Implement AES algorithm in Intel AES-NI instructions.
   4  *
   5  * The white paper of AES-NI instructions can be downloaded from:
   6  *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
   7  *
   8  * Copyright (C) 2008, Intel Corp.
   9  *    Author: Huang Ying <ying.huang@intel.com>
  10  *            Vinodh Gopal <vinodh.gopal@intel.com>
  11  *            Kahraman Akdemir
  12  *
  13  * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
  14  * interface for 64-bit kernels.
  15  *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
  16  *             Aidan O'Mahony (aidan.o.mahony@intel.com)
  17  *             Adrian Hoban <adrian.hoban@intel.com>
  18  *             James Guilford (james.guilford@intel.com)
  19  *             Gabriele Paoloni <gabriele.paoloni@intel.com>
  20  *             Tadeusz Struk (tadeusz.struk@intel.com)
  21  *             Wajdi Feghali (wajdi.k.feghali@intel.com)
  22  *    Copyright (c) 2010, Intel Corporation.
  23  *
  24  * Ported x86_64 version to x86:
  25  *    Author: Mathias Krause <minipli@googlemail.com>
  26  */
  27
  28 #include <linux/linkage.h>
  29 #include <asm/frame.h>
  30 #include <asm/nospec-branch.h>
  31
  32 /*
  33  * The following macros are used to move an (un)aligned 16 byte value to/from
  34  * an XMM register.  This can be done for either FP or integer values: for FP
  35  * use movaps (move aligned packed single), for integer use movdqa (move double
  36  * quad aligned).  It doesn't make a performance difference which instruction is
  37  * used since Nehalem (original Core i7) was released.  However, movaps is a byte
  38  * shorter, so that is the one we'll use for now (same for the unaligned forms).
  39  */
  40 #define MOVADQ  movaps  /* aligned 16-byte move */
  41 #define MOVUDQ  movups  /* unaligned 16-byte move */
  42
  43 #ifdef __x86_64__
  44
  45 # constants in mergeable sections, linker can reorder and merge
  46 .section        .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
  47 .align 16
  48 .Lgf128mul_x_ble_mask:
  49         .octa 0x00000000000000010000000000000087
  50 .section        .rodata.cst16.POLY, "aM", @progbits, 16
  51 .align 16
  52 POLY:   .octa 0xC2000000000000000000000000000001 # XORed in by PRECOMPUTE when reducing HashKey<<1
  53 .section        .rodata.cst16.TWOONE, "aM", @progbits, 16
  54 .align 16
  55 TWOONE: .octa 0x00000001000000000000000000000001 # pcmpeqd pattern used by PRECOMPUTE to build the POLY mask
  56
  57 .section        .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
  58 .align 16
  59 SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F # pshufb mask that reverses the 16 bytes
  60 .section        .rodata.cst16.MASK1, "aM", @progbits, 16
  61 .align 16
  62 MASK1:      .octa 0x0000000000000000ffffffffffffffff
  63 .section        .rodata.cst16.MASK2, "aM", @progbits, 16
  64 .align 16
  65 MASK2:      .octa 0xffffffffffffffff0000000000000000
  66 .section        .rodata.cst16.ONE, "aM", @progbits, 16
  67 .align 16
  68 ONE:        .octa 0x00000000000000000000000000000001 # counter increment, added with paddd
  69 .section        .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
  70 .align 16
  71 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
  72 .section        .rodata.cst16.dec, "aM", @progbits, 16
  73 .align 16
  74 dec:        .octa 0x1
  75 .section        .rodata.cst16.enc, "aM", @progbits, 16
  76 .align 16
  77 enc:        .octa 0x2
  78
  79 # order of these constants should not change.
  80 # more specifically, ALL_F should follow SHIFT_MASK,
  81 # and zero should follow ALL_F (code loads 16 bytes starting at byte offsets inside them)
  82 .section        .rodata, "a", @progbits
  83 .align 16
  84 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 # identity pshufb mask; an offset load yields a byte-shift mask
  85 ALL_F:      .octa 0xffffffffffffffffffffffffffffffff # all ones; an offset load yields a byte-select mask
  86             .octa 0x00000000000000000000000000000000 # zeros that must follow ALL_F
  87
  88 .text
  87
  88 .text
  89
  90
  91 #define STACK_OFFSET    8*3     /* bytes pushed by FUNC_SAVE (r12, r13, r14) */
  92 /* Byte offsets of the GCM context fields that are addressed through %arg2. */
  93 #define AadHash 16*0            /* running GHASH value */
  94 #define AadLen 16*1             /* AAD length in bytes */
  95 #define InLen (16*1)+8          /* input length in bytes, accumulated by GCM_ENC_DEC */
  96 #define PBlockEncKey 16*2       /* E(K, Yn) kept for a pending partial block */
  97 #define OrigIV 16*3             /* IV block exactly as supplied to GCM_INIT */
  98 #define CurCount 16*4           /* current counter block */
  99 #define PBlockLen 16*5          /* number of bytes in the pending partial block */
 100 #define HashKey         16*6    // store HashKey <<1 mod poly here
 101 #define HashKey_2       16*7    // store HashKey^2 <<1 mod poly here
 102 #define HashKey_3       16*8    // store HashKey^3 <<1 mod poly here
 103 #define HashKey_4       16*9    // store HashKey^4 <<1 mod poly here
 104 #define HashKey_k       16*10   // store XOR of High 64 bits and Low 64
 105                                 // bits of  HashKey <<1 mod poly here
 106                                 //(for Karatsuba purposes)
 107 #define HashKey_2_k     16*11   // store XOR of High 64 bits and Low 64
 108                                 // bits of  HashKey^2 <<1 mod poly here
 109                                 // (for Karatsuba purposes)
 110 #define HashKey_3_k     16*12   // store XOR of High 64 bits and Low 64
 111                                 // bits of  HashKey^3 <<1 mod poly here
 112                                 // (for Karatsuba purposes)
 113 #define HashKey_4_k     16*13   // store XOR of High 64 bits and Low 64
 114                                 // bits of  HashKey^4 <<1 mod poly here
 115                                 // (for Karatsuba purposes)
 116 /* Arguments 1-6 are register names without the % sign; callers write %arg1 etc. */
 117 #define arg1 rdi
 118 #define arg2 rsi
 119 #define arg3 rdx
 120 #define arg4 rcx
 121 #define arg5 r8
 122 #define arg6 r9
 123 #define arg7 STACK_OFFSET+8(%rsp)       /* stack arguments: skip the FUNC_SAVE pushes plus 8 more bytes (presumably the return address) */
 124 #define arg8 STACK_OFFSET+16(%rsp)
 125 #define arg9 STACK_OFFSET+24(%rsp)
 126 #define arg10 STACK_OFFSET+32(%rsp)
 127 #define arg11 STACK_OFFSET+40(%rsp)
 128 #define keysize 2*15*16(%arg1)          /* key length in bytes, read 480 bytes into the key context at %arg1 */
 129 #endif
 130
 131
 132 #define STATE1  %xmm0
 133 #define STATE2  %xmm4
 134 #define STATE3  %xmm5
 135 #define STATE4  %xmm6
 136 #define STATE   STATE1  /* single-block name for STATE1 */
 137 #define IN1     %xmm1
 138 #define IN2     %xmm7
 139 #define IN3     %xmm8
 140 #define IN4     %xmm9
 141 #define IN      IN1     /* single-block name for IN1 */
 142 #define KEY     %xmm2
 143 #define IV      %xmm3
 144
 145 #define BSWAP_MASK %xmm10       /* same register as GF128MUL_MASK below */
 146 #define CTR     %xmm11
 147 #define INC     %xmm12
 148
 149 #define GF128MUL_MASK %xmm10    /* same register as BSWAP_MASK above */
 150
 151 #ifdef __x86_64__       /* 64-bit register assignment */
 152 #define AREG    %rax
 153 #define KEYP    %rdi
 154 #define OUTP    %rsi
 155 #define UKEYP   OUTP    /* shares the register of OUTP */
 156 #define INP     %rdx
 157 #define LEN     %rcx
 158 #define IVP     %r8
 159 #define KLEN    %r9d
 160 #define T1      %r10
 161 #define TKEYP   T1      /* shares the register of T1 */
 162 #define T2      %r11    /* T2 and TCTR_LOW exist only in the 64-bit set */
 163 #define TCTR_LOW T2
 164 #else                   /* 32-bit register assignment */
 165 #define AREG    %eax
 166 #define KEYP    %edi
 167 #define OUTP    AREG    /* on 32-bit OUTP shares the register of AREG */
 168 #define UKEYP   OUTP
 169 #define INP     %edx
 170 #define LEN     %esi
 171 #define IVP     %ebp
 172 #define KLEN    %ebx
 173 #define T1      %ecx
 174 #define TKEYP   T1
 175 #endif                  /* __x86_64__ */
 176
 177 .macro FUNC_SAVE
 178         push    %r12                    # r12-r14 are saved: three 8-byte pushes, which is what STACK_OFFSET accounts for
 179         push    %r13
 180         push    %r14
 181 #
 182 # states of %xmm registers %xmm6:%xmm15 not saved
 183 # all %xmm registers are clobbered
 184 #
 185 .endm
 186
 187
 188 .macro FUNC_RESTORE
 189         pop     %r14                    # pops mirror the pushes of FUNC_SAVE in reverse order
 190         pop     %r13
 191         pop     %r12
 192 .endm
 193
 194 # Precompute hashkeys.
 195 # Input: SUBKEY points to the hash subkey; TMP1-TMP7 are scratch xmm registers.
 196 # Output: HashKey^1..4 (<<1 mod poly) and their high^low XOR values are stored
 197 # in gcm_context_data at %arg2.  Only needs to be called once per key.
 198 # clobbers r12, and tmp xmm registers.
 199 .macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
 200         mov     \SUBKEY, %r12
 201         movdqu  (%r12), \TMP3
 202         movdqa  SHUF_MASK(%rip), \TMP2
 203         pshufb  \TMP2, \TMP3            # byte-reverse the subkey
 204
 205         # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
 206
 207         movdqa  \TMP3, \TMP2
 208         psllq   $1, \TMP3               # shift each 64-bit half left by one
 209         psrlq   $63, \TMP2              # TMP2 = the bit shifted out of each half
 210         movdqa  \TMP2, \TMP1
 211         pslldq  $8, \TMP2               # carry of the low half moves into the high half
 212         psrldq  $8, \TMP1               # TMP1 = carry out of the high half
 213         por     \TMP2, \TMP3            # TMP3 = HashKey<<1 as a 128-bit shift
 214
 215         # reduce HashKey<<1
 216
 217         pshufd  $0x24, \TMP1, \TMP2
 218         pcmpeqd TWOONE(%rip), \TMP2     # build a mask from the carry bit
 219         pand    POLY(%rip), \TMP2       # TMP2 = POLY or zero, depending on that mask
 220         pxor    \TMP2, \TMP3
 221         movdqu  \TMP3, HashKey(%arg2)
 222
 223         movdqa     \TMP3, \TMP5
 224         pshufd     $78, \TMP3, \TMP1    # swap the 64-bit halves
 225         pxor       \TMP3, \TMP1         # TMP1 = high half XOR low half
 226         movdqu     \TMP1, HashKey_k(%arg2)
 227
 228         GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 229 # TMP5 = HashKey^2<<1 (mod poly)
 230         movdqu     \TMP5, HashKey_2(%arg2)
 231 # HashKey_2 = HashKey^2<<1 (mod poly)
 232         pshufd     $78, \TMP5, \TMP1
 233         pxor       \TMP5, \TMP1
 234         movdqu     \TMP1, HashKey_2_k(%arg2)
 235
 236         GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 237 # TMP5 = HashKey^3<<1 (mod poly)
 238         movdqu     \TMP5, HashKey_3(%arg2)
 239         pshufd     $78, \TMP5, \TMP1
 240         pxor       \TMP5, \TMP1
 241         movdqu     \TMP1, HashKey_3_k(%arg2)
 242
 243         GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 244 # TMP5 = HashKey^4<<1 (mod poly)
 245         movdqu     \TMP5, HashKey_4(%arg2)
 246         pshufd     $78, \TMP5, \TMP1
 247         pxor       \TMP5, \TMP1
 248         movdqu     \TMP1, HashKey_4_k(%arg2)
 249 .endm
 250
 251 # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
 252 # Clobbers rax, r10-r13 and xmm0-xmm7, %xmm13, %xmm14
 253 .macro GCM_INIT Iv SUBKEY AAD AADLEN
 254         mov \AADLEN, %r11
 255         mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
 256         xor %r11d, %r11d
 257         mov %r11, InLen(%arg2) # ctx_data.in_length = 0
 258         mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
 259         mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0 (only the low 8 bytes are written)
 260         mov \Iv, %rax
 261         movdqu (%rax), %xmm0
 262         movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
 263
 264         movdqa  SHUF_MASK(%rip), %xmm2
 265         pshufb %xmm2, %xmm0
 266         movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = byte-reversed iv
 267
 268         PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
 269         movdqu HashKey(%arg2), %xmm13   # reload the key that PRECOMPUTE just stored
 270
 271         CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
 272         %xmm4, %xmm5, %xmm6
 273 .endm
 274
 275 # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
 276 # struct has been initialized by GCM_INIT.  Writes output through %arg3, reads
 277 # input through %arg4 and takes the byte count in %arg5 (which is modified).
 278 # Requires at least 1 input byte (READ_PARTIAL_BLOCK). Clobbers rax, r10-r13, and xmm0-xmm15
 279 .macro GCM_ENC_DEC operation
 280         movdqu AadHash(%arg2), %xmm8
 281         movdqu HashKey(%arg2), %xmm13
 282         add %arg5, InLen(%arg2)
 283
 284         xor %r11d, %r11d # initialise the data pointer offset as zero
 285         PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
 286
 287         sub %r11, %arg5         # sub partial block data used
 288         mov %arg5, %r13         # save the number of bytes
 289
 290         and $-16, %r13          # %r13 = %r13 - (%r13 mod 16)
 291         mov %r13, %r12
 292         # Encrypt/Decrypt first few blocks
 293
 294         and     $(3<<4), %r12           # r12 = 16 * (number of whole blocks mod 4)
 295         jz      _initial_num_blocks_is_0_\@
 296         cmp     $(2<<4), %r12
 297         jb      _initial_num_blocks_is_1_\@
 298         je      _initial_num_blocks_is_2_\@
 299 _initial_num_blocks_is_3_\@:
 300         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 301 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
 302         sub     $48, %r13
 303         jmp     _initial_blocks_\@
 304 _initial_num_blocks_is_2_\@:
 305         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 306 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
 307         sub     $32, %r13
 308         jmp     _initial_blocks_\@
 309 _initial_num_blocks_is_1_\@:
 310         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 311 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
 312         sub     $16, %r13
 313         jmp     _initial_blocks_\@
 314 _initial_num_blocks_is_0_\@:
 315         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 316 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
 317 _initial_blocks_\@:
 318
 319         # Main loop - Encrypt/Decrypt remaining blocks
 320
 321         cmp     $0, %r13                # r13 = remaining whole-block bytes, a multiple of 64 here
 322         je      _zero_cipher_left_\@
 323         sub     $64, %r13
 324         je      _four_cipher_left_\@
 325 _crypt_by_4_\@:
 326         GHASH_4_ENCRYPT_4_PARALLEL_\operation   %xmm9, %xmm10, %xmm11, %xmm12, \
 327         %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
 328         %xmm7, %xmm8, enc
 329         add     $64, %r11
 330         sub     $64, %r13
 331         jne     _crypt_by_4_\@
 332 _four_cipher_left_\@:
 333         GHASH_LAST_4    %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
 334 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
 335 _zero_cipher_left_\@:
 336         movdqu %xmm8, AadHash(%arg2)
 337         movdqu %xmm0, CurCount(%arg2)
 338
 339         mov     %arg5, %r13
 340         and     $15, %r13                       # %r13 = arg5 (mod 16)
 341         je      _multiple_of_16_bytes_\@
 342
 343         mov %r13, PBlockLen(%arg2)              # remember the size of the trailing partial block
 344
 345         # Handle the last <16 Byte block separately
 346         paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
 347         movdqu %xmm0, CurCount(%arg2)
 348         movdqa SHUF_MASK(%rip), %xmm10
 349         pshufb %xmm10, %xmm0
 350
 351         ENCRYPT_SINGLE_BLOCK    %xmm0, %xmm1        # Encrypt(K, Yn)
 352         movdqu %xmm0, PBlockEncKey(%arg2)
 353
 354         cmp     $16, %arg5              # fewer than 16 bytes in total: read the tail byte by byte
 355         jge _large_enough_update_\@
 356
 357         lea (%arg4,%r11,1), %r10
 358         mov %r13, %r12
 359         READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
 360         jmp _data_read_\@
 361
 362 _large_enough_update_\@:
 363         sub     $16, %r11               # step back so the 16-byte load ends at the last input byte
 364         add     %r13, %r11
 365
 366         # receive the last <16 Byte block
 367         movdqu  (%arg4, %r11, 1), %xmm1
 368
 369         sub     %r13, %r11              # restore r11 to the start of the partial block
 370         add     $16, %r11
 371
 372         lea     SHIFT_MASK+16(%rip), %r12
 373         # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
 374         # (r13 is the number of bytes in plaintext mod 16)
 375         sub     %r13, %r12
 376         # get the appropriate shuffle mask
 377         movdqu  (%r12), %xmm2
 378         # shift right 16-r13 bytes
 379         pshufb  %xmm2, %xmm1
 380
 381 _data_read_\@:
 382         lea ALL_F+16(%rip), %r12
 383         sub %r13, %r12                  # r12 points at r13 bytes of 0xff followed by zeros
 384
 385 .ifc \operation, dec
 386         movdqa  %xmm1, %xmm2            # keep the ciphertext for the hash
 387 .endif
 388         pxor    %xmm1, %xmm0            # XOR Encrypt(K, Yn)
 389         movdqu  (%r12), %xmm1
 390         # get the appropriate mask to mask out top 16-r13 bytes of xmm0
 391         pand    %xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
 392 .ifc \operation, dec
 393         pand    %xmm1, %xmm2
 394         movdqa SHUF_MASK(%rip), %xmm10
 395         pshufb %xmm10 ,%xmm2
 396
 397         pxor %xmm2, %xmm8
 398 .else
 399         movdqa SHUF_MASK(%rip), %xmm10
 400         pshufb %xmm10,%xmm0
 401
 402         pxor    %xmm0, %xmm8
 403 .endif
 404
 405         movdqu %xmm8, AadHash(%arg2)    # the multiply for this block is deferred (see PBlockLen)
 406 .ifc \operation, enc
 407         # GHASH computation for the last <16 byte block
 408         movdqa SHUF_MASK(%rip), %xmm10
 409         # shuffle xmm0 back to output as ciphertext
 410         pshufb %xmm10, %xmm0
 411 .endif
 412
 413         # Output %r13 bytes
 414         movq %xmm0, %rax
 415         cmp $8, %r13
 416         jle _less_than_8_bytes_left_\@
 417         mov %rax, (%arg3 , %r11, 1)     # more than 8 bytes: store the low 8 at once
 418         add $8, %r11
 419         psrldq $8, %xmm0
 420         movq %xmm0, %rax
 421         sub $8, %r13
 422 _less_than_8_bytes_left_\@:
 423         mov %al,  (%arg3, %r11, 1)      # store the remaining 1 to 8 bytes one at a time
 424         add $1, %r11
 425         shr $8, %rax
 426         sub $1, %r13
 427         jne _less_than_8_bytes_left_\@
 428 _multiple_of_16_bytes_\@:
 429 .endm
 430
 431 # GCM_COMPLETE folds a pending partial block and the len(A)||len(C) block into
 432 # the GHASH, XORs in E(K, Y0) and writes AUTHTAGLEN bytes of tag to AUTHTAG.
 433 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
 434 .macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
 435         movdqu AadHash(%arg2), %xmm8
 436         movdqu HashKey(%arg2), %xmm13
 437
 438         mov PBlockLen(%arg2), %r12
 439
 440         cmp $0, %r12
 441         je _partial_done\@
 442
 443         GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6   # finish the multiply deferred for the partial block
 444
 445 _partial_done\@:
 446         mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
 447         shl     $3, %r12                  # convert into number of bits
 448         movq    %r12, %xmm15              # len(A) in %xmm15, all 64 bits (movd of %r12d dropped the upper half)
 449         mov InLen(%arg2), %r12
 450         shl     $3, %r12                  # len(C) in bits
 451         movq    %r12, %xmm1
 452
 453         pslldq  $8, %xmm15                # %xmm15 = len(A)||0x0000000000000000
 454         pxor    %xmm1, %xmm15             # %xmm15 = len(A)||len(C)
 455         pxor    %xmm15, %xmm8
 456         GHASH_MUL       %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
 457         # final GHASH computation
 458         movdqa SHUF_MASK(%rip), %xmm10
 459         pshufb %xmm10, %xmm8
 460
 461         movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
 462         ENCRYPT_SINGLE_BLOCK    %xmm0,  %xmm1     # E(K, Y0)
 463         pxor    %xmm8, %xmm0              # %xmm0 = full 16-byte tag
 464 _return_T_\@:
 465         mov     \AUTHTAG, %r10                     # %r10 = authTag
 466         mov     \AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
 467         cmp     $16, %r11
 468         je      _T_16_\@
 469         cmp     $8, %r11
 470         jl      _T_4_\@
 471 _T_8_\@:
 472         movq    %xmm0, %rax               # write 8 tag bytes, then continue with the rest
 473         mov     %rax, (%r10)
 474         add     $8, %r10
 475         sub     $8, %r11
 476         psrldq  $8, %xmm0
 477         cmp     $0, %r11
 478         je      _return_T_done_\@
 479 _T_4_\@:
 480         movd    %xmm0, %eax               # write 4 tag bytes (also taken for lengths 1-3, so 4 bytes are stored there too)
 481         mov     %eax, (%r10)
 482         add     $4, %r10
 483         sub     $4, %r11
 484         psrldq  $4, %xmm0
 485         cmp     $0, %r11
 486         je      _return_T_done_\@
 487 _T_123_\@:
 488         movd    %xmm0, %eax               # 1 to 3 tag bytes remain
 489         cmp     $2, %r11
 490         jl      _T_1_\@
 491         mov     %ax, (%r10)
 492         cmp     $2, %r11
 493         je      _return_T_done_\@
 494         add     $2, %r10
 495         sar     $16, %eax
 496 _T_1_\@:
 497         mov     %al, (%r10)
 498         jmp     _return_T_done_\@
 499 _T_16_\@:
 500         movdqu  %xmm0, (%r10)
 501 _return_T_done_\@:
 502 .endm
 503
 504 #ifdef __x86_64__
 505 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 506 *
 507 * GH is both input A and the output; HK (input B) is only read.
 508 * Input: A and B (128-bits each, bit-reflected)
 509 * Output: C = A*B*x mod poly, (i.e. >>1 )
 510 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 511 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 512 * TMP1-TMP5 are scratch xmm registers and are clobbered.
 513 */
 514 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
 515         movdqa    \GH, \TMP1
 516         pshufd    $78, \GH, \TMP2
 517         pshufd    $78, \HK, \TMP3
 518         pxor      \GH, \TMP2            # TMP2 = a1+a0
 519         pxor      \HK, \TMP3            # TMP3 = b1+b0
 520         pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
 521         pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
 522         pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
 523         pxor      \GH, \TMP2
 524         pxor      \TMP1, \TMP2          # TMP2 = (a0*b1)+(a1*b0)
 525         movdqa    \TMP2, \TMP3
 526         pslldq    $8, \TMP3             # left shift TMP3 2 DWs
 527         psrldq    $8, \TMP2             # right shift TMP2 2 DWs
 528         pxor      \TMP3, \GH
 529         pxor      \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
 530
 531         # first phase of the reduction
 532
 533         movdqa    \GH, \TMP2
 534         movdqa    \GH, \TMP3
 535         movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
 536                                         # in order to perform
 537                                         # independent shifts
 538         pslld     $31, \TMP2            # packed left shift <<31
 539         pslld     $30, \TMP3            # packed left shift <<30
 540         pslld     $25, \TMP4            # packed left shift <<25
 541         pxor      \TMP3, \TMP2          # xor the shifted versions
 542         pxor      \TMP4, \TMP2
 543         movdqa    \TMP2, \TMP5
 544         psrldq    $4, \TMP5             # right shift TMP5 1 DW
 545         pslldq    $12, \TMP2            # left shift TMP2 3 DWs
 546         pxor      \TMP2, \GH
 547
 548         # second phase of the reduction
 549
 550         movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
 551                                         # in order to perform
 552                                         # independent shifts
 553         movdqa    \GH,\TMP3
 554         movdqa    \GH,\TMP4
 555         psrld     $1,\TMP2              # packed right shift >>1
 556         psrld     $2,\TMP3              # packed right shift >>2
 557         psrld     $7,\TMP4              # packed right shift >>7
 558         pxor      \TMP3,\TMP2           # xor the shifted versions
 559         pxor      \TMP4,\TMP2
 560         pxor      \TMP5, \TMP2
 561         pxor      \TMP2, \GH
 562         pxor      \TMP1, \GH            # result is in GH
 563 .endm
 564
 565 # Reads DLEN bytes starting at DPTR and stores in XMMDst
 566 # where 0 < DLEN < 16 (never reads beyond DPTR+DLEN; unread high bytes of XMMDst are zero)
 567 # Clobbers %rax, DLEN and XMM1
 568 .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
 569         cmp $8, \DLEN
 570         jl _read_lt8_\@
 571         mov (\DPTR), %rax               # 8 or more bytes: load the first 8 in one go
 572         movq %rax, \XMMDst
 573         sub $8, \DLEN                   # DLEN = bytes still to read (0 to 7)
 574         jz _done_read_partial_block_\@
 575         xor %eax, %eax
 576 _read_next_byte_\@:
 577         shl $8, %rax
 578         mov 7(\DPTR, \DLEN, 1), %al     # fetch bytes from the highest address downwards
 579         dec \DLEN
 580         jnz _read_next_byte_\@
 581         movq %rax, \XMM1
 582         pslldq $8, \XMM1                # place the tail bytes in the upper qword
 583         por \XMM1, \XMMDst
 584         jmp _done_read_partial_block_\@
 585 _read_lt8_\@:
 586         xor %eax, %eax
 587 _read_next_byte_lt8_\@:
 588         shl $8, %rax
 589         mov -1(\DPTR, \DLEN, 1), %al    # fetch bytes from the highest address downwards
 590         dec \DLEN
 591         jnz _read_next_byte_lt8_\@
 592         movq %rax, \XMMDst              # movq also zeroes the upper qword
 593 _done_read_partial_block_\@:
 594 .endm
 595
 596 # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted (AADLEN bytes at AAD) and stores it in AadHash(%arg2).
 597 # clobbers r10-11, xmm14, the TMP1-TMP7 registers, and rax when a partial last block is read
 598 .macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
 599         TMP6 TMP7
 600         MOVADQ     SHUF_MASK(%rip), %xmm14
 601         mov        \AAD, %r10           # %r10 = AAD
 602         mov        \AADLEN, %r11                # %r11 = aadLen
 603         pxor       \TMP7, \TMP7
 604         pxor       \TMP6, \TMP6         # TMP6 = running hash, starts at zero
 605
 606         cmp        $16, %r11
 607         jl         _get_AAD_rest\@
 608 _get_AAD_blocks\@:
 609         movdqu     (%r10), \TMP7
 610         pshufb     %xmm14, \TMP7 # byte-reflect the AAD data
 611         pxor       \TMP7, \TMP6
 612         GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
 613         add        $16, %r10
 614         sub        $16, %r11
 615         cmp        $16, %r11
 616         jge        _get_AAD_blocks\@
 617
 618         movdqu     \TMP6, \TMP7
 619
 620         /* read the last <16B of AAD */
 621 _get_AAD_rest\@:
 622         cmp        $0, %r11
 623         je         _get_AAD_done\@
 624
 625         READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
 626         pshufb     %xmm14, \TMP7 # byte-reflect the AAD data
 627         pxor       \TMP6, \TMP7
 628         GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
 629         movdqu \TMP7, \TMP6             # move the final hash back into TMP6 for the store below
 630
 631 _get_AAD_done\@:
 632         movdqu \TMP6, AadHash(%arg2)
 633 .endm
 634
 635 # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
 636 # between update calls.
 637 # Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
 638 # Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
 639 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 (DATA_OFFSET is advanced by the bytes written)
 640 .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
 641         AAD_HASH operation
 642         mov     PBlockLen(%arg2), %r13
 643         cmp     $0, %r13
 644         je      _partial_block_done_\@  # Leave Macro if no partial blocks
 645         # Read in input data without over reading
 646         cmp     $16, \PLAIN_CYPH_LEN
 647         jl      _fewer_than_16_bytes_\@
 648         movups  (\PLAIN_CYPH_IN), %xmm1 # If at least 16 bytes, just fill xmm
 649         jmp     _data_read_\@
 650
 651 _fewer_than_16_bytes_\@:
 652         lea     (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
 653         mov     \PLAIN_CYPH_LEN, %r12
 654         READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
 655
 656         mov PBlockLen(%arg2), %r13
 657
 658 _data_read_\@:                          # Finished reading in data
 659
 660         movdqu  PBlockEncKey(%arg2), %xmm9
 661         movdqu  HashKey(%arg2), %xmm13
 662
 663         lea     SHIFT_MASK(%rip), %r12
 664
 665         # adjust the shuffle mask pointer to be able to shift r13 bytes
 666         # (r13 is the number of bytes already held in the partial block)
 667         add     %r13, %r12
 668         movdqu  (%r12), %xmm2           # get the appropriate shuffle mask
 669         pshufb  %xmm2, %xmm9            # shift right r13 bytes
 670
 671 .ifc \operation, dec
 672         movdqa  %xmm1, %xmm3            # keep the ciphertext for the hash
 673         pxor    %xmm1, %xmm9            # Cyphertext XOR E(K, Yn)
 674
 675         mov     \PLAIN_CYPH_LEN, %r10
 676         add     %r13, %r10
 677         # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
 678         sub     $16, %r10
 679         # Determine if partial block is not being filled and
 680         # shift mask accordingly
 681         jge     _no_extra_mask_1_\@
 682         sub     %r10, %r12              # r10 is negative here, so this moves r12 forward
 683 _no_extra_mask_1_\@:
 684
 685         movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
 686         # get the appropriate mask to mask out bottom r13 bytes of xmm9
 687         pand    %xmm1, %xmm9            # mask out bottom r13 bytes of xmm9
 688
 689         pand    %xmm1, %xmm3
 690         movdqa  SHUF_MASK(%rip), %xmm10
 691         pshufb  %xmm10, %xmm3
 692         pshufb  %xmm2, %xmm3
 693         pxor    %xmm3, \AAD_HASH
 694
 695         cmp     $0, %r10
 696         jl      _partial_incomplete_1_\@
 697
 698         # GHASH computation for the last <16 Byte block
 699         GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 700         xor     %eax, %eax
 701
 702         mov     %rax, PBlockLen(%arg2)  # block completed: no partial bytes pending
 703         jmp     _dec_done_\@
 704 _partial_incomplete_1_\@:
 705         add     \PLAIN_CYPH_LEN, PBlockLen(%arg2)
 706 _dec_done_\@:
 707         movdqu  \AAD_HASH, AadHash(%arg2)
 708 .else
 709         pxor    %xmm1, %xmm9                    # Plaintext XOR E(K, Yn)
 710
 711         mov     \PLAIN_CYPH_LEN, %r10
 712         add     %r13, %r10
 713         # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
 714         sub     $16, %r10
 715         # Determine if partial block is not being filled and
 716         # shift mask accordingly
 717         jge     _no_extra_mask_2_\@
 718         sub     %r10, %r12              # r10 is negative here, so this moves r12 forward
 719 _no_extra_mask_2_\@:
 720
 721         movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
 722         # get the appropriate mask to mask out bottom r13 bytes of xmm9
 723         pand    %xmm1, %xmm9
 724
 725         movdqa  SHUF_MASK(%rip), %xmm1
 726         pshufb  %xmm1, %xmm9
 727         pshufb  %xmm2, %xmm9
 728         pxor    %xmm9, \AAD_HASH
 729
 730         cmp     $0, %r10
 731         jl      _partial_incomplete_2_\@
 732
 733         # GHASH computation for the last <16 Byte block
 734         GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 735         xor     %eax, %eax
 736
 737         mov     %rax, PBlockLen(%arg2)  # block completed: no partial bytes pending
 738         jmp     _encode_done_\@
 739 _partial_incomplete_2_\@:
 740         add     \PLAIN_CYPH_LEN, PBlockLen(%arg2)
 741 _encode_done_\@:
 742         movdqu  \AAD_HASH, AadHash(%arg2)
 743
 744         movdqa  SHUF_MASK(%rip), %xmm10
 745         # shuffle xmm9 back to output as ciphertext
 746         pshufb  %xmm10, %xmm9
 747         pshufb  %xmm2, %xmm9
 748 .endif
 749         # output encrypted Bytes
 750         cmp     $0, %r10
 751         jl      _partial_fill_\@
 752         mov     %r13, %r12
 753         mov     $16, %r13
 754         # Set r13 to be the number of bytes to write out
 755         sub     %r12, %r13
 756         jmp     _count_set_\@
 757 _partial_fill_\@:
 758         mov     \PLAIN_CYPH_LEN, %r13   # block still incomplete: write out every input byte
 759 _count_set_\@:
 760         movdqa  %xmm9, %xmm0
 761         movq    %xmm0, %rax
 762         cmp     $8, %r13
 763         jle     _less_than_8_bytes_left_\@
 764
 765         mov     %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
 766         add     $8, \DATA_OFFSET
 767         psrldq  $8, %xmm0
 768         movq    %xmm0, %rax
 769         sub     $8, %r13
 770 _less_than_8_bytes_left_\@:
 771         movb    %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) # store the remaining 1 to 8 bytes one at a time
 772         add     $1, \DATA_OFFSET
 773         shr     $8, %rax
 774         sub     $1, %r13
 775         jne     _less_than_8_bytes_left_\@
 776 _partial_block_done_\@:
 777 .endm # PARTIAL_BLOCK
 778
 779 /*
 780 * if a = number of total plaintext bytes
 781 * b = floor(a/16)
 782 * num_initial_blocks = b mod 4
 783 * encrypt the initial num_initial_blocks blocks and apply ghash on
 784 * the ciphertext
 785 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 786 * are clobbered
 787 * arg1, %arg2, %arg3 are used as a pointer only, not modified
 788 */
 789
 790
 791 .macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 792         XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 793         MOVADQ          SHUF_MASK(%rip), %xmm14     # %xmm14 = byte swap mask used by pshufb below
 794
 795         movdqu AadHash(%arg2), %xmm\i               # %xmm(i) = running hash loaded from AadHash
 796
 797         # start AES for num_initial_blocks blocks
 798
 799         movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0 (counter block loaded from CurCount)
 800
 801 .if (\i == 5) || (\i == 6) || (\i == 7)
 802
 803         MOVADQ          ONE(%RIP),\TMP1              # TMP1 = counter increment
 804         MOVADQ          0(%arg1),\TMP2               # TMP2 = round key 0
 805 .irpc index, \i_seq
 806         paddd           \TMP1, \XMM0                 # INCR Y0
 807 .ifc \operation, dec
 808         movdqa     \XMM0, %xmm\index
 809 .else
 810         MOVADQ          \XMM0, %xmm\index
 811 .endif
 812         pshufb  %xmm14, %xmm\index      # perform a 16 byte swap
 813         pxor            \TMP2, %xmm\index            # round 0: xor with round key 0
 814 .endr
 815         lea     0x10(%arg1),%r10                # r10 = address of round key 1
 816         mov     keysize,%eax
 817         shr     $2,%eax                         # 128->4, 192->6, 256->8
 818         add     $5,%eax                       # 128->9, 192->11, 256->13
 819
 820 aes_loop_initial_\@:
 821         MOVADQ  (%r10),\TMP1
 822 .irpc   index, \i_seq
 823         aesenc  \TMP1, %xmm\index
 824 .endr
 825         add     $16,%r10
 826         sub     $1,%eax
 827         jnz     aes_loop_initial_\@
 828
 829         MOVADQ  (%r10), \TMP1                   # last round key
 830 .irpc index, \i_seq
 831         aesenclast \TMP1, %xmm\index         # Last Round
 832 .endr
 833 .irpc index, \i_seq
 834         movdqu     (%arg4 , %r11, 1), \TMP1  # TMP1 = next 16 input bytes
 835         pxor       \TMP1, %xmm\index
 836         movdqu     %xmm\index, (%arg3 , %r11, 1)
 837         # write back plaintext/ciphertext for num_initial_blocks
 838         add        $16, %r11
 839
 840 .ifc \operation, dec
 841         movdqa     \TMP1, %xmm\index         # dec: hash the input block, not the output
 842 .endif
 843         pshufb     %xmm14, %xmm\index
 844
 845                 # prepare plaintext/ciphertext for GHASH computation
 846 .endr
 847 .endif
 848
 849         # apply GHASH on num_initial_blocks blocks
 850
 851 .if \i == 5
 852         pxor       %xmm5, %xmm6
 853         GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 854         pxor       %xmm6, %xmm7
 855         GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 856         pxor       %xmm7, %xmm8
 857         GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 858 .elseif \i == 6
 859         pxor       %xmm6, %xmm7
 860         GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 861         pxor       %xmm7, %xmm8
 862         GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 863 .elseif \i == 7
 864         pxor       %xmm7, %xmm8
 865         GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 866 .endif
 867         cmp        $64, %r13
 868         jl      _initial_blocks_done\@
 869         # %r13 < 64 (signed compare): skip the 4 blocks below
 870 /*
 871 *
 872 * Encrypt the next 4 counter blocks; no HashKey values are computed here.
 873 * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
 874 */
 875         MOVADQ     ONE(%RIP),\TMP1
 876         paddd      \TMP1, \XMM0              # INCR Y0
 877         MOVADQ     \XMM0, \XMM1
 878         pshufb  %xmm14, \XMM1        # perform a 16 byte swap
 879
 880         paddd      \TMP1, \XMM0              # INCR Y0
 881         MOVADQ     \XMM0, \XMM2
 882         pshufb  %xmm14, \XMM2        # perform a 16 byte swap
 883
 884         paddd      \TMP1, \XMM0              # INCR Y0
 885         MOVADQ     \XMM0, \XMM3
 886         pshufb %xmm14, \XMM3        # perform a 16 byte swap
 887
 888         paddd      \TMP1, \XMM0              # INCR Y0
 889         MOVADQ     \XMM0, \XMM4
 890         pshufb %xmm14, \XMM4        # perform a 16 byte swap
 891
 892         MOVADQ     0(%arg1),\TMP1            # round key 0
 893         pxor       \TMP1, \XMM1
 894         pxor       \TMP1, \XMM2
 895         pxor       \TMP1, \XMM3
 896         pxor       \TMP1, \XMM4
 897 .irpc index, 1234 # do 4 rounds
 898         movaps 0x10*\index(%arg1), \TMP1
 899         aesenc     \TMP1, \XMM1
 900         aesenc     \TMP1, \XMM2
 901         aesenc     \TMP1, \XMM3
 902         aesenc     \TMP1, \XMM4
 903 .endr
 904 .irpc index, 56789 # do next 5 rounds
 905         movaps 0x10*\index(%arg1), \TMP1
 906         aesenc     \TMP1, \XMM1
 907         aesenc     \TMP1, \XMM2
 908         aesenc     \TMP1, \XMM3
 909         aesenc     \TMP1, \XMM4
 910 .endr
 911         lea        0xa0(%arg1),%r10             # r10 = address of round key 10
 912         mov        keysize,%eax
 913         shr        $2,%eax                      # 128->4, 192->6, 256->8
 914         sub        $4,%eax                      # 128->0, 192->2, 256->4
 915         jz         aes_loop_pre_done\@
 916
 917 aes_loop_pre_\@:
 918         MOVADQ     (%r10),\TMP2
 919 .irpc   index, 1234                          # NOTE: uses %xmm1-%xmm4 directly, not XMM1-XMM4
 920         aesenc     \TMP2, %xmm\index
 921 .endr
 922         add        $16,%r10
 923         sub        $1,%eax
 924         jnz        aes_loop_pre_\@
 925
 926 aes_loop_pre_done\@:
 927         MOVADQ     (%r10), \TMP2                # last round key
 928         aesenclast \TMP2, \XMM1
 929         aesenclast \TMP2, \XMM2
 930         aesenclast \TMP2, \XMM3
 931         aesenclast \TMP2, \XMM4
 932         movdqu     16*0(%arg4 , %r11 , 1), \TMP1
 933         pxor       \TMP1, \XMM1
 934 .ifc \operation, dec
 935         movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 936         movdqa     \TMP1, \XMM1                 # dec: keep the input block for GHASH
 937 .endif
 938         movdqu     16*1(%arg4 , %r11 , 1), \TMP1
 939         pxor       \TMP1, \XMM2
 940 .ifc \operation, dec
 941         movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 942         movdqa     \TMP1, \XMM2
 943 .endif
 944         movdqu     16*2(%arg4 , %r11 , 1), \TMP1
 945         pxor       \TMP1, \XMM3
 946 .ifc \operation, dec
 947         movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 948         movdqa     \TMP1, \XMM3
 949 .endif
 950         movdqu     16*3(%arg4 , %r11 , 1), \TMP1
 951         pxor       \TMP1, \XMM4
 952 .ifc \operation, dec
 953         movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 954         movdqa     \TMP1, \XMM4
 955 .else
 956         movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 957         movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 958         movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 959         movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 960 .endif
 961
 962         add        $64, %r11                    # advance the data offset past the 4 blocks
 963         pshufb %xmm14, \XMM1 # perform a 16 byte swap
 964         pxor       \XMMDst, \XMM1
 965 # combine GHASHed value with the corresponding ciphertext
 966         pshufb %xmm14, \XMM2 # perform a 16 byte swap
 967         pshufb %xmm14, \XMM3 # perform a 16 byte swap
 968         pshufb %xmm14, \XMM4 # perform a 16 byte swap
 969
 970 _initial_blocks_done\@:
 971
 972 .endm
 973
 974 /*
 975 * encrypt 4 blocks at a time (clobbers %eax, %r10 and %xmm15)
 976 * ghash the 4 previously encrypted ciphertext blocks
 977 * %arg1, %arg3, %arg4 are used as pointers only, not modified
 978 * %r11 is the data offset value; it is read but not advanced here
 979 */
 980 .macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
 981 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
 982
 983         movdqa    \XMM1, \XMM5
 984         movdqa    \XMM2, \XMM6
 985         movdqa    \XMM3, \XMM7
 986         movdqa    \XMM4, \XMM8
 987
 988         movdqa    SHUF_MASK(%rip), %xmm15
 989         # multiply XMM5 * HashKey_4 using karatsuba
 990
 991         movdqa    \XMM5, \TMP4
 992         pshufd    $78, \XMM5, \TMP6
 993         pxor      \XMM5, \TMP6
 994         paddd     ONE(%rip), \XMM0              # INCR CNT
 995         movdqu    HashKey_4(%arg2), \TMP5
 996         pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
 997         movdqa    \XMM0, \XMM1
 998         paddd     ONE(%rip), \XMM0              # INCR CNT
 999         movdqa    \XMM0, \XMM2
1000         paddd     ONE(%rip), \XMM0              # INCR CNT
1001         movdqa    \XMM0, \XMM3
1002         paddd     ONE(%rip), \XMM0              # INCR CNT
1003         movdqa    \XMM0, \XMM4
1004         pshufb %xmm15, \XMM1    # perform a 16 byte swap
1005         pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1006         pshufb %xmm15, \XMM2    # perform a 16 byte swap
1007         pshufb %xmm15, \XMM3    # perform a 16 byte swap
1008         pshufb %xmm15, \XMM4    # perform a 16 byte swap
1009
1010         pxor      (%arg1), \XMM1                # round 0: xor with round key 0
1011         pxor      (%arg1), \XMM2
1012         pxor      (%arg1), \XMM3
1013         pxor      (%arg1), \XMM4
1014         movdqu    HashKey_4_k(%arg2), \TMP5
1015         pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1016         movaps 0x10(%arg1), \TMP1
1017         aesenc    \TMP1, \XMM1              # Round 1
1018         aesenc    \TMP1, \XMM2
1019         aesenc    \TMP1, \XMM3
1020         aesenc    \TMP1, \XMM4
1021         movaps 0x20(%arg1), \TMP1
1022         aesenc    \TMP1, \XMM1              # Round 2
1023         aesenc    \TMP1, \XMM2
1024         aesenc    \TMP1, \XMM3
1025         aesenc    \TMP1, \XMM4
1026         movdqa    \XMM6, \TMP1              # multiply XMM6 * HashKey_3 using karatsuba
1027         pshufd    $78, \XMM6, \TMP2
1028         pxor      \XMM6, \TMP2
1029         movdqu    HashKey_3(%arg2), \TMP5
1030         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1031         movaps 0x30(%arg1), \TMP3
1032         aesenc    \TMP3, \XMM1              # Round 3
1033         aesenc    \TMP3, \XMM2
1034         aesenc    \TMP3, \XMM3
1035         aesenc    \TMP3, \XMM4
1036         pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1037         movaps 0x40(%arg1), \TMP3
1038         aesenc    \TMP3, \XMM1              # Round 4
1039         aesenc    \TMP3, \XMM2
1040         aesenc    \TMP3, \XMM3
1041         aesenc    \TMP3, \XMM4
1042         movdqu    HashKey_3_k(%arg2), \TMP5
1043         pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1044         movaps 0x50(%arg1), \TMP3
1045         aesenc    \TMP3, \XMM1              # Round 5
1046         aesenc    \TMP3, \XMM2
1047         aesenc    \TMP3, \XMM3
1048         aesenc    \TMP3, \XMM4
1049         pxor      \TMP1, \TMP4
1050 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1051         pxor      \XMM6, \XMM5
1052         pxor      \TMP2, \TMP6
1053         movdqa    \XMM7, \TMP1
1054         pshufd    $78, \XMM7, \TMP2
1055         pxor      \XMM7, \TMP2
1056         movdqu    HashKey_2(%arg2), \TMP5
1057
1058         # Multiply XMM7 * HashKey_2 using karatsuba
1059
1060         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1061         movaps 0x60(%arg1), \TMP3
1062         aesenc    \TMP3, \XMM1              # Round 6
1063         aesenc    \TMP3, \XMM2
1064         aesenc    \TMP3, \XMM3
1065         aesenc    \TMP3, \XMM4
1066         pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1067         movaps 0x70(%arg1), \TMP3
1068         aesenc    \TMP3, \XMM1              # Round 7
1069         aesenc    \TMP3, \XMM2
1070         aesenc    \TMP3, \XMM3
1071         aesenc    \TMP3, \XMM4
1072         movdqu    HashKey_2_k(%arg2), \TMP5
1073         pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1074         movaps 0x80(%arg1), \TMP3
1075         aesenc    \TMP3, \XMM1              # Round 8
1076         aesenc    \TMP3, \XMM2
1077         aesenc    \TMP3, \XMM3
1078         aesenc    \TMP3, \XMM4
1079         pxor      \TMP1, \TMP4
1080 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1081         pxor      \XMM7, \XMM5
1082         pxor      \TMP2, \TMP6
1083
1084         # Multiply XMM8 * HashKey
1085         # XMM8 and TMP5 hold the values for the two operands
1086
1087         movdqa    \XMM8, \TMP1
1088         pshufd    $78, \XMM8, \TMP2
1089         pxor      \XMM8, \TMP2
1090         movdqu    HashKey(%arg2), \TMP5
1091         pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1092         movaps 0x90(%arg1), \TMP3
1093         aesenc    \TMP3, \XMM1             # Round 9
1094         aesenc    \TMP3, \XMM2
1095         aesenc    \TMP3, \XMM3
1096         aesenc    \TMP3, \XMM4
1097         pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1098         lea       0xa0(%arg1),%r10              # r10 = address of round key 10
1099         mov       keysize,%eax
1100         shr       $2,%eax                       # 128->4, 192->6, 256->8
1101         sub       $4,%eax                       # 128->0, 192->2, 256->4
1102         jz        aes_loop_par_enc_done\@
1103
1104 aes_loop_par_enc\@:
1105         MOVADQ    (%r10),\TMP3
1106 .irpc   index, 1234                         # NOTE: uses %xmm1-%xmm4 directly, not XMM1-XMM4
1107         aesenc    \TMP3, %xmm\index
1108 .endr
1109         add       $16,%r10
1110         sub       $1,%eax
1111         jnz       aes_loop_par_enc\@
1112
1113 aes_loop_par_enc_done\@:
1114         MOVADQ    (%r10), \TMP3             # last round key
1115         aesenclast \TMP3, \XMM1           # last round
1116         aesenclast \TMP3, \XMM2
1117         aesenclast \TMP3, \XMM3
1118         aesenclast \TMP3, \XMM4
1119         movdqu    HashKey_k(%arg2), \TMP5
1120         pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1121         movdqu    (%arg4,%r11,1), \TMP3
1122         pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1123         movdqu    16(%arg4,%r11,1), \TMP3
1124         pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1125         movdqu    32(%arg4,%r11,1), \TMP3
1126         pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1127         movdqu    48(%arg4,%r11,1), \TMP3
1128         pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1129         movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
1130         movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
1131         movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
1132         movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
1133         pshufb %xmm15, \XMM1        # perform a 16 byte swap
1134         pshufb %xmm15, \XMM2    # perform a 16 byte swap
1135         pshufb %xmm15, \XMM3    # perform a 16 byte swap
1136         pshufb %xmm15, \XMM4    # perform a 16 byte swap
1137
1138         pxor      \TMP4, \TMP1
1139         pxor      \XMM8, \XMM5
1140         pxor      \TMP6, \TMP2
1141         pxor      \TMP1, \TMP2
1142         pxor      \XMM5, \TMP2
1143         movdqa    \TMP2, \TMP3
1144         pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
1145         psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
1146         pxor      \TMP3, \XMM5
1147         pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
1148
1149         # first phase of reduction
1150
1151         movdqa    \XMM5, \TMP2
1152         movdqa    \XMM5, \TMP3
1153         movdqa    \XMM5, \TMP4
1154 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1155         pslld     $31, \TMP2                   # packed left shift << 31
1156         pslld     $30, \TMP3                   # packed left shift << 30
1157         pslld     $25, \TMP4                   # packed left shift << 25
1158         pxor      \TMP3, \TMP2                 # xor the shifted versions
1159         pxor      \TMP4, \TMP2
1160         movdqa    \TMP2, \TMP5
1161         psrldq    $4, \TMP5                    # right shift T5 1 DW
1162         pslldq    $12, \TMP2                   # left shift T2 3 DWs
1163         pxor      \TMP2, \XMM5
1164
1165         # second phase of reduction
1166
1167         movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1168         movdqa    \XMM5,\TMP3
1169         movdqa    \XMM5,\TMP4
1170         psrld     $1, \TMP2                    # packed right shift >> 1
1171         psrld     $2, \TMP3                    # packed right shift >> 2
1172         psrld     $7, \TMP4                    # packed right shift >> 7
1173         pxor      \TMP3,\TMP2                  # xor the shifted versions
1174         pxor      \TMP4,\TMP2
1175         pxor      \TMP5, \TMP2
1176         pxor      \TMP2, \XMM5
1177         pxor      \TMP1, \XMM5                 # result is in XMM5
1178
1179         pxor      \XMM5, \XMM1                 # fold the hash into the first new block
1180 .endm
1181
1182 /*
1183 * decrypt 4 blocks at a time (clobbers %eax, %r10 and %xmm15)
1184 * ghash the 4 previously decrypted ciphertext blocks
1185 * %arg1, %arg3, %arg4 are used as pointers only, not modified
1186 * %r11 is the data offset value; it is read but not advanced here
1187 */
1188 .macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
1189 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1190
1191         movdqa    \XMM1, \XMM5
1192         movdqa    \XMM2, \XMM6
1193         movdqa    \XMM3, \XMM7
1194         movdqa    \XMM4, \XMM8
1195
1196         movdqa    SHUF_MASK(%rip), %xmm15
1197         # multiply XMM5 * HashKey_4 using karatsuba
1198
1199         movdqa    \XMM5, \TMP4
1200         pshufd    $78, \XMM5, \TMP6
1201         pxor      \XMM5, \TMP6
1202         paddd     ONE(%rip), \XMM0              # INCR CNT
1203         movdqu    HashKey_4(%arg2), \TMP5
1204         pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1205         movdqa    \XMM0, \XMM1
1206         paddd     ONE(%rip), \XMM0              # INCR CNT
1207         movdqa    \XMM0, \XMM2
1208         paddd     ONE(%rip), \XMM0              # INCR CNT
1209         movdqa    \XMM0, \XMM3
1210         paddd     ONE(%rip), \XMM0              # INCR CNT
1211         movdqa    \XMM0, \XMM4
1212         pshufb %xmm15, \XMM1    # perform a 16 byte swap
1213         pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1214         pshufb %xmm15, \XMM2    # perform a 16 byte swap
1215         pshufb %xmm15, \XMM3    # perform a 16 byte swap
1216         pshufb %xmm15, \XMM4    # perform a 16 byte swap
1217
1218         pxor      (%arg1), \XMM1                # round 0: xor with round key 0
1219         pxor      (%arg1), \XMM2
1220         pxor      (%arg1), \XMM3
1221         pxor      (%arg1), \XMM4
1222         movdqu    HashKey_4_k(%arg2), \TMP5
1223         pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1224         movaps 0x10(%arg1), \TMP1
1225         aesenc    \TMP1, \XMM1              # Round 1
1226         aesenc    \TMP1, \XMM2
1227         aesenc    \TMP1, \XMM3
1228         aesenc    \TMP1, \XMM4
1229         movaps 0x20(%arg1), \TMP1
1230         aesenc    \TMP1, \XMM1              # Round 2
1231         aesenc    \TMP1, \XMM2
1232         aesenc    \TMP1, \XMM3
1233         aesenc    \TMP1, \XMM4
1234         movdqa    \XMM6, \TMP1              # multiply XMM6 * HashKey_3 using karatsuba
1235         pshufd    $78, \XMM6, \TMP2
1236         pxor      \XMM6, \TMP2
1237         movdqu    HashKey_3(%arg2), \TMP5
1238         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1239         movaps 0x30(%arg1), \TMP3
1240         aesenc    \TMP3, \XMM1              # Round 3
1241         aesenc    \TMP3, \XMM2
1242         aesenc    \TMP3, \XMM3
1243         aesenc    \TMP3, \XMM4
1244         pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1245         movaps 0x40(%arg1), \TMP3
1246         aesenc    \TMP3, \XMM1              # Round 4
1247         aesenc    \TMP3, \XMM2
1248         aesenc    \TMP3, \XMM3
1249         aesenc    \TMP3, \XMM4
1250         movdqu    HashKey_3_k(%arg2), \TMP5
1251         pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1252         movaps 0x50(%arg1), \TMP3
1253         aesenc    \TMP3, \XMM1              # Round 5
1254         aesenc    \TMP3, \XMM2
1255         aesenc    \TMP3, \XMM3
1256         aesenc    \TMP3, \XMM4
1257         pxor      \TMP1, \TMP4
1258 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1259         pxor      \XMM6, \XMM5
1260         pxor      \TMP2, \TMP6
1261         movdqa    \XMM7, \TMP1
1262         pshufd    $78, \XMM7, \TMP2
1263         pxor      \XMM7, \TMP2
1264         movdqu    HashKey_2(%arg2), \TMP5
1265
1266         # Multiply XMM7 * HashKey_2 using karatsuba
1267
1268         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1269         movaps 0x60(%arg1), \TMP3
1270         aesenc    \TMP3, \XMM1              # Round 6
1271         aesenc    \TMP3, \XMM2
1272         aesenc    \TMP3, \XMM3
1273         aesenc    \TMP3, \XMM4
1274         pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1275         movaps 0x70(%arg1), \TMP3
1276         aesenc    \TMP3, \XMM1              # Round 7
1277         aesenc    \TMP3, \XMM2
1278         aesenc    \TMP3, \XMM3
1279         aesenc    \TMP3, \XMM4
1280         movdqu    HashKey_2_k(%arg2), \TMP5
1281         pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1282         movaps 0x80(%arg1), \TMP3
1283         aesenc    \TMP3, \XMM1              # Round 8
1284         aesenc    \TMP3, \XMM2
1285         aesenc    \TMP3, \XMM3
1286         aesenc    \TMP3, \XMM4
1287         pxor      \TMP1, \TMP4
1288 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1289         pxor      \XMM7, \XMM5
1290         pxor      \TMP2, \TMP6
1291
1292         # Multiply XMM8 * HashKey
1293         # XMM8 and TMP5 hold the values for the two operands
1294
1295         movdqa    \XMM8, \TMP1
1296         pshufd    $78, \XMM8, \TMP2
1297         pxor      \XMM8, \TMP2
1298         movdqu    HashKey(%arg2), \TMP5
1299         pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1300         movaps 0x90(%arg1), \TMP3
1301         aesenc    \TMP3, \XMM1             # Round 9
1302         aesenc    \TMP3, \XMM2
1303         aesenc    \TMP3, \XMM3
1304         aesenc    \TMP3, \XMM4
1305         pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1306         lea       0xa0(%arg1),%r10              # r10 = address of round key 10
1307         mov       keysize,%eax
1308         shr       $2,%eax                       # 128->4, 192->6, 256->8
1309         sub       $4,%eax                       # 128->0, 192->2, 256->4
1310         jz        aes_loop_par_dec_done\@
1311
1312 aes_loop_par_dec\@:
1313         MOVADQ    (%r10),\TMP3
1314 .irpc   index, 1234                         # NOTE: uses %xmm1-%xmm4 directly, not XMM1-XMM4
1315         aesenc    \TMP3, %xmm\index
1316 .endr
1317         add       $16,%r10
1318         sub       $1,%eax
1319         jnz       aes_loop_par_dec\@
1320
1321 aes_loop_par_dec_done\@:
1322         MOVADQ    (%r10), \TMP3             # last round key
1323         aesenclast \TMP3, \XMM1           # last round
1324         aesenclast \TMP3, \XMM2
1325         aesenclast \TMP3, \XMM3
1326         aesenclast \TMP3, \XMM4
1327         movdqu    HashKey_k(%arg2), \TMP5
1328         pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1329         movdqu    (%arg4,%r11,1), \TMP3
1330         pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1331         movdqu    \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
1332         movdqa    \TMP3, \XMM1                 # keep the input block for the next GHASH
1333         movdqu    16(%arg4,%r11,1), \TMP3
1334         pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1335         movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
1336         movdqa    \TMP3, \XMM2                 # keep the input block for the next GHASH
1337         movdqu    32(%arg4,%r11,1), \TMP3
1338         pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1339         movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
1340         movdqa    \TMP3, \XMM3                 # keep the input block for the next GHASH
1341         movdqu    48(%arg4,%r11,1), \TMP3
1342         pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1343         movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
1344         movdqa    \TMP3, \XMM4                 # keep the input block for the next GHASH
1345         pshufb %xmm15, \XMM1        # perform a 16 byte swap
1346         pshufb %xmm15, \XMM2    # perform a 16 byte swap
1347         pshufb %xmm15, \XMM3    # perform a 16 byte swap
1348         pshufb %xmm15, \XMM4    # perform a 16 byte swap
1349
1350         pxor      \TMP4, \TMP1
1351         pxor      \XMM8, \XMM5
1352         pxor      \TMP6, \TMP2
1353         pxor      \TMP1, \TMP2
1354         pxor      \XMM5, \TMP2
1355         movdqa    \TMP2, \TMP3
1356         pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
1357         psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
1358         pxor      \TMP3, \XMM5
1359         pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
1360
1361         # first phase of reduction
1362
1363         movdqa    \XMM5, \TMP2
1364         movdqa    \XMM5, \TMP3
1365         movdqa    \XMM5, \TMP4
1366 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1367         pslld     $31, \TMP2                   # packed left shift << 31
1368         pslld     $30, \TMP3                   # packed left shift << 30
1369         pslld     $25, \TMP4                   # packed left shift << 25
1370         pxor      \TMP3, \TMP2                 # xor the shifted versions
1371         pxor      \TMP4, \TMP2
1372         movdqa    \TMP2, \TMP5
1373         psrldq    $4, \TMP5                    # right shift T5 1 DW
1374         pslldq    $12, \TMP2                   # left shift T2 3 DWs
1375         pxor      \TMP2, \XMM5
1376
1377         # second phase of reduction
1378
1379         movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1380         movdqa    \XMM5,\TMP3
1381         movdqa    \XMM5,\TMP4
1382         psrld     $1, \TMP2                    # packed right shift >> 1
1383         psrld     $2, \TMP3                    # packed right shift >> 2
1384         psrld     $7, \TMP4                    # packed right shift >> 7
1385         pxor      \TMP3,\TMP2                  # xor the shifted versions
1386         pxor      \TMP4,\TMP2
1387         pxor      \TMP5, \TMP2
1388         pxor      \TMP2, \XMM5
1389         pxor      \TMP1, \XMM5                 # result is in XMM5
1390
1391         pxor      \XMM5, \XMM1                 # fold the hash into the first new block
1392 .endm
1393
1394 /* GHASH the last 4 ciphertext blocks; result in XMMDst, XMM1-XMM4 and TMP1-TMP7 are clobbered. */
1395 .macro  GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1396 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1397
1398         # Multiply XMM1 * HashKey_4 (using Karatsuba)
1399
1400         movdqa    \XMM1, \TMP6
1401         pshufd    $78, \XMM1, \TMP2
1402         pxor      \XMM1, \TMP2
1403         movdqu    HashKey_4(%arg2), \TMP5
1404         pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1405         pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1406         movdqu    HashKey_4_k(%arg2), \TMP4
1407         pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1408         movdqa    \XMM1, \XMMDst
1409         movdqa    \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1410
1411         # Multiply XMM2 * HashKey_3 (using Karatsuba)
1412
1413         movdqa    \XMM2, \TMP1
1414         pshufd    $78, \XMM2, \TMP2
1415         pxor      \XMM2, \TMP2
1416         movdqu    HashKey_3(%arg2), \TMP5
1417         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1418         pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1419         movdqu    HashKey_3_k(%arg2), \TMP4
1420         pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1421         pxor      \TMP1, \TMP6
1422         pxor      \XMM2, \XMMDst
1423         pxor      \TMP2, \XMM1
1424 # results accumulated in TMP6, XMMDst, XMM1
1425
1426         # Multiply XMM3 * HashKey_2 (using Karatsuba)
1427
1428         movdqa    \XMM3, \TMP1
1429         pshufd    $78, \XMM3, \TMP2
1430         pxor      \XMM3, \TMP2
1431         movdqu    HashKey_2(%arg2), \TMP5
1432         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1433         pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1434         movdqu    HashKey_2_k(%arg2), \TMP4
1435         pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1436         pxor      \TMP1, \TMP6
1437         pxor      \XMM3, \XMMDst
1438         pxor      \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1439
1440         # Multiply XMM4 * HashKey (using Karatsuba)
1441         movdqa    \XMM4, \TMP1
1442         pshufd    $78, \XMM4, \TMP2
1443         pxor      \XMM4, \TMP2
1444         movdqu    HashKey(%arg2), \TMP5
1445         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1446         pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1447         movdqu    HashKey_k(%arg2), \TMP4
1448         pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1449         pxor      \TMP1, \TMP6
1450         pxor      \XMM4, \XMMDst
1451         pxor      \XMM1, \TMP2
1452         pxor      \TMP6, \TMP2
1453         pxor      \XMMDst, \TMP2
1454         # middle section of the temp results combined as in karatsuba algorithm
1455         movdqa    \TMP2, \TMP4
1456         pslldq    $8, \TMP4                 # left shift TMP4 2 DWs
1457         psrldq    $8, \TMP2                 # right shift TMP2 2 DWs
1458         pxor      \TMP4, \XMMDst
1459         pxor      \TMP2, \TMP6
1460 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1461         # first phase of the reduction
1462         movdqa    \XMMDst, \TMP2
1463         movdqa    \XMMDst, \TMP3
1464         movdqa    \XMMDst, \TMP4
1465 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1466         pslld     $31, \TMP2                # packed left shifting << 31
1467         pslld     $30, \TMP3                # packed left shifting << 30
1468         pslld     $25, \TMP4                # packed left shifting << 25
1469         pxor      \TMP3, \TMP2              # xor the shifted versions
1470         pxor      \TMP4, \TMP2
1471         movdqa    \TMP2, \TMP7
1472         psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1473         pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1474         pxor      \TMP2, \XMMDst
1475
1476         # second phase of the reduction
1477         movdqa    \XMMDst, \TMP2
1478         # make 3 copies of XMMDst for doing 3 shift operations
1479         movdqa    \XMMDst, \TMP3
1480         movdqa    \XMMDst, \TMP4
1481         psrld     $1, \TMP2                 # packed right shift >> 1
1482         psrld     $2, \TMP3                 # packed right shift >> 2
1483         psrld     $7, \TMP4                 # packed right shift >> 7
1484         pxor      \TMP3, \TMP2              # xor the shifted versions
1485         pxor      \TMP4, \TMP2
1486         pxor      \TMP7, \TMP2
1487         pxor      \TMP2, \XMMDst
1488         pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1489 .endm
1490
1491
1492 /* Encryption of a single block: XMM0 is encrypted in place, TMP1 is scratch
1493 * uses eax & r10 (both are clobbered)
1494 */
1495
1496 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1497
1498         pxor            (%arg1), \XMM0          # round 0: xor with round key 0
1499         mov             keysize,%eax
1500         shr             $2,%eax                 # 128->4, 192->6, 256->8
1501         add             $5,%eax                 # 128->9, 192->11, 256->13
1502         lea             16(%arg1), %r10   # get first expanded key address
1503
1504 _esb_loop_\@:
1505         MOVADQ          (%r10),\TMP1            # TMP1 = next round key
1506         aesenc          \TMP1,\XMM0
1507         add             $16,%r10
1508         sub             $1,%eax                 # eax counts the remaining aesenc rounds
1509         jnz             _esb_loop_\@
1510
1511         MOVADQ          (%r10),\TMP1            # last round key
1512         aesenclast      \TMP1,\XMM0
1513 .endm
1514 /*****************************************************************************
1515 * void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1516 *                   struct gcm_context_data *data
1517 *                                      // Context data
1518 *                   u8 *out,           // Plaintext output. Decrypt in-place is allowed.
1519 *                   const u8 *in,      // Ciphertext input
1520 *                   u64 plaintext_len, // Length of data in bytes for decryption.
1521 *                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1522 *                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1523 *                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1524 *                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1525 *                   const u8 *aad,     // Additional Authentication Data (AAD)
1526 *                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1527 *                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1528 *                                      // given authentication tag and only return the plaintext if they match.
1529 *                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1530 *                                      // (most likely), 12 or 8.
1531 *
1532 * Assumptions:
1533 *
1534 * keys:
1535 *       keys are pre-expanded and aligned to 16 bytes. we are using the first
1536 *       set of 11 keys in the data structure void *aes_ctx
1537 *
1538 * iv:
1539 *       0                   1                   2                   3
1540 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1541 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1542 *       |                             Salt  (From the SA)               |
1543 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1544 *       |                     Initialization Vector                     |
1545 *       |         (This is the sequence number from IPSec header)       |
1546 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1547 *       |                              0x1                              |
1548 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1549 *
1550 *
1551 *
1552 * AAD:
1553 *       AAD padded to 128 bits with 0
1554 *       for example, assume AAD is a u32 vector
1555 *
1556 *       if AAD is 8 bytes:
1557 *       AAD[3] = {A0, A1};
1558 *       padded AAD in xmm register = {A1 A0 0 0}
1559 *
1560 *       0                   1                   2                   3
1561 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1562 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1563 *       |                               SPI (A1)                        |
1564 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1565 *       |                     32-bit Sequence Number (A0)               |
1566 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1567 *       |                              0x0                              |
1568 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1569 *
1570 *                                       AAD Format with 32-bit Sequence Number
1571 *
1572 *       if AAD is 12 bytes:
1573 *       AAD[3] = {A0, A1, A2};
1574 *       padded AAD in xmm register = {A2 A1 A0 0}
1575 *
1576 *       0                   1                   2                   3
1577 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1578 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1579 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1580 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1581 *       |                               SPI (A2)                        |
1582 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1583 *       |                 64-bit Extended Sequence Number {A1,A0}       |
1584 *       |                                                               |
1585 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1586 *       |                              0x0                              |
1587 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1588 *
1589 *                        AAD Format with 64-bit Extended Sequence Number
1590 *
1591 * poly = x^128 + x^127 + x^126 + x^121 + 1
1592 *
1593 *****************************************************************************/
1594 SYM_FUNC_START(aesni_gcm_dec)           /* one-shot GCM decrypt: init, process all data, produce tag */
1595         FUNC_SAVE                       # macro defined outside this chunk; presumably saves registers -- confirm
1596
1597         GCM_INIT %arg6, arg7, arg8, arg9        # presumably iv, hash_subkey, aad, aad_len (same order as the aesni_gcm_enc prototype) -- confirm
1598         GCM_ENC_DEC dec                 # same macro as in aesni_gcm_dec_update, selected in "dec" mode
1599         GCM_COMPLETE arg10, arg11       # auth_tag, auth_tag_len: the last two prototype parameters
1600         FUNC_RESTORE                    # counterpart of FUNC_SAVE
1601         ret
1602 SYM_FUNC_END(aesni_gcm_dec)
1603
1604
1605 /*****************************************************************************
1606 * void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1607 *                    struct gcm_context_data *data
1608 *                                        // Context data
1609 *                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1610 *                    const u8 *in,       // Plaintext input
1611 *                    u64 plaintext_len,  // Length of data in bytes for encryption.
1612 *                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1613 *                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1614 *                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1615 *                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1616 *                    const u8 *aad,      // Additional Authentication Data (AAD)
1617 *                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1618 *                    u8 *auth_tag,       // Authenticated Tag output.
1619 *                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1620 *                                        // 12 or 8.
1621 *
1622 * Assumptions:
1623 *
1624 * keys:
1625 *       keys are pre-expanded and aligned to 16 bytes. we are using the
1626 *       first set of 11 keys in the data structure void *aes_ctx
1627 *
1628 *
1629 * iv:
1630 *       0                   1                   2                   3
1631 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1632 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1633 *       |                             Salt  (From the SA)               |
1634 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1635 *       |                     Initialization Vector                     |
1636 *       |         (This is the sequence number from IPSec header)       |
1637 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1638 *       |                              0x1                              |
1639 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1640 *
1641 *
1642 *
1643 * AAD:
1644 *       AAD padded to 128 bits with 0
1645 *       for example, assume AAD is a u32 vector
1646 *
1647 *       if AAD is 8 bytes:
1648 *       AAD[3] = {A0, A1};
1649 *       padded AAD in xmm register = {A1 A0 0 0}
1650 *
1651 *       0                   1                   2                   3
1652 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1653 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1654 *       |                               SPI (A1)                        |
1655 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1656 *       |                     32-bit Sequence Number (A0)               |
1657 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1658 *       |                              0x0                              |
1659 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1660 *
1661 *                                 AAD Format with 32-bit Sequence Number
1662 *
1663 *       if AAD is 12 bytes:
1664 *       AAD[3] = {A0, A1, A2};
1665 *       padded AAD in xmm register = {A2 A1 A0 0}
1666 *
1667 *       0                   1                   2                   3
1668 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1669 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1670 *       |                               SPI (A2)                        |
1671 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1672 *       |                 64-bit Extended Sequence Number {A1,A0}       |
1673 *       |                                                               |
1674 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1675 *       |                              0x0                              |
1676 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1677 *
1678 *                         AAD Format with 64-bit Extended Sequence Number
1679 *
1680 * poly = x^128 + x^127 + x^126 + x^121 + 1
1681 ***************************************************************************/
1682 SYM_FUNC_START(aesni_gcm_enc)           /* one-shot GCM encrypt: init, process all data, produce tag */
1683         FUNC_SAVE                       # macro defined outside this chunk; presumably saves registers -- confirm
1684
1685         GCM_INIT %arg6, arg7, arg8, arg9        # iv, hash_subkey, aad, aad_len (parameters 6..9 of the prototype above)
1686         GCM_ENC_DEC enc                 # same macro as in aesni_gcm_enc_update, selected in "enc" mode
1687
1688         GCM_COMPLETE arg10, arg11       # auth_tag, auth_tag_len (parameters 10 and 11)
1689         FUNC_RESTORE                    # counterpart of FUNC_SAVE
1690         ret
1691 SYM_FUNC_END(aesni_gcm_enc)
1692
1693 /*****************************************************************************
1694 * void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1695 *                     struct gcm_context_data *data,
1696 *                                         // context data
1697 *                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1698 *                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1699 *                                         // concatenated with 0x00000001. 16-byte aligned pointer.
1700 *                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1701 *                     const u8 *aad,      // Additional Authentication Data (AAD)
1702 *                     u64 aad_len)        // Length of AAD in bytes.
1703 */
1704 SYM_FUNC_START(aesni_gcm_init)          /* first step of the split init/update/finalize interface */
1705         FUNC_SAVE                       # macro defined outside this chunk; presumably saves registers -- confirm
1706         GCM_INIT %arg3, %arg4,%arg5, %arg6      # iv, hash_subkey, aad, aad_len (parameters 3..6 of the prototype above)
1707         FUNC_RESTORE                    # counterpart of FUNC_SAVE
1708         ret
1709 SYM_FUNC_END(aesni_gcm_init)
1710
1711 /*****************************************************************************
1712 * void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1713 *                    struct gcm_context_data *data,
1714 *                                        // context data
1715 *                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1716 *                    const u8 *in,       // Plaintext input
1717 *                    u64 plaintext_len,  // Length of data in bytes for encryption.
1718 */
1719 SYM_FUNC_START(aesni_gcm_enc_update)    /* encrypt step of the split interface */
1720         FUNC_SAVE                       # macro defined outside this chunk; presumably saves registers -- confirm
1721         GCM_ENC_DEC enc                 # only the mode is passed; presumably the macro reads ctx/data/out/in/len from the argument registers -- confirm
1722         FUNC_RESTORE                    # counterpart of FUNC_SAVE
1723         ret
1724 SYM_FUNC_END(aesni_gcm_enc_update)
1725
1726 /*****************************************************************************
1727 * void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1728 *                    struct gcm_context_data *data,
1729 *                                        // context data
1730 *                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
1731 *                    const u8 *in,       // Ciphertext input
1732 *                    u64 plaintext_len,  // Length of data in bytes for decryption.
1733 */
1734 SYM_FUNC_START(aesni_gcm_dec_update)    /* decrypt step of the split interface */
1735         FUNC_SAVE                       # macro defined outside this chunk; presumably saves registers -- confirm
1736         GCM_ENC_DEC dec                 # only the mode is passed; presumably the macro reads ctx/data/out/in/len from the argument registers -- confirm
1737         FUNC_RESTORE                    # counterpart of FUNC_SAVE
1738         ret
1739 SYM_FUNC_END(aesni_gcm_dec_update)
1740
1741 /*****************************************************************************
1742 * void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1743 *                    struct gcm_context_data *data,
1744 *                                        // context data
1745 *                    u8 *auth_tag,       // Authenticated Tag output.
1746 *                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1747 *                                        // 12 or 8.
1748 */
1749 SYM_FUNC_START(aesni_gcm_finalize)      /* last step of the split interface: produce the tag */
1750         FUNC_SAVE                       # macro defined outside this chunk; presumably saves registers -- confirm
1751         GCM_COMPLETE %arg3 %arg4        # auth_tag, auth_tag_len (parameters 3 and 4); operands are space-separated here
1752         FUNC_RESTORE                    # counterpart of FUNC_SAVE
1753         ret
1754 SYM_FUNC_END(aesni_gcm_finalize)
1755
1756 #endif
1757
1758
1759 SYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128)  /* alias: the 128-bit schedule uses the same body */
1760 SYM_FUNC_START_LOCAL(_key_expansion_256a)       /* in: xmm0 = previous key, xmm1 = aeskeygenassist result, xmm4 dword 0 = 0 */
1761         pshufd $0b11111111, %xmm1, %xmm1        # broadcast dword 3 of the keygen result to all lanes
1762         shufps $0b00010000, %xmm0, %xmm4        # xmm4 = {xmm4.d0, xmm4.d0, xmm0.d1, xmm0.d0}
1763         pxor %xmm4, %xmm0
1764         shufps $0b10001100, %xmm0, %xmm4        # xmm4 = {xmm4.d0, xmm4.d3, xmm0.d0, xmm0.d2}
1765         pxor %xmm4, %xmm0                       # with xmm4.d0 == 0, xmm0 now holds the running XOR of its old dwords
1766         pxor %xmm1, %xmm0                       # fold in the broadcast keygen word: xmm0 = new round key
1767         movaps %xmm0, (TKEYP)                   # store the round key (movaps needs a 16-byte aligned address)
1768         add $0x10, TKEYP                        # advance to the next round key slot
1769         ret
1770 SYM_FUNC_END(_key_expansion_256a)
1771 SYM_FUNC_END_ALIAS(_key_expansion_128)
1772
1773 SYM_FUNC_START_LOCAL(_key_expansion_192a)       /* 192-bit step that writes two 16-byte slots; in: xmm0, xmm2 = key state, xmm1 = keygen result */
1774         pshufd $0b01010101, %xmm1, %xmm1        # broadcast dword 1 of the keygen result to all lanes
1775         shufps $0b00010000, %xmm0, %xmm4        # xmm4 = {xmm4.d0, xmm4.d0, xmm0.d1, xmm0.d0}
1776         pxor %xmm4, %xmm0
1777         shufps $0b10001100, %xmm0, %xmm4        # xmm4 = {xmm4.d0, xmm4.d3, xmm0.d0, xmm0.d2}
1778         pxor %xmm4, %xmm0                       # with xmm4.d0 == 0, xmm0 = running XOR of its old dwords
1779         pxor %xmm1, %xmm0                       # fold in the broadcast keygen word
1780
1781         movaps %xmm2, %xmm5                     # xmm5 = copy of old xmm2 (shifted below)
1782         movaps %xmm2, %xmm6                     # xmm6 = copy of old xmm2 (stored below)
1783         pslldq $4, %xmm5                        # shift the copy left by one dword
1784         pshufd $0b11111111, %xmm0, %xmm3        # broadcast dword 3 of the updated xmm0
1785         pxor %xmm3, %xmm2
1786         pxor %xmm5, %xmm2                       # xmm2 = old xmm2 ^ (old xmm2 << 32) ^ broadcast
1787
1788         movaps %xmm0, %xmm1                     # keep xmm0 intact; xmm1 is scratch from here on
1789         shufps $0b01000100, %xmm0, %xmm6        # xmm6 = {old xmm2 low qword, new xmm0 low qword}
1790         movaps %xmm6, (TKEYP)                   # first 16-byte slot
1791         shufps $0b01001110, %xmm2, %xmm1        # xmm1 = {new xmm0 high qword, new xmm2 low qword}
1792         movaps %xmm1, 0x10(TKEYP)               # second 16-byte slot
1793         add $0x20, TKEYP                        # two slots were written
1794         ret
1795 SYM_FUNC_END(_key_expansion_192a)
1796
1797 SYM_FUNC_START_LOCAL(_key_expansion_192b)       /* 192-bit step that writes one 16-byte slot; in: xmm0, xmm2 = key state, xmm1 = keygen result */
1798         pshufd $0b01010101, %xmm1, %xmm1        # broadcast dword 1 of the keygen result to all lanes
1799         shufps $0b00010000, %xmm0, %xmm4        # xmm4 = {xmm4.d0, xmm4.d0, xmm0.d1, xmm0.d0}
1800         pxor %xmm4, %xmm0
1801         shufps $0b10001100, %xmm0, %xmm4        # xmm4 = {xmm4.d0, xmm4.d3, xmm0.d0, xmm0.d2}
1802         pxor %xmm4, %xmm0                       # with xmm4.d0 == 0, xmm0 = running XOR of its old dwords
1803         pxor %xmm1, %xmm0                       # fold in the broadcast keygen word
1804
1805         movaps %xmm2, %xmm5                     # xmm5 = copy of old xmm2
1806         pslldq $4, %xmm5                        # shift the copy left by one dword
1807         pshufd $0b11111111, %xmm0, %xmm3        # broadcast dword 3 of the updated xmm0
1808         pxor %xmm3, %xmm2
1809         pxor %xmm5, %xmm2                       # xmm2 = old xmm2 ^ (old xmm2 << 32) ^ broadcast
1810
1811         movaps %xmm0, (TKEYP)                   # only xmm0 is stored here; xmm2 is carried to the next step
1812         add $0x10, TKEYP                        # one slot was written
1813         ret
1814 SYM_FUNC_END(_key_expansion_192b)
1815
1816 SYM_FUNC_START_LOCAL(_key_expansion_256b)       /* like _key_expansion_256a but updates xmm2 instead of xmm0 */
1817         pshufd $0b10101010, %xmm1, %xmm1        # broadcast dword 2 of the keygen result (256a uses dword 3)
1818         shufps $0b00010000, %xmm2, %xmm4        # xmm4 = {xmm4.d0, xmm4.d0, xmm2.d1, xmm2.d0}
1819         pxor %xmm4, %xmm2
1820         shufps $0b10001100, %xmm2, %xmm4        # xmm4 = {xmm4.d0, xmm4.d3, xmm2.d0, xmm2.d2}
1821         pxor %xmm4, %xmm2                       # with xmm4.d0 == 0, xmm2 = running XOR of its old dwords
1822         pxor %xmm1, %xmm2                       # fold in the broadcast keygen word: xmm2 = new round key
1823         movaps %xmm2, (TKEYP)                   # store the round key (movaps needs a 16-byte aligned address)
1824         add $0x10, TKEYP                        # advance to the next round key slot
1825         ret
1826 SYM_FUNC_END(_key_expansion_256b)
1827
1828 /*
1829  * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1830  *                   unsigned int key_len)
1831  */
1832 SYM_FUNC_START(aesni_set_key)           /* expand in_key into encryption keys at ctx+0 and decryption keys at ctx+240 */
1833         FRAME_BEGIN
1834 #ifndef __x86_64__
1835         pushl KEYP                      # 32-bit only: KEYP is saved here and restored before return
1836         movl (FRAME_OFFSET+8)(%esp), KEYP       # ctx
1837         movl (FRAME_OFFSET+12)(%esp), UKEYP     # in_key
1838         movl (FRAME_OFFSET+16)(%esp), %edx      # key_len
1839 #endif
1840         movups (UKEYP), %xmm0           # user key (first 16 bytes)
1841         movaps %xmm0, (KEYP)            # round key 0 is the first 16 key bytes unchanged
1842         lea 0x10(KEYP), TKEYP           # key addr
1843         movl %edx, 480(KEYP)            # store key_len at offset 480 of ctx; the enc/dec routines read it back
1844         pxor %xmm4, %xmm4               # xmm4 is assumed 0 in _key_expansion_x
1845         cmp $24, %dl                    # only the low byte of key_len is compared: below 24 -> 128, equal -> 192, above -> 256
1846         jb .Lenc_key128
1847         je .Lenc_key192
1848         movups 0x10(UKEYP), %xmm2       # other user key
1849         movaps %xmm2, (TKEYP)           # round key 1 is the second 16 key bytes unchanged
1850         add $0x10, TKEYP
1851         aeskeygenassist $0x1, %xmm2, %xmm1      # round 1
1852         call _key_expansion_256a        # updates and stores xmm0
1853         aeskeygenassist $0x1, %xmm0, %xmm1
1854         call _key_expansion_256b        # updates and stores xmm2
1855         aeskeygenassist $0x2, %xmm2, %xmm1      # round 2
1856         call _key_expansion_256a
1857         aeskeygenassist $0x2, %xmm0, %xmm1
1858         call _key_expansion_256b
1859         aeskeygenassist $0x4, %xmm2, %xmm1      # round 3
1860         call _key_expansion_256a
1861         aeskeygenassist $0x4, %xmm0, %xmm1
1862         call _key_expansion_256b
1863         aeskeygenassist $0x8, %xmm2, %xmm1      # round 4
1864         call _key_expansion_256a
1865         aeskeygenassist $0x8, %xmm0, %xmm1
1866         call _key_expansion_256b
1867         aeskeygenassist $0x10, %xmm2, %xmm1     # round 5
1868         call _key_expansion_256a
1869         aeskeygenassist $0x10, %xmm0, %xmm1
1870         call _key_expansion_256b
1871         aeskeygenassist $0x20, %xmm2, %xmm1     # round 6
1872         call _key_expansion_256a
1873         aeskeygenassist $0x20, %xmm0, %xmm1
1874         call _key_expansion_256b
1875         aeskeygenassist $0x40, %xmm2, %xmm1     # round 7
1876         call _key_expansion_256a        # last step has no 256b half: 2 + 13 = 15 round keys in total
1877         jmp .Ldec_key
1878 .Lenc_key192:
1879         movq 0x10(UKEYP), %xmm2         # other user key
1880         aeskeygenassist $0x1, %xmm2, %xmm1      # round 1
1881         call _key_expansion_192a        # writes two 16-byte slots
1882         aeskeygenassist $0x2, %xmm2, %xmm1      # round 2
1883         call _key_expansion_192b        # writes one 16-byte slot
1884         aeskeygenassist $0x4, %xmm2, %xmm1      # round 3
1885         call _key_expansion_192a
1886         aeskeygenassist $0x8, %xmm2, %xmm1      # round 4
1887         call _key_expansion_192b
1888         aeskeygenassist $0x10, %xmm2, %xmm1     # round 5
1889         call _key_expansion_192a
1890         aeskeygenassist $0x20, %xmm2, %xmm1     # round 6
1891         call _key_expansion_192b
1892         aeskeygenassist $0x40, %xmm2, %xmm1     # round 7
1893         call _key_expansion_192a
1894         aeskeygenassist $0x80, %xmm2, %xmm1     # round 8
1895         call _key_expansion_192b
1896         jmp .Ldec_key
1897 .Lenc_key128:
1898         aeskeygenassist $0x1, %xmm0, %xmm1      # round 1
1899         call _key_expansion_128         # each call stores one round key and advances TKEYP
1900         aeskeygenassist $0x2, %xmm0, %xmm1      # round 2
1901         call _key_expansion_128
1902         aeskeygenassist $0x4, %xmm0, %xmm1      # round 3
1903         call _key_expansion_128
1904         aeskeygenassist $0x8, %xmm0, %xmm1      # round 4
1905         call _key_expansion_128
1906         aeskeygenassist $0x10, %xmm0, %xmm1     # round 5
1907         call _key_expansion_128
1908         aeskeygenassist $0x20, %xmm0, %xmm1     # round 6
1909         call _key_expansion_128
1910         aeskeygenassist $0x40, %xmm0, %xmm1     # round 7
1911         call _key_expansion_128
1912         aeskeygenassist $0x80, %xmm0, %xmm1     # round 8
1913         call _key_expansion_128
1914         aeskeygenassist $0x1b, %xmm0, %xmm1     # round 9
1915         call _key_expansion_128
1916         aeskeygenassist $0x36, %xmm0, %xmm1     # round 10
1917         call _key_expansion_128
1918 .Ldec_key:                              # build the decryption schedule at ctx+240 from the encryption keys
1919         sub $0x10, TKEYP                # step back so TKEYP points at the last encryption round key
1920         movaps (KEYP), %xmm0            # first encryption key
1921         movaps (TKEYP), %xmm1           # last encryption key
1922         movaps %xmm0, 240(TKEYP)        # first encryption key becomes the last decryption key
1923         movaps %xmm1, 240(KEYP)         # last encryption key becomes the first decryption key
1924         add $0x10, KEYP                 # KEYP walks forward over the inner encryption keys
1925         lea 240-16(TKEYP), UKEYP        # UKEYP walks backward over the decryption slots (UKEYP is reused as a write pointer)
1926 .align 4
1927 .Ldec_key_loop:
1928         movaps (KEYP), %xmm0
1929         aesimc %xmm0, %xmm1             # inner keys are transformed with aesimc before being stored in reverse order
1930         movaps %xmm1, (UKEYP)
1931         add $0x10, KEYP
1932         sub $0x10, UKEYP
1933         cmp TKEYP, KEYP                 # stop once KEYP reaches the last encryption key (already copied above)
1934         jb .Ldec_key_loop
1935         xor AREG, AREG                  # AREG is defined outside this chunk; presumably the return register, i.e. return 0 -- confirm
1936 #ifndef __x86_64__
1937         popl KEYP
1938 #endif
1939         FRAME_END
1940         ret
1941 SYM_FUNC_END(aesni_set_key)
1942
1943 /*
1944  * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
1945  */
1946 SYM_FUNC_START(aesni_enc)               /* encrypt the single 16-byte block at src into dst */
1947         FRAME_BEGIN
1948 #ifndef __x86_64__
1949         pushl KEYP                      # 32-bit only: KEYP and KLEN are saved here and restored before return
1950         pushl KLEN
1951         movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx
1952         movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
1953         movl (FRAME_OFFSET+20)(%esp), INP       # src
1954 #endif
1955         movl 480(KEYP), KLEN            # key length
1956         movups (INP), STATE             # input
1957         call _aesni_enc1                # encrypts STATE in place using KEYP/KLEN
1958         movups STATE, (OUTP)            # output
1959 #ifndef __x86_64__
1960         popl KLEN
1961         popl KEYP
1962 #endif
1963         FRAME_END
1964         ret
1965 SYM_FUNC_END(aesni_enc)
1966
1967 /*
1968  * _aesni_enc1:         internal ABI
1969  * input:
1970  *      KEYP:           key struct pointer
1971  *      KLEN:           key length
1972  *      STATE:          initial state (input)
1973  * output:
1974  *      STATE:          final state (output)
1975  * changed:
1976  *      KEY
1977  *      TKEYP (T1)
1978  */
1979 SYM_FUNC_START_LOCAL(_aesni_enc1)       /* encrypt the block in STATE; the round ladder is entered at a point chosen by key length */
1980         movaps (KEYP), KEY              # key
1981         mov KEYP, TKEYP
1982         pxor KEY, STATE         # round 0
1983         add $0x30, TKEYP                # TKEYP = KEYP + 0x30: base for the 128-bit case
1984         cmp $24, KLEN                   # KLEN below 24 -> 128-bit, equal -> 192-bit, above -> 256-bit
1985         jb .Lenc128
1986         lea 0x20(TKEYP), TKEYP          # lea does not modify flags, so the je below still tests the cmp above
1987         je .Lenc192
1988         add $0x20, TKEYP                # 256-bit: TKEYP = KEYP + 0x70
1989         movaps -0x60(TKEYP), KEY        # two rounds that only the 256-bit path executes
1990         aesenc KEY, STATE
1991         movaps -0x50(TKEYP), KEY
1992         aesenc KEY, STATE
1993 .align 4
1994 .Lenc192:                               # 192-bit entry (TKEYP = KEYP + 0x50); the 256-bit path falls through
1995         movaps -0x40(TKEYP), KEY
1996         aesenc KEY, STATE
1997         movaps -0x30(TKEYP), KEY
1998         aesenc KEY, STATE
1999 .align 4
2000 .Lenc128:                               # common tail: nine aesenc rounds and one aesenclast
2001         movaps -0x20(TKEYP), KEY
2002         aesenc KEY, STATE
2003         movaps -0x10(TKEYP), KEY
2004         aesenc KEY, STATE
2005         movaps (TKEYP), KEY
2006         aesenc KEY, STATE
2007         movaps 0x10(TKEYP), KEY
2008         aesenc KEY, STATE
2009         movaps 0x20(TKEYP), KEY
2010         aesenc KEY, STATE
2011         movaps 0x30(TKEYP), KEY
2012         aesenc KEY, STATE
2013         movaps 0x40(TKEYP), KEY
2014         aesenc KEY, STATE
2015         movaps 0x50(TKEYP), KEY
2016         aesenc KEY, STATE
2017         movaps 0x60(TKEYP), KEY
2018         aesenc KEY, STATE
2019         movaps 0x70(TKEYP), KEY
2020         aesenclast KEY, STATE           # last round
2021         ret
2022 SYM_FUNC_END(_aesni_enc1)
2023
2024 /*
2025  * _aesni_enc4: internal ABI
2026  * input:
2027  *      KEYP:           key struct pointer
2028  *      KLEN:           key length
2029  *      STATE1:         initial state (input)
2030  *      STATE2
2031  *      STATE3
2032  *      STATE4
2033  * output:
2034  *      STATE1:         final state (output)
2035  *      STATE2
2036  *      STATE3
2037  *      STATE4
2038  * changed:
2039  *      KEY
2040  *      TKEYP (T1)
2041  */
2042 SYM_FUNC_START_LOCAL(_aesni_enc4)       /* four-block version of _aesni_enc1: each round key is loaded once and applied to STATE1..STATE4 */
2043         movaps (KEYP), KEY              # key
2044         mov KEYP, TKEYP
2045         pxor KEY, STATE1                # round 0
2046         pxor KEY, STATE2
2047         pxor KEY, STATE3
2048         pxor KEY, STATE4
2049         add $0x30, TKEYP                # TKEYP = KEYP + 0x30: base for the 128-bit case
2050         cmp $24, KLEN                   # KLEN below 24 -> 128-bit, equal -> 192-bit, above -> 256-bit
2051         jb .L4enc128
2052         lea 0x20(TKEYP), TKEYP          # lea does not modify flags, so the je below still tests the cmp above
2053         je .L4enc192
2054         add $0x20, TKEYP                # 256-bit: TKEYP = KEYP + 0x70
2055         movaps -0x60(TKEYP), KEY        # two rounds that only the 256-bit path executes
2056         aesenc KEY, STATE1
2057         aesenc KEY, STATE2
2058         aesenc KEY, STATE3
2059         aesenc KEY, STATE4
2060         movaps -0x50(TKEYP), KEY
2061         aesenc KEY, STATE1
2062         aesenc KEY, STATE2
2063         aesenc KEY, STATE3
2064         aesenc KEY, STATE4
2065 #.align 4
2066 .L4enc192:                              # 192-bit entry (TKEYP = KEYP + 0x50); the 256-bit path falls through
2067         movaps -0x40(TKEYP), KEY
2068         aesenc KEY, STATE1
2069         aesenc KEY, STATE2
2070         aesenc KEY, STATE3
2071         aesenc KEY, STATE4
2072         movaps -0x30(TKEYP), KEY
2073         aesenc KEY, STATE1
2074         aesenc KEY, STATE2
2075         aesenc KEY, STATE3
2076         aesenc KEY, STATE4
2077 #.align 4
2078 .L4enc128:                              # common tail: nine aesenc rounds and one aesenclast per state
2079         movaps -0x20(TKEYP), KEY
2080         aesenc KEY, STATE1
2081         aesenc KEY, STATE2
2082         aesenc KEY, STATE3
2083         aesenc KEY, STATE4
2084         movaps -0x10(TKEYP), KEY
2085         aesenc KEY, STATE1
2086         aesenc KEY, STATE2
2087         aesenc KEY, STATE3
2088         aesenc KEY, STATE4
2089         movaps (TKEYP), KEY
2090         aesenc KEY, STATE1
2091         aesenc KEY, STATE2
2092         aesenc KEY, STATE3
2093         aesenc KEY, STATE4
2094         movaps 0x10(TKEYP), KEY
2095         aesenc KEY, STATE1
2096         aesenc KEY, STATE2
2097         aesenc KEY, STATE3
2098         aesenc KEY, STATE4
2099         movaps 0x20(TKEYP), KEY
2100         aesenc KEY, STATE1
2101         aesenc KEY, STATE2
2102         aesenc KEY, STATE3
2103         aesenc KEY, STATE4
2104         movaps 0x30(TKEYP), KEY
2105         aesenc KEY, STATE1
2106         aesenc KEY, STATE2
2107         aesenc KEY, STATE3
2108         aesenc KEY, STATE4
2109         movaps 0x40(TKEYP), KEY
2110         aesenc KEY, STATE1
2111         aesenc KEY, STATE2
2112         aesenc KEY, STATE3
2113         aesenc KEY, STATE4
2114         movaps 0x50(TKEYP), KEY
2115         aesenc KEY, STATE1
2116         aesenc KEY, STATE2
2117         aesenc KEY, STATE3
2118         aesenc KEY, STATE4
2119         movaps 0x60(TKEYP), KEY
2120         aesenc KEY, STATE1
2121         aesenc KEY, STATE2
2122         aesenc KEY, STATE3
2123         aesenc KEY, STATE4
2124         movaps 0x70(TKEYP), KEY
2125         aesenclast KEY, STATE1          # last round
2126         aesenclast KEY, STATE2
2127         aesenclast KEY, STATE3
2128         aesenclast KEY, STATE4
2129         ret
2130 SYM_FUNC_END(_aesni_enc4)
2131
2132 /*
2133  * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
2134  */
2135 SYM_FUNC_START(aesni_dec)               /* decrypt the single 16-byte block at src into dst */
2136         FRAME_BEGIN
2137 #ifndef __x86_64__
2138         pushl KEYP                      # 32-bit only: KEYP and KLEN are saved here and restored before return
2139         pushl KLEN
2140         movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx
2141         movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
2142         movl (FRAME_OFFSET+20)(%esp), INP       # src
2143 #endif
2144         mov 480(KEYP), KLEN             # key length
2145         add $240, KEYP                  # switch to the decryption key schedule that aesni_set_key writes at ctx+240
2146         movups (INP), STATE             # input
2147         call _aesni_dec1                # decrypts STATE in place using KEYP/KLEN
2148         movups STATE, (OUTP)            #output
2149 #ifndef __x86_64__
2150         popl KLEN
2151         popl KEYP
2152 #endif
2153         FRAME_END
2154         ret
2155 SYM_FUNC_END(aesni_dec)
2156
2157 /*
2158  * _aesni_dec1:         internal ABI
2159  * input:
2160  *      KEYP:           key struct pointer
2161  *      KLEN:           key length
2162  *      STATE:          initial state (input)
2163  * output:
2164  *      STATE:          final state (output)
2165  * changed:
2166  *      KEY
2167  *      TKEYP (T1)
2168  */
2169 SYM_FUNC_START_LOCAL(_aesni_dec1)       /* decrypt the block in STATE; the callers in this file pass KEYP already advanced by 240 */
2170         movaps (KEYP), KEY              # key
2171         mov KEYP, TKEYP
2172         pxor KEY, STATE         # round 0
2173         add $0x30, TKEYP                # TKEYP = KEYP + 0x30: base for the 128-bit case
2174         cmp $24, KLEN                   # KLEN below 24 -> 128-bit, equal -> 192-bit, above -> 256-bit
2175         jb .Ldec128
2176         lea 0x20(TKEYP), TKEYP          # lea does not modify flags, so the je below still tests the cmp above
2177         je .Ldec192
2178         add $0x20, TKEYP                # 256-bit: TKEYP = KEYP + 0x70
2179         movaps -0x60(TKEYP), KEY        # two rounds that only the 256-bit path executes
2180         aesdec KEY, STATE
2181         movaps -0x50(TKEYP), KEY
2182         aesdec KEY, STATE
2183 .align 4
2184 .Ldec192:                               # 192-bit entry (TKEYP = KEYP + 0x50); the 256-bit path falls through
2185         movaps -0x40(TKEYP), KEY
2186         aesdec KEY, STATE
2187         movaps -0x30(TKEYP), KEY
2188         aesdec KEY, STATE
2189 .align 4
2190 .Ldec128:                               # common tail: nine aesdec rounds and one aesdeclast
2191         movaps -0x20(TKEYP), KEY
2192         aesdec KEY, STATE
2193         movaps -0x10(TKEYP), KEY
2194         aesdec KEY, STATE
2195         movaps (TKEYP), KEY
2196         aesdec KEY, STATE
2197         movaps 0x10(TKEYP), KEY
2198         aesdec KEY, STATE
2199         movaps 0x20(TKEYP), KEY
2200         aesdec KEY, STATE
2201         movaps 0x30(TKEYP), KEY
2202         aesdec KEY, STATE
2203         movaps 0x40(TKEYP), KEY
2204         aesdec KEY, STATE
2205         movaps 0x50(TKEYP), KEY
2206         aesdec KEY, STATE
2207         movaps 0x60(TKEYP), KEY
2208         aesdec KEY, STATE
2209         movaps 0x70(TKEYP), KEY
2210         aesdeclast KEY, STATE           # last round
2211         ret
2212 SYM_FUNC_END(_aesni_dec1)
2213
2214 /*
2215  * _aesni_dec4: internal ABI
2216  * input:
2217  *      KEYP:           key struct pointer
2218  *      KLEN:           key length
2219  *      STATE1:         initial state (input)
2220  *      STATE2
2221  *      STATE3
2222  *      STATE4
2223  * output:
2224  *      STATE1:         final state (output)
2225  *      STATE2
2226  *      STATE3
2227  *      STATE4
2228  * changed:
2229  *      KEY
2230  *      TKEYP (T1)
2231  */
2232 SYM_FUNC_START_LOCAL(_aesni_dec4)       /* four-block version of _aesni_dec1: each round key is loaded once and applied to STATE1..STATE4 */
2233         movaps (KEYP), KEY              # key
2234         mov KEYP, TKEYP
2235         pxor KEY, STATE1                # round 0
2236         pxor KEY, STATE2
2237         pxor KEY, STATE3
2238         pxor KEY, STATE4
2239         add $0x30, TKEYP                # TKEYP = KEYP + 0x30: base for the 128-bit case
2240         cmp $24, KLEN                   # KLEN below 24 -> 128-bit, equal -> 192-bit, above -> 256-bit
2241         jb .L4dec128
2242         lea 0x20(TKEYP), TKEYP          # lea does not modify flags, so the je below still tests the cmp above
2243         je .L4dec192
2244         add $0x20, TKEYP                # 256-bit: TKEYP = KEYP + 0x70
2245         movaps -0x60(TKEYP), KEY        # two rounds that only the 256-bit path executes
2246         aesdec KEY, STATE1
2247         aesdec KEY, STATE2
2248         aesdec KEY, STATE3
2249         aesdec KEY, STATE4
2250         movaps -0x50(TKEYP), KEY
2251         aesdec KEY, STATE1
2252         aesdec KEY, STATE2
2253         aesdec KEY, STATE3
2254         aesdec KEY, STATE4
2255 .align 4
2256 .L4dec192:                              # 192-bit entry (TKEYP = KEYP + 0x50); the 256-bit path falls through
2257         movaps -0x40(TKEYP), KEY
2258         aesdec KEY, STATE1
2259         aesdec KEY, STATE2
2260         aesdec KEY, STATE3
2261         aesdec KEY, STATE4
2262         movaps -0x30(TKEYP), KEY
2263         aesdec KEY, STATE1
2264         aesdec KEY, STATE2
2265         aesdec KEY, STATE3
2266         aesdec KEY, STATE4
2267 .align 4
2268 .L4dec128:                              # common tail: nine aesdec rounds and one aesdeclast per state
2269         movaps -0x20(TKEYP), KEY
2270         aesdec KEY, STATE1
2271         aesdec KEY, STATE2
2272         aesdec KEY, STATE3
2273         aesdec KEY, STATE4
2274         movaps -0x10(TKEYP), KEY
2275         aesdec KEY, STATE1
2276         aesdec KEY, STATE2
2277         aesdec KEY, STATE3
2278         aesdec KEY, STATE4
2279         movaps (TKEYP), KEY
2280         aesdec KEY, STATE1
2281         aesdec KEY, STATE2
2282         aesdec KEY, STATE3
2283         aesdec KEY, STATE4
2284         movaps 0x10(TKEYP), KEY
2285         aesdec KEY, STATE1
2286         aesdec KEY, STATE2
2287         aesdec KEY, STATE3
2288         aesdec KEY, STATE4
2289         movaps 0x20(TKEYP), KEY
2290         aesdec KEY, STATE1
2291         aesdec KEY, STATE2
2292         aesdec KEY, STATE3
2293         aesdec KEY, STATE4
2294         movaps 0x30(TKEYP), KEY
2295         aesdec KEY, STATE1
2296         aesdec KEY, STATE2
2297         aesdec KEY, STATE3
2298         aesdec KEY, STATE4
2299         movaps 0x40(TKEYP), KEY
2300         aesdec KEY, STATE1
2301         aesdec KEY, STATE2
2302         aesdec KEY, STATE3
2303         aesdec KEY, STATE4
2304         movaps 0x50(TKEYP), KEY
2305         aesdec KEY, STATE1
2306         aesdec KEY, STATE2
2307         aesdec KEY, STATE3
2308         aesdec KEY, STATE4
2309         movaps 0x60(TKEYP), KEY
2310         aesdec KEY, STATE1
2311         aesdec KEY, STATE2
2312         aesdec KEY, STATE3
2313         aesdec KEY, STATE4
2314         movaps 0x70(TKEYP), KEY
2315         aesdeclast KEY, STATE1          # last round
2316         aesdeclast KEY, STATE2
2317         aesdeclast KEY, STATE3
2318         aesdeclast KEY, STATE4
2319         ret
2320 SYM_FUNC_END(_aesni_dec4)
2321
2322 /*
2323  * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2324  *                    size_t len)
2325  */
2326 SYM_FUNC_START(aesni_ecb_enc)           /* ECB-encrypt every whole 16-byte block of src into dst; a trailing partial block is ignored */
2327         FRAME_BEGIN
2328 #ifndef __x86_64__
2329         pushl LEN                       # 32-bit only: three saved registers plus the return address precede the stack arguments
2330         pushl KEYP
2331         pushl KLEN
2332         movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx
2333         movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
2334         movl (FRAME_OFFSET+24)(%esp), INP       # src
2335         movl (FRAME_OFFSET+28)(%esp), LEN       # len
2336 #endif
2337         test LEN, LEN           # check length
2338         jz .Lecb_enc_ret
2339         mov 480(KEYP), KLEN             # key length, consumed by _aesni_enc1/_aesni_enc4
2340         cmp $16, LEN
2341         jb .Lecb_enc_ret                # less than one block: nothing to do
2342         cmp $64, LEN
2343         jb .Lecb_enc_loop1              # fewer than four blocks: go straight to the single-block loop
2344 .align 4
2345 .Lecb_enc_loop4:                        # invariant: LEN >= 64 (unsigned) on entry to each iteration
2346         movups (INP), STATE1
2347         movups 0x10(INP), STATE2
2348         movups 0x20(INP), STATE3
2349         movups 0x30(INP), STATE4
2350         call _aesni_enc4
2351         movups STATE1, (OUTP)
2352         movups STATE2, 0x10(OUTP)
2353         movups STATE3, 0x20(OUTP)
2354         movups STATE4, 0x30(OUTP)
2355         sub $64, LEN
2356         add $64, INP
2357         add $64, OUTP
2358         cmp $64, LEN
2359         jae .Lecb_enc_loop4             # unsigned, like the jb checks: LEN is a byte count, never negative
2360         cmp $16, LEN
2361         jb .Lecb_enc_ret
2362 .align 4
2363 .Lecb_enc_loop1:                        # invariant: LEN >= 16 (unsigned) on entry to each iteration
2364         movups (INP), STATE1
2365         call _aesni_enc1
2366         movups STATE1, (OUTP)
2367         sub $16, LEN
2368         add $16, INP
2369         add $16, OUTP
2370         cmp $16, LEN
2371         jae .Lecb_enc_loop1             # unsigned, like the jb checks: LEN is a byte count, never negative
2372 .Lecb_enc_ret:
2373 #ifndef __x86_64__
2374         popl KLEN
2375         popl KEYP
2376         popl LEN
2377 #endif
2378         FRAME_END
2379         ret
2380 SYM_FUNC_END(aesni_ecb_enc)
2381
2382 /*
2383  * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2384  *                    size_t len);
2385  */
2386 SYM_FUNC_START(aesni_ecb_dec)           /* ECB-decrypt every whole 16-byte block of src into dst; a trailing partial block is ignored */
2387         FRAME_BEGIN
2388 #ifndef __x86_64__
2389         pushl LEN                       # 32-bit only: three saved registers plus the return address precede the stack arguments
2390         pushl KEYP
2391         pushl KLEN
2392         movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx
2393         movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
2394         movl (FRAME_OFFSET+24)(%esp), INP       # src
2395         movl (FRAME_OFFSET+28)(%esp), LEN       # len
2396 #endif
2397         test LEN, LEN
2398         jz .Lecb_dec_ret
2399         mov 480(KEYP), KLEN             # key length, consumed by _aesni_dec1/_aesni_dec4
2400         add $240, KEYP                  # switch to the decryption key schedule that aesni_set_key writes at ctx+240
2401         cmp $16, LEN
2402         jb .Lecb_dec_ret                # less than one block: nothing to do
2403         cmp $64, LEN
2404         jb .Lecb_dec_loop1              # fewer than four blocks: go straight to the single-block loop
2405 .align 4
2406 .Lecb_dec_loop4:                        # invariant: LEN >= 64 (unsigned) on entry to each iteration
2407         movups (INP), STATE1
2408         movups 0x10(INP), STATE2
2409         movups 0x20(INP), STATE3
2410         movups 0x30(INP), STATE4
2411         call _aesni_dec4
2412         movups STATE1, (OUTP)
2413         movups STATE2, 0x10(OUTP)
2414         movups STATE3, 0x20(OUTP)
2415         movups STATE4, 0x30(OUTP)
2416         sub $64, LEN
2417         add $64, INP
2418         add $64, OUTP
2419         cmp $64, LEN
2420         jae .Lecb_dec_loop4             # unsigned, like the jb checks: LEN is a byte count, never negative
2421         cmp $16, LEN
2422         jb .Lecb_dec_ret
2423 .align 4
2424 .Lecb_dec_loop1:                        # invariant: LEN >= 16 (unsigned) on entry to each iteration
2425         movups (INP), STATE1
2426         call _aesni_dec1
2427         movups STATE1, (OUTP)
2428         sub $16, LEN
2429         add $16, INP
2430         add $16, OUTP
2431         cmp $16, LEN
2432         jae .Lecb_dec_loop1             # unsigned, like the jb checks: LEN is a byte count, never negative
2433 .Lecb_dec_ret:
2434 #ifndef __x86_64__
2435         popl KLEN
2436         popl KEYP
2437         popl LEN
2438 #endif
2439         FRAME_END
2440         ret
2441 SYM_FUNC_END(aesni_ecb_dec)
2442
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 *
 * CBC encryption, one 16-byte block at a time.  STATE carries the chaining
 * value: it starts as *iv, each input block is XORed into it, it is run
 * through _aesni_enc1 and the result is both stored at dst and kept as the
 * chaining value for the next block.  The final chaining value is written
 * back to *iv.  When len < 16 nothing is processed and *iv is not touched;
 * a tail shorter than 16 bytes is ignored.
 */
SYM_FUNC_START(aesni_cbc_enc)
        FRAME_BEGIN
#ifndef __x86_64__
        /* 32-bit: save the registers used below, then load the stack args */
        pushl IVP
        pushl LEN
        pushl KEYP
        pushl KLEN
        movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
        movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
        movl (FRAME_OFFSET+28)(%esp), INP       # src
        movl (FRAME_OFFSET+32)(%esp), LEN       # len
        movl (FRAME_OFFSET+36)(%esp), IVP       # iv
#endif
        cmp $16, LEN
        jb .Lcbc_enc_ret                # less than one whole block
        mov 480(KEYP), KLEN             # NOTE(review): presumably ctx->key_length, confirm
        movups (IVP), STATE     # load iv as initial state
.align 4
.Lcbc_enc_loop:
        movups (INP), IN        # load input
        pxor IN, STATE          # chain: input ^ previous output (or iv)
        call _aesni_enc1
        movups STATE, (OUTP)    # store output
        sub $16, LEN
        add $16, INP
        add $16, OUTP
        cmp $16, LEN
        jae .Lcbc_enc_loop      # LEN is unsigned: unsigned condition, as at entry
        movups STATE, (IVP)     # hand the last output block back as next iv
.Lcbc_enc_ret:
#ifndef __x86_64__
        popl KLEN
        popl KEYP
        popl LEN
        popl IVP
#endif
        FRAME_END
        ret
SYM_FUNC_END(aesni_cbc_enc)
2486
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 *
 * CBC decryption: each 16-byte input block is run through
 * _aesni_dec4/_aesni_dec1 and the result is XORed with the previous input
 * block (with *iv for the very first block) before being stored at dst.
 * Groups of 64 bytes are handled four blocks at a time, the rest one block
 * at a time.  The last input block processed is written back to *iv.  When
 * len < 16 nothing is processed and *iv is not touched; a tail shorter than
 * 16 bytes is ignored.
 */
SYM_FUNC_START(aesni_cbc_dec)
        FRAME_BEGIN
#ifndef __x86_64__
        /* 32-bit: save the registers used below, then load the stack args */
        pushl IVP
        pushl LEN
        pushl KEYP
        pushl KLEN
        movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
        movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
        movl (FRAME_OFFSET+28)(%esp), INP       # src
        movl (FRAME_OFFSET+32)(%esp), LEN       # len
        movl (FRAME_OFFSET+36)(%esp), IVP       # iv
#endif
        cmp $16, LEN
        jb .Lcbc_dec_just_ret           # less than one block: leave *iv alone
        mov 480(KEYP), KLEN             # NOTE(review): presumably ctx->key_length, confirm
        add $240, KEYP                  # NOTE(review): presumably the decryption key schedule, confirm
        movups (IVP), IV
        cmp $64, LEN
        jb .Lcbc_dec_loop1              # fewer than four blocks
.align 4
.Lcbc_dec_loop4:
        /* keep a copy of every input block: it is the XOR mask of the next one */
        movups (INP), IN1
        movaps IN1, STATE1
        movups 0x10(INP), IN2
        movaps IN2, STATE2
#ifdef __x86_64__
        movups 0x20(INP), IN3
        movaps IN3, STATE3
        movups 0x30(INP), IN4
        movaps IN4, STATE4
#else
        /* this build only uses IN1/IN2: they now hold input blocks 2 and 3 */
        movups 0x20(INP), IN1
        movaps IN1, STATE3
        movups 0x30(INP), IN2
        movaps IN2, STATE4
#endif
        call _aesni_dec4
        pxor IV, STATE1
#ifdef __x86_64__
        pxor IN1, STATE2
        pxor IN2, STATE3
        pxor IN3, STATE4
        movaps IN4, IV                  # last input block chains into the next round
#else
        pxor IN1, STATE4                # IN1 = input block 2
        movaps IN2, IV                  # IN2 = input block 3, the next chaining value
        movups (INP), IN1               # reload input block 0 (nothing stored to OUTP yet)
        pxor IN1, STATE2
        movups 0x10(INP), IN2           # reload input block 1
        pxor IN2, STATE3
#endif
        movups STATE1, (OUTP)
        movups STATE2, 0x10(OUTP)
        movups STATE3, 0x20(OUTP)
        movups STATE4, 0x30(OUTP)
        sub $64, LEN
        add $64, INP
        add $64, OUTP
        cmp $64, LEN
        jae .Lcbc_dec_loop4             # LEN is unsigned: unsigned condition, as at entry
        cmp $16, LEN
        jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
        movups (INP), IN
        movaps IN, STATE
        call _aesni_dec1
        pxor IV, STATE
        movups STATE, (OUTP)
        movaps IN, IV                   # this input block masks the next one
        sub $16, LEN
        add $16, INP
        add $16, OUTP
        cmp $16, LEN
        jae .Lcbc_dec_loop1             # LEN is unsigned: unsigned condition, as at entry
.Lcbc_dec_ret:
        movups IV, (IVP)                # hand the last input block back as next iv
.Lcbc_dec_just_ret:
#ifndef __x86_64__
        popl KLEN
        popl KEYP
        popl LEN
        popl IVP
#endif
        FRAME_END
        ret
SYM_FUNC_END(aesni_cbc_dec)
2579
2580 #ifdef __x86_64__
.pushsection .rodata
.align 16
/* pshufb control that reverses the order of all 16 bytes of a register */
.Lbswap_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8
        .byte 7, 6, 5, 4, 3, 2, 1, 0
.popsection
2586
/*
 * _aesni_inc_init:     internal ABI
 *      setup registers used by _aesni_inc
 * input:
 *      IV
 * output:
 *      CTR:    == IV, in little endian
 *      TCTR_LOW: == lower qword of CTR
 *      INC:    == 1, in little endian
 *      BSWAP_MASK == endian swapping mask
 */
SYM_FUNC_START_LOCAL(_aesni_inc_init)
        /* RIP-relative load: no absolute relocation needed for the constant */
        movaps .Lbswap_mask(%rip), BSWAP_MASK
        movaps IV, CTR
        pshufb BSWAP_MASK, CTR          # byte-reverse the copy of IV
        mov $1, TCTR_LOW
        movq TCTR_LOW, INC              # INC = 1, upper qword cleared by movq
        movq CTR, TCTR_LOW              # TCTR_LOW = low qword of CTR
        ret
SYM_FUNC_END(_aesni_inc_init)
2607
/*
 * _aesni_inc:          internal ABI
 *      Step the big endian counter held in IV to its next value
 * input:
 *      IV
 *      CTR:    == IV, in little endian
 *      TCTR_LOW: == lower qword of CTR
 *      INC:    == 1, in little endian
 *      BSWAP_MASK == endian swapping mask
 * output:
 *      IV:     previous value plus 1
 * changed:
 *      CTR:    == output IV, in little endian
 *      TCTR_LOW: == lower qword of CTR
 */
SYM_FUNC_START_LOCAL(_aesni_inc)
        add $1, TCTR_LOW                # CF is set when the low qword wraps
        paddq INC, CTR                  # low qword of CTR += 1, flags untouched
        jnc .Laesni_inc_no_carry
        /* carry into the high qword: use INC shifted up by 8 bytes, then restore it */
        pslldq $8, INC
        paddq INC, CTR
        psrldq $8, INC
.Laesni_inc_no_carry:
        movaps CTR, IV
        pshufb BSWAP_MASK, IV           # byte-reverse back for IV
        ret
SYM_FUNC_END(_aesni_inc)
2635
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 *
 * CTR mode: for every 16-byte block the current counter value (IV) is run
 * through _aesni_enc4/_aesni_enc1 and the result is XORed with the input
 * block to give the output block; _aesni_inc steps the counter after each
 * use.  Groups of 64 bytes are handled four blocks at a time, the rest one
 * block at a time.  The counter following the last one used is written
 * back to *iv.  When len < 16 nothing is processed and *iv is not touched;
 * a tail shorter than 16 bytes is ignored.
 */
SYM_FUNC_START(aesni_ctr_enc)
        FRAME_BEGIN
        cmp $16, LEN
        jb .Lctr_enc_just_ret           # less than one block: leave *iv alone
        mov 480(KEYP), KLEN             # NOTE(review): presumably ctx->key_length, confirm
        movups (IVP), IV
        call _aesni_inc_init
        cmp $64, LEN
        jb .Lctr_enc_loop1              # fewer than four blocks
.align 4
.Lctr_enc_loop4:
        /* four consecutive counter values go to STATE1..4, inputs to IN1..4 */
        movaps IV, STATE1
        call _aesni_inc
        movups (INP), IN1
        movaps IV, STATE2
        call _aesni_inc
        movups 0x10(INP), IN2
        movaps IV, STATE3
        call _aesni_inc
        movups 0x20(INP), IN3
        movaps IV, STATE4
        call _aesni_inc
        movups 0x30(INP), IN4
        call _aesni_enc4
        pxor IN1, STATE1
        movups STATE1, (OUTP)
        pxor IN2, STATE2
        movups STATE2, 0x10(OUTP)
        pxor IN3, STATE3
        movups STATE3, 0x20(OUTP)
        pxor IN4, STATE4
        movups STATE4, 0x30(OUTP)
        sub $64, LEN
        add $64, INP
        add $64, OUTP
        cmp $64, LEN
        jae .Lctr_enc_loop4             # LEN is unsigned: unsigned condition, as at entry
        cmp $16, LEN
        jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
        movaps IV, STATE
        call _aesni_inc
        movups (INP), IN
        call _aesni_enc1
        pxor IN, STATE
        movups STATE, (OUTP)
        sub $16, LEN
        add $16, INP
        add $16, OUTP
        cmp $16, LEN
        jae .Lctr_enc_loop1             # LEN is unsigned: unsigned condition, as at entry
.Lctr_enc_ret:
        movups IV, (IVP)                # hand the next counter value back to the caller
.Lctr_enc_just_ret:
        FRAME_END
        ret
SYM_FUNC_END(aesni_ctr_enc)
2698
/*
 * _aesni_gf128mul_x_ble:               internal ABI
 *      Multiply in GF(2^128) for XTS IVs
 * input:
 *      IV:     current IV
 *      GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *      IV:     next IV
 * changed:
 *      CTR:    == temporary value
 *
 * paddq doubles the two 64-bit halves of IV independently, so the bit that
 * falls out of the top of each half has to be patched back in.  pshufd
 * copies the top dword of each half into CTR (dword 3 to slot 0, dword 1 to
 * slot 2), psrad turns each sign bit into an all-ones/all-zeroes dword and
 * the mask keeps only the correction bits, which the final pxor folds into
 * the doubled value.
 */
#define _aesni_gf128mul_x_ble() \
        pshufd $0x13, IV, CTR; \
        psrad $31, CTR; \
        pand GF128MUL_MASK, CTR; \
        paddq IV, IV; \
        pxor CTR, IV;
2716
/*
 * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst,
 *                       const u8 *src, bool enc, le128 *iv)
 *
 * Process exactly eight 16-byte blocks (offsets 0x00..0x70) in two batches
 * of four.  For every block the current tweak (IV) is XORed into the input
 * and parked in the matching dst slot; after the cipher call the parked
 * tweak is read back, XORed into the result and replaced by the final
 * output.  _aesni_gf128mul_x_ble() advances the tweak between blocks, and
 * the tweak following the eighth block is written back to *iv.
 *
 * 'enc' selects the helper: non-zero calls _aesni_enc4 with KEYP unchanged,
 * zero calls _aesni_dec4 with KEYP advanced by 240.
 */
SYM_FUNC_START(aesni_xts_crypt8)
        FRAME_BEGIN
        cmpb $0, %cl                    # ZF = !enc
        movl $0, %ecx                   # mov, not xor: ZF must survive for the cmovs
        movl $240, %r10d
        leaq _aesni_enc4(%rip), %r11    # RIP-relative: no absolute relocation
        leaq _aesni_dec4(%rip), %rax
        cmovel %r10d, %ecx              # decrypt: key offset 240
        cmoveq %rax, %r11               # decrypt: call _aesni_dec4

        movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
        movups (IVP), IV

        mov 480(KEYP), KLEN             # read before KEYP is moved
        addq %rcx, KEYP

        /* first batch, blocks 0..3: STATEn = src ^ tweak, tweak parked in dst */
        movdqa IV, STATE1
        movdqu 0x00(INP), INC
        pxor INC, STATE1
        movdqu IV, 0x00(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE2
        movdqu 0x10(INP), INC
        pxor INC, STATE2
        movdqu IV, 0x10(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE3
        movdqu 0x20(INP), INC
        pxor INC, STATE3
        movdqu IV, 0x20(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE4
        movdqu 0x30(INP), INC
        pxor INC, STATE4
        movdqu IV, 0x30(OUTP)

        CALL_NOSPEC r11

        /*
         * Finish blocks 0..3 (result ^ parked tweak) while setting up
         * blocks 4..7 in the STATE registers as they become free.
         */
        movdqu 0x00(OUTP), INC
        pxor INC, STATE1
        movdqu STATE1, 0x00(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE1
        movdqu 0x40(INP), INC
        pxor INC, STATE1
        movdqu IV, 0x40(OUTP)

        movdqu 0x10(OUTP), INC
        pxor INC, STATE2
        movdqu STATE2, 0x10(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE2
        movdqu 0x50(INP), INC
        pxor INC, STATE2
        movdqu IV, 0x50(OUTP)

        movdqu 0x20(OUTP), INC
        pxor INC, STATE3
        movdqu STATE3, 0x20(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE3
        movdqu 0x60(INP), INC
        pxor INC, STATE3
        movdqu IV, 0x60(OUTP)

        movdqu 0x30(OUTP), INC
        pxor INC, STATE4
        movdqu STATE4, 0x30(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE4
        movdqu 0x70(INP), INC
        pxor INC, STATE4
        movdqu IV, 0x70(OUTP)

        /* tweak for the block after these eight goes back to the caller */
        _aesni_gf128mul_x_ble()
        movups IV, (IVP)

        CALL_NOSPEC r11

        /* finish blocks 4..7: result ^ parked tweak */
        movdqu 0x40(OUTP), INC
        pxor INC, STATE1
        movdqu STATE1, 0x40(OUTP)

        movdqu 0x50(OUTP), INC
        pxor INC, STATE2
        movdqu STATE2, 0x50(OUTP)

        movdqu 0x60(OUTP), INC
        pxor INC, STATE3
        movdqu STATE3, 0x60(OUTP)

        movdqu 0x70(OUTP), INC
        pxor INC, STATE4
        movdqu STATE4, 0x70(OUTP)

        FRAME_END
        ret
SYM_FUNC_END(aesni_xts_crypt8)
2826
2827 #endif