arch/x86/crypto/aesni-intel_asm.S

   1 /* SPDX-License-Identifier: GPL-2.0-or-later */
   2 /*
   3  * Implement AES algorithm in Intel AES-NI instructions.
   4  *
   5  * The white paper of AES-NI instructions can be downloaded from:
   6  *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
   7  *
   8  * Copyright (C) 2008, Intel Corp.
   9  *    Author: Huang Ying <ying.huang@intel.com>
  10  *            Vinodh Gopal <vinodh.gopal@intel.com>
  11  *            Kahraman Akdemir
  12  *
  13  * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
  14  * interface for 64-bit kernels.
  15  *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
  16  *             Aidan O'Mahony (aidan.o.mahony@intel.com)
  17  *             Adrian Hoban <adrian.hoban@intel.com>
  18  *             James Guilford (james.guilford@intel.com)
  19  *             Gabriele Paoloni <gabriele.paoloni@intel.com>
  20  *             Tadeusz Struk (tadeusz.struk@intel.com)
  21  *             Wajdi Feghali (wajdi.k.feghali@intel.com)
  22  *    Copyright (c) 2010, Intel Corporation.
  23  *
  24  * Ported x86_64 version to x86:
  25  *    Author: Mathias Krause <minipli@googlemail.com>
  26  */
  27
  28 #include <linux/linkage.h>
  29 #include <asm/frame.h>
  30 #include <asm/nospec-branch.h>
  31
  32 /*
  33  * The following macros are used to move an (un)aligned 16 byte value to/from
  34  * an XMM register.  This can be done for either FP or integer values: for FP
  35  * use movaps (move aligned packed single), for integer use movdqa (move double
  36  * quad aligned).  It doesn't make a performance difference which instruction is
  37  * used since Nehalem (original Core i7) was released.  However, movaps is a
  38  * byte shorter, so that is the one we'll use for now (same for unaligned).
  39  */
  40 #define MOVADQ  movaps  /* aligned 16-byte move */
  41 #define MOVUDQ  movups  /* unaligned 16-byte move */
  42
  43 #ifdef __x86_64__
  44
  45 # constants in mergeable sections, linker can reorder and merge
  46 .section        .rodata.cst16.POLY, "aM", @progbits, 16
  47 .align 16
  48 POLY:   .octa 0xC2000000000000000000000000000001  # xor-ed into HashKey<<1 on carry (PRECOMPUTE)
  49 .section        .rodata.cst16.TWOONE, "aM", @progbits, 16
  50 .align 16
  51 TWOONE: .octa 0x00000001000000000000000000000001  # pcmpeqd operand for the carry test in PRECOMPUTE
  52
  53 .section        .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
  54 .align 16
  55 SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F  # pshufb mask reversing all 16 bytes
  56 .section        .rodata.cst16.MASK1, "aM", @progbits, 16
  57 .align 16
  58 MASK1:      .octa 0x0000000000000000ffffffffffffffff  # low 64 bits set; no user in the visible part
  59 .section        .rodata.cst16.MASK2, "aM", @progbits, 16
  60 .align 16
  61 MASK2:      .octa 0xffffffffffffffff0000000000000000  # high 64 bits set; no user in the visible part
  62 .section        .rodata.cst16.ONE, "aM", @progbits, 16
  63 .align 16
  64 ONE:        .octa 0x00000000000000000000000000000001  # counter increment (paddd)
  65 .section        .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
  66 .align 16
     # NOTE(review): the literal below has 31 hex digits, so the top nibble is
     # implicitly zero -- confirm this is intended.  No user in the visible part.
  67 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
  68 .section        .rodata.cst16.dec, "aM", @progbits, 16
  69 .align 16
  70 dec:        .octa 0x1  # data symbol; not referenced in the visible part
  71 .section        .rodata.cst16.enc, "aM", @progbits, 16
  72 .align 16
  73 enc:        .octa 0x2  # data symbol; not referenced in the visible part
  74
  75 # order of these constants should not change.
  76 # more specifically, ALL_F should follow SHIFT_MASK,
  77 # and zero should follow ALL_F
     # (the macros do 16-byte loads at byte offsets into SHIFT_MASK and ALL_F
     # that run on into the following constant)
  78 .section        .rodata, "a", @progbits
  79 .align 16
  80 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  81 ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
  82             .octa 0x00000000000000000000000000000000
  83
  84 .text
  85
  86
  87 #define STACK_OFFSET    8*3     // bytes pushed by FUNC_SAVE (three 8-byte registers)
  88
     // Byte offsets into the context data addressed through %arg2.
  89 #define AadHash 16*0            // running GHASH value
  90 #define AadLen 16*1             // AAD length in bytes
  91 #define InLen (16*1)+8          // number of data bytes processed so far
  92 #define PBlockEncKey 16*2       // E(K, Yn) kept for a pending partial block
  93 #define OrigIV 16*3             // counter block as passed to GCM_INIT (Y0)
  94 #define CurCount 16*4           // current counter block, byte-swapped
  95 #define PBlockLen 16*5          // bytes already used of the pending partial block
  96 #define HashKey         16*6    // store HashKey <<1 mod poly here
  97 #define HashKey_2       16*7    // store HashKey^2 <<1 mod poly here
  98 #define HashKey_3       16*8    // store HashKey^3 <<1 mod poly here
  99 #define HashKey_4       16*9    // store HashKey^4 <<1 mod poly here
 100 #define HashKey_k       16*10   // store XOR of High 64 bits and Low 64
 101                                 // bits of  HashKey <<1 mod poly here
 102                                 //(for Karatsuba purposes)
 103 #define HashKey_2_k     16*11   // store XOR of High 64 bits and Low 64
 104                                 // bits of  HashKey^2 <<1 mod poly here
 105                                 // (for Karatsuba purposes)
 106 #define HashKey_3_k     16*12   // store XOR of High 64 bits and Low 64
 107                                 // bits of  HashKey^3 <<1 mod poly here
 108                                 // (for Karatsuba purposes)
 109 #define HashKey_4_k     16*13   // store XOR of High 64 bits and Low 64
 110                                 // bits of  HashKey^4 <<1 mod poly here
 111                                 // (for Karatsuba purposes)
 112
     // Argument aliases.  arg1-arg6 are bare register names, written %arg1 etc.
     // at the point of use.  arg7 and up are stack slots: STACK_OFFSET skips the
     // FUNC_SAVE pushes; the additional 8 presumably skips the return address
     // (TODO confirm against the callers).
 113 #define arg1 rdi
 114 #define arg2 rsi
 115 #define arg3 rdx
 116 #define arg4 rcx
 117 #define arg5 r8
 118 #define arg6 r9
 119 #define arg7 STACK_OFFSET+8(%rsp)
 120 #define arg8 STACK_OFFSET+16(%rsp)
 121 #define arg9 STACK_OFFSET+24(%rsp)
 122 #define arg10 STACK_OFFSET+32(%rsp)
 123 #define arg11 STACK_OFFSET+40(%rsp)
 124 #define keysize 2*15*16(%arg1)  // value at byte offset 480 from arg1, read as 32 bits to derive the AES round count
 125 #endif
 126
 127
 128 #define STATE1  %xmm0   // XMM aliases below: their users are not in the visible part of this file
 129 #define STATE2  %xmm4
 130 #define STATE3  %xmm5
 131 #define STATE4  %xmm6
 132 #define STATE   STATE1
 133 #define IN1     %xmm1
 134 #define IN2     %xmm7
 135 #define IN3     %xmm8
 136 #define IN4     %xmm9
 137 #define IN      IN1
 138 #define KEY     %xmm2
 139 #define IV      %xmm3
 140
 141 #define BSWAP_MASK %xmm10
 142 #define CTR     %xmm11
 143 #define INC     %xmm12
 144
 145 #define GF128MUL_MASK %xmm7     // same register as IN2
 146
 147 #ifdef __x86_64__
 148 #define AREG    %rax
 149 #define KEYP    %rdi            // same register as arg1
 150 #define OUTP    %rsi            // same register as arg2
 151 #define UKEYP   OUTP
 152 #define INP     %rdx            // same register as arg3
 153 #define LEN     %rcx            // same register as arg4
 154 #define IVP     %r8             // same register as arg5
 155 #define KLEN    %r9d            // low 32 bits of the arg6 register
 156 #define T1      %r10
 157 #define TKEYP   T1
 158 #define T2      %r11
 159 #define TCTR_LOW T2
 160 #else
     // 32-bit build: OUTP and UKEYP both alias AREG (%eax); T2 and TCTR_LOW
     // are not defined in this branch.
 161 #define AREG    %eax
 162 #define KEYP    %edi
 163 #define OUTP    AREG
 164 #define UKEYP   OUTP
 165 #define INP     %edx
 166 #define LEN     %esi
 167 #define IVP     %ebp
 168 #define KLEN    %ebx
 169 #define T1      %ecx
 170 #define TKEYP   T1
 171 #endif
 172
 173 .macro FUNC_SAVE
             # Pushes %r12, %r13 and %r14.  STACK_OFFSET (8*3) accounts for
             # these three pushes when the stack arguments arg7.. are addressed.
             # Must be paired with FUNC_RESTORE, which pops in reverse order.
 174         push    %r12
 175         push    %r13
 176         push    %r14
 177 #
 178 # states of %xmm registers %xmm6:%xmm15 not saved
 179 # all %xmm registers are clobbered
 180 #
 181 .endm
 182
 183
 184 .macro FUNC_RESTORE
             # Pops the registers pushed by FUNC_SAVE, in reverse push order.
 185         pop     %r14
 186         pop     %r13
 187         pop     %r12
 188 .endm
 189
 190 # Precompute hashkeys.
 191 # Input: Hash subkey.
 192 # Output: HashKeys stored in gcm_context_data.  Only needs to be called
 193 # once per key.
 194 # clobbers r12, and tmp xmm registers.
     # SUBKEY is an operand holding a pointer to the 16-byte hash subkey.
     # Results are written through %arg2 to HashKey..HashKey_4 and
     # HashKey_k..HashKey_4_k.  TMP1-TMP7 are XMM scratch registers.
 195 .macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
 196         mov     \SUBKEY, %r12
 197         movdqu  (%r12), \TMP3
 198         movdqa  SHUF_MASK(%rip), \TMP2
 199         pshufb  \TMP2, \TMP3            # byte-swap the subkey
 200
 201         # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
 202
 203         movdqa  \TMP3, \TMP2
 204         psllq   $1, \TMP3               # shift each 64-bit half left by one
 205         psrlq   $63, \TMP2              # TMP2 = bit shifted out of each half
 206         movdqa  \TMP2, \TMP1
 207         pslldq  $8, \TMP2               # carry of the low half moves to the high half
 208         psrldq  $8, \TMP1               # TMP1 = carry out of the high half (bit 127)
 209         por     \TMP2, \TMP3            # TMP3 = HashKey<<1 as a 128-bit value
 210
 211         # reduce HashKey<<1
 212
             # build an all-ones mask when the carry in TMP1 is set, and with
             # POLY, and xor the result in
 213         pshufd  $0x24, \TMP1, \TMP2
 214         pcmpeqd TWOONE(%rip), \TMP2
 215         pand    POLY(%rip), \TMP2
 216         pxor    \TMP2, \TMP3
 217         movdqu  \TMP3, HashKey(%arg2)
 218
 219         movdqa     \TMP3, \TMP5
 220         pshufd     $78, \TMP3, \TMP1    # swap the two 64-bit halves
 221         pxor       \TMP3, \TMP1         # TMP1 = high half xor low half
 222         movdqu     \TMP1, HashKey_k(%arg2)
 223
 224         GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 225 # TMP5 = HashKey^2<<1 (mod poly)
 226         movdqu     \TMP5, HashKey_2(%arg2)
 227 # HashKey_2 = HashKey^2<<1 (mod poly)
 228         pshufd     $78, \TMP5, \TMP1
 229         pxor       \TMP5, \TMP1
 230         movdqu     \TMP1, HashKey_2_k(%arg2)
 231
 232         GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 233 # TMP5 = HashKey^3<<1 (mod poly)
 234         movdqu     \TMP5, HashKey_3(%arg2)
 235         pshufd     $78, \TMP5, \TMP1
 236         pxor       \TMP5, \TMP1
 237         movdqu     \TMP1, HashKey_3_k(%arg2)
 238
 239         GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 240 # TMP5 = HashKey^4<<1 (mod poly)
 241         movdqu     \TMP5, HashKey_4(%arg2)
 242         pshufd     $78, \TMP5, \TMP1
 243         pxor       \TMP5, \TMP1
 244         movdqu     \TMP1, HashKey_4_k(%arg2)
 245 .endm
 246
 247 # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
 248 # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
     # NOTE(review): %xmm7 is also handed to PRECOMPUTE as a temporary and
     # CALC_AAD_HASH loads SHUF_MASK into %xmm14, so both are written as well.
     # Operands: Iv = pointer to the 16-byte initial counter block, SUBKEY =
     # pointer to the hash subkey, AAD = pointer to the AAD, AADLEN = AAD
     # length in bytes.  The context data is addressed through %arg2.
 249 .macro GCM_INIT Iv SUBKEY AAD AADLEN
 250         mov \AADLEN, %r11
 251         mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
 252         xor %r11d, %r11d
 253         mov %r11, InLen(%arg2) # ctx_data.in_length = 0
 254         mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
             # NOTE(review): the next store is 8 bytes wide, so bytes 8-15 of
             # the 16-byte PBlockEncKey slot are not written here.
 255         mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
 256         mov \Iv, %rax
 257         movdqu (%rax), %xmm0
 258         movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
 259
 260         movdqa  SHUF_MASK(%rip), %xmm2
 261         pshufb %xmm2, %xmm0     # byte-swap the counter block
 262         movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
 263
             # fill in HashKey..HashKey_4 and the matching _k values
 264         PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
 265         movdqu HashKey(%arg2), %xmm13
 266
             # hash the AAD; the result is stored to AadHash(%arg2)
 267         CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
 268         %xmm4, %xmm5, %xmm6
 269 .endm
 270
 271 # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
 272 # struct has been initialized by GCM_INIT.
 273 # Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
 274 # Clobbers rax, r10-r13, and xmm0-xmm15
     # \operation is the literal enc or dec and selects code through .ifc.
     # Register roles as used below: %arg2 = context data, %arg3 = output
     # buffer, %arg4 = input buffer, %arg5 = input length in bytes (modified),
     # %r11 = byte offset into input and output, %xmm8 = running hash,
     # %xmm13 = HashKey, %xmm0 = counter block.
 275 .macro GCM_ENC_DEC operation
 276         movdqu AadHash(%arg2), %xmm8
 277         movdqu HashKey(%arg2), %xmm13
 278         add %arg5, InLen(%arg2)         # ctx_data.in_length += length
 279
 280         xor %r11d, %r11d # initialise the data pointer offset as zero
             # finish a partial block left by a previous call, if any;
             # advances %r11 by the number of bytes it consumed
 281         PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
 282
 283         sub %r11, %arg5         # sub partial block data used
 284         mov %arg5, %r13         # save the number of bytes
 285
 286         and $-16, %r13          # %r13 = %r13 - (%r13 mod 16)
 287         mov %r13, %r12
 288         # Encrypt/Decrypt first few blocks
 289
             # %r12 = 16 * (number of whole blocks mod 4)
 290         and     $(3<<4), %r12
 291         jz      _initial_num_blocks_is_0_\@
 292         cmp     $(2<<4), %r12
 293         jb      _initial_num_blocks_is_1_\@
 294         je      _initial_num_blocks_is_2_\@
 295 _initial_num_blocks_is_3_\@:
 296         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 297 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
 298         sub     $48, %r13
 299         jmp     _initial_blocks_\@
 300 _initial_num_blocks_is_2_\@:
 301         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 302 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
 303         sub     $32, %r13
 304         jmp     _initial_blocks_\@
 305 _initial_num_blocks_is_1_\@:
 306         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 307 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
 308         sub     $16, %r13
 309         jmp     _initial_blocks_\@
 310 _initial_num_blocks_is_0_\@:
 311         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 312 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
 313 _initial_blocks_\@:
 314
 315         # Main loop - Encrypt/Decrypt remaining blocks
 316
             # %r13 = bytes of whole blocks still to process (a multiple of 64)
 317         test    %r13, %r13
 318         je      _zero_cipher_left_\@
 319         sub     $64, %r13
 320         je      _four_cipher_left_\@
 321 _crypt_by_4_\@:
             # NOTE(review): the last argument is the literal enc for both
             # operations; the macro it is passed to is outside the visible
             # part of this file.
 322         GHASH_4_ENCRYPT_4_PARALLEL_\operation   %xmm9, %xmm10, %xmm11, %xmm12, \
 323         %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
 324         %xmm7, %xmm8, enc
 325         add     $64, %r11
 326         sub     $64, %r13
 327         jne     _crypt_by_4_\@
 328 _four_cipher_left_\@:
 329         GHASH_LAST_4    %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
 330 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
 331 _zero_cipher_left_\@:
 332         movdqu %xmm8, AadHash(%arg2)    # save the running hash
 333         movdqu %xmm0, CurCount(%arg2)   # save the current counter
 334
 335         mov     %arg5, %r13
 336         and     $15, %r13                       # %r13 = arg5 (mod 16)
 337         je      _multiple_of_16_bytes_\@
 338
 339         mov %r13, PBlockLen(%arg2)      # remember the size of the trailing partial block
 340
 341         # Handle the last <16 Byte block separately
 342         paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
 343         movdqu %xmm0, CurCount(%arg2)
 344         movdqa SHUF_MASK(%rip), %xmm10
 345         pshufb %xmm10, %xmm0
 346
 347         ENCRYPT_SINGLE_BLOCK    %xmm0, %xmm1        # Encrypt(K, Yn)
 348         movdqu %xmm0, PBlockEncKey(%arg2)       # kept so a later call can finish this block
 349
             # With at least 16 bytes of data, the tail is fetched with one
             # 16-byte load that ends at the last input byte; otherwise it is
             # read byte-wise so nothing outside the buffer is touched.
 350         cmp     $16, %arg5
 351         jge _large_enough_update_\@
 352
 353         lea (%arg4,%r11,1), %r10
 354         mov %r13, %r12
 355         READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
 356         jmp _data_read_\@
 357
 358 _large_enough_update_\@:
 359         sub     $16, %r11
 360         add     %r13, %r11              # %r11 = offset of the last 16 input bytes
 361
 362         # receive the last <16 Byte block
 363         movdqu  (%arg4, %r11, 1), %xmm1
 364
 365         sub     %r13, %r11
 366         add     $16, %r11               # restore %r11 to the start of the tail
 367
 368         lea     SHIFT_MASK+16(%rip), %r12
 369         # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
 370         # (r13 is the number of bytes in plaintext mod 16)
 371         sub     %r13, %r12
 372         # get the appropriate shuffle mask
 373         movdqu  (%r12), %xmm2
 374         # shift right 16-r13 bytes
 375         pshufb  %xmm2, %xmm1
 376
 377 _data_read_\@:
             # %r12 -> 16 bytes whose low r13 bytes are 0xff and the rest zero
 378         lea ALL_F+16(%rip), %r12
 379         sub %r13, %r12
 380
 381 .ifc \operation, dec
 382         movdqa  %xmm1, %xmm2            # keep the ciphertext for hashing
 383 .endif
 384         pxor    %xmm1, %xmm0            # XOR Encrypt(K, Yn)
 385         movdqu  (%r12), %xmm1
 386         # get the appropriate mask to mask out top 16-r13 bytes of xmm0
 387         pand    %xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
 388 .ifc \operation, dec
 389         pand    %xmm1, %xmm2            # same mask on the saved ciphertext
 390         movdqa SHUF_MASK(%rip), %xmm10
 391         pshufb %xmm10 ,%xmm2
 392
 393         pxor %xmm2, %xmm8               # hash input is the ciphertext
 394 .else
 395         movdqa SHUF_MASK(%rip), %xmm10
 396         pshufb %xmm10,%xmm0
 397
 398         pxor    %xmm0, %xmm8            # hash input is the ciphertext just produced
 399 .endif
 400
             # Only the xor is done here; no GHASH_MUL is applied to this
             # partial block in this macro (PARTIAL_BLOCK and GCM_COMPLETE
             # multiply later, depending on PBlockLen).
 401         movdqu %xmm8, AadHash(%arg2)
 402 .ifc \operation, enc
 403         # the hash was updated above; undo the byte swap of xmm0
 404         movdqa SHUF_MASK(%rip), %xmm10
 405         # shuffle xmm0 back to output as ciphertext
 406         pshufb %xmm10, %xmm0
 407 .endif
 408
 409         # Output %r13 bytes
             # more than 8 bytes: store the low 8 at once, then the remainder
             # one byte at a time
 410         movq %xmm0, %rax
 411         cmp $8, %r13
 412         jle _less_than_8_bytes_left_\@
 413         mov %rax, (%arg3 , %r11, 1)
 414         add $8, %r11
 415         psrldq $8, %xmm0
 416         movq %xmm0, %rax
 417         sub $8, %r13
 418 _less_than_8_bytes_left_\@:
 419         mov %al,  (%arg3, %r11, 1)
 420         add $1, %r11
 421         shr $8, %rax
 422         sub $1, %r13
 423         jne _less_than_8_bytes_left_\@
 424 _multiple_of_16_bytes_\@:
 425 .endm
 426
 427 # GCM_COMPLETE Finishes update of tag of last partial block
 428 # Output: Authorization Tag (AUTH_TAG)
 429 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
     # AUTHTAG is an operand holding the address the tag is stored to and
     # AUTHTAGLEN an operand holding the number of tag bytes to store.
     # NOTE(review): a length below 4 still reaches _T_4_ and stores 4 bytes
     # (plus one more at _T_1_) -- confirm callers never pass such a length.
 430 .macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
 431         movdqu AadHash(%arg2), %xmm8
 432         movdqu HashKey(%arg2), %xmm13
 433
 434         mov PBlockLen(%arg2), %r12
 435
 436         test %r12, %r12
 437         je _partial_done\@
 438
             # a partial block is pending: its data is already xor-ed into the
             # hash, so only the multiply remains
 439         GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
 440
 441 _partial_done\@:
 442         mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
 443         shl     $3, %r12                  # convert into number of bits
             # NOTE(review): movd copies only the low 32 bits of the AAD bit length
 444         movd    %r12d, %xmm15             # len(A) in %xmm15
 445         mov InLen(%arg2), %r12
 446         shl     $3, %r12                  # len(C) in bits (bytes * 8)
 447         movq    %r12, %xmm1
 448
 449         pslldq  $8, %xmm15                # %xmm15 = len(A)||0x0000000000000000
 450         pxor    %xmm1, %xmm15             # %xmm15 = len(A)||len(C)
 451         pxor    %xmm15, %xmm8
 452         GHASH_MUL       %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
 453         # final GHASH computation
 454         movdqa SHUF_MASK(%rip), %xmm10
 455         pshufb %xmm10, %xmm8              # byte-swap the hash
 456
 457         movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
 458         ENCRYPT_SINGLE_BLOCK    %xmm0,  %xmm1     # E(K, Y0)
 459         pxor    %xmm8, %xmm0              # %xmm0 = E(K, Y0) xor hash = tag
 460 _return_T_\@:
 461         mov     \AUTHTAG, %r10                     # %r10 = authTag
 462         mov     \AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
             # store the low %r11 bytes of %xmm0: 16 in one go, otherwise in
             # pieces of 8, 4, 2 and 1 bytes
 463         cmp     $16, %r11
 464         je      _T_16_\@
 465         cmp     $8, %r11
 466         jl      _T_4_\@
 467 _T_8_\@:
 468         movq    %xmm0, %rax
 469         mov     %rax, (%r10)
 470         add     $8, %r10
 471         sub     $8, %r11
 472         psrldq  $8, %xmm0
 473         test    %r11, %r11
 474         je      _return_T_done_\@
 475 _T_4_\@:
 476         movd    %xmm0, %eax
 477         mov     %eax, (%r10)
 478         add     $4, %r10
 479         sub     $4, %r11
 480         psrldq  $4, %xmm0
 481         test    %r11, %r11
 482         je      _return_T_done_\@
 483 _T_123_\@:
 484         movd    %xmm0, %eax
 485         cmp     $2, %r11
 486         jl      _T_1_\@
 487         mov     %ax, (%r10)
 488         cmp     $2, %r11
 489         je      _return_T_done_\@
 490         add     $2, %r10
 491         sar     $16, %eax                 # only %al is stored afterwards
 492 _T_1_\@:
 493         mov     %al, (%r10)
 494         jmp     _return_T_done_\@
 495 _T_16_\@:
 496         movdqu  %xmm0, (%r10)
 497 _return_T_done_\@:
 498 .endm
 499
 500 #ifdef __x86_64__
 501 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 502 *
 503 *
 504 * Input: A and B (128-bits each, bit-reflected)
 505 * Output: C = A*B*x mod poly, (i.e. >>1 )
 506 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 507 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 508 *
     * The product is returned in GH (overwritten).  HK is only read.
     * TMP1-TMP5 are XMM scratch registers and are overwritten.
     * a1/a0 and b1/b0 in the comments below are the high/low 64-bit halves
     * of GH and HK.
 509 */
 510 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
 511         movdqa    \GH, \TMP1
 512         pshufd    $78, \GH, \TMP2
 513         pshufd    $78, \HK, \TMP3
 514         pxor      \GH, \TMP2            # TMP2 = a1+a0
 515         pxor      \HK, \TMP3            # TMP3 = b1+b0
 516         pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
 517         pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
 518         pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
 519         pxor      \GH, \TMP2
 520         pxor      \TMP1, \TMP2          # TMP2 = (a0*b1)+(a1*b0)
 521         movdqa    \TMP2, \TMP3
 522         pslldq    $8, \TMP3             # left shift TMP3 2 DWs
 523         psrldq    $8, \TMP2             # right shift TMP2 2 DWs
 524         pxor      \TMP3, \GH
 525         pxor      \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
 526
 527         # first phase of the reduction
 528
 529         movdqa    \GH, \TMP2
 530         movdqa    \GH, \TMP3
 531         movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
 532                                         # in order to perform
 533                                         # independent shifts
 534         pslld     $31, \TMP2            # packed left shift <<31
 535         pslld     $30, \TMP3            # packed left shift <<30
 536         pslld     $25, \TMP4            # packed left shift <<25
 537         pxor      \TMP3, \TMP2          # xor the shifted versions
 538         pxor      \TMP4, \TMP2
 539         movdqa    \TMP2, \TMP5
 540         psrldq    $4, \TMP5             # right shift TMP5 1 DW
 541         pslldq    $12, \TMP2            # left shift TMP2 3 DWs
 542         pxor      \TMP2, \GH
 543
 544         # second phase of the reduction
 545
 546         movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
 547                                         # in order to perform
 548                                         # independent shifts
 549         movdqa    \GH,\TMP3
 550         movdqa    \GH,\TMP4
 551         psrld     $1,\TMP2              # packed right shift >>1
 552         psrld     $2,\TMP3              # packed right shift >>2
 553         psrld     $7,\TMP4              # packed right shift >>7
 554         pxor      \TMP3,\TMP2           # xor the shifted versions
 555         pxor      \TMP4,\TMP2
 556         pxor      \TMP5, \TMP2
 557         pxor      \TMP2, \GH
 558         pxor      \TMP1, \GH            # result is in GH
 559 .endm
 560
 561 # Reads DLEN bytes starting at DPTR and stores in XMMDst
 562 # where 0 < DLEN < 16
 563 # Clobbers %rax, DLEN and XMM1
     # DPTR and DLEN are general-purpose registers; DLEN is counted down to
     # zero.  Bytes past DPTR+DLEN are never read: one 8-byte load is used
     # only when DLEN >= 8, everything else is read one byte at a time.
     # Byte i of the input ends up in byte i of XMMDst; the rest is zero.
 564 .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
 565         cmp $8, \DLEN
 566         jl _read_lt8_\@
 567         mov (\DPTR), %rax               # first 8 bytes with a single load
 568         movq %rax, \XMMDst              # low half = bytes 0-7, high half zeroed
 569         sub $8, \DLEN
 570         jz _done_read_partial_block_\@
 571         xor %eax, %eax
 572 _read_next_byte_\@:
             # collect bytes 8.. into %rax, last byte first, so the first of
             # them ends up in the lowest position
 573         shl $8, %rax
 574         mov 7(\DPTR, \DLEN, 1), %al
 575         dec \DLEN
 576         jnz _read_next_byte_\@
 577         movq %rax, \XMM1
 578         pslldq $8, \XMM1                # move the collected bytes to the high half
 579         por \XMM1, \XMMDst
 580         jmp _done_read_partial_block_\@
 581 _read_lt8_\@:
 582         xor %eax, %eax
 583 _read_next_byte_lt8_\@:
             # fewer than 8 bytes: same byte-wise collection starting at DPTR
 584         shl $8, %rax
 585         mov -1(\DPTR, \DLEN, 1), %al
 586         dec \DLEN
 587         jnz _read_next_byte_lt8_\@
 588         movq %rax, \XMMDst
 589 _done_read_partial_block_\@:
 590 .endm
 591
 592 # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
 593 # clobbers r10-11, xmm14
     # Also overwrites the TMP1-TMP7 registers and, through READ_PARTIAL_BLOCK,
     # %rax.  HASHKEY is an XMM register holding the hash key; AAD and AADLEN
     # are operands holding the AAD address and its length in bytes.
     # The result is stored to AadHash(%arg2).  TMP6 carries the running hash.
 594 .macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
 595         TMP6 TMP7
 596         MOVADQ     SHUF_MASK(%rip), %xmm14
 597         mov        \AAD, %r10           # %r10 = AAD
 598         mov        \AADLEN, %r11                # %r11 = aadLen
 599         pxor       \TMP7, \TMP7
 600         pxor       \TMP6, \TMP6         # running hash starts at zero
 601
 602         cmp        $16, %r11
 603         jl         _get_AAD_rest\@
 604 _get_AAD_blocks\@:
             # one full 16-byte block per iteration
 605         movdqu     (%r10), \TMP7
 606         pshufb     %xmm14, \TMP7 # byte-reflect the AAD data
 607         pxor       \TMP7, \TMP6
 608         GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
 609         add        $16, %r10
 610         sub        $16, %r11
 611         cmp        $16, %r11
 612         jge        _get_AAD_blocks\@
 613
 614         movdqu     \TMP6, \TMP7
 615
 616         /* read the last <16B of AAD */
 617 _get_AAD_rest\@:
 618         test       %r11, %r11
 619         je         _get_AAD_done\@
 620
             # the trailing bytes are read without touching memory past the AAD
 621         READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
 622         pshufb     %xmm14, \TMP7 # byte-reflect the AAD data
 623         pxor       \TMP6, \TMP7
 624         GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
 625         movdqu \TMP7, \TMP6
 626
 627 _get_AAD_done\@:
 628         movdqu \TMP6, AadHash(%arg2)
 629 .endm
 630
 631 # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
 632 # between update calls.
 633 # Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
 634 # Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
 635 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
     # Does nothing when PBlockLen(%arg2) is zero.  Otherwise the new input is
     # combined with the saved PBlockEncKey to continue the pending block:
     # DATA_OFFSET is advanced by the number of bytes written, AAD_HASH and
     # AadHash(%arg2) are updated, and PBlockLen is either reset to zero (block
     # now full) or increased by PLAIN_CYPH_LEN (block still not full).
     # \operation is the literal enc or dec.
 636 .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
 637         AAD_HASH operation
 638         mov     PBlockLen(%arg2), %r13
 639         test    %r13, %r13
 640         je      _partial_block_done_\@  # Leave Macro if no partial blocks
 641         # Read in input data without over reading
 642         cmp     $16, \PLAIN_CYPH_LEN
 643         jl      _fewer_than_16_bytes_\@
 644         movups  (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
 645         jmp     _data_read_\@
 646
 647 _fewer_than_16_bytes_\@:
 648         lea     (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
 649         mov     \PLAIN_CYPH_LEN, %r12
 650         READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
 651
 652         mov PBlockLen(%arg2), %r13
 653
 654 _data_read_\@:                          # Finished reading in data
 655
 656         movdqu  PBlockEncKey(%arg2), %xmm9
 657         movdqu  HashKey(%arg2), %xmm13
 658
 659         lea     SHIFT_MASK(%rip), %r12
 660
 661         # adjust the shuffle mask pointer to be able to shift r13 bytes
 662         # (16-r13 is the number of bytes still missing from the partial block)
 663         add     %r13, %r12
 664         movdqu  (%r12), %xmm2           # get the appropriate shuffle mask
 665         pshufb  %xmm2, %xmm9            # shift right r13 bytes
 666
 667 .ifc \operation, dec
 668         movdqa  %xmm1, %xmm3            # keep the ciphertext for hashing
 669         pxor    %xmm1, %xmm9            # Cyphertext XOR E(K, Yn)
 670
 671         mov     \PLAIN_CYPH_LEN, %r10
 672         add     %r13, %r10
 673         # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
 674         sub     $16, %r10
 675         # Determine if partial block is not being filled and
 676         # shift mask accordingly
 677         jge     _no_extra_mask_1_\@
 678         sub     %r10, %r12              # r10 < 0: move the mask pointer up by the shortfall
 679 _no_extra_mask_1_\@:
 680
 681         movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
 682         # mask with 0xff in the low bytes that carry data, zero above them
 683         pand    %xmm1, %xmm9            # clear the bytes of xmm9 that carry no data
 684
 685         pand    %xmm1, %xmm3
 686         movdqa  SHUF_MASK(%rip), %xmm10
 687         pshufb  %xmm10, %xmm3
 688         pshufb  %xmm2, %xmm3
 689         pxor    %xmm3, \AAD_HASH
 690
 691         test    %r10, %r10
 692         jl      _partial_incomplete_1_\@
 693
 694         # GHASH computation for the last <16 Byte block
 695         GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 696         xor     %eax, %eax
 697
 698         mov     %rax, PBlockLen(%arg2)  # block complete: no partial block pending
 699         jmp     _dec_done_\@
 700 _partial_incomplete_1_\@:
 701         add     \PLAIN_CYPH_LEN, PBlockLen(%arg2)
 702 _dec_done_\@:
 703         movdqu  \AAD_HASH, AadHash(%arg2)
 704 .else
 705         pxor    %xmm1, %xmm9                    # Plaintext XOR E(K, Yn)
 706
 707         mov     \PLAIN_CYPH_LEN, %r10
 708         add     %r13, %r10
 709         # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
 710         sub     $16, %r10
 711         # Determine if partial block is not being filled and
 712         # shift mask accordingly
 713         jge     _no_extra_mask_2_\@
 714         sub     %r10, %r12              # r10 < 0: move the mask pointer up by the shortfall
 715 _no_extra_mask_2_\@:
 716
 717         movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
 718         # mask with 0xff in the low bytes that carry data, zero above them
 719         pand    %xmm1, %xmm9
 720
 721         movdqa  SHUF_MASK(%rip), %xmm1
 722         pshufb  %xmm1, %xmm9
 723         pshufb  %xmm2, %xmm9
 724         pxor    %xmm9, \AAD_HASH
 725
 726         test    %r10, %r10
 727         jl      _partial_incomplete_2_\@
 728
 729         # GHASH computation for the last <16 Byte block
 730         GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 731         xor     %eax, %eax
 732
 733         mov     %rax, PBlockLen(%arg2)  # block complete: no partial block pending
 734         jmp     _encode_done_\@
 735 _partial_incomplete_2_\@:
 736         add     \PLAIN_CYPH_LEN, PBlockLen(%arg2)
 737 _encode_done_\@:
 738         movdqu  \AAD_HASH, AadHash(%arg2)
 739
 740         movdqa  SHUF_MASK(%rip), %xmm10
 741         # shuffle xmm9 back to output as ciphertext
 742         pshufb  %xmm10, %xmm9
 743         pshufb  %xmm2, %xmm9
 744 .endif
 745         # output encrypted Bytes
 746         test    %r10, %r10
 747         jl      _partial_fill_\@
 748         mov     %r13, %r12
 749         mov     $16, %r13
 750         # Set r13 to be the number of bytes to write out
 751         sub     %r12, %r13              # block filled: 16 - PBlockLen bytes
 752         jmp     _count_set_\@
 753 _partial_fill_\@:
 754         mov     \PLAIN_CYPH_LEN, %r13   # block not filled: all input bytes
 755 _count_set_\@:
             # more than 8 bytes: store the low 8 at once, then the remainder
             # one byte at a time
 756         movdqa  %xmm9, %xmm0
 757         movq    %xmm0, %rax
 758         cmp     $8, %r13
 759         jle     _less_than_8_bytes_left_\@
 760
 761         mov     %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
 762         add     $8, \DATA_OFFSET
 763         psrldq  $8, %xmm0
 764         movq    %xmm0, %rax
 765         sub     $8, %r13
 766 _less_than_8_bytes_left_\@:
 767         movb    %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
 768         add     $1, \DATA_OFFSET
 769         shr     $8, %rax
 770         sub     $1, %r13
 771         jne     _less_than_8_bytes_left_\@
 772 _partial_block_done_\@:
 773 .endm # PARTIAL_BLOCK
 774
 775 /*
 776 * if a = number of total plaintext bytes
 777 * b = floor(a/16)
 778 * num_initial_blocks = b mod 4
 779 * encrypt the initial num_initial_blocks blocks and apply ghash on
 780 * the ciphertext
 781 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 782 * are clobbered
 783 * arg1, %arg2, %arg3 are used as a pointer only, not modified
 784 */
 785
 786
 787 .macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 788         XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 789         MOVADQ          SHUF_MASK(%rip), %xmm14
 790
 791         movdqu AadHash(%arg2), %xmm\i               # XMM0 = Y0
 792
 793         # start AES for num_initial_blocks blocks
 794
 795         movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
 796
 797 .if (\i == 5) || (\i == 6) || (\i == 7)
 798
 799         MOVADQ          ONE(%RIP),\TMP1
 800         MOVADQ          0(%arg1),\TMP2
 801 .irpc index, \i_seq
 802         paddd           \TMP1, \XMM0                 # INCR Y0
 803 .ifc \operation, dec
 804         movdqa     \XMM0, %xmm\index
 805 .else
 806         MOVADQ          \XMM0, %xmm\index
 807 .endif
 808         pshufb  %xmm14, %xmm\index      # perform a 16 byte swap
 809         pxor            \TMP2, %xmm\index
 810 .endr
 811         lea     0x10(%arg1),%r10
 812         mov     keysize,%eax
 813         shr     $2,%eax                         # 128->4, 192->6, 256->8
 814         add     $5,%eax                       # 128->9, 192->11, 256->13
 815
 816 aes_loop_initial_\@:
 817         MOVADQ  (%r10),\TMP1
 818 .irpc   index, \i_seq
 819         aesenc  \TMP1, %xmm\index
 820 .endr
 821         add     $16,%r10
 822         sub     $1,%eax
 823         jnz     aes_loop_initial_\@
 824
 825         MOVADQ  (%r10), \TMP1
 826 .irpc index, \i_seq
 827         aesenclast \TMP1, %xmm\index         # Last Round
 828 .endr
 829 .irpc index, \i_seq
 830         movdqu     (%arg4 , %r11, 1), \TMP1
 831         pxor       \TMP1, %xmm\index
 832         movdqu     %xmm\index, (%arg3 , %r11, 1)
 833         # write back plaintext/ciphertext for num_initial_blocks
 834         add        $16, %r11
 835
 836 .ifc \operation, dec
 837         movdqa     \TMP1, %xmm\index
 838 .endif
 839         pshufb     %xmm14, %xmm\index
 840
 841                 # prepare plaintext/ciphertext for GHASH computation
 842 .endr
 843 .endif
 844
 845         # apply GHASH on num_initial_blocks blocks
 846
 847 .if \i == 5
 848         pxor       %xmm5, %xmm6
 849         GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 850         pxor       %xmm6, %xmm7
 851         GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 852         pxor       %xmm7, %xmm8
 853         GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 854 .elseif \i == 6
 855         pxor       %xmm6, %xmm7
 856         GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 857         pxor       %xmm7, %xmm8
 858         GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 859 .elseif \i == 7
 860         pxor       %xmm7, %xmm8
 861         GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 862 .endif
 863         cmp        $64, %r13
 864         jl      _initial_blocks_done\@
 865         # no need for precomputed values
 866 /*
 867 *
 868 * Precomputations for HashKey parallel with encryption of first 4 blocks.
 869 * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
 870 */
 871         MOVADQ     ONE(%RIP),\TMP1
 872         paddd      \TMP1, \XMM0              # INCR Y0
 873         MOVADQ     \XMM0, \XMM1
 874         pshufb  %xmm14, \XMM1        # perform a 16 byte swap
 875
 876         paddd      \TMP1, \XMM0              # INCR Y0
 877         MOVADQ     \XMM0, \XMM2
 878         pshufb  %xmm14, \XMM2        # perform a 16 byte swap
 879
 880         paddd      \TMP1, \XMM0              # INCR Y0
 881         MOVADQ     \XMM0, \XMM3
 882         pshufb %xmm14, \XMM3        # perform a 16 byte swap
 883
 884         paddd      \TMP1, \XMM0              # INCR Y0
 885         MOVADQ     \XMM0, \XMM4
 886         pshufb %xmm14, \XMM4        # perform a 16 byte swap
 887
 888         MOVADQ     0(%arg1),\TMP1
 889         pxor       \TMP1, \XMM1
 890         pxor       \TMP1, \XMM2
 891         pxor       \TMP1, \XMM3
 892         pxor       \TMP1, \XMM4
 893 .irpc index, 1234 # do 4 rounds
 894         movaps 0x10*\index(%arg1), \TMP1
 895         aesenc     \TMP1, \XMM1
 896         aesenc     \TMP1, \XMM2
 897         aesenc     \TMP1, \XMM3
 898         aesenc     \TMP1, \XMM4
 899 .endr
 900 .irpc index, 56789 # do next 5 rounds
 901         movaps 0x10*\index(%arg1), \TMP1
 902         aesenc     \TMP1, \XMM1
 903         aesenc     \TMP1, \XMM2
 904         aesenc     \TMP1, \XMM3
 905         aesenc     \TMP1, \XMM4
 906 .endr
 907         lea        0xa0(%arg1),%r10
 908         mov        keysize,%eax
 909         shr        $2,%eax                      # 128->4, 192->6, 256->8
 910         sub        $4,%eax                      # 128->0, 192->2, 256->4
 911         jz         aes_loop_pre_done\@
 912
 913 aes_loop_pre_\@:
 914         MOVADQ     (%r10),\TMP2
 915 .irpc   index, 1234
 916         aesenc     \TMP2, %xmm\index
 917 .endr
 918         add        $16,%r10
 919         sub        $1,%eax
 920         jnz        aes_loop_pre_\@
 921
 922 aes_loop_pre_done\@:
 923         MOVADQ     (%r10), \TMP2
 924         aesenclast \TMP2, \XMM1
 925         aesenclast \TMP2, \XMM2
 926         aesenclast \TMP2, \XMM3
 927         aesenclast \TMP2, \XMM4
 928         movdqu     16*0(%arg4 , %r11 , 1), \TMP1
 929         pxor       \TMP1, \XMM1
 930 .ifc \operation, dec
 931         movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 932         movdqa     \TMP1, \XMM1
 933 .endif
 934         movdqu     16*1(%arg4 , %r11 , 1), \TMP1
 935         pxor       \TMP1, \XMM2
 936 .ifc \operation, dec
 937         movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 938         movdqa     \TMP1, \XMM2
 939 .endif
 940         movdqu     16*2(%arg4 , %r11 , 1), \TMP1
 941         pxor       \TMP1, \XMM3
 942 .ifc \operation, dec
 943         movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 944         movdqa     \TMP1, \XMM3
 945 .endif
 946         movdqu     16*3(%arg4 , %r11 , 1), \TMP1
 947         pxor       \TMP1, \XMM4
 948 .ifc \operation, dec
 949         movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 950         movdqa     \TMP1, \XMM4
 951 .else
 952         movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 953         movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 954         movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 955         movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 956 .endif
 957
 958         add        $64, %r11
 959         pshufb %xmm14, \XMM1 # perform a 16 byte swap
 960         pxor       \XMMDst, \XMM1
 961 # combine GHASHed value with the corresponding ciphertext
 962         pshufb %xmm14, \XMM2 # perform a 16 byte swap
 963         pshufb %xmm14, \XMM3 # perform a 16 byte swap
 964         pshufb %xmm14, \XMM4 # perform a 16 byte swap
 965
 966 _initial_blocks_done\@:
 967
 968 .endm
 969
 970 /*
 971 * encrypt 4 blocks at a time
 972 * ghash the 4 previously encrypted ciphertext blocks
 973 * %arg1, %arg3, %arg4 are used as pointers only, not modified
 974 * %r11 is the data offset value (read only, it is not advanced here)
     *
     * In:   XMM0       counter block; incremented four times with ONE
     *       XMM1-XMM4  the four blocks to GHASH (copied to XMM5-XMM8 first)
     *       %arg1      AES round keys, read from offset 0x00 upwards
     *       %arg2      context supplying HashKey .. HashKey_4 and
     *                  HashKey_k .. HashKey_4_k
     *       %arg4      input buffer, %arg3 output buffer, both indexed by %r11
     * Out:  XMM1-XMM4  the four new ciphertext blocks, byte swapped with
     *                  SHUF_MASK; the reduced GHASH value is XORed into XMM1
     *       XMM5       the reduced GHASH value
     * Clobbers: TMP1-TMP6, XMM6-XMM8, %xmm15, %eax, %r10, flags
     *
     * The AES rounds are interleaved with the four Karatsuba multiplications.
     * Rounds past round 9 (192/256-bit keys) run in a loop that names
     * %xmm1-%xmm4 directly (.irpc index, 1234), so XMM1-XMM4 have to be bound
     * to %xmm1-%xmm4.  The "operation" parameter is not referenced in the body.
 975 */
 976 .macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
 977 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
 978
 979         movdqa    \XMM1, \XMM5                  # keep the blocks to hash in XMM5-XMM8
 980         movdqa    \XMM2, \XMM6
 981         movdqa    \XMM3, \XMM7
 982         movdqa    \XMM4, \XMM8
 983
 984         movdqa    SHUF_MASK(%rip), %xmm15       # xmm15 = mask for the 16 byte swap
 985         # multiply XMM5 * HashKey_4 using karatsuba
 986
 987         movdqa    \XMM5, \TMP4
 988         pshufd    $78, \XMM5, \TMP6             # TMP6 = XMM5 with its 64-bit halves swapped
 989         pxor      \XMM5, \TMP6                  # TMP6 = a1+a0
 990         paddd     ONE(%rip), \XMM0              # INCR CNT
 991         movdqu    HashKey_4(%arg2), \TMP5
 992         pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
 993         movdqa    \XMM0, \XMM1
 994         paddd     ONE(%rip), \XMM0              # INCR CNT
 995         movdqa    \XMM0, \XMM2
 996         paddd     ONE(%rip), \XMM0              # INCR CNT
 997         movdqa    \XMM0, \XMM3
 998         paddd     ONE(%rip), \XMM0              # INCR CNT
 999         movdqa    \XMM0, \XMM4
1000         pshufb %xmm15, \XMM1    # perform a 16 byte swap
1001         pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1002         pshufb %xmm15, \XMM2    # perform a 16 byte swap
1003         pshufb %xmm15, \XMM3    # perform a 16 byte swap
1004         pshufb %xmm15, \XMM4    # perform a 16 byte swap
1005
1006         pxor      (%arg1), \XMM1                # xor in the first round key
1007         pxor      (%arg1), \XMM2
1008         pxor      (%arg1), \XMM3
1009         pxor      (%arg1), \XMM4
1010         movdqu    HashKey_4_k(%arg2), \TMP5
1011         pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1012         movaps 0x10(%arg1), \TMP1
1013         aesenc    \TMP1, \XMM1              # Round 1
1014         aesenc    \TMP1, \XMM2
1015         aesenc    \TMP1, \XMM3
1016         aesenc    \TMP1, \XMM4
1017         movaps 0x20(%arg1), \TMP1
1018         aesenc    \TMP1, \XMM1              # Round 2
1019         aesenc    \TMP1, \XMM2
1020         aesenc    \TMP1, \XMM3
1021         aesenc    \TMP1, \XMM4
1022         movdqa    \XMM6, \TMP1              # start of XMM6 * HashKey_3
1023         pshufd    $78, \XMM6, \TMP2
1024         pxor      \XMM6, \TMP2
1025         movdqu    HashKey_3(%arg2), \TMP5
1026         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1027         movaps 0x30(%arg1), \TMP3
1028         aesenc    \TMP3, \XMM1              # Round 3
1029         aesenc    \TMP3, \XMM2
1030         aesenc    \TMP3, \XMM3
1031         aesenc    \TMP3, \XMM4
1032         pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1033         movaps 0x40(%arg1), \TMP3
1034         aesenc    \TMP3, \XMM1              # Round 4
1035         aesenc    \TMP3, \XMM2
1036         aesenc    \TMP3, \XMM3
1037         aesenc    \TMP3, \XMM4
1038         movdqu    HashKey_3_k(%arg2), \TMP5
1039         pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1040         movaps 0x50(%arg1), \TMP3
1041         aesenc    \TMP3, \XMM1              # Round 5
1042         aesenc    \TMP3, \XMM2
1043         aesenc    \TMP3, \XMM3
1044         aesenc    \TMP3, \XMM4
1045         pxor      \TMP1, \TMP4
1046 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1047         pxor      \XMM6, \XMM5
1048         pxor      \TMP2, \TMP6
1049         movdqa    \XMM7, \TMP1
1050         pshufd    $78, \XMM7, \TMP2
1051         pxor      \XMM7, \TMP2
1052         movdqu    HashKey_2(%arg2), \TMP5
1053
1054         # Multiply XMM7 * HashKey_2 using karatsuba
1055
1056         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1057         movaps 0x60(%arg1), \TMP3
1058         aesenc    \TMP3, \XMM1              # Round 6
1059         aesenc    \TMP3, \XMM2
1060         aesenc    \TMP3, \XMM3
1061         aesenc    \TMP3, \XMM4
1062         pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1063         movaps 0x70(%arg1), \TMP3
1064         aesenc    \TMP3, \XMM1              # Round 7
1065         aesenc    \TMP3, \XMM2
1066         aesenc    \TMP3, \XMM3
1067         aesenc    \TMP3, \XMM4
1068         movdqu    HashKey_2_k(%arg2), \TMP5
1069         pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1070         movaps 0x80(%arg1), \TMP3
1071         aesenc    \TMP3, \XMM1              # Round 8
1072         aesenc    \TMP3, \XMM2
1073         aesenc    \TMP3, \XMM3
1074         aesenc    \TMP3, \XMM4
1075         pxor      \TMP1, \TMP4
1076 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1077         pxor      \XMM7, \XMM5
1078         pxor      \TMP2, \TMP6
1079
1080         # Multiply XMM8 * HashKey
1081         # XMM8 and TMP5 hold the values for the two operands
1082
1083         movdqa    \XMM8, \TMP1
1084         pshufd    $78, \XMM8, \TMP2
1085         pxor      \XMM8, \TMP2
1086         movdqu    HashKey(%arg2), \TMP5
1087         pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1088         movaps 0x90(%arg1), \TMP3
1089         aesenc    \TMP3, \XMM1             # Round 9
1090         aesenc    \TMP3, \XMM2
1091         aesenc    \TMP3, \XMM3
1092         aesenc    \TMP3, \XMM4
1093         pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1094         lea       0xa0(%arg1),%r10              # r10 = address of the round key after round 9
1095         mov       keysize,%eax
1096         shr       $2,%eax                       # 128->4, 192->6, 256->8
1097         sub       $4,%eax                       # 128->0, 192->2, 256->4
1098         jz        aes_loop_par_enc_done\@       # no extra rounds for 128-bit keys
1099
1100 aes_loop_par_enc\@:
1101         MOVADQ    (%r10),\TMP3
1102 .irpc   index, 1234
1103         aesenc    \TMP3, %xmm\index             # xmm1-xmm4 are named directly here
1104 .endr
1105         add       $16,%r10
1106         sub       $1,%eax
1107         jnz       aes_loop_par_enc\@
1108
1109 aes_loop_par_enc_done\@:
1110         MOVADQ    (%r10), \TMP3
1111         aesenclast \TMP3, \XMM1           # last round (round 10 for 128-bit keys)
1112         aesenclast \TMP3, \XMM2
1113         aesenclast \TMP3, \XMM3
1114         aesenclast \TMP3, \XMM4
1115         movdqu    HashKey_k(%arg2), \TMP5
1116         pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1117         movdqu    (%arg4,%r11,1), \TMP3
1118         pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1119         movdqu    16(%arg4,%r11,1), \TMP3
1120         pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1121         movdqu    32(%arg4,%r11,1), \TMP3
1122         pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1123         movdqu    48(%arg4,%r11,1), \TMP3
1124         pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1125         movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
1126         movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
1127         movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
1128         movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
1129         pshufb %xmm15, \XMM1        # perform a 16 byte swap
1130         pshufb %xmm15, \XMM2    # perform a 16 byte swap
1131         pshufb %xmm15, \XMM3    # perform a 16 byte swap
1132         pshufb %xmm15, \XMM4    # perform a 16 byte swap
1133
1134         pxor      \TMP4, \TMP1                 # TMP1 = sum of the four a1*b1 products
1135         pxor      \XMM8, \XMM5                 # XMM5 = sum of the four a0*b0 products
1136         pxor      \TMP6, \TMP2                 # TMP2 = sum of the four (a1+a0)*(b1+b0) products
1137         pxor      \TMP1, \TMP2
1138         pxor      \XMM5, \TMP2                 # TMP2 = karatsuba middle term
1139         movdqa    \TMP2, \TMP3
1140         pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
1141         psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
1142         pxor      \TMP3, \XMM5
1143         pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
1144
1145         # first phase of reduction
1146
1147         movdqa    \XMM5, \TMP2
1148         movdqa    \XMM5, \TMP3
1149         movdqa    \XMM5, \TMP4
1150 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1151         pslld     $31, \TMP2                   # packed left shift << 31
1152         pslld     $30, \TMP3                   # packed left shift << 30
1153         pslld     $25, \TMP4                   # packed left shift << 25
1154         pxor      \TMP3, \TMP2                 # xor the shifted versions
1155         pxor      \TMP4, \TMP2
1156         movdqa    \TMP2, \TMP5
1157         psrldq    $4, \TMP5                    # right shift T5 1 DW
1158         pslldq    $12, \TMP2                   # left shift T2 3 DWs
1159         pxor      \TMP2, \XMM5
1160
1161         # second phase of reduction
1162
1163         movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1164         movdqa    \XMM5,\TMP3
1165         movdqa    \XMM5,\TMP4
1166         psrld     $1, \TMP2                    # packed right shift >> 1
1167         psrld     $2, \TMP3                    # packed right shift >> 2
1168         psrld     $7, \TMP4                    # packed right shift >> 7
1169         pxor      \TMP3,\TMP2                  # xor the shifted versions
1170         pxor      \TMP4,\TMP2
1171         pxor      \TMP5, \TMP2
1172         pxor      \TMP2, \XMM5
1173         pxor      \TMP1, \XMM5                 # reduced GHASH result is in XMM5
1174
1175         pxor      \XMM5, \XMM1                 # fold the hash into the first new block
1176 .endm
1177
1178 /*
1179 * decrypt 4 blocks at a time
1180 * ghash the 4 ciphertext blocks consumed by the previous step
1181 * %arg1, %arg3, %arg4 are used as pointers only, not modified
1182 * %r11 is the data offset value (read only, it is not advanced here)
     *
     * In:   XMM0       counter block; incremented four times with ONE
     *       XMM1-XMM4  the four blocks to GHASH (copied to XMM5-XMM8 first)
     *       %arg1      AES round keys, read from offset 0x00 upwards
     *       %arg2      context supplying HashKey .. HashKey_4 and
     *                  HashKey_k .. HashKey_4_k
     *       %arg4      input buffer, %arg3 output buffer, both indexed by %r11
     * Out:  XMM1-XMM4  the four input (ciphertext) blocks just read, byte
     *                  swapped with SHUF_MASK; the reduced GHASH value is
     *                  XORed into XMM1.  The decrypted blocks are only
     *                  written to the output buffer.
     *       XMM5       the reduced GHASH value
     * Clobbers: TMP1-TMP6, XMM6-XMM8, %xmm15, %eax, %r10, flags
     *
     * The AES rounds are interleaved with the four Karatsuba multiplications.
     * Rounds past round 9 (192/256-bit keys) run in a loop that names
     * %xmm1-%xmm4 directly (.irpc index, 1234), so XMM1-XMM4 have to be bound
     * to %xmm1-%xmm4.  The "operation" parameter is not referenced in the body.
1183 */
1184 .macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
1185 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1186
1187         movdqa    \XMM1, \XMM5                  # keep the blocks to hash in XMM5-XMM8
1188         movdqa    \XMM2, \XMM6
1189         movdqa    \XMM3, \XMM7
1190         movdqa    \XMM4, \XMM8
1191
1192         movdqa    SHUF_MASK(%rip), %xmm15       # xmm15 = mask for the 16 byte swap
1193         # multiply XMM5 * HashKey_4 using karatsuba
1194
1195         movdqa    \XMM5, \TMP4
1196         pshufd    $78, \XMM5, \TMP6             # TMP6 = XMM5 with its 64-bit halves swapped
1197         pxor      \XMM5, \TMP6                  # TMP6 = a1+a0
1198         paddd     ONE(%rip), \XMM0              # INCR CNT
1199         movdqu    HashKey_4(%arg2), \TMP5
1200         pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1201         movdqa    \XMM0, \XMM1
1202         paddd     ONE(%rip), \XMM0              # INCR CNT
1203         movdqa    \XMM0, \XMM2
1204         paddd     ONE(%rip), \XMM0              # INCR CNT
1205         movdqa    \XMM0, \XMM3
1206         paddd     ONE(%rip), \XMM0              # INCR CNT
1207         movdqa    \XMM0, \XMM4
1208         pshufb %xmm15, \XMM1    # perform a 16 byte swap
1209         pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1210         pshufb %xmm15, \XMM2    # perform a 16 byte swap
1211         pshufb %xmm15, \XMM3    # perform a 16 byte swap
1212         pshufb %xmm15, \XMM4    # perform a 16 byte swap
1213
1214         pxor      (%arg1), \XMM1                # xor in the first round key
1215         pxor      (%arg1), \XMM2
1216         pxor      (%arg1), \XMM3
1217         pxor      (%arg1), \XMM4
1218         movdqu    HashKey_4_k(%arg2), \TMP5
1219         pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1220         movaps 0x10(%arg1), \TMP1
1221         aesenc    \TMP1, \XMM1              # Round 1
1222         aesenc    \TMP1, \XMM2
1223         aesenc    \TMP1, \XMM3
1224         aesenc    \TMP1, \XMM4
1225         movaps 0x20(%arg1), \TMP1
1226         aesenc    \TMP1, \XMM1              # Round 2
1227         aesenc    \TMP1, \XMM2
1228         aesenc    \TMP1, \XMM3
1229         aesenc    \TMP1, \XMM4
1230         movdqa    \XMM6, \TMP1              # start of XMM6 * HashKey_3
1231         pshufd    $78, \XMM6, \TMP2
1232         pxor      \XMM6, \TMP2
1233         movdqu    HashKey_3(%arg2), \TMP5
1234         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1235         movaps 0x30(%arg1), \TMP3
1236         aesenc    \TMP3, \XMM1              # Round 3
1237         aesenc    \TMP3, \XMM2
1238         aesenc    \TMP3, \XMM3
1239         aesenc    \TMP3, \XMM4
1240         pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1241         movaps 0x40(%arg1), \TMP3
1242         aesenc    \TMP3, \XMM1              # Round 4
1243         aesenc    \TMP3, \XMM2
1244         aesenc    \TMP3, \XMM3
1245         aesenc    \TMP3, \XMM4
1246         movdqu    HashKey_3_k(%arg2), \TMP5
1247         pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1248         movaps 0x50(%arg1), \TMP3
1249         aesenc    \TMP3, \XMM1              # Round 5
1250         aesenc    \TMP3, \XMM2
1251         aesenc    \TMP3, \XMM3
1252         aesenc    \TMP3, \XMM4
1253         pxor      \TMP1, \TMP4
1254 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1255         pxor      \XMM6, \XMM5
1256         pxor      \TMP2, \TMP6
1257         movdqa    \XMM7, \TMP1
1258         pshufd    $78, \XMM7, \TMP2
1259         pxor      \XMM7, \TMP2
1260         movdqu    HashKey_2(%arg2), \TMP5
1261
1262         # Multiply XMM7 * HashKey_2 using karatsuba
1263
1264         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1265         movaps 0x60(%arg1), \TMP3
1266         aesenc    \TMP3, \XMM1              # Round 6
1267         aesenc    \TMP3, \XMM2
1268         aesenc    \TMP3, \XMM3
1269         aesenc    \TMP3, \XMM4
1270         pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1271         movaps 0x70(%arg1), \TMP3
1272         aesenc    \TMP3, \XMM1              # Round 7
1273         aesenc    \TMP3, \XMM2
1274         aesenc    \TMP3, \XMM3
1275         aesenc    \TMP3, \XMM4
1276         movdqu    HashKey_2_k(%arg2), \TMP5
1277         pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1278         movaps 0x80(%arg1), \TMP3
1279         aesenc    \TMP3, \XMM1              # Round 8
1280         aesenc    \TMP3, \XMM2
1281         aesenc    \TMP3, \XMM3
1282         aesenc    \TMP3, \XMM4
1283         pxor      \TMP1, \TMP4
1284 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1285         pxor      \XMM7, \XMM5
1286         pxor      \TMP2, \TMP6
1287
1288         # Multiply XMM8 * HashKey
1289         # XMM8 and TMP5 hold the values for the two operands
1290
1291         movdqa    \XMM8, \TMP1
1292         pshufd    $78, \XMM8, \TMP2
1293         pxor      \XMM8, \TMP2
1294         movdqu    HashKey(%arg2), \TMP5
1295         pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1296         movaps 0x90(%arg1), \TMP3
1297         aesenc    \TMP3, \XMM1             # Round 9
1298         aesenc    \TMP3, \XMM2
1299         aesenc    \TMP3, \XMM3
1300         aesenc    \TMP3, \XMM4
1301         pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1302         lea       0xa0(%arg1),%r10              # r10 = address of the round key after round 9
1303         mov       keysize,%eax
1304         shr       $2,%eax                       # 128->4, 192->6, 256->8
1305         sub       $4,%eax                       # 128->0, 192->2, 256->4
1306         jz        aes_loop_par_dec_done\@       # no extra rounds for 128-bit keys
1307
1308 aes_loop_par_dec\@:
1309         MOVADQ    (%r10),\TMP3
1310 .irpc   index, 1234
1311         aesenc    \TMP3, %xmm\index             # xmm1-xmm4 are named directly here
1312 .endr
1313         add       $16,%r10
1314         sub       $1,%eax
1315         jnz       aes_loop_par_dec\@
1316
1317 aes_loop_par_dec_done\@:
1318         MOVADQ    (%r10), \TMP3
1319         aesenclast \TMP3, \XMM1           # last round
1320         aesenclast \TMP3, \XMM2
1321         aesenclast \TMP3, \XMM3
1322         aesenclast \TMP3, \XMM4
1323         movdqu    HashKey_k(%arg2), \TMP5
1324         pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1325         movdqu    (%arg4,%r11,1), \TMP3
1326         pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1327         movdqu    \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
1328         movdqa    \TMP3, \XMM1                 # keep the input block for the next GHASH
1329         movdqu    16(%arg4,%r11,1), \TMP3
1330         pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1331         movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
1332         movdqa    \TMP3, \XMM2
1333         movdqu    32(%arg4,%r11,1), \TMP3
1334         pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1335         movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
1336         movdqa    \TMP3, \XMM3
1337         movdqu    48(%arg4,%r11,1), \TMP3
1338         pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1339         movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
1340         movdqa    \TMP3, \XMM4
1341         pshufb %xmm15, \XMM1        # perform a 16 byte swap
1342         pshufb %xmm15, \XMM2    # perform a 16 byte swap
1343         pshufb %xmm15, \XMM3    # perform a 16 byte swap
1344         pshufb %xmm15, \XMM4    # perform a 16 byte swap
1345
1346         pxor      \TMP4, \TMP1                 # TMP1 = sum of the four a1*b1 products
1347         pxor      \XMM8, \XMM5                 # XMM5 = sum of the four a0*b0 products
1348         pxor      \TMP6, \TMP2                 # TMP2 = sum of the four (a1+a0)*(b1+b0) products
1349         pxor      \TMP1, \TMP2
1350         pxor      \XMM5, \TMP2                 # TMP2 = karatsuba middle term
1351         movdqa    \TMP2, \TMP3
1352         pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
1353         psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
1354         pxor      \TMP3, \XMM5
1355         pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
1356
1357         # first phase of reduction
1358
1359         movdqa    \XMM5, \TMP2
1360         movdqa    \XMM5, \TMP3
1361         movdqa    \XMM5, \TMP4
1362 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1363         pslld     $31, \TMP2                   # packed left shift << 31
1364         pslld     $30, \TMP3                   # packed left shift << 30
1365         pslld     $25, \TMP4                   # packed left shift << 25
1366         pxor      \TMP3, \TMP2                 # xor the shifted versions
1367         pxor      \TMP4, \TMP2
1368         movdqa    \TMP2, \TMP5
1369         psrldq    $4, \TMP5                    # right shift T5 1 DW
1370         pslldq    $12, \TMP2                   # left shift T2 3 DWs
1371         pxor      \TMP2, \XMM5
1372
1373         # second phase of reduction
1374
1375         movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1376         movdqa    \XMM5,\TMP3
1377         movdqa    \XMM5,\TMP4
1378         psrld     $1, \TMP2                    # packed right shift >> 1
1379         psrld     $2, \TMP3                    # packed right shift >> 2
1380         psrld     $7, \TMP4                    # packed right shift >> 7
1381         pxor      \TMP3,\TMP2                  # xor the shifted versions
1382         pxor      \TMP4,\TMP2
1383         pxor      \TMP5, \TMP2
1384         pxor      \TMP2, \XMM5
1385         pxor      \TMP1, \XMM5                 # reduced GHASH result is in XMM5
1386
1387         pxor      \XMM5, \XMM1                 # fold the hash into the first saved block
1388 .endm
1389
1390 /* GHASH the last 4 ciphertext blocks.
     *
     * In:   XMM1-XMM4  the four blocks; XMM1 is multiplied by HashKey_4,
     *                  XMM2 by HashKey_3, XMM3 by HashKey_2, XMM4 by HashKey
     *       %arg2      context supplying the HashKey and HashKey_k values
     * Out:  XMMDst     reduced sum of the four products; its value on entry
     *                  is not used (it is overwritten before it is read)
     * Clobbers: TMP1-TMP7, XMM1-XMM4
     */
1391 .macro  GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1392 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1393
1394         # Multiply XMM1 * HashKey_4 (using Karatsuba)
1395
1396         movdqa    \XMM1, \TMP6
1397         pshufd    $78, \XMM1, \TMP2         # TMP2 = XMM1 with its 64-bit halves swapped
1398         pxor      \XMM1, \TMP2              # TMP2 = a1+a0
1399         movdqu    HashKey_4(%arg2), \TMP5
1400         pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1401         pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1402         movdqu    HashKey_4_k(%arg2), \TMP4
1403         pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1404         movdqa    \XMM1, \XMMDst
1405         movdqa    \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1406
1407         # Multiply XMM2 * HashKey_3 (using Karatsuba)
1408
1409         movdqa    \XMM2, \TMP1
1410         pshufd    $78, \XMM2, \TMP2
1411         pxor      \XMM2, \TMP2
1412         movdqu    HashKey_3(%arg2), \TMP5
1413         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1414         pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1415         movdqu    HashKey_3_k(%arg2), \TMP4
1416         pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1417         pxor      \TMP1, \TMP6
1418         pxor      \XMM2, \XMMDst
1419         pxor      \TMP2, \XMM1
1420 # results accumulated in TMP6, XMMDst, XMM1
1421
1422         # Multiply XMM3 * HashKey_2 (using Karatsuba)
1423
1424         movdqa    \XMM3, \TMP1
1425         pshufd    $78, \XMM3, \TMP2
1426         pxor      \XMM3, \TMP2
1427         movdqu    HashKey_2(%arg2), \TMP5
1428         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1429         pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1430         movdqu    HashKey_2_k(%arg2), \TMP4
1431         pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1432         pxor      \TMP1, \TMP6
1433         pxor      \XMM3, \XMMDst
1434         pxor      \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1435
1436         # Multiply XMM4 * HashKey (using Karatsuba)
1437         movdqa    \XMM4, \TMP1
1438         pshufd    $78, \XMM4, \TMP2
1439         pxor      \XMM4, \TMP2
1440         movdqu    HashKey(%arg2), \TMP5
1441         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1442         pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1443         movdqu    HashKey_k(%arg2), \TMP4
1444         pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1445         pxor      \TMP1, \TMP6
1446         pxor      \XMM4, \XMMDst
1447         pxor      \XMM1, \TMP2              # TMP2 = sum of the four middle products
1448         pxor      \TMP6, \TMP2
1449         pxor      \XMMDst, \TMP2
1450         # middle section of the temp results combined as in karatsuba algorithm
1451         movdqa    \TMP2, \TMP4
1452         pslldq    $8, \TMP4                 # left shift TMP4 2 DWs
1453         psrldq    $8, \TMP2                 # right shift TMP2 2 DWs
1454         pxor      \TMP4, \XMMDst
1455         pxor      \TMP2, \TMP6
1456 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1457         # first phase of the reduction
1458         movdqa    \XMMDst, \TMP2
1459         movdqa    \XMMDst, \TMP3
1460         movdqa    \XMMDst, \TMP4
1461 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1462         pslld     $31, \TMP2                # packed left shift << 31
1463         pslld     $30, \TMP3                # packed left shift << 30
1464         pslld     $25, \TMP4                # packed left shift << 25
1465         pxor      \TMP3, \TMP2              # xor the shifted versions
1466         pxor      \TMP4, \TMP2
1467         movdqa    \TMP2, \TMP7
1468         psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1469         pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1470         pxor      \TMP2, \XMMDst
1471
1472         # second phase of the reduction
1473         movdqa    \XMMDst, \TMP2
1474         # make 3 copies of XMMDst for doing 3 shift operations
1475         movdqa    \XMMDst, \TMP3
1476         movdqa    \XMMDst, \TMP4
1477         psrld     $1, \TMP2                 # packed right shift >> 1
1478         psrld     $2, \TMP3                 # packed right shift >> 2
1479         psrld     $7, \TMP4                 # packed right shift >> 7
1480         pxor      \TMP3, \TMP2              # xor the shifted versions
1481         pxor      \TMP4, \TMP2
1482         pxor      \TMP7, \TMP2
1483         pxor      \TMP2, \XMMDst
1484         pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1485 .endm
1486
1487
1488 /*
1489  * ENCRYPT_SINGLE_BLOCK - AES-encrypt one block in place.
1490  * XMM0 is the block, TMP1 a scratch xmm register; the round keys are read
1491  * from %arg1 onwards.  Uses eax & r10.
1492  */
1493 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1494         lea             16(%arg1), %r10         # r10 -> second round key
1495         mov             keysize,%eax
1496         shr             $2,%eax                 # 128->4, 192->6, 256->8
1497         add             $5,%eax                 # aesenc count: 128->9, 192->11, 256->13
1498         pxor            (%arg1), \XMM0          # whitening with the first round key
1499
1500 _esb_loop_\@:
1501         MOVADQ          (%r10),\TMP1            # fetch the current round key ...
1502         add             $16,%r10                # ... and step r10 past it
1503         aesenc          \TMP1,\XMM0
1504         sub             $1,%eax
1505         jnz             _esb_loop_\@
1506
1507         MOVADQ          (%r10),\TMP1            # r10 now addresses the final round key
1508         aesenclast      \TMP1,\XMM0
1509 .endm
1510 /*****************************************************************************
1511 * void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1512 *                   struct gcm_context_data *data
1513 *                                      // Context data
1514 *                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
1515 *                   const u8 *in,      // Ciphertext input
1516 *                   u64 plaintext_len, // Length of data in bytes for decryption.
1517 *                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1518 *                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1519 *                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1520 *                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1521 *                   const u8 *aad,     // Additional Authentication Data (AAD)
1522 *                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1523 *                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1524 *                                      // given authentication tag and only return the plaintext if they match.
1525 *                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1526 *                                      // (most likely), 12 or 8.
1527 *
1528 * Assumptions:
1529 *
1530 * keys:
1531 *       keys are pre-expanded and aligned to 16 bytes. we are using the first
1532 *       set of 11 keys in the data structure void *aes_ctx
1533 *
1534 * iv:
1535 *       0                   1                   2                   3
1536 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1537 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1538 *       |                             Salt  (From the SA)               |
1539 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1540 *       |                     Initialization Vector                     |
1541 *       |         (This is the sequence number from IPSec header)       |
1542 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1543 *       |                              0x1                              |
1544 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1545 *
1546 *
1547 *
1548 * AAD:
1549 *       AAD padded to 128 bits with 0
1550 *       for example, assume AAD is a u32 vector
1551 *
1552 *       if AAD is 8 bytes:
1553 *       AAD[3] = {A0, A1};
1554 *       padded AAD in xmm register = {A1 A0 0 0}
1555 *
1556 *       0                   1                   2                   3
1557 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1558 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1559 *       |                               SPI (A1)                        |
1560 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1561 *       |                     32-bit Sequence Number (A0)               |
1562 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1563 *       |                              0x0                              |
1564 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1565 *
1566 *                                       AAD Format with 32-bit Sequence Number
1567 *
1568 *       if AAD is 12 bytes:
1569 *       AAD[3] = {A0, A1, A2};
1570 *       padded AAD in xmm register = {A2 A1 A0 0}
1571 *
1572 *       0                   1                   2                   3
1573 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1574 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1575 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1576 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1577 *       |                               SPI (A2)                        |
1578 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1579 *       |                 64-bit Extended Sequence Number {A1,A0}       |
1580 *       |                                                               |
1581 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1582 *       |                              0x0                              |
1583 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1584 *
1585 *                        AAD Format with 64-bit Extended Sequence Number
1586 *
1587 * poly = x^128 + x^127 + x^126 + x^121 + 1
1588 *
1589 *****************************************************************************/
1590 SYM_FUNC_START(aesni_gcm_dec)       /* one-shot GCM decrypt: GCM_INIT, GCM_ENC_DEC dec, GCM_COMPLETE in a single call */
1591         FUNC_SAVE
1592
1593         GCM_INIT %arg6, arg7, arg8, arg9        /* NOTE(review): presumably iv, hash_subkey, aad, aad_len as in aesni_gcm_enc - confirm against the full prototype */
1594         GCM_ENC_DEC dec         /* data pass, "dec" variant of the shared macro */
1595         GCM_COMPLETE arg10, arg11       /* auth_tag, auth_tag_len (the last two prototype arguments) */
1596         FUNC_RESTORE
1597         ret
1598 SYM_FUNC_END(aesni_gcm_dec)
1599
1600
1601 /*****************************************************************************
1602 * void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1603 *                    struct gcm_context_data *data,
1604 *                                        // Context data
1605 *                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1606 *                    const u8 *in,       // Plaintext input
1607 *                    u64 plaintext_len,  // Length of data in bytes for encryption.
1608 *                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1609 *                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1610 *                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1611 *                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1612 *                    const u8 *aad,      // Additional Authentication Data (AAD)
1613 *                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1614 *                    u8 *auth_tag,       // Authenticated Tag output.
1615 *                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1616 *                                        // 12 or 8.
1617 *
1618 * Assumptions:
1619 *
1620 * keys:
1621 *       keys are pre-expanded and aligned to 16 bytes. we are using the
1622 *       first set of 11 keys in the data structure void *aes_ctx
1623 *
1624 *
1625 * iv:
1626 *       0                   1                   2                   3
1627 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1628 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1629 *       |                             Salt  (From the SA)               |
1630 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1631 *       |                     Initialization Vector                     |
1632 *       |         (This is the sequence number from IPSec header)       |
1633 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1634 *       |                              0x1                              |
1635 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1636 *
1637 *
1638 *
1639 * AAD:
1640 *       AAD padded to 128 bits with 0
1641 *       for example, assume AAD is a u32 vector
1642 *
1643 *       if AAD is 8 bytes:
1644 *       AAD[3] = {A0, A1};
1645 *       padded AAD in xmm register = {A1 A0 0 0}
1646 *
1647 *       0                   1                   2                   3
1648 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1649 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1650 *       |                               SPI (A1)                        |
1651 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1652 *       |                     32-bit Sequence Number (A0)               |
1653 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1654 *       |                              0x0                              |
1655 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1656 *
1657 *                                 AAD Format with 32-bit Sequence Number
1658 *
1659 *       if AAD is 12 bytes:
1660 *       AAD[3] = {A0, A1, A2};
1661 *       padded AAD in xmm register = {A2 A1 A0 0}
1662 *
1663 *       0                   1                   2                   3
1664 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1665 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1666 *       |                               SPI (A2)                        |
1667 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1668 *       |                 64-bit Extended Sequence Number {A1,A0}       |
1669 *       |                                                               |
1670 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1671 *       |                              0x0                              |
1672 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1673 *
1674 *                         AAD Format with 64-bit Extended Sequence Number
1675 *
1676 * poly = x^128 + x^127 + x^126 + x^121 + 1
1677 ***************************************************************************/
1678 SYM_FUNC_START(aesni_gcm_enc)       /* one-shot GCM encrypt: GCM_INIT, GCM_ENC_DEC enc, GCM_COMPLETE in a single call */
1679         FUNC_SAVE
1680
1681         GCM_INIT %arg6, arg7, arg8, arg9        /* prototype args 6..9: iv, hash_subkey, aad, aad_len */
1682         GCM_ENC_DEC enc         /* data pass, "enc" variant of the shared macro */
1683
1684         GCM_COMPLETE arg10, arg11       /* prototype args 10..11: auth_tag, auth_tag_len */
1685         FUNC_RESTORE
1686         ret
1687 SYM_FUNC_END(aesni_gcm_enc)
1688
1689 /*****************************************************************************
1690 * void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1691 *                     struct gcm_context_data *data,
1692 *                                         // context data
1693 *                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1694 *                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1695 *                                         // concatenated with 0x00000001. 16-byte aligned pointer.
1696 *                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1697 *                     const u8 *aad,      // Additional Authentication Data (AAD)
1698 *                     u64 aad_len)        // Length of AAD in bytes.
1699 */
1700 SYM_FUNC_START(aesni_gcm_init)      /* first step of the split init/update/finalize interface: runs GCM_INIT only */
1701         FUNC_SAVE
1702         GCM_INIT %arg3, %arg4,%arg5, %arg6      /* prototype args 3..6: iv, hash_subkey, aad, aad_len */
1703         FUNC_RESTORE
1704         ret
1705 SYM_FUNC_END(aesni_gcm_init)
1706
1707 /*****************************************************************************
1708 * void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1709 *                    struct gcm_context_data *data,
1710 *                                        // context data
1711 *                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1712 *                    const u8 *in,       // Plaintext input
1713 *                    u64 plaintext_len); // Length of data in bytes for encryption.
1714 */
1715 SYM_FUNC_START(aesni_gcm_enc_update)        /* data step of the split interface: runs only the GCM_ENC_DEC pass, encrypt variant */
1716         FUNC_SAVE
1717         GCM_ENC_DEC enc         /* no GCM_INIT / GCM_COMPLETE here; those are separate entry points */
1718         FUNC_RESTORE
1719         ret
1720 SYM_FUNC_END(aesni_gcm_enc_update)
1721
1722 /*****************************************************************************
1723 * void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1724 *                    struct gcm_context_data *data,
1725 *                                        // context data
1726 *                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
1727 *                    const u8 *in,       // Ciphertext input
1728 *                    u64 plaintext_len); // Length of data in bytes for decryption.
1729 */
1730 SYM_FUNC_START(aesni_gcm_dec_update)        /* data step of the split interface: runs only the GCM_ENC_DEC pass, decrypt variant */
1731         FUNC_SAVE
1732         GCM_ENC_DEC dec         /* no GCM_INIT / GCM_COMPLETE here; those are separate entry points */
1733         FUNC_RESTORE
1734         ret
1735 SYM_FUNC_END(aesni_gcm_dec_update)
1736
1737 /*****************************************************************************
1738 * void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1739 *                    struct gcm_context_data *data,
1740 *                                        // context data
1741 *                    u8 *auth_tag,       // Authenticated Tag output.
1742 *                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1743 *                                        // 12 or 8.
1744 */
1745 SYM_FUNC_START(aesni_gcm_finalize)  /* last step of the split interface: runs GCM_COMPLETE only */
1746         FUNC_SAVE
1747         GCM_COMPLETE %arg3, %arg4       /* prototype args 3..4: auth_tag, auth_tag_len; comma-separated like every other macro call here */
1748         FUNC_RESTORE
1749         ret
1750 SYM_FUNC_END(aesni_gcm_finalize)
1751
1752 #endif
1753
1754
1755 SYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128)      /* alias: the AES-128 path calls the same body as _key_expansion_256a */
1756 SYM_FUNC_START_LOCAL(_key_expansion_256a)   /* in: xmm0 = previous key, xmm1 = aeskeygenassist result, xmm4 dword 0 = 0; out: xmm0 = new key, stored at (TKEYP) */
1757         pshufd $0b11111111, %xmm1, %xmm1        # broadcast dword 3 of the keygen result to all lanes
1758         shufps $0b00010000, %xmm0, %xmm4        # xmm4 dwords 0..3 = 0, 0, k1, k0 (k = dwords of xmm0)
1759         pxor %xmm4, %xmm0                       # xmm0 = k0, k1, k2^k1, k3^k0
1760         shufps $0b10001100, %xmm0, %xmm4        # xmm4 dwords 0..3 = 0, k0, k0, k1^k2; dword 0 stays 0 for the next call
1761         pxor %xmm4, %xmm0                       # xmm0 dword i = k0 ^ ... ^ ki (running XOR)
1762         pxor %xmm1, %xmm0                       # mix the broadcast keygen word into every lane
1763         movaps %xmm0, (TKEYP)                   # store the new round key (aligned store)
1764         add $0x10, TKEYP                        # advance to the next 16-byte key slot
1765         ret
1766 SYM_FUNC_END(_key_expansion_256a)
1767 SYM_FUNC_END_ALIAS(_key_expansion_128)
1768
1769 SYM_FUNC_START_LOCAL(_key_expansion_192a)   /* AES-192 step that emits two 16-byte slots; in: xmm0, xmm2 (low 2 dwords), xmm1 = keygen result, xmm4 dword 0 = 0 */
1770         pshufd $0b01010101, %xmm1, %xmm1        # broadcast dword 1 of the keygen result
1771         shufps $0b00010000, %xmm0, %xmm4        # xmm4 dwords 0..3 = 0, 0, k1, k0
1772         pxor %xmm4, %xmm0
1773         shufps $0b10001100, %xmm0, %xmm4        # xmm4 dwords 0..3 = 0, k0, k0, k1^k2
1774         pxor %xmm4, %xmm0                       # xmm0 dword i = k0 ^ ... ^ ki
1775         pxor %xmm1, %xmm0                       # xmm0 = next four key words
1776
1777         movaps %xmm2, %xmm5
1778         movaps %xmm2, %xmm6                     # xmm6 keeps the old xmm2 for the first store below
1779         pslldq $4, %xmm5                        # xmm5 = old xmm2 shifted up by one dword
1780         pshufd $0b11111111, %xmm0, %xmm3        # broadcast dword 3 of the new xmm0
1781         pxor %xmm3, %xmm2
1782         pxor %xmm5, %xmm2                       # xmm2 = old xmm2 ^ (old xmm2 << 32) ^ broadcast word
1783
1784         movaps %xmm0, %xmm1
1785         shufps $0b01000100, %xmm0, %xmm6        # xmm6 = old xmm2 dwords 0,1 followed by new xmm0 dwords 0,1
1786         movaps %xmm6, (TKEYP)                   # first 16-byte slot
1787         shufps $0b01001110, %xmm2, %xmm1        # xmm1 = new xmm0 dwords 2,3 followed by new xmm2 dwords 0,1
1788         movaps %xmm1, 0x10(TKEYP)               # second 16-byte slot
1789         add $0x20, TKEYP                        # two slots written
1790         ret
1791 SYM_FUNC_END(_key_expansion_192a)
1792
1793 SYM_FUNC_START_LOCAL(_key_expansion_192b)   /* AES-192 step that emits one 16-byte slot (xmm0) and updates xmm2 for the following 192a call */
1794         pshufd $0b01010101, %xmm1, %xmm1        # broadcast dword 1 of the keygen result
1795         shufps $0b00010000, %xmm0, %xmm4        # xmm4 dwords 0..3 = 0, 0, k1, k0
1796         pxor %xmm4, %xmm0
1797         shufps $0b10001100, %xmm0, %xmm4        # xmm4 dwords 0..3 = 0, k0, k0, k1^k2
1798         pxor %xmm4, %xmm0                       # xmm0 dword i = k0 ^ ... ^ ki
1799         pxor %xmm1, %xmm0                       # xmm0 = next four key words
1800
1801         movaps %xmm2, %xmm5
1802         pslldq $4, %xmm5                        # xmm5 = old xmm2 shifted up by one dword
1803         pshufd $0b11111111, %xmm0, %xmm3        # broadcast dword 3 of the new xmm0
1804         pxor %xmm3, %xmm2
1805         pxor %xmm5, %xmm2                       # xmm2 = old xmm2 ^ (old xmm2 << 32) ^ broadcast word
1806
1807         movaps %xmm0, (TKEYP)                   # only xmm0 is stored here; xmm2 is carried in the register
1808         add $0x10, TKEYP
1809         ret
1810 SYM_FUNC_END(_key_expansion_192b)
1811
1812 SYM_FUNC_START_LOCAL(_key_expansion_256b)   /* same shape as _key_expansion_256a but works on xmm2 and uses dword 2 of the keygen result */
1813         pshufd $0b10101010, %xmm1, %xmm1        # broadcast dword 2 of the keygen result
1814         shufps $0b00010000, %xmm2, %xmm4        # xmm4 dwords 0..3 = 0, 0, k1, k0 (k = dwords of xmm2)
1815         pxor %xmm4, %xmm2
1816         shufps $0b10001100, %xmm2, %xmm4        # xmm4 dwords 0..3 = 0, k0, k0, k1^k2
1817         pxor %xmm4, %xmm2                       # xmm2 dword i = k0 ^ ... ^ ki
1818         pxor %xmm1, %xmm2                       # mix the broadcast keygen word into every lane
1819         movaps %xmm2, (TKEYP)                   # store the new round key (aligned store)
1820         add $0x10, TKEYP
1821         ret
1822 SYM_FUNC_END(_key_expansion_256b)
1823
1824 /*
1825  * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1826  *                   unsigned int key_len)
1827  */
1828 SYM_FUNC_START(aesni_set_key)       /* expand in_key into the encryption schedule at ctx+0, the decryption schedule at ctx+240, store key_len at ctx+480; returns 0 */
1829         FRAME_BEGIN
1830 #ifndef __x86_64__
1831         pushl KEYP
1832         movl (FRAME_OFFSET+8)(%esp), KEYP       # ctx (+8 skips the saved KEYP and the return address)
1833         movl (FRAME_OFFSET+12)(%esp), UKEYP     # in_key
1834         movl (FRAME_OFFSET+16)(%esp), %edx      # key_len
1835 #endif
1836         movups (UKEYP), %xmm0           # user key (first 16 bytes)
1837         movaps %xmm0, (KEYP)            # round key 0 is the raw key; ctx must be 16-byte aligned
1838         lea 0x10(KEYP), TKEYP           # key addr
1839         movl %edx, 480(KEYP)            # remember key_len; the enc/dec entry points reload it from here
1840         pxor %xmm4, %xmm4               # xmm4 is assumed 0 in _key_expansion_x
1841         cmp $24, %dl                    # below 24: 128-bit path, equal: 192-bit path, above: 256-bit path
1842         jb .Lenc_key128
1843         je .Lenc_key192
1844         movups 0x10(UKEYP), %xmm2       # other user key
1845         movaps %xmm2, (TKEYP)           # second half of the raw key is round key 1
1846         add $0x10, TKEYP
1847         aeskeygenassist $0x1, %xmm2, %xmm1      # round 1
1848         call _key_expansion_256a        # updates xmm0 from xmm2-derived keygen data
1849         aeskeygenassist $0x1, %xmm0, %xmm1
1850         call _key_expansion_256b        # updates xmm2 from xmm0-derived keygen data
1851         aeskeygenassist $0x2, %xmm2, %xmm1      # round 2
1852         call _key_expansion_256a
1853         aeskeygenassist $0x2, %xmm0, %xmm1
1854         call _key_expansion_256b
1855         aeskeygenassist $0x4, %xmm2, %xmm1      # round 3
1856         call _key_expansion_256a
1857         aeskeygenassist $0x4, %xmm0, %xmm1
1858         call _key_expansion_256b
1859         aeskeygenassist $0x8, %xmm2, %xmm1      # round 4
1860         call _key_expansion_256a
1861         aeskeygenassist $0x8, %xmm0, %xmm1
1862         call _key_expansion_256b
1863         aeskeygenassist $0x10, %xmm2, %xmm1     # round 5
1864         call _key_expansion_256a
1865         aeskeygenassist $0x10, %xmm0, %xmm1
1866         call _key_expansion_256b
1867         aeskeygenassist $0x20, %xmm2, %xmm1     # round 6
1868         call _key_expansion_256a
1869         aeskeygenassist $0x20, %xmm0, %xmm1
1870         call _key_expansion_256b
1871         aeskeygenassist $0x40, %xmm2, %xmm1     # round 7
1872         call _key_expansion_256a        # final key: no trailing 256b call
1873         jmp .Ldec_key
1874 .Lenc_key192:
1875         movq 0x10(UKEYP), %xmm2         # other user key
1876         aeskeygenassist $0x1, %xmm2, %xmm1      # round 1
1877         call _key_expansion_192a        # 192a writes two slots, 192b writes one; they alternate
1878         aeskeygenassist $0x2, %xmm2, %xmm1      # round 2
1879         call _key_expansion_192b
1880         aeskeygenassist $0x4, %xmm2, %xmm1      # round 3
1881         call _key_expansion_192a
1882         aeskeygenassist $0x8, %xmm2, %xmm1      # round 4
1883         call _key_expansion_192b
1884         aeskeygenassist $0x10, %xmm2, %xmm1     # round 5
1885         call _key_expansion_192a
1886         aeskeygenassist $0x20, %xmm2, %xmm1     # round 6
1887         call _key_expansion_192b
1888         aeskeygenassist $0x40, %xmm2, %xmm1     # round 7
1889         call _key_expansion_192a
1890         aeskeygenassist $0x80, %xmm2, %xmm1     # round 8
1891         call _key_expansion_192b
1892         jmp .Ldec_key
1893 .Lenc_key128:
1894         aeskeygenassist $0x1, %xmm0, %xmm1      # round 1
1895         call _key_expansion_128         # each call appends one round key and advances TKEYP
1896         aeskeygenassist $0x2, %xmm0, %xmm1      # round 2
1897         call _key_expansion_128
1898         aeskeygenassist $0x4, %xmm0, %xmm1      # round 3
1899         call _key_expansion_128
1900         aeskeygenassist $0x8, %xmm0, %xmm1      # round 4
1901         call _key_expansion_128
1902         aeskeygenassist $0x10, %xmm0, %xmm1     # round 5
1903         call _key_expansion_128
1904         aeskeygenassist $0x20, %xmm0, %xmm1     # round 6
1905         call _key_expansion_128
1906         aeskeygenassist $0x40, %xmm0, %xmm1     # round 7
1907         call _key_expansion_128
1908         aeskeygenassist $0x80, %xmm0, %xmm1     # round 8
1909         call _key_expansion_128
1910         aeskeygenassist $0x1b, %xmm0, %xmm1     # round 9
1911         call _key_expansion_128
1912         aeskeygenassist $0x36, %xmm0, %xmm1     # round 10
1913         call _key_expansion_128
1914 .Ldec_key:                              # build the decryption schedule at +240 from the encryption keys, in reverse order
1915         sub $0x10, TKEYP                # TKEYP = last encryption round key
1916         movaps (KEYP), %xmm0            # first encryption key
1917         movaps (TKEYP), %xmm1           # last encryption key
1918         movaps %xmm0, 240(TKEYP)        # first enc key becomes the last dec key (stored as is)
1919         movaps %xmm1, 240(KEYP)         # last enc key becomes the first dec key (stored as is)
1920         add $0x10, KEYP                 # KEYP walks up over the middle encryption keys
1921         lea 240-16(TKEYP), UKEYP        # UKEYP reused as the descending write pointer into the dec schedule
1922 .align 4
1923 .Ldec_key_loop:
1924         movaps (KEYP), %xmm0
1925         aesimc %xmm0, %xmm1             # middle keys are transformed with aesimc before being stored
1926         movaps %xmm1, (UKEYP)
1927         add $0x10, KEYP
1928         sub $0x10, UKEYP
1929         cmp TKEYP, KEYP
1930         jb .Ldec_key_loop               # stop before the last enc key, which was already copied above
1931         xor AREG, AREG                  # return value 0
1932 #ifndef __x86_64__
1933         popl KEYP
1934 #endif
1935         FRAME_END
1936         ret
1937 SYM_FUNC_END(aesni_set_key)
1938
1939 /*
1940  * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
1941  */
1942 SYM_FUNC_START(aesni_enc)           /* encrypt a single 16-byte block from src into dst using the schedule in ctx */
1943         FRAME_BEGIN
1944 #ifndef __x86_64__
1945         pushl KEYP
1946         pushl KLEN
1947         movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx (+12 skips two saved registers and the return address)
1948         movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
1949         movl (FRAME_OFFSET+20)(%esp), INP       # src
1950 #endif
1951         movl 480(KEYP), KLEN            # key length
1952         movups (INP), STATE             # input (unaligned load allowed)
1953         call _aesni_enc1                # STATE = encrypted block
1954         movups STATE, (OUTP)            # output (unaligned store allowed)
1955 #ifndef __x86_64__
1956         popl KLEN
1957         popl KEYP
1958 #endif
1959         FRAME_END
1960         ret
1961 SYM_FUNC_END(aesni_enc)
1962
1963 /*
1964  * _aesni_enc1:         internal ABI
1965  * input:
1966  *      KEYP:           key struct pointer
1967  *      KLEN:           key length in bytes
1968  *      STATE:          initial state (input)
1969  * output:
1970  *      STATE:          final state (output)
1971  * changed:
1972  *      KEY
1973  *      TKEYP (T1)
1974  */
1975 SYM_FUNC_START_LOCAL(_aesni_enc1)   /* encrypt STATE in place; extra rounds for longer keys are entered before the shared tail */
1976         movaps (KEYP), KEY              # key
1977         mov KEYP, TKEYP
1978         pxor KEY, STATE         # round 0
1979         add $0x30, TKEYP                # bias TKEYP so the shared tail can use fixed offsets
1980         cmp $24, KLEN                   # below 24: 128-bit key, equal: 192-bit key, above: 256-bit key
1981         jb .Lenc128
1982         lea 0x20(TKEYP), TKEYP          # longer schedule: +0x20; lea leaves the cmp flags intact for the je
1983         je .Lenc192
1984         add $0x20, TKEYP                # 256-bit key: a further +0x20
1985         movaps -0x60(TKEYP), KEY        # on every path the first aesenc uses the key at KEYP+0x10
1986         aesenc KEY, STATE
1987         movaps -0x50(TKEYP), KEY
1988         aesenc KEY, STATE
1989 .align 4
1990 .Lenc192:                               # 192-bit entry; the 256-bit path falls through to here
1991         movaps -0x40(TKEYP), KEY
1992         aesenc KEY, STATE
1993         movaps -0x30(TKEYP), KEY
1994         aesenc KEY, STATE
1995 .align 4
1996 .Lenc128:                               # shared tail: nine aesenc rounds plus aesenclast
1997         movaps -0x20(TKEYP), KEY
1998         aesenc KEY, STATE
1999         movaps -0x10(TKEYP), KEY
2000         aesenc KEY, STATE
2001         movaps (TKEYP), KEY
2002         aesenc KEY, STATE
2003         movaps 0x10(TKEYP), KEY
2004         aesenc KEY, STATE
2005         movaps 0x20(TKEYP), KEY
2006         aesenc KEY, STATE
2007         movaps 0x30(TKEYP), KEY
2008         aesenc KEY, STATE
2009         movaps 0x40(TKEYP), KEY
2010         aesenc KEY, STATE
2011         movaps 0x50(TKEYP), KEY
2012         aesenc KEY, STATE
2013         movaps 0x60(TKEYP), KEY
2014         aesenc KEY, STATE
2015         movaps 0x70(TKEYP), KEY
2016         aesenclast KEY, STATE           # last round
2017         ret
2018 SYM_FUNC_END(_aesni_enc1)
2019
2020 /*
2021  * _aesni_enc4: internal ABI
2022  * input:
2023  *      KEYP:           key struct pointer
2024  *      KLEN:           key length in bytes
2025  *      STATE1:         initial state (input)
2026  *      STATE2
2027  *      STATE3
2028  *      STATE4
2029  * output:
2030  *      STATE1:         final state (output)
2031  *      STATE2
2032  *      STATE3
2033  *      STATE4
2034  * changed:
2035  *      KEY
2036  *      TKEYP (T1)
2037  */
2038 SYM_FUNC_START_LOCAL(_aesni_enc4)   /* four-block version of _aesni_enc1: each round key is loaded once and applied to STATE1..STATE4 */
2039         movaps (KEYP), KEY              # key
2040         mov KEYP, TKEYP
2041         pxor KEY, STATE1                # round 0
2042         pxor KEY, STATE2
2043         pxor KEY, STATE3
2044         pxor KEY, STATE4
2045         add $0x30, TKEYP                # bias TKEYP so the shared tail can use fixed offsets
2046         cmp $24, KLEN                   # below 24: 128-bit key, equal: 192-bit key, above: 256-bit key
2047         jb .L4enc128
2048         lea 0x20(TKEYP), TKEYP          # longer schedule: +0x20; lea leaves the cmp flags intact for the je
2049         je .L4enc192
2050         add $0x20, TKEYP                # 256-bit key: a further +0x20
2051         movaps -0x60(TKEYP), KEY
2052         aesenc KEY, STATE1
2053         aesenc KEY, STATE2
2054         aesenc KEY, STATE3
2055         aesenc KEY, STATE4
2056         movaps -0x50(TKEYP), KEY
2057         aesenc KEY, STATE1
2058         aesenc KEY, STATE2
2059         aesenc KEY, STATE3
2060         aesenc KEY, STATE4
2061 #.align 4
2062 .L4enc192:                              # 192-bit entry; the 256-bit path falls through to here
2063         movaps -0x40(TKEYP), KEY
2064         aesenc KEY, STATE1
2065         aesenc KEY, STATE2
2066         aesenc KEY, STATE3
2067         aesenc KEY, STATE4
2068         movaps -0x30(TKEYP), KEY
2069         aesenc KEY, STATE1
2070         aesenc KEY, STATE2
2071         aesenc KEY, STATE3
2072         aesenc KEY, STATE4
2073 #.align 4
2074 .L4enc128:                              # shared tail: nine aesenc rounds plus aesenclast
2075         movaps -0x20(TKEYP), KEY
2076         aesenc KEY, STATE1
2077         aesenc KEY, STATE2
2078         aesenc KEY, STATE3
2079         aesenc KEY, STATE4
2080         movaps -0x10(TKEYP), KEY
2081         aesenc KEY, STATE1
2082         aesenc KEY, STATE2
2083         aesenc KEY, STATE3
2084         aesenc KEY, STATE4
2085         movaps (TKEYP), KEY
2086         aesenc KEY, STATE1
2087         aesenc KEY, STATE2
2088         aesenc KEY, STATE3
2089         aesenc KEY, STATE4
2090         movaps 0x10(TKEYP), KEY
2091         aesenc KEY, STATE1
2092         aesenc KEY, STATE2
2093         aesenc KEY, STATE3
2094         aesenc KEY, STATE4
2095         movaps 0x20(TKEYP), KEY
2096         aesenc KEY, STATE1
2097         aesenc KEY, STATE2
2098         aesenc KEY, STATE3
2099         aesenc KEY, STATE4
2100         movaps 0x30(TKEYP), KEY
2101         aesenc KEY, STATE1
2102         aesenc KEY, STATE2
2103         aesenc KEY, STATE3
2104         aesenc KEY, STATE4
2105         movaps 0x40(TKEYP), KEY
2106         aesenc KEY, STATE1
2107         aesenc KEY, STATE2
2108         aesenc KEY, STATE3
2109         aesenc KEY, STATE4
2110         movaps 0x50(TKEYP), KEY
2111         aesenc KEY, STATE1
2112         aesenc KEY, STATE2
2113         aesenc KEY, STATE3
2114         aesenc KEY, STATE4
2115         movaps 0x60(TKEYP), KEY
2116         aesenc KEY, STATE1
2117         aesenc KEY, STATE2
2118         aesenc KEY, STATE3
2119         aesenc KEY, STATE4
2120         movaps 0x70(TKEYP), KEY
2121         aesenclast KEY, STATE1          # last round
2122         aesenclast KEY, STATE2
2123         aesenclast KEY, STATE3
2124         aesenclast KEY, STATE4
2125         ret
2126 SYM_FUNC_END(_aesni_enc4)
2127
2128 /*
2129  * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
2130  */
2131 SYM_FUNC_START(aesni_dec)           /* decrypt a single 16-byte block from src into dst using the schedule in ctx */
2132         FRAME_BEGIN
2133 #ifndef __x86_64__
2134         pushl KEYP
2135         pushl KLEN
2136         movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx (+12 skips two saved registers and the return address)
2137         movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
2138         movl (FRAME_OFFSET+20)(%esp), INP       # src
2139 #endif
2140         mov 480(KEYP), KLEN             # key length
2141         add $240, KEYP                  # switch to the decryption schedule that aesni_set_key wrote at +240
2142         movups (INP), STATE             # input
2143         call _aesni_dec1                # STATE = decrypted block
2144         movups STATE, (OUTP)            #output
2145 #ifndef __x86_64__
2146         popl KLEN
2147         popl KEYP
2148 #endif
2149         FRAME_END
2150         ret
2151 SYM_FUNC_END(aesni_dec)
2152
2153 /*
2154  * _aesni_dec1:         internal ABI
2155  * input:
2156  *      KEYP:           key struct pointer
2157  *      KLEN:           key length
2158  *      STATE:          initial state (input)
2159  * output:
2160  *      STATE:          final state (output)
2161  * changed:
2162  *      KEY
2163  *      TKEYP (T1)
2164  */
2165 SYM_FUNC_START_LOCAL(_aesni_dec1)   /* decrypt STATE in place; KEYP must already point at the decryption schedule */
2166         movaps (KEYP), KEY              # key
2167         mov KEYP, TKEYP
2168         pxor KEY, STATE         # round 0
2169         add $0x30, TKEYP                # bias TKEYP so the shared tail can use fixed offsets
2170         cmp $24, KLEN                   # below 24: 128-bit key, equal: 192-bit key, above: 256-bit key
2171         jb .Ldec128
2172         lea 0x20(TKEYP), TKEYP          # longer schedule: +0x20; lea leaves the cmp flags intact for the je
2173         je .Ldec192
2174         add $0x20, TKEYP                # 256-bit key: a further +0x20
2175         movaps -0x60(TKEYP), KEY        # on every path the first aesdec uses the key at KEYP+0x10
2176         aesdec KEY, STATE
2177         movaps -0x50(TKEYP), KEY
2178         aesdec KEY, STATE
2179 .align 4
2180 .Ldec192:                               # 192-bit entry; the 256-bit path falls through to here
2181         movaps -0x40(TKEYP), KEY
2182         aesdec KEY, STATE
2183         movaps -0x30(TKEYP), KEY
2184         aesdec KEY, STATE
2185 .align 4
2186 .Ldec128:                               # shared tail: nine aesdec rounds plus aesdeclast
2187         movaps -0x20(TKEYP), KEY
2188         aesdec KEY, STATE
2189         movaps -0x10(TKEYP), KEY
2190         aesdec KEY, STATE
2191         movaps (TKEYP), KEY
2192         aesdec KEY, STATE
2193         movaps 0x10(TKEYP), KEY
2194         aesdec KEY, STATE
2195         movaps 0x20(TKEYP), KEY
2196         aesdec KEY, STATE
2197         movaps 0x30(TKEYP), KEY
2198         aesdec KEY, STATE
2199         movaps 0x40(TKEYP), KEY
2200         aesdec KEY, STATE
2201         movaps 0x50(TKEYP), KEY
2202         aesdec KEY, STATE
2203         movaps 0x60(TKEYP), KEY
2204         aesdec KEY, STATE
2205         movaps 0x70(TKEYP), KEY
2206         aesdeclast KEY, STATE           # last round
2207         ret
2208 SYM_FUNC_END(_aesni_dec1)
2209
2210 /*
2211  * _aesni_dec4: internal ABI
2212  * input:
2213  *      KEYP:           key struct pointer
2214  *      KLEN:           key length
2215  *      STATE1:         initial state (input)
2216  *      STATE2
2217  *      STATE3
2218  *      STATE4
2219  * output:
2220  *      STATE1:         final state (output)
2221  *      STATE2
2222  *      STATE3
2223  *      STATE4
2224  * changed:
2225  *      KEY
2226  *      TKEYP (T1)
2227  */
2228 SYM_FUNC_START_LOCAL(_aesni_dec4)   /* four-block version of _aesni_dec1: each round key is loaded once and applied to STATE1..STATE4 */
2229         movaps (KEYP), KEY              # key
2230         mov KEYP, TKEYP
2231         pxor KEY, STATE1                # round 0
2232         pxor KEY, STATE2
2233         pxor KEY, STATE3
2234         pxor KEY, STATE4
2235         add $0x30, TKEYP                # bias TKEYP so the shared tail can use fixed offsets
2236         cmp $24, KLEN                   # below 24: 128-bit key, equal: 192-bit key, above: 256-bit key
2237         jb .L4dec128
2238         lea 0x20(TKEYP), TKEYP          # longer schedule: +0x20; lea leaves the cmp flags intact for the je
2239         je .L4dec192
2240         add $0x20, TKEYP                # 256-bit key: a further +0x20
2241         movaps -0x60(TKEYP), KEY
2242         aesdec KEY, STATE1
2243         aesdec KEY, STATE2
2244         aesdec KEY, STATE3
2245         aesdec KEY, STATE4
2246         movaps -0x50(TKEYP), KEY
2247         aesdec KEY, STATE1
2248         aesdec KEY, STATE2
2249         aesdec KEY, STATE3
2250         aesdec KEY, STATE4
2251 .align 4
2252 .L4dec192:                              # 192-bit entry; the 256-bit path falls through to here
2253         movaps -0x40(TKEYP), KEY
2254         aesdec KEY, STATE1
2255         aesdec KEY, STATE2
2256         aesdec KEY, STATE3
2257         aesdec KEY, STATE4
2258         movaps -0x30(TKEYP), KEY
2259         aesdec KEY, STATE1
2260         aesdec KEY, STATE2
2261         aesdec KEY, STATE3
2262         aesdec KEY, STATE4
2263 .align 4
2264 .L4dec128:                              # shared tail: nine aesdec rounds plus aesdeclast
2265         movaps -0x20(TKEYP), KEY
2266         aesdec KEY, STATE1
2267         aesdec KEY, STATE2
2268         aesdec KEY, STATE3
2269         aesdec KEY, STATE4
2270         movaps -0x10(TKEYP), KEY
2271         aesdec KEY, STATE1
2272         aesdec KEY, STATE2
2273         aesdec KEY, STATE3
2274         aesdec KEY, STATE4
2275         movaps (TKEYP), KEY
2276         aesdec KEY, STATE1
2277         aesdec KEY, STATE2
2278         aesdec KEY, STATE3
2279         aesdec KEY, STATE4
2280         movaps 0x10(TKEYP), KEY
2281         aesdec KEY, STATE1
2282         aesdec KEY, STATE2
2283         aesdec KEY, STATE3
2284         aesdec KEY, STATE4
2285         movaps 0x20(TKEYP), KEY
2286         aesdec KEY, STATE1
2287         aesdec KEY, STATE2
2288         aesdec KEY, STATE3
2289         aesdec KEY, STATE4
2290         movaps 0x30(TKEYP), KEY
2291         aesdec KEY, STATE1
2292         aesdec KEY, STATE2
2293         aesdec KEY, STATE3
2294         aesdec KEY, STATE4
2295         movaps 0x40(TKEYP), KEY
2296         aesdec KEY, STATE1
2297         aesdec KEY, STATE2
2298         aesdec KEY, STATE3
2299         aesdec KEY, STATE4
2300         movaps 0x50(TKEYP), KEY
2301         aesdec KEY, STATE1
2302         aesdec KEY, STATE2
2303         aesdec KEY, STATE3
2304         aesdec KEY, STATE4
2305         movaps 0x60(TKEYP), KEY
2306         aesdec KEY, STATE1
2307         aesdec KEY, STATE2
2308         aesdec KEY, STATE3
2309         aesdec KEY, STATE4
2310         movaps 0x70(TKEYP), KEY
2311         aesdeclast KEY, STATE1          # last round
2312         aesdeclast KEY, STATE2
2313         aesdeclast KEY, STATE3
2314         aesdeclast KEY, STATE4
2315         ret
2316 SYM_FUNC_END(_aesni_dec4)
2317
2318 /*
2319  * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2320  *                    size_t len)
2321  */
2322 SYM_FUNC_START(aesni_ecb_enc)       /* ECB-encrypt len bytes from src to dst: 64-byte groups first, then single 16-byte blocks; a tail shorter than 16 bytes is left untouched */
2323         FRAME_BEGIN
2324 #ifndef __x86_64__
2325         pushl LEN
2326         pushl KEYP
2327         pushl KLEN
2328         movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx (+16 skips three saved registers and the return address)
2329         movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
2330         movl (FRAME_OFFSET+24)(%esp), INP       # src
2331         movl (FRAME_OFFSET+28)(%esp), LEN       # len
2332 #endif
2333         test LEN, LEN           # check length
2334         jz .Lecb_enc_ret
2335         mov 480(KEYP), KLEN             # key length, consumed by _aesni_enc1/_aesni_enc4
2336         cmp $16, LEN
2337         jb .Lecb_enc_ret                # less than one block: nothing to do
2338         cmp $64, LEN
2339         jb .Lecb_enc_loop1              # fewer than four blocks: single-block loop only
2340 .align 4
2341 .Lecb_enc_loop4:                        # invariant: LEN >= 64 on entry to each iteration
2342         movups (INP), STATE1
2343         movups 0x10(INP), STATE2
2344         movups 0x20(INP), STATE3
2345         movups 0x30(INP), STATE4
2346         call _aesni_enc4
2347         movups STATE1, (OUTP)
2348         movups STATE2, 0x10(OUTP)
2349         movups STATE3, 0x20(OUTP)
2350         movups STATE4, 0x30(OUTP)
2351         sub $64, LEN
2352         add $64, INP
2353         add $64, OUTP
2354         cmp $64, LEN
2355         jae .Lecb_enc_loop4             # unsigned, matching the jb tests: LEN is a byte count, not a signed value
2356         cmp $16, LEN
2357         jb .Lecb_enc_ret
2358 .align 4
2359 .Lecb_enc_loop1:                        # invariant: LEN >= 16 on entry to each iteration
2360         movups (INP), STATE1
2361         call _aesni_enc1
2362         movups STATE1, (OUTP)
2363         sub $16, LEN
2364         add $16, INP
2365         add $16, OUTP
2366         cmp $16, LEN
2367         jae .Lecb_enc_loop1             # unsigned, matching the jb tests
2368 .Lecb_enc_ret:
2369 #ifndef __x86_64__
2370         popl KLEN
2371         popl KEYP
2372         popl LEN
2373 #endif
2374         FRAME_END
2375         ret
2376 SYM_FUNC_END(aesni_ecb_enc)
2377
2378 /*
2379  * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2380  *                    size_t len);
2381  */
2382 SYM_FUNC_START(aesni_ecb_dec)       /* ECB-decrypt len bytes from src to dst: 64-byte groups first, then single 16-byte blocks; a tail shorter than 16 bytes is left untouched */
2383         FRAME_BEGIN
2384 #ifndef __x86_64__
2385         pushl LEN
2386         pushl KEYP
2387         pushl KLEN
2388         movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx (+16 skips three saved registers and the return address)
2389         movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
2390         movl (FRAME_OFFSET+24)(%esp), INP       # src
2391         movl (FRAME_OFFSET+28)(%esp), LEN       # len
2392 #endif
2393         test LEN, LEN
2394         jz .Lecb_dec_ret
2395         mov 480(KEYP), KLEN             # key length, consumed by _aesni_dec1/_aesni_dec4
2396         add $240, KEYP                  # switch to the decryption schedule at +240
2397         cmp $16, LEN
2398         jb .Lecb_dec_ret                # less than one block: nothing to do
2399         cmp $64, LEN
2400         jb .Lecb_dec_loop1              # fewer than four blocks: single-block loop only
2401 .align 4
2402 .Lecb_dec_loop4:                        # invariant: LEN >= 64 on entry to each iteration
2403         movups (INP), STATE1
2404         movups 0x10(INP), STATE2
2405         movups 0x20(INP), STATE3
2406         movups 0x30(INP), STATE4
2407         call _aesni_dec4
2408         movups STATE1, (OUTP)
2409         movups STATE2, 0x10(OUTP)
2410         movups STATE3, 0x20(OUTP)
2411         movups STATE4, 0x30(OUTP)
2412         sub $64, LEN
2413         add $64, INP
2414         add $64, OUTP
2415         cmp $64, LEN
2416         jae .Lecb_dec_loop4             # unsigned, matching the jb tests: LEN is a byte count, not a signed value
2417         cmp $16, LEN
2418         jb .Lecb_dec_ret
2419 .align 4
2420 .Lecb_dec_loop1:                        # invariant: LEN >= 16 on entry to each iteration
2421         movups (INP), STATE1
2422         call _aesni_dec1
2423         movups STATE1, (OUTP)
2424         sub $16, LEN
2425         add $16, INP
2426         add $16, OUTP
2427         cmp $16, LEN
2428         jae .Lecb_dec_loop1             # unsigned, matching the jb tests
2429 .Lecb_dec_ret:
2430 #ifndef __x86_64__
2431         popl KLEN
2432         popl KEYP
2433         popl LEN
2434 #endif
2435         FRAME_END
2436         ret
2437 SYM_FUNC_END(aesni_ecb_dec)
2438
2439 /*
2440  * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2441  *                    size_t len, u8 *iv)
      *
      * CBC-encrypt the whole 16-byte blocks of src into dst.  The chaining
      * value starts as *iv; the last ciphertext block is written back to *iv
      * on exit.  If len < 16 nothing is processed and *iv is left unchanged.
      * A tail shorter than 16 bytes is ignored.
      *
      * len is an unsigned byte count, so the loop test uses the unsigned jae,
      * matching the unsigned jb used for the entry check.
2442  */
2443 SYM_FUNC_START(aesni_cbc_enc)
2444         FRAME_BEGIN
2445 #ifndef __x86_64__
2446         pushl IVP
2447         pushl LEN
2448         pushl KEYP
2449         pushl KLEN
             /* args sit above 4 pushed registers + the return address (20 bytes) */
2450         movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2451         movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2452         movl (FRAME_OFFSET+28)(%esp), INP       # src
2453         movl (FRAME_OFFSET+32)(%esp), LEN       # len
2454         movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2455 #endif
2456         cmp $16, LEN
2457         jb .Lcbc_enc_ret                /* less than one block: skip the IV write-back too */
2458         mov 480(KEYP), KLEN             /* presumably ctx->key_length - confirm */
2459         movups (IVP), STATE     # load iv as initial state
2460 .align 4
2461 .Lcbc_enc_loop:
2462         movups (INP), IN        # load input
2463         pxor IN, STATE                  /* plaintext ^ previous ciphertext (or IV) */
2464         call _aesni_enc1
2465         movups STATE, (OUTP)    # store output
2466         sub $16, LEN
2467         add $16, INP
2468         add $16, OUTP
2469         cmp $16, LEN
2470         jae .Lcbc_enc_loop              /* unsigned: LEN is a byte count */
2471         movups STATE, (IVP)             /* last ciphertext block becomes the new IV */
2472 .Lcbc_enc_ret:
2473 #ifndef __x86_64__
2474         popl KLEN
2475         popl KEYP
2476         popl LEN
2477         popl IVP
2478 #endif
2479         FRAME_END
2480         ret
2481 SYM_FUNC_END(aesni_cbc_enc)
2482
2483 /*
2484  * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2485  *                    size_t len, u8 *iv)
      *
      * CBC-decrypt the whole 16-byte blocks of src into dst.  Blocks are
      * decrypted four at a time through _aesni_dec4 while at least 64 bytes
      * remain, then singly through _aesni_dec1.  IV carries the previous
      * ciphertext block between iterations and is written back to *iv on
      * exit.  If len < 16 nothing is processed and *iv is left unchanged.
      *
      * len is an unsigned byte count, so the loop tests use the unsigned jae,
      * matching the unsigned jb used for the entry checks.
2486  */
2487 SYM_FUNC_START(aesni_cbc_dec)
2488         FRAME_BEGIN
2489 #ifndef __x86_64__
2490         pushl IVP
2491         pushl LEN
2492         pushl KEYP
2493         pushl KLEN
             /* args sit above 4 pushed registers + the return address (20 bytes) */
2494         movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2495         movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2496         movl (FRAME_OFFSET+28)(%esp), INP       # src
2497         movl (FRAME_OFFSET+32)(%esp), LEN       # len
2498         movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2499 #endif
2500         cmp $16, LEN
2501         jb .Lcbc_dec_just_ret           /* less than one block: leave *iv alone */
2502         mov 480(KEYP), KLEN             /* presumably ctx->key_length - confirm */
2503         add $240, KEYP                  /* presumably the decryption key schedule - confirm */
2504         movups (IVP), IV
2505         cmp $64, LEN
2506         jb .Lcbc_dec_loop1
2507 .align 4
2508 .Lcbc_dec_loop4:
             /* keep a copy of each ciphertext block: it is the XOR mask of the next one */
2509         movups (INP), IN1
2510         movaps IN1, STATE1
2511         movups 0x10(INP), IN2
2512         movaps IN2, STATE2
2513 #ifdef __x86_64__
2514         movups 0x20(INP), IN3
2515         movaps IN3, STATE3
2516         movups 0x30(INP), IN4
2517         movaps IN4, STATE4
2518 #else
             /* 32-bit path: IN1/IN2 are reused for blocks 2 and 3 */
2519         movups 0x20(INP), IN1
2520         movaps IN1, STATE3
2521         movups 0x30(INP), IN2
2522         movaps IN2, STATE4
2523 #endif
2524         call _aesni_dec4
2525         pxor IV, STATE1                 /* block 0 is chained to the incoming IV */
2526 #ifdef __x86_64__
2527         pxor IN1, STATE2
2528         pxor IN2, STATE3
2529         pxor IN3, STATE4
2530         movaps IN4, IV                  /* last ciphertext block is the next IV */
2531 #else
2532         pxor IN1, STATE4                /* IN1 still holds ciphertext block 2 */
2533         movaps IN2, IV                  /* IN2 still holds ciphertext block 3 */
             /* reload ciphertext blocks 0 and 1 before anything is stored to OUTP */
2534         movups (INP), IN1
2535         pxor IN1, STATE2
2536         movups 0x10(INP), IN2
2537         pxor IN2, STATE3
2538 #endif
2539         movups STATE1, (OUTP)
2540         movups STATE2, 0x10(OUTP)
2541         movups STATE3, 0x20(OUTP)
2542         movups STATE4, 0x30(OUTP)
2543         sub $64, LEN
2544         add $64, INP
2545         add $64, OUTP
2546         cmp $64, LEN
2547         jae .Lcbc_dec_loop4             /* unsigned: LEN is a byte count */
2548         cmp $16, LEN
2549         jb .Lcbc_dec_ret
2550 .align 4
2551 .Lcbc_dec_loop1:
2552         movups (INP), IN
2553         movaps IN, STATE
2554         call _aesni_dec1
2555         pxor IV, STATE
2556         movups STATE, (OUTP)
2557         movaps IN, IV                   /* this ciphertext block chains into the next */
2558         sub $16, LEN
2559         add $16, INP
2560         add $16, OUTP
2561         cmp $16, LEN
2562         jae .Lcbc_dec_loop1             /* unsigned: LEN is a byte count */
2563 .Lcbc_dec_ret:
2564         movups IV, (IVP)
2565 .Lcbc_dec_just_ret:
2566 #ifndef __x86_64__
2567         popl KLEN
2568         popl KEYP
2569         popl LEN
2570         popl IVP
2571 #endif
2572         FRAME_END
2573         ret
2574 SYM_FUNC_END(aesni_cbc_dec)
2575
2576 /*
2577  * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2578  *                        size_t len, u8 *iv)
      *
      * CBC-encrypt the last two blocks of a message with ciphertext stealing.
      * The first 16 bytes of src are encrypted normally; the remaining
      * len - 16 bytes are zero-padded, chained to that ciphertext and
      * encrypted as the block that ends up at dst[0..15].  The first
      * len - 16 bytes of the first ciphertext are stored at dst + 16.
      *
      * IVP and LEN are reused as scratch pointers, so *iv is not updated.
      *
      * NOTE(review): no length check is made here; the code only makes sense
      * for 16 < len <= 32 - confirm that callers guarantee this.
2579  */
2580 SYM_FUNC_START(aesni_cts_cbc_enc)
2581         FRAME_BEGIN
2582 #ifndef __x86_64__
2583         pushl IVP
2584         pushl LEN
2585         pushl KEYP
2586         pushl KLEN
2587         movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2588         movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2589         movl (FRAME_OFFSET+28)(%esp), INP       # src
2590         movl (FRAME_OFFSET+32)(%esp), LEN       # len
2591         movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2592         lea .Lcts_permute_table, T1
2593 #else
2594         lea .Lcts_permute_table(%rip), T1
2595 #endif
2596         mov 480(KEYP), KLEN             /* presumably ctx->key_length - confirm */
2597         movups (IVP), STATE
2598         sub $16, LEN                    /* LEN = bytes in the final block */
2599         mov T1, IVP
2600         add $32, IVP
2601         add LEN, T1                     /* T1  = table + LEN */
2602         sub LEN, IVP                    /* IVP = table + 32 - LEN */
2603         movups (T1), %xmm4              /* mask: first LEN bytes -> top, rest zero */
2604         movups (IVP), %xmm5             /* mask: last LEN bytes -> bottom, rest zero */
2605
2606         movups (INP), IN1               /* first block */
2607         add LEN, INP
2608         movups (INP), IN2               /* last 16 bytes of the input */
2609
2610         pxor IN1, STATE
2611         call _aesni_enc1                /* STATE = ciphertext of the first block */
2612
2613         pshufb %xmm5, IN2               /* IN2 = final block, zero padded */
2614         pxor STATE, IN2                 /* chain it to the first ciphertext */
2615         pshufb %xmm4, STATE
2616         add OUTP, LEN                   /* LEN is now a pointer: dst + (len - 16) */
2617         movups STATE, (LEN)             /* stolen bytes land at dst + 16; the low part is overwritten below */
2618
2619         movaps IN2, STATE
2620         call _aesni_enc1
2621         movups STATE, (OUTP)
2622
2623 #ifndef __x86_64__
2624         popl KLEN
2625         popl KEYP
2626         popl LEN
2627         popl IVP
2628 #endif
2629         FRAME_END
2630         ret
2631 SYM_FUNC_END(aesni_cts_cbc_enc)
2632
2633 /*
2634  * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2635  *                        size_t len, u8 *iv)
      *
      * CBC-decrypt the last two blocks of a message that was encrypted with
      * ciphertext stealing.  The first 16 bytes of src are decrypted to
      * recover both the final (partial) plaintext and the bytes that were
      * stolen; the stolen bytes are merged with the partial ciphertext tail
      * to rebuild a full block, which is then decrypted and chained to *iv.
      *
      * IVP and LEN are reused as scratch pointers, so *iv is not updated.
      *
      * NOTE(review): no length check is made here; the code only makes sense
      * for 16 < len <= 32 - confirm that callers guarantee this.
2636  */
2637 SYM_FUNC_START(aesni_cts_cbc_dec)
2638         FRAME_BEGIN
2639 #ifndef __x86_64__
2640         pushl IVP
2641         pushl LEN
2642         pushl KEYP
2643         pushl KLEN
2644         movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2645         movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2646         movl (FRAME_OFFSET+28)(%esp), INP       # src
2647         movl (FRAME_OFFSET+32)(%esp), LEN       # len
2648         movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2649         lea .Lcts_permute_table, T1
2650 #else
2651         lea .Lcts_permute_table(%rip), T1
2652 #endif
2653         mov 480(KEYP), KLEN             /* presumably ctx->key_length - confirm */
2654         add $240, KEYP                  /* presumably the decryption key schedule - confirm */
2655         movups (IVP), IV
2656         sub $16, LEN                    /* LEN = bytes in the final block */
2657         mov T1, IVP
2658         add $32, IVP
2659         add LEN, T1                     /* T1  = table + LEN */
2660         sub LEN, IVP                    /* IVP = table + 32 - LEN */
2661         movups (T1), %xmm4              /* mask: first LEN bytes -> top, rest zero */
2662
2663         movups (INP), STATE             /* first ciphertext block */
2664         add LEN, INP
2665         movups (INP), IN1               /* last 16 bytes of the input */
2666
2667         call _aesni_dec1
2668         movaps STATE, IN2               /* keep the raw decryption for the blend below */
2669         pshufb %xmm4, STATE
2670         pxor IN1, STATE                 /* top LEN bytes = final plaintext bytes */
2671
2672         add OUTP, LEN                   /* LEN is now a pointer: dst + (len - 16) */
2673         movups STATE, (LEN)             /* final bytes land at dst + 16; the low part is overwritten below */
2674
2675         movups (IVP), %xmm0             /* shuffle mask, and implicit selector for pblendvb */
2676         pshufb %xmm0, IN1               /* partial ciphertext tail -> bottom, rest zero */
2677         pblendvb IN2, IN1               /* fill the remaining bytes from the decrypted block */
2678         movaps IN1, STATE
2679         call _aesni_dec1
2680
2681         pxor IV, STATE
2682         movups STATE, (OUTP)
2683
2684 #ifndef __x86_64__
2685         popl KLEN
2686         popl KEYP
2687         popl LEN
2688         popl IVP
2689 #endif
2690         FRAME_END
2691         ret
2692 SYM_FUNC_END(aesni_cts_cbc_dec)
2693
2694 .pushsection .rodata
2695 .align 16
     /*
      * pshufb masks for ciphertext stealing.  The middle 16 bytes are the
      * identity permutation; the 16 bytes on either side are 0x80, whose set
      * top bit makes pshufb emit a zero byte.  A 16-byte load at
      * .Lcts_permute_table + n (0 <= n <= 32) therefore yields a mask that
      * shifts a register by whole bytes and zero-fills the rest.
      */
2696 .Lcts_permute_table:
2697         .byte           0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2698         .byte           0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2699         .byte           0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
2700         .byte           0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
2701         .byte           0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2702         .byte           0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2703 #ifdef __x86_64__
     /* pshufb mask that reverses the order of all 16 bytes */
2704 .Lbswap_mask:
2705         .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2706 #endif
2707 .popsection
2708
2709 #ifdef __x86_64__
2710 /*
2711  * _aesni_inc_init:     internal ABI
2712  *      setup registers used by _aesni_inc
2713  * input:
2714  *      IV
2715  * output:
2716  *      CTR:    == IV, in little endian
2717  *      TCTR_LOW: == lower qword of CTR
2718  *      INC:    == 1, in little endian
2719  *      BSWAP_MASK == endian swapping mask
      *
      * This routine is only built for x86_64, so the mask is loaded with a
      * RIP-relative reference like the other 64-bit data references in this
      * file; that avoids an absolute relocation.
2720  */
2721 SYM_FUNC_START_LOCAL(_aesni_inc_init)
2722         movaps .Lbswap_mask(%rip), BSWAP_MASK
2723         movaps IV, CTR
2724         pshufb BSWAP_MASK, CTR          /* byte-reverse IV into CTR */
2725         mov $1, TCTR_LOW
2726         movq TCTR_LOW, INC              /* INC = 1 in the low qword, high qword zero */
2727         movq CTR, TCTR_LOW              /* integer copy of the low qword, for carry detection */
2728         ret
2729 SYM_FUNC_END(_aesni_inc_init)
2730
2731 /*
2732  * _aesni_inc:          internal ABI
2733  *      Increase IV by 1, IV is in big endian
2734  * input:
2735  *      IV
2736  *      CTR:    == IV, in little endian
2737  *      TCTR_LOW: == lower qword of CTR
2738  *      INC:    == 1, in little endian
2739  *      BSWAP_MASK == endian swapping mask
2740  * output:
2741  *      IV:     Increase by 1
2742  * changed:
2743  *      CTR:    == output IV, in little endian
2744  *      TCTR_LOW: == lower qword of CTR
      *
      * paddq adds within each qword independently, so a carry out of the low
      * qword is detected on the integer copy in TCTR_LOW and then added to
      * the high qword by temporarily shifting INC up by 8 bytes.
2745  */
2746 SYM_FUNC_START_LOCAL(_aesni_inc)
2747         paddq INC, CTR                  /* low qword of CTR += 1 */
2748         add $1, TCTR_LOW                /* same increment on the integer copy; sets CF on wrap */
2749         jnc .Linc_low
2750         pslldq $8, INC                  /* move the 1 into the high qword */
2751         paddq INC, CTR                  /* propagate the carry */
2752         psrldq $8, INC                  /* restore INC */
2753 .Linc_low:
2754         movaps CTR, IV
2755         pshufb BSWAP_MASK, IV           /* back to big endian */
2756         ret
2757 SYM_FUNC_END(_aesni_inc)
2758
2759 /*
2760  * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2761  *                    size_t len, u8 *iv)
      *
      * CTR-mode en/decrypt the whole 16-byte blocks of src into dst.  Each
      * block is XORed with the encryption of the current counter value; the
      * counter is then advanced with _aesni_inc.  Four blocks are handled
      * per iteration through _aesni_enc4 while at least 64 bytes remain,
      * then single blocks through _aesni_enc1.  The next unused counter
      * value is written back to *iv.  If len < 16 nothing is processed and
      * *iv is left unchanged; a tail shorter than 16 bytes is ignored.
      *
      * len is an unsigned byte count, so the loop tests use the unsigned jae,
      * matching the unsigned jb used for the entry checks.
2762  */
2763 SYM_FUNC_START(aesni_ctr_enc)
2764         FRAME_BEGIN
2765         cmp $16, LEN
2766         jb .Lctr_enc_just_ret           /* less than one block: leave *iv alone */
2767         mov 480(KEYP), KLEN             /* presumably ctx->key_length - confirm */
2768         movups (IVP), IV
2769         call _aesni_inc_init
2770         cmp $64, LEN
2771         jb .Lctr_enc_loop1
2772 .align 4
2773 .Lctr_enc_loop4:
             /* capture four consecutive counter values, loading the input in between */
2774         movaps IV, STATE1
2775         call _aesni_inc
2776         movups (INP), IN1
2777         movaps IV, STATE2
2778         call _aesni_inc
2779         movups 0x10(INP), IN2
2780         movaps IV, STATE3
2781         call _aesni_inc
2782         movups 0x20(INP), IN3
2783         movaps IV, STATE4
2784         call _aesni_inc
2785         movups 0x30(INP), IN4
2786         call _aesni_enc4                /* STATE1..4 = keystream blocks */
2787         pxor IN1, STATE1
2788         movups STATE1, (OUTP)
2789         pxor IN2, STATE2
2790         movups STATE2, 0x10(OUTP)
2791         pxor IN3, STATE3
2792         movups STATE3, 0x20(OUTP)
2793         pxor IN4, STATE4
2794         movups STATE4, 0x30(OUTP)
2795         sub $64, LEN
2796         add $64, INP
2797         add $64, OUTP
2798         cmp $64, LEN
2799         jae .Lctr_enc_loop4             /* unsigned: LEN is a byte count */
2800         cmp $16, LEN
2801         jb .Lctr_enc_ret
2802 .align 4
2803 .Lctr_enc_loop1:
2804         movaps IV, STATE
2805         call _aesni_inc
2806         movups (INP), IN
2807         call _aesni_enc1
2808         pxor IN, STATE
2809         movups STATE, (OUTP)
2810         sub $16, LEN
2811         add $16, INP
2812         add $16, OUTP
2813         cmp $16, LEN
2814         jae .Lctr_enc_loop1             /* unsigned: LEN is a byte count */
2815 .Lctr_enc_ret:
2816         movups IV, (IVP)
2817 .Lctr_enc_just_ret:
2818         FRAME_END
2819         ret
2820 SYM_FUNC_END(aesni_ctr_enc)
2821
2822 #endif
2823
2824 .section        .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
2825 .align 16
     /*
      * Mask for _aesni_gf128mul_x_ble: 0x87 in the low qword and 0x01 in the
      * high qword.
      */
2826 .Lgf128mul_x_ble_mask:
2827         .octa 0x00000000000000010000000000000087
2828 .previous
2829
2830 /*
2831  * _aesni_gf128mul_x_ble:               internal ABI
2832  *      Multiply in GF(2^128) for XTS IVs
2833  * input:
2834  *      IV:     current IV
2835  *      GF128MUL_MASK == mask with 0x87 and 0x01
2836  * output:
2837  *      IV:     next IV
2838  * changed:
2839  *      KEY:    == temporary value
      *
      * paddq doubles the two qwords of IV independently, which drops bit 63
      * and bit 127.  pshufd $0x13 copies the dwords holding those bits into
      * dwords 2 and 0 of KEY, psrad $31 expands their sign bits to all-ones or
      * all-zeroes, and pand keeps 0x01 resp. 0x87 from the mask.  The final
      * pxor carries bit 63 into the high qword and folds bit 127 back into
      * the low qword as 0x87.
2840  */
2841 #define _aesni_gf128mul_x_ble() \
2842         pshufd $0x13, IV, KEY; \
2843         paddq IV, IV; \
2844         psrad $31, KEY; \
2845         pand GF128MUL_MASK, KEY; \
2846         pxor KEY, IV;
2847
2848 /*
2849  * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
2850  *                        const u8 *src, unsigned int len, le128 *iv)
      *
      * XTS-encrypt len bytes from src into dst.  IV holds the tweak of the
      * next block and is advanced with _aesni_gf128mul_x_ble() after every
      * block.  Full blocks are handled four at a time (.Lxts_enc_loop4) and
      * then singly (.Lxts_enc_loop1).  When len is not a multiple of 16 the
      * tail is handled by ciphertext stealing (.Lxts_enc_cts4 and
      * .Lxts_enc_cts1).
      *
      * The updated tweak is written back to *iv, except on the ciphertext
      * stealing path, which reuses IVP as a scratch pointer.
      *
      * NOTE(review): nothing here rejects 0 < len < 16; the stealing path
      * would then use an uninitialised STATE4 - confirm callers pass at least
      * one full block.
2851  */
2852 SYM_FUNC_START(aesni_xts_encrypt)
2853         FRAME_BEGIN
2854 #ifndef __x86_64__
2855         pushl IVP
2856         pushl LEN
2857         pushl KEYP
2858         pushl KLEN
2859         movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2860         movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2861         movl (FRAME_OFFSET+28)(%esp), INP       # src
2862         movl (FRAME_OFFSET+32)(%esp), LEN       # len
2863         movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2864         movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2865 #else
2866         movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
2867 #endif
2868         movups (IVP), IV
2869
2870         mov 480(KEYP), KLEN             /* presumably ctx->key_length - confirm */
2871
2872 .Lxts_enc_loop4:
2873         sub $64, LEN
2874         jl .Lxts_enc_1x                 /* fewer than 64 bytes left */
2875
             /*
              * For each of the four blocks: STATEn = src ^ tweak.  The tweak
              * is parked in the matching dst slot until after the encryption.
              */
2876         movdqa IV, STATE1
2877         movdqu 0x00(INP), IN
2878         pxor IN, STATE1
2879         movdqu IV, 0x00(OUTP)
2880
2881         _aesni_gf128mul_x_ble()
2882         movdqa IV, STATE2
2883         movdqu 0x10(INP), IN
2884         pxor IN, STATE2
2885         movdqu IV, 0x10(OUTP)
2886
2887         _aesni_gf128mul_x_ble()
2888         movdqa IV, STATE3
2889         movdqu 0x20(INP), IN
2890         pxor IN, STATE3
2891         movdqu IV, 0x20(OUTP)
2892
2893         _aesni_gf128mul_x_ble()
2894         movdqa IV, STATE4
2895         movdqu 0x30(INP), IN
2896         pxor IN, STATE4
2897         movdqu IV, 0x30(OUTP)
2898
2899         call _aesni_enc4
2900
             /* fetch each parked tweak, apply it, and store the ciphertext over it */
2901         movdqu 0x00(OUTP), IN
2902         pxor IN, STATE1
2903         movdqu STATE1, 0x00(OUTP)
2904
2905         movdqu 0x10(OUTP), IN
2906         pxor IN, STATE2
2907         movdqu STATE2, 0x10(OUTP)
2908
2909         movdqu 0x20(OUTP), IN
2910         pxor IN, STATE3
2911         movdqu STATE3, 0x20(OUTP)
2912
2913         movdqu 0x30(OUTP), IN
2914         pxor IN, STATE4
2915         movdqu STATE4, 0x30(OUTP)
2916
2917         _aesni_gf128mul_x_ble()         /* IV = tweak of the next block */
2918
2919         add $64, INP
2920         add $64, OUTP
2921         test LEN, LEN
2922         jnz .Lxts_enc_loop4             /* LEN == 0: all input consumed */
2923
2924 .Lxts_enc_ret_iv:
2925         movups IV, (IVP)
2926
2927 .Lxts_enc_ret:
2928 #ifndef __x86_64__
2929         popl KLEN
2930         popl KEYP
2931         popl LEN
2932         popl IVP
2933 #endif
2934         FRAME_END
2935         ret
2936
2937 .Lxts_enc_1x:
2938         add $64, LEN                    /* undo the sub $64 above */
2939         jz .Lxts_enc_ret_iv
2940         sub $16, LEN
2941         jl .Lxts_enc_cts4               /* only a partial block is left */
2942
2943 .Lxts_enc_loop1:
2944         movdqu (INP), STATE
2945         pxor IV, STATE
2946         call _aesni_enc1
2947         pxor IV, STATE
2948         _aesni_gf128mul_x_ble()
2949
2950         test LEN, LEN
2951         jz .Lxts_enc_out                /* that was the last block */
2952
2953         add $16, INP
2954         sub $16, LEN
2955         jl .Lxts_enc_cts1               /* partial tail follows: STATE is not stored yet */
2956
2957         movdqu STATE, (OUTP)
2958         add $16, OUTP
2959         jmp .Lxts_enc_loop1
2960
2961 .Lxts_enc_out:
2962         movdqu STATE, (OUTP)
2963         jmp .Lxts_enc_ret_iv
2964
2965 .Lxts_enc_cts4:
             /* the last full ciphertext block is STATE4 of the 4-block round above */
2966         movdqa STATE4, STATE
2967         sub $16, OUTP                   /* point OUTP back at its slot */
2968
2969 .Lxts_enc_cts1:
             /*
              * Ciphertext stealing.  Here STATE is the last full ciphertext
              * block, OUTP points at its slot and LEN = tail bytes - 16.
              */
2970 #ifndef __x86_64__
2971         lea .Lcts_permute_table, T1
2972 #else
2973         lea .Lcts_permute_table(%rip), T1
2974 #endif
2975         add LEN, INP            /* rewind input pointer */
2976         add $16, LEN            /* # bytes in final block */
2977         movups (INP), IN1               /* last 16 bytes of the input */
2978
2979         mov T1, IVP
2980         add $32, IVP
2981         add LEN, T1                     /* T1  = table + tail bytes */
2982         sub LEN, IVP                    /* IVP = table + 32 - tail bytes */
2983         add OUTP, LEN                   /* LEN is now a pointer: OUTP + tail bytes */
2984
2985         movups (T1), %xmm4              /* mask: first bytes -> top, rest zero */
2986         movaps STATE, IN2
2987         pshufb %xmm4, STATE
2988         movups STATE, (LEN)             /* stolen bytes land at OUTP + 16; the low part is overwritten below */
2989
2990         movups (IVP), %xmm0             /* shuffle mask, and implicit selector for pblendvb */
2991         pshufb %xmm0, IN1               /* plaintext tail -> bottom, rest zero */
2992         pblendvb IN2, IN1               /* fill the remaining bytes from the ciphertext block */
2993         movaps IN1, STATE
2994
2995         pxor IV, STATE
2996         call _aesni_enc1
2997         pxor IV, STATE
2998
2999         movups STATE, (OUTP)
3000         jmp .Lxts_enc_ret               /* IVP was clobbered: no tweak write-back */
3001 SYM_FUNC_END(aesni_xts_encrypt)
3002
3003 /*
3004  * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
3005  *                        const u8 *src, unsigned int len, le128 *iv)
      *
      * XTS-decrypt len bytes from src into dst.  IV holds the tweak of the
      * next block and is advanced with _aesni_gf128mul_x_ble() after every
      * block.  Full blocks are handled four at a time (.Lxts_dec_loop4) and
      * then singly (.Lxts_dec_loop1).
      *
      * When len is not a multiple of 16, the last full block and the partial
      * tail are handled by ciphertext stealing (.Lxts_dec_cts1).  The last
      * full block is then decrypted with the tweak that follows the one used
      * for the rebuilt final block, so LEN is reduced by 16 up front to keep
      * that block out of the four-block loop.
      *
      * The updated tweak is written back to *iv, except on the ciphertext
      * stealing path, which reuses IVP as a scratch pointer.
3006  */
3007 SYM_FUNC_START(aesni_xts_decrypt)
3008         FRAME_BEGIN
3009 #ifndef __x86_64__
3010         pushl IVP
3011         pushl LEN
3012         pushl KEYP
3013         pushl KLEN
3014         movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
3015         movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
3016         movl (FRAME_OFFSET+28)(%esp), INP       # src
3017         movl (FRAME_OFFSET+32)(%esp), LEN       # len
3018         movl (FRAME_OFFSET+36)(%esp), IVP       # iv
3019         movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
3020 #else
3021         movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
3022 #endif
3023         movups (IVP), IV
3024
3025         mov 480(KEYP), KLEN             /* presumably ctx->key_length - confirm */
3026         add $240, KEYP                  /* presumably the decryption key schedule - confirm */
3027
3028         test $15, LEN
3029         jz .Lxts_dec_loop4
3030         sub $16, LEN                    /* partial tail: hold back the last full block */
3031
3032 .Lxts_dec_loop4:
3033         sub $64, LEN
3034         jl .Lxts_dec_1x                 /* fewer than 64 bytes left */
3035
             /*
              * For each of the four blocks: STATEn = src ^ tweak.  The tweak
              * is parked in the matching dst slot until after the decryption.
              */
3036         movdqa IV, STATE1
3037         movdqu 0x00(INP), IN
3038         pxor IN, STATE1
3039         movdqu IV, 0x00(OUTP)
3040
3041         _aesni_gf128mul_x_ble()
3042         movdqa IV, STATE2
3043         movdqu 0x10(INP), IN
3044         pxor IN, STATE2
3045         movdqu IV, 0x10(OUTP)
3046
3047         _aesni_gf128mul_x_ble()
3048         movdqa IV, STATE3
3049         movdqu 0x20(INP), IN
3050         pxor IN, STATE3
3051         movdqu IV, 0x20(OUTP)
3052
3053         _aesni_gf128mul_x_ble()
3054         movdqa IV, STATE4
3055         movdqu 0x30(INP), IN
3056         pxor IN, STATE4
3057         movdqu IV, 0x30(OUTP)
3058
3059         call _aesni_dec4
3060
             /* fetch each parked tweak, apply it, and store the plaintext over it */
3061         movdqu 0x00(OUTP), IN
3062         pxor IN, STATE1
3063         movdqu STATE1, 0x00(OUTP)
3064
3065         movdqu 0x10(OUTP), IN
3066         pxor IN, STATE2
3067         movdqu STATE2, 0x10(OUTP)
3068
3069         movdqu 0x20(OUTP), IN
3070         pxor IN, STATE3
3071         movdqu STATE3, 0x20(OUTP)
3072
3073         movdqu 0x30(OUTP), IN
3074         pxor IN, STATE4
3075         movdqu STATE4, 0x30(OUTP)
3076
3077         _aesni_gf128mul_x_ble()         /* IV = tweak of the next block */
3078
3079         add $64, INP
3080         add $64, OUTP
3081         test LEN, LEN
3082         jnz .Lxts_dec_loop4             /* LEN == 0: all input consumed */
3083
3084 .Lxts_dec_ret_iv:
3085         movups IV, (IVP)
3086
3087 .Lxts_dec_ret:
3088 #ifndef __x86_64__
3089         popl KLEN
3090         popl KEYP
3091         popl LEN
3092         popl IVP
3093 #endif
3094         FRAME_END
3095         ret
3096
3097 .Lxts_dec_1x:
3098         add $64, LEN                    /* undo the sub $64 above */
3099         jz .Lxts_dec_ret_iv
3100
3101 .Lxts_dec_loop1:
3102         movdqu (INP), STATE
3103
3104         add $16, INP
3105         sub $16, LEN
3106         jl .Lxts_dec_cts1               /* STATE is the held-back last full block */
3107
3108         pxor IV, STATE
3109         call _aesni_dec1
3110         pxor IV, STATE
3111         _aesni_gf128mul_x_ble()
3112
3113         test LEN, LEN
3114         jz .Lxts_dec_out                /* that was the last block */
3115
3116         movdqu STATE, (OUTP)
3117         add $16, OUTP
3118         jmp .Lxts_dec_loop1
3119
3120 .Lxts_dec_out:
3121         movdqu STATE, (OUTP)
3122         jmp .Lxts_dec_ret_iv
3123
3124 .Lxts_dec_cts1:
             /*
              * Ciphertext stealing.  STATE is the last full ciphertext block;
              * it is decrypted with the following tweak, while the current
              * tweak is kept in STATE4 for the block rebuilt from the tail.
              */
3125         movdqa IV, STATE4
3126         _aesni_gf128mul_x_ble()
3127
3128         pxor IV, STATE
3129         call _aesni_dec1
3130         pxor IV, STATE
3131
3132 #ifndef __x86_64__
3133         lea .Lcts_permute_table, T1
3134 #else
3135         lea .Lcts_permute_table(%rip), T1
3136 #endif
3137         add LEN, INP            /* rewind input pointer */
3138         add $16, LEN            /* # bytes in final block */
3139         movups (INP), IN1               /* last 16 bytes of the input */
3140
3141         mov T1, IVP
3142         add $32, IVP
3143         add LEN, T1                     /* T1  = table + tail bytes */
3144         sub LEN, IVP                    /* IVP = table + 32 - tail bytes */
3145         add OUTP, LEN                   /* LEN is now a pointer: OUTP + tail bytes */
3146
3147         movups (T1), %xmm4              /* mask: first bytes -> top, rest zero */
3148         movaps STATE, IN2
3149         pshufb %xmm4, STATE
3150         movups STATE, (LEN)             /* final bytes land at OUTP + 16; the low part is overwritten below */
3151
3152         movups (IVP), %xmm0             /* shuffle mask, and implicit selector for pblendvb */
3153         pshufb %xmm0, IN1               /* ciphertext tail -> bottom, rest zero */
3154         pblendvb IN2, IN1               /* fill the remaining bytes from the decrypted block */
3155         movaps IN1, STATE
3156
3157         pxor STATE4, STATE              /* the saved (earlier) tweak */
3158         call _aesni_dec1
3159         pxor STATE4, STATE
3160
3161         movups STATE, (OUTP)
3162         jmp .Lxts_dec_ret               /* IVP was clobbered: no tweak write-back */
3163 SYM_FUNC_END(aesni_xts_decrypt)