arch/x86/crypto/aesni-intel_asm.S

   1 /*
   2  * Implement AES algorithm in Intel AES-NI instructions.
   3  *
   4  * The white paper of AES-NI instructions can be downloaded from:
   5  *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
   6  *
   7  * Copyright (C) 2008, Intel Corp.
   8  *    Author: Huang Ying <ying.huang@intel.com>
   9  *            Vinodh Gopal <vinodh.gopal@intel.com>
  10  *            Kahraman Akdemir
  11  *
  12  * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
  13  * interface for 64-bit kernels.
  14  *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
  15  *             Aidan O'Mahony (aidan.o.mahony@intel.com)
  16  *             Adrian Hoban <adrian.hoban@intel.com>
  17  *             James Guilford (james.guilford@intel.com)
  18  *             Gabriele Paoloni <gabriele.paoloni@intel.com>
  19  *             Tadeusz Struk (tadeusz.struk@intel.com)
  20  *             Wajdi Feghali (wajdi.k.feghali@intel.com)
  21  *    Copyright (c) 2010, Intel Corporation.
  22  *
  23  * Ported x86_64 version to x86:
  24  *    Author: Mathias Krause <minipli@googlemail.com>
  25  *
  26  * This program is free software; you can redistribute it and/or modify
  27  * it under the terms of the GNU General Public License as published by
  28  * the Free Software Foundation; either version 2 of the License, or
  29  * (at your option) any later version.
  30  */
  31
  32 #include <linux/linkage.h>
  33 #include <asm/inst.h>
  34 #include <asm/frame.h>
  35 #include <asm/nospec-branch.h>
  36
  37 /*
  38  * The following macros are used to move an (un)aligned 16 byte value to/from
  39  * an XMM register.  This can be done for either FP or integer values, for FP use
  40  * movaps (move aligned packed single) or integer use movdqa (move double quad
  41  * aligned).  It doesn't make a performance difference which instruction is used
  42  * since Nehalem (original Core i7) was released.  However, the movaps is a byte
  43  * shorter, so that is the one we'll use for now. (same for unaligned).
  44  */
  45 #define MOVADQ  movaps
  46 #define MOVUDQ  movups
  47
  48 #ifdef __x86_64__
  49
  50 # constants in mergeable sections, linker can reorder and merge
     # (each constant below gets its own 16-byte mergeable section; constants
     # without a usage note are not referenced in the GCM macros that follow)
  51 .section        .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
  52 .align 16
  53 .Lgf128mul_x_ble_mask:
  54         .octa 0x00000000000000010000000000000087
  55 .section        .rodata.cst16.POLY, "aM", @progbits, 16
  56 .align 16
  57 POLY:   .octa 0xC2000000000000000000000000000001  # XORed into HashKey<<1 by GCM_INIT when a bit was shifted out
  58 .section        .rodata.cst16.TWOONE, "aM", @progbits, 16
  59 .align 16
  60 TWOONE: .octa 0x00000001000000000000000000000001  # pcmpeqd operand that turns the shifted-out bit into a mask (GCM_INIT)
  61
  62 .section        .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
  63 .align 16
  64 SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F  # PSHUFB control that reverses the 16 bytes of a register
  65 .section        .rodata.cst16.MASK1, "aM", @progbits, 16
  66 .align 16
  67 MASK1:      .octa 0x0000000000000000ffffffffffffffff
  68 .section        .rodata.cst16.MASK2, "aM", @progbits, 16
  69 .align 16
  70 MASK2:      .octa 0xffffffffffffffff0000000000000000
  71 .section        .rodata.cst16.ONE, "aM", @progbits, 16
  72 .align 16
  73 ONE:        .octa 0x00000000000000000000000000000001  # paddd increment for the counter block
  74 .section        .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
  75 .align 16
  76 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0  # NOTE(review): literal has 31 hex digits, so the top nibble is 0 - confirm intended
  77 .section        .rodata.cst16.dec, "aM", @progbits, 16
  78 .align 16
  79 dec:        .octa 0x1
  80 .section        .rodata.cst16.enc, "aM", @progbits, 16
  81 .align 16
  82 enc:        .octa 0x2
  83
  84 # The order of the next three constants must not change: ALL_F has to follow
  85 # SHIFT_MASK and the zero block has to follow ALL_F.  GCM_ENC_DEC loads from
  86 # ALL_F+16-len to obtain len bytes of 0xff followed by zero bytes.
     # They live in plain .rodata (not a mergeable section) so they stay adjacent.
     # SHIFT_MASK is not referenced in this part of the file; presumably it is
     # indexed the same way elsewhere - TODO confirm.
  87 .section        .rodata, "a", @progbits
  88 .align 16
  89 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  90 ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
  91             .octa 0x00000000000000000000000000000000  # zero block; must directly follow ALL_F
  92
  93 .text
  94
  95
  96 #define STACK_OFFSET    8*3     // bytes pushed by FUNC_SAVE (r12, r13, r14)
  97 #define HashKey         16*0    // store HashKey <<1 mod poly here
  98 #define HashKey_2       16*1    // store HashKey^2 <<1 mod poly here
  99 #define HashKey_3       16*2    // store HashKey^3 <<1 mod poly here
 100 #define HashKey_4       16*3    // store HashKey^4 <<1 mod poly here
 101 #define HashKey_k       16*4    // store XOR of High 64 bits and Low 64
 102                                 // bits of  HashKey <<1 mod poly here
 103                                 //(for Karatsuba purposes)
 104 #define HashKey_2_k     16*5    // store XOR of High 64 bits and Low 64
 105                                 // bits of  HashKey^2 <<1 mod poly here
 106                                 // (for Karatsuba purposes)
 107 #define HashKey_3_k     16*6    // store XOR of High 64 bits and Low 64
 108                                 // bits of  HashKey^3 <<1 mod poly here
 109                                 // (for Karatsuba purposes)
 110 #define HashKey_4_k     16*7    // store XOR of High 64 bits and Low 64
 111                                 // bits of  HashKey^4 <<1 mod poly here
 112                                 // (for Karatsuba purposes)
 113 #define VARIABLE_OFFSET 16*8    // size of the %rsp scratch area (the 8 slots above) reserved by FUNC_SAVE
 114 /* Byte offsets into the gcm context data that the macros address through %arg2 */
 115 #define AadHash 16*0            // running GHASH value (first written by CALC_AAD_HASH)
 116 #define AadLen 16*1             // ctx_data.aad_length (8 bytes)
 117 #define InLen (16*1)+8          // ctx_data.in_length (8 bytes, shares the 16-byte slot with AadLen)
 118 #define PBlockEncKey 16*2       // ctx_data.partial_block_enc_key
 119 #define OrigIV 16*3             // ctx_data.orig_IV
 120 #define CurCount 16*4           // ctx_data.current_counter
 121 #define PBlockLen 16*5          // ctx_data.partial_block_length
 122 /* Arguments; roles below are the ones the GCM macros in this file give them */
 123 #define arg1 rdi                // AES round keys; key length is read at keysize
 124 #define arg2 rsi                // gcm context data (offsets above)
 125 #define arg3 rdx                // output buffer
 126 #define arg4 rcx                // input buffer
 127 #define arg5 r8                 // number of input bytes
 128 #define arg6 r9                 // pointer to the 16-byte IV / initial counter block
 129 #define arg7 STACK_OFFSET+8(%r14)       // pointer to the hash subkey; %r14 is set by FUNC_SAVE, +8 presumably skips the return address
 130 #define arg8 STACK_OFFSET+16(%r14)      // pointer to the AAD
 131 #define arg9 STACK_OFFSET+24(%r14)      // AAD length in bytes
 132 #define arg10 STACK_OFFSET+32(%r14)     // pointer to the authentication tag output
 133 #define arg11 STACK_OFFSET+40(%r14)     // authentication tag length in bytes
 134 #define keysize 2*15*16(%arg1)          // 32-bit key length in bytes, stored after 2*15 16-byte round-key slots
 135 #endif
 136
 137
 138 #define STATE1  %xmm0   // symbolic register names; the code using them is outside this part of the file
 139 #define STATE2  %xmm4
 140 #define STATE3  %xmm5
 141 #define STATE4  %xmm6
 142 #define STATE   STATE1  // alias of STATE1
 143 #define IN1     %xmm1
 144 #define IN2     %xmm7
 145 #define IN3     %xmm8
 146 #define IN4     %xmm9
 147 #define IN      IN1     // alias of IN1
 148 #define KEY     %xmm2
 149 #define IV      %xmm3
 150
 151 #define BSWAP_MASK %xmm10
 152 #define CTR     %xmm11
 153 #define INC     %xmm12
 154
 155 #define GF128MUL_MASK %xmm10    // same register as BSWAP_MASK, so the two cannot be live together
 156
 157 #ifdef __x86_64__
 158 #define AREG    %rax
 159 #define KEYP    %rdi
 160 #define OUTP    %rsi
 161 #define UKEYP   OUTP    // same register as OUTP
 162 #define INP     %rdx
 163 #define LEN     %rcx
 164 #define IVP     %r8
 165 #define KLEN    %r9d
 166 #define T1      %r10
 167 #define TKEYP   T1      // same register as T1
 168 #define T2      %r11
 169 #define TCTR_LOW T2     // same register as T2; T2 and TCTR_LOW exist only in the 64-bit set
 170 #else
 171 #define AREG    %eax
 172 #define KEYP    %edi
 173 #define OUTP    AREG    // in the 32-bit set OUTP (and therefore UKEYP) is AREG itself
 174 #define UKEYP   OUTP
 175 #define INP     %edx
 176 #define LEN     %esi
 177 #define IVP     %ebp
 178 #define KLEN    %ebx
 179 #define T1      %ecx
 180 #define TKEYP   T1
 181 #endif
 182
 183 .macro FUNC_SAVE
             # Prologue paired with FUNC_RESTORE.  Saves r12-r14, records the
             # stack pointer in %r14 (the base register of the arg7..arg11
             # defines) and reserves VARIABLE_OFFSET bytes of scratch space
             # below it, addressed through %rsp by the HashKey* offsets.
 184         push    %r12
 185         push    %r13
 186         push    %r14
 187         mov     %rsp, %r14              # %r14 = stack pointer after the three pushes (STACK_OFFSET bytes)
 188 #
 189 # states of %xmm registers %xmm6:%xmm15 not saved
 190 # all %xmm registers are clobbered
 191 #
 192         sub     $VARIABLE_OFFSET, %rsp
 193         and     $~63, %rsp              # round %rsp down to a 64-byte boundary so movdqa on the slots is legal
 194 .endm
 195
 196
 197 .macro FUNC_RESTORE
             # Epilogue paired with FUNC_SAVE: drops the aligned scratch area by
             # reloading %rsp from %r14, then pops the saved registers in reverse
             # push order.  %r14 must still hold the value FUNC_SAVE stored in it.
 198         mov     %r14, %rsp
 199         pop     %r14
 200         pop     %r13
 201         pop     %r12
 202 .endm
 203
 204
 205 # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
     # Inputs: %arg2 = context data, %arg6 = pointer to the 16-byte IV block,
     # arg7 = pointer to the hash subkey, arg8/arg9 = AAD pointer and length
     # (both consumed by CALC_AAD_HASH).  Must run after FUNC_SAVE, because
     # arg7-arg9 are addressed through %r14 and HashKey is stored at HashKey(%rsp).
     # Results: AadLen, InLen, PBlockLen, PBlockEncKey, OrigIV, CurCount and
     # AadHash in the context; HashKey<<1 mod poly in the HashKey stack slot.
 206 # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13; also %xmm14 via CALC_AAD_HASH
 207 .macro GCM_INIT
 208
 209         mov arg9, %r11
 210         mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
 211         xor %r11, %r11
 212         mov %r11, InLen(%arg2) # ctx_data.in_length = 0
 213         mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
 214         mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0 (this 64-bit store clears only its low 8 bytes)
 215         mov %arg6, %rax
 216         movdqu (%rax), %xmm0
 217         movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
 218
 219         movdqa  SHUF_MASK(%rip), %xmm2
 220         PSHUFB_XMM %xmm2, %xmm0
 221         movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = byte-reversed iv
 222
 223         mov     arg7, %r12              # %r12 = pointer to the hash subkey
 224         movdqu  (%r12), %xmm13
 225         movdqa  SHUF_MASK(%rip), %xmm2
 226         PSHUFB_XMM %xmm2, %xmm13        # byte-reverse the hash subkey
 227
 228         # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
 229
 230         movdqa  %xmm13, %xmm2
 231         psllq   $1, %xmm13              # shift each 64-bit half left by one
 232         psrlq   $63, %xmm2              # %xmm2 = the bit shifted out of each half
 233         movdqa  %xmm2, %xmm1
 234         pslldq  $8, %xmm2               # carry out of the low half, moved into the high half
 235         psrldq  $8, %xmm1               # carry out of the high half (bit 127), moved to bit 0
 236         por     %xmm2, %xmm13           # %xmm13 = HashKey << 1 as a 128-bit value
 237
 238         # reduce HashKey<<1
 239
 240         pshufd  $0x24, %xmm1, %xmm2     # copy the bit-127 carry into dwords 0 and 3
 241         pcmpeqd TWOONE(%rip), %xmm2     # all ones when the carry was 1, otherwise dwords 0 and 3 are zero
 242         pand    POLY(%rip), %xmm2       # %xmm2 = POLY when the carry was 1, otherwise 0
 243         pxor    %xmm2, %xmm13
 244         movdqa  %xmm13, HashKey(%rsp)   # aligned store; FUNC_SAVE aligned %rsp
 245
             # hash the AAD; the result is written to AadHash in the context
 246         CALC_AAD_HASH %xmm13 %xmm0 %xmm1 %xmm2 %xmm3 %xmm4 \
 247         %xmm5 %xmm6
 248 .endm
 249
 250 # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
 251 # struct has been initialized by GCM_INIT.
     # Parameter: operation = enc or dec (selects the .ifc branches and the
     # GHASH_4_ENCRYPT_4_PARALLEL_ variant).
     # Inputs: %arg1 = round keys, %arg2 = context data, %arg3 = output buffer,
     # %arg4 = input buffer, %arg5 = number of input bytes; HashKey(%rsp) valid.
     # Register roles inside the macro: %r11 = byte offset into input/output
     # (zeroed by INITIAL_BLOCKS_ENC_DEC), %r13 = bytes left in whole blocks,
     # %xmm0 = counter block, %xmm8 = running GHASH value.
     # Results: output bytes written, InLen/AadHash/CurCount updated, and for a
     # trailing partial block PBlockLen and PBlockEncKey as well.
 252 # Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
 253 # Clobbers rax, r10-r13, and xmm0-xmm15
 254 .macro GCM_ENC_DEC operation
 255         movdqu AadHash(%arg2), %xmm8
 256         movdqu HashKey(%rsp), %xmm13
 257         add %arg5, InLen(%arg2)  # ctx_data.in_length += bytes handled by this invocation
 258         mov %arg5, %r13         # save the number of bytes
 259         and $-16, %r13          # %r13 = %r13 - (%r13 mod 16)
 260         mov %r13, %r12
 261         # Encrypt/Decrypt first few blocks
 262
 263         and     $(3<<4), %r12   # %r12 = 16 * (number of whole blocks mod 4)
 264         jz      _initial_num_blocks_is_0_\@
 265         cmp     $(2<<4), %r12
 266         jb      _initial_num_blocks_is_1_\@
 267         je      _initial_num_blocks_is_2_\@
 268 _initial_num_blocks_is_3_\@:
 269         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 270 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
 271         sub     $48, %r13
 272         jmp     _initial_blocks_\@
 273 _initial_num_blocks_is_2_\@:
 274         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 275 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
 276         sub     $32, %r13
 277         jmp     _initial_blocks_\@
 278 _initial_num_blocks_is_1_\@:
 279         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 280 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
 281         sub     $16, %r13
 282         jmp     _initial_blocks_\@
 283 _initial_num_blocks_is_0_\@:
 284         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 285 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
 286 _initial_blocks_\@:
 287
 288         # Main loop - Encrypt/Decrypt remaining blocks
             # Here %r13 is a multiple of 64.  When it is nonzero, the macro above
             # has already produced the first four blocks of output and left them
             # (byte-reversed) in %xmm1-%xmm4 without hashing them.
 289
 290         cmp     $0, %r13
 291         je      _zero_cipher_left_\@
 292         sub     $64, %r13
 293         je      _four_cipher_left_\@
 294 _crypt_by_4_\@:
             # the trailing argument is the literal enc for both operations
 295         GHASH_4_ENCRYPT_4_PARALLEL_\operation   %xmm9, %xmm10, %xmm11, %xmm12, \
 296         %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
 297         %xmm7, %xmm8, enc
 298         add     $64, %r11
 299         sub     $64, %r13
 300         jne     _crypt_by_4_\@
 301 _four_cipher_left_\@:
 302         GHASH_LAST_4    %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
 303 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
 304 _zero_cipher_left_\@:
 305         movdqu %xmm8, AadHash(%arg2)
 306         movdqu %xmm0, CurCount(%arg2)
 307
 308         mov     %arg5, %r13
 309         and     $15, %r13                       # %r13 = arg5 (mod 16)
 310         je      _multiple_of_16_bytes_\@
 311
 312         mov %r13, PBlockLen(%arg2)
 313
 314         # Handle the last <16 Byte block separately
 315         paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
 316         movdqu %xmm0, CurCount(%arg2)
 317         movdqa SHUF_MASK(%rip), %xmm10
 318         PSHUFB_XMM %xmm10, %xmm0
 319
 320         ENCRYPT_SINGLE_BLOCK    %xmm0, %xmm1        # Encrypt(K, Yn)
 321         movdqu %xmm0, PBlockEncKey(%arg2)
 322
 323         lea (%arg4,%r11,1), %r10        # %r10 = address of the trailing partial block
 324         mov %r13, %r12                  # READ_PARTIAL_BLOCK destroys its length register
 325         READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
 326
             # %r12 = ALL_F+16-%r13: the 16 bytes there are %r13 bytes of 0xff
             # followed by zero bytes (relies on the zero block placed after ALL_F)
 327         lea ALL_F+16(%rip), %r12
 328         sub %r13, %r12
 329 .ifc \operation, dec
 330         movdqa  %xmm1, %xmm2            # keep the input ciphertext for GHASH
 331 .endif
 332         pxor    %xmm1, %xmm0            # XOR Encrypt(K, Yn)
 333         movdqu  (%r12), %xmm1
 334         # get the appropriate mask to mask out top 16-r13 bytes of xmm0
 335         pand    %xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
 336 .ifc \operation, dec
 337         pand    %xmm1, %xmm2
 338         movdqa SHUF_MASK(%rip), %xmm10
 339         PSHUFB_XMM %xmm10 ,%xmm2
 340
 341         pxor %xmm2, %xmm8               # fold the masked input ciphertext into the hash
 342 .else
 343         movdqa SHUF_MASK(%rip), %xmm10
 344         PSHUFB_XMM %xmm10,%xmm0
 345
 346         pxor    %xmm0, %xmm8            # fold the masked output ciphertext into the hash
 347 .endif
 348
             # Only the XOR is done here; GCM_COMPLETE performs the GHASH multiply
             # for this block when PBlockLen is nonzero.
 349         movdqu %xmm8, AadHash(%arg2)
 350 .ifc \operation, enc
 351         # undo the byte reversal that was applied for the hash XOR above
 352         movdqa SHUF_MASK(%rip), %xmm10
 353         # shuffle xmm0 back to output as ciphertext
 354         PSHUFB_XMM %xmm10, %xmm0
 355 .endif
 356
 357         # Output %r13 bytes
             # (one 8-byte store when more than 8 bytes remain, then byte stores)
 358         MOVQ_R64_XMM %xmm0, %rax
 359         cmp $8, %r13
 360         jle _less_than_8_bytes_left_\@
 361         mov %rax, (%arg3 , %r11, 1)
 362         add $8, %r11
 363         psrldq $8, %xmm0
 364         MOVQ_R64_XMM %xmm0, %rax
 365         sub $8, %r13
 366 _less_than_8_bytes_left_\@:
 367         mov %al,  (%arg3, %r11, 1)
 368         add $1, %r11
 369         shr $8, %rax
 370         sub $1, %r13
 371         jne _less_than_8_bytes_left_\@
 372 _multiple_of_16_bytes_\@:
 373 .endm
 374
 375 # GCM_COMPLETE Finishes update of tag of last partial block
     # Inputs: %arg2 = context data (AadHash, AadLen, InLen, PBlockLen, OrigIV),
     # HashKey(%rsp), arg10 = tag output pointer, arg11 = tag length in bytes.
     # The tag is E(K, Y0) XOR GHASH, where the GHASH input ends with the block
     # len(A)||len(C) in bits.
 376 # Output: Authorization Tag (AUTH_TAG)
 377 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
 378 .macro GCM_COMPLETE
 379         movdqu AadHash(%arg2), %xmm8
 380         movdqu HashKey(%rsp), %xmm13
 381
 382         mov PBlockLen(%arg2), %r12
 383
 384         cmp $0, %r12
 385         je _partial_done\@
 386
             # a partial block was XORed into the hash by GCM_ENC_DEC but not yet
             # multiplied by the hash key; do that multiplication now
 387         GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
 388
 389 _partial_done\@:
 390         mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
 391         shl     $3, %r12                  # convert into number of bits
 392         movd    %r12d, %xmm15             # len(A) in %xmm15 (only the low 32 bits are copied)
 393         mov InLen(%arg2), %r12
 394         shl     $3, %r12                  # len(C) in bits
 395         MOVQ_R64_XMM    %r12, %xmm1
 396
 397         pslldq  $8, %xmm15                # %xmm15 = len(A)||0x0000000000000000
 398         pxor    %xmm1, %xmm15             # %xmm15 = len(A)||len(C)
 399         pxor    %xmm15, %xmm8
 400         GHASH_MUL       %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
 401         # final GHASH computation
 402         movdqa SHUF_MASK(%rip), %xmm10
 403         PSHUFB_XMM %xmm10, %xmm8
 404
 405         movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
 406         ENCRYPT_SINGLE_BLOCK    %xmm0,  %xmm1     # E(K, Y0)
 407         pxor    %xmm8, %xmm0              # %xmm0 = tag
 408 _return_T_\@:
             # Store the first auth_tag_len bytes of %xmm0: 16 in one store, else
             # an optional 8-byte store, then a 4-byte store, then 0-3 single bytes.
             # NOTE(review): the 4-byte store is unconditional once reached, so
             # lengths 1-3 and 9-11 would write past the requested length;
             # presumably callers never pass those values - confirm.
 409         mov     arg10, %r10                     # %r10 = authTag
 410         mov     arg11, %r11                    # %r11 = auth_tag_len
 411         cmp     $16, %r11
 412         je      _T_16_\@
 413         cmp     $8, %r11
 414         jl      _T_4_\@
 415 _T_8_\@:
 416         MOVQ_R64_XMM    %xmm0, %rax
 417         mov     %rax, (%r10)
 418         add     $8, %r10
 419         sub     $8, %r11
 420         psrldq  $8, %xmm0
 421         cmp     $0, %r11
 422         je      _return_T_done_\@
 423 _T_4_\@:
 424         movd    %xmm0, %eax
 425         mov     %eax, (%r10)
 426         add     $4, %r10
 427         sub     $4, %r11
 428         psrldq  $4, %xmm0
 429         cmp     $0, %r11
 430         je      _return_T_done_\@
 431 _T_123_\@:
 432         movd    %xmm0, %eax
 433         cmp     $2, %r11
 434         jl      _T_1_\@
 435         mov     %ax, (%r10)
 436         cmp     $2, %r11
 437         je      _return_T_done_\@
 438         add     $2, %r10
 439         sar     $16, %eax                 # bring the third remaining byte into %al
 440 _T_1_\@:
 441         mov     %al, (%r10)
 442         jmp     _return_T_done_\@
 443 _T_16_\@:
 444         movdqu  %xmm0, (%r10)
 445 _return_T_done_\@:
 446 .endm
 447
 448 #ifdef __x86_64__
 449 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 450 *
 451 *
 452 * Input: A and B (128-bits each, bit-reflected)
 453 * Output: C = A*B*x mod poly, (i.e. >>1 )
 454 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 455 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 456 *
     * Parameters: GH is both the first operand and the destination of the
     * reduced result; HK is only read; TMP1-TMP5 are scratch XMM registers
     * and are all overwritten.  The body uses XMM registers only.
     * The 256-bit product is built with three carry-less multiplies
     * (Karatsuba) and then reduced in two shift-and-xor phases.
 457 */
 458 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
 459         movdqa    \GH, \TMP1
 460         pshufd    $78, \GH, \TMP2
 461         pshufd    $78, \HK, \TMP3
 462         pxor      \GH, \TMP2            # TMP2 = a1+a0
 463         pxor      \HK, \TMP3            # TMP3 = b1+b0
 464         PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
 465         PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
 466         PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
 467         pxor      \GH, \TMP2
 468         pxor      \TMP1, \TMP2          # TMP2 = a1*b0 + a0*b1 (Karatsuba middle term)
 469         movdqa    \TMP2, \TMP3
 470         pslldq    $8, \TMP3             # left shift TMP3 2 DWs
 471         psrldq    $8, \TMP2             # right shift TMP2 2 DWs
 472         pxor      \TMP3, \GH
 473         pxor      \TMP2, \TMP1          # TMP1:GH holds the 256-bit result of GH*HK
 474
 475         # first phase of the reduction
 476
 477         movdqa    \GH, \TMP2
 478         movdqa    \GH, \TMP3
 479         movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
 480                                         # in order to perform
 481                                         # independent shifts
 482         pslld     $31, \TMP2            # packed left shift <<31 (per dword)
 483         pslld     $30, \TMP3            # packed left shift <<30 (per dword)
 484         pslld     $25, \TMP4            # packed left shift <<25 (per dword)
 485         pxor      \TMP3, \TMP2          # xor the shifted versions
 486         pxor      \TMP4, \TMP2
 487         movdqa    \TMP2, \TMP5
 488         psrldq    $4, \TMP5             # right shift TMP5 1 DW
 489         pslldq    $12, \TMP2            # left shift TMP2 3 DWs
 490         pxor      \TMP2, \GH
 491
 492         # second phase of the reduction
 493
 494         movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
 495                                         # in order to perform
 496                                         # independent shifts
 497         movdqa    \GH,\TMP3
 498         movdqa    \GH,\TMP4
 499         psrld     $1,\TMP2              # packed right shift >>1 (per dword)
 500         psrld     $2,\TMP3              # packed right shift >>2 (per dword)
 501         psrld     $7,\TMP4              # packed right shift >>7 (per dword)
 502         pxor      \TMP3,\TMP2           # xor the shifted versions
 503         pxor      \TMP4,\TMP2
 504         pxor      \TMP5, \TMP2          # add the part saved in the first phase
 505         pxor      \TMP2, \GH
 506         pxor      \TMP1, \GH            # fold in the high half; the result is in GH
 507 .endm
 508
 509 # Reads DLEN bytes starting at DPTR and stores in XMMDst
 510 # where 0 < DLEN < 16
     # DPTR and DLEN are general purpose registers, XMM1 is a scratch XMM
     # register.  Memory is read as one 8-byte load (only when DLEN >= 8) plus
     # single-byte loads, so nothing beyond DPTR+DLEN-1 is touched.  Byte k of
     # the source ends up in byte k of XMMDst; the bytes above DLEN are
     # presumably zero (depends on MOVQ_R64_XMM, defined outside this file).
     # DLEN == 0 is not handled: the byte loop decrements before testing.
 511 # Clobbers %rax, DLEN and XMM1
 512 .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
 513         cmp $8, \DLEN
 514         jl _read_lt8_\@
 515         mov (\DPTR), %rax               # first 8 bytes in a single load
 516         MOVQ_R64_XMM %rax, \XMMDst
 517         sub $8, \DLEN                   # DLEN = bytes still to read (0..7)
 518         jz _done_read_partial_block_\@
 519         xor %eax, %eax
 520 _read_next_byte_\@:
             # walk from the last byte down to byte 8, shifting each into %rax
 521         shl $8, %rax
 522         mov 7(\DPTR, \DLEN, 1), %al
 523         dec \DLEN
 524         jnz _read_next_byte_\@
 525         MOVQ_R64_XMM %rax, \XMM1
 526         pslldq $8, \XMM1                # move the tail bytes into the upper half
 527         por \XMM1, \XMMDst
 528         jmp _done_read_partial_block_\@
 529 _read_lt8_\@:
 530         xor %eax, %eax
 531 _read_next_byte_lt8_\@:
             # fewer than 8 bytes: walk from the last byte down to byte 0
 532         shl $8, %rax
 533         mov -1(\DPTR, \DLEN, 1), %al
 534         dec \DLEN
 535         jnz _read_next_byte_lt8_\@
 536         MOVQ_R64_XMM %rax, \XMMDst
 537 _done_read_partial_block_\@:
 538 .endm
 539
 540 # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
     # Inputs: arg8 = AAD pointer, arg9 = AAD length in bytes (both read through
     # %r14, so FUNC_SAVE must have run), HASHKEY = HashKey<<1 mod poly.
     # TMP1-TMP5 are scratch for GHASH_MUL, TMP6 accumulates the hash and TMP7
     # holds the current block.  The result is left in TMP6 and stored to
     # AadHash(%arg2).  A trailing partial block is read with
     # READ_PARTIAL_BLOCK, which reads exactly the remaining bytes.
 541 # clobbers r10-11, xmm14; also rax (via READ_PARTIAL_BLOCK) and TMP1-TMP7
 542 .macro CALC_AAD_HASH HASHKEY TMP1 TMP2 TMP3 TMP4 TMP5 \
 543         TMP6 TMP7
 544         MOVADQ     SHUF_MASK(%rip), %xmm14
 545         mov        arg8, %r10           # %r10 = AAD
 546         mov        arg9, %r11           # %r11 = aadLen
 547         pxor       \TMP7, \TMP7
 548         pxor       \TMP6, \TMP6         # hash accumulator starts at zero
 549
 550         cmp        $16, %r11
 551         jl         _get_AAD_rest\@
 552 _get_AAD_blocks\@:
 553         movdqu     (%r10), \TMP7
 554         PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
 555         pxor       \TMP7, \TMP6
 556         GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
 557         add        $16, %r10
 558         sub        $16, %r11
 559         cmp        $16, %r11
 560         jge        _get_AAD_blocks\@
 561
 562         movdqu     \TMP6, \TMP7
 563
 564         /* read the last <16B of AAD */
 565 _get_AAD_rest\@:
 566         cmp        $0, %r11
 567         je         _get_AAD_done\@
 568
 569         READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
 570         PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
 571         pxor       \TMP6, \TMP7
 572         GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
 573         movdqu \TMP7, \TMP6
 574
 575 _get_AAD_done\@:
 576         movdqu \TMP6, AadHash(%arg2)
 577 .endm
 578
 579 /*
 580 * if a = number of total plaintext bytes
 581 * b = floor(a/16)
 582 * num_initial_blocks = b mod 4
 583 * encrypt the initial num_initial_blocks blocks and apply ghash on
 584 * the ciphertext
     *
     * Parameters, as used in the body:
     *   TMP1, TMP2, TMP4-TMP7  scratch XMM registers
     *   TMP3                   HashKey<<1 mod poly (only read)
     *   XMM0                   counter block, loaded from CurCount(%arg2)
     *   XMM1-XMM4              the four blocks of the first parallel batch; the
     *                          extra-round loop names %xmm1-%xmm4 directly, so
     *                          these have to be exactly those registers
     *   XMMDst                 running hash, XORed into XMM1 at the end; the
     *                          initial-block chain ends in %xmm8, so XMMDst has
     *                          to be %xmm8
     *   i                      number of the XMM register that receives the
     *                          hash loaded from AadHash (8 - num_initial_blocks)
     *   i_seq                  digits of the XMM registers holding the initial
     *                          blocks (678, 78 or 8; unused when i is 8)
     *   operation              enc or dec
     *
     * On entry %r13 is the number of input bytes in whole blocks.  On exit %r11
     * is the byte offset of the first block not yet written.  When %r13 >= 64
     * the macro also fills the HashKey_2..4 and HashKey*_k stack slots and
     * processes four more blocks, leaving their byte-reversed ciphertext in
     * XMM1-XMM4 (XMM1 already XORed with the hash) for the caller to hash.
     *
 585 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 586 * are clobbered
 587 * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
 588 */
 589
 590
 591 .macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 592         XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 593         MOVADQ          SHUF_MASK(%rip), %xmm14
 594
 595         movdqu AadHash(%arg2), %xmm\i               # xmm<i> = current hash value
 596
 597         xor        %r11, %r11 # initialise the data pointer offset as zero
 598         # start AES for num_initial_blocks blocks
 599
 600         movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
 601
 602 .if (\i == 5) || (\i == 6) || (\i == 7)
 603
 604         MOVADQ          ONE(%RIP),\TMP1
 605         MOVADQ          0(%arg1),\TMP2               # round key 0
 606 .irpc index, \i_seq
 607         paddd           \TMP1, \XMM0                 # INCR Y0
 608 .ifc \operation, dec
 609         movdqa     \XMM0, %xmm\index
 610 .else
 611         MOVADQ          \XMM0, %xmm\index
 612 .endif
 613         PSHUFB_XMM      %xmm14, %xmm\index      # perform a 16 byte swap
 614         pxor            \TMP2, %xmm\index       # initial AddRoundKey
 615 .endr
 616         lea     0x10(%arg1),%r10                # %r10 = address of round key 1
 617         mov     keysize,%eax
 618         shr     $2,%eax                         # 128->4, 192->6, 256->8
 619         add     $5,%eax                       # 128->9, 192->11, 256->13
 620
 621 aes_loop_initial_\@:
             # one AESENC round per iteration for every initial block
 622         MOVADQ  (%r10),\TMP1
 623 .irpc   index, \i_seq
 624         AESENC  \TMP1, %xmm\index
 625 .endr
 626         add     $16,%r10
 627         sub     $1,%eax
 628         jnz     aes_loop_initial_\@
 629
 630         MOVADQ  (%r10), \TMP1
 631 .irpc index, \i_seq
 632         AESENCLAST \TMP1, %xmm\index         # Last Round
 633 .endr
 634 .irpc index, \i_seq
 635         movdqu     (%arg4 , %r11, 1), \TMP1
 636         pxor       \TMP1, %xmm\index
 637         movdqu     %xmm\index, (%arg3 , %r11, 1)
 638         # write back plaintext/ciphertext for num_initial_blocks
 639         add        $16, %r11
 640
 641 .ifc \operation, dec
 642         movdqa     \TMP1, %xmm\index         # when decrypting, hash the input ciphertext
 643 .endif
 644         PSHUFB_XMM         %xmm14, %xmm\index
 645
 646                 # prepare plaintext/ciphertext for GHASH computation
 647 .endr
 648 .endif
 649
 650         # apply GHASH on num_initial_blocks blocks
             # (the hash starts in xmm<i> and is chained through to %xmm8)
 651
 652 .if \i == 5
 653         pxor       %xmm5, %xmm6
 654         GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 655         pxor       %xmm6, %xmm7
 656         GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 657         pxor       %xmm7, %xmm8
 658         GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 659 .elseif \i == 6
 660         pxor       %xmm6, %xmm7
 661         GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 662         pxor       %xmm7, %xmm8
 663         GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 664 .elseif \i == 7
 665         pxor       %xmm7, %xmm8
 666         GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 667 .endif
 668         cmp        $64, %r13
 669         jl      _initial_blocks_done\@
 670         # no need for precomputed values
 671 /*
 672 *
 673 * Precomputations for HashKey parallel with encryption of first 4 blocks.
 674 * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
 675 */
 676         MOVADQ     ONE(%RIP),\TMP1
 677         paddd      \TMP1, \XMM0              # INCR Y0
 678         MOVADQ     \XMM0, \XMM1
 679         PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
 680
 681         paddd      \TMP1, \XMM0              # INCR Y0
 682         MOVADQ     \XMM0, \XMM2
 683         PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
 684
 685         paddd      \TMP1, \XMM0              # INCR Y0
 686         MOVADQ     \XMM0, \XMM3
 687         PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
 688
 689         paddd      \TMP1, \XMM0              # INCR Y0
 690         MOVADQ     \XMM0, \XMM4
 691         PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
 692
 693         MOVADQ     0(%arg1),\TMP1            # round key 0
 694         pxor       \TMP1, \XMM1
 695         pxor       \TMP1, \XMM2
 696         pxor       \TMP1, \XMM3
 697         pxor       \TMP1, \XMM4
 698         movdqa     \TMP3, \TMP5              # TMP5 = running power of HashKey
 699         pshufd     $78, \TMP3, \TMP1
 700         pxor       \TMP3, \TMP1
 701         movdqa     \TMP1, HashKey_k(%rsp)
 702         GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 703 # TMP5 = HashKey^2<<1 (mod poly)
 704         movdqa     \TMP5, HashKey_2(%rsp)
 705 # HashKey_2 = HashKey^2<<1 (mod poly)
 706         pshufd     $78, \TMP5, \TMP1
 707         pxor       \TMP5, \TMP1
 708         movdqa     \TMP1, HashKey_2_k(%rsp)
 709 .irpc index, 1234 # do 4 rounds
 710         movaps 0x10*\index(%arg1), \TMP1
 711         AESENC     \TMP1, \XMM1
 712         AESENC     \TMP1, \XMM2
 713         AESENC     \TMP1, \XMM3
 714         AESENC     \TMP1, \XMM4
 715 .endr
 716         GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 717 # TMP5 = HashKey^3<<1 (mod poly)
 718         movdqa     \TMP5, HashKey_3(%rsp)
 719         pshufd     $78, \TMP5, \TMP1
 720         pxor       \TMP5, \TMP1
 721         movdqa     \TMP1, HashKey_3_k(%rsp)
 722 .irpc index, 56789 # do next 5 rounds
 723         movaps 0x10*\index(%arg1), \TMP1
 724         AESENC     \TMP1, \XMM1
 725         AESENC     \TMP1, \XMM2
 726         AESENC     \TMP1, \XMM3
 727         AESENC     \TMP1, \XMM4
 728 .endr
 729         GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 730 # TMP5 = HashKey^4<<1 (mod poly)
 731         movdqa     \TMP5, HashKey_4(%rsp)
 732         pshufd     $78, \TMP5, \TMP1
 733         pxor       \TMP5, \TMP1
 734         movdqa     \TMP1, HashKey_4_k(%rsp)
 735         lea        0xa0(%arg1),%r10             # %r10 = address of round key 10
 736         mov        keysize,%eax
 737         shr        $2,%eax                      # 128->4, 192->6, 256->8
 738         sub        $4,%eax                      # 128->0, 192->2, 256->4
 739         jz         aes_loop_pre_done\@
 740
 741 aes_loop_pre_\@:
             # extra rounds for 192/256-bit keys; operates on %xmm1-%xmm4 by number
 742         MOVADQ     (%r10),\TMP2
 743 .irpc   index, 1234
 744         AESENC     \TMP2, %xmm\index
 745 .endr
 746         add        $16,%r10
 747         sub        $1,%eax
 748         jnz        aes_loop_pre_\@
 749
 750 aes_loop_pre_done\@:
 751         MOVADQ     (%r10), \TMP2
 752         AESENCLAST \TMP2, \XMM1
 753         AESENCLAST \TMP2, \XMM2
 754         AESENCLAST \TMP2, \XMM3
 755         AESENCLAST \TMP2, \XMM4
             # XOR the keystream with the input.  For dec each result is stored
             # at once and the register is reloaded with the input ciphertext for
             # hashing; for enc the four results are stored together below.
 756         movdqu     16*0(%arg4 , %r11 , 1), \TMP1
 757         pxor       \TMP1, \XMM1
 758 .ifc \operation, dec
 759         movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 760         movdqa     \TMP1, \XMM1
 761 .endif
 762         movdqu     16*1(%arg4 , %r11 , 1), \TMP1
 763         pxor       \TMP1, \XMM2
 764 .ifc \operation, dec
 765         movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 766         movdqa     \TMP1, \XMM2
 767 .endif
 768         movdqu     16*2(%arg4 , %r11 , 1), \TMP1
 769         pxor       \TMP1, \XMM3
 770 .ifc \operation, dec
 771         movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 772         movdqa     \TMP1, \XMM3
 773 .endif
 774         movdqu     16*3(%arg4 , %r11 , 1), \TMP1
 775         pxor       \TMP1, \XMM4
 776 .ifc \operation, dec
 777         movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 778         movdqa     \TMP1, \XMM4
 779 .else
 780         movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 781         movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 782         movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 783         movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 784 .endif
 785
 786         add        $64, %r11
 787         PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
 788         pxor       \XMMDst, \XMM1
 789 # combine GHASHed value with the corresponding ciphertext
 790         PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
 791         PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
 792         PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
 793
 794 _initial_blocks_done\@:
 795
 796 .endm
 797
 798 /*
 799 * encrypt 4 blocks at a time
 800 * ghash the 4 previously encrypted ciphertext blocks
 801 * %arg1, %arg3, %arg4 are used as pointers only, not modified
 802 * %r11 is the data offset value; clobbers %eax, %r10, %xmm15 and flags
 803 */
 804 .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
 805 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
 806
 807         movdqa    \XMM1, \XMM5                 # save the 4 previous blocks for GHASH
 808         movdqa    \XMM2, \XMM6
 809         movdqa    \XMM3, \XMM7
 810         movdqa    \XMM4, \XMM8
 811
 812         movdqa    SHUF_MASK(%rip), %xmm15
 813         # multiply XMM5 * HashKey_4 using karatsuba
 814
 815         movdqa    \XMM5, \TMP4
 816         pshufd    $78, \XMM5, \TMP6
 817         pxor      \XMM5, \TMP6
 818         paddd     ONE(%rip), \XMM0              # INCR CNT
 819         movdqa    HashKey_4(%rsp), \TMP5
 820         PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
 821         movdqa    \XMM0, \XMM1
 822         paddd     ONE(%rip), \XMM0              # INCR CNT
 823         movdqa    \XMM0, \XMM2
 824         paddd     ONE(%rip), \XMM0              # INCR CNT
 825         movdqa    \XMM0, \XMM3
 826         paddd     ONE(%rip), \XMM0              # INCR CNT
 827         movdqa    \XMM0, \XMM4
 828         PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
 829         PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
 830         PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
 831         PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
 832         PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
 833
 834         pxor      (%arg1), \XMM1               # XOR with round key 0
 835         pxor      (%arg1), \XMM2
 836         pxor      (%arg1), \XMM3
 837         pxor      (%arg1), \XMM4
 838         movdqa    HashKey_4_k(%rsp), \TMP5
 839         PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
 840         movaps 0x10(%arg1), \TMP1
 841         AESENC    \TMP1, \XMM1              # Round 1
 842         AESENC    \TMP1, \XMM2
 843         AESENC    \TMP1, \XMM3
 844         AESENC    \TMP1, \XMM4
 845         movaps 0x20(%arg1), \TMP1
 846         AESENC    \TMP1, \XMM1              # Round 2
 847         AESENC    \TMP1, \XMM2
 848         AESENC    \TMP1, \XMM3
 849         AESENC    \TMP1, \XMM4
 850         movdqa    \XMM6, \TMP1
 851         pshufd    $78, \XMM6, \TMP2
 852         pxor      \XMM6, \TMP2
 853         movdqa    HashKey_3(%rsp), \TMP5
 854         PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
 855         movaps 0x30(%arg1), \TMP3
 856         AESENC    \TMP3, \XMM1              # Round 3
 857         AESENC    \TMP3, \XMM2
 858         AESENC    \TMP3, \XMM3
 859         AESENC    \TMP3, \XMM4
 860         PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
 861         movaps 0x40(%arg1), \TMP3
 862         AESENC    \TMP3, \XMM1              # Round 4
 863         AESENC    \TMP3, \XMM2
 864         AESENC    \TMP3, \XMM3
 865         AESENC    \TMP3, \XMM4
 866         movdqa    HashKey_3_k(%rsp), \TMP5
 867         PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
 868         movaps 0x50(%arg1), \TMP3
 869         AESENC    \TMP3, \XMM1              # Round 5
 870         AESENC    \TMP3, \XMM2
 871         AESENC    \TMP3, \XMM3
 872         AESENC    \TMP3, \XMM4
 873         pxor      \TMP1, \TMP4
 874 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
 875         pxor      \XMM6, \XMM5
 876         pxor      \TMP2, \TMP6
 877         movdqa    \XMM7, \TMP1
 878         pshufd    $78, \XMM7, \TMP2
 879         pxor      \XMM7, \TMP2
 880         movdqa    HashKey_2(%rsp ), \TMP5
 881
 882         # Multiply XMM7 * HashKey_2 using karatsuba
 883
 884         PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
 885         movaps 0x60(%arg1), \TMP3
 886         AESENC    \TMP3, \XMM1              # Round 6
 887         AESENC    \TMP3, \XMM2
 888         AESENC    \TMP3, \XMM3
 889         AESENC    \TMP3, \XMM4
 890         PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
 891         movaps 0x70(%arg1), \TMP3
 892         AESENC    \TMP3, \XMM1             # Round 7
 893         AESENC    \TMP3, \XMM2
 894         AESENC    \TMP3, \XMM3
 895         AESENC    \TMP3, \XMM4
 896         movdqa    HashKey_2_k(%rsp), \TMP5
 897         PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
 898         movaps 0x80(%arg1), \TMP3
 899         AESENC    \TMP3, \XMM1             # Round 8
 900         AESENC    \TMP3, \XMM2
 901         AESENC    \TMP3, \XMM3
 902         AESENC    \TMP3, \XMM4
 903         pxor      \TMP1, \TMP4
 904 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
 905         pxor      \XMM7, \XMM5
 906         pxor      \TMP2, \TMP6
 907
 908         # Multiply XMM8 * HashKey
 909         # XMM8 and TMP5 hold the values for the two operands
 910
 911         movdqa    \XMM8, \TMP1
 912         pshufd    $78, \XMM8, \TMP2
 913         pxor      \XMM8, \TMP2
 914         movdqa    HashKey(%rsp), \TMP5
 915         PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
 916         movaps 0x90(%arg1), \TMP3
 917         AESENC    \TMP3, \XMM1            # Round 9
 918         AESENC    \TMP3, \XMM2
 919         AESENC    \TMP3, \XMM3
 920         AESENC    \TMP3, \XMM4
 921         PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
 922         lea       0xa0(%arg1),%r10              # %r10 = address of round key 10
 923         mov       keysize,%eax
 924         shr       $2,%eax                       # 128->4, 192->6, 256->8
 925         sub       $4,%eax                       # 128->0, 192->2, 256->4
 926         jz        aes_loop_par_enc_done\@       # no extra rounds for 128-bit keys
 927
 928 aes_loop_par_enc\@:                             # \@ keeps the label unique per expansion
 929         MOVADQ    (%r10),\TMP3
 930 .irp    reg, \XMM1, \XMM2, \XMM3, \XMM4         # use the macro arguments, not fixed xmm1-4
 931         AESENC    \TMP3, \reg
 932 .endr
 933         add       $16,%r10
 934         sub       $1,%eax
 935         jnz       aes_loop_par_enc\@
 936
 937 aes_loop_par_enc_done\@:
 938         MOVADQ    (%r10), \TMP3
 939         AESENCLAST \TMP3, \XMM1           # last round
 940         AESENCLAST \TMP3, \XMM2
 941         AESENCLAST \TMP3, \XMM3
 942         AESENCLAST \TMP3, \XMM4
 943         movdqa    HashKey_k(%rsp), \TMP5
 944         PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
 945         movdqu    (%arg4,%r11,1), \TMP3
 946         pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
 947         movdqu    16(%arg4,%r11,1), \TMP3
 948         pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
 949         movdqu    32(%arg4,%r11,1), \TMP3
 950         pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
 951         movdqu    48(%arg4,%r11,1), \TMP3
 952         pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
 953         movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
 954         movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
 955         movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
 956         movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
 957         PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
 958         PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
 959         PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
 960         PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
 961
 962         pxor      \TMP4, \TMP1
 963         pxor      \XMM8, \XMM5
 964         pxor      \TMP6, \TMP2
 965         pxor      \TMP1, \TMP2
 966         pxor      \XMM5, \TMP2
 967         movdqa    \TMP2, \TMP3
 968         pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
 969         psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
 970         pxor      \TMP3, \XMM5
 971         pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
 972
 973         # first phase of reduction
 974
 975         movdqa    \XMM5, \TMP2
 976         movdqa    \XMM5, \TMP3
 977         movdqa    \XMM5, \TMP4
 978 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
 979         pslld     $31, \TMP2                   # packed left shift << 31
 980         pslld     $30, \TMP3                   # packed left shift << 30
 981         pslld     $25, \TMP4                   # packed left shift << 25
 982         pxor      \TMP3, \TMP2                 # xor the shifted versions
 983         pxor      \TMP4, \TMP2
 984         movdqa    \TMP2, \TMP5
 985         psrldq    $4, \TMP5                    # right shift T5 1 DW
 986         pslldq    $12, \TMP2                   # left shift T2 3 DWs
 987         pxor      \TMP2, \XMM5
 988
 989         # second phase of reduction
 990
 991         movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
 992         movdqa    \XMM5,\TMP3
 993         movdqa    \XMM5,\TMP4
 994         psrld     $1, \TMP2                    # packed right shift >> 1
 995         psrld     $2, \TMP3                    # packed right shift >> 2
 996         psrld     $7, \TMP4                    # packed right shift >> 7
 997         pxor      \TMP3,\TMP2                  # xor the shifted versions
 998         pxor      \TMP4,\TMP2
 999         pxor      \TMP5, \TMP2
1000         pxor      \TMP2, \XMM5
1001         pxor      \TMP1, \XMM5                 # result is in XMM5
1002
1003         pxor      \XMM5, \XMM1                 # fold running GHASH into first new block
1004 .endm
1005
1006 /*
1007 * decrypt 4 blocks at a time
1008 * ghash the 4 previously decrypted ciphertext blocks
1009 * %arg1, %arg3, %arg4 are used as pointers only, not modified
1010 * %r11 is the data offset value; clobbers %eax, %r10, %xmm15 and flags
1011 */
1012 .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
1013 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1014
1015         movdqa    \XMM1, \XMM5                 # save the 4 previous blocks for GHASH
1016         movdqa    \XMM2, \XMM6
1017         movdqa    \XMM3, \XMM7
1018         movdqa    \XMM4, \XMM8
1019
1020         movdqa    SHUF_MASK(%rip), %xmm15
1021         # multiply XMM5 * HashKey_4 using karatsuba
1022
1023         movdqa    \XMM5, \TMP4
1024         pshufd    $78, \XMM5, \TMP6
1025         pxor      \XMM5, \TMP6
1026         paddd     ONE(%rip), \XMM0              # INCR CNT
1027         movdqa    HashKey_4(%rsp), \TMP5
1028         PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1029         movdqa    \XMM0, \XMM1
1030         paddd     ONE(%rip), \XMM0              # INCR CNT
1031         movdqa    \XMM0, \XMM2
1032         paddd     ONE(%rip), \XMM0              # INCR CNT
1033         movdqa    \XMM0, \XMM3
1034         paddd     ONE(%rip), \XMM0              # INCR CNT
1035         movdqa    \XMM0, \XMM4
1036         PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1037         PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1038         PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
1039         PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
1040         PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
1041
1042         pxor      (%arg1), \XMM1               # XOR with round key 0
1043         pxor      (%arg1), \XMM2
1044         pxor      (%arg1), \XMM3
1045         pxor      (%arg1), \XMM4
1046         movdqa    HashKey_4_k(%rsp), \TMP5
1047         PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
1048         movaps 0x10(%arg1), \TMP1
1049         AESENC    \TMP1, \XMM1              # Round 1
1050         AESENC    \TMP1, \XMM2
1051         AESENC    \TMP1, \XMM3
1052         AESENC    \TMP1, \XMM4
1053         movaps 0x20(%arg1), \TMP1
1054         AESENC    \TMP1, \XMM1              # Round 2
1055         AESENC    \TMP1, \XMM2
1056         AESENC    \TMP1, \XMM3
1057         AESENC    \TMP1, \XMM4
1058         movdqa    \XMM6, \TMP1
1059         pshufd    $78, \XMM6, \TMP2
1060         pxor      \XMM6, \TMP2
1061         movdqa    HashKey_3(%rsp), \TMP5
1062         PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
1063         movaps 0x30(%arg1), \TMP3
1064         AESENC    \TMP3, \XMM1              # Round 3
1065         AESENC    \TMP3, \XMM2
1066         AESENC    \TMP3, \XMM3
1067         AESENC    \TMP3, \XMM4
1068         PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
1069         movaps 0x40(%arg1), \TMP3
1070         AESENC    \TMP3, \XMM1              # Round 4
1071         AESENC    \TMP3, \XMM2
1072         AESENC    \TMP3, \XMM3
1073         AESENC    \TMP3, \XMM4
1074         movdqa    HashKey_3_k(%rsp), \TMP5
1075         PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1076         movaps 0x50(%arg1), \TMP3
1077         AESENC    \TMP3, \XMM1              # Round 5
1078         AESENC    \TMP3, \XMM2
1079         AESENC    \TMP3, \XMM3
1080         AESENC    \TMP3, \XMM4
1081         pxor      \TMP1, \TMP4
1082 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1083         pxor      \XMM6, \XMM5
1084         pxor      \TMP2, \TMP6
1085         movdqa    \XMM7, \TMP1
1086         pshufd    $78, \XMM7, \TMP2
1087         pxor      \XMM7, \TMP2
1088         movdqa    HashKey_2(%rsp ), \TMP5
1089
1090         # Multiply XMM7 * HashKey_2 using karatsuba
1091
1092         PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
1093         movaps 0x60(%arg1), \TMP3
1094         AESENC    \TMP3, \XMM1              # Round 6
1095         AESENC    \TMP3, \XMM2
1096         AESENC    \TMP3, \XMM3
1097         AESENC    \TMP3, \XMM4
1098         PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
1099         movaps 0x70(%arg1), \TMP3
1100         AESENC    \TMP3, \XMM1             # Round 7
1101         AESENC    \TMP3, \XMM2
1102         AESENC    \TMP3, \XMM3
1103         AESENC    \TMP3, \XMM4
1104         movdqa    HashKey_2_k(%rsp), \TMP5
1105         PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1106         movaps 0x80(%arg1), \TMP3
1107         AESENC    \TMP3, \XMM1             # Round 8
1108         AESENC    \TMP3, \XMM2
1109         AESENC    \TMP3, \XMM3
1110         AESENC    \TMP3, \XMM4
1111         pxor      \TMP1, \TMP4
1112 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1113         pxor      \XMM7, \XMM5
1114         pxor      \TMP2, \TMP6
1115
1116         # Multiply XMM8 * HashKey
1117         # XMM8 and TMP5 hold the values for the two operands
1118
1119         movdqa    \XMM8, \TMP1
1120         pshufd    $78, \XMM8, \TMP2
1121         pxor      \XMM8, \TMP2
1122         movdqa    HashKey(%rsp), \TMP5
1123         PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
1124         movaps 0x90(%arg1), \TMP3
1125         AESENC    \TMP3, \XMM1            # Round 9
1126         AESENC    \TMP3, \XMM2
1127         AESENC    \TMP3, \XMM3
1128         AESENC    \TMP3, \XMM4
1129         PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
1130         lea       0xa0(%arg1),%r10              # %r10 = address of round key 10
1131         mov       keysize,%eax
1132         shr       $2,%eax                       # 128->4, 192->6, 256->8
1133         sub       $4,%eax                       # 128->0, 192->2, 256->4
1134         jz        aes_loop_par_dec_done\@       # no extra rounds for 128-bit keys
1135
1136 aes_loop_par_dec\@:                             # \@ keeps the label unique per expansion
1137         MOVADQ    (%r10),\TMP3
1138 .irp    reg, \XMM1, \XMM2, \XMM3, \XMM4         # use the macro arguments, not fixed xmm1-4
1139         AESENC    \TMP3, \reg
1140 .endr
1141         add       $16,%r10
1142         sub       $1,%eax
1143         jnz       aes_loop_par_dec\@
1144
1145 aes_loop_par_dec_done\@:
1146         MOVADQ    (%r10), \TMP3
1147         AESENCLAST \TMP3, \XMM1           # last round
1148         AESENCLAST \TMP3, \XMM2
1149         AESENCLAST \TMP3, \XMM3
1150         AESENCLAST \TMP3, \XMM4
1151         movdqa    HashKey_k(%rsp), \TMP5
1152         PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1153         movdqu    (%arg4,%r11,1), \TMP3
1154         pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1155         movdqu    \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
1156         movdqa    \TMP3, \XMM1                 # keep the input block for GHASH
1157         movdqu    16(%arg4,%r11,1), \TMP3
1158         pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1159         movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
1160         movdqa    \TMP3, \XMM2                 # keep the input block for GHASH
1161         movdqu    32(%arg4,%r11,1), \TMP3
1162         pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1163         movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
1164         movdqa    \TMP3, \XMM3                 # keep the input block for GHASH
1165         movdqu    48(%arg4,%r11,1), \TMP3
1166         pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1167         movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
1168         movdqa    \TMP3, \XMM4                 # keep the input block for GHASH
1169         PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1170         PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
1171         PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
1172         PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
1173
1174         pxor      \TMP4, \TMP1
1175         pxor      \XMM8, \XMM5
1176         pxor      \TMP6, \TMP2
1177         pxor      \TMP1, \TMP2
1178         pxor      \XMM5, \TMP2
1179         movdqa    \TMP2, \TMP3
1180         pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
1181         psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
1182         pxor      \TMP3, \XMM5
1183         pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
1184
1185         # first phase of reduction
1186
1187         movdqa    \XMM5, \TMP2
1188         movdqa    \XMM5, \TMP3
1189         movdqa    \XMM5, \TMP4
1190 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1191         pslld     $31, \TMP2                   # packed left shift << 31
1192         pslld     $30, \TMP3                   # packed left shift << 30
1193         pslld     $25, \TMP4                   # packed left shift << 25
1194         pxor      \TMP3, \TMP2                 # xor the shifted versions
1195         pxor      \TMP4, \TMP2
1196         movdqa    \TMP2, \TMP5
1197         psrldq    $4, \TMP5                    # right shift T5 1 DW
1198         pslldq    $12, \TMP2                   # left shift T2 3 DWs
1199         pxor      \TMP2, \XMM5
1200
1201         # second phase of reduction
1202
1203         movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1204         movdqa    \XMM5,\TMP3
1205         movdqa    \XMM5,\TMP4
1206         psrld     $1, \TMP2                    # packed right shift >> 1
1207         psrld     $2, \TMP3                    # packed right shift >> 2
1208         psrld     $7, \TMP4                    # packed right shift >> 7
1209         pxor      \TMP3,\TMP2                  # xor the shifted versions
1210         pxor      \TMP4,\TMP2
1211         pxor      \TMP5, \TMP2
1212         pxor      \TMP2, \XMM5
1213         pxor      \TMP1, \XMM5                 # result is in XMM5
1214
1215         pxor      \XMM5, \XMM1                 # fold running GHASH into first new block
1216 .endm
1217
1218 /* GHASH the last 4 ciphertext blocks; the reduced result is left in XMMDst. */
1219 .macro  GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1220 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1221
1222         # Multiply XMM1 * HashKey_4 (using Karatsuba)
1223
1224         movdqa    \XMM1, \TMP6
1225         pshufd    $78, \XMM1, \TMP2
1226         pxor      \XMM1, \TMP2
1227         movdqa    HashKey_4(%rsp), \TMP5
1228         PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1229         PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1230         movdqa    HashKey_4_k(%rsp), \TMP4
1231         PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1232         movdqa    \XMM1, \XMMDst
1233         movdqa    \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1234
1235         # Multiply XMM2 * HashKey_3 (using Karatsuba)
1236
1237         movdqa    \XMM2, \TMP1
1238         pshufd    $78, \XMM2, \TMP2
1239         pxor      \XMM2, \TMP2
1240         movdqa    HashKey_3(%rsp), \TMP5
1241         PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1242         PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1243         movdqa    HashKey_3_k(%rsp), \TMP4
1244         PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1245         pxor      \TMP1, \TMP6
1246         pxor      \XMM2, \XMMDst
1247         pxor      \TMP2, \XMM1
1248 # results accumulated in TMP6, XMMDst, XMM1
1249
1250         # Multiply XMM3 * HashKey_2 (using Karatsuba)
1251
1252         movdqa    \XMM3, \TMP1
1253         pshufd    $78, \XMM3, \TMP2
1254         pxor      \XMM3, \TMP2
1255         movdqa    HashKey_2(%rsp), \TMP5
1256         PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1257         PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1258         movdqa    HashKey_2_k(%rsp), \TMP4
1259         PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1260         pxor      \TMP1, \TMP6
1261         pxor      \XMM3, \XMMDst
1262         pxor      \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1263
1264         # Multiply XMM4 * HashKey (using Karatsuba)
1265         movdqa    \XMM4, \TMP1
1266         pshufd    $78, \XMM4, \TMP2
1267         pxor      \XMM4, \TMP2
1268         movdqa    HashKey(%rsp), \TMP5
1269         PCLMULQDQ 0x11, \TMP5, \TMP1        # TMP1 = a1*b1
1270         PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1271         movdqa    HashKey_k(%rsp), \TMP4
1272         PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1273         pxor      \TMP1, \TMP6
1274         pxor      \XMM4, \XMMDst
1275         pxor      \XMM1, \TMP2
1276         pxor      \TMP6, \TMP2
1277         pxor      \XMMDst, \TMP2
1278         # middle section of the temp results combined as in karatsuba algorithm
1279         movdqa    \TMP2, \TMP4
1280         pslldq    $8, \TMP4                 # left shift TMP4 2 DWs
1281         psrldq    $8, \TMP2                 # right shift TMP2 2 DWs
1282         pxor      \TMP4, \XMMDst
1283         pxor      \TMP2, \TMP6
1284 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1285         # first phase of the reduction
1286         movdqa    \XMMDst, \TMP2
1287         movdqa    \XMMDst, \TMP3
1288         movdqa    \XMMDst, \TMP4
1289 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1290         pslld     $31, \TMP2                # packed left shift << 31
1291         pslld     $30, \TMP3                # packed left shift << 30
1292         pslld     $25, \TMP4                # packed left shift << 25
1293         pxor      \TMP3, \TMP2              # xor the shifted versions
1294         pxor      \TMP4, \TMP2
1295         movdqa    \TMP2, \TMP7
1296         psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1297         pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1298         pxor      \TMP2, \XMMDst
1299
1300         # second phase of the reduction
1301         movdqa    \XMMDst, \TMP2
1302         # make 3 copies of XMMDst for doing 3 shift operations
1303         movdqa    \XMMDst, \TMP3
1304         movdqa    \XMMDst, \TMP4
1305         psrld     $1, \TMP2                 # packed right shift >> 1
1306         psrld     $2, \TMP3                 # packed right shift >> 2
1307         psrld     $7, \TMP4                 # packed right shift >> 7
1308         pxor      \TMP3, \TMP2              # xor the shifted versions
1309         pxor      \TMP4, \TMP2
1310         pxor      \TMP7, \TMP2
1311         pxor      \TMP2, \XMMDst
1312         pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1313 .endm
1314
1315
1316 /* Encryption of a single block: AES-encrypts \XMM0 in place using the keys at %arg1
1317 * uses eax & r10 (plus \TMP1 as round-key scratch); flags are clobbered
1318 */
1319
1320 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1321
1322         pxor            (%arg1), \XMM0          # XOR with round key 0
1323         mov             keysize,%eax            # keysize is defined outside this macro
1324         shr             $2,%eax                 # 128->4, 192->6, 256->8
1325         add             $5,%eax                 # 128->9, 192->11, 256->13 (AESENC rounds)
1326         lea             16(%arg1), %r10   # get first expanded key address
1327
1328 _esb_loop_\@:                                   # \@ keeps the label unique per expansion
1329         MOVADQ          (%r10),\TMP1
1330         AESENC          \TMP1,\XMM0
1331         add             $16,%r10                # step to the next 16-byte round key
1332         sub             $1,%eax
1333         jnz             _esb_loop_\@
1334
1335         MOVADQ          (%r10),\TMP1            # %r10 now points at the final round key
1336         AESENCLAST      \TMP1,\XMM0
1337 .endm
1338 /*****************************************************************************
1339 * void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1340 *                   struct gcm_context_data *data
1341 *                                      // Context data
1342 *                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
1343 *                   const u8 *in,      // Ciphertext input
1344 *                   u64 plaintext_len, // Length of data in bytes for decryption.
1345 *                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1346 *                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1347 *                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1348 *                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1349 *                   const u8 *aad,     // Additional Authentication Data (AAD)
1350 *                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1351 *                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1352 *                                      // given authentication tag and only return the plaintext if they match.
1353 *                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1354 *                                      // (most likely), 12 or 8.
1355 *
1356 * Assumptions:
1357 *
1358 * keys:
1359 *       keys are pre-expanded and aligned to 16 bytes. We use the first set
1360 *       of round keys (11, 13 or 15 for AES-128/192/256) in void *aes_ctx
1361 *
1362 * iv:
1363 *       0                   1                   2                   3
1364 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1365 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1366 *       |                             Salt  (From the SA)               |
1367 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1368 *       |                     Initialization Vector                     |
1369 *       |         (This is the sequence number from IPSec header)       |
1370 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1371 *       |                              0x1                              |
1372 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1373 *
1374 *
1375 *
1376 * AAD:
1377 *       AAD padded to 128 bits with 0
1378 *       for example, assume AAD is a u32 vector
1379 *
1380 *       if AAD is 8 bytes:
1381 *       AAD[3] = {A0, A1};
1382 *       padded AAD in xmm register = {A1 A0 0 0}
1383 *
1384 *       0                   1                   2                   3
1385 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1386 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1387 *       |                               SPI (A1)                        |
1388 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1389 *       |                     32-bit Sequence Number (A0)               |
1390 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1391 *       |                              0x0                              |
1392 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1393 *
1394 *                                       AAD Format with 32-bit Sequence Number
1395 *
1396 *       if AAD is 12 bytes:
1397 *       AAD[3] = {A0, A1, A2};
1398 *       padded AAD in xmm register = {A2 A1 A0 0}
1399 *
1400 *       0                   1                   2                   3
1401 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1402 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1403 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1404 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1405 *       |                               SPI (A2)                        |
1406 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1407 *       |                 64-bit Extended Sequence Number {A1,A0}       |
1408 *       |                                                               |
1409 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1410 *       |                              0x0                              |
1411 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1412 *
1413 *                        AAD Format with 64-bit Extended Sequence Number
1414 *
1415 * poly = x^128 + x^127 + x^126 + x^121 + 1
1416 *
1417 *****************************************************************************/
1418 ENTRY(aesni_gcm_dec)
1419         FUNC_SAVE               # macro defined outside this view; presumably saves registers and sets up the stack - confirm
1420
1421         GCM_INIT                # macro defined outside this view; presumably prepares hash keys, AAD hash and counter - confirm
1422         GCM_ENC_DEC dec         # bulk processing, decrypt flavour selected by the "dec" argument
1423         GCM_COMPLETE            # macro defined outside this view; presumably finalises and writes the auth tag - confirm
1424         FUNC_RESTORE            # counterpart of FUNC_SAVE
1425         ret
1426 ENDPROC(aesni_gcm_dec)
1427
1428
1429 /*****************************************************************************
1430 * void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1431 *                    struct gcm_context_data *data
1432 *                                        // Context data
1433 *                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1434 *                    const u8 *in,       // Plaintext input
1435 *                    u64 plaintext_len,  // Length of data in bytes for encryption.
1436 *                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1437 *                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1438 *                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1439 *                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1440 *                    const u8 *aad,      // Additional Authentication Data (AAD)
1441 *                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1442 *                    u8 *auth_tag,       // Authenticated Tag output.
1443 *                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1444 *                                        // 12 or 8.
1445 *
1446 * Assumptions:
1447 *
1448 * keys:
1449 *       keys are pre-expanded and aligned to 16 bytes. We use the first set
1450 *       of round keys (11, 13 or 15 for AES-128/192/256) in void *aes_ctx
1451 *
1452 *
1453 * iv:
1454 *       0                   1                   2                   3
1455 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1456 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1457 *       |                             Salt  (From the SA)               |
1458 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1459 *       |                     Initialization Vector                     |
1460 *       |         (This is the sequence number from IPSec header)       |
1461 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1462 *       |                              0x1                              |
1463 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1464 *
1465 *
1466 *
1467 * AAD:
1468 *       AAD padded to 128 bits with 0
1469 *       for example, assume AAD is a u32 vector
1470 *
1471 *       if AAD is 8 bytes:
1472 *       AAD[3] = {A0, A1};
1473 *       padded AAD in xmm register = {A1 A0 0 0}
1474 *
1475 *       0                   1                   2                   3
1476 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1477 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1478 *       |                               SPI (A1)                        |
1479 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1480 *       |                     32-bit Sequence Number (A0)               |
1481 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1482 *       |                              0x0                              |
1483 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1484 *
1485 *                                 AAD Format with 32-bit Sequence Number
1486 *
1487 *       if AAD is 12 bytes:
1488 *       AAD[3] = {A0, A1, A2};
1489 *       padded AAD in xmm register = {A2 A1 A0 0}
1490 *
1491 *       0                   1                   2                   3
1492 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1493 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1494 *       |                               SPI (A2)                        |
1495 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1496 *       |                 64-bit Extended Sequence Number {A1,A0}       |
1497 *       |                                                               |
1498 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1499 *       |                              0x0                              |
1500 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1501 *
1502 *                         AAD Format with 64-bit Extended Sequence Number
1503 *
1504 * poly = x^128 + x^127 + x^126 + x^121 + 1
1505 ***************************************************************************/
1506 ENTRY(aesni_gcm_enc)
1507         FUNC_SAVE                       # prologue macro, defined earlier in this file
1508
1509         GCM_INIT                        # NOTE(review): presumably sets up the GCM state - confirm against the macro
1510         GCM_ENC_DEC enc                 # shared enc/dec macro, instantiated here with the enc argument
1511         GCM_COMPLETE                    # NOTE(review): presumably finishes the hash and emits the tag - confirm
1512         FUNC_RESTORE                    # NOTE(review): presumably undoes FUNC_SAVE - confirm
1513         ret
1514 ENDPROC(aesni_gcm_enc)
1515
1516 #endif
1517
1518
1519 .align 4
     /*
      * _key_expansion_128 / _key_expansion_256a:    internal ABI
      *      Compute one 16-byte round key and append it to the key schedule.
      * input:
      *      %xmm0:          key words to expand
      *      %xmm1:          AESKEYGENASSIST result prepared by the caller
      *      %xmm4:          dword 0 must be zero
      *      TKEYP:          schedule slot to fill
      * output:
      *      %xmm0:          new round key (also stored at (TKEYP))
      *      TKEYP:          advanced by 0x10
      * changed:
      *      %xmm1, %xmm4
      */
1520 _key_expansion_128:
1521 _key_expansion_256a:
1522         pshufd $0b11111111, %xmm1, %xmm1        # broadcast dword 3 of the assist result
1523         shufps $0b00010000, %xmm0, %xmm4        # relies on xmm4 dword 0 being zero
1524         pxor %xmm4, %xmm0
1525         shufps $0b10001100, %xmm0, %xmm4
1526         pxor %xmm4, %xmm0                       # each dword of xmm0 is now XORed with every lower dword
1527         pxor %xmm1, %xmm0                       # mix the broadcast assist word into all four dwords
1528         movaps %xmm0, (TKEYP)                   # append the new round key
1529         add $0x10, TKEYP
1530         ret
1531 ENDPROC(_key_expansion_128)
1532 ENDPROC(_key_expansion_256a)
1533
1534 .align 4
     /*
      * _key_expansion_192a:         internal ABI
      *      192-bit key expansion step that emits two 16-byte schedule
      *      entries (0x20 bytes).
      * input:
      *      %xmm0:          first four key dwords
      *      %xmm2:          remaining two key dwords in its low qword
      *      %xmm1:          AESKEYGENASSIST result prepared by the caller
      *      %xmm4:          dword 0 must be zero
      *      TKEYP:          schedule position to fill
      * output:
      *      %xmm0, %xmm2:   updated key dwords
      *      (TKEYP):        {old %xmm2 low qword, new %xmm0 low qword}
      *      0x10(TKEYP):    {new %xmm0 high qword, new %xmm2 low qword}
      *      TKEYP:          advanced by 0x20
      * changed:
      *      %xmm1, %xmm3, %xmm4, %xmm5, %xmm6
      */
1535 _key_expansion_192a:
1536         pshufd $0b01010101, %xmm1, %xmm1        # broadcast dword 1 of the assist result
1537         shufps $0b00010000, %xmm0, %xmm4        # relies on xmm4 dword 0 being zero
1538         pxor %xmm4, %xmm0
1539         shufps $0b10001100, %xmm0, %xmm4
1540         pxor %xmm4, %xmm0                       # each dword of xmm0 is now XORed with every lower dword
1541         pxor %xmm1, %xmm0
1542
1543         movaps %xmm2, %xmm5
1544         movaps %xmm2, %xmm6                     # keep the old xmm2 for the first store below
1545         pslldq $4, %xmm5                        # old xmm2 shifted up by one dword
1546         pshufd $0b11111111, %xmm0, %xmm3        # broadcast dword 3 of the new xmm0
1547         pxor %xmm3, %xmm2
1548         pxor %xmm5, %xmm2
1549
1550         movaps %xmm0, %xmm1
1551         shufps $0b01000100, %xmm0, %xmm6        # xmm6 = {old xmm2 low qword, new xmm0 low qword}
1552         movaps %xmm6, (TKEYP)
1553         shufps $0b01001110, %xmm2, %xmm1        # xmm1 = {new xmm0 high qword, new xmm2 low qword}
1554         movaps %xmm1, 0x10(TKEYP)
1555         add $0x20, TKEYP
1556         ret
1557 ENDPROC(_key_expansion_192a)
1558
1559 .align 4
     /*
      * _key_expansion_192b:         internal ABI
      *      192-bit key expansion step that emits one 16-byte schedule
      *      entry.
      * input:
      *      %xmm0:          four key dwords to expand
      *      %xmm2:          two further key dwords in its low qword
      *      %xmm1:          AESKEYGENASSIST result prepared by the caller
      *      %xmm4:          dword 0 must be zero
      *      TKEYP:          schedule slot to fill
      * output:
      *      %xmm0:          updated key dwords (also stored at (TKEYP))
      *      %xmm2:          updated key dwords
      *      TKEYP:          advanced by 0x10
      * changed:
      *      %xmm1, %xmm3, %xmm4, %xmm5
      */
1560 _key_expansion_192b:
1561         pshufd $0b01010101, %xmm1, %xmm1        # broadcast dword 1 of the assist result
1562         shufps $0b00010000, %xmm0, %xmm4        # relies on xmm4 dword 0 being zero
1563         pxor %xmm4, %xmm0
1564         shufps $0b10001100, %xmm0, %xmm4
1565         pxor %xmm4, %xmm0                       # each dword of xmm0 is now XORed with every lower dword
1566         pxor %xmm1, %xmm0
1567
1568         movaps %xmm2, %xmm5
1569         pslldq $4, %xmm5                        # old xmm2 shifted up by one dword
1570         pshufd $0b11111111, %xmm0, %xmm3        # broadcast dword 3 of the new xmm0
1571         pxor %xmm3, %xmm2
1572         pxor %xmm5, %xmm2
1573
1574         movaps %xmm0, (TKEYP)                   # only xmm0 is stored by this variant
1575         add $0x10, TKEYP
1576         ret
1577 ENDPROC(_key_expansion_192b)
1578
1579 .align 4
     /*
      * _key_expansion_256b:         internal ABI
      *      Same folding as _key_expansion_256a, but applied to %xmm2 and
      *      using dword 2 of the assist result.
      * input:
      *      %xmm2:          key words to expand
      *      %xmm1:          AESKEYGENASSIST result prepared by the caller
      *      %xmm4:          dword 0 must be zero
      *      TKEYP:          schedule slot to fill
      * output:
      *      %xmm2:          new round key (also stored at (TKEYP))
      *      TKEYP:          advanced by 0x10
      * changed:
      *      %xmm1, %xmm4
      */
1580 _key_expansion_256b:
1581         pshufd $0b10101010, %xmm1, %xmm1        # broadcast dword 2 of the assist result
1582         shufps $0b00010000, %xmm2, %xmm4        # relies on xmm4 dword 0 being zero
1583         pxor %xmm4, %xmm2
1584         shufps $0b10001100, %xmm2, %xmm4
1585         pxor %xmm4, %xmm2                       # each dword of xmm2 is now XORed with every lower dword
1586         pxor %xmm1, %xmm2
1587         movaps %xmm2, (TKEYP)                   # append the new round key
1588         add $0x10, TKEYP
1589         ret
1590 ENDPROC(_key_expansion_256b)
1591
1592 /*
1593  * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1594  *                   unsigned int key_len)
      *
      * Expand in_key into the encryption round keys at offset 0 of ctx, store
      * key_len at offset 480 of ctx, then build the decryption round keys at
      * offset 240: the encryption keys in reverse order, with AESIMC applied
      * to every key except the first and the last.
      *
      * Only the low byte of key_len is compared: values below 24 take the
      * 128-bit path, 24 the 192-bit path, anything else the 256-bit path.
      * No other validation is done here.  AREG is cleared before returning.
1595  */
1596 ENTRY(aesni_set_key)
1597         FRAME_BEGIN
1598 #ifndef __x86_64__
1599         pushl KEYP
1600         movl (FRAME_OFFSET+8)(%esp), KEYP       # ctx
1601         movl (FRAME_OFFSET+12)(%esp), UKEYP     # in_key
1602         movl (FRAME_OFFSET+16)(%esp), %edx      # key_len
1603 #endif
1604         movups (UKEYP), %xmm0           # user key (first 16 bytes)
1605         movaps %xmm0, (KEYP)            # first round key is the raw key; aligned store into ctx
1606         lea 0x10(KEYP), TKEYP           # key addr
1607         movl %edx, 480(KEYP)            # record key_len at ctx + 480
1608         pxor %xmm4, %xmm4               # xmm4 is assumed 0 in _key_expansion_x
1609         cmp $24, %dl                    # only the low byte of key_len is examined
1610         jb .Lenc_key128                 # below 24: 128-bit schedule
1611         je .Lenc_key192                 # exactly 24: 192-bit schedule; otherwise fall into 256-bit
1612         movups 0x10(UKEYP), %xmm2       # other user key
1613         movaps %xmm2, (TKEYP)           # second round key is the raw upper half
1614         add $0x10, TKEYP
1615         AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
1616         call _key_expansion_256a
1617         AESKEYGENASSIST 0x1 %xmm0 %xmm1
1618         call _key_expansion_256b
1619         AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
1620         call _key_expansion_256a
1621         AESKEYGENASSIST 0x2 %xmm0 %xmm1
1622         call _key_expansion_256b
1623         AESKEYGENASSIST 0x4 %xmm2 %xmm1         # round 3
1624         call _key_expansion_256a
1625         AESKEYGENASSIST 0x4 %xmm0 %xmm1
1626         call _key_expansion_256b
1627         AESKEYGENASSIST 0x8 %xmm2 %xmm1         # round 4
1628         call _key_expansion_256a
1629         AESKEYGENASSIST 0x8 %xmm0 %xmm1
1630         call _key_expansion_256b
1631         AESKEYGENASSIST 0x10 %xmm2 %xmm1        # round 5
1632         call _key_expansion_256a
1633         AESKEYGENASSIST 0x10 %xmm0 %xmm1
1634         call _key_expansion_256b
1635         AESKEYGENASSIST 0x20 %xmm2 %xmm1        # round 6
1636         call _key_expansion_256a
1637         AESKEYGENASSIST 0x20 %xmm0 %xmm1
1638         call _key_expansion_256b
1639         AESKEYGENASSIST 0x40 %xmm2 %xmm1        # round 7
1640         call _key_expansion_256a
1641         jmp .Ldec_key
1642 .Lenc_key192:
1643         movq 0x10(UKEYP), %xmm2         # other user key
1644         AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
1645         call _key_expansion_192a
1646         AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
1647         call _key_expansion_192b
1648         AESKEYGENASSIST 0x4 %xmm2 %xmm1         # round 3
1649         call _key_expansion_192a
1650         AESKEYGENASSIST 0x8 %xmm2 %xmm1         # round 4
1651         call _key_expansion_192b
1652         AESKEYGENASSIST 0x10 %xmm2 %xmm1        # round 5
1653         call _key_expansion_192a
1654         AESKEYGENASSIST 0x20 %xmm2 %xmm1        # round 6
1655         call _key_expansion_192b
1656         AESKEYGENASSIST 0x40 %xmm2 %xmm1        # round 7
1657         call _key_expansion_192a
1658         AESKEYGENASSIST 0x80 %xmm2 %xmm1        # round 8
1659         call _key_expansion_192b
1660         jmp .Ldec_key
1661 .Lenc_key128:
1662         AESKEYGENASSIST 0x1 %xmm0 %xmm1         # round 1
1663         call _key_expansion_128
1664         AESKEYGENASSIST 0x2 %xmm0 %xmm1         # round 2
1665         call _key_expansion_128
1666         AESKEYGENASSIST 0x4 %xmm0 %xmm1         # round 3
1667         call _key_expansion_128
1668         AESKEYGENASSIST 0x8 %xmm0 %xmm1         # round 4
1669         call _key_expansion_128
1670         AESKEYGENASSIST 0x10 %xmm0 %xmm1        # round 5
1671         call _key_expansion_128
1672         AESKEYGENASSIST 0x20 %xmm0 %xmm1        # round 6
1673         call _key_expansion_128
1674         AESKEYGENASSIST 0x40 %xmm0 %xmm1        # round 7
1675         call _key_expansion_128
1676         AESKEYGENASSIST 0x80 %xmm0 %xmm1        # round 8
1677         call _key_expansion_128
1678         AESKEYGENASSIST 0x1b %xmm0 %xmm1        # round 9
1679         call _key_expansion_128
1680         AESKEYGENASSIST 0x36 %xmm0 %xmm1        # round 10
1681         call _key_expansion_128
1682 .Ldec_key:
1683         sub $0x10, TKEYP                # TKEYP = last encryption round key
1684         movaps (KEYP), %xmm0            # first encryption round key
1685         movaps (TKEYP), %xmm1           # last encryption round key
1686         movaps %xmm0, 240(TKEYP)        # first enc key becomes the last dec key, unchanged
1687         movaps %xmm1, 240(KEYP)         # last enc key becomes the first dec key, unchanged
1688         add $0x10, KEYP                 # KEYP walks the inner enc keys upwards
1689         lea 240-16(TKEYP), UKEYP        # UKEYP walks the dec keys downwards
1690 .align 4
1691 .Ldec_key_loop:
1692         movaps (KEYP), %xmm0
1693         AESIMC %xmm0 %xmm1              # inner keys go through AESIMC
1694         movaps %xmm1, (UKEYP)
1695         add $0x10, KEYP
1696         sub $0x10, UKEYP
1697         cmp TKEYP, KEYP
1698         jb .Ldec_key_loop               # stop once KEYP reaches the last enc key
1699         xor AREG, AREG                  # AREG = 0
1700 #ifndef __x86_64__
1701         popl KEYP
1702 #endif
1703         FRAME_END
1704         ret
1705 ENDPROC(aesni_set_key)
1706
1707 /*
1708  * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
      *
      * Encrypt the single 16-byte block at src with the key schedule at the
      * start of ctx and store the result at dst.  Neither src nor dst needs
      * to be aligned (unaligned loads/stores are used).
1709  */
1710 ENTRY(aesni_enc)
1711         FRAME_BEGIN
1712 #ifndef __x86_64__
1713         pushl KEYP
1714         pushl KLEN
1715         movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx
1716         movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
1717         movl (FRAME_OFFSET+20)(%esp), INP       # src
1718 #endif
1719         movl 480(KEYP), KLEN            # key length
1720         movups (INP), STATE             # input
1721         call _aesni_enc1                # encrypts STATE in place
1722         movups STATE, (OUTP)            # output
1723 #ifndef __x86_64__
1724         popl KLEN
1725         popl KEYP
1726 #endif
1727         FRAME_END
1728         ret
1729 ENDPROC(aesni_enc)
1730
1731 /*
1732  * _aesni_enc1:         internal ABI
      *      Encrypt one block.  The key length selects 10 rounds (below 24),
      *      12 rounds (exactly 24) or 14 rounds (above 24).
1733  * input:
1734  *      KEYP:           key struct pointer
1735  *      KLEN:           key length (compared against 24)
1736  *      STATE:          initial state (input)
1737  * output:
1738  *      STATE:          final state (output)
1739  * changed:
1740  *      KEY
1741  *      TKEYP (T1)
1742  */
1743 .align 4
1744 _aesni_enc1:
1745         movaps (KEYP), KEY              # key
1746         mov KEYP, TKEYP
1747         pxor KEY, STATE         # round 0
1748         add $0x30, TKEYP                # bias TKEYP so the shared tail uses fixed offsets
1749         cmp $24, KLEN
1750         jb .Lenc128                     # key length < 24: 10 rounds
1751         lea 0x20(TKEYP), TKEYP          # lea keeps the cmp flags for the je below
1752         je .Lenc192                     # key length == 24: 12 rounds
1753         add $0x20, TKEYP                # otherwise 14 rounds
1754         movaps -0x60(TKEYP), KEY
1755         AESENC KEY STATE
1756         movaps -0x50(TKEYP), KEY
1757         AESENC KEY STATE
1758 .align 4
1759 .Lenc192:
1760         movaps -0x40(TKEYP), KEY
1761         AESENC KEY STATE
1762         movaps -0x30(TKEYP), KEY
1763         AESENC KEY STATE
1764 .align 4
1765 .Lenc128:
1766         movaps -0x20(TKEYP), KEY
1767         AESENC KEY STATE
1768         movaps -0x10(TKEYP), KEY
1769         AESENC KEY STATE
1770         movaps (TKEYP), KEY
1771         AESENC KEY STATE
1772         movaps 0x10(TKEYP), KEY
1773         AESENC KEY STATE
1774         movaps 0x20(TKEYP), KEY
1775         AESENC KEY STATE
1776         movaps 0x30(TKEYP), KEY
1777         AESENC KEY STATE
1778         movaps 0x40(TKEYP), KEY
1779         AESENC KEY STATE
1780         movaps 0x50(TKEYP), KEY
1781         AESENC KEY STATE
1782         movaps 0x60(TKEYP), KEY
1783         AESENC KEY STATE
1784         movaps 0x70(TKEYP), KEY
1785         AESENCLAST KEY STATE            # last round
1786         ret
1787 ENDPROC(_aesni_enc1)
1788
1789 /*
1790  * _aesni_enc4: internal ABI
      *      Encrypt four independent blocks with the same key schedule,
      *      applying each round key to all four states before loading the
      *      next.  The key length selects 10 rounds (below 24), 12 rounds
      *      (exactly 24) or 14 rounds (above 24).
1791  * input:
1792  *      KEYP:           key struct pointer
1793  *      KLEN:           key length (compared against 24)
1794  *      STATE1:         initial state (input)
1795  *      STATE2
1796  *      STATE3
1797  *      STATE4
1798  * output:
1799  *      STATE1:         final state (output)
1800  *      STATE2
1801  *      STATE3
1802  *      STATE4
1803  * changed:
1804  *      KEY
1805  *      TKEYP (T1)
1806  */
1807 .align 4
1808 _aesni_enc4:
1809         movaps (KEYP), KEY              # key
1810         mov KEYP, TKEYP
1811         pxor KEY, STATE1                # round 0
1812         pxor KEY, STATE2
1813         pxor KEY, STATE3
1814         pxor KEY, STATE4
1815         add $0x30, TKEYP                # bias TKEYP so the shared tail uses fixed offsets
1816         cmp $24, KLEN
1817         jb .L4enc128                    # key length < 24: 10 rounds
1818         lea 0x20(TKEYP), TKEYP          # lea keeps the cmp flags for the je below
1819         je .L4enc192                    # key length == 24: 12 rounds
1820         add $0x20, TKEYP                # otherwise 14 rounds
1821         movaps -0x60(TKEYP), KEY
1822         AESENC KEY STATE1
1823         AESENC KEY STATE2
1824         AESENC KEY STATE3
1825         AESENC KEY STATE4
1826         movaps -0x50(TKEYP), KEY
1827         AESENC KEY STATE1
1828         AESENC KEY STATE2
1829         AESENC KEY STATE3
1830         AESENC KEY STATE4
1831 #.align 4
1832 .L4enc192:
1833         movaps -0x40(TKEYP), KEY
1834         AESENC KEY STATE1
1835         AESENC KEY STATE2
1836         AESENC KEY STATE3
1837         AESENC KEY STATE4
1838         movaps -0x30(TKEYP), KEY
1839         AESENC KEY STATE1
1840         AESENC KEY STATE2
1841         AESENC KEY STATE3
1842         AESENC KEY STATE4
1843 #.align 4
1844 .L4enc128:
1845         movaps -0x20(TKEYP), KEY
1846         AESENC KEY STATE1
1847         AESENC KEY STATE2
1848         AESENC KEY STATE3
1849         AESENC KEY STATE4
1850         movaps -0x10(TKEYP), KEY
1851         AESENC KEY STATE1
1852         AESENC KEY STATE2
1853         AESENC KEY STATE3
1854         AESENC KEY STATE4
1855         movaps (TKEYP), KEY
1856         AESENC KEY STATE1
1857         AESENC KEY STATE2
1858         AESENC KEY STATE3
1859         AESENC KEY STATE4
1860         movaps 0x10(TKEYP), KEY
1861         AESENC KEY STATE1
1862         AESENC KEY STATE2
1863         AESENC KEY STATE3
1864         AESENC KEY STATE4
1865         movaps 0x20(TKEYP), KEY
1866         AESENC KEY STATE1
1867         AESENC KEY STATE2
1868         AESENC KEY STATE3
1869         AESENC KEY STATE4
1870         movaps 0x30(TKEYP), KEY
1871         AESENC KEY STATE1
1872         AESENC KEY STATE2
1873         AESENC KEY STATE3
1874         AESENC KEY STATE4
1875         movaps 0x40(TKEYP), KEY
1876         AESENC KEY STATE1
1877         AESENC KEY STATE2
1878         AESENC KEY STATE3
1879         AESENC KEY STATE4
1880         movaps 0x50(TKEYP), KEY
1881         AESENC KEY STATE1
1882         AESENC KEY STATE2
1883         AESENC KEY STATE3
1884         AESENC KEY STATE4
1885         movaps 0x60(TKEYP), KEY
1886         AESENC KEY STATE1
1887         AESENC KEY STATE2
1888         AESENC KEY STATE3
1889         AESENC KEY STATE4
1890         movaps 0x70(TKEYP), KEY
1891         AESENCLAST KEY STATE1           # last round
1892         AESENCLAST KEY STATE2
1893         AESENCLAST KEY STATE3
1894         AESENCLAST KEY STATE4
1895         ret
1896 ENDPROC(_aesni_enc4)
1897
1898 /*
1899  * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
      *
      * Decrypt the single 16-byte block at src with the decryption key
      * schedule at offset 240 of ctx and store the result at dst.  Neither
      * src nor dst needs to be aligned (unaligned loads/stores are used).
1900  */
1901 ENTRY(aesni_dec)
1902         FRAME_BEGIN
1903 #ifndef __x86_64__
1904         pushl KEYP
1905         pushl KLEN
1906         movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx
1907         movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
1908         movl (FRAME_OFFSET+20)(%esp), INP       # src
1909 #endif
1910         mov 480(KEYP), KLEN             # key length
1911         add $240, KEYP                  # switch to the decryption schedule at ctx + 240
1912         movups (INP), STATE             # input
1913         call _aesni_dec1                # decrypts STATE in place
1914         movups STATE, (OUTP)            #output
1915 #ifndef __x86_64__
1916         popl KLEN
1917         popl KEYP
1918 #endif
1919         FRAME_END
1920         ret
1921 ENDPROC(aesni_dec)
1922
1923 /*
1924  * _aesni_dec1:         internal ABI
      *      Decrypt one block.  The key length selects 10 rounds (below 24),
      *      12 rounds (exactly 24) or 14 rounds (above 24).
1925  * input:
1926  *      KEYP:           key struct pointer
1927  *      KLEN:           key length
1928  *      STATE:          initial state (input)
1929  * output:
1930  *      STATE:          final state (output)
1931  * changed:
1932  *      KEY
1933  *      TKEYP (T1)
1934  */
1935 .align 4
1936 _aesni_dec1:
1937         movaps (KEYP), KEY              # key
1938         mov KEYP, TKEYP
1939         pxor KEY, STATE         # round 0
1940         add $0x30, TKEYP                # bias TKEYP so the shared tail uses fixed offsets
1941         cmp $24, KLEN
1942         jb .Ldec128                     # key length < 24: 10 rounds
1943         lea 0x20(TKEYP), TKEYP          # lea keeps the cmp flags for the je below
1944         je .Ldec192                     # key length == 24: 12 rounds
1945         add $0x20, TKEYP                # otherwise 14 rounds
1946         movaps -0x60(TKEYP), KEY
1947         AESDEC KEY STATE
1948         movaps -0x50(TKEYP), KEY
1949         AESDEC KEY STATE
1950 .align 4
1951 .Ldec192:
1952         movaps -0x40(TKEYP), KEY
1953         AESDEC KEY STATE
1954         movaps -0x30(TKEYP), KEY
1955         AESDEC KEY STATE
1956 .align 4
1957 .Ldec128:
1958         movaps -0x20(TKEYP), KEY
1959         AESDEC KEY STATE
1960         movaps -0x10(TKEYP), KEY
1961         AESDEC KEY STATE
1962         movaps (TKEYP), KEY
1963         AESDEC KEY STATE
1964         movaps 0x10(TKEYP), KEY
1965         AESDEC KEY STATE
1966         movaps 0x20(TKEYP), KEY
1967         AESDEC KEY STATE
1968         movaps 0x30(TKEYP), KEY
1969         AESDEC KEY STATE
1970         movaps 0x40(TKEYP), KEY
1971         AESDEC KEY STATE
1972         movaps 0x50(TKEYP), KEY
1973         AESDEC KEY STATE
1974         movaps 0x60(TKEYP), KEY
1975         AESDEC KEY STATE
1976         movaps 0x70(TKEYP), KEY
1977         AESDECLAST KEY STATE            # last round
1978         ret
1979 ENDPROC(_aesni_dec1)
1980
1981 /*
1982  * _aesni_dec4: internal ABI
      *      Decrypt four independent blocks with the same key schedule,
      *      applying each round key to all four states before loading the
      *      next.  The key length selects 10 rounds (below 24), 12 rounds
      *      (exactly 24) or 14 rounds (above 24).
1983  * input:
1984  *      KEYP:           key struct pointer
1985  *      KLEN:           key length
1986  *      STATE1:         initial state (input)
1987  *      STATE2
1988  *      STATE3
1989  *      STATE4
1990  * output:
1991  *      STATE1:         final state (output)
1992  *      STATE2
1993  *      STATE3
1994  *      STATE4
1995  * changed:
1996  *      KEY
1997  *      TKEYP (T1)
1998  */
1999 .align 4
2000 _aesni_dec4:
2001         movaps (KEYP), KEY              # key
2002         mov KEYP, TKEYP
2003         pxor KEY, STATE1                # round 0
2004         pxor KEY, STATE2
2005         pxor KEY, STATE3
2006         pxor KEY, STATE4
2007         add $0x30, TKEYP                # bias TKEYP so the shared tail uses fixed offsets
2008         cmp $24, KLEN
2009         jb .L4dec128                    # key length < 24: 10 rounds
2010         lea 0x20(TKEYP), TKEYP          # lea keeps the cmp flags for the je below
2011         je .L4dec192                    # key length == 24: 12 rounds
2012         add $0x20, TKEYP                # otherwise 14 rounds
2013         movaps -0x60(TKEYP), KEY
2014         AESDEC KEY STATE1
2015         AESDEC KEY STATE2
2016         AESDEC KEY STATE3
2017         AESDEC KEY STATE4
2018         movaps -0x50(TKEYP), KEY
2019         AESDEC KEY STATE1
2020         AESDEC KEY STATE2
2021         AESDEC KEY STATE3
2022         AESDEC KEY STATE4
2023 .align 4
2024 .L4dec192:
2025         movaps -0x40(TKEYP), KEY
2026         AESDEC KEY STATE1
2027         AESDEC KEY STATE2
2028         AESDEC KEY STATE3
2029         AESDEC KEY STATE4
2030         movaps -0x30(TKEYP), KEY
2031         AESDEC KEY STATE1
2032         AESDEC KEY STATE2
2033         AESDEC KEY STATE3
2034         AESDEC KEY STATE4
2035 .align 4
2036 .L4dec128:
2037         movaps -0x20(TKEYP), KEY
2038         AESDEC KEY STATE1
2039         AESDEC KEY STATE2
2040         AESDEC KEY STATE3
2041         AESDEC KEY STATE4
2042         movaps -0x10(TKEYP), KEY
2043         AESDEC KEY STATE1
2044         AESDEC KEY STATE2
2045         AESDEC KEY STATE3
2046         AESDEC KEY STATE4
2047         movaps (TKEYP), KEY
2048         AESDEC KEY STATE1
2049         AESDEC KEY STATE2
2050         AESDEC KEY STATE3
2051         AESDEC KEY STATE4
2052         movaps 0x10(TKEYP), KEY
2053         AESDEC KEY STATE1
2054         AESDEC KEY STATE2
2055         AESDEC KEY STATE3
2056         AESDEC KEY STATE4
2057         movaps 0x20(TKEYP), KEY
2058         AESDEC KEY STATE1
2059         AESDEC KEY STATE2
2060         AESDEC KEY STATE3
2061         AESDEC KEY STATE4
2062         movaps 0x30(TKEYP), KEY
2063         AESDEC KEY STATE1
2064         AESDEC KEY STATE2
2065         AESDEC KEY STATE3
2066         AESDEC KEY STATE4
2067         movaps 0x40(TKEYP), KEY
2068         AESDEC KEY STATE1
2069         AESDEC KEY STATE2
2070         AESDEC KEY STATE3
2071         AESDEC KEY STATE4
2072         movaps 0x50(TKEYP), KEY
2073         AESDEC KEY STATE1
2074         AESDEC KEY STATE2
2075         AESDEC KEY STATE3
2076         AESDEC KEY STATE4
2077         movaps 0x60(TKEYP), KEY
2078         AESDEC KEY STATE1
2079         AESDEC KEY STATE2
2080         AESDEC KEY STATE3
2081         AESDEC KEY STATE4
2082         movaps 0x70(TKEYP), KEY
2083         AESDECLAST KEY STATE1           # last round
2084         AESDECLAST KEY STATE2
2085         AESDECLAST KEY STATE3
2086         AESDECLAST KEY STATE4
2087         ret
2088 ENDPROC(_aesni_dec4)
2089
2090 /*
2091  * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2092  *                    size_t len)
      *
      * ECB-encrypt the whole 16-byte blocks of src into dst.  Blocks are
      * processed four at a time while at least 64 bytes remain, then one at
      * a time.  A trailing partial block (len % 16 bytes) is left untouched,
      * and nothing is done when len is below 16.  All length comparisons are
      * unsigned.
2093  */
2094 ENTRY(aesni_ecb_enc)
2095         FRAME_BEGIN
2096 #ifndef __x86_64__
2097         pushl LEN
2098         pushl KEYP
2099         pushl KLEN
2100         movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx
2101         movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
2102         movl (FRAME_OFFSET+24)(%esp), INP       # src
2103         movl (FRAME_OFFSET+28)(%esp), LEN       # len
2104 #endif
2105         test LEN, LEN           # check length
2106         jz .Lecb_enc_ret
2107         mov 480(KEYP), KLEN             # key length
2108         cmp $16, LEN
2109         jb .Lecb_enc_ret                # less than one block
2110         cmp $64, LEN
2111         jb .Lecb_enc_loop1              # fewer than four blocks
2112 .align 4
2113 .Lecb_enc_loop4:
2114         movups (INP), STATE1
2115         movups 0x10(INP), STATE2
2116         movups 0x20(INP), STATE3
2117         movups 0x30(INP), STATE4
2118         call _aesni_enc4
2119         movups STATE1, (OUTP)
2120         movups STATE2, 0x10(OUTP)
2121         movups STATE3, 0x20(OUTP)
2122         movups STATE4, 0x30(OUTP)
2123         sub $64, LEN
2124         add $64, INP
2125         add $64, OUTP
2126         cmp $64, LEN
2127         jae .Lecb_enc_loop4             # unsigned: LEN is a byte count
2128         cmp $16, LEN
2129         jb .Lecb_enc_ret
2130 .align 4
2131 .Lecb_enc_loop1:
2132         movups (INP), STATE1
2133         call _aesni_enc1
2134         movups STATE1, (OUTP)
2135         sub $16, LEN
2136         add $16, INP
2137         add $16, OUTP
2138         cmp $16, LEN
2139         jae .Lecb_enc_loop1             # unsigned: LEN is a byte count
2140 .Lecb_enc_ret:
2141 #ifndef __x86_64__
2142         popl KLEN
2143         popl KEYP
2144         popl LEN
2145 #endif
2146         FRAME_END
2147         ret
2148 ENDPROC(aesni_ecb_enc)
2149
2150 /*
2151  * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2152  *                    size_t len);
      *
      * ECB-decrypt the whole 16-byte blocks of src into dst using the
      * decryption key schedule at offset 240 of ctx.  Blocks are processed
      * four at a time while at least 64 bytes remain, then one at a time.
      * A trailing partial block (len % 16 bytes) is left untouched, and
      * nothing is done when len is below 16.  All length comparisons are
      * unsigned.
2153  */
2154 ENTRY(aesni_ecb_dec)
2155         FRAME_BEGIN
2156 #ifndef __x86_64__
2157         pushl LEN
2158         pushl KEYP
2159         pushl KLEN
2160         movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx
2161         movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
2162         movl (FRAME_OFFSET+24)(%esp), INP       # src
2163         movl (FRAME_OFFSET+28)(%esp), LEN       # len
2164 #endif
2165         test LEN, LEN
2166         jz .Lecb_dec_ret
2167         mov 480(KEYP), KLEN             # key length
2168         add $240, KEYP                  # switch to the decryption schedule
2169         cmp $16, LEN
2170         jb .Lecb_dec_ret                # less than one block
2171         cmp $64, LEN
2172         jb .Lecb_dec_loop1              # fewer than four blocks
2173 .align 4
2174 .Lecb_dec_loop4:
2175         movups (INP), STATE1
2176         movups 0x10(INP), STATE2
2177         movups 0x20(INP), STATE3
2178         movups 0x30(INP), STATE4
2179         call _aesni_dec4
2180         movups STATE1, (OUTP)
2181         movups STATE2, 0x10(OUTP)
2182         movups STATE3, 0x20(OUTP)
2183         movups STATE4, 0x30(OUTP)
2184         sub $64, LEN
2185         add $64, INP
2186         add $64, OUTP
2187         cmp $64, LEN
2188         jae .Lecb_dec_loop4             # unsigned: LEN is a byte count
2189         cmp $16, LEN
2190         jb .Lecb_dec_ret
2191 .align 4
2192 .Lecb_dec_loop1:
2193         movups (INP), STATE1
2194         call _aesni_dec1
2195         movups STATE1, (OUTP)
2196         sub $16, LEN
2197         add $16, INP
2198         add $16, OUTP
2199         cmp $16, LEN
2200         jae .Lecb_dec_loop1             # unsigned: LEN is a byte count
2201 .Lecb_dec_ret:
2202 #ifndef __x86_64__
2203         popl KLEN
2204         popl KEYP
2205         popl LEN
2206 #endif
2207         FRAME_END
2208         ret
2209 ENDPROC(aesni_ecb_dec)
2210
2211 /*
2212  * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2213  *                    size_t len, u8 *iv)
      *
      * CBC-encrypt the whole 16-byte blocks of src into dst, chaining from
      * the 16 bytes at iv.  On return iv holds the last ciphertext block so
      * a later call can continue the chain.  A trailing partial block
      * (len % 16 bytes) is ignored; when len is below 16 nothing is done and
      * iv is left unchanged.  All length comparisons are unsigned.
2214  */
2215 ENTRY(aesni_cbc_enc)
2216         FRAME_BEGIN
2217 #ifndef __x86_64__
2218         pushl IVP
2219         pushl LEN
2220         pushl KEYP
2221         pushl KLEN
2222         movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2223         movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2224         movl (FRAME_OFFSET+28)(%esp), INP       # src
2225         movl (FRAME_OFFSET+32)(%esp), LEN       # len
2226         movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2227 #endif
2228         cmp $16, LEN
2229         jb .Lcbc_enc_ret                # less than one block
2230         mov 480(KEYP), KLEN             # key length
2231         movups (IVP), STATE     # load iv as initial state
2232 .align 4
2233 .Lcbc_enc_loop:
2234         movups (INP), IN        # load input
2235         pxor IN, STATE          # chain: previous ciphertext (or iv) XOR plaintext
2236         call _aesni_enc1
2237         movups STATE, (OUTP)    # store output
2238         sub $16, LEN
2239         add $16, INP
2240         add $16, OUTP
2241         cmp $16, LEN
2242         jae .Lcbc_enc_loop      # unsigned: LEN is a byte count
2243         movups STATE, (IVP)     # last ciphertext block becomes the next iv
2244 .Lcbc_enc_ret:
2245 #ifndef __x86_64__
2246         popl KLEN
2247         popl KEYP
2248         popl LEN
2249         popl IVP
2250 #endif
2251         FRAME_END
2252         ret
2253 ENDPROC(aesni_cbc_enc)
2254
2255 /*
2256  * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2257  *                    size_t len, u8 *iv)
      *
      * CBC-decrypt the whole 16-byte blocks of src into dst using the
      * decryption key schedule at offset 240 of ctx, chaining from the
      * 16 bytes at iv.  Blocks are processed four at a time while at least
      * 64 bytes remain, then one at a time.  On return iv holds the last
      * ciphertext block consumed.  A trailing partial block (len % 16 bytes)
      * is ignored; when len is below 16 nothing is done and iv is left
      * unchanged.  All length comparisons are unsigned.
      *
      * The 32-bit build has only IN1/IN2 available for saved ciphertext, so
      * it reloads the first two input blocks after the cipher call; this is
      * done before any output is stored.
2258  */
2259 ENTRY(aesni_cbc_dec)
2260         FRAME_BEGIN
2261 #ifndef __x86_64__
2262         pushl IVP
2263         pushl LEN
2264         pushl KEYP
2265         pushl KLEN
2266         movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2267         movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2268         movl (FRAME_OFFSET+28)(%esp), INP       # src
2269         movl (FRAME_OFFSET+32)(%esp), LEN       # len
2270         movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2271 #endif
2272         cmp $16, LEN
2273         jb .Lcbc_dec_just_ret           # less than one block: iv is not written back
2274         mov 480(KEYP), KLEN             # key length
2275         add $240, KEYP                  # switch to the decryption schedule
2276         movups (IVP), IV
2277         cmp $64, LEN
2278         jb .Lcbc_dec_loop1              # fewer than four blocks
2279 .align 4
2280 .Lcbc_dec_loop4:
2281         movups (INP), IN1
2282         movaps IN1, STATE1
2283         movups 0x10(INP), IN2
2284         movaps IN2, STATE2
2285 #ifdef __x86_64__
2286         movups 0x20(INP), IN3
2287         movaps IN3, STATE3
2288         movups 0x30(INP), IN4
2289         movaps IN4, STATE4
2290 #else
2291         movups 0x20(INP), IN1           # IN1/IN2 are reused for blocks 2 and 3
2292         movaps IN1, STATE3
2293         movups 0x30(INP), IN2
2294         movaps IN2, STATE4
2295 #endif
2296         call _aesni_dec4
2297         pxor IV, STATE1                 # block 0 chains from the incoming iv
2298 #ifdef __x86_64__
2299         pxor IN1, STATE2
2300         pxor IN2, STATE3
2301         pxor IN3, STATE4
2302         movaps IN4, IV                  # last ciphertext block is the next iv
2303 #else
2304         pxor IN1, STATE4                # IN1 still holds ciphertext block 2
2305         movaps IN2, IV                  # IN2 still holds ciphertext block 3
2306         movups (INP), IN1               # reload ciphertext block 0
2307         pxor IN1, STATE2
2308         movups 0x10(INP), IN2           # reload ciphertext block 1
2309         pxor IN2, STATE3
2310 #endif
2311         movups STATE1, (OUTP)
2312         movups STATE2, 0x10(OUTP)
2313         movups STATE3, 0x20(OUTP)
2314         movups STATE4, 0x30(OUTP)
2315         sub $64, LEN
2316         add $64, INP
2317         add $64, OUTP
2318         cmp $64, LEN
2319         jae .Lcbc_dec_loop4             # unsigned: LEN is a byte count
2320         cmp $16, LEN
2321         jb .Lcbc_dec_ret
2322 .align 4
2323 .Lcbc_dec_loop1:
2324         movups (INP), IN
2325         movaps IN, STATE
2326         call _aesni_dec1
2327         pxor IV, STATE
2328         movups STATE, (OUTP)
2329         movaps IN, IV                   # this ciphertext block chains into the next one
2330         sub $16, LEN
2331         add $16, INP
2332         add $16, OUTP
2333         cmp $16, LEN
2334         jae .Lcbc_dec_loop1             # unsigned: LEN is a byte count
2335 .Lcbc_dec_ret:
2336         movups IV, (IVP)                # hand the chaining value back to the caller
2337 .Lcbc_dec_just_ret:
2338 #ifndef __x86_64__
2339         popl KLEN
2340         popl KEYP
2341         popl LEN
2342         popl IVP
2343 #endif
2344         FRAME_END
2345         ret
2346 ENDPROC(aesni_cbc_dec)
2347
2348 #ifdef __x86_64__
2349 .pushsection .rodata
2350 .align 16                       # loaded with movaps, which needs a 16-byte aligned operand
2351 .Lbswap_mask:
2352         .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0      # byte shuffle control: reverses all 16 bytes
2353 .popsection
2354
2355 /*
2356  * _aesni_inc_init:     internal ABI
2357  *      setup registers used by _aesni_inc
      *      (x86_64 only; the byte-swap mask is loaded RIP-relative)
2358  * input:
2359  *      IV
2360  * output:
2361  *      CTR:    == IV, in little endian
2362  *      TCTR_LOW: == lower qword of CTR
2363  *      INC:    == 1, in little endian
2364  *      BSWAP_MASK == endian swapping mask
2365  */
2366 .align 4
2367 _aesni_inc_init:
2368         movaps .Lbswap_mask(%rip), BSWAP_MASK   # RIP-relative: shorter and position independent
2369         movaps IV, CTR
2370         PSHUFB_XMM BSWAP_MASK CTR               # byte-reverse the copy of IV
2371         mov $1, TCTR_LOW
2372         MOVQ_R64_XMM TCTR_LOW INC               # INC = 1
2373         MOVQ_R64_XMM CTR TCTR_LOW               # TCTR_LOW = lower qword of CTR
2374         ret
2375 ENDPROC(_aesni_inc_init)
2376
2377 /*
2378  * _aesni_inc:          internal ABI
2379  *      Increase IV by 1, IV is in big endian
      *      The 128-bit add is done as a 64-bit paddq on CTR; the general
      *      purpose copy TCTR_LOW is incremented in parallel only to detect
      *      a carry out of the low qword, which is then added to the high
      *      qword.
2380  * input:
2381  *      IV
2382  *      CTR:    == IV, in little endian
2383  *      TCTR_LOW: == lower qword of CTR
2384  *      INC:    == 1, in little endian
2385  *      BSWAP_MASK == endian swapping mask
2386  * output:
2387  *      IV:     Increase by 1
2388  * changed:
2389  *      CTR:    == output IV, in little endian
2390  *      TCTR_LOW: == lower qword of CTR
2391  */
2392 .align 4
2393 _aesni_inc:
2394         paddq INC, CTR                  # low qword of CTR += 1 (no carry between qwords)
2395         add $1, TCTR_LOW                # sets CF when the low qword wraps to zero
2396         jnc .Linc_low
2397         pslldq $8, INC                  # move the 1 into the high qword
2398         paddq INC, CTR                  # propagate the carry
2399         psrldq $8, INC                  # restore INC == 1
2400 .Linc_low:
2401         movaps CTR, IV
2402         PSHUFB_XMM BSWAP_MASK IV        # back to big endian
2403         ret
2404 ENDPROC(_aesni_inc)
2405
2406 /*
2407  * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2408  *                    size_t len, u8 *iv)
      *
      * CTR-mode transform of the whole 16-byte blocks of src into dst
      * (x86_64 only).  Each block is XORed with the encryption of the
      * current counter, and the counter is incremented by one per block via
      * _aesni_inc.  Blocks are processed four at a time while at least
      * 64 bytes remain, then one at a time.  On return iv holds the counter
      * for the next block.  A trailing partial block (len % 16 bytes) is
      * ignored; when len is below 16 nothing is done and iv is left
      * unchanged.  All length comparisons are unsigned.
2409  */
2410 ENTRY(aesni_ctr_enc)
2411         FRAME_BEGIN
2412         cmp $16, LEN
2413         jb .Lctr_enc_just_ret           # less than one block: iv is not written back
2414         mov 480(KEYP), KLEN             # key length
2415         movups (IVP), IV
2416         call _aesni_inc_init
2417         cmp $64, LEN
2418         jb .Lctr_enc_loop1              # fewer than four blocks
2419 .align 4
2420 .Lctr_enc_loop4:
2421         movaps IV, STATE1               # counter for block 0
2422         call _aesni_inc
2423         movups (INP), IN1
2424         movaps IV, STATE2               # counter for block 1
2425         call _aesni_inc
2426         movups 0x10(INP), IN2
2427         movaps IV, STATE3               # counter for block 2
2428         call _aesni_inc
2429         movups 0x20(INP), IN3
2430         movaps IV, STATE4               # counter for block 3
2431         call _aesni_inc
2432         movups 0x30(INP), IN4
2433         call _aesni_enc4                # encrypt the four counters
2434         pxor IN1, STATE1
2435         movups STATE1, (OUTP)
2436         pxor IN2, STATE2
2437         movups STATE2, 0x10(OUTP)
2438         pxor IN3, STATE3
2439         movups STATE3, 0x20(OUTP)
2440         pxor IN4, STATE4
2441         movups STATE4, 0x30(OUTP)
2442         sub $64, LEN
2443         add $64, INP
2444         add $64, OUTP
2445         cmp $64, LEN
2446         jae .Lctr_enc_loop4             # unsigned: LEN is a byte count
2447         cmp $16, LEN
2448         jb .Lctr_enc_ret
2449 .align 4
2450 .Lctr_enc_loop1:
2451         movaps IV, STATE
2452         call _aesni_inc
2453         movups (INP), IN
2454         call _aesni_enc1
2455         pxor IN, STATE
2456         movups STATE, (OUTP)
2457         sub $16, LEN
2458         add $16, INP
2459         add $16, OUTP
2460         cmp $16, LEN
2461         jae .Lctr_enc_loop1             # unsigned: LEN is a byte count
2462 .Lctr_enc_ret:
2463         movups IV, (IVP)                # hand the next counter back to the caller
2464 .Lctr_enc_just_ret:
2465         FRAME_END
2466         ret
2467 ENDPROC(aesni_ctr_enc)
2468
2469 /*
2470  * _aesni_gf128mul_x_ble:               internal ABI
2471  *      Multiply in GF(2^128) for XTS IVs
      *      Both qwords of IV are doubled with paddq.  The bits shifted out
      *      of each qword are recovered from a shuffled copy of the old IV:
      *      psrad turns the relevant sign bits into all-ones/all-zero dword
      *      masks, which are ANDed with GF128MUL_MASK and XORed back in.
      *      NOTE(review): the dword layout of GF128MUL_MASK is defined
      *      outside this section - confirm it matches the 0x13 shuffle.
2472  * input:
2473  *      IV:     current IV
2474  *      GF128MUL_MASK == mask with 0x87 and 0x01
2475  * output:
2476  *      IV:     next IV
2477  * changed:
2478  *      CTR:    == temporary value
2479  */
2480 #define _aesni_gf128mul_x_ble() \
2481         pshufd $0x13, IV, CTR; \
2482         paddq IV, IV; \
2483         psrad $31, CTR; \
2484         pand GF128MUL_MASK, CTR; \
2485         pxor CTR, IV;
2486
2487 /*
2488  * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2489  *                       bool enc, u8 *iv)
      *
      * XTS-process exactly eight 16-byte blocks (0x80 bytes) from src into
      * dst (x86_64 only).  enc != 0 selects _aesni_enc4 with the schedule at
      * the start of ctx; enc == 0 selects _aesni_dec4 with the schedule at
      * offset 240.  iv holds the tweak for the first block on entry and the
      * tweak for the block after the eighth on return.
      *
      * Each per-block tweak is parked in the matching dst slot before the
      * cipher call and read back afterwards for the second XOR, so dst is
      * used as scratch space while the function runs.
      *
      * Symbol references are RIP-relative.
2490  */
2491 ENTRY(aesni_xts_crypt8)
2492         FRAME_BEGIN
2493         cmpb $0, %cl                    # enc flag; ZF set means decrypt
2494         movl $0, %ecx                   # mov (unlike xor) keeps the flags from cmpb
2495         movl $240, %r10d
2496         leaq _aesni_enc4(%rip), %r11
2497         leaq _aesni_dec4(%rip), %rax
2498         cmovel %r10d, %ecx              # decrypt: schedule offset 240
2499         cmoveq %rax, %r11               # decrypt: call _aesni_dec4
2500
2501         movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
2502         movups (IVP), IV
2503
2504         mov 480(KEYP), KLEN             # key length
2505         addq %rcx, KEYP                 # select the enc or dec schedule
2506
2507         movdqa IV, STATE1
2508         movdqu 0x00(INP), INC
2509         pxor INC, STATE1                # STATE1 = tweak XOR input block 0
2510         movdqu IV, 0x00(OUTP)           # park tweak 0 in the output slot
2511
2512         _aesni_gf128mul_x_ble()
2513         movdqa IV, STATE2
2514         movdqu 0x10(INP), INC
2515         pxor INC, STATE2
2516         movdqu IV, 0x10(OUTP)
2517
2518         _aesni_gf128mul_x_ble()
2519         movdqa IV, STATE3
2520         movdqu 0x20(INP), INC
2521         pxor INC, STATE3
2522         movdqu IV, 0x20(OUTP)
2523
2524         _aesni_gf128mul_x_ble()
2525         movdqa IV, STATE4
2526         movdqu 0x30(INP), INC
2527         pxor INC, STATE4
2528         movdqu IV, 0x30(OUTP)
2529
2530         CALL_NOSPEC %r11                # cipher blocks 0-3
2531
2532         movdqu 0x00(OUTP), INC          # fetch parked tweak 0
2533         pxor INC, STATE1
2534         movdqu STATE1, 0x00(OUTP)       # final output block 0
2535
2536         _aesni_gf128mul_x_ble()
2537         movdqa IV, STATE1               # STATE1 is free again: start block 4
2538         movdqu 0x40(INP), INC
2539         pxor INC, STATE1
2540         movdqu IV, 0x40(OUTP)
2541
2542         movdqu 0x10(OUTP), INC
2543         pxor INC, STATE2
2544         movdqu STATE2, 0x10(OUTP)
2545
2546         _aesni_gf128mul_x_ble()
2547         movdqa IV, STATE2
2548         movdqu 0x50(INP), INC
2549         pxor INC, STATE2
2550         movdqu IV, 0x50(OUTP)
2551
2552         movdqu 0x20(OUTP), INC
2553         pxor INC, STATE3
2554         movdqu STATE3, 0x20(OUTP)
2555
2556         _aesni_gf128mul_x_ble()
2557         movdqa IV, STATE3
2558         movdqu 0x60(INP), INC
2559         pxor INC, STATE3
2560         movdqu IV, 0x60(OUTP)
2561
2562         movdqu 0x30(OUTP), INC
2563         pxor INC, STATE4
2564         movdqu STATE4, 0x30(OUTP)
2565
2566         _aesni_gf128mul_x_ble()
2567         movdqa IV, STATE4
2568         movdqu 0x70(INP), INC
2569         pxor INC, STATE4
2570         movdqu IV, 0x70(OUTP)
2571
2572         _aesni_gf128mul_x_ble()
2573         movups IV, (IVP)                # tweak for the block after these eight
2574
2575         CALL_NOSPEC %r11                # cipher blocks 4-7
2576
2577         movdqu 0x40(OUTP), INC
2578         pxor INC, STATE1
2579         movdqu STATE1, 0x40(OUTP)
2580
2581         movdqu 0x50(OUTP), INC
2582         pxor INC, STATE2
2583         movdqu STATE2, 0x50(OUTP)
2584
2585         movdqu 0x60(OUTP), INC
2586         pxor INC, STATE3
2587         movdqu STATE3, 0x60(OUTP)
2588
2589         movdqu 0x70(OUTP), INC
2590         pxor INC, STATE4
2591         movdqu STATE4, 0x70(OUTP)
2592
2593         FRAME_END
2594         ret
2595 ENDPROC(aesni_xts_crypt8)
2596
2597 #endif