2 * Implement AES algorithm in Intel AES-NI instructions.
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
23 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
32 #include <linux/linkage.h>
34 #include <asm/frame.h>
35 #include <asm/nospec-branch.h>
38 * The following macros are used to move an (un)aligned 16 byte value to/from
39 * an XMM register. This can be done for either FP or integer values, for FP use
40 * movaps (move aligned packed single) or integer use movdqa (move double quad
41 * aligned). It doesn't make a performance difference which instruction is used
42 * since Nehalem (original Core i7) was released. However, the movaps is a byte
43 * shorter, so that is the one we'll use for now. (same for unaligned).
50 # constants in mergeable sections, linker can reorder and merge
51 .section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
53 .Lgf128mul_x_ble_mask:
54 .octa 0x00000000000000010000000000000087
# POLY: bit-reflected GCM reduction polynomial constant used in GHASH reduction
55 .section .rodata.cst16.POLY, "aM", @progbits, 16
57 POLY: .octa 0xC2000000000000000000000000000001
58 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
60 TWOONE: .octa 0x00000001000000000000000000000001
# SHUF_MASK: PSHUFB control that byte-reverses a 16-byte block (endianness swap)
62 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
64 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
65 .section .rodata.cst16.MASK1, "aM", @progbits, 16
67 MASK1: .octa 0x0000000000000000ffffffffffffffff
68 .section .rodata.cst16.MASK2, "aM", @progbits, 16
70 MASK2: .octa 0xffffffffffffffff0000000000000000
# ONE: 128-bit integer 1, used to increment the CTR counter block
71 .section .rodata.cst16.ONE, "aM", @progbits, 16
73 ONE: .octa 0x00000000000000000000000000000001
74 .section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
# NOTE(review): literal below has 31 hex digits; the top nibble is an implied 0
76 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
77 .section .rodata.cst16.dec, "aM", @progbits, 16
80 .section .rodata.cst16.enc, "aM", @progbits, 16
84 # order of these constants should not change.
85 # more specifically, ALL_F should follow SHIFT_MASK,
86 # and zero should follow ALL_F
87 .section .rodata, "a", @progbits
89 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
90 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
91 .octa 0x00000000000000000000000000000000
# Offsets (in bytes, relative to %rsp) of the precomputed HashKey powers kept
# in the local stack frame while a request is processed.
96 #define STACK_OFFSET 8*3
97 #define HashKey 16*0 // store HashKey <<1 mod poly here
98 #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
99 #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
100 #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
101 #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
102 // bits of HashKey <<1 mod poly here
103 //(for Karatsuba purposes)
104 #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
105 // bits of HashKey^2 <<1 mod poly here
106 // (for Karatsuba purposes)
107 #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
108 // bits of HashKey^3 <<1 mod poly here
109 // (for Karatsuba purposes)
110 #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
111 // bits of HashKey^4 <<1 mod poly here
112 // (for Karatsuba purposes)
113 #define VARIABLE_OFFSET 16*8
# Offsets into the gcm_context_data struct (reached through %arg2).
117 #define InLen (16*1)+8
118 #define PBlockEncKey 16*2
120 #define CurCount 16*4
121 #define PBlockLen 16*5
# Stack-passed arguments 7..11, addressed relative to the saved frame
# pointer in %r14 (set up by the function prologue, not visible here).
129 #define arg7 STACK_OFFSET+8(%r14)
130 #define arg8 STACK_OFFSET+16(%r14)
131 #define arg9 STACK_OFFSET+24(%r14)
132 #define arg10 STACK_OFFSET+32(%r14)
133 #define arg11 STACK_OFFSET+40(%r14)
# Key-length field lives past the 2*15 round-key slots in the AES context.
134 #define keysize 2*15*16(%arg1)
# Both aliases name %xmm10; they are never live at the same time.
151 #define BSWAP_MASK %xmm10
155 #define GF128MUL_MASK %xmm10
189 # states of %xmm registers %xmm6:%xmm15 not saved
190 # all %xmm registers are clobbered
192 sub $VARIABLE_OFFSET, %rsp
205 # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
206 # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
# %r11 holds 0 here (zeroing code elided in this excerpt) except for the
# aad_length store, which records the caller-supplied AAD length.
210 mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
212 mov %r11, InLen(%arg2) # ctx_data.in_length = 0
213 mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
214 mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
217 movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
# Byte-swap the IV into counter layout before saving it as the running counter.
219 movdqa SHUF_MASK(%rip), %xmm2
220 PSHUFB_XMM %xmm2, %xmm0
221 movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
# Load the raw hash subkey H (pointer in %r12) and byte-reflect it for GHASH.
224 movdqu (%r12), %xmm13
225 movdqa SHUF_MASK(%rip), %xmm2
226 PSHUFB_XMM %xmm2, %xmm13
228 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
# Conditional reduction: if the shifted-out top bit was set, XOR in POLY.
240 pshufd $0x24, %xmm1, %xmm2
241 pcmpeqd TWOONE(%rip), %xmm2
242 pand POLY(%rip), %xmm2
# Cache HashKey<<1 mod poly in the stack frame for the bulk GHASH loops.
244 movdqa %xmm13, HashKey(%rsp)
246 CALC_AAD_HASH %xmm13 %xmm0 %xmm1 %xmm2 %xmm3 %xmm4 \
250 # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
251 # struct has been initialized by GCM_INIT.
252 # Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
253 # Clobbers rax, r10-r13, and xmm0-xmm15
254 .macro GCM_ENC_DEC operation
# Resume state: running GHASH in %xmm8, HashKey<<1 mod poly in %xmm13.
255 movdqu AadHash(%arg2), %xmm8
256 movdqu HashKey(%rsp), %xmm13
257 add %arg5, InLen(%arg2)
258 mov %arg5, %r13 # save the number of bytes
259 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
261 # Encrypt/Decrypt first few blocks
# Dispatch on (full blocks mod 4) so the main loop starts 4-block aligned.
264 jz _initial_num_blocks_is_0_\@
266 jb _initial_num_blocks_is_1_\@
267 je _initial_num_blocks_is_2_\@
268 _initial_num_blocks_is_3_\@:
269 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
270 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
272 jmp _initial_blocks_\@
273 _initial_num_blocks_is_2_\@:
274 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
275 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
277 jmp _initial_blocks_\@
278 _initial_num_blocks_is_1_\@:
279 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
280 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
282 jmp _initial_blocks_\@
283 _initial_num_blocks_is_0_\@:
284 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
285 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
288 # Main loop - Encrypt/Decrypt remaining blocks
291 je _zero_cipher_left_\@
293 je _four_cipher_left_\@
295 GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
296 %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
301 _four_cipher_left_\@:
302 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
303 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
304 _zero_cipher_left_\@:
# Persist the running hash and counter so a later update/finalize can resume.
305 movdqu %xmm8, AadHash(%arg2)
306 movdqu %xmm0, CurCount(%arg2)
309 and $15, %r13 # %r13 = arg5 (mod 16)
310 je _multiple_of_16_bytes_\@
312 mov %r13, PBlockLen(%arg2)
314 # Handle the last <16 Byte block separately
315 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
316 movdqu %xmm0, CurCount(%arg2)
317 movdqa SHUF_MASK(%rip), %xmm10
318 PSHUFB_XMM %xmm10, %xmm0
320 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
# Save the keystream block so a follow-up call can finish the partial block.
321 movdqu %xmm0, PBlockEncKey(%arg2)
323 lea (%arg4,%r11,1), %r10
325 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
# SHIFT_MASK/ALL_F table: ALL_F+16-r13 yields a mask of r13 low 0xff bytes.
327 lea ALL_F+16(%rip), %r12
332 pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn)
334 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
335 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
338 movdqa SHUF_MASK(%rip), %xmm10
339 PSHUFB_XMM %xmm10 ,%xmm2
343 movdqa SHUF_MASK(%rip), %xmm10
344 PSHUFB_XMM %xmm10,%xmm0
349 movdqu %xmm8, AadHash(%arg2)
351 # GHASH computation for the last <16 byte block
352 movdqa SHUF_MASK(%rip), %xmm10
353 # shuffle xmm0 back to output as ciphertext
354 PSHUFB_XMM %xmm10, %xmm0
# Store the final r13 output bytes: 8 at a time while possible, then bytewise.
358 MOVQ_R64_XMM %xmm0, %rax
360 jle _less_than_8_bytes_left_\@
361 mov %rax, (%arg3 , %r11, 1)
364 MOVQ_R64_XMM %xmm0, %rax
366 _less_than_8_bytes_left_\@:
367 mov %al, (%arg3, %r11, 1)
371 jne _less_than_8_bytes_left_\@
372 _multiple_of_16_bytes_\@:
375 # GCM_COMPLETE Finishes update of tag of last partial block
376 # Output: Authorization Tag (AUTH_TAG)
377 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
379 movdqu AadHash(%arg2), %xmm8
380 movdqu HashKey(%rsp), %xmm13
# If a partial block is pending, fold it into the hash first.
382 mov PBlockLen(%arg2), %r12
387 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
# Build the final length block len(A)||len(C) (both in bits) and hash it.
390 mov AadLen(%arg2), %r12 # %r12 = aadLen (number of bytes)
391 shl $3, %r12 # convert into number of bits
392 movd %r12d, %xmm15 # len(A) in %xmm15
393 mov InLen(%arg2), %r12
394 shl $3, %r12 # len(C) in bits (*128)
395 MOVQ_R64_XMM %r12, %xmm1
397 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
398 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
400 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
401 # final GHASH computation
402 movdqa SHUF_MASK(%rip), %xmm10
403 PSHUFB_XMM %xmm10, %xmm8
# Tag = E(K, Y0) XOR GHASH; Y0 is the saved pre-counter block.
405 movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0
406 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
409 mov arg10, %r10 # %r10 = authTag
410 mov arg11, %r11 # %r11 = auth_tag_len
416 MOVQ_R64_XMM %xmm0, %rax
442 jmp _return_T_done_\@
449 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
452 * Input: A and B (128-bits each, bit-reflected)
453 * Output: C = A*B*x mod poly, (i.e. >>1 )
454 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
455 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
458 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
# Karatsuba: split each 128-bit operand into two 64-bit halves.
460 pshufd $78, \GH, \TMP2
461 pshufd $78, \HK, \TMP3
462 pxor \GH, \TMP2 # TMP2 = a1+a0
463 pxor \HK, \TMP3 # TMP3 = b1+b0
464 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
465 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
466 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
468 pxor \TMP1, \TMP2 # fold out a1*b1: TMP2 becomes the middle term
470 pslldq $8, \TMP3 # left shift TMP3 2 DWs
471 psrldq $8, \TMP2 # right shift TMP2 2 DWs
473 pxor \TMP2, \TMP1 # TMP1:GH holds the 256-bit product GH*HK
475 # first phase of the reduction
479 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
480 # in order to perform
482 pslld $31, \TMP2 # packed left shift <<31
483 pslld $30, \TMP3 # packed left shift <<30
484 pslld $25, \TMP4 # packed left shift <<25
485 pxor \TMP3, \TMP2 # xor the shifted versions
488 psrldq $4, \TMP5 # right shift TMP5 1 DW
489 pslldq $12, \TMP2 # left shift TMP2 3 DWs
492 # second phase of the reduction
494 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
495 # in order to perform
499 psrld $1,\TMP2 # packed right shift >>1
500 psrld $2,\TMP3 # packed right shift >>2
501 psrld $7,\TMP4 # packed right shift >>7
502 pxor \TMP3,\TMP2 # xor the shifted versions
506 pxor \TMP1, \GH # reduced result is in GH
509 # Reads DLEN bytes starting at DPTR and stores in XMMDst
510 # where 0 < DLEN < 16
511 # Clobbers %rax, DLEN and XMM1
# Never touches memory past DPTR+DLEN: bytes are gathered one at a time
# into %rax and assembled into the XMM destination.
512 .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
516 MOVQ_R64_XMM %rax, \XMMDst
518 jz _done_read_partial_block_\@
# DLEN > 8: collect the trailing DLEN-8 bytes (walking down from the end).
522 mov 7(\DPTR, \DLEN, 1), %al
524 jnz _read_next_byte_\@
525 MOVQ_R64_XMM %rax, \XMM1
528 jmp _done_read_partial_block_\@
# DLEN <= 8: collect all DLEN bytes into the low qword.
531 _read_next_byte_lt8_\@:
533 mov -1(\DPTR, \DLEN, 1), %al
535 jnz _read_next_byte_lt8_\@
536 MOVQ_R64_XMM %rax, \XMMDst
537 _done_read_partial_block_\@:
540 # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
541 # clobbers r10-11, xmm14
542 .macro CALC_AAD_HASH HASHKEY TMP1 TMP2 TMP3 TMP4 TMP5 \
544 MOVADQ SHUF_MASK(%rip), %xmm14
545 mov arg8, %r10 # %r10 = AAD
546 mov arg9, %r11 # %r11 = aadLen
# Full 16-byte AAD blocks: byte-reflect, fold into the hash, multiply by H.
554 PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data
556 GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
560 jge _get_AAD_blocks\@
564 /* read the last <16B of AAD */
569 READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
570 PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data
572 GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
# Persist the AAD hash so GCM_ENC_DEC/GCM_COMPLETE can resume from it.
576 movdqu \TMP6, AadHash(%arg2)
580 * if a = number of total plaintext bytes
582 * num_initial_blocks = b mod 4
583 * encrypt the initial num_initial_blocks blocks and apply ghash on
585 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
587 * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
591 .macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
592 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
593 MOVADQ SHUF_MASK(%rip), %xmm14
595 movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0
597 xor %r11, %r11 # initialise the data pointer offset as zero
598 # start AES for num_initial_blocks blocks
600 movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0
# i selects how many of xmm5..xmm8 are used: i=5 -> 3 blocks, 6 -> 2, 7 -> 1.
602 .if (\i == 5) || (\i == 6) || (\i == 7)
604 MOVADQ ONE(%RIP),\TMP1
605 MOVADQ 0(%arg1),\TMP2
# Per initial block: bump the counter, byte-swap, XOR in round key 0.
607 paddd \TMP1, \XMM0 # INCR Y0
609 movdqa \XMM0, %xmm\index
611 MOVADQ \XMM0, %xmm\index
613 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
614 pxor \TMP2, %xmm\index
# Derive the round count from the key size field.
618 shr $2,%eax # 128->4, 192->6, 256->8
619 add $5,%eax # 128->9, 192->11, 256->13
624 AESENC \TMP1, %xmm\index
628 jnz aes_loop_initial_\@
632 AESENCLAST \TMP1, %xmm\index # Last Round
635 movdqu (%arg4 , %r11, 1), \TMP1
636 pxor \TMP1, %xmm\index
637 movdqu %xmm\index, (%arg3 , %r11, 1)
638 # write back plaintext/ciphertext for num_initial_blocks
642 movdqa \TMP1, %xmm\index
644 PSHUFB_XMM %xmm14, %xmm\index
646 # prepare plaintext/ciphertext for GHASH computation
650 # apply GHASH on num_initial_blocks blocks
# One GHASH_MUL per initial block; which registers are folded depends on i.
654 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
656 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
658 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
661 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
663 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
666 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
669 jl _initial_blocks_done\@
670 # no need for precomputed values
673 * Precomputations for HashKey parallel with encryption of first 4 blocks.
674 * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
# Prepare four counter blocks XMM1..XMM4 for the pipelined encryption below.
676 MOVADQ ONE(%RIP),\TMP1
677 paddd \TMP1, \XMM0 # INCR Y0
679 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
681 paddd \TMP1, \XMM0 # INCR Y0
683 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
685 paddd \TMP1, \XMM0 # INCR Y0
687 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
689 paddd \TMP1, \XMM0 # INCR Y0
691 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
693 MOVADQ 0(%arg1),\TMP1
# HashKey_k = high64(H) XOR low64(H), saved for Karatsuba middle terms.
699 pshufd $78, \TMP3, \TMP1
701 movdqa \TMP1, HashKey_k(%rsp)
702 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
703 # TMP5 = HashKey^2<<1 (mod poly)
704 movdqa \TMP5, HashKey_2(%rsp)
705 # HashKey_2 = HashKey^2<<1 (mod poly)
706 pshufd $78, \TMP5, \TMP1
708 movdqa \TMP1, HashKey_2_k(%rsp)
709 .irpc index, 1234 # do 4 rounds
710 movaps 0x10*\index(%arg1), \TMP1
716 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
717 # TMP5 = HashKey^3<<1 (mod poly)
718 movdqa \TMP5, HashKey_3(%rsp)
719 pshufd $78, \TMP5, \TMP1
721 movdqa \TMP1, HashKey_3_k(%rsp)
722 .irpc index, 56789 # do next 5 rounds
723 movaps 0x10*\index(%arg1), \TMP1
729 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
730 # TMP5 = HashKey^4<<1 (mod poly)
731 movdqa \TMP5, HashKey_4(%rsp)
732 pshufd $78, \TMP5, \TMP1
734 movdqa \TMP1, HashKey_4_k(%rsp)
# Remaining AES rounds for 192/256-bit keys run in the loop below.
737 shr $2,%eax # 128->4, 192->6, 256->8
738 sub $4,%eax # 128->0, 192->2, 256->4
739 jz aes_loop_pre_done\@
744 AESENC \TMP2, %xmm\index
752 AESENCLAST \TMP2, \XMM1
753 AESENCLAST \TMP2, \XMM2
754 AESENCLAST \TMP2, \XMM3
755 AESENCLAST \TMP2, \XMM4
# XOR keystream with input and write out four blocks of output.
756 movdqu 16*0(%arg4 , %r11 , 1), \TMP1
759 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
762 movdqu 16*1(%arg4 , %r11 , 1), \TMP1
765 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
768 movdqu 16*2(%arg4 , %r11 , 1), \TMP1
771 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
774 movdqu 16*3(%arg4 , %r11 , 1), \TMP1
777 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
780 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
781 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
782 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
783 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
787 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
789 # combine GHASHed value with the corresponding ciphertext
790 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
791 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
792 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
794 _initial_blocks_done\@:
799 * encrypt 4 blocks at a time
800 * ghash the 4 previously encrypted ciphertext blocks
801 * arg1, %arg3, %arg4 are used as pointers only, not modified
802 * %r11 is the data offset value
804 .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
805 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
812 movdqa SHUF_MASK(%rip), %xmm15
# GHASH of the previous 4 blocks is interleaved with the AES rounds of the
# next 4 counter blocks to hide PCLMULQDQ/AESENC latencies.
813 # multiply TMP5 * HashKey using karatsuba
816 pshufd $78, \XMM5, \TMP6
818 paddd ONE(%rip), \XMM0 # INCR CNT
819 movdqa HashKey_4(%rsp), \TMP5
820 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
822 paddd ONE(%rip), \XMM0 # INCR CNT
824 paddd ONE(%rip), \XMM0 # INCR CNT
826 paddd ONE(%rip), \XMM0 # INCR CNT
828 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
829 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
830 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
831 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
832 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
838 movdqa HashKey_4_k(%rsp), \TMP5
839 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
840 movaps 0x10(%arg1), \TMP1
841 AESENC \TMP1, \XMM1 # Round 1
845 movaps 0x20(%arg1), \TMP1
846 AESENC \TMP1, \XMM1 # Round 2
# XMM6 * HashKey^3, interleaved with AES rounds 3-5.
851 pshufd $78, \XMM6, \TMP2
853 movdqa HashKey_3(%rsp), \TMP5
854 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
855 movaps 0x30(%arg1), \TMP3
856 AESENC \TMP3, \XMM1 # Round 3
860 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
861 movaps 0x40(%arg1), \TMP3
862 AESENC \TMP3, \XMM1 # Round 4
866 movdqa HashKey_3_k(%rsp), \TMP5
867 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
868 movaps 0x50(%arg1), \TMP3
869 AESENC \TMP3, \XMM1 # Round 5
874 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
# XMM7 * HashKey^2, interleaved with AES rounds 6-8.
878 pshufd $78, \XMM7, \TMP2
880 movdqa HashKey_2(%rsp ), \TMP5
882 # Multiply TMP5 * HashKey using karatsuba
884 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
885 movaps 0x60(%arg1), \TMP3
886 AESENC \TMP3, \XMM1 # Round 6
890 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
891 movaps 0x70(%arg1), \TMP3
892 AESENC \TMP3, \XMM1 # Round 7
896 movdqa HashKey_2_k(%rsp), \TMP5
897 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
898 movaps 0x80(%arg1), \TMP3
899 AESENC \TMP3, \XMM1 # Round 8
904 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
908 # Multiply XMM8 * HashKey
909 # XMM8 and TMP5 hold the values for the two operands
912 pshufd $78, \XMM8, \TMP2
914 movdqa HashKey(%rsp), \TMP5
915 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
916 movaps 0x90(%arg1), \TMP3
917 AESENC \TMP3, \XMM1 # Round 9
921 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
# Extra AES rounds for 192/256-bit keys.
924 shr $2,%eax # 128->4, 192->6, 256->8
925 sub $4,%eax # 128->0, 192->2, 256->4
926 jz aes_loop_par_enc_done
931 AESENC \TMP3, %xmm\index
937 aes_loop_par_enc_done:
939 AESENCLAST \TMP3, \XMM1 # Round 10
940 AESENCLAST \TMP3, \XMM2
941 AESENCLAST \TMP3, \XMM3
942 AESENCLAST \TMP3, \XMM4
943 movdqa HashKey_k(%rsp), \TMP5
944 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
945 movdqu (%arg4,%r11,1), \TMP3
946 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
947 movdqu 16(%arg4,%r11,1), \TMP3
948 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
949 movdqu 32(%arg4,%r11,1), \TMP3
950 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
951 movdqu 48(%arg4,%r11,1), \TMP3
952 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
953 movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer
954 movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer
955 movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer
956 movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer
957 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
958 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
959 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
960 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
968 pslldq $8, \TMP3 # left shift TMP3 2 DWs
969 psrldq $8, \TMP2 # right shift TMP2 2 DWs
971 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
973 # first phase of reduction
978 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
979 pslld $31, \TMP2 # packed left shift << 31
980 pslld $30, \TMP3 # packed left shift << 30
981 pslld $25, \TMP4 # packed left shift << 25
982 pxor \TMP3, \TMP2 # xor the shifted versions
985 psrldq $4, \TMP5 # right shift T5 1 DW
986 pslldq $12, \TMP2 # left shift T2 3 DWs
989 # second phase of reduction
991 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
994 psrld $1, \TMP2 # packed right shift >>1
995 psrld $2, \TMP3 # packed right shift >>2
996 psrld $7, \TMP4 # packed right shift >>7
997 pxor \TMP3,\TMP2 # xor the shifted versions
1001 pxor \TMP1, \XMM5 # reduced result is in XMM5
1007 * decrypt 4 blocks at a time
1008 * ghash the 4 previously decrypted ciphertext blocks
1009 * arg1, %arg3, %arg4 are used as pointers only, not modified
1010 * %r11 is the data offset value
1012 .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
1013 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1020 movdqa SHUF_MASK(%rip), %xmm15
# Mirror of the _ENC variant: GHASH of the previous 4 ciphertext blocks is
# interleaved with the AES-CTR rounds for the next 4 blocks.
1021 # multiply TMP5 * HashKey using karatsuba
1024 pshufd $78, \XMM5, \TMP6
1026 paddd ONE(%rip), \XMM0 # INCR CNT
1027 movdqa HashKey_4(%rsp), \TMP5
1028 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
1030 paddd ONE(%rip), \XMM0 # INCR CNT
1032 paddd ONE(%rip), \XMM0 # INCR CNT
1034 paddd ONE(%rip), \XMM0 # INCR CNT
1036 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1037 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1038 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1039 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1040 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1046 movdqa HashKey_4_k(%rsp), \TMP5
1047 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1048 movaps 0x10(%arg1), \TMP1
1049 AESENC \TMP1, \XMM1 # Round 1
1053 movaps 0x20(%arg1), \TMP1
1054 AESENC \TMP1, \XMM1 # Round 2
# XMM6 * HashKey^3, interleaved with AES rounds 3-5.
1059 pshufd $78, \XMM6, \TMP2
1061 movdqa HashKey_3(%rsp), \TMP5
1062 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1063 movaps 0x30(%arg1), \TMP3
1064 AESENC \TMP3, \XMM1 # Round 3
1068 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1069 movaps 0x40(%arg1), \TMP3
1070 AESENC \TMP3, \XMM1 # Round 4
1074 movdqa HashKey_3_k(%rsp), \TMP5
1075 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1076 movaps 0x50(%arg1), \TMP3
1077 AESENC \TMP3, \XMM1 # Round 5
1082 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
# XMM7 * HashKey^2, interleaved with AES rounds 6-8.
1086 pshufd $78, \XMM7, \TMP2
1088 movdqa HashKey_2(%rsp ), \TMP5
1090 # Multiply TMP5 * HashKey using karatsuba
1092 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1093 movaps 0x60(%arg1), \TMP3
1094 AESENC \TMP3, \XMM1 # Round 6
1098 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1099 movaps 0x70(%arg1), \TMP3
1100 AESENC \TMP3, \XMM1 # Round 7
1104 movdqa HashKey_2_k(%rsp), \TMP5
1105 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1106 movaps 0x80(%arg1), \TMP3
1107 AESENC \TMP3, \XMM1 # Round 8
1112 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1116 # Multiply XMM8 * HashKey
1117 # XMM8 and TMP5 hold the values for the two operands
1120 pshufd $78, \XMM8, \TMP2
1122 movdqa HashKey(%rsp), \TMP5
1123 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1124 movaps 0x90(%arg1), \TMP3
1125 AESENC \TMP3, \XMM1 # Round 9
1129 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
# %r10 -> round key 10; extra rounds for 192/256-bit keys run in the loop.
1130 lea 0xa0(%arg1),%r10
1132 shr $2,%eax # 128->4, 192->6, 256->8
1133 sub $4,%eax # 128->0, 192->2, 256->4
1134 jz aes_loop_par_dec_done
1139 AESENC \TMP3, %xmm\index
1143 jnz aes_loop_par_dec
1145 aes_loop_par_dec_done:
1146 MOVADQ (%r10), \TMP3
1147 AESENCLAST \TMP3, \XMM1 # last round
1148 AESENCLAST \TMP3, \XMM2
1149 AESENCLAST \TMP3, \XMM3
1150 AESENCLAST \TMP3, \XMM4
1151 movdqa HashKey_k(%rsp), \TMP5
1152 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1153 movdqu (%arg4,%r11,1), \TMP3
1154 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1155 movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer
1157 movdqu 16(%arg4,%r11,1), \TMP3
1158 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1159 movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer
1161 movdqu 32(%arg4,%r11,1), \TMP3
1162 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1163 movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer
1165 movdqu 48(%arg4,%r11,1), \TMP3
1166 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1167 movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer
1169 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1170 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1171 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1172 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1180 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1181 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1183 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1185 # first phase of reduction
1190 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1191 pslld $31, \TMP2 # packed left shift << 31
1192 pslld $30, \TMP3 # packed left shift << 30
1193 pslld $25, \TMP4 # packed left shift << 25
1194 pxor \TMP3, \TMP2 # xor the shifted versions
1197 psrldq $4, \TMP5 # right shift T5 1 DW
1198 pslldq $12, \TMP2 # left shift T2 3 DWs
1201 # second phase of reduction
1203 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1206 psrld $1, \TMP2 # packed right shift >>1
1207 psrld $2, \TMP3 # packed right shift >>2
1208 psrld $7, \TMP4 # packed right shift >>7
1209 pxor \TMP3,\TMP2 # xor the shifted versions
1213 pxor \TMP1, \XMM5 # reduced result is in XMM5
1218 /* GHASH the last 4 ciphertext blocks. */
1219 .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1220 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1222 # Multiply XMM1 * HashKey^4 (using Karatsuba)
1225 pshufd $78, \XMM1, \TMP2
1227 movdqa HashKey_4(%rsp), \TMP5
1228 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1229 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1230 movdqa HashKey_4_k(%rsp), \TMP4
1231 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1232 movdqa \XMM1, \XMMDst
1233 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1235 # Multiply XMM2 * HashKey^3 (using Karatsuba)
1238 pshufd $78, \XMM2, \TMP2
1240 movdqa HashKey_3(%rsp), \TMP5
1241 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1242 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1243 movdqa HashKey_3_k(%rsp), \TMP4
1244 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1248 # results accumulated in TMP6, XMMDst, XMM1
1250 # Multiply XMM3 * HashKey^2 (using Karatsuba)
1253 pshufd $78, \XMM3, \TMP2
1255 movdqa HashKey_2(%rsp), \TMP5
1256 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1257 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1258 movdqa HashKey_2_k(%rsp), \TMP4
1259 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1262 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1264 # Multiply XMM4 * HashKey (using Karatsuba)
1266 pshufd $78, \XMM4, \TMP2
1268 movdqa HashKey(%rsp), \TMP5
1269 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1270 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1271 movdqa HashKey_k(%rsp), \TMP4
1272 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1278 # middle section of the temp results combined as in karatsuba algorithm
1280 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1281 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1284 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1285 # first phase of the reduction
1286 movdqa \XMMDst, \TMP2
1287 movdqa \XMMDst, \TMP3
1288 movdqa \XMMDst, \TMP4
1289 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1290 pslld $31, \TMP2 # packed left shifting << 31
1291 pslld $30, \TMP3 # packed left shifting << 30
1292 pslld $25, \TMP4 # packed left shifting << 25
1293 pxor \TMP3, \TMP2 # xor the shifted versions
1296 psrldq $4, \TMP7 # right shift TMP7 1 DW
1297 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1300 # second phase of the reduction
1301 movdqa \XMMDst, \TMP2
1302 # make 3 copies of XMMDst for doing 3 shift operations
1303 movdqa \XMMDst, \TMP3
1304 movdqa \XMMDst, \TMP4
1305 psrld $1, \TMP2 # packed right shift >> 1
1306 psrld $2, \TMP3 # packed right shift >> 2
1307 psrld $7, \TMP4 # packed right shift >> 7
1308 pxor \TMP3, \TMP2 # xor the shifted versions
1312 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1316 /* Encryption of a single block
1320 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
# Round count is derived from the keysize field in the AES context.
1324 shr $2,%eax # 128->4, 192->6, 256->8
1325 add $5,%eax # 128->9, 192->11, 256->13
1326 lea 16(%arg1), %r10 # get first expanded key address
1336 AESENCLAST \TMP1,\XMM0
1338 /*****************************************************************************
1339 * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1340 * struct gcm_context_data *data
1342 * u8 *out, // Plaintext output. Encrypt in-place is allowed.
1343 * const u8 *in, // Ciphertext input
1344 * u64 plaintext_len, // Length of data in bytes for decryption.
1345 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1346 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1347 * // concatenated with 0x00000001. 16-byte aligned pointer.
1348 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1349 * const u8 *aad, // Additional Authentication Data (AAD)
1350 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1351 * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1352 * // given authentication tag and only return the plaintext if they match.
1353 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1354 * // (most likely), 12 or 8.
1359 * keys are pre-expanded and aligned to 16 bytes. we are using the first
1360 * set of 11 keys in the data structure void *aes_ctx
1364 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1365 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1366 * | Salt (From the SA) |
1367 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1368 * | Initialization Vector |
1369 * | (This is the sequence number from IPSec header) |
1370 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1372 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1377 * AAD padded to 128 bits with 0
1378 * for example, assume AAD is a u32 vector
1380 * if AAD is 8 bytes:
1381 * AAD[3] = {A0, A1};
1382 * padded AAD in xmm register = {A1 A0 0 0}
1385 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1386 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1388 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1389 * | 32-bit Sequence Number (A0) |
1390 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1392 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1394 * AAD Format with 32-bit Sequence Number
1396 * if AAD is 12 bytes:
1397 * AAD[3] = {A0, A1, A2};
1398 * padded AAD in xmm register = {A2 A1 A0 0}
1401 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1402 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1403 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1404 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1406 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1407 * | 64-bit Extended Sequence Number {A1,A0} |
1409 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1411 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1413 * AAD Format with 64-bit Extended Sequence Number
1415 * poly = x^128 + x^127 + x^126 + x^121 + 1
1417 *****************************************************************************/
# aesni_gcm_dec: RFC4106 AES-GCM decrypt-and-verify entry point; the full
# argument list and AAD layout are documented in the block comment above.
# NOTE(review): the procedure body (original lines 1419-1425) is elided in
# this excerpt.
1418 ENTRY(aesni_gcm_dec)
1426 ENDPROC(aesni_gcm_dec)
1429 /*****************************************************************************
1430 * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1431 * struct gcm_context_data *data
1433 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1434 * const u8 *in, // Plaintext input
1435 * u64 plaintext_len, // Length of data in bytes for encryption.
1436 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1437 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1438 * // concatenated with 0x00000001. 16-byte aligned pointer.
1439 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1440 * const u8 *aad, // Additional Authentication Data (AAD)
1441 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1442 * u8 *auth_tag, // Authenticated Tag output.
1443 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1449 * keys are pre-expanded and aligned to 16 bytes. we are using the
1450 * first set of 11 keys in the data structure void *aes_ctx
1455 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1456 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1457 * | Salt (From the SA) |
1458 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1459 * | Initialization Vector |
1460 * | (This is the sequence number from IPSec header) |
1461 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1463 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1468 * AAD padded to 128 bits with 0
1469 * for example, assume AAD is a u32 vector
1471 * if AAD is 8 bytes:
1472 * AAD[2] = {A0, A1};
1473 * padded AAD in xmm register = {A1 A0 0 0}
1476 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1477 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1479 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1480 * | 32-bit Sequence Number (A0) |
1481 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1483 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1485 * AAD Format with 32-bit Sequence Number
1487 * if AAD is 12 bytes:
1488 * AAD[3] = {A0, A1, A2};
1489 * padded AAD in xmm register = {A2 A1 A0 0}
1492 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1493 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1495 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1496 * | 64-bit Extended Sequence Number {A1,A0} |
1498 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1500 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1502 * AAD Format with 64-bit Extended Sequence Number
1504 * poly = x^128 + x^127 + x^126 + x^121 + 1
1505 ***************************************************************************/
# aesni_gcm_enc: RFC4106 AES-GCM encrypt-and-tag entry point; arguments and
# AAD layout are documented in the block comment above.
# NOTE(review): the procedure body (original lines 1507-1513) is elided in
# this excerpt.
1506 ENTRY(aesni_gcm_enc)
1514 ENDPROC(aesni_gcm_enc)
# _key_expansion_256a: one AES-256 key-schedule step (even round keys).
# In:  %xmm0 = previous round key, %xmm1 = AESKEYGENASSIST result,
#      %xmm4 = 0 (caller guarantees this, see aesni_set_key),
#      TKEYP = where to store the new round key.
# pshufd $0xff broadcasts the rotated/substituted dword of %xmm1 to all lanes.
# NOTE(review): some interior instructions (original lines 1524-1530) are
# elided in this excerpt; the final ENDPROC pair shows this body is shared
# with _key_expansion_128 (fall-through label elided).
1521 _key_expansion_256a:
1522 pshufd $0b11111111, %xmm1, %xmm1
1523 shufps $0b00010000, %xmm0, %xmm4
1525 shufps $0b10001100, %xmm0, %xmm4
1528 movaps %xmm0, (TKEYP)
1531 ENDPROC(_key_expansion_128)
1532 ENDPROC(_key_expansion_256a)
# _key_expansion_192a: one AES-192 key-schedule step that produces round-key
# material spanning two 16-byte slots (stored at TKEYP and TKEYP+0x10).
# In:  %xmm0/%xmm2 = previous key state, %xmm1 = AESKEYGENASSIST result,
#      %xmm4 = 0 (caller guarantees this).
# pshufd $0b01010101 broadcasts dword 1 of the keygen-assist result.
# NOTE(review): interior instructions (e.g. original lines 1540-1545,
# 1547-1550) are elided in this excerpt.
1535 _key_expansion_192a:
1536 pshufd $0b01010101, %xmm1, %xmm1
1537 shufps $0b00010000, %xmm0, %xmm4
1539 shufps $0b10001100, %xmm0, %xmm4
1546 pshufd $0b11111111, %xmm0, %xmm3
1551 shufps $0b01000100, %xmm0, %xmm6
1552 movaps %xmm6, (TKEYP)
1553 shufps $0b01001110, %xmm2, %xmm1
1554 movaps %xmm1, 0x10(TKEYP)
1557 ENDPROC(_key_expansion_192a)
# _key_expansion_192b: companion AES-192 key-schedule step; unlike
# _key_expansion_192a it stores only a single 16-byte round key at TKEYP.
# In:  %xmm0 = previous key state, %xmm1 = AESKEYGENASSIST result,
#      %xmm4 = 0 (caller guarantees this).
# NOTE(review): interior instructions (original lines 1563-1576, partially)
# are elided in this excerpt.
1560 _key_expansion_192b:
1561 pshufd $0b01010101, %xmm1, %xmm1
1562 shufps $0b00010000, %xmm0, %xmm4
1564 shufps $0b10001100, %xmm0, %xmm4
1570 pshufd $0b11111111, %xmm0, %xmm3
1574 movaps %xmm0, (TKEYP)
1577 ENDPROC(_key_expansion_192b)
# _key_expansion_256b: one AES-256 key-schedule step (odd round keys); works
# on %xmm2 where _key_expansion_256a works on %xmm0.
# In:  %xmm2 = previous round key, %xmm1 = AESKEYGENASSIST result,
#      %xmm4 = 0 (caller guarantees this).
# pshufd $0b10101010 broadcasts dword 2 of the keygen-assist result.
# NOTE(review): interior instructions (original lines 1583-1589, partially)
# are elided in this excerpt.
1580 _key_expansion_256b:
1581 pshufd $0b10101010, %xmm1, %xmm1
1582 shufps $0b00010000, %xmm2, %xmm4
1584 shufps $0b10001100, %xmm2, %xmm4
1587 movaps %xmm2, (TKEYP)
1590 ENDPROC(_key_expansion_256b)
1593 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1594 * unsigned int key_len)
# aesni_set_key: expand a 16/24/32-byte user key into the per-context
# encryption and decryption round-key schedules.
#   args (ia32 stack, relative to FRAME_OFFSET): ctx, in_key, key_len
#   ctx+0   : encryption round keys (first slot = raw user key)
#   ctx+480 : stored key_len
# %xmm4 is zeroed once here and relied upon by all _key_expansion_* helpers.
# NOTE(review): the key-length dispatch branches between the 256/192/128
# sections are elided in this excerpt (embedded numbering jumps, e.g.
# 1608->1612, 1640->1643, 1659->1662).
1596 ENTRY(aesni_set_key)
1600 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1601 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1602 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
1604 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1605 movaps %xmm0, (KEYP)
1606 lea 0x10(KEYP), TKEYP # key addr
1607 movl %edx, 480(KEYP)
1608 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
# --- AES-256 schedule: rounds alternate _key_expansion_256a/_256b ---
1612 movups 0x10(UKEYP), %xmm2 # other user key
1613 movaps %xmm2, (TKEYP)
1615 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1616 call _key_expansion_256a
1617 AESKEYGENASSIST 0x1 %xmm0 %xmm1
1618 call _key_expansion_256b
1619 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1620 call _key_expansion_256a
1621 AESKEYGENASSIST 0x2 %xmm0 %xmm1
1622 call _key_expansion_256b
1623 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1624 call _key_expansion_256a
1625 AESKEYGENASSIST 0x4 %xmm0 %xmm1
1626 call _key_expansion_256b
1627 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1628 call _key_expansion_256a
1629 AESKEYGENASSIST 0x8 %xmm0 %xmm1
1630 call _key_expansion_256b
1631 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1632 call _key_expansion_256a
1633 AESKEYGENASSIST 0x10 %xmm0 %xmm1
1634 call _key_expansion_256b
1635 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1636 call _key_expansion_256a
1637 AESKEYGENASSIST 0x20 %xmm0 %xmm1
1638 call _key_expansion_256b
1639 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1640 call _key_expansion_256a
# --- AES-192 schedule: rounds alternate _key_expansion_192a/_192b ---
1643 movq 0x10(UKEYP), %xmm2 # other user key
1644 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1645 call _key_expansion_192a
1646 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1647 call _key_expansion_192b
1648 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1649 call _key_expansion_192a
1650 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1651 call _key_expansion_192b
1652 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1653 call _key_expansion_192a
1654 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1655 call _key_expansion_192b
1656 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1657 call _key_expansion_192a
1658 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
1659 call _key_expansion_192b
# --- AES-128 schedule: round constants 0x01..0x36 per FIPS-197 ---
1662 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
1663 call _key_expansion_128
1664 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
1665 call _key_expansion_128
1666 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
1667 call _key_expansion_128
1668 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
1669 call _key_expansion_128
1670 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
1671 call _key_expansion_128
1672 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
1673 call _key_expansion_128
1674 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
1675 call _key_expansion_128
1676 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
1677 call _key_expansion_128
1678 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
1679 call _key_expansion_128
1680 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
1681 call _key_expansion_128
# --- Build the decryption schedule: swap first/last round keys, then walk
# the schedule from both ends (UKEYP runs backward from 240-16).
# NOTE(review): the AESIMC (InvMixColumns) steps and loop labels are elided
# in this excerpt (original lines 1688, 1690-1691, 1693, 1695-1704).
1684 movaps (KEYP), %xmm0
1685 movaps (TKEYP), %xmm1
1686 movaps %xmm0, 240(TKEYP)
1687 movaps %xmm1, 240(KEYP)
1689 lea 240-16(TKEYP), UKEYP
1692 movaps (KEYP), %xmm0
1694 movaps %xmm1, (UKEYP)
1705 ENDPROC(aesni_set_key)
1708 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
# aesni_enc: encrypt one 16-byte block; loads args from the ia32 stack,
# reads the key length from ctx+480, then (via the elided call to
# _aesni_enc1) transforms STATE and stores it to dst.
# NOTE(review): ENTRY/ENDPROC and the call into _aesni_enc1 are elided in
# this excerpt (embedded numbering jumps 1708->1715, 1720->1722).
1715 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1716 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1717 movl (FRAME_OFFSET+20)(%esp), INP # src
1719 movl 480(KEYP), KLEN # key length
1720 movups (INP), STATE # input
1722 movups STATE, (OUTP) # output
1732 * _aesni_enc1: internal ABI
1734 * KEYP: key struct pointer
1736 * STATE: initial state (input)
1738 * STATE: final state (output)
# Single-block AES encryption core: XOR round-0 key, then one AESENC per
# round key walking TKEYP, finishing with AESENCLAST.
# NOTE(review): the key-length dispatch (jumping past the extra 256/192-bit
# rounds) and the AESENC lines between the movaps loads are elided in this
# excerpt; the label line for _aesni_enc1 itself is also elided.
1745 movaps (KEYP), KEY # key
1747 pxor KEY, STATE # round 0
1751 lea 0x20(TKEYP), TKEYP
1754 movaps -0x60(TKEYP), KEY
1756 movaps -0x50(TKEYP), KEY
1760 movaps -0x40(TKEYP), KEY
1762 movaps -0x30(TKEYP), KEY
1766 movaps -0x20(TKEYP), KEY
1768 movaps -0x10(TKEYP), KEY
1772 movaps 0x10(TKEYP), KEY
1774 movaps 0x20(TKEYP), KEY
1776 movaps 0x30(TKEYP), KEY
1778 movaps 0x40(TKEYP), KEY
1780 movaps 0x50(TKEYP), KEY
1782 movaps 0x60(TKEYP), KEY
1784 movaps 0x70(TKEYP), KEY
1785 AESENCLAST KEY STATE
1787 ENDPROC(_aesni_enc1)
1790 * _aesni_enc4: internal ABI
1792 * KEYP: key struct pointer
1794 * STATE1: initial state (input)
1799 * STATE1: final state (output)
# Four-block parallel AES encryption core: same round structure as
# _aesni_enc1 but interleaves STATE1..STATE4 to hide AESENC latency.
# NOTE(review): the per-round AESENC instructions for STATE2..STATE4 and the
# key-length dispatch are elided in this excerpt; only the shared KEY loads
# and the final AESENCLAST group are visible.
1809 movaps (KEYP), KEY # key
1811 pxor KEY, STATE1 # round 0
1818 lea 0x20(TKEYP), TKEYP
1821 movaps -0x60(TKEYP), KEY
1826 movaps -0x50(TKEYP), KEY
1833 movaps -0x40(TKEYP), KEY
1838 movaps -0x30(TKEYP), KEY
1845 movaps -0x20(TKEYP), KEY
1850 movaps -0x10(TKEYP), KEY
1860 movaps 0x10(TKEYP), KEY
1865 movaps 0x20(TKEYP), KEY
1870 movaps 0x30(TKEYP), KEY
1875 movaps 0x40(TKEYP), KEY
1880 movaps 0x50(TKEYP), KEY
1885 movaps 0x60(TKEYP), KEY
1890 movaps 0x70(TKEYP), KEY
1891 AESENCLAST KEY STATE1 # last round
1892 AESENCLAST KEY STATE2
1893 AESENCLAST KEY STATE3
1894 AESENCLAST KEY STATE4
1896 ENDPROC(_aesni_enc4)
1899 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
# aesni_dec: decrypt one 16-byte block; mirrors aesni_enc but uses the
# inverse-cipher core (the elided call to _aesni_dec1).
# NOTE(review): ENTRY/ENDPROC and the call into _aesni_dec1 are elided in
# this excerpt (embedded numbering jumps 1899->1906, 1912->1914).
1906 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1907 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1908 movl (FRAME_OFFSET+20)(%esp), INP # src
1910 mov 480(KEYP), KLEN # key length
1912 movups (INP), STATE # input
1914 movups STATE, (OUTP) #output
1924 * _aesni_dec1: internal ABI
1926 * KEYP: key struct pointer
1928 * STATE: initial state (input)
1930 * STATE: final state (output)
# Single-block AES decryption core: XOR round-0 key, then one AESDEC per
# round key walking TKEYP, finishing with AESDECLAST.  Structure parallels
# _aesni_enc1 exactly.
# NOTE(review): the key-length dispatch and the AESDEC lines between the
# movaps loads are elided in this excerpt; the _aesni_dec1 label line is
# also elided.
1937 movaps (KEYP), KEY # key
1939 pxor KEY, STATE # round 0
1943 lea 0x20(TKEYP), TKEYP
1946 movaps -0x60(TKEYP), KEY
1948 movaps -0x50(TKEYP), KEY
1952 movaps -0x40(TKEYP), KEY
1954 movaps -0x30(TKEYP), KEY
1958 movaps -0x20(TKEYP), KEY
1960 movaps -0x10(TKEYP), KEY
1964 movaps 0x10(TKEYP), KEY
1966 movaps 0x20(TKEYP), KEY
1968 movaps 0x30(TKEYP), KEY
1970 movaps 0x40(TKEYP), KEY
1972 movaps 0x50(TKEYP), KEY
1974 movaps 0x60(TKEYP), KEY
1976 movaps 0x70(TKEYP), KEY
1977 AESDECLAST KEY STATE
1979 ENDPROC(_aesni_dec1)
1982 * _aesni_dec4: internal ABI
1984 * KEYP: key struct pointer
1986 * STATE1: initial state (input)
1991 * STATE1: final state (output)
# Four-block parallel AES decryption core: interleaves STATE1..STATE4
# through the round keys to hide AESDEC latency; mirrors _aesni_enc4.
# NOTE(review): the per-round AESDEC instructions for STATE2..STATE4 and the
# key-length dispatch are elided in this excerpt.
2001 movaps (KEYP), KEY # key
2003 pxor KEY, STATE1 # round 0
2010 lea 0x20(TKEYP), TKEYP
2013 movaps -0x60(TKEYP), KEY
2018 movaps -0x50(TKEYP), KEY
2025 movaps -0x40(TKEYP), KEY
2030 movaps -0x30(TKEYP), KEY
2037 movaps -0x20(TKEYP), KEY
2042 movaps -0x10(TKEYP), KEY
2052 movaps 0x10(TKEYP), KEY
2057 movaps 0x20(TKEYP), KEY
2062 movaps 0x30(TKEYP), KEY
2067 movaps 0x40(TKEYP), KEY
2072 movaps 0x50(TKEYP), KEY
2077 movaps 0x60(TKEYP), KEY
2082 movaps 0x70(TKEYP), KEY
2083 AESDECLAST KEY STATE1 # last round
2084 AESDECLAST KEY STATE2
2085 AESDECLAST KEY STATE3
2086 AESDECLAST KEY STATE4
2088 ENDPROC(_aesni_dec4)
2091 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
# aesni_ecb_enc: ECB-mode bulk encryption.  Fast path handles 4 blocks per
# iteration (STATE1..STATE4 via the elided _aesni_enc4 call); tail path
# handles one block at a time.  Unaligned loads/stores (movups) are used so
# src/dst need no alignment.
# NOTE(review): the loop labels, length arithmetic and calls into
# _aesni_enc4/_aesni_enc1 are elided in this excerpt.
2094 ENTRY(aesni_ecb_enc)
2100 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2101 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2102 movl (FRAME_OFFSET+24)(%esp), INP # src
2103 movl (FRAME_OFFSET+28)(%esp), LEN # len
2105 test LEN, LEN # check length
# four-blocks-at-a-time path
2114 movups (INP), STATE1
2115 movups 0x10(INP), STATE2
2116 movups 0x20(INP), STATE3
2117 movups 0x30(INP), STATE4
2119 movups STATE1, (OUTP)
2120 movups STATE2, 0x10(OUTP)
2121 movups STATE3, 0x20(OUTP)
2122 movups STATE4, 0x30(OUTP)
# single-block tail path
2132 movups (INP), STATE1
2134 movups STATE1, (OUTP)
2148 ENDPROC(aesni_ecb_enc)
2151 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
# aesni_ecb_dec: ECB-mode bulk decryption; identical structure to
# aesni_ecb_enc but routed through the inverse-cipher cores (elided calls
# to _aesni_dec4/_aesni_dec1).
# NOTE(review): the loop labels, length arithmetic and calls are elided in
# this excerpt.
2154 ENTRY(aesni_ecb_dec)
2160 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2161 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2162 movl (FRAME_OFFSET+24)(%esp), INP # src
2163 movl (FRAME_OFFSET+28)(%esp), LEN # len
# four-blocks-at-a-time path
2175 movups (INP), STATE1
2176 movups 0x10(INP), STATE2
2177 movups 0x20(INP), STATE3
2178 movups 0x30(INP), STATE4
2180 movups STATE1, (OUTP)
2181 movups STATE2, 0x10(OUTP)
2182 movups STATE3, 0x20(OUTP)
2183 movups STATE4, 0x30(OUTP)
# single-block tail path
2193 movups (INP), STATE1
2195 movups STATE1, (OUTP)
2209 ENDPROC(aesni_ecb_dec)
2212 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2213 * size_t len, u8 *iv)
# aesni_cbc_enc: CBC-mode encryption.  Inherently serial (each block chains
# into the next), so it processes one block per iteration: STATE starts as
# the IV, is XORed with the input block and encrypted (elided _aesni_enc1
# call), then written out and reused as the next chaining value.
# NOTE(review): the loop label, XOR/encrypt step, pointer advance and final
# IV write-back are elided in this excerpt.
2215 ENTRY(aesni_cbc_enc)
2222 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2223 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2224 movl (FRAME_OFFSET+28)(%esp), INP # src
2225 movl (FRAME_OFFSET+32)(%esp), LEN # len
2226 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2231 movups (IVP), STATE # load iv as initial state
2234 movups (INP), IN # load input
2237 movups STATE, (OUTP) # store output
2253 ENDPROC(aesni_cbc_enc)
2256 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2257 * size_t len, u8 *iv)
# aesni_cbc_dec: CBC-mode decryption.  Unlike encryption this parallelizes:
# the fast path decrypts 4 ciphertext blocks at once (elided _aesni_dec4
# call) and XORs each result with the previous ciphertext block; a
# single-block tail path handles the remainder.
# NOTE(review): the length checks, XOR chaining steps, loop labels and IV
# write-back are elided in this excerpt; only the argument loads, the
# ciphertext loads and the result stores are visible.
2259 ENTRY(aesni_cbc_dec)
2266 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2267 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2268 movl (FRAME_OFFSET+28)(%esp), INP # src
2269 movl (FRAME_OFFSET+32)(%esp), LEN # len
2270 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2273 jb .Lcbc_dec_just_ret
2283 movups 0x10(INP), IN2
2286 movups 0x20(INP), IN3
2288 movups 0x30(INP), IN4
2291 movups 0x20(INP), IN1
2293 movups 0x30(INP), IN2
2308 movups 0x10(INP), IN2
2311 movups STATE1, (OUTP)
2312 movups STATE2, 0x10(OUTP)
2313 movups STATE3, 0x20(OUTP)
2314 movups STATE4, 0x30(OUTP)
2328 movups STATE, (OUTP)
2346 ENDPROC(aesni_cbc_dec)
# PSHUFB control mask that reverses all 16 bytes of an XMM register
# (big-endian <-> little-endian), consumed via .Lbswap_mask in
# _aesni_inc_init.  NOTE(review): the .Lbswap_mask label line and the .align
# directive presumably precede the .byte line but are elided in this
# excerpt — confirm against the full file.
2349 .pushsection .rodata
2352 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2356 * _aesni_inc_init: internal ABI
2357 * setup registers used by _aesni_inc
2361 * CTR: == IV, in little endian
2362 * TCTR_LOW: == lower qword of CTR
2363 * INC: == 1, in little endian
2364 * BSWAP_MASK == endian swapping mask
# Loads the byte-swap mask, converts the big-endian IV into the
# little-endian CTR, and caches constants (INC = 1, TCTR_LOW = low qword of
# CTR) so that _aesni_inc can increment cheaply per block.
# NOTE(review): the label line and the instruction that sets up the value
# moved into INC are elided in this excerpt.
2368 movaps .Lbswap_mask, BSWAP_MASK
2370 PSHUFB_XMM BSWAP_MASK CTR
2372 MOVQ_R64_XMM TCTR_LOW INC
2373 MOVQ_R64_XMM CTR TCTR_LOW
2375 ENDPROC(_aesni_inc_init)
2378 * _aesni_inc: internal ABI
2379 * Increase IV by 1, IV is in big endian
2382 * CTR: == IV, in little endian
2383 * TCTR_LOW: == lower qword of CTR
2384 * INC: == 1, in little endian
2385 * BSWAP_MASK == endian swapping mask
2389 * CTR: == output IV, in little endian
2390 * TCTR_LOW: == lower qword of CTR
# The visible instruction byte-swaps the incremented little-endian counter
# back to big-endian IV form for use as the next AES input block.
# NOTE(review): the label, the add/carry-handling instructions and the
# ENDPROC are elided in this excerpt (embedded numbering jumps 2390->2402).
2402 PSHUFB_XMM BSWAP_MASK IV
2407 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2408 * size_t len, u8 *iv)
# aesni_ctr_enc: CTR-mode encryption.  Initializes counter state via
# _aesni_inc_init, then in the fast path generates 4 keystream blocks at a
# time (elided _aesni_inc/_aesni_enc4 calls) and XORs them with the input
# (IN1..IN4); a single-block tail path handles the remainder.  CTR encrypt
# and decrypt are the same operation.
# NOTE(review): argument loads, length checks, the XOR steps and loop labels
# are elided in this excerpt.
2410 ENTRY(aesni_ctr_enc)
2413 jb .Lctr_enc_just_ret
2416 call _aesni_inc_init
2426 movups 0x10(INP), IN2
2429 movups 0x20(INP), IN3
2432 movups 0x30(INP), IN4
2435 movups STATE1, (OUTP)
2437 movups STATE2, 0x10(OUTP)
2439 movups STATE3, 0x20(OUTP)
2441 movups STATE4, 0x30(OUTP)
2456 movups STATE, (OUTP)
2467 ENDPROC(aesni_ctr_enc)
2470 * _aesni_gf128mul_x_ble: internal ABI
2471 * Multiply in GF(2^128) for XTS IVs
2474 * GF128MUL_MASK == mask with 0x87 and 0x01
2478 * CTR: == temporary value
# Multiplies the 128-bit XTS tweak (IV) by x in GF(2^128) using the
# little-endian byte ordering ("ble"): pshufd $0x13 replicates the
# sign/carry dwords, then the masked value (GF128MUL_MASK holds the 0x87
# reduction polynomial and 0x01, see .Lgf128mul_x_ble_mask above) is folded
# into the shifted IV.  Comments are kept outside the #define because the
# macro body is a backslash-continued cpp macro.
# NOTE(review): the shift and final xor lines of the macro (original lines
# 2482-2483, 2485-2486) are elided in this excerpt.
2480 #define _aesni_gf128mul_x_ble() \
2481 pshufd $0x13, IV, CTR; \
2484 pand GF128MUL_MASK, CTR; \
2488 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
# aesni_xts_crypt8: XTS-mode processing of 8 consecutive blocks.  For each
# block: compute the next tweak with _aesni_gf128mul_x_ble(), stash the
# tweak in the output slot, XOR-load the input (elided pxor lines), run 4
# blocks through _aesni_enc4/_aesni_dec4 (selected via %r11/%rax — the
# leaq/64-bit registers show this path is x86_64-only), then XOR the stored
# tweak back over the cipher result in place.
# NOTE(review): the encrypt/decrypt selection test, the pxor tweak-masking
# instructions and the indirect calls are elided in this excerpt; the
# OUTP slots are used as temporary tweak storage between the two passes.
2491 ENTRY(aesni_xts_crypt8)
2496 leaq _aesni_enc4, %r11
2497 leaq _aesni_dec4, %rax
2501 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
# first 4 blocks: generate tweaks 0-3, park them in dst, crypt, unmask
2508 movdqu 0x00(INP), INC
2510 movdqu IV, 0x00(OUTP)
2512 _aesni_gf128mul_x_ble()
2514 movdqu 0x10(INP), INC
2516 movdqu IV, 0x10(OUTP)
2518 _aesni_gf128mul_x_ble()
2520 movdqu 0x20(INP), INC
2522 movdqu IV, 0x20(OUTP)
2524 _aesni_gf128mul_x_ble()
2526 movdqu 0x30(INP), INC
2528 movdqu IV, 0x30(OUTP)
2532 movdqu 0x00(OUTP), INC
2534 movdqu STATE1, 0x00(OUTP)
# second 4 blocks: tweaks 4-7 interleaved with first-pass unmasking
2536 _aesni_gf128mul_x_ble()
2538 movdqu 0x40(INP), INC
2540 movdqu IV, 0x40(OUTP)
2542 movdqu 0x10(OUTP), INC
2544 movdqu STATE2, 0x10(OUTP)
2546 _aesni_gf128mul_x_ble()
2548 movdqu 0x50(INP), INC
2550 movdqu IV, 0x50(OUTP)
2552 movdqu 0x20(OUTP), INC
2554 movdqu STATE3, 0x20(OUTP)
2556 _aesni_gf128mul_x_ble()
2558 movdqu 0x60(INP), INC
2560 movdqu IV, 0x60(OUTP)
2562 movdqu 0x30(OUTP), INC
2564 movdqu STATE4, 0x30(OUTP)
2566 _aesni_gf128mul_x_ble()
2568 movdqu 0x70(INP), INC
2570 movdqu IV, 0x70(OUTP)
2572 _aesni_gf128mul_x_ble()
# unmask and store results of the second pass
2577 movdqu 0x40(OUTP), INC
2579 movdqu STATE1, 0x40(OUTP)
2581 movdqu 0x50(OUTP), INC
2583 movdqu STATE2, 0x50(OUTP)
2585 movdqu 0x60(OUTP), INC
2587 movdqu STATE3, 0x60(OUTP)
2589 movdqu 0x70(OUTP), INC
2591 movdqu STATE4, 0x70(OUTP)
2595 ENDPROC(aesni_xts_crypt8)