1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * Implement AES algorithm in Intel AES-NI instructions.
5 * The white paper of AES-NI instructions can be downloaded from:
6 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
8 * Copyright (C) 2008, Intel Corp.
9 * Author: Huang Ying <ying.huang@intel.com>
10 * Vinodh Gopal <vinodh.gopal@intel.com>
13 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
14 * interface for 64-bit kernels.
15 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
16 * Aidan O'Mahony (aidan.o.mahony@intel.com)
17 * Adrian Hoban <adrian.hoban@intel.com>
18 * James Guilford (james.guilford@intel.com)
19 * Gabriele Paoloni <gabriele.paoloni@intel.com>
20 * Tadeusz Struk (tadeusz.struk@intel.com)
21 * Wajdi Feghali (wajdi.k.feghali@intel.com)
22 * Copyright (c) 2010, Intel Corporation.
24 * Ported x86_64 version to x86:
25 * Author: Mathias Krause <minipli@googlemail.com>
28 #include <linux/linkage.h>
29 #include <asm/frame.h>
30 #include <asm/nospec-branch.h>
33 * The following macros are used to move an (un)aligned 16 byte value to/from
34 * an XMM register. This can done for either FP or integer values, for FP use
35 * movaps (move aligned packed single) or integer use movdqa (move double quad
36 * aligned). It doesn't make a performance difference which instruction is used
37 * since Nehalem (original Core i7) was released. However, the movaps is a byte
38 * shorter, so that is the one we'll use for now. (same for unaligned).
45 # constants in mergeable sections, linker can reorder and merge
46 .section .rodata.cst16.POLY, "aM", @progbits, 16
# GHASH reduction polynomial x^128+x^127+x^126+x^121+1, bit-reflected
# (see the GHASH_MUL header comment: "mod (128,127,126,121,0)")
48 POLY: .octa 0xC2000000000000000000000000000001
49 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
51 TWOONE: .octa 0x00000001000000000000000000000001
53 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
# pshufb mask that reverses byte order within a 16-byte block
55 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
56 .section .rodata.cst16.MASK1, "aM", @progbits, 16
# MASK1/MASK2 select the low / high 64-bit half of an XMM register
58 MASK1: .octa 0x0000000000000000ffffffffffffffff
59 .section .rodata.cst16.MASK2, "aM", @progbits, 16
61 MASK2: .octa 0xffffffffffffffff0000000000000000
62 .section .rodata.cst16.ONE, "aM", @progbits, 16
# 128-bit constant 1, used with paddd to increment the CTR counter block
64 ONE: .octa 0x00000000000000000000000000000001
65 .section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
67 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
68 .section .rodata.cst16.dec, "aM", @progbits, 16
71 .section .rodata.cst16.enc, "aM", @progbits, 16
# SHIFT_MASK/ALL_F/zero live in one non-mergeable section because code
# below indexes across them (e.g. ALL_F-SHIFT_MASK(%r12), ALL_F+16):
75 # order of these constants should not change.
76 # more specifically, ALL_F should follow SHIFT_MASK,
77 # and zero should follow ALL_F
78 .section .rodata, "a", @progbits
80 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
81 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
82 .octa 0x00000000000000000000000000000000
87 #define STACK_OFFSET 8*3
# Byte offsets of fields inside the gcm_context_data struct addressed
# via %arg2 (used below as e.g. InLen(%arg2)); slots are 16 bytes wide.
91 #define InLen (16*1)+8
92 #define PBlockEncKey 16*2
95 #define PBlockLen 16*5
96 #define HashKey 16*6 // store HashKey <<1 mod poly here
97 #define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here
98 #define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here
99 #define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here
100 #define HashKey_k 16*10 // store XOR of High 64 bits and Low 64
101 // bits of HashKey <<1 mod poly here
102 //(for Karatsuba purposes)
103 #define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64
104 // bits of HashKey^2 <<1 mod poly here
105 // (for Karatsuba purposes)
106 #define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64
107 // bits of HashKey^3 <<1 mod poly here
108 // (for Karatsuba purposes)
109 #define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64
110 // bits of HashKey^4 <<1 mod poly here
111 // (for Karatsuba purposes)
# Stack-passed arguments 7..11; STACK_OFFSET (8*3) presumably skips three
# registers pushed in the function prologue — TODO confirm against the
# prologue, which is outside this excerpt.
119 #define arg7 STACK_OFFSET+8(%rsp)
120 #define arg8 STACK_OFFSET+16(%rsp)
121 #define arg9 STACK_OFFSET+24(%rsp)
122 #define arg10 STACK_OFFSET+32(%rsp)
123 #define arg11 STACK_OFFSET+40(%rsp)
# Key-length field of the AES context at %arg1; 2*15*16 suggests it sits
# after two arrays of 15 16-byte round keys — verify struct crypto_aes_ctx
124 #define keysize 2*15*16(%arg1)
# Fixed XMM register aliases used by helper code
141 #define BSWAP_MASK %xmm10
145 #define GF128MUL_MASK %xmm7
178 # states of %xmm registers %xmm6:%xmm15 not saved
179 # all %xmm registers are clobbered
190 # Precompute hashkeys.
191 # Input: Hash subkey.
192 # Output: HashKeys stored in gcm_context_data. Only needs to be called
194 # clobbers r12, and tmp xmm registers.
195 .macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
198 movdqa SHUF_MASK(%rip), \TMP2
201 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
# Conditional reduction: build an all-ones/all-zeros mask from the carried
# -out bit of HashKey<<1 and AND it with POLY so the reduction term is
# XORed in only when needed. NOTE(review): the shift itself is on lines
# not shown in this excerpt.
213 pshufd $0x24, \TMP1, \TMP2
214 pcmpeqd TWOONE(%rip), \TMP2
215 pand POLY(%rip), \TMP2
217 movdqu \TMP3, HashKey(%arg2)
# HashKey_k = high64 XOR low64 of HashKey (Karatsuba middle-term operand;
# pshufd $78 swaps the two 64-bit halves, the XOR is on a line not shown)
220 pshufd $78, \TMP3, \TMP1
222 movdqu \TMP1, HashKey_k(%arg2)
# Each GHASH_MUL below multiplies TMP5 by HashKey once more, yielding
# successive powers H^2, H^3, H^4 for the 4-way parallel GHASH.
224 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
225 # TMP5 = HashKey^2<<1 (mod poly)
226 movdqu \TMP5, HashKey_2(%arg2)
227 # HashKey_2 = HashKey^2<<1 (mod poly)
228 pshufd $78, \TMP5, \TMP1
230 movdqu \TMP1, HashKey_2_k(%arg2)
232 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
233 # TMP5 = HashKey^3<<1 (mod poly)
234 movdqu \TMP5, HashKey_3(%arg2)
235 pshufd $78, \TMP5, \TMP1
237 movdqu \TMP1, HashKey_3_k(%arg2)
239 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
240 # TMP5 = HashKey^4<<1 (mod poly)
241 movdqu \TMP5, HashKey_4(%arg2)
242 pshufd $78, \TMP5, \TMP1
244 movdqu \TMP1, HashKey_4_k(%arg2)
247 # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
248 # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
249 .macro GCM_INIT Iv SUBKEY AAD AADLEN
# %r11 seeds the fields below; its zeroing / AADLEN load happens on lines
# not shown in this excerpt.
251 mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
253 mov %r11, InLen(%arg2) # ctx_data.in_length = 0
254 mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
255 mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
258 movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
260 movdqa SHUF_MASK(%rip), %xmm2
262 movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
# Derive HashKey..HashKey_4 (and their _k Karatsuba forms) from the subkey
264 PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
265 movdqu HashKey(%arg2), %xmm13
# Fold the AAD into AadHash (argument list continues on a line not shown)
267 CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
271 # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
272 # struct has been initialized by GCM_INIT.
273 # Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
274 # Clobbers rax, r10-r13, and xmm0-xmm15
275 .macro GCM_ENC_DEC operation
276 movdqu AadHash(%arg2), %xmm8
277 movdqu HashKey(%arg2), %xmm13
278 add %arg5, InLen(%arg2)
280 xor %r11d, %r11d # initialise the data pointer offset as zero
# First consume any partial block carried over from a previous update call
281 PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
283 sub %r11, %arg5 # sub partial block data used
284 mov %arg5, %r13 # save the number of bytes
286 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
288 # Encrypt/Decrypt first few blocks
# Dispatch on (block count mod 4) so the main loop below always sees a
# multiple of 4 blocks (the cmp feeding these branches is not shown).
291 jz _initial_num_blocks_is_0_\@
293 jb _initial_num_blocks_is_1_\@
294 je _initial_num_blocks_is_2_\@
295 _initial_num_blocks_is_3_\@:
296 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
297 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
299 jmp _initial_blocks_\@
300 _initial_num_blocks_is_2_\@:
301 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
302 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
304 jmp _initial_blocks_\@
305 _initial_num_blocks_is_1_\@:
306 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
307 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
309 jmp _initial_blocks_\@
310 _initial_num_blocks_is_0_\@:
311 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
312 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
315 # Main loop - Encrypt/Decrypt remaining blocks
318 je _zero_cipher_left_\@
320 je _four_cipher_left_\@
322 GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
323 %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
328 _four_cipher_left_\@:
329 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
330 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
331 _zero_cipher_left_\@:
# Persist running hash and counter so a later update/finalize can resume
332 movdqu %xmm8, AadHash(%arg2)
333 movdqu %xmm0, CurCount(%arg2)
336 and $15, %r13 # %r13 = arg5 (mod 16)
337 je _multiple_of_16_bytes_\@
339 mov %r13, PBlockLen(%arg2)
341 # Handle the last <16 Byte block separately
342 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
343 movdqu %xmm0, CurCount(%arg2)
344 movdqa SHUF_MASK(%rip), %xmm10
347 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
# Save the keystream block so PARTIAL_BLOCK can finish it next call
348 movdqu %xmm0, PBlockEncKey(%arg2)
# If fewer than 16 readable bytes remain, gather them byte-wise to avoid
# reading past the end of the input buffer.
351 jge _large_enough_update_\@
353 lea (%arg4,%r11,1), %r10
355 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
358 _large_enough_update_\@:
362 # receive the last <16 Byte block
363 movdqu (%arg4, %r11, 1), %xmm1
368 lea SHIFT_MASK+16(%rip), %r12
369 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
370 # (r13 is the number of bytes in plaintext mod 16)
372 # get the appropriate shuffle mask
374 # shift right 16-r13 bytes
378 lea ALL_F+16(%rip), %r12
384 pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn)
386 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
387 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
390 movdqa SHUF_MASK(%rip), %xmm10
395 movdqa SHUF_MASK(%rip), %xmm10
401 movdqu %xmm8, AadHash(%arg2)
403 # GHASH computation for the last <16 byte block
404 movdqa SHUF_MASK(%rip), %xmm10
405 # shuffle xmm0 back to output as ciphertext
# Write the final partial block: 8 bytes at a time, then byte-by-byte
412 jle _less_than_8_bytes_left_\@
413 mov %rax, (%arg3 , %r11, 1)
418 _less_than_8_bytes_left_\@:
419 mov %al, (%arg3, %r11, 1)
423 jne _less_than_8_bytes_left_\@
424 _multiple_of_16_bytes_\@:
427 # GCM_COMPLETE Finishes update of tag of last partial block
428 # Output: Authorization Tag (AUTH_TAG)
429 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
430 .macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
431 movdqu AadHash(%arg2), %xmm8
432 movdqu HashKey(%arg2), %xmm13
434 mov PBlockLen(%arg2), %r12
# If a partial block is still pending, fold it into the running hash
# first (the branch guarding this GHASH_MUL is on a line not shown).
439 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
# Build the final length block len(A)||len(C), both in bits
442 mov AadLen(%arg2), %r12 # %r12 = aadLen (number of bytes)
443 shl $3, %r12 # convert into number of bits
444 movd %r12d, %xmm15 # len(A) in %xmm15
445 mov InLen(%arg2), %r12
446 shl $3, %r12 # len(C) in bits (*128)
449 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
450 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
452 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
453 # final GHASH computation
454 movdqa SHUF_MASK(%rip), %xmm10
# Tag = GHASH XOR E(K, Y0), computed from the saved original IV
457 movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0
458 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
461 mov \AUTHTAG, %r10 # %r10 = authTag
462 mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len
494 jmp _return_T_done_\@
501 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
504 * Input: A and B (128-bits each, bit-reflected)
505 * Output: C = A*B*x mod poly, (i.e. >>1 )
506 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
507 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
510 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
# Karatsuba: split each 128-bit operand into 64-bit halves; three 64x64
# carry-less multiplies replace four.
512 pshufd $78, \GH, \TMP2
513 pshufd $78, \HK, \TMP3
514 pxor \GH, \TMP2 # TMP2 = a1+a0
515 pxor \HK, \TMP3 # TMP3 = b1+b0
516 pclmulqdq $0x11, \HK, \TMP1 # TMP1 = a1*b1
517 pclmulqdq $0x00, \HK, \GH # GH = a0*b0
518 pclmulqdq $0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
520 pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
522 pslldq $8, \TMP3 # left shift TMP3 2 DWs
523 psrldq $8, \TMP2 # right shift TMP2 2 DWs
525 pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
527 # first phase of the reduction
531 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
532 # in order to perform
534 pslld $31, \TMP2 # packed left shift <<31
535 pslld $30, \TMP3 # packed left shift <<30
536 pslld $25, \TMP4 # packed left shift <<25
537 pxor \TMP3, \TMP2 # xor the shifted versions
540 psrldq $4, \TMP5 # right shift TMP5 1 DW
541 pslldq $12, \TMP2 # left shift TMP2 3 DWs
544 # second phase of the reduction
546 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
547 # in order to perform
551 psrld $1,\TMP2 # packed right shift >>1
552 psrld $2,\TMP3 # packed right shift >>2
553 psrld $7,\TMP4 # packed right shift >>7
554 pxor \TMP3,\TMP2 # xor the shifted versions
558 pxor \TMP1, \GH # result is in GH
561 # Reads DLEN bytes starting at DPTR and stores in XMMDst
562 # where 0 < DLEN < 16
563 # Clobbers %rax, DLEN and XMM1
564 .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
570 jz _done_read_partial_block_\@
# DLEN >= 8 path: gather the trailing bytes one at a time through %al.
# NOTE(review): the loop head and the DLEN decrement are on lines not
# shown in this excerpt.
574 mov 7(\DPTR, \DLEN, 1), %al
576 jnz _read_next_byte_\@
580 jmp _done_read_partial_block_\@
583 _read_next_byte_lt8_\@:
# DLEN < 8 path: gather bytes from the end of the buffer backwards.
585 mov -1(\DPTR, \DLEN, 1), %al
587 jnz _read_next_byte_lt8_\@
589 _done_read_partial_block_\@:
592 # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
593 # clobbers r10-11, xmm14
594 .macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
596 MOVADQ SHUF_MASK(%rip), %xmm14
597 mov \AAD, %r10 # %r10 = AAD
598 mov \AADLEN, %r11 # %r11 = aadLen
# Whole 16-byte AAD blocks: byte-reflect each block, fold into the hash
606 pshufb %xmm14, \TMP7 # byte-reflect the AAD data
608 GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
612 jge _get_AAD_blocks\@
616 /* read the last <16B of AAD */
# Trailing partial block is gathered safely, then hashed the same way
621 READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
622 pshufb %xmm14, \TMP7 # byte-reflect the AAD data
624 GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
# Store the finished AAD hash for the encrypt/decrypt phase
628 movdqu \TMP6, AadHash(%arg2)
631 # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
632 # between update calls.
633 # Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
634 # Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
635 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
636 .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
638 mov PBlockLen(%arg2), %r13
640 je _partial_block_done_\@ # Leave Macro if no partial blocks
641 # Read in input data without over reading
642 cmp $16, \PLAIN_CYPH_LEN
643 jl _fewer_than_16_bytes_\@
644 movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
647 _fewer_than_16_bytes_\@:
648 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
649 mov \PLAIN_CYPH_LEN, %r12
650 READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
652 mov PBlockLen(%arg2), %r13
654 _data_read_\@: # Finished reading in data
# Reload the keystream block saved by the previous update call
656 movdqu PBlockEncKey(%arg2), %xmm9
657 movdqu HashKey(%arg2), %xmm13
659 lea SHIFT_MASK(%rip), %r12
661 # adjust the shuffle mask pointer to be able to shift r13 bytes
662 # (16-r13 is the number of bytes in plaintext mod 16)
664 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
665 pshufb %xmm2, %xmm9 # shift right r13 bytes
# First variant (NOTE(review): the dec/enc .ifc selector is on a line not
# shown) — combine keystream with input, then continue the tag.
669 pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
671 mov \PLAIN_CYPH_LEN, %r10
673 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
675 # Determine if partial block is not being filled and
676 # shift mask accordingly
677 jge _no_extra_mask_1_\@
681 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
682 # get the appropriate mask to mask out bottom r13 bytes of xmm9
683 pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9
686 movdqa SHUF_MASK(%rip), %xmm10
689 pxor %xmm3, \AAD_HASH
692 jl _partial_incomplete_1_\@
694 # GHASH computation for the last <16 Byte block
695 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
698 mov %rax, PBlockLen(%arg2)
700 _partial_incomplete_1_\@:
701 add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
703 movdqu \AAD_HASH, AadHash(%arg2)
# Second variant — same flow as above for the other operation
705 pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn)
707 mov \PLAIN_CYPH_LEN, %r10
709 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
711 # Determine if partial block is not being filled and
712 # shift mask accordingly
713 jge _no_extra_mask_2_\@
717 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
718 # get the appropriate mask to mask out bottom r13 bytes of xmm9
721 movdqa SHUF_MASK(%rip), %xmm1
724 pxor %xmm9, \AAD_HASH
727 jl _partial_incomplete_2_\@
729 # GHASH computation for the last <16 Byte block
730 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
733 mov %rax, PBlockLen(%arg2)
735 _partial_incomplete_2_\@:
736 add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
738 movdqu \AAD_HASH, AadHash(%arg2)
740 movdqa SHUF_MASK(%rip), %xmm10
741 # shuffle xmm9 back to output as ciphertext
745 # output encrypted Bytes
750 # Set r13 to be the number of bytes to write out
754 mov \PLAIN_CYPH_LEN, %r13
# Write out 8 bytes at a time while possible, then byte-by-byte
759 jle _less_than_8_bytes_left_\@
761 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
766 _less_than_8_bytes_left_\@:
767 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
771 jne _less_than_8_bytes_left_\@
772 _partial_block_done_\@:
773 .endm # PARTIAL_BLOCK
776 * if a = number of total plaintext bytes
778 * num_initial_blocks = b mod 4
779 * encrypt the initial num_initial_blocks blocks and apply ghash on
781 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
783 * arg1, %arg2, %arg3 are used as a pointer only, not modified
787 .macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
788 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
789 MOVADQ SHUF_MASK(%rip), %xmm14
791 movdqu AadHash(%arg2), %xmm\i # restore the running AAD hash
793 # start AES for num_initial_blocks blocks
795 movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0
797 .if (\i == 5) || (\i == 6) || (\i == 7)
# Per-block CTR setup: increment counter, byte-swap, XOR round key 0.
# (the .irpc loop directives over \index are on lines not shown)
799 MOVADQ ONE(%RIP),\TMP1
800 MOVADQ 0(%arg1),\TMP2
802 paddd \TMP1, \XMM0 # INCR Y0
804 movdqa \XMM0, %xmm\index
806 MOVADQ \XMM0, %xmm\index
808 pshufb %xmm14, %xmm\index # perform a 16 byte swap
809 pxor \TMP2, %xmm\index
# Derive AES round count from the key size field
813 shr $2,%eax # 128->4, 192->6, 256->8
814 add $5,%eax # 128->9, 192->11, 256->13
819 aesenc \TMP1, %xmm\index
823 jnz aes_loop_initial_\@
827 aesenclast \TMP1, %xmm\index # Last Round
830 movdqu (%arg4 , %r11, 1), \TMP1
831 pxor \TMP1, %xmm\index
832 movdqu %xmm\index, (%arg3 , %r11, 1)
833 # write back plaintext/ciphertext for num_initial_blocks
837 movdqa \TMP1, %xmm\index
839 pshufb %xmm14, %xmm\index
841 # prepare plaintext/ciphertext for GHASH computation
845 # apply GHASH on num_initial_blocks blocks
# One GHASH_MUL per initial block; which chain runs depends on \i_seq
849 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
851 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
853 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
856 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
858 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
861 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
864 jl _initial_blocks_done\@
865 # no need for precomputed values
868 * Precomputations for HashKey parallel with encryption of first 4 blocks.
869 * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
# Set up 4 counter blocks XMM1..XMM4 for the first 4-block batch
871 MOVADQ ONE(%RIP),\TMP1
872 paddd \TMP1, \XMM0 # INCR Y0
874 pshufb %xmm14, \XMM1 # perform a 16 byte swap
876 paddd \TMP1, \XMM0 # INCR Y0
878 pshufb %xmm14, \XMM2 # perform a 16 byte swap
880 paddd \TMP1, \XMM0 # INCR Y0
882 pshufb %xmm14, \XMM3 # perform a 16 byte swap
884 paddd \TMP1, \XMM0 # INCR Y0
886 pshufb %xmm14, \XMM4 # perform a 16 byte swap
888 MOVADQ 0(%arg1),\TMP1
893 .irpc index, 1234 # do 4 rounds
894 movaps 0x10*\index(%arg1), \TMP1
900 .irpc index, 56789 # do next 5 rounds
901 movaps 0x10*\index(%arg1), \TMP1
# Extra rounds for AES-192/256 keys
909 shr $2,%eax # 128->4, 192->6, 256->8
910 sub $4,%eax # 128->0, 192->2, 256->4
911 jz aes_loop_pre_done\@
916 aesenc \TMP2, %xmm\index
924 aesenclast \TMP2, \XMM1
925 aesenclast \TMP2, \XMM2
926 aesenclast \TMP2, \XMM3
927 aesenclast \TMP2, \XMM4
# XOR keystream with input and write out the 4 blocks
928 movdqu 16*0(%arg4 , %r11 , 1), \TMP1
931 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
934 movdqu 16*1(%arg4 , %r11 , 1), \TMP1
937 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
940 movdqu 16*2(%arg4 , %r11 , 1), \TMP1
943 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
946 movdqu 16*3(%arg4 , %r11 , 1), \TMP1
949 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
952 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
953 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
954 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
955 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
959 pshufb %xmm14, \XMM1 # perform a 16 byte swap
961 # combine GHASHed value with the corresponding ciphertext
962 pshufb %xmm14, \XMM2 # perform a 16 byte swap
963 pshufb %xmm14, \XMM3 # perform a 16 byte swap
964 pshufb %xmm14, \XMM4 # perform a 16 byte swap
966 _initial_blocks_done\@:
971 * encrypt 4 blocks at a time
972 * ghash the 4 previously encrypted ciphertext blocks
973 * arg1, %arg3, %arg4 are used as pointers only, not modified
974 * %r11 is the data offset value
976 .macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
977 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
# The GHASH of the previous 4 ciphertext blocks (XMM5..XMM8, against
# HashKey_4..HashKey) is interleaved with the AES rounds of the next 4
# counter blocks (XMM1..XMM4) to hide instruction latencies.
984 movdqa SHUF_MASK(%rip), %xmm15
985 # multiply TMP5 * HashKey using karatsuba
988 pshufd $78, \XMM5, \TMP6
990 paddd ONE(%rip), \XMM0 # INCR CNT
991 movdqu HashKey_4(%arg2), \TMP5
992 pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1
994 paddd ONE(%rip), \XMM0 # INCR CNT
996 paddd ONE(%rip), \XMM0 # INCR CNT
998 paddd ONE(%rip), \XMM0 # INCR CNT
1000 pshufb %xmm15, \XMM1 # perform a 16 byte swap
1001 pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1002 pshufb %xmm15, \XMM2 # perform a 16 byte swap
1003 pshufb %xmm15, \XMM3 # perform a 16 byte swap
1004 pshufb %xmm15, \XMM4 # perform a 16 byte swap
1010 movdqu HashKey_4_k(%arg2), \TMP5
1011 pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1012 movaps 0x10(%arg1), \TMP1
1013 aesenc \TMP1, \XMM1 # Round 1
1017 movaps 0x20(%arg1), \TMP1
1018 aesenc \TMP1, \XMM1 # Round 2
1023 pshufd $78, \XMM6, \TMP2
1025 movdqu HashKey_3(%arg2), \TMP5
1026 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1027 movaps 0x30(%arg1), \TMP3
1028 aesenc \TMP3, \XMM1 # Round 3
1032 pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1033 movaps 0x40(%arg1), \TMP3
1034 aesenc \TMP3, \XMM1 # Round 4
1038 movdqu HashKey_3_k(%arg2), \TMP5
1039 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1040 movaps 0x50(%arg1), \TMP3
1041 aesenc \TMP3, \XMM1 # Round 5
1046 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1050 pshufd $78, \XMM7, \TMP2
1052 movdqu HashKey_2(%arg2), \TMP5
1054 # Multiply TMP5 * HashKey using karatsuba
1056 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1057 movaps 0x60(%arg1), \TMP3
1058 aesenc \TMP3, \XMM1 # Round 6
1062 pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1063 movaps 0x70(%arg1), \TMP3
1064 aesenc \TMP3, \XMM1 # Round 7
1068 movdqu HashKey_2_k(%arg2), \TMP5
1069 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1070 movaps 0x80(%arg1), \TMP3
1071 aesenc \TMP3, \XMM1 # Round 8
1076 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1080 # Multiply XMM8 * HashKey
1081 # XMM8 and TMP5 hold the values for the two operands
1084 pshufd $78, \XMM8, \TMP2
1086 movdqu HashKey(%arg2), \TMP5
1087 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1088 movaps 0x90(%arg1), \TMP3
1089 aesenc \TMP3, \XMM1 # Round 9
1093 pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0
1094 lea 0xa0(%arg1),%r10
# Extra rounds for AES-192/256 keys
1096 shr $2,%eax # 128->4, 192->6, 256->8
1097 sub $4,%eax # 128->0, 192->2, 256->4
1098 jz aes_loop_par_enc_done\@
1103 aesenc \TMP3, %xmm\index
1107 jnz aes_loop_par_enc\@
1109 aes_loop_par_enc_done\@:
1110 MOVADQ (%r10), \TMP3
1111 aesenclast \TMP3, \XMM1 # Round 10
1112 aesenclast \TMP3, \XMM2
1113 aesenclast \TMP3, \XMM3
1114 aesenclast \TMP3, \XMM4
1115 movdqu HashKey_k(%arg2), \TMP5
1116 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
# XOR keystream with plaintext, write ciphertext, keep the ciphertext
# (byte-reflected) as next iteration's GHASH input
1117 movdqu (%arg4,%r11,1), \TMP3
1118 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1119 movdqu 16(%arg4,%r11,1), \TMP3
1120 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1121 movdqu 32(%arg4,%r11,1), \TMP3
1122 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1123 movdqu 48(%arg4,%r11,1), \TMP3
1124 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1125 movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer
1126 movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer
1127 movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer
1128 movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer
1129 pshufb %xmm15, \XMM1 # perform a 16 byte swap
1130 pshufb %xmm15, \XMM2 # perform a 16 byte swap
1131 pshufb %xmm15, \XMM3 # perform a 16 byte swap
1132 pshufb %xmm15, \XMM4 # perform a 16 byte swap
1140 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1141 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1143 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1145 # first phase of reduction
1150 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1151 pslld $31, \TMP2 # packed left shift << 31
1152 pslld $30, \TMP3 # packed left shift << 30
1153 pslld $25, \TMP4 # packed left shift << 25
1154 pxor \TMP3, \TMP2 # xor the shifted versions
1157 psrldq $4, \TMP5 # right shift T5 1 DW
1158 pslldq $12, \TMP2 # left shift T2 3 DWs
1161 # second phase of reduction
1163 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1166 psrld $1, \TMP2 # packed right shift >>1
1167 psrld $2, \TMP3 # packed right shift >>2
1168 psrld $7, \TMP4 # packed right shift >>7
1169 pxor \TMP3,\TMP2 # xor the shifted versions
1173 pxor \TMP1, \XMM5 # result is in XMM5
1179 * decrypt 4 blocks at a time
1180 * ghash the 4 previously decrypted ciphertext blocks
1181 * arg1, %arg3, %arg4 are used as pointers only, not modified
1182 * %r11 is the data offset value
1184 .macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
1185 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
# Mirror of the _enc variant; differs at the output stage where the
# plaintext is written immediately after each XOR (see below).
1192 movdqa SHUF_MASK(%rip), %xmm15
1193 # multiply TMP5 * HashKey using karatsuba
1196 pshufd $78, \XMM5, \TMP6
1198 paddd ONE(%rip), \XMM0 # INCR CNT
1199 movdqu HashKey_4(%arg2), \TMP5
1200 pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1
1202 paddd ONE(%rip), \XMM0 # INCR CNT
1204 paddd ONE(%rip), \XMM0 # INCR CNT
1206 paddd ONE(%rip), \XMM0 # INCR CNT
1208 pshufb %xmm15, \XMM1 # perform a 16 byte swap
1209 pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1210 pshufb %xmm15, \XMM2 # perform a 16 byte swap
1211 pshufb %xmm15, \XMM3 # perform a 16 byte swap
1212 pshufb %xmm15, \XMM4 # perform a 16 byte swap
1218 movdqu HashKey_4_k(%arg2), \TMP5
1219 pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1220 movaps 0x10(%arg1), \TMP1
1221 aesenc \TMP1, \XMM1 # Round 1
1225 movaps 0x20(%arg1), \TMP1
1226 aesenc \TMP1, \XMM1 # Round 2
1231 pshufd $78, \XMM6, \TMP2
1233 movdqu HashKey_3(%arg2), \TMP5
1234 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1235 movaps 0x30(%arg1), \TMP3
1236 aesenc \TMP3, \XMM1 # Round 3
1240 pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1241 movaps 0x40(%arg1), \TMP3
1242 aesenc \TMP3, \XMM1 # Round 4
1246 movdqu HashKey_3_k(%arg2), \TMP5
1247 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1248 movaps 0x50(%arg1), \TMP3
1249 aesenc \TMP3, \XMM1 # Round 5
1254 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1258 pshufd $78, \XMM7, \TMP2
1260 movdqu HashKey_2(%arg2), \TMP5
1262 # Multiply TMP5 * HashKey using karatsuba
1264 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1265 movaps 0x60(%arg1), \TMP3
1266 aesenc \TMP3, \XMM1 # Round 6
1270 pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1271 movaps 0x70(%arg1), \TMP3
1272 aesenc \TMP3, \XMM1 # Round 7
1276 movdqu HashKey_2_k(%arg2), \TMP5
1277 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1278 movaps 0x80(%arg1), \TMP3
1279 aesenc \TMP3, \XMM1 # Round 8
1284 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1288 # Multiply XMM8 * HashKey
1289 # XMM8 and TMP5 hold the values for the two operands
1292 pshufd $78, \XMM8, \TMP2
1294 movdqu HashKey(%arg2), \TMP5
1295 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1296 movaps 0x90(%arg1), \TMP3
1297 aesenc \TMP3, \XMM1 # Round 9
1301 pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0
1302 lea 0xa0(%arg1),%r10
# Extra rounds for AES-192/256 keys
1304 shr $2,%eax # 128->4, 192->6, 256->8
1305 sub $4,%eax # 128->0, 192->2, 256->4
1306 jz aes_loop_par_dec_done\@
1311 aesenc \TMP3, %xmm\index
1315 jnz aes_loop_par_dec\@
1317 aes_loop_par_dec_done\@:
1318 MOVADQ (%r10), \TMP3
1319 aesenclast \TMP3, \XMM1 # last round
1320 aesenclast \TMP3, \XMM2
1321 aesenclast \TMP3, \XMM3
1322 aesenclast \TMP3, \XMM4
1323 movdqu HashKey_k(%arg2), \TMP5
1324 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
# XOR keystream with ciphertext, write plaintext right away; the original
# ciphertext (byte-reflected below) feeds the next GHASH round
1325 movdqu (%arg4,%r11,1), \TMP3
1326 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1327 movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer
1329 movdqu 16(%arg4,%r11,1), \TMP3
1330 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1331 movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer
1333 movdqu 32(%arg4,%r11,1), \TMP3
1334 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1335 movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer
1337 movdqu 48(%arg4,%r11,1), \TMP3
1338 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1339 movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer
1341 pshufb %xmm15, \XMM1 # perform a 16 byte swap
1342 pshufb %xmm15, \XMM2 # perform a 16 byte swap
1343 pshufb %xmm15, \XMM3 # perform a 16 byte swap
1344 pshufb %xmm15, \XMM4 # perform a 16 byte swap
1352 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1353 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1355 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1357 # first phase of reduction
1362 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1363 pslld $31, \TMP2 # packed left shift << 31
1364 pslld $30, \TMP3 # packed left shift << 30
1365 pslld $25, \TMP4 # packed left shift << 25
1366 pxor \TMP3, \TMP2 # xor the shifted versions
1369 psrldq $4, \TMP5 # right shift T5 1 DW
1370 pslldq $12, \TMP2 # left shift T2 3 DWs
1373 # second phase of reduction
1375 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1378 psrld $1, \TMP2 # packed right shift >>1
1379 psrld $2, \TMP3 # packed right shift >>2
1380 psrld $7, \TMP4 # packed right shift >>7
1381 pxor \TMP3,\TMP2 # xor the shifted versions
1385 pxor \TMP1, \XMM5 # result is in XMM5
1390 /* GHASH the last 4 ciphertext blocks. */
1391 .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1392 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
# Multiplies XMM1..XMM4 by HashKey_4..HashKey respectively (Karatsuba),
# accumulates all four products, then reduces mod the GHASH polynomial.
1394 # Multiply TMP6 * HashKey (using Karatsuba)
1397 pshufd $78, \XMM1, \TMP2
1399 movdqu HashKey_4(%arg2), \TMP5
1400 pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1401 pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1402 movdqu HashKey_4_k(%arg2), \TMP4
1403 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1404 movdqa \XMM1, \XMMDst
1405 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1407 # Multiply TMP1 * HashKey (using Karatsuba)
1410 pshufd $78, \XMM2, \TMP2
1412 movdqu HashKey_3(%arg2), \TMP5
1413 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1414 pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1415 movdqu HashKey_3_k(%arg2), \TMP4
1416 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1420 # results accumulated in TMP6, XMMDst, XMM1
1422 # Multiply TMP1 * HashKey (using Karatsuba)
1425 pshufd $78, \XMM3, \TMP2
1427 movdqu HashKey_2(%arg2), \TMP5
1428 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1429 pclmulqdq $0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1430 movdqu HashKey_2_k(%arg2), \TMP4
1431 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1434 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1436 # Multiply TMP1 * HashKey (using Karatsuba)
1438 pshufd $78, \XMM4, \TMP2
1440 movdqu HashKey(%arg2), \TMP5
1441 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1442 pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1443 movdqu HashKey_k(%arg2), \TMP4
1444 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1450 # middle section of the temp results combined as in karatsuba algorithm
1452 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1453 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1456 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1457 # first phase of the reduction
1458 movdqa \XMMDst, \TMP2
1459 movdqa \XMMDst, \TMP3
1460 movdqa \XMMDst, \TMP4
1461 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1462 pslld $31, \TMP2 # packed left shifting << 31
1463 pslld $30, \TMP3 # packed left shifting << 30
1464 pslld $25, \TMP4 # packed left shifting << 25
1465 pxor \TMP3, \TMP2 # xor the shifted versions
1468 psrldq $4, \TMP7 # right shift TMP7 1 DW
1469 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1472 # second phase of the reduction
1473 movdqa \XMMDst, \TMP2
1474 # make 3 copies of XMMDst for doing 3 shift operations
1475 movdqa \XMMDst, \TMP3
1476 movdqa \XMMDst, \TMP4
1477 psrld $1, \TMP2 # packed right shift >> 1
1478 psrld $2, \TMP3 # packed right shift >> 2
1479 psrld $7, \TMP4 # packed right shift >> 7
1480 pxor \TMP3, \TMP2 # xor the shifted versions
1484 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1488 /* Encryption of a single block
# Encrypt one 16-byte block in \XMM0 with the expanded key schedule at
# %arg1, using \TMP1 as scratch for round keys.  On entry %eax is expected
# to hold the AES key length in bytes (per the arithmetic comments below);
# it is turned into the round count: 9 (AES-128), 11 (AES-192), 13 (AES-256).
1492 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1496 shr $2,%eax # 128->4, 192->6, 256->8
1497 add $5,%eax # 128->9, 192->11, 256->13
1498 lea 16(%arg1), %r10 # get first expanded key address
# (AESENC round loop elided here in full source); finish with the last round:
1508 aesenclast \TMP1,\XMM0
1510 /*****************************************************************************
1511 * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1512 * struct gcm_context_data *data
1514 * u8 *out, // Plaintext output. Encrypt in-place is allowed.
1515 * const u8 *in, // Ciphertext input
1516 * u64 plaintext_len, // Length of data in bytes for decryption.
1517 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1518 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1519 * // concatenated with 0x00000001. 16-byte aligned pointer.
1520 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1521 * const u8 *aad, // Additional Authentication Data (AAD)
1522 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1523 * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1524 * // given authentication tag and only return the plaintext if they match.
1525 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1526 * // (most likely), 12 or 8.
1531 * keys are pre-expanded and aligned to 16 bytes. we are using the first
1532 * set of 11 keys in the data structure void *aes_ctx
1536 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1537 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1538 * | Salt (From the SA) |
1539 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1540 * | Initialization Vector |
1541 * | (This is the sequence number from IPSec header) |
1542 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1544 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1549 * AAD padded to 128 bits with 0
1550 * for example, assume AAD is a u32 vector
1552 * if AAD is 8 bytes:
1553 * AAD[3] = {A0, A1};
1554 * padded AAD in xmm register = {A1 A0 0 0}
1557 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1558 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1560 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1561 * | 32-bit Sequence Number (A0) |
1562 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1564 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1566 * AAD Format with 32-bit Sequence Number
1568 * if AAD is 12 bytes:
1569 * AAD[3] = {A0, A1, A2};
1570 * padded AAD in xmm register = {A2 A1 A0 0}
1573 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1574 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1575 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1576 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1578 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1579 * | 64-bit Extended Sequence Number {A1,A0} |
1581 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1583 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1585 * AAD Format with 64-bit Extended Sequence Number
1587 * poly = x^128 + x^127 + x^126 + x^121 + 1
1589 *****************************************************************************/
# One-shot AES-GCM decryption (see prototype comment above): initialize the
# GCM state from IV / hash subkey / AAD, run the bulk decryption, then
# compute the authentication tag via GCM_COMPLETE.
1590 SYM_FUNC_START(aesni_gcm_dec)
1593 GCM_INIT %arg6, arg7, arg8, arg9
1595 GCM_COMPLETE arg10, arg11
1598 SYM_FUNC_END(aesni_gcm_dec)
1601 /*****************************************************************************
1602 * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1603 * struct gcm_context_data *data
1605 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1606 * const u8 *in, // Plaintext input
1607 * u64 plaintext_len, // Length of data in bytes for encryption.
1608 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1609 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1610 * // concatenated with 0x00000001. 16-byte aligned pointer.
1611 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1612 * const u8 *aad, // Additional Authentication Data (AAD)
1613 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1614 * u8 *auth_tag, // Authenticated Tag output.
1615 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1621 * keys are pre-expanded and aligned to 16 bytes. we are using the
1622 * first set of 11 keys in the data structure void *aes_ctx
1627 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1628 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1629 * | Salt (From the SA) |
1630 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1631 * | Initialization Vector |
1632 * | (This is the sequence number from IPSec header) |
1633 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1635 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1640 * AAD padded to 128 bits with 0
1641 * for example, assume AAD is a u32 vector
1643 * if AAD is 8 bytes:
1644 * AAD[3] = {A0, A1};
1645 * padded AAD in xmm register = {A1 A0 0 0}
1648 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1649 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1651 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1652 * | 32-bit Sequence Number (A0) |
1653 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1655 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1657 * AAD Format with 32-bit Sequence Number
1659 * if AAD is 12 bytes:
1660 * AAD[3] = {A0, A1, A2};
1661 * padded AAD in xmm register = {A2 A1 A0 0}
1664 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1665 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1667 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1668 * | 64-bit Extended Sequence Number {A1,A0} |
1670 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1672 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1674 * AAD Format with 64-bit Extended Sequence Number
1676 * poly = x^128 + x^127 + x^126 + x^121 + 1
1677 ***************************************************************************/
# One-shot AES-GCM encryption (see prototype comment above): initialize the
# GCM state from IV / hash subkey / AAD, run the bulk encryption, then
# compute the authentication tag via GCM_COMPLETE.
1678 SYM_FUNC_START(aesni_gcm_enc)
1681 GCM_INIT %arg6, arg7, arg8, arg9
1684 GCM_COMPLETE arg10, arg11
1687 SYM_FUNC_END(aesni_gcm_enc)
1689 /*****************************************************************************
1690 * void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1691 * struct gcm_context_data *data,
1693 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1694 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1695 * // concatenated with 0x00000001. 16-byte aligned pointer.
1696 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1697 * const u8 *aad, // Additional Authentication Data (AAD)
1698 * u64 aad_len) // Length of AAD in bytes.
# Scatter-gather entry point: only initialize the GCM context (counter block,
# hash subkey powers, AAD hash) — encryption/decryption and tag generation
# are done by the *_update / finalize functions below.
1700 SYM_FUNC_START(aesni_gcm_init)
1702 GCM_INIT %arg3, %arg4,%arg5, %arg6
1705 SYM_FUNC_END(aesni_gcm_init)
1707 /*****************************************************************************
1708 * void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1709 * struct gcm_context_data *data,
1711 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1712 * const u8 *in, // Plaintext input
1713 * u64 plaintext_len, // Length of data in bytes for encryption.
# Encrypt one span of plaintext into an already-initialized GCM context
# (state set up by aesni_gcm_init; tag produced by aesni_gcm_finalize).
1715 SYM_FUNC_START(aesni_gcm_enc_update)
1720 SYM_FUNC_END(aesni_gcm_enc_update)
1722 /*****************************************************************************
1723 * void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1724 * struct gcm_context_data *data,
1726 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1727 * const u8 *in, // Plaintext input
1728 * u64 plaintext_len, // Length of data in bytes for encryption.
# Decrypt one span of ciphertext into an already-initialized GCM context
# (state set up by aesni_gcm_init; tag produced by aesni_gcm_finalize).
1730 SYM_FUNC_START(aesni_gcm_dec_update)
1735 SYM_FUNC_END(aesni_gcm_dec_update)
1737 /*****************************************************************************
1738 * void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1739 * struct gcm_context_data *data,
1741 * u8 *auth_tag, // Authenticated Tag output.
1742 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
# Finish a scatter-gather GCM operation: emit the authentication tag
# (auth_tag / auth_tag_len in arg3 / arg4) from the accumulated state.
1745 SYM_FUNC_START(aesni_gcm_finalize)
1747 GCM_COMPLETE %arg3 %arg4
1750 SYM_FUNC_END(aesni_gcm_finalize)
# Key-expansion helper, shared between AES-128 (every round) and AES-256
# (even rounds): combines the AESKEYGENASSIST result in %xmm1 with the
# previous round key in %xmm0 and stores the new round key at (TKEYP).
# %xmm4 must be zero on entry (cleared once in aesni_set_key).
1755 SYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128)
1756 SYM_FUNC_START_LOCAL(_key_expansion_256a)
1757 pshufd $0b11111111, %xmm1, %xmm1
1758 shufps $0b00010000, %xmm0, %xmm4
1760 shufps $0b10001100, %xmm0, %xmm4
1763 movaps %xmm0, (TKEYP)
1766 SYM_FUNC_END(_key_expansion_256a)
1767 SYM_FUNC_END_ALIAS(_key_expansion_128)
# AES-192 key-expansion helper (odd steps): derives 1.5 round keys worth of
# material from %xmm1 (AESKEYGENASSIST result) and %xmm0/%xmm2, and stores
# two 16-byte round-key slots at (TKEYP) and 0x10(TKEYP).
# %xmm4 must be zero on entry.
1769 SYM_FUNC_START_LOCAL(_key_expansion_192a)
1770 pshufd $0b01010101, %xmm1, %xmm1
1771 shufps $0b00010000, %xmm0, %xmm4
1773 shufps $0b10001100, %xmm0, %xmm4
1780 pshufd $0b11111111, %xmm0, %xmm3
1785 shufps $0b01000100, %xmm0, %xmm6
1786 movaps %xmm6, (TKEYP)
1787 shufps $0b01001110, %xmm2, %xmm1
1788 movaps %xmm1, 0x10(TKEYP)
1791 SYM_FUNC_END(_key_expansion_192a)
# AES-192 key-expansion helper (even steps): like _key_expansion_192a but
# emits a single round-key slot at (TKEYP).  %xmm4 must be zero on entry.
1793 SYM_FUNC_START_LOCAL(_key_expansion_192b)
1794 pshufd $0b01010101, %xmm1, %xmm1
1795 shufps $0b00010000, %xmm0, %xmm4
1797 shufps $0b10001100, %xmm0, %xmm4
1803 pshufd $0b11111111, %xmm0, %xmm3
1807 movaps %xmm0, (TKEYP)
1810 SYM_FUNC_END(_key_expansion_192b)
# AES-256 key-expansion helper (odd rounds): updates the second key half in
# %xmm2 from the AESKEYGENASSIST result in %xmm1 and stores it at (TKEYP).
# %xmm4 must be zero on entry.
1812 SYM_FUNC_START_LOCAL(_key_expansion_256b)
1813 pshufd $0b10101010, %xmm1, %xmm1
1814 shufps $0b00010000, %xmm2, %xmm4
1816 shufps $0b10001100, %xmm2, %xmm4
1819 movaps %xmm2, (TKEYP)
1822 SYM_FUNC_END(_key_expansion_256b)
1825 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1826 * unsigned int key_len)
# Expand the user key into the crypto_aes_ctx: encryption round keys first,
# then the key length at offset 480, then the decryption schedule derived
# from the encryption one.  The (%esp)-relative loads are the 32-bit
# stack-argument path.
1828 SYM_FUNC_START(aesni_set_key)
1832 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1833 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1834 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
# Round key 0 is the first 16 bytes of the user key, stored as-is.
1836 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1837 movaps %xmm0, (KEYP)
1838 lea 0x10(KEYP), TKEYP # key addr
1839 movl %edx, 480(KEYP)
1840 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
# --- AES-256 schedule: second 16 key bytes form round key 1, then the
# helpers alternate on the two key halves with rcon 0x01..0x40.
1844 movups 0x10(UKEYP), %xmm2 # other user key
1845 movaps %xmm2, (TKEYP)
1847 aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
1848 call _key_expansion_256a
1849 aeskeygenassist $0x1, %xmm0, %xmm1
1850 call _key_expansion_256b
1851 aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
1852 call _key_expansion_256a
1853 aeskeygenassist $0x2, %xmm0, %xmm1
1854 call _key_expansion_256b
1855 aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
1856 call _key_expansion_256a
1857 aeskeygenassist $0x4, %xmm0, %xmm1
1858 call _key_expansion_256b
1859 aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
1860 call _key_expansion_256a
1861 aeskeygenassist $0x8, %xmm0, %xmm1
1862 call _key_expansion_256b
1863 aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
1864 call _key_expansion_256a
1865 aeskeygenassist $0x10, %xmm0, %xmm1
1866 call _key_expansion_256b
1867 aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
1868 call _key_expansion_256a
1869 aeskeygenassist $0x20, %xmm0, %xmm1
1870 call _key_expansion_256b
1871 aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
1872 call _key_expansion_256a
# --- AES-192 schedule: last 8 key bytes, eight expansion steps alternating
# the 192a/192b helpers with rcon 0x01..0x80.
1875 movq 0x10(UKEYP), %xmm2 # other user key
1876 aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
1877 call _key_expansion_192a
1878 aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
1879 call _key_expansion_192b
1880 aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
1881 call _key_expansion_192a
1882 aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
1883 call _key_expansion_192b
1884 aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
1885 call _key_expansion_192a
1886 aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
1887 call _key_expansion_192b
1888 aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
1889 call _key_expansion_192a
1890 aeskeygenassist $0x80, %xmm2, %xmm1 # round 8
1891 call _key_expansion_192b
# --- AES-128 schedule: ten expansion rounds with the standard rcon
# sequence 0x01..0x36.
1894 aeskeygenassist $0x1, %xmm0, %xmm1 # round 1
1895 call _key_expansion_128
1896 aeskeygenassist $0x2, %xmm0, %xmm1 # round 2
1897 call _key_expansion_128
1898 aeskeygenassist $0x4, %xmm0, %xmm1 # round 3
1899 call _key_expansion_128
1900 aeskeygenassist $0x8, %xmm0, %xmm1 # round 4
1901 call _key_expansion_128
1902 aeskeygenassist $0x10, %xmm0, %xmm1 # round 5
1903 call _key_expansion_128
1904 aeskeygenassist $0x20, %xmm0, %xmm1 # round 6
1905 call _key_expansion_128
1906 aeskeygenassist $0x40, %xmm0, %xmm1 # round 7
1907 call _key_expansion_128
1908 aeskeygenassist $0x80, %xmm0, %xmm1 # round 8
1909 call _key_expansion_128
1910 aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9
1911 call _key_expansion_128
1912 aeskeygenassist $0x36, %xmm0, %xmm1 # round 10
1913 call _key_expansion_128
# --- Build the decryption schedule: swap first/last round keys (offsets 0
# and 240), then walk the table from both ends copying keys in reverse
# order (the inverse-mixcolumns transform is applied in elided lines).
1916 movaps (KEYP), %xmm0
1917 movaps (TKEYP), %xmm1
1918 movaps %xmm0, 240(TKEYP)
1919 movaps %xmm1, 240(KEYP)
1921 lea 240-16(TKEYP), UKEYP
1924 movaps (KEYP), %xmm0
1926 movaps %xmm1, (UKEYP)
1937 SYM_FUNC_END(aesni_set_key)
1940 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
# Encrypt a single 16-byte block: load args (32-bit stack path shown),
# read the key length from ctx offset 480, run _aesni_enc1 on STATE
# and store the result.
1942 SYM_FUNC_START(aesni_enc)
1947 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1948 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1949 movl (FRAME_OFFSET+20)(%esp), INP # src
1951 movl 480(KEYP), KLEN # key length
1952 movups (INP), STATE # input
1954 movups STATE, (OUTP) # output
1961 SYM_FUNC_END(aesni_enc)
1964 * _aesni_enc1: internal ABI
1966 * KEYP: key struct pointer
1968 * STATE: initial state (input)
1970 * STATE: final state (output)
# Encrypt STATE with the expanded key schedule at KEYP (internal ABI, see
# comment above).  Round 0 is the whitening XOR; the negative TKEYP offsets
# are the extra rounds taken only for 192/256-bit keys, the positive offsets
# are the common rounds (AESENC steps between the key loads are elided in
# this listing).  Ends with AESENCLAST on the final round key.
1975 SYM_FUNC_START_LOCAL(_aesni_enc1)
1976 movaps (KEYP), KEY # key
1978 pxor KEY, STATE # round 0
1982 lea 0x20(TKEYP), TKEYP
1985 movaps -0x60(TKEYP), KEY
1987 movaps -0x50(TKEYP), KEY
1991 movaps -0x40(TKEYP), KEY
1993 movaps -0x30(TKEYP), KEY
1997 movaps -0x20(TKEYP), KEY
1999 movaps -0x10(TKEYP), KEY
2003 movaps 0x10(TKEYP), KEY
2005 movaps 0x20(TKEYP), KEY
2007 movaps 0x30(TKEYP), KEY
2009 movaps 0x40(TKEYP), KEY
2011 movaps 0x50(TKEYP), KEY
2013 movaps 0x60(TKEYP), KEY
2015 movaps 0x70(TKEYP), KEY
2016 aesenclast KEY, STATE
2018 SYM_FUNC_END(_aesni_enc1)
2021 * _aesni_enc4: internal ABI
2023 * KEYP: key struct pointer
2025 * STATE1: initial state (input)
2030 * STATE1: final state (output)
# Encrypt four blocks (STATE1..STATE4) in parallel with the key schedule at
# KEYP — same round structure as _aesni_enc1, with each round key applied to
# all four states to keep the AES-NI pipeline full.
2038 SYM_FUNC_START_LOCAL(_aesni_enc4)
2039 movaps (KEYP), KEY # key
2041 pxor KEY, STATE1 # round 0
2048 lea 0x20(TKEYP), TKEYP
2051 movaps -0x60(TKEYP), KEY
2056 movaps -0x50(TKEYP), KEY
2063 movaps -0x40(TKEYP), KEY
2068 movaps -0x30(TKEYP), KEY
2075 movaps -0x20(TKEYP), KEY
2080 movaps -0x10(TKEYP), KEY
2090 movaps 0x10(TKEYP), KEY
2095 movaps 0x20(TKEYP), KEY
2100 movaps 0x30(TKEYP), KEY
2105 movaps 0x40(TKEYP), KEY
2110 movaps 0x50(TKEYP), KEY
2115 movaps 0x60(TKEYP), KEY
2120 movaps 0x70(TKEYP), KEY
2121 aesenclast KEY, STATE1 # last round
2122 aesenclast KEY, STATE2
2123 aesenclast KEY, STATE3
2124 aesenclast KEY, STATE4
2126 SYM_FUNC_END(_aesni_enc4)
2129 * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
# Decrypt a single 16-byte block: mirror of aesni_enc using the decryption
# key schedule via _aesni_dec1.
2131 SYM_FUNC_START(aesni_dec)
2136 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
2137 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
2138 movl (FRAME_OFFSET+20)(%esp), INP # src
2140 mov 480(KEYP), KLEN # key length
2142 movups (INP), STATE # input
2144 movups STATE, (OUTP) # output
2151 SYM_FUNC_END(aesni_dec)
2154 * _aesni_dec1: internal ABI
2156 * KEYP: key struct pointer
2158 * STATE: initial state (input)
2160 * STATE: final state (output)
# Decrypt STATE with the (inverse) key schedule at KEYP — mirror of
# _aesni_enc1: whitening XOR, key-length dependent extra rounds at the
# negative TKEYP offsets, common rounds at positive offsets (AESDEC steps
# elided in this listing), ending with AESDECLAST.
2165 SYM_FUNC_START_LOCAL(_aesni_dec1)
2166 movaps (KEYP), KEY # key
2168 pxor KEY, STATE # round 0
2172 lea 0x20(TKEYP), TKEYP
2175 movaps -0x60(TKEYP), KEY
2177 movaps -0x50(TKEYP), KEY
2181 movaps -0x40(TKEYP), KEY
2183 movaps -0x30(TKEYP), KEY
2187 movaps -0x20(TKEYP), KEY
2189 movaps -0x10(TKEYP), KEY
2193 movaps 0x10(TKEYP), KEY
2195 movaps 0x20(TKEYP), KEY
2197 movaps 0x30(TKEYP), KEY
2199 movaps 0x40(TKEYP), KEY
2201 movaps 0x50(TKEYP), KEY
2203 movaps 0x60(TKEYP), KEY
2205 movaps 0x70(TKEYP), KEY
2206 aesdeclast KEY, STATE
2208 SYM_FUNC_END(_aesni_dec1)
2211 * _aesni_dec4: internal ABI
2213 * KEYP: key struct pointer
2215 * STATE1: initial state (input)
2220 * STATE1: final state (output)
# Decrypt four blocks (STATE1..STATE4) in parallel — mirror of _aesni_enc4
# using the decryption key schedule and AESDEC/AESDECLAST.
2228 SYM_FUNC_START_LOCAL(_aesni_dec4)
2229 movaps (KEYP), KEY # key
2231 pxor KEY, STATE1 # round 0
2238 lea 0x20(TKEYP), TKEYP
2241 movaps -0x60(TKEYP), KEY
2246 movaps -0x50(TKEYP), KEY
2253 movaps -0x40(TKEYP), KEY
2258 movaps -0x30(TKEYP), KEY
2265 movaps -0x20(TKEYP), KEY
2270 movaps -0x10(TKEYP), KEY
2280 movaps 0x10(TKEYP), KEY
2285 movaps 0x20(TKEYP), KEY
2290 movaps 0x30(TKEYP), KEY
2295 movaps 0x40(TKEYP), KEY
2300 movaps 0x50(TKEYP), KEY
2305 movaps 0x60(TKEYP), KEY
2310 movaps 0x70(TKEYP), KEY
2311 aesdeclast KEY, STATE1 # last round
2312 aesdeclast KEY, STATE2
2313 aesdeclast KEY, STATE3
2314 aesdeclast KEY, STATE4
2316 SYM_FUNC_END(_aesni_dec4)
2319 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
# ECB encryption: process four blocks at a time through _aesni_enc4 while
# enough input remains, then fall back to single blocks via _aesni_enc1.
# Unaligned loads/stores (movups) — no alignment requirement on src/dst.
2322 SYM_FUNC_START(aesni_ecb_enc)
2328 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2329 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2330 movl (FRAME_OFFSET+24)(%esp), INP # src
2331 movl (FRAME_OFFSET+28)(%esp), LEN # len
2333 test LEN, LEN # check length
# Four-blocks-at-a-time path:
2342 movups (INP), STATE1
2343 movups 0x10(INP), STATE2
2344 movups 0x20(INP), STATE3
2345 movups 0x30(INP), STATE4
2347 movups STATE1, (OUTP)
2348 movups STATE2, 0x10(OUTP)
2349 movups STATE3, 0x20(OUTP)
2350 movups STATE4, 0x30(OUTP)
# Single-block tail:
2360 movups (INP), STATE1
2362 movups STATE1, (OUTP)
2376 SYM_FUNC_END(aesni_ecb_enc)
2379 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
# ECB decryption: mirror of aesni_ecb_enc using _aesni_dec4/_aesni_dec1.
2382 SYM_FUNC_START(aesni_ecb_dec)
2388 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2389 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2390 movl (FRAME_OFFSET+24)(%esp), INP # src
2391 movl (FRAME_OFFSET+28)(%esp), LEN # len
# Four-blocks-at-a-time path:
2403 movups (INP), STATE1
2404 movups 0x10(INP), STATE2
2405 movups 0x20(INP), STATE3
2406 movups 0x30(INP), STATE4
2408 movups STATE1, (OUTP)
2409 movups STATE2, 0x10(OUTP)
2410 movups STATE3, 0x20(OUTP)
2411 movups STATE4, 0x30(OUTP)
# Single-block tail:
2421 movups (INP), STATE1
2423 movups STATE1, (OUTP)
2437 SYM_FUNC_END(aesni_ecb_dec)
2440 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2441 * size_t len, u8 *iv)
# CBC encryption: inherently serial (each block chains on the previous
# ciphertext), so blocks are processed one at a time — IV is loaded as the
# initial chaining state and each ciphertext becomes the next state.
2443 SYM_FUNC_START(aesni_cbc_enc)
2450 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2451 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2452 movl (FRAME_OFFSET+28)(%esp), INP # src
2453 movl (FRAME_OFFSET+32)(%esp), LEN # len
2454 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2459 movups (IVP), STATE # load iv as initial state
2462 movups (INP), IN # load input
2465 movups STATE, (OUTP) # store output
2481 SYM_FUNC_END(aesni_cbc_enc)
2484 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2485 * size_t len, u8 *iv)
# CBC decryption: unlike encryption this parallelizes, so four ciphertext
# blocks are decrypted at once (the post-decrypt XOR with the previous
# ciphertext blocks happens in lines elided here), with a one-block tail.
2487 SYM_FUNC_START(aesni_cbc_dec)
2494 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2495 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2496 movl (FRAME_OFFSET+28)(%esp), INP # src
2497 movl (FRAME_OFFSET+32)(%esp), LEN # len
2498 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2501 jb .Lcbc_dec_just_ret
2511 movups 0x10(INP), IN2
2514 movups 0x20(INP), IN3
2516 movups 0x30(INP), IN4
2519 movups 0x20(INP), IN1
2521 movups 0x30(INP), IN2
2536 movups 0x10(INP), IN2
2539 movups STATE1, (OUTP)
2540 movups STATE2, 0x10(OUTP)
2541 movups STATE3, 0x20(OUTP)
2542 movups STATE4, 0x30(OUTP)
# Single-block tail:
2556 movups STATE, (OUTP)
2574 SYM_FUNC_END(aesni_cbc_dec)
2577 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2578 * size_t len, u8 *iv)
# CBC with ciphertext stealing (CTS), encryption side: handles the final
# partial block using the pshufb permute table below.  The two `lea` forms
# are the 32-bit absolute vs 64-bit RIP-relative alternatives (selected by
# preprocessor conditionals elided in this listing).
2580 SYM_FUNC_START(aesni_cts_cbc_enc)
2587 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2588 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2589 movl (FRAME_OFFSET+28)(%esp), INP # src
2590 movl (FRAME_OFFSET+32)(%esp), LEN # len
2591 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2592 lea .Lcts_permute_table, T1
2594 lea .Lcts_permute_table(%rip), T1
2621 movups STATE, (OUTP)
2631 SYM_FUNC_END(aesni_cts_cbc_enc)
2634 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2635 * size_t len, u8 *iv)
# CBC with ciphertext stealing (CTS), decryption side — mirror of
# aesni_cts_cbc_enc, also driven by .Lcts_permute_table.
2637 SYM_FUNC_START(aesni_cts_cbc_dec)
2644 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2645 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2646 movl (FRAME_OFFSET+28)(%esp), INP # src
2647 movl (FRAME_OFFSET+32)(%esp), LEN # len
2648 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2649 lea .Lcts_permute_table, T1
2651 lea .Lcts_permute_table(%rip), T1
2682 movups STATE, (OUTP)
2692 SYM_FUNC_END(aesni_cts_cbc_dec)
# PSHUFB index tables.  In .Lcts_permute_table, 0x80 entries produce zero
# bytes (pshufb zeroes the lane when bit 7 of the index is set) and
# 0x00-0x0f entries select input bytes; indexing at different offsets into
# the table shifts/zero-pads a partial final block for ciphertext stealing.
# The trailing 15..0 row reverses byte order (presumably the bswap mask
# used by the CTR code — its label line is not visible here; verify).
2694 .pushsection .rodata
2696 .Lcts_permute_table:
2697 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2698 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2699 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
2700 .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
2701 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2702 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2705 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2711 * _aesni_inc_init: internal ABI
2712 * setup registers used by _aesni_inc
2716 * CTR: == IV, in little endian
2717 * TCTR_LOW: == lower qword of CTR
2718 * INC: == 1, in little endian
2719 * BSWAP_MASK == endian swapping mask
# Set up the registers used by _aesni_inc (see comment above): load the
# byte-swap mask and convert the big-endian IV into a little-endian counter
# in CTR so it can be incremented with integer adds.
2721 SYM_FUNC_START_LOCAL(_aesni_inc_init)
2722 movaps .Lbswap_mask, BSWAP_MASK
2724 pshufb BSWAP_MASK, CTR
2729 SYM_FUNC_END(_aesni_inc_init)
2732 * _aesni_inc: internal ABI
2733 * Increase IV by 1, IV is in big endian
2736 * CTR: == IV, in little endian
2737 * TCTR_LOW: == lower qword of CTR
2738 * INC: == 1, in little endian
2739 * BSWAP_MASK == endian swapping mask
2743 * CTR: == output IV, in little endian
2744 * TCTR_LOW: == lower qword of CTR
# Advance the counter (see ABI comment above) and produce the next IV:
# the little-endian CTR is incremented (elided lines), then byte-swapped
# back to big endian in IV for encryption.
2746 SYM_FUNC_START_LOCAL(_aesni_inc)
2755 pshufb BSWAP_MASK, IV
2757 SYM_FUNC_END(_aesni_inc)
2760 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2761 * size_t len, u8 *iv)
# CTR mode encryption: counter blocks are generated with _aesni_inc,
# encrypted four at a time (tail handled one block at a time), and XORed
# with the input.  Same routine serves decryption (CTR is symmetric).
2763 SYM_FUNC_START(aesni_ctr_enc)
2766 jb .Lctr_enc_just_ret
2769 call _aesni_inc_init
# Four-blocks-at-a-time path:
2779 movups 0x10(INP), IN2
2782 movups 0x20(INP), IN3
2785 movups 0x30(INP), IN4
2788 movups STATE1, (OUTP)
2790 movups STATE2, 0x10(OUTP)
2792 movups STATE3, 0x20(OUTP)
2794 movups STATE4, 0x30(OUTP)
# Single-block tail:
2809 movups STATE, (OUTP)
2820 SYM_FUNC_END(aesni_ctr_enc)
# Constant for _aesni_gf128mul_x_ble below: holds 0x87 (the low byte of the
# XTS reduction polynomial) and 0x01, used to fold the carry when doubling
# an XTS tweak in GF(2^128).
2824 .section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
2826 .Lgf128mul_x_ble_mask:
2827 .octa 0x00000000000000010000000000000087
2831 * _aesni_gf128mul_x_ble: internal ABI
2832 * Multiply in GF(2^128) for XTS IVs
2835 * GF128MUL_MASK == mask with 0x87 and 0x01
2839 * CTR: == temporary value
# Multiply the XTS tweak in IV by x in GF(2^128) (little-endian block
# convention): pshufd replicates the sign dwords so the pand against
# GF128MUL_MASK yields the conditional 0x87 reduction term (remaining
# shift/xor steps of the macro are on continuation lines elided here).
# Clobbers KEY and CTR per the ABI comment above.
2841 #define _aesni_gf128mul_x_ble() \
2842 pshufd $0x13, IV, KEY; \
2845 pand GF128MUL_MASK, KEY; \
2849 * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
2850 * const u8 *src, unsigned int len, le128 *iv)
# XTS encryption: for each group of four blocks, store the current tweaks
# to the destination, double the tweak via _aesni_gf128mul_x_ble between
# blocks, XOR-in the plaintext, encrypt four-wide, then XOR the stored
# tweaks back over the ciphertext.  A partial final block is handled with
# ciphertext stealing via .Lcts_permute_table.  The paired absolute /
# RIP-relative address loads are the 32-/64-bit alternatives (preprocessor
# conditionals elided in this listing).
2852 SYM_FUNC_START(aesni_xts_encrypt)
2859 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2860 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2861 movl (FRAME_OFFSET+28)(%esp), INP # src
2862 movl (FRAME_OFFSET+32)(%esp), LEN # len
2863 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2864 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2866 movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
# Stash each tweak at the output position, advancing the tweak per block:
2877 movdqu 0x00(INP), IN
2879 movdqu IV, 0x00(OUTP)
2881 _aesni_gf128mul_x_ble()
2883 movdqu 0x10(INP), IN
2885 movdqu IV, 0x10(OUTP)
2887 _aesni_gf128mul_x_ble()
2889 movdqu 0x20(INP), IN
2891 movdqu IV, 0x20(OUTP)
2893 _aesni_gf128mul_x_ble()
2895 movdqu 0x30(INP), IN
2897 movdqu IV, 0x30(OUTP)
# After encryption, XOR the stashed tweaks back over the results:
2901 movdqu 0x00(OUTP), IN
2903 movdqu STATE1, 0x00(OUTP)
2905 movdqu 0x10(OUTP), IN
2907 movdqu STATE2, 0x10(OUTP)
2909 movdqu 0x20(OUTP), IN
2911 movdqu STATE3, 0x20(OUTP)
2913 movdqu 0x30(OUTP), IN
2915 movdqu STATE4, 0x30(OUTP)
2917 _aesni_gf128mul_x_ble()
2948 _aesni_gf128mul_x_ble()
2957 movdqu STATE, (OUTP)
2962 movdqu STATE, (OUTP)
2963 jmp .Lxts_enc_ret_iv
# Ciphertext-stealing path for a partial trailing block:
2966 movdqa STATE4, STATE
2971 lea .Lcts_permute_table, T1
2973 lea .Lcts_permute_table(%rip), T1
2975 add LEN, INP /* rewind input pointer */
2976 add $16, LEN /* # bytes in final block */
2999 movups STATE, (OUTP)
3001 SYM_FUNC_END(aesni_xts_encrypt)
3004 * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
3005 * const u8 *src, unsigned int len, le128 *iv)
# XTS decryption — mirror of aesni_xts_encrypt (tweak stash, four-wide
# decrypt, tweak XOR-back), with the extra tweak-doubling step needed to
# reorder tweaks for ciphertext stealing on a partial final block.
3007 SYM_FUNC_START(aesni_xts_decrypt)
3014 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
3015 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
3016 movl (FRAME_OFFSET+28)(%esp), INP # src
3017 movl (FRAME_OFFSET+32)(%esp), LEN # len
3018 movl (FRAME_OFFSET+36)(%esp), IVP # iv
3019 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
3021 movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
# Stash each tweak at the output position, advancing the tweak per block:
3037 movdqu 0x00(INP), IN
3039 movdqu IV, 0x00(OUTP)
3041 _aesni_gf128mul_x_ble()
3043 movdqu 0x10(INP), IN
3045 movdqu IV, 0x10(OUTP)
3047 _aesni_gf128mul_x_ble()
3049 movdqu 0x20(INP), IN
3051 movdqu IV, 0x20(OUTP)
3053 _aesni_gf128mul_x_ble()
3055 movdqu 0x30(INP), IN
3057 movdqu IV, 0x30(OUTP)
# After decryption, XOR the stashed tweaks back over the results:
3061 movdqu 0x00(OUTP), IN
3063 movdqu STATE1, 0x00(OUTP)
3065 movdqu 0x10(OUTP), IN
3067 movdqu STATE2, 0x10(OUTP)
3069 movdqu 0x20(OUTP), IN
3071 movdqu STATE3, 0x20(OUTP)
3073 movdqu 0x30(OUTP), IN
3075 movdqu STATE4, 0x30(OUTP)
3077 _aesni_gf128mul_x_ble()
3111 _aesni_gf128mul_x_ble()
3116 movdqu STATE, (OUTP)
3121 movdqu STATE, (OUTP)
3122 jmp .Lxts_dec_ret_iv
# Ciphertext-stealing path for a partial trailing block:
3126 _aesni_gf128mul_x_ble()
3133 lea .Lcts_permute_table, T1
3135 lea .Lcts_permute_table(%rip), T1
3137 add LEN, INP /* rewind input pointer */
3138 add $16, LEN /* # bytes in final block */
3161 movups STATE, (OUTP)
3163 SYM_FUNC_END(aesni_xts_decrypt)