1 ########################################################################
2 # Copyright (c) 2013, Intel Corporation
4 # This software is available to you under a choice of one of two
5 # licenses. You may choose to be licensed under the terms of the GNU
6 # General Public License (GPL) Version 2, available from the file
7 # COPYING in the main directory of this source tree, or the
8 # OpenIB.org BSD license below:
10 # Redistribution and use in source and binary forms, with or without
11 # modification, are permitted provided that the following conditions are
14 # * Redistributions of source code must retain the above copyright
15 # notice, this list of conditions and the following disclaimer.
17 # * Redistributions in binary form must reproduce the above copyright
18 # notice, this list of conditions and the following disclaimer in the
19 # documentation and/or other materials provided with the
22 # * Neither the name of the Intel Corporation nor the names of its
23 # contributors may be used to endorse or promote products derived from
24 # this software without specific prior written permission.
27 # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
28 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR
34 # PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 ########################################################################
41 ## Erdinc Ozturk <erdinc.ozturk@intel.com>
42 ## Vinodh Gopal <vinodh.gopal@intel.com>
43 ## James Guilford <james.guilford@intel.com>
44 ## Tim Chen <tim.c.chen@linux.intel.com>
47 ## This code was derived and highly optimized from the code described in paper:
48 ## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
49 ## on Intel Architecture Processors. August, 2010
50 ## The details of the implementation is explained in:
51 ## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
52 ## on Intel Architecture Processors. October, 2012.
60 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62 ## | Salt (From the SA) |
63 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64 ## | Initialization Vector |
65 ## | (This is the sequence number from IPSec header) |
66 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
68 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
73 ## AAD padded to 128 bits with 0
74 ## for example, assume AAD is a u32 vector
78 ## padded AAD in xmm register = {A1 A0 0 0}
81 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
84 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85 ## | 32-bit Sequence Number (A0) |
86 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
88 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
90 ## AAD Format with 32-bit Sequence Number
92 ## if AAD is 12 bytes:
93 ## AAD[3] = {A0, A1, A2};
94 ## padded AAD in xmm register = {A2 A1 A0 0}
97 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
100 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101 ## | 64-bit Extended Sequence Number {A1,A0} |
103 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
105 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
107 ## AAD Format with 64-bit Extended Sequence Number
111 ## from the definition of the spec, aadLen can only be 8 or 12 bytes.
112 ## The code additionally supports aadLen of length 16 bytes.
115 ## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
117 ## poly = x^128 + x^127 + x^126 + x^121 + 1
118 ## throughout the code, one tab and two tab indentations are used. one tab is
119 ## for GHASH part, two tabs is for AES part.
122 #include <linux/linkage.h>
123 #include <asm/inst.h>
125 # constants in mergeable sections, linker can reorder and merge
# 16-byte GHASH/CTR constants; each lives in its own .rodata.cst16
# mergeable section so the linker can deduplicate identical values.
126 .section .rodata.cst16.POLY, "aM", @progbits, 16
# GCM reduction constant (bit-reflected form of the field polynomial).
128 POLY: .octa 0xC2000000000000000000000000000001
130 .section .rodata.cst16.POLY2, "aM", @progbits, 16
# Doubled-up reduction constant used by the two-phase reduction variant.
132 POLY2: .octa 0xC20000000000000000000001C2000000
134 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
136 TWOONE: .octa 0x00000001000000000000000000000001
138 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
# vpshufb mask that reverses all 16 bytes (the "16Byte swap" used below).
140 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
142 .section .rodata.cst16.ONE, "aM", @progbits, 16
# Counter increment, little-endian counter layout.
144 ONE: .octa 0x00000000000000000000000000000001
146 .section .rodata.cst16.ONEf, "aM", @progbits, 16
# Counter increment, byte-swapped counter layout.
148 ONEf: .octa 0x01000000000000000000000000000000
150 # order of these constants should not change.
151 # more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
# The code below indexes relative to SHIFT_MASK (e.g. ALL_F-SHIFT_MASK(%r12)),
# which is why these three must stay adjacent and in this order.
152 .section .rodata, "a", @progbits
154 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
155 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
156 .octa 0x00000000000000000000000000000000
160 .type aad_shift_arr, @object
# 272 bytes = 17 entries x 16 bytes, matching the seventeen .octa values below.
161 .size aad_shift_arr, 272
# NOTE(review): the "aad_shift_arr:" label line itself is not visible in
# this chunk; it presumably sits immediately before the first entry -- confirm.
# vpshufb masks used to left-align a partial AAD block: a 0xff lane (high
# bit set) zeroes the corresponding output byte.
163 .octa 0xffffffffffffffffffffffffffffffff
164 .octa 0xffffffffffffffffffffffffffffff0C
165 .octa 0xffffffffffffffffffffffffffff0D0C
166 .octa 0xffffffffffffffffffffffffff0E0D0C
167 .octa 0xffffffffffffffffffffffff0F0E0D0C
168 .octa 0xffffffffffffffffffffff0C0B0A0908
169 .octa 0xffffffffffffffffffff0D0C0B0A0908
170 .octa 0xffffffffffffffffff0E0D0C0B0A0908
171 .octa 0xffffffffffffffff0F0E0D0C0B0A0908
172 .octa 0xffffffffffffff0C0B0A090807060504
173 .octa 0xffffffffffff0D0C0B0A090807060504
174 .octa 0xffffffffff0E0D0C0B0A090807060504
175 .octa 0xffffffff0F0E0D0C0B0A090807060504
176 .octa 0xffffff0C0B0A09080706050403020100
177 .octa 0xffff0D0C0B0A09080706050403020100
178 .octa 0xff0E0D0C0B0A09080706050403020100
179 .octa 0x0F0E0D0C0B0A09080706050403020100
# Byte offsets of fields inside the gcm context structure addressed via
# arg2 (used below as e.g. InLen(arg2)). Some sibling offsets (AadLen,
# AadHash, OrigIV, ...) are defined on lines elided from this chunk.
187 #define InLen (16*1)+8
188 #define PBlockEncKey 16*2
190 #define CurCount 16*4
191 #define PBlockLen 16*5
# Precomputed powers of the hash key for 8-way parallel GHASH, plus the
# Karatsuba helper values (XOR of each power's high and low halves):
193 HashKey = 16*6 # store HashKey <<1 mod poly here
194 HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
195 HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
196 HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
197 HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
198 HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
199 HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
200 HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
201 HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
202 HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
203 HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
204 HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
205 HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
206 HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
207 HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
208 HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
# Stack-passed arguments (7th and later), addressed off %r14;
# STACK_OFFSET is defined outside this chunk -- confirm upstream.
216 #define arg7 STACK_OFFSET+8*1(%r14)
217 #define arg8 STACK_OFFSET+8*2(%r14)
218 #define arg9 STACK_OFFSET+8*3(%r14)
219 #define arg10 STACK_OFFSET+8*4(%r14)
# Key-size field located past the expanded round-key schedule at arg1.
220 #define keysize 2*15*16(arg1)
# NOTE(review): the body and .endm of define_reg, and the surrounding
# save/restore macro lines, are elided from this chunk; only fragments
# are visible below. Code kept byte-identical.
230 .macro define_reg r n
241 # need to push 4 registers into stack to maintain
# 16-byte scratch slots in the local stack frame (relative to the
# 64-byte-aligned %rsp established below):
244 TMP1 = 16*0 # Temporary storage for AAD
245 TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
246 TMP3 = 16*2 # Temporary storage for AES State 3
247 TMP4 = 16*3 # Temporary storage for AES State 4
248 TMP5 = 16*4 # Temporary storage for AES State 5
249 TMP6 = 16*5 # Temporary storage for AES State 6
250 TMP7 = 16*6 # Temporary storage for AES State 7
251 TMP8 = 16*7 # Temporary storage for AES State 8
253 VARIABLE_OFFSET = 16*8
255 ################################
257 ################################
260 #the number of pushes must equal STACK_OFFSET
# Reserve TMP1..TMP8 and align so aligned vmovdqa to TMPn(%rsp) is legal.
270 sub $VARIABLE_OFFSET, %rsp
271 and $~63, %rsp # align rsp to 64 bytes
283 # Encryption of a single block
# AES-encrypts \XMM0 in place using the expanded key schedule at arg1.
# \REP is the number of middle aesenc rounds (presumably 9/11/13 for
# AES-128/192/256 -- confirm against callers outside this chunk).
# NOTE(review): the i=1 / .rep \REP / i=i+1 / .endr / .endm lines of the
# round loop are elided from this chunk; code kept byte-identical.
284 .macro ENCRYPT_SINGLE_BLOCK REP XMM0
285 vpxor (arg1), \XMM0, \XMM0
289 vaesenc 16*i(arg1), \XMM0, \XMM0
293 vaesenclast 16*i(arg1), \XMM0, \XMM0
296 # combined for GCM encrypt and decrypt functions
297 # clobbering all xmm registers
298 # clobbering r10, r11, r12, r13, r14, r15
# Main update path: finish any pending partial block, run 0-7 initial
# blocks to reach an 8-block boundary, loop 8 blocks at a time, then
# handle the final sub-16-byte tail. \ENC_DEC selects encrypt/decrypt.
# NOTE(review): many interleaved lines (setreg, compares, .if/.else
# brackets, .endm) are elided from this chunk; code kept byte-identical.
299 .macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
300 vmovdqu AadHash(arg2), %xmm8
301 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
302 add arg5, InLen(arg2)
304 # initialize the data pointer offset as zero
# Drain any partial block left over from a previous update call.
307 PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
310 mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
311 and $-16, %r13 # r13 = r13 - (r13 mod 16)
# Dispatch on (full blocks mod 8) to reach an 8-block boundary.
316 jz _initial_num_blocks_is_0\@
319 je _initial_num_blocks_is_7\@
321 je _initial_num_blocks_is_6\@
323 je _initial_num_blocks_is_5\@
325 je _initial_num_blocks_is_4\@
327 je _initial_num_blocks_is_3\@
329 je _initial_num_blocks_is_2\@
331 jmp _initial_num_blocks_is_1\@
333 _initial_num_blocks_is_7\@:
334 \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
336 jmp _initial_blocks_encrypted\@
338 _initial_num_blocks_is_6\@:
339 \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
341 jmp _initial_blocks_encrypted\@
343 _initial_num_blocks_is_5\@:
344 \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
346 jmp _initial_blocks_encrypted\@
348 _initial_num_blocks_is_4\@:
349 \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
351 jmp _initial_blocks_encrypted\@
353 _initial_num_blocks_is_3\@:
354 \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
356 jmp _initial_blocks_encrypted\@
358 _initial_num_blocks_is_2\@:
359 \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
361 jmp _initial_blocks_encrypted\@
363 _initial_num_blocks_is_1\@:
364 \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
366 jmp _initial_blocks_encrypted\@
368 _initial_num_blocks_is_0\@:
369 \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
372 _initial_blocks_encrypted\@:
374 je _zero_cipher_left\@
377 je _eight_cipher_left\@
# Main 8-blocks-at-a-time loop; xmm9 holds the running counter.
384 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
394 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
397 jne _encrypt_by_8_new\@
399 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
400 jmp _eight_cipher_left\@
403 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
405 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
406 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
409 jne _encrypt_by_8_new\@
411 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
416 _eight_cipher_left\@:
417 \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
# Persist the running hash and counter into the context for the next call.
421 vmovdqu %xmm14, AadHash(arg2)
422 vmovdqu %xmm9, CurCount(arg2)
426 and $15, %r13 # r13 = (arg5 mod 16)
428 je _multiple_of_16_bytes\@
430 # handle the last <16 Byte block separately
432 mov %r13, PBlockLen(arg2)
434 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
435 vmovdqu %xmm9, CurCount(arg2)
436 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
438 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
439 vmovdqu %xmm9, PBlockEncKey(arg2)
# If fewer than 16 bytes remain in the input buffer, read byte-by-byte
# to avoid reading past the end; otherwise load 16 and shift.
442 jge _large_enough_update\@
444 lea (arg4,%r11,1), %r10
447 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
449 lea SHIFT_MASK+16(%rip), %r12
450 sub %r13, %r12 # adjust the shuffle mask pointer to be
451 # able to shift 16-r13 bytes (r13 is the
452 # number of bytes in plaintext mod 16)
454 jmp _final_ghash_mul\@
456 _large_enough_update\@:
460 # receive the last <16 Byte block
461 vmovdqu (arg4, %r11, 1), %xmm1
466 lea SHIFT_MASK+16(%rip), %r12
467 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
468 # (r13 is the number of bytes in plaintext mod 16)
470 # get the appropriate shuffle mask
471 vmovdqu (%r12), %xmm2
472 # shift right 16-r13 bytes
473 vpshufb %xmm2, %xmm1, %xmm1
# Decrypt path: GHASH absorbs the (masked) ciphertext that was read.
478 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
479 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
480 # mask out top 16-r13 bytes of xmm9
481 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
482 vpand %xmm1, %xmm2, %xmm2
483 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
484 vpxor %xmm2, %xmm14, %xmm14
486 vmovdqu %xmm14, AadHash(arg2)
# Encrypt path: GHASH absorbs the freshly produced ciphertext.
488 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
489 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
490 # mask out top 16-r13 bytes of xmm9
491 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
492 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
493 vpxor %xmm9, %xmm14, %xmm14
495 vmovdqu %xmm14, AadHash(arg2)
496 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
500 #############################
# Write the final partial block out 8 bytes, then 1 byte, at a time.
504 jle _less_than_8_bytes_left\@
506 mov %rax, (arg3 , %r11)
508 vpsrldq $8, %xmm9, %xmm9
512 _less_than_8_bytes_left\@:
513 movb %al, (arg3 , %r11)
517 jne _less_than_8_bytes_left\@
518 #############################
520 _multiple_of_16_bytes\@:
524 # GCM_COMPLETE Finishes update of tag of last partial block
525 # Output: Authorization Tag (AUTH_TAG)
526 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
# Final tag = GHASH(..., len(A)||len(C)) XOR E(K, Y0), truncated to
# AUTH_TAG_LEN bytes. NOTE(review): the branches that select the 8/12/16
# byte store paths are elided from this chunk; code kept byte-identical.
527 .macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
528 vmovdqu AadHash(arg2), %xmm14
529 vmovdqu HashKey(arg2), %xmm13
531 mov PBlockLen(arg2), %r12
535 #GHASH computation for the last <16 Byte block
536 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
# Build the length block len(A) || len(C), both in bits.
539 mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes)
540 shl $3, %r12 # convert into number of bits
541 vmovd %r12d, %xmm15 # len(A) in xmm15
543 mov InLen(arg2), %r12
544 shl $3, %r12 # len(C) in bits (*128)
546 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
547 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
549 vpxor %xmm15, %xmm14, %xmm14
550 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
551 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
# Encrypt the original counter block and fold in the hash to get the tag.
553 vmovdqu OrigIV(arg2), %xmm9
555 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0)
557 vpxor %xmm14, %xmm9, %xmm9
562 mov \AUTH_TAG, %r10 # r10 = authTag
563 mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len
# Partial-tag stores (8- and 12-byte tags) shift the tag down in pieces.
576 vpsrldq $8, %xmm9, %xmm9
584 vpsrldq $4, %xmm9, %xmm9
601 vmovdqu %xmm9, (%r10)
# Hashes the AAD into \T8/\T7 and stores the result in the context's
# AadHash field. NOTE(review): the block loop (_get_AAD_blocks), the
# 4B/8B tail reads, and .endm are elided from this chunk; code kept
# byte-identical.
606 .macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
608 mov \AAD, %r10 # r10 = AAD
609 mov \AADLEN, %r12 # r12 = aadLen
# Per full 16-byte AAD block: byte-swap then multiply into the hash.
620 vpshufb SHUF_MASK(%rip), \T7, \T7
622 \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
627 jge _get_AAD_blocks\@
634 /* read the last <16B of AAD. since we have at least 4B of
635 data right after the AAD (the ICV, and maybe some CT), we can
636 read 4B/8B blocks safely, and then get rid of the extra stuff */
654 vpslldq $12, \T1, \T1
658 /* finalize: shift out the extra bytes we read, and align
659 left. since pslldq can only shift by an immediate, we use
660 vpshufb and an array of shuffle masks */
663 vmovdqu aad_shift_arr(%r11), \T1
664 vpshufb \T1, \T7, \T7
665 _get_AAD_rest_final\@:
666 vpshufb SHUF_MASK(%rip), \T7, \T7
668 \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
671 vmovdqu \T7, AadHash(arg2)
# One-time setup for a GCM operation: zero the bookkeeping fields,
# record the IV, derive HashKey<<1 mod poly from E(K,0) at arg4, hash
# the AAD, and precompute the HashKey powers.
# NOTE(review): interleaved lines (xor %r11, mov arg6 to AadLen, the
# vmovdqa copies feeding xmm1/xmm2, .endm) are elided from this chunk;
# code kept byte-identical.
674 .macro INIT GHASH_MUL PRECOMPUTE
676 mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
678 mov %r11, InLen(arg2) # ctx_data.in_length = 0
680 mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
681 mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
684 movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
686 vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
687 movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
689 vmovdqu (arg4), %xmm6 # xmm6 = HashKey
691 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
692 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
# Multiply HashKey by x: shift left by 1 across the 128 bits, then
# conditionally xor the reduction constant if the top bit was set.
694 vpsllq $1, %xmm6, %xmm6
695 vpsrlq $63, %xmm2, %xmm2
697 vpslldq $8, %xmm2, %xmm2
698 vpsrldq $8, %xmm1, %xmm1
699 vpor %xmm2, %xmm6, %xmm6
701 vpshufd $0b00100100, %xmm1, %xmm2
702 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
703 vpand POLY(%rip), %xmm2, %xmm2
704 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
705 #######################################################################
706 vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
708 CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
710 \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
714 # Reads DLEN bytes starting at DPTR and stores in XMMDst
715 # where 0 < DLEN < 16
716 # Clobbers %rax, DLEN
# Reads byte-by-byte so it never touches memory past DPTR+DLEN.
# NOTE(review): the compare/loop-setup lines between the instructions
# below are elided from this chunk; code kept byte-identical.
717 .macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
718 vpxor \XMMDst, \XMMDst, \XMMDst
# DLEN >= 8: first 8 bytes go into the low qword ...
723 vpinsrq $0, %rax, \XMMDst, \XMMDst
725 jz _done_read_partial_block_\@
729 mov 7(\DPTR, \DLEN, 1), %al
731 jnz _read_next_byte_\@
# ... and the remaining DLEN-8 bytes into the high qword.
732 vpinsrq $1, %rax, \XMMDst, \XMMDst
733 jmp _done_read_partial_block_\@
736 _read_next_byte_lt8_\@:
738 mov -1(\DPTR, \DLEN, 1), %al
740 jnz _read_next_byte_lt8_\@
741 vpinsrq $0, %rax, \XMMDst, \XMMDst
742 _done_read_partial_block_\@:
745 # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
746 # between update calls.
747 # Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
748 # Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
749 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
# NOTE(review): the continuation line carrying the AAD_HASH parameter,
# plus several interleaved compare/branch lines, are elided from this
# chunk; code kept byte-identical.
750 .macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
752 mov PBlockLen(arg2), %r13
754 je _partial_block_done_\@ # Leave Macro if no partial blocks
755 # Read in input data without over reading
756 cmp $16, \PLAIN_CYPH_LEN
757 jl _fewer_than_16_bytes_\@
758 vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
761 _fewer_than_16_bytes_\@:
762 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
763 mov \PLAIN_CYPH_LEN, %r12
764 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
766 mov PBlockLen(arg2), %r13
768 _data_read_\@: # Finished reading in data
770 vmovdqu PBlockEncKey(arg2), %xmm9
771 vmovdqu HashKey(arg2), %xmm13
773 lea SHIFT_MASK(%rip), %r12
775 # adjust the shuffle mask pointer to be able to shift r13 bytes
776 # (16-r13 is the number of bytes in plaintext mod 16)
778 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
779 vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes
# Decrypt path:
783 pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
785 mov \PLAIN_CYPH_LEN, %r10
787 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
789 # Determine if partial block is not being filled and
790 # shift mask accordingly
791 jge _no_extra_mask_1_\@
795 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
796 # get the appropriate mask to mask out bottom r13 bytes of xmm9
797 vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9
799 vpand %xmm1, %xmm3, %xmm3
800 vmovdqa SHUF_MASK(%rip), %xmm10
801 vpshufb %xmm10, %xmm3, %xmm3
802 vpshufb %xmm2, %xmm3, %xmm3
803 vpxor %xmm3, \AAD_HASH, \AAD_HASH
806 jl _partial_incomplete_1_\@
808 # GHASH computation for the last <16 Byte block
809 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
812 mov %rax, PBlockLen(arg2)
814 _partial_incomplete_1_\@:
815 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
817 vmovdqu \AAD_HASH, AadHash(arg2)
# Encrypt path (mirrors the decrypt path above):
819 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
821 mov \PLAIN_CYPH_LEN, %r10
823 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
825 # Determine if partial block is not being filled and
826 # shift mask accordingly
827 jge _no_extra_mask_2_\@
831 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
832 # get the appropriate mask to mask out bottom r13 bytes of xmm9
833 vpand %xmm1, %xmm9, %xmm9
835 vmovdqa SHUF_MASK(%rip), %xmm1
836 vpshufb %xmm1, %xmm9, %xmm9
837 vpshufb %xmm2, %xmm9, %xmm9
838 vpxor %xmm9, \AAD_HASH, \AAD_HASH
841 jl _partial_incomplete_2_\@
843 # GHASH computation for the last <16 Byte block
844 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
847 mov %rax, PBlockLen(arg2)
849 _partial_incomplete_2_\@:
850 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
852 vmovdqu \AAD_HASH, AadHash(arg2)
854 vmovdqa SHUF_MASK(%rip), %xmm10
855 # shuffle xmm9 back to output as ciphertext
856 vpshufb %xmm10, %xmm9, %xmm9
857 vpshufb %xmm2, %xmm9, %xmm9
859 # output encrypted Bytes
864 # Set r13 to be the number of bytes to write out
868 mov \PLAIN_CYPH_LEN, %r13
# Write out 8 bytes at a time, then the remaining single bytes.
873 jle _less_than_8_bytes_left_\@
875 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
880 _less_than_8_bytes_left_\@:
881 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
885 jne _less_than_8_bytes_left_\@
886 _partial_block_done_\@:
887 .endm # PARTIAL_BLOCK
889 ###############################################################################
890 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
891 # Input: A and B (128-bits each, bit-reflected)
892 # Output: C = A*B*x mod poly, (i.e. >>1 )
893 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
894 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
895 ###############################################################################
# Karatsuba carry-less multiply (3 vpclmulqdq) followed by the classic
# two-phase software reduction. NOTE(review): a few interleaved xor
# lines and .endm are elided from this chunk; code kept byte-identical.
896 .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
898 vpshufd $0b01001110, \GH, \T2
899 vpshufd $0b01001110, \HK, \T3
900 vpxor \GH , \T2, \T2 # T2 = (a1+a0)
901 vpxor \HK , \T3, \T3 # T3 = (b1+b0)
903 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
904 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
905 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
907 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
909 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
910 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
912 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
914 #first phase of the reduction
915 vpslld $31, \GH, \T2 # packed left shift << 31
916 vpslld $30, \GH, \T3 # packed left shift << 30
917 vpslld $25, \GH, \T4 # packed left shift << 25
919 vpxor \T3, \T2, \T2 # xor the shifted versions
922 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
924 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
925 vpxor \T2, \GH, \GH # first phase of the reduction complete
927 #second phase of the reduction
929 vpsrld $1,\GH, \T2 # packed right shift >> 1
930 vpsrld $2,\GH, \T3 # packed right shift >> 2
931 vpsrld $7,\GH, \T4 # packed right shift >> 7
932 vpxor \T3, \T2, \T2 # xor the shifted versions
937 vpxor \T1, \GH, \GH # the result is in GH
# Precomputes HashKey^2..^8 (each <<1 mod poly) and the Karatsuba helper
# values HashKey_i_k, storing them in the context at arg2.
# NOTE(review): the vpxor lines that fold each vpshufd result, plus
# .endm, are elided from this chunk; code kept byte-identical.
942 .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
944 # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
947 vpshufd $0b01001110, \T5, \T1
949 vmovdqu \T1, HashKey_k(arg2)
951 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
952 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
953 vpshufd $0b01001110, \T5, \T1
955 vmovdqu \T1, HashKey_2_k(arg2)
957 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
958 vmovdqu \T5, HashKey_3(arg2)
959 vpshufd $0b01001110, \T5, \T1
961 vmovdqu \T1, HashKey_3_k(arg2)
963 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
964 vmovdqu \T5, HashKey_4(arg2)
965 vpshufd $0b01001110, \T5, \T1
967 vmovdqu \T1, HashKey_4_k(arg2)
969 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
970 vmovdqu \T5, HashKey_5(arg2)
971 vpshufd $0b01001110, \T5, \T1
973 vmovdqu \T1, HashKey_5_k(arg2)
975 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
976 vmovdqu \T5, HashKey_6(arg2)
977 vpshufd $0b01001110, \T5, \T1
979 vmovdqu \T1, HashKey_6_k(arg2)
981 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
982 vmovdqu \T5, HashKey_7(arg2)
983 vpshufd $0b01001110, \T5, \T1
985 vmovdqu \T1, HashKey_7_k(arg2)
987 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
988 vmovdqu \T5, HashKey_8(arg2)
989 vpshufd $0b01001110, \T5, \T1
991 vmovdqu \T1, HashKey_8_k(arg2)
995 ## if a = number of total plaintext bytes
997 ## num_initial_blocks = b mod 4
998 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
999 ## r10, r11, r12, rax are clobbered
1000 ## arg1, arg3, arg4, r14 are used as a pointer only, not modified
# NOTE(review): the setreg/.endr lines that drive the reg_i/reg_j macro
# register aliases, and various compare/branch lines, are elided from
# this chunk; code kept byte-identical.
1002 .macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
1003 i = (8-\num_initial_blocks)
1005 vmovdqu AadHash(arg2), reg_i
1007 # start AES for num_initial_blocks blocks
1008 vmovdqu CurCount(arg2), \CTR
# Generate num_initial_blocks counter blocks ...
1010 i = (9-\num_initial_blocks)
1012 .rep \num_initial_blocks
1013 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1015 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
# ... whiten with round key 0 ...
1020 vmovdqa (arg1), \T_key
1021 i = (9-\num_initial_blocks)
1023 .rep \num_initial_blocks
1024 vpxor \T_key, reg_i, reg_i
# ... run the middle AES rounds ...
1032 vmovdqa 16*j(arg1), \T_key
1033 i = (9-\num_initial_blocks)
1035 .rep \num_initial_blocks
1036 vaesenc \T_key, reg_i, reg_i
# ... and the final round.
1045 vmovdqa 16*j(arg1), \T_key
1046 i = (9-\num_initial_blocks)
1048 .rep \num_initial_blocks
1049 vaesenclast \T_key, reg_i, reg_i
# XOR keystream with input and write out the result.
1054 i = (9-\num_initial_blocks)
1056 .rep \num_initial_blocks
1057 vmovdqu (arg4, %r11), \T1
1058 vpxor \T1, reg_i, reg_i
1059 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks
1064 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
# Chain the ciphertext blocks through GHASH serially.
1070 i = (8-\num_initial_blocks)
1071 j = (9-\num_initial_blocks)
1074 .rep \num_initial_blocks
1075 vpxor reg_i, reg_j, reg_j
1076 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1081 # XMM8 has the combined result here
1083 vmovdqa \XMM8, TMP1(%rsp)
1087 jl _initial_blocks_done\@ # no need for precomputed constants
1089 ###############################################################################
1090 # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
# Prepare the next 8 counter blocks in XMM1..XMM8 for the main loop.
1091 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1093 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1095 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1097 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1099 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1101 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1103 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1105 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1107 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1109 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1111 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1113 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1115 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1117 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1119 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1121 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1123 vmovdqa (arg1), \T_key
1124 vpxor \T_key, \XMM1, \XMM1
1125 vpxor \T_key, \XMM2, \XMM2
1126 vpxor \T_key, \XMM3, \XMM3
1127 vpxor \T_key, \XMM4, \XMM4
1128 vpxor \T_key, \XMM5, \XMM5
1129 vpxor \T_key, \XMM6, \XMM6
1130 vpxor \T_key, \XMM7, \XMM7
1131 vpxor \T_key, \XMM8, \XMM8
1135 .rep \REP # do REP rounds
1136 vmovdqa 16*i(arg1), \T_key
1137 vaesenc \T_key, \XMM1, \XMM1
1138 vaesenc \T_key, \XMM2, \XMM2
1139 vaesenc \T_key, \XMM3, \XMM3
1140 vaesenc \T_key, \XMM4, \XMM4
1141 vaesenc \T_key, \XMM5, \XMM5
1142 vaesenc \T_key, \XMM6, \XMM6
1143 vaesenc \T_key, \XMM7, \XMM7
1144 vaesenc \T_key, \XMM8, \XMM8
1149 vmovdqa 16*i(arg1), \T_key
1150 vaesenclast \T_key, \XMM1, \XMM1
1151 vaesenclast \T_key, \XMM2, \XMM2
1152 vaesenclast \T_key, \XMM3, \XMM3
1153 vaesenclast \T_key, \XMM4, \XMM4
1154 vaesenclast \T_key, \XMM5, \XMM5
1155 vaesenclast \T_key, \XMM6, \XMM6
1156 vaesenclast \T_key, \XMM7, \XMM7
1157 vaesenclast \T_key, \XMM8, \XMM8
# XOR keystream with the 8 input blocks and store the 8 output blocks.
1159 vmovdqu (arg4, %r11), \T1
1160 vpxor \T1, \XMM1, \XMM1
1161 vmovdqu \XMM1, (arg3 , %r11)
1166 vmovdqu 16*1(arg4, %r11), \T1
1167 vpxor \T1, \XMM2, \XMM2
1168 vmovdqu \XMM2, 16*1(arg3 , %r11)
1173 vmovdqu 16*2(arg4, %r11), \T1
1174 vpxor \T1, \XMM3, \XMM3
1175 vmovdqu \XMM3, 16*2(arg3 , %r11)
1180 vmovdqu 16*3(arg4, %r11), \T1
1181 vpxor \T1, \XMM4, \XMM4
1182 vmovdqu \XMM4, 16*3(arg3 , %r11)
1187 vmovdqu 16*4(arg4, %r11), \T1
1188 vpxor \T1, \XMM5, \XMM5
1189 vmovdqu \XMM5, 16*4(arg3 , %r11)
1194 vmovdqu 16*5(arg4, %r11), \T1
1195 vpxor \T1, \XMM6, \XMM6
1196 vmovdqu \XMM6, 16*5(arg3 , %r11)
1201 vmovdqu 16*6(arg4, %r11), \T1
1202 vpxor \T1, \XMM7, \XMM7
1203 vmovdqu \XMM7, 16*6(arg3 , %r11)
1208 vmovdqu 16*7(arg4, %r11), \T1
1209 vpxor \T1, \XMM8, \XMM8
1210 vmovdqu \XMM8, 16*7(arg3 , %r11)
1217 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1218 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
1219 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1220 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1221 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1222 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1223 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1224 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1225 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1227 ###############################################################################
1229 _initial_blocks_done\@:
1233 # encrypt 8 blocks at a time
1234 # ghash the 8 previously encrypted ciphertext blocks
1235 # arg1, arg3, arg4 are used as pointers only, not modified
1236 # r11 is the data offset value
1237 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1240 vmovdqa \XMM2, TMP2(%rsp)
1241 vmovdqa \XMM3, TMP3(%rsp)
1242 vmovdqa \XMM4, TMP4(%rsp)
1243 vmovdqa \XMM5, TMP5(%rsp)
1244 vmovdqa \XMM6, TMP6(%rsp)
1245 vmovdqa \XMM7, TMP7(%rsp)
1246 vmovdqa \XMM8, TMP8(%rsp)
1248 .if \loop_idx == in_order
1249 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
1250 vpaddd ONE(%rip), \XMM1, \XMM2
1251 vpaddd ONE(%rip), \XMM2, \XMM3
1252 vpaddd ONE(%rip), \XMM3, \XMM4
1253 vpaddd ONE(%rip), \XMM4, \XMM5
1254 vpaddd ONE(%rip), \XMM5, \XMM6
1255 vpaddd ONE(%rip), \XMM6, \XMM7
1256 vpaddd ONE(%rip), \XMM7, \XMM8
1259 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1260 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1261 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1262 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1263 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1264 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1265 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1266 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1268 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
1269 vpaddd ONEf(%rip), \XMM1, \XMM2
1270 vpaddd ONEf(%rip), \XMM2, \XMM3
1271 vpaddd ONEf(%rip), \XMM3, \XMM4
1272 vpaddd ONEf(%rip), \XMM4, \XMM5
1273 vpaddd ONEf(%rip), \XMM5, \XMM6
1274 vpaddd ONEf(%rip), \XMM6, \XMM7
1275 vpaddd ONEf(%rip), \XMM7, \XMM8
1280 #######################################################################
1283 vpxor \T1, \XMM1, \XMM1
1284 vpxor \T1, \XMM2, \XMM2
1285 vpxor \T1, \XMM3, \XMM3
1286 vpxor \T1, \XMM4, \XMM4
1287 vpxor \T1, \XMM5, \XMM5
1288 vpxor \T1, \XMM6, \XMM6
1289 vpxor \T1, \XMM7, \XMM7
1290 vpxor \T1, \XMM8, \XMM8
1292 #######################################################################
1298 vmovdqu 16*1(arg1), \T1
1299 vaesenc \T1, \XMM1, \XMM1
1300 vaesenc \T1, \XMM2, \XMM2
1301 vaesenc \T1, \XMM3, \XMM3
1302 vaesenc \T1, \XMM4, \XMM4
1303 vaesenc \T1, \XMM5, \XMM5
1304 vaesenc \T1, \XMM6, \XMM6
1305 vaesenc \T1, \XMM7, \XMM7
1306 vaesenc \T1, \XMM8, \XMM8
1308 vmovdqu 16*2(arg1), \T1
1309 vaesenc \T1, \XMM1, \XMM1
1310 vaesenc \T1, \XMM2, \XMM2
1311 vaesenc \T1, \XMM3, \XMM3
1312 vaesenc \T1, \XMM4, \XMM4
1313 vaesenc \T1, \XMM5, \XMM5
1314 vaesenc \T1, \XMM6, \XMM6
1315 vaesenc \T1, \XMM7, \XMM7
1316 vaesenc \T1, \XMM8, \XMM8
1319 #######################################################################
1321 vmovdqu HashKey_8(arg2), \T5
1322 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
1323 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
1325 vpshufd $0b01001110, \T2, \T6
1328 vmovdqu HashKey_8_k(arg2), \T5
1329 vpclmulqdq $0x00, \T5, \T6, \T6
1331 vmovdqu 16*3(arg1), \T1
1332 vaesenc \T1, \XMM1, \XMM1
1333 vaesenc \T1, \XMM2, \XMM2
1334 vaesenc \T1, \XMM3, \XMM3
1335 vaesenc \T1, \XMM4, \XMM4
1336 vaesenc \T1, \XMM5, \XMM5
1337 vaesenc \T1, \XMM6, \XMM6
1338 vaesenc \T1, \XMM7, \XMM7
1339 vaesenc \T1, \XMM8, \XMM8
1341 vmovdqa TMP2(%rsp), \T1
1342 vmovdqu HashKey_7(arg2), \T5
1343 vpclmulqdq $0x11, \T5, \T1, \T3
1345 vpclmulqdq $0x00, \T5, \T1, \T3
1348 vpshufd $0b01001110, \T1, \T3
1350 vmovdqu HashKey_7_k(arg2), \T5
1351 vpclmulqdq $0x10, \T5, \T3, \T3
1354 vmovdqu 16*4(arg1), \T1
1355 vaesenc \T1, \XMM1, \XMM1
1356 vaesenc \T1, \XMM2, \XMM2
1357 vaesenc \T1, \XMM3, \XMM3
1358 vaesenc \T1, \XMM4, \XMM4
1359 vaesenc \T1, \XMM5, \XMM5
1360 vaesenc \T1, \XMM6, \XMM6
1361 vaesenc \T1, \XMM7, \XMM7
1362 vaesenc \T1, \XMM8, \XMM8
1364 #######################################################################
1366 vmovdqa TMP3(%rsp), \T1
1367 vmovdqu HashKey_6(arg2), \T5
1368 vpclmulqdq $0x11, \T5, \T1, \T3
1370 vpclmulqdq $0x00, \T5, \T1, \T3
1373 vpshufd $0b01001110, \T1, \T3
1375 vmovdqu HashKey_6_k(arg2), \T5
1376 vpclmulqdq $0x10, \T5, \T3, \T3
1379 vmovdqu 16*5(arg1), \T1
1380 vaesenc \T1, \XMM1, \XMM1
1381 vaesenc \T1, \XMM2, \XMM2
1382 vaesenc \T1, \XMM3, \XMM3
1383 vaesenc \T1, \XMM4, \XMM4
1384 vaesenc \T1, \XMM5, \XMM5
1385 vaesenc \T1, \XMM6, \XMM6
1386 vaesenc \T1, \XMM7, \XMM7
1387 vaesenc \T1, \XMM8, \XMM8
1389 vmovdqa TMP4(%rsp), \T1
1390 vmovdqu HashKey_5(arg2), \T5
1391 vpclmulqdq $0x11, \T5, \T1, \T3
1393 vpclmulqdq $0x00, \T5, \T1, \T3
1396 vpshufd $0b01001110, \T1, \T3
1398 vmovdqu HashKey_5_k(arg2), \T5
1399 vpclmulqdq $0x10, \T5, \T3, \T3
1402 vmovdqu 16*6(arg1), \T1
1403 vaesenc \T1, \XMM1, \XMM1
1404 vaesenc \T1, \XMM2, \XMM2
1405 vaesenc \T1, \XMM3, \XMM3
1406 vaesenc \T1, \XMM4, \XMM4
1407 vaesenc \T1, \XMM5, \XMM5
1408 vaesenc \T1, \XMM6, \XMM6
1409 vaesenc \T1, \XMM7, \XMM7
1410 vaesenc \T1, \XMM8, \XMM8
1413 vmovdqa TMP5(%rsp), \T1
1414 vmovdqu HashKey_4(arg2), \T5
1415 vpclmulqdq $0x11, \T5, \T1, \T3
1417 vpclmulqdq $0x00, \T5, \T1, \T3
1420 vpshufd $0b01001110, \T1, \T3
1422 vmovdqu HashKey_4_k(arg2), \T5
1423 vpclmulqdq $0x10, \T5, \T3, \T3
1426 vmovdqu 16*7(arg1), \T1
1427 vaesenc \T1, \XMM1, \XMM1
1428 vaesenc \T1, \XMM2, \XMM2
1429 vaesenc \T1, \XMM3, \XMM3
1430 vaesenc \T1, \XMM4, \XMM4
1431 vaesenc \T1, \XMM5, \XMM5
1432 vaesenc \T1, \XMM6, \XMM6
1433 vaesenc \T1, \XMM7, \XMM7
1434 vaesenc \T1, \XMM8, \XMM8
1436 vmovdqa TMP6(%rsp), \T1
1437 vmovdqu HashKey_3(arg2), \T5
1438 vpclmulqdq $0x11, \T5, \T1, \T3
1440 vpclmulqdq $0x00, \T5, \T1, \T3
1443 vpshufd $0b01001110, \T1, \T3
1445 vmovdqu HashKey_3_k(arg2), \T5
1446 vpclmulqdq $0x10, \T5, \T3, \T3
1450 vmovdqu 16*8(arg1), \T1
1451 vaesenc \T1, \XMM1, \XMM1
1452 vaesenc \T1, \XMM2, \XMM2
1453 vaesenc \T1, \XMM3, \XMM3
1454 vaesenc \T1, \XMM4, \XMM4
1455 vaesenc \T1, \XMM5, \XMM5
1456 vaesenc \T1, \XMM6, \XMM6
1457 vaesenc \T1, \XMM7, \XMM7
1458 vaesenc \T1, \XMM8, \XMM8
1460 vmovdqa TMP7(%rsp), \T1
1461 vmovdqu HashKey_2(arg2), \T5
1462 vpclmulqdq $0x11, \T5, \T1, \T3
1464 vpclmulqdq $0x00, \T5, \T1, \T3
1467 vpshufd $0b01001110, \T1, \T3
1469 vmovdqu HashKey_2_k(arg2), \T5
1470 vpclmulqdq $0x10, \T5, \T3, \T3
1473 #######################################################################
1475 vmovdqu 16*9(arg1), \T5
1476 vaesenc \T5, \XMM1, \XMM1
1477 vaesenc \T5, \XMM2, \XMM2
1478 vaesenc \T5, \XMM3, \XMM3
1479 vaesenc \T5, \XMM4, \XMM4
1480 vaesenc \T5, \XMM5, \XMM5
1481 vaesenc \T5, \XMM6, \XMM6
1482 vaesenc \T5, \XMM7, \XMM7
1483 vaesenc \T5, \XMM8, \XMM8
1485 vmovdqa TMP8(%rsp), \T1
1486 vmovdqu HashKey(arg2), \T5
1487 vpclmulqdq $0x11, \T5, \T1, \T3
1489 vpclmulqdq $0x00, \T5, \T1, \T3
1492 vpshufd $0b01001110, \T1, \T3
1494 vmovdqu HashKey_k(arg2), \T5
1495 vpclmulqdq $0x10, \T5, \T3, \T3
1501 vmovdqu 16*10(arg1), \T5
1507 vaesenc \T5, \XMM1, \XMM1
1508 vaesenc \T5, \XMM2, \XMM2
1509 vaesenc \T5, \XMM3, \XMM3
1510 vaesenc \T5, \XMM4, \XMM4
1511 vaesenc \T5, \XMM5, \XMM5
1512 vaesenc \T5, \XMM6, \XMM6
1513 vaesenc \T5, \XMM7, \XMM7
1514 vaesenc \T5, \XMM8, \XMM8
1516 vmovdqu 16*i(arg1), \T5
1525 vpxor 16*i(arg4, %r11), \T5, \T2
1527 vaesenclast \T2, reg_j, reg_j
1529 vaesenclast \T2, reg_j, \T3
1530 vmovdqu 16*i(arg4, %r11), reg_j
1531 vmovdqu \T3, 16*i(arg3, %r11)
1537 #######################################################################
1540 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
1541 vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs
1543 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
1547 #######################################################################
1548 #first phase of the reduction
1549 #######################################################################
1550 vpslld $31, \T7, \T2 # packed right shifting << 31
1551 vpslld $30, \T7, \T3 # packed right shifting shift << 30
1552 vpslld $25, \T7, \T4 # packed right shifting shift << 25
1554 vpxor \T3, \T2, \T2 # xor the shifted versions
1557 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1559 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1560 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1561 #######################################################################
1563 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
1564 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
1565 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
1566 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
1567 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
1568 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
1569 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
1570 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
1573 #######################################################################
1574 #second phase of the reduction
1575 vpsrld $1, \T7, \T2 # packed left shifting >> 1
1576 vpsrld $2, \T7, \T3 # packed left shifting >> 2
1577 vpsrld $7, \T7, \T4 # packed left shifting >> 7
1578 vpxor \T3, \T2, \T2 # xor the shifted versions
1583 vpxor \T7, \T6, \T6 # the result is in T6
1584 #######################################################################
1586 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1587 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1588 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1589 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1590 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1591 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1592 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1593 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1596 vpxor \T6, \XMM1, \XMM1
1603 # GHASH the last 8 ciphertext blocks.
1604 .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1609 vpshufd $0b01001110, \XMM1, \T2
1610 vpxor \XMM1, \T2, \T2
1611 vmovdqu HashKey_8(arg2), \T5
1612 vpclmulqdq $0x11, \T5, \XMM1, \T6
1613 vpclmulqdq $0x00, \T5, \XMM1, \T7
1615 vmovdqu HashKey_8_k(arg2), \T3
1616 vpclmulqdq $0x00, \T3, \T2, \XMM1
1618 ######################
1620 vpshufd $0b01001110, \XMM2, \T2
1621 vpxor \XMM2, \T2, \T2
1622 vmovdqu HashKey_7(arg2), \T5
1623 vpclmulqdq $0x11, \T5, \XMM2, \T4
1626 vpclmulqdq $0x00, \T5, \XMM2, \T4
1629 vmovdqu HashKey_7_k(arg2), \T3
1630 vpclmulqdq $0x00, \T3, \T2, \T2
1631 vpxor \T2, \XMM1, \XMM1
1633 ######################
1635 vpshufd $0b01001110, \XMM3, \T2
1636 vpxor \XMM3, \T2, \T2
1637 vmovdqu HashKey_6(arg2), \T5
1638 vpclmulqdq $0x11, \T5, \XMM3, \T4
1641 vpclmulqdq $0x00, \T5, \XMM3, \T4
1644 vmovdqu HashKey_6_k(arg2), \T3
1645 vpclmulqdq $0x00, \T3, \T2, \T2
1646 vpxor \T2, \XMM1, \XMM1
1648 ######################
1650 vpshufd $0b01001110, \XMM4, \T2
1651 vpxor \XMM4, \T2, \T2
1652 vmovdqu HashKey_5(arg2), \T5
1653 vpclmulqdq $0x11, \T5, \XMM4, \T4
1656 vpclmulqdq $0x00, \T5, \XMM4, \T4
1659 vmovdqu HashKey_5_k(arg2), \T3
1660 vpclmulqdq $0x00, \T3, \T2, \T2
1661 vpxor \T2, \XMM1, \XMM1
1663 ######################
1665 vpshufd $0b01001110, \XMM5, \T2
1666 vpxor \XMM5, \T2, \T2
1667 vmovdqu HashKey_4(arg2), \T5
1668 vpclmulqdq $0x11, \T5, \XMM5, \T4
1671 vpclmulqdq $0x00, \T5, \XMM5, \T4
1674 vmovdqu HashKey_4_k(arg2), \T3
1675 vpclmulqdq $0x00, \T3, \T2, \T2
1676 vpxor \T2, \XMM1, \XMM1
1678 ######################
1680 vpshufd $0b01001110, \XMM6, \T2
1681 vpxor \XMM6, \T2, \T2
1682 vmovdqu HashKey_3(arg2), \T5
1683 vpclmulqdq $0x11, \T5, \XMM6, \T4
1686 vpclmulqdq $0x00, \T5, \XMM6, \T4
1689 vmovdqu HashKey_3_k(arg2), \T3
1690 vpclmulqdq $0x00, \T3, \T2, \T2
1691 vpxor \T2, \XMM1, \XMM1
1693 ######################
1695 vpshufd $0b01001110, \XMM7, \T2
1696 vpxor \XMM7, \T2, \T2
1697 vmovdqu HashKey_2(arg2), \T5
1698 vpclmulqdq $0x11, \T5, \XMM7, \T4
1701 vpclmulqdq $0x00, \T5, \XMM7, \T4
1704 vmovdqu HashKey_2_k(arg2), \T3
1705 vpclmulqdq $0x00, \T3, \T2, \T2
1706 vpxor \T2, \XMM1, \XMM1
1708 ######################
1710 vpshufd $0b01001110, \XMM8, \T2
1711 vpxor \XMM8, \T2, \T2
1712 vmovdqu HashKey(arg2), \T5
1713 vpclmulqdq $0x11, \T5, \XMM8, \T4
1716 vpclmulqdq $0x00, \T5, \XMM8, \T4
1719 vmovdqu HashKey_k(arg2), \T3
1720 vpclmulqdq $0x00, \T3, \T2, \T2
1722 vpxor \T2, \XMM1, \XMM1
1723 vpxor \T6, \XMM1, \XMM1
1724 vpxor \T7, \XMM1, \T2
1729 vpslldq $8, \T2, \T4
1730 vpsrldq $8, \T2, \T2
1733 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
1734 # the accumulated carry-less multiplications
1736 #######################################################################
1737 #first phase of the reduction
1738 vpslld $31, \T7, \T2 # packed right shifting << 31
1739 vpslld $30, \T7, \T3 # packed right shifting shift << 30
1740 vpslld $25, \T7, \T4 # packed right shifting shift << 25
1742 vpxor \T3, \T2, \T2 # xor the shifted versions
1745 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1747 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1748 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1749 #######################################################################
1752 #second phase of the reduction
1753 vpsrld $1, \T7, \T2 # packed left shifting >> 1
1754 vpsrld $2, \T7, \T3 # packed left shifting >> 2
1755 vpsrld $7, \T7, \T4 # packed left shifting >> 7
1756 vpxor \T3, \T2, \T2 # xor the shifted versions
1761 vpxor \T7, \T6, \T6 # the result is in T6
1765 #############################################################
1766 #void aesni_gcm_init_avx_gen2
1767 # (gcm_data *my_ctx_data,
1768 # gcm_context_data *data,
1769 # u8 *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1770 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1771 # (from Security Association) concatenated with 8 byte
1772 # Initialisation Vector (from IPSec ESP Payload)
1773 # concatenated with 0x00000001. 16-byte aligned pointer. */
1774 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1775 # u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1776 #############################################################
1777 SYM_FUNC_START(aesni_gcm_init_avx_gen2)
# Initialize the GCM context (hash subkey tables, initial counter block)
# by delegating to the shared INIT macro, instantiated with the AVX
# (gen2) GHASH-multiply and hash-key-power precompute implementations.
1779 INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
1782 SYM_FUNC_END(aesni_gcm_init_avx_gen2)
1784 ###############################################################################
1785 #void aesni_gcm_enc_update_avx_gen2(
1786 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1787 # gcm_context_data *data,
1788 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1789 # const u8 *in, /* Plaintext input */
1790 # u64 plaintext_len) /* Length of data in Bytes for encryption. */
1791 ###############################################################################
1792 SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
# Dispatch on AES key size; the compares feeding each 'je' (and the
# key_128/key_256 labels themselves) are not visible in this excerpt —
# presumably they test the expanded-key round count; confirm in context.
1796 je key_256_enc_update
1798 je key_128_enc_update
# Fall-through path: round parameter 11 (AES-192), encrypt direction.
1800 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
# AES-128 path: same building blocks, round parameter 9.
1804 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
# AES-256 path: round parameter 13.
1808 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
1811 SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
1813 ###############################################################################
1814 #void aesni_gcm_dec_update_avx_gen2(
1815 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1816 # gcm_context_data *data,
1817 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1818 # const u8 *in, /* Ciphertext input */
1819 # u64 plaintext_len) /* Length of data in Bytes for decryption. */
1820 ###############################################################################
1821 SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
# Decrypt counterpart of the enc_update entry point: identical macro
# instantiations except for the DEC direction flag. Key-size compares
# and the key_128/key_256 labels are not visible in this excerpt.
1825 je key_256_dec_update
1827 je key_128_dec_update
# Fall-through path: round parameter 11 (AES-192), decrypt direction.
1829 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
# AES-128 path: round parameter 9.
1833 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
# AES-256 path: round parameter 13.
1837 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
1840 SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
1842 ###############################################################################
1843 #void aesni_gcm_finalize_avx_gen2(
1844 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1845 # gcm_context_data *data,
1846 # u8 *auth_tag, /* Authenticated Tag output. */
1847 # u64 auth_tag_len) /* Authenticated Tag Length in bytes.
1848 # Valid values are 16 (most likely), 12 or 8. */
1849 ###############################################################################
1850 SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
# Produce the final authentication tag: arg3 = auth_tag output buffer,
# arg4 = auth_tag_len (per the header comment above). Three variants
# select the AES round parameter (11/9/13 for 192/128/256-bit keys);
# the key-size compare/branch code is not visible in this excerpt.
1858 GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
1862 GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
1866 GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
1869 SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
1871 #ifdef CONFIG_AS_AVX2
1872 ###############################################################################
1873 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1874 # Input: A and B (128-bits each, bit-reflected)
1875 # Output: C = A*B*x mod poly, (i.e. >>1 )
1876 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1877 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1878 ###############################################################################
1879 .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
# In:  \GH = bit-reflected hash state, \HK = HashKey<<1 mod poly
# Out: \GH = GH*HK*x mod poly (see header comment above)
# \T1-\T5 are clobbered scratch registers.
#
# 128x128 -> 256-bit carry-less schoolbook multiply: four 64x64 products.
1881 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
1882 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
1883 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
1884 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
# Split the middle terms so they can be folded into the high/low halves.
1888 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
1889 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
1894 #######################################################################
1895 #first phase of the reduction
# POLY2 holds the GHASH reduction polynomial; both reduction phases are
# done with pclmulqdq against it instead of shift/xor sequences.
1896 vmovdqa POLY2(%rip), \T3
1898 vpclmulqdq $0x01, \GH, \T3, \T2
1899 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1901 vpxor \T2, \GH, \GH # first phase of the reduction complete
1902 #######################################################################
1903 #second phase of the reduction
1904 vpclmulqdq $0x00, \GH, \T3, \T2
1905 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1907 vpclmulqdq $0x10, \GH, \T3, \GH
1908 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1910 vpxor \T2, \GH, \GH # second phase of the reduction complete
1911 #######################################################################
# Fold the reduced low half with the high product half.
1912 vpxor \T1, \GH, \GH # the result is in GH
1917 .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
# Precompute HashKey^2 .. HashKey^8 (each <<1 mod poly) and cache them in
# the context at HashKey_2..HashKey_8(arg2) for the 8-way parallel GHASH.
# Assumes \T5 enters holding HashKey<<1 mod poly — the setup line is not
# visible in this excerpt; TODO confirm against the full macro.
1919 # Hashkey_i_k holds XORed values of the low and high parts of the Hashkey_i
1921 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1922 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
1924 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1925 vmovdqu \T5, HashKey_3(arg2)
1927 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1928 vmovdqu \T5, HashKey_4(arg2)
1930 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1931 vmovdqu \T5, HashKey_5(arg2)
1933 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1934 vmovdqu \T5, HashKey_6(arg2)
1936 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1937 vmovdqu \T5, HashKey_7(arg2)
1939 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
1940 vmovdqu \T5, HashKey_8(arg2)
1944 ## if a = number of total plaintext bytes
1946 ## num_initial_blocks = b mod 4
1947 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1948 ## r10, r11, r12, rax are clobbered
1949 ## arg1, arg3, arg4, r14 are used as a pointer only, not modified
1951 .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1952 i = (8-\num_initial_blocks)
1954 vmovdqu AadHash(arg2), reg_i
1956 # start AES for num_initial_blocks blocks
1957 vmovdqu CurCount(arg2), \CTR
1959 i = (9-\num_initial_blocks)
1961 .rep \num_initial_blocks
1962 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1964 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1969 vmovdqa (arg1), \T_key
1970 i = (9-\num_initial_blocks)
1972 .rep \num_initial_blocks
1973 vpxor \T_key, reg_i, reg_i
1981 vmovdqa 16*j(arg1), \T_key
1982 i = (9-\num_initial_blocks)
1984 .rep \num_initial_blocks
1985 vaesenc \T_key, reg_i, reg_i
1995 vmovdqa 16*j(arg1), \T_key
1996 i = (9-\num_initial_blocks)
1998 .rep \num_initial_blocks
1999 vaesenclast \T_key, reg_i, reg_i
2004 i = (9-\num_initial_blocks)
2006 .rep \num_initial_blocks
2007 vmovdqu (arg4, %r11), \T1
2008 vpxor \T1, reg_i, reg_i
2009 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for
2010 # num_initial_blocks blocks
2015 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
2021 i = (8-\num_initial_blocks)
2022 j = (9-\num_initial_blocks)
2025 .rep \num_initial_blocks
2026 vpxor reg_i, reg_j, reg_j
2027 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
2032 # XMM8 has the combined result here
2034 vmovdqa \XMM8, TMP1(%rsp)
2038 jl _initial_blocks_done\@ # no need for precomputed constants
2040 ###############################################################################
2041 # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
2042 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2044 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2046 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2048 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2050 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2052 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2054 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2056 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2058 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2060 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2062 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2064 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2066 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2068 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2070 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2072 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2074 vmovdqa (arg1), \T_key
2075 vpxor \T_key, \XMM1, \XMM1
2076 vpxor \T_key, \XMM2, \XMM2
2077 vpxor \T_key, \XMM3, \XMM3
2078 vpxor \T_key, \XMM4, \XMM4
2079 vpxor \T_key, \XMM5, \XMM5
2080 vpxor \T_key, \XMM6, \XMM6
2081 vpxor \T_key, \XMM7, \XMM7
2082 vpxor \T_key, \XMM8, \XMM8
2086 .rep \REP # do REP rounds
2087 vmovdqa 16*i(arg1), \T_key
2088 vaesenc \T_key, \XMM1, \XMM1
2089 vaesenc \T_key, \XMM2, \XMM2
2090 vaesenc \T_key, \XMM3, \XMM3
2091 vaesenc \T_key, \XMM4, \XMM4
2092 vaesenc \T_key, \XMM5, \XMM5
2093 vaesenc \T_key, \XMM6, \XMM6
2094 vaesenc \T_key, \XMM7, \XMM7
2095 vaesenc \T_key, \XMM8, \XMM8
2101 vmovdqa 16*i(arg1), \T_key
2102 vaesenclast \T_key, \XMM1, \XMM1
2103 vaesenclast \T_key, \XMM2, \XMM2
2104 vaesenclast \T_key, \XMM3, \XMM3
2105 vaesenclast \T_key, \XMM4, \XMM4
2106 vaesenclast \T_key, \XMM5, \XMM5
2107 vaesenclast \T_key, \XMM6, \XMM6
2108 vaesenclast \T_key, \XMM7, \XMM7
2109 vaesenclast \T_key, \XMM8, \XMM8
2111 vmovdqu (arg4, %r11), \T1
2112 vpxor \T1, \XMM1, \XMM1
2113 vmovdqu \XMM1, (arg3 , %r11)
2118 vmovdqu 16*1(arg4, %r11), \T1
2119 vpxor \T1, \XMM2, \XMM2
2120 vmovdqu \XMM2, 16*1(arg3 , %r11)
2125 vmovdqu 16*2(arg4, %r11), \T1
2126 vpxor \T1, \XMM3, \XMM3
2127 vmovdqu \XMM3, 16*2(arg3 , %r11)
2132 vmovdqu 16*3(arg4, %r11), \T1
2133 vpxor \T1, \XMM4, \XMM4
2134 vmovdqu \XMM4, 16*3(arg3 , %r11)
2139 vmovdqu 16*4(arg4, %r11), \T1
2140 vpxor \T1, \XMM5, \XMM5
2141 vmovdqu \XMM5, 16*4(arg3 , %r11)
2146 vmovdqu 16*5(arg4, %r11), \T1
2147 vpxor \T1, \XMM6, \XMM6
2148 vmovdqu \XMM6, 16*5(arg3 , %r11)
2153 vmovdqu 16*6(arg4, %r11), \T1
2154 vpxor \T1, \XMM7, \XMM7
2155 vmovdqu \XMM7, 16*6(arg3 , %r11)
2160 vmovdqu 16*7(arg4, %r11), \T1
2161 vpxor \T1, \XMM8, \XMM8
2162 vmovdqu \XMM8, 16*7(arg3 , %r11)
2169 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2170 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
2171 # the corresponding ciphertext
2172 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2173 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2174 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2175 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2176 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2177 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2178 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2180 ###############################################################################
2182 _initial_blocks_done\@:
2189 # encrypt 8 blocks at a time
2190 # ghash the 8 previously encrypted ciphertext blocks
2191 # arg1, arg3, arg4 are used as pointers only, not modified
2192 # r11 is the data offset value
2193 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2196 vmovdqa \XMM2, TMP2(%rsp)
2197 vmovdqa \XMM3, TMP3(%rsp)
2198 vmovdqa \XMM4, TMP4(%rsp)
2199 vmovdqa \XMM5, TMP5(%rsp)
2200 vmovdqa \XMM6, TMP6(%rsp)
2201 vmovdqa \XMM7, TMP7(%rsp)
2202 vmovdqa \XMM8, TMP8(%rsp)
2204 .if \loop_idx == in_order
2205 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
2206 vpaddd ONE(%rip), \XMM1, \XMM2
2207 vpaddd ONE(%rip), \XMM2, \XMM3
2208 vpaddd ONE(%rip), \XMM3, \XMM4
2209 vpaddd ONE(%rip), \XMM4, \XMM5
2210 vpaddd ONE(%rip), \XMM5, \XMM6
2211 vpaddd ONE(%rip), \XMM6, \XMM7
2212 vpaddd ONE(%rip), \XMM7, \XMM8
2215 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2216 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2217 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2218 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2219 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2220 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2221 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2222 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2224 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
2225 vpaddd ONEf(%rip), \XMM1, \XMM2
2226 vpaddd ONEf(%rip), \XMM2, \XMM3
2227 vpaddd ONEf(%rip), \XMM3, \XMM4
2228 vpaddd ONEf(%rip), \XMM4, \XMM5
2229 vpaddd ONEf(%rip), \XMM5, \XMM6
2230 vpaddd ONEf(%rip), \XMM6, \XMM7
2231 vpaddd ONEf(%rip), \XMM7, \XMM8
2236 #######################################################################
2239 vpxor \T1, \XMM1, \XMM1
2240 vpxor \T1, \XMM2, \XMM2
2241 vpxor \T1, \XMM3, \XMM3
2242 vpxor \T1, \XMM4, \XMM4
2243 vpxor \T1, \XMM5, \XMM5
2244 vpxor \T1, \XMM6, \XMM6
2245 vpxor \T1, \XMM7, \XMM7
2246 vpxor \T1, \XMM8, \XMM8
2248 #######################################################################
2254 vmovdqu 16*1(arg1), \T1
2255 vaesenc \T1, \XMM1, \XMM1
2256 vaesenc \T1, \XMM2, \XMM2
2257 vaesenc \T1, \XMM3, \XMM3
2258 vaesenc \T1, \XMM4, \XMM4
2259 vaesenc \T1, \XMM5, \XMM5
2260 vaesenc \T1, \XMM6, \XMM6
2261 vaesenc \T1, \XMM7, \XMM7
2262 vaesenc \T1, \XMM8, \XMM8
2264 vmovdqu 16*2(arg1), \T1
2265 vaesenc \T1, \XMM1, \XMM1
2266 vaesenc \T1, \XMM2, \XMM2
2267 vaesenc \T1, \XMM3, \XMM3
2268 vaesenc \T1, \XMM4, \XMM4
2269 vaesenc \T1, \XMM5, \XMM5
2270 vaesenc \T1, \XMM6, \XMM6
2271 vaesenc \T1, \XMM7, \XMM7
2272 vaesenc \T1, \XMM8, \XMM8
2275 #######################################################################
2277 vmovdqu HashKey_8(arg2), \T5
2278 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
2279 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
2280 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
2281 vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
2284 vmovdqu 16*3(arg1), \T1
2285 vaesenc \T1, \XMM1, \XMM1
2286 vaesenc \T1, \XMM2, \XMM2
2287 vaesenc \T1, \XMM3, \XMM3
2288 vaesenc \T1, \XMM4, \XMM4
2289 vaesenc \T1, \XMM5, \XMM5
2290 vaesenc \T1, \XMM6, \XMM6
2291 vaesenc \T1, \XMM7, \XMM7
2292 vaesenc \T1, \XMM8, \XMM8
2294 vmovdqa TMP2(%rsp), \T1
2295 vmovdqu HashKey_7(arg2), \T5
2296 vpclmulqdq $0x11, \T5, \T1, \T3
2299 vpclmulqdq $0x00, \T5, \T1, \T3
2302 vpclmulqdq $0x01, \T5, \T1, \T3
2305 vpclmulqdq $0x10, \T5, \T1, \T3
2308 vmovdqu 16*4(arg1), \T1
2309 vaesenc \T1, \XMM1, \XMM1
2310 vaesenc \T1, \XMM2, \XMM2
2311 vaesenc \T1, \XMM3, \XMM3
2312 vaesenc \T1, \XMM4, \XMM4
2313 vaesenc \T1, \XMM5, \XMM5
2314 vaesenc \T1, \XMM6, \XMM6
2315 vaesenc \T1, \XMM7, \XMM7
2316 vaesenc \T1, \XMM8, \XMM8
2318 #######################################################################
2320 vmovdqa TMP3(%rsp), \T1
2321 vmovdqu HashKey_6(arg2), \T5
2322 vpclmulqdq $0x11, \T5, \T1, \T3
2325 vpclmulqdq $0x00, \T5, \T1, \T3
2328 vpclmulqdq $0x01, \T5, \T1, \T3
2331 vpclmulqdq $0x10, \T5, \T1, \T3
2334 vmovdqu 16*5(arg1), \T1
2335 vaesenc \T1, \XMM1, \XMM1
2336 vaesenc \T1, \XMM2, \XMM2
2337 vaesenc \T1, \XMM3, \XMM3
2338 vaesenc \T1, \XMM4, \XMM4
2339 vaesenc \T1, \XMM5, \XMM5
2340 vaesenc \T1, \XMM6, \XMM6
2341 vaesenc \T1, \XMM7, \XMM7
2342 vaesenc \T1, \XMM8, \XMM8
2344 vmovdqa TMP4(%rsp), \T1
2345 vmovdqu HashKey_5(arg2), \T5
2346 vpclmulqdq $0x11, \T5, \T1, \T3
2349 vpclmulqdq $0x00, \T5, \T1, \T3
2352 vpclmulqdq $0x01, \T5, \T1, \T3
2355 vpclmulqdq $0x10, \T5, \T1, \T3
2358 vmovdqu 16*6(arg1), \T1
2359 vaesenc \T1, \XMM1, \XMM1
2360 vaesenc \T1, \XMM2, \XMM2
2361 vaesenc \T1, \XMM3, \XMM3
2362 vaesenc \T1, \XMM4, \XMM4
2363 vaesenc \T1, \XMM5, \XMM5
2364 vaesenc \T1, \XMM6, \XMM6
2365 vaesenc \T1, \XMM7, \XMM7
2366 vaesenc \T1, \XMM8, \XMM8
2369 vmovdqa TMP5(%rsp), \T1
2370 vmovdqu HashKey_4(arg2), \T5
2371 vpclmulqdq $0x11, \T5, \T1, \T3
2374 vpclmulqdq $0x00, \T5, \T1, \T3
2377 vpclmulqdq $0x01, \T5, \T1, \T3
2380 vpclmulqdq $0x10, \T5, \T1, \T3
2383 vmovdqu 16*7(arg1), \T1
2384 vaesenc \T1, \XMM1, \XMM1
2385 vaesenc \T1, \XMM2, \XMM2
2386 vaesenc \T1, \XMM3, \XMM3
2387 vaesenc \T1, \XMM4, \XMM4
2388 vaesenc \T1, \XMM5, \XMM5
2389 vaesenc \T1, \XMM6, \XMM6
2390 vaesenc \T1, \XMM7, \XMM7
2391 vaesenc \T1, \XMM8, \XMM8
2393 vmovdqa TMP6(%rsp), \T1
2394 vmovdqu HashKey_3(arg2), \T5
2395 vpclmulqdq $0x11, \T5, \T1, \T3
2398 vpclmulqdq $0x00, \T5, \T1, \T3
2401 vpclmulqdq $0x01, \T5, \T1, \T3
2404 vpclmulqdq $0x10, \T5, \T1, \T3
2407 vmovdqu 16*8(arg1), \T1
2408 vaesenc \T1, \XMM1, \XMM1
2409 vaesenc \T1, \XMM2, \XMM2
2410 vaesenc \T1, \XMM3, \XMM3
2411 vaesenc \T1, \XMM4, \XMM4
2412 vaesenc \T1, \XMM5, \XMM5
2413 vaesenc \T1, \XMM6, \XMM6
2414 vaesenc \T1, \XMM7, \XMM7
2415 vaesenc \T1, \XMM8, \XMM8
2417 vmovdqa TMP7(%rsp), \T1
2418 vmovdqu HashKey_2(arg2), \T5
2419 vpclmulqdq $0x11, \T5, \T1, \T3
2422 vpclmulqdq $0x00, \T5, \T1, \T3
2425 vpclmulqdq $0x01, \T5, \T1, \T3
2428 vpclmulqdq $0x10, \T5, \T1, \T3
2432 #######################################################################
2434 vmovdqu 16*9(arg1), \T5
2435 vaesenc \T5, \XMM1, \XMM1
2436 vaesenc \T5, \XMM2, \XMM2
2437 vaesenc \T5, \XMM3, \XMM3
2438 vaesenc \T5, \XMM4, \XMM4
2439 vaesenc \T5, \XMM5, \XMM5
2440 vaesenc \T5, \XMM6, \XMM6
2441 vaesenc \T5, \XMM7, \XMM7
2442 vaesenc \T5, \XMM8, \XMM8
2444 vmovdqa TMP8(%rsp), \T1
2445 vmovdqu HashKey(arg2), \T5
2447 vpclmulqdq $0x00, \T5, \T1, \T3
2450 vpclmulqdq $0x01, \T5, \T1, \T3
2453 vpclmulqdq $0x10, \T5, \T1, \T3
2456 vpclmulqdq $0x11, \T5, \T1, \T3
2460 vmovdqu 16*10(arg1), \T5
2465 vaesenc \T5, \XMM1, \XMM1
2466 vaesenc \T5, \XMM2, \XMM2
2467 vaesenc \T5, \XMM3, \XMM3
2468 vaesenc \T5, \XMM4, \XMM4
2469 vaesenc \T5, \XMM5, \XMM5
2470 vaesenc \T5, \XMM6, \XMM6
2471 vaesenc \T5, \XMM7, \XMM7
2472 vaesenc \T5, \XMM8, \XMM8
2474 vmovdqu 16*i(arg1), \T5
2483 vpxor 16*i(arg4, %r11), \T5, \T2
2485 vaesenclast \T2, reg_j, reg_j
2487 vaesenclast \T2, reg_j, \T3
2488 vmovdqu 16*i(arg4, %r11), reg_j
2489 vmovdqu \T3, 16*i(arg3, %r11)
2495 #######################################################################
2498 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
2499 vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs
2501 vpxor \T6, \T1, \T1 # accumulate the results in T1:T7
2505 #######################################################################
2506 #first phase of the reduction
2507 vmovdqa POLY2(%rip), \T3
2509 vpclmulqdq $0x01, \T7, \T3, \T2
2510 vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
2512 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2513 #######################################################################
2515 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
2516 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
2517 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
2518 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
2519 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
2520 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
2521 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
2522 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
2525 #######################################################################
2526 #second phase of the reduction
2527 vpclmulqdq $0x00, \T7, \T3, \T2
2528 vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2530 vpclmulqdq $0x10, \T7, \T3, \T4
2531 vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
2533 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2534 #######################################################################
2535 vpxor \T4, \T1, \T1 # the result is in T1
2537 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2538 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2539 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2540 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2541 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2542 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2543 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2544 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2547 vpxor \T1, \XMM1, \XMM1
2554 # GHASH the last 8 ciphertext blocks.
2555 .macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
## GHASH the final 8 ciphertext blocks held in \XMM1..\XMM8 and reduce the
## 256-bit carry-less product to a single 128-bit value, left in \T6.
## Each block is carry-less multiplied by its matching precomputed hash key
## (HashKey_8(arg2) for block 1 down to HashKey(arg2) for block 8) using the
## Karatsuba decomposition:
##   high halves:  vpclmulqdq $0x11
##   low  halves:  vpclmulqdq $0x00
##   middle term:  (X.hi ^ X.lo) * (H.hi ^ H.lo), whose operands are built
##                 by the vpshufd $0b01001110 (qword-swap) / vpxor pairs
## \T6 accumulates the high products, \T7 the low products, and \XMM1 the
## middle terms.
## NOTE(review): the embedded source-line numbers are discontinuous in this
## excerpt; the elided lines appear to XOR each per-block \T4 product into
## \T6/\T7 — confirm against the complete file before relying on this text.

## block 1 x HashKey_8: products seed the \T6 (hi), \T7 (lo), \XMM1 (mid)
## accumulators
2559 vmovdqu HashKey_8(arg2), \T5
2561 vpshufd $0b01001110, \XMM1, \T2
2562 vpshufd $0b01001110, \T5, \T3
2563 vpxor \XMM1, \T2, \T2
2566 vpclmulqdq $0x11, \T5, \XMM1, \T6
2567 vpclmulqdq $0x00, \T5, \XMM1, \T7
2569 vpclmulqdq $0x00, \T3, \T2, \XMM1
2571 ######################

## block 2 x HashKey_7: middle term folded into \XMM1
2573 vmovdqu HashKey_7(arg2), \T5
2574 vpshufd $0b01001110, \XMM2, \T2
2575 vpshufd $0b01001110, \T5, \T3
2576 vpxor \XMM2, \T2, \T2
2579 vpclmulqdq $0x11, \T5, \XMM2, \T4
2582 vpclmulqdq $0x00, \T5, \XMM2, \T4
2585 vpclmulqdq $0x00, \T3, \T2, \T2
2587 vpxor \T2, \XMM1, \XMM1
2589 ######################

## block 3 x HashKey_6
2591 vmovdqu HashKey_6(arg2), \T5
2592 vpshufd $0b01001110, \XMM3, \T2
2593 vpshufd $0b01001110, \T5, \T3
2594 vpxor \XMM3, \T2, \T2
2597 vpclmulqdq $0x11, \T5, \XMM3, \T4
2600 vpclmulqdq $0x00, \T5, \XMM3, \T4
2603 vpclmulqdq $0x00, \T3, \T2, \T2
2605 vpxor \T2, \XMM1, \XMM1
2607 ######################

## block 4 x HashKey_5
2609 vmovdqu HashKey_5(arg2), \T5
2610 vpshufd $0b01001110, \XMM4, \T2
2611 vpshufd $0b01001110, \T5, \T3
2612 vpxor \XMM4, \T2, \T2
2615 vpclmulqdq $0x11, \T5, \XMM4, \T4
2618 vpclmulqdq $0x00, \T5, \XMM4, \T4
2621 vpclmulqdq $0x00, \T3, \T2, \T2
2623 vpxor \T2, \XMM1, \XMM1
2625 ######################

## block 5 x HashKey_4
2627 vmovdqu HashKey_4(arg2), \T5
2628 vpshufd $0b01001110, \XMM5, \T2
2629 vpshufd $0b01001110, \T5, \T3
2630 vpxor \XMM5, \T2, \T2
2633 vpclmulqdq $0x11, \T5, \XMM5, \T4
2636 vpclmulqdq $0x00, \T5, \XMM5, \T4
2639 vpclmulqdq $0x00, \T3, \T2, \T2
2641 vpxor \T2, \XMM1, \XMM1
2643 ######################

## block 6 x HashKey_3
2645 vmovdqu HashKey_3(arg2), \T5
2646 vpshufd $0b01001110, \XMM6, \T2
2647 vpshufd $0b01001110, \T5, \T3
2648 vpxor \XMM6, \T2, \T2
2651 vpclmulqdq $0x11, \T5, \XMM6, \T4
2654 vpclmulqdq $0x00, \T5, \XMM6, \T4
2657 vpclmulqdq $0x00, \T3, \T2, \T2
2659 vpxor \T2, \XMM1, \XMM1
2661 ######################

## block 7 x HashKey_2
2663 vmovdqu HashKey_2(arg2), \T5
2664 vpshufd $0b01001110, \XMM7, \T2
2665 vpshufd $0b01001110, \T5, \T3
2666 vpxor \XMM7, \T2, \T2
2669 vpclmulqdq $0x11, \T5, \XMM7, \T4
2672 vpclmulqdq $0x00, \T5, \XMM7, \T4
2675 vpclmulqdq $0x00, \T3, \T2, \T2
2677 vpxor \T2, \XMM1, \XMM1
2679 ######################

## block 8 x HashKey (H^1)
2681 vmovdqu HashKey(arg2), \T5
2682 vpshufd $0b01001110, \XMM8, \T2
2683 vpshufd $0b01001110, \T5, \T3
2684 vpxor \XMM8, \T2, \T2
2687 vpclmulqdq $0x11, \T5, \XMM8, \T4
2690 vpclmulqdq $0x00, \T5, \XMM8, \T4
2693 vpclmulqdq $0x00, \T3, \T2, \T2

## combine the Karatsuba accumulators: \T2 = mid ^ hi ^ lo, then split it
## across the high/low halves of the 256-bit <\T6:\T7> product
2695 vpxor \T2, \XMM1, \XMM1
2696 vpxor \T6, \XMM1, \XMM1
2697 vpxor \T7, \XMM1, \T2
2702 vpslldq $8, \T2, \T4
2703 vpsrldq $8, \T2, \T2
2706 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the
2707 # accumulated carry-less multiplications

## GF(2^128) modular reduction of <T6:T7> using the POLY2 constant
2709 #######################################################################
2710 #first phase of the reduction
2711 vmovdqa POLY2(%rip), \T3
2713 vpclmulqdq $0x01, \T7, \T3, \T2
2714 vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
2716 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2717 #######################################################################
2720 #second phase of the reduction
2721 vpclmulqdq $0x00, \T7, \T3, \T2
2722 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2724 vpclmulqdq $0x10, \T7, \T3, \T4
2725 vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2727 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2728 #######################################################################
2729 vpxor \T4, \T6, \T6 # the result is in T6
2734 #############################################################
2735 #void aesni_gcm_init_avx_gen4
2736 # (gcm_data *my_ctx_data,
2737 # gcm_context_data *data,
2738 # u8 *iv, /* Pre-counter block j0: 4 byte salt
2739 # (from Security Association) concatenated with 8 byte
2740 # Initialisation Vector (from IPSec ESP Payload)
2741 # concatenated with 0x00000001. 16-byte aligned pointer. */
2742 # u8 *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
2743 # const u8 *aad, /* Additional Authentication Data (AAD)*/
2744 # u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2745 #############################################################
2746 SYM_FUNC_START(aesni_gcm_init_avx_gen4)
# Initialize the GCM context for the AVX2 (gen4) code path: instantiate the
# shared INIT macro with the gen4 GHASH-multiply and hash-key-precompute
# primitives.
# NOTE(review): the embedded source-line numbers are discontinuous; the
# FUNC_SAVE / FUNC_RESTORE / RET lines appear to be elided from this
# excerpt — confirm against the complete file.
2748 INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
2751 SYM_FUNC_END(aesni_gcm_init_avx_gen4)
2753 ###############################################################################
2754 #void aesni_gcm_enc_update_avx_gen4(
2755 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2756 # gcm_context_data *data,
2757 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
2758 # const u8 *in, /* Plaintext input */
2759 # u64 plaintext_len) /* Length of data in Bytes for encryption. */
2760 ###############################################################################
2761 SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
# Encrypt a chunk of plaintext and update the running GHASH state, using the
# AVX2 (gen4) building blocks. Dispatches on key size to pick the AES round
# count passed to GCM_ENC_DEC: fall-through = 11, key_128 path = 9,
# key_256 path = 13 (presumably AES-192/AES-128/AES-256 — the 128/256
# mapping is supported by the label names; confirm the compares below).
# NOTE(review): the key-size compare instructions feeding these je's, plus
# FUNC_SAVE / FUNC_RESTORE / RET, are elided from this excerpt (the embedded
# source-line numbers are discontinuous).
2765 je key_256_enc_update4
2767 je key_128_enc_update4
# neither label taken: round count 11
2769 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
2772 key_128_enc_update4:
2773 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
2776 key_256_enc_update4:
2777 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
2780 SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
2782 ###############################################################################
2783 #void aesni_gcm_dec_update_avx_gen4(
2784 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2785 # gcm_context_data *data,
2786 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
2787 # const u8 *in, /* Ciphertext input */
2788 # u64 plaintext_len) /* Length of data in Bytes for decryption. */
2789 ###############################################################################
2790 SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
# Decrypt a chunk of ciphertext and update the running GHASH state, using
# the AVX2 (gen4) building blocks. Mirrors the encrypt-update dispatch:
# fall-through = round count 11, key_128 path = 9, key_256 path = 13
# (presumably AES-192/AES-128/AES-256 — the 128/256 mapping is supported by
# the label names; confirm the compares below).
# NOTE(review): the key-size compare instructions feeding these je's, plus
# FUNC_SAVE / FUNC_RESTORE / RET, are elided from this excerpt (the embedded
# source-line numbers are discontinuous).
2794 je key_256_dec_update4
2796 je key_128_dec_update4
# neither label taken: round count 11
2798 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
2801 key_128_dec_update4:
2802 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
2805 key_256_dec_update4:
2806 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
2809 SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
2811 ###############################################################################
2812 #void aesni_gcm_finalize_avx_gen4(
2813 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2814 # gcm_context_data *data,
2815 # u8 *auth_tag, /* Authenticated Tag output. */
2816 # u64 auth_tag_len) /* Authenticated Tag Length in bytes.
2817 # Valid values are 16 (most likely), 12 or 8. */
2818 ###############################################################################
2819 SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
# Complete the GCM operation via GCM_COMPLETE and emit the authentication
# tag (arg3 = tag output buffer, arg4 = tag length, per the header comment
# above). Dispatches on key size to pick the round count: fall-through = 11,
# key_128 path = 9, key_256 path = 13.
# NOTE(review): the key-size compares feeding these je's and the
# key_128_finalize4 / key_256_finalize4 label definitions (plus
# FUNC_SAVE / FUNC_RESTORE / RET) are elided from this excerpt (the embedded
# source-line numbers are discontinuous) — confirm against the complete file.
2823 je key_256_finalize4
2825 je key_128_finalize4
2827 GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
2831 GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
2835 GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
2838 SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
2840 #endif /* CONFIG_AS_AVX2 */