1 ########################################################################
2 # Copyright (c) 2013, Intel Corporation
4 # This software is available to you under a choice of one of two
5 # licenses. You may choose to be licensed under the terms of the GNU
6 # General Public License (GPL) Version 2, available from the file
7 # COPYING in the main directory of this source tree, or the
8 # OpenIB.org BSD license below:
10 # Redistribution and use in source and binary forms, with or without
11 # modification, are permitted provided that the following conditions are
14 # * Redistributions of source code must retain the above copyright
15 # notice, this list of conditions and the following disclaimer.
17 # * Redistributions in binary form must reproduce the above copyright
18 # notice, this list of conditions and the following disclaimer in the
19 # documentation and/or other materials provided with the
22 # * Neither the name of the Intel Corporation nor the names of its
23 # contributors may be used to endorse or promote products derived from
24 # this software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
28 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 ########################################################################
41 ## Erdinc Ozturk <erdinc.ozturk@intel.com>
42 ## Vinodh Gopal <vinodh.gopal@intel.com>
43 ## James Guilford <james.guilford@intel.com>
44 ## Tim Chen <tim.c.chen@linux.intel.com>
## This code was derived and highly optimized from the code described in the paper:
## Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
## on Intel Architecture Processors. August, 2010
## The details of the implementation are explained in:
## Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
## on Intel Architecture Processors. October, 2012.
60 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62 ## | Salt (From the SA) |
63 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64 ## | Initialization Vector |
65 ## | (This is the sequence number from IPSec header) |
66 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
68 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
73 ## AAD padded to 128 bits with 0
## for example, assume AAD is a u32 vector
##
## if AAD is 8 bytes:
## AAD[2] = {A0, A1};
## padded AAD in xmm register = {A1 A0 0 0}
81 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
84 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85 ## | 32-bit Sequence Number (A0) |
86 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
88 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
90 ## AAD Format with 32-bit Sequence Number
92 ## if AAD is 12 bytes:
## AAD[3] = {A0, A1, A2};
94 ## padded AAD in xmm register = {A2 A1 A0 0}
97 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
100 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101 ## | 64-bit Extended Sequence Number {A1,A0} |
103 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
105 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
107 ## AAD Format with 64-bit Extended Sequence Number
111 ## from the definition of the spec, aadLen can only be 8 or 12 bytes.
112 ## The code additionally supports aadLen of length 16 bytes.
115 ## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
117 ## poly = x^128 + x^127 + x^126 + x^121 + 1
## throughout the code, one-tab and two-tab indentations are used: one tab
## is for the GHASH part, two tabs are for the AES part.
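##
## GHASH operates on bit-reflected 128-bit values: a 256-bit carry-less
## product is folded back to 128 bits modulo the polynomial above. The
## AVX routines below do this with a two-phase shift/xor reduction (the
## shift counts 1/2/7 and 31/30/25 come from the x^127, x^126 and x^121
## terms of the polynomial, applied to the reflected data), while the
## *_AVX2 variants use two extra vpclmulqdq against the POLY2 constant
## instead. Shift-direction comments in the reduction code describe the
## shift as seen in this reflected domain, not the register-level
## instruction.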
122 #include <linux/linkage.h>
124 # constants in mergeable sections, linker can reorder and merge
125 .section .rodata.cst16.POLY, "aM", @progbits, 16
127 POLY: .octa 0xC2000000000000000000000000000001
129 .section .rodata.cst16.POLY2, "aM", @progbits, 16
131 POLY2: .octa 0xC20000000000000000000001C2000000
133 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
135 TWOONE: .octa 0x00000001000000000000000000000001
137 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
139 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
141 .section .rodata.cst16.ONE, "aM", @progbits, 16
143 ONE: .octa 0x00000000000000000000000000000001
145 .section .rodata.cst16.ONEf, "aM", @progbits, 16
147 ONEf: .octa 0x01000000000000000000000000000000
149 # order of these constants should not change.
150 # more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
151 .section .rodata, "a", @progbits
153 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
154 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
155 .octa 0x00000000000000000000000000000000
159 .type aad_shift_arr, @object
160 .size aad_shift_arr, 272
162 .octa 0xffffffffffffffffffffffffffffffff
163 .octa 0xffffffffffffffffffffffffffffff0C
164 .octa 0xffffffffffffffffffffffffffff0D0C
165 .octa 0xffffffffffffffffffffffffff0E0D0C
166 .octa 0xffffffffffffffffffffffff0F0E0D0C
167 .octa 0xffffffffffffffffffffff0C0B0A0908
168 .octa 0xffffffffffffffffffff0D0C0B0A0908
169 .octa 0xffffffffffffffffff0E0D0C0B0A0908
170 .octa 0xffffffffffffffff0F0E0D0C0B0A0908
171 .octa 0xffffffffffffff0C0B0A090807060504
172 .octa 0xffffffffffff0D0C0B0A090807060504
173 .octa 0xffffffffff0E0D0C0B0A090807060504
174 .octa 0xffffffff0F0E0D0C0B0A090807060504
175 .octa 0xffffff0C0B0A09080706050403020100
176 .octa 0xffff0D0C0B0A09080706050403020100
177 .octa 0xff0E0D0C0B0A09080706050403020100
178 .octa 0x0F0E0D0C0B0A09080706050403020100
186 #define InLen (16*1)+8
187 #define PBlockEncKey 16*2
189 #define CurCount 16*4
190 #define PBlockLen 16*5
192 HashKey = 16*6 # store HashKey <<1 mod poly here
193 HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
194 HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
195 HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
196 HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
197 HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
198 HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
199 HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
200 HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
201 HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
202 HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
203 HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
204 HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
205 HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
206 HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
207 HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
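#
# The *_k values exist for Karatsuba: with HashKey^i = H1:H0 and a data
# block A = A1:A0 (64-bit halves), the middle term of the carry-less product
# satisfies A1*H0 xor A0*H1 = (A1 xor A0)*(H1 xor H0) xor A1*H1 xor A0*H0,
# so only three vpclmulqdq per block are needed and the key-side (H1 xor H0)
# can be precomputed here instead of being recomputed in the hot loop.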
215 #define keysize 2*15*16(arg1)
225 .macro define_reg r n
236 TMP1 = 16*0 # Temporary storage for AAD
237 TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
238 TMP3 = 16*2 # Temporary storage for AES State 3
239 TMP4 = 16*3 # Temporary storage for AES State 4
240 TMP5 = 16*4 # Temporary storage for AES State 5
241 TMP6 = 16*5 # Temporary storage for AES State 6
242 TMP7 = 16*6 # Temporary storage for AES State 7
243 TMP8 = 16*7 # Temporary storage for AES State 8
245 VARIABLE_OFFSET = 16*8
247 ################################
249 ################################
261 sub $VARIABLE_OFFSET, %rsp
262 and $~63, %rsp # align rsp to 64 bytes
274 # Encryption of a single block
275 .macro ENCRYPT_SINGLE_BLOCK REP XMM0
276 vpxor (arg1), \XMM0, \XMM0
280 vaesenc 16*i(arg1), \XMM0, \XMM0
284 vaesenclast 16*i(arg1), \XMM0, \XMM0
287 # combined for GCM encrypt and decrypt functions
288 # clobbering all xmm registers
289 # clobbering r10, r11, r12, r13, r14, r15
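# Rough flow: 1) fold in any partial block left over from a previous update
# call (PARTIAL_BLOCK), 2) encrypt/hash 0-7 initial blocks so the remaining
# data is a multiple of 8 blocks (INITIAL_BLOCKS), 3) run the
# 8-blocks-at-a-time loop (GHASH_8_ENCRYPT_8_PARALLEL), 4) GHASH the final
# 8 blocks (GHASH_LAST_8) and stash any trailing <16-byte tail as a new
# partial block for the next call.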
290 .macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
291 vmovdqu AadHash(arg2), %xmm8
292 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
293 add arg5, InLen(arg2)
295 # initialize the data pointer offset as zero
298 PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
301 mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
302 and $-16, %r13 # r13 = r13 - (r13 mod 16)
307 jz _initial_num_blocks_is_0\@
310 je _initial_num_blocks_is_7\@
312 je _initial_num_blocks_is_6\@
314 je _initial_num_blocks_is_5\@
316 je _initial_num_blocks_is_4\@
318 je _initial_num_blocks_is_3\@
320 je _initial_num_blocks_is_2\@
322 jmp _initial_num_blocks_is_1\@
324 _initial_num_blocks_is_7\@:
325 \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
327 jmp _initial_blocks_encrypted\@
329 _initial_num_blocks_is_6\@:
330 \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
332 jmp _initial_blocks_encrypted\@
334 _initial_num_blocks_is_5\@:
335 \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
337 jmp _initial_blocks_encrypted\@
339 _initial_num_blocks_is_4\@:
340 \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
342 jmp _initial_blocks_encrypted\@
344 _initial_num_blocks_is_3\@:
345 \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
347 jmp _initial_blocks_encrypted\@
349 _initial_num_blocks_is_2\@:
350 \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
352 jmp _initial_blocks_encrypted\@
354 _initial_num_blocks_is_1\@:
355 \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
357 jmp _initial_blocks_encrypted\@
359 _initial_num_blocks_is_0\@:
360 \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
363 _initial_blocks_encrypted\@:
365 je _zero_cipher_left\@
368 je _eight_cipher_left\@
375 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
385 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
388 jne _encrypt_by_8_new\@
390 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
391 jmp _eight_cipher_left\@
394 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
396 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
397 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
400 jne _encrypt_by_8_new\@
402 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
407 _eight_cipher_left\@:
408 \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
412 vmovdqu %xmm14, AadHash(arg2)
413 vmovdqu %xmm9, CurCount(arg2)
417 and $15, %r13 # r13 = (arg5 mod 16)
419 je _multiple_of_16_bytes\@
421 # handle the last <16 Byte block separately
423 mov %r13, PBlockLen(arg2)
425 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
426 vmovdqu %xmm9, CurCount(arg2)
427 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
429 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
430 vmovdqu %xmm9, PBlockEncKey(arg2)
433 jge _large_enough_update\@
435 lea (arg4,%r11,1), %r10
438 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
440 lea SHIFT_MASK+16(%rip), %r12
441 sub %r13, %r12 # adjust the shuffle mask pointer to be
442 # able to shift 16-r13 bytes (r13 is the
443 # number of bytes in plaintext mod 16)
445 jmp _final_ghash_mul\@
447 _large_enough_update\@:
451 # receive the last <16 Byte block
452 vmovdqu (arg4, %r11, 1), %xmm1
457 lea SHIFT_MASK+16(%rip), %r12
458 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
459 # (r13 is the number of bytes in plaintext mod 16)
461 # get the appropriate shuffle mask
462 vmovdqu (%r12), %xmm2
463 # shift right 16-r13 bytes
464 vpshufb %xmm2, %xmm1, %xmm1
469 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
470 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
471 # mask out top 16-r13 bytes of xmm9
472 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
473 vpand %xmm1, %xmm2, %xmm2
474 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
475 vpxor %xmm2, %xmm14, %xmm14
477 vmovdqu %xmm14, AadHash(arg2)
479 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
480 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
481 # mask out top 16-r13 bytes of xmm9
482 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
483 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
484 vpxor %xmm9, %xmm14, %xmm14
486 vmovdqu %xmm14, AadHash(arg2)
487 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
491 #############################
495 jle _less_than_8_bytes_left\@
497 mov %rax, (arg3 , %r11)
499 vpsrldq $8, %xmm9, %xmm9
503 _less_than_8_bytes_left\@:
504 movb %al, (arg3 , %r11)
508 jne _less_than_8_bytes_left\@
509 #############################
511 _multiple_of_16_bytes\@:
# GCM_COMPLETE: finishes the GHASH of any outstanding partial block and computes the tag
# Output: Authentication Tag (AUTH_TAG)
517 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
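# The remaining GHASH input is the length block len(A)||len(C) (both in
# bits); the final hash is byte-swapped and XORed with E(K, Y0), the
# encrypted original counter saved in OrigIV, to form the tag, of which
# the first 8, 12 or 16 bytes are written to AUTH_TAG.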
518 .macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
519 vmovdqu AadHash(arg2), %xmm14
520 vmovdqu HashKey(arg2), %xmm13
522 mov PBlockLen(arg2), %r12
526 #GHASH computation for the last <16 Byte block
527 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
530 mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes)
531 shl $3, %r12 # convert into number of bits
532 vmovd %r12d, %xmm15 # len(A) in xmm15
534 mov InLen(arg2), %r12
shl $3, %r12 # len(C) in bits (*8)
537 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
538 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
540 vpxor %xmm15, %xmm14, %xmm14
541 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
542 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
544 vmovdqu OrigIV(arg2), %xmm9
546 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0)
548 vpxor %xmm14, %xmm9, %xmm9
553 mov \AUTH_TAG, %r10 # r10 = authTag
554 mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len
567 vpsrldq $8, %xmm9, %xmm9
575 vpsrldq $4, %xmm9, %xmm9
592 vmovdqu %xmm9, (%r10)
597 .macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
599 mov \AAD, %r10 # r10 = AAD
600 mov \AADLEN, %r12 # r12 = aadLen
611 vpshufb SHUF_MASK(%rip), \T7, \T7
613 \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
618 jge _get_AAD_blocks\@
625 /* read the last <16B of AAD. since we have at least 4B of
626 data right after the AAD (the ICV, and maybe some CT), we can
627 read 4B/8B blocks safely, and then get rid of the extra stuff */
645 vpslldq $12, \T1, \T1
649 /* finalize: shift out the extra bytes we read, and align
650 left. since pslldq can only shift by an immediate, we use
651 vpshufb and an array of shuffle masks */
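# Each aad_shift_arr entry above is a vpshufb control mask: the index bytes
# keep the wanted AAD bytes, while the 0xff lanes (high bit set) force the
# corresponding destination bytes to zero, discarding whatever was over-read.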
654 vmovdqu aad_shift_arr(%r11), \T1
655 vpshufb \T1, \T7, \T7
656 _get_AAD_rest_final\@:
657 vpshufb SHUF_MASK(%rip), \T7, \T7
659 \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
662 vmovdqu \T7, AadHash(arg2)
665 .macro INIT GHASH_MUL PRECOMPUTE
667 mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
669 mov %r11, InLen(arg2) # ctx_data.in_length = 0
671 mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
672 mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
675 movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
677 vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
678 movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
680 vmovdqu (arg4), %xmm6 # xmm6 = HashKey
682 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
683 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
685 vpsllq $1, %xmm6, %xmm6
686 vpsrlq $63, %xmm2, %xmm2
688 vpslldq $8, %xmm2, %xmm2
689 vpsrldq $8, %xmm1, %xmm1
690 vpor %xmm2, %xmm6, %xmm6
692 vpshufd $0b00100100, %xmm1, %xmm2
693 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
694 vpand POLY(%rip), %xmm2, %xmm2
695 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
696 #######################################################################
697 vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
699 CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
701 \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
705 # Reads DLEN bytes starting at DPTR and stores in XMMDst
706 # where 0 < DLEN < 16
707 # Clobbers %rax, DLEN
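# If DLEN >= 8, the low quadword is loaded in one go and only the remaining
# DLEN-8 bytes are read one at a time, walking backwards from the last byte;
# for DLEN < 8 all bytes are read that way. No byte past DPTR+DLEN-1 is
# touched.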
708 .macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
709 vpxor \XMMDst, \XMMDst, \XMMDst
714 vpinsrq $0, %rax, \XMMDst, \XMMDst
716 jz _done_read_partial_block_\@
720 mov 7(\DPTR, \DLEN, 1), %al
722 jnz _read_next_byte_\@
723 vpinsrq $1, %rax, \XMMDst, \XMMDst
724 jmp _done_read_partial_block_\@
727 _read_next_byte_lt8_\@:
729 mov -1(\DPTR, \DLEN, 1), %al
731 jnz _read_next_byte_lt8_\@
732 vpinsrq $0, %rax, \XMMDst, \XMMDst
733 _done_read_partial_block_\@:
736 # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
737 # between update calls.
738 # Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
740 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
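# A partial block exists when the previous update call ended mid-block:
# PBlockLen bytes of the block are already consumed and the matching
# keystream block E(K, Yn) is saved in PBlockEncKey. New input is XORed
# against the unused keystream bytes; once 16 bytes have accumulated, the
# completed block is folded into AAD_HASH, otherwise PBlockLen just grows.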
741 .macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
743 mov PBlockLen(arg2), %r13
745 je _partial_block_done_\@ # Leave Macro if no partial blocks
746 # Read in input data without over reading
747 cmp $16, \PLAIN_CYPH_LEN
748 jl _fewer_than_16_bytes_\@
749 vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
752 _fewer_than_16_bytes_\@:
753 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
754 mov \PLAIN_CYPH_LEN, %r12
755 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
757 mov PBlockLen(arg2), %r13
759 _data_read_\@: # Finished reading in data
761 vmovdqu PBlockEncKey(arg2), %xmm9
762 vmovdqu HashKey(arg2), %xmm13
764 lea SHIFT_MASK(%rip), %r12
766 # adjust the shuffle mask pointer to be able to shift r13 bytes
# (r13 is the number of bytes already in the partial block)
769 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
770 vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes
774 pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
776 mov \PLAIN_CYPH_LEN, %r10
778 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
# Determine if partial block is not being filled and
781 # shift mask accordingly
782 jge _no_extra_mask_1_\@
786 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
787 # get the appropriate mask to mask out bottom r13 bytes of xmm9
788 vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9
790 vpand %xmm1, %xmm3, %xmm3
791 vmovdqa SHUF_MASK(%rip), %xmm10
792 vpshufb %xmm10, %xmm3, %xmm3
793 vpshufb %xmm2, %xmm3, %xmm3
794 vpxor %xmm3, \AAD_HASH, \AAD_HASH
797 jl _partial_incomplete_1_\@
799 # GHASH computation for the last <16 Byte block
800 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
803 mov %rax, PBlockLen(arg2)
805 _partial_incomplete_1_\@:
806 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
808 vmovdqu \AAD_HASH, AadHash(arg2)
810 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
812 mov \PLAIN_CYPH_LEN, %r10
814 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
# Determine if partial block is not being filled and
817 # shift mask accordingly
818 jge _no_extra_mask_2_\@
822 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
823 # get the appropriate mask to mask out bottom r13 bytes of xmm9
824 vpand %xmm1, %xmm9, %xmm9
826 vmovdqa SHUF_MASK(%rip), %xmm1
827 vpshufb %xmm1, %xmm9, %xmm9
828 vpshufb %xmm2, %xmm9, %xmm9
829 vpxor %xmm9, \AAD_HASH, \AAD_HASH
832 jl _partial_incomplete_2_\@
834 # GHASH computation for the last <16 Byte block
835 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
838 mov %rax, PBlockLen(arg2)
840 _partial_incomplete_2_\@:
841 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
843 vmovdqu \AAD_HASH, AadHash(arg2)
845 vmovdqa SHUF_MASK(%rip), %xmm10
846 # shuffle xmm9 back to output as ciphertext
847 vpshufb %xmm10, %xmm9, %xmm9
848 vpshufb %xmm2, %xmm9, %xmm9
850 # output encrypted Bytes
855 # Set r13 to be the number of bytes to write out
859 mov \PLAIN_CYPH_LEN, %r13
864 jle _less_than_8_bytes_left_\@
866 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
871 _less_than_8_bytes_left_\@:
872 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
876 jne _less_than_8_bytes_left_\@
877 _partial_block_done_\@:
878 .endm # PARTIAL_BLOCK
880 ###############################################################################
881 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
882 # Input: A and B (128-bits each, bit-reflected)
883 # Output: C = A*B*x mod poly, (i.e. >>1 )
884 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
885 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
886 ###############################################################################
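# This variant does the 128x128->256-bit carry-less multiply with three
# vpclmulqdq (Karatsuba) and then the two-phase shift/xor reduction;
# GHASH_MUL_AVX2 further below uses four vpclmulqdq plus a
# vpclmulqdq-based reduction against POLY2 instead.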
887 .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
889 vpshufd $0b01001110, \GH, \T2
890 vpshufd $0b01001110, \HK, \T3
891 vpxor \GH , \T2, \T2 # T2 = (a1+a0)
892 vpxor \HK , \T3, \T3 # T3 = (b1+b0)
894 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
895 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
896 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
898 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
900 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
901 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
903 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
905 #first phase of the reduction
906 vpslld $31, \GH, \T2 # packed right shifting << 31
907 vpslld $30, \GH, \T3 # packed right shifting shift << 30
908 vpslld $25, \GH, \T4 # packed right shifting shift << 25
910 vpxor \T3, \T2, \T2 # xor the shifted versions
913 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
915 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
916 vpxor \T2, \GH, \GH # first phase of the reduction complete
918 #second phase of the reduction
920 vpsrld $1,\GH, \T2 # packed left shifting >> 1
921 vpsrld $2,\GH, \T3 # packed left shifting >> 2
922 vpsrld $7,\GH, \T4 # packed left shifting >> 7
923 vpxor \T3, \T2, \T2 # xor the shifted versions
928 vpxor \T1, \GH, \GH # the result is in GH
933 .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
# HashKey_i_k holds XORed values of the low and high parts of HashKey^i
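# Powers H^1..H^8 let eight ciphertext blocks be folded into the hash with
# a single reduction, using
#   X_new = (X_old xor C1)*H^8 xor C2*H^7 xor ... xor C8*H^1   (mod poly)
# which is what the 8-block parallel loop computes.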
938 vpshufd $0b01001110, \T5, \T1
940 vmovdqu \T1, HashKey_k(arg2)
942 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
943 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
944 vpshufd $0b01001110, \T5, \T1
946 vmovdqu \T1, HashKey_2_k(arg2)
948 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
949 vmovdqu \T5, HashKey_3(arg2)
950 vpshufd $0b01001110, \T5, \T1
952 vmovdqu \T1, HashKey_3_k(arg2)
954 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
955 vmovdqu \T5, HashKey_4(arg2)
956 vpshufd $0b01001110, \T5, \T1
958 vmovdqu \T1, HashKey_4_k(arg2)
960 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
961 vmovdqu \T5, HashKey_5(arg2)
962 vpshufd $0b01001110, \T5, \T1
964 vmovdqu \T1, HashKey_5_k(arg2)
966 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
967 vmovdqu \T5, HashKey_6(arg2)
968 vpshufd $0b01001110, \T5, \T1
970 vmovdqu \T1, HashKey_6_k(arg2)
972 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
973 vmovdqu \T5, HashKey_7(arg2)
974 vpshufd $0b01001110, \T5, \T1
976 vmovdqu \T1, HashKey_7_k(arg2)
978 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
979 vmovdqu \T5, HashKey_8(arg2)
980 vpshufd $0b01001110, \T5, \T1
982 vmovdqu \T1, HashKey_8_k(arg2)
986 ## if a = number of total plaintext bytes
## num_initial_blocks = floor(a/16) mod 8
989 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
990 ## r10, r11, r12, rax are clobbered
991 ## arg1, arg3, arg4, r14 are used as a pointer only, not modified
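## The 0-7 leading blocks are encrypted and GHASHed one at a time; the macro
## then primes the main loop by encrypting the next 8 counter blocks
## (XMM1-XMM8) ahead of time, so GHASH_8_ENCRYPT_8_PARALLEL always has 8
## ciphertext blocks to hash while it encrypts the next 8.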
993 .macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
994 i = (8-\num_initial_blocks)
996 vmovdqu AadHash(arg2), reg_i
998 # start AES for num_initial_blocks blocks
999 vmovdqu CurCount(arg2), \CTR
1001 i = (9-\num_initial_blocks)
1003 .rep \num_initial_blocks
1004 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1006 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1011 vmovdqa (arg1), \T_key
1012 i = (9-\num_initial_blocks)
1014 .rep \num_initial_blocks
1015 vpxor \T_key, reg_i, reg_i
1023 vmovdqa 16*j(arg1), \T_key
1024 i = (9-\num_initial_blocks)
1026 .rep \num_initial_blocks
1027 vaesenc \T_key, reg_i, reg_i
1036 vmovdqa 16*j(arg1), \T_key
1037 i = (9-\num_initial_blocks)
1039 .rep \num_initial_blocks
1040 vaesenclast \T_key, reg_i, reg_i
1045 i = (9-\num_initial_blocks)
1047 .rep \num_initial_blocks
1048 vmovdqu (arg4, %r11), \T1
1049 vpxor \T1, reg_i, reg_i
1050 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks
1055 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1061 i = (8-\num_initial_blocks)
1062 j = (9-\num_initial_blocks)
1065 .rep \num_initial_blocks
1066 vpxor reg_i, reg_j, reg_j
1067 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1072 # XMM8 has the combined result here
1074 vmovdqa \XMM8, TMP1(%rsp)
1078 jl _initial_blocks_done\@ # no need for precomputed constants
1080 ###############################################################################
# Prepare 8 counter blocks and run them through AES-CTR, so the main
# 8-blocks-at-a-time loop starts with a full pipeline of ciphertext to hash
1082 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1084 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1086 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1088 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1090 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1092 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1094 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1096 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1098 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1100 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1102 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1104 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1106 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1108 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1110 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1112 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1114 vmovdqa (arg1), \T_key
1115 vpxor \T_key, \XMM1, \XMM1
1116 vpxor \T_key, \XMM2, \XMM2
1117 vpxor \T_key, \XMM3, \XMM3
1118 vpxor \T_key, \XMM4, \XMM4
1119 vpxor \T_key, \XMM5, \XMM5
1120 vpxor \T_key, \XMM6, \XMM6
1121 vpxor \T_key, \XMM7, \XMM7
1122 vpxor \T_key, \XMM8, \XMM8
1126 .rep \REP # do REP rounds
1127 vmovdqa 16*i(arg1), \T_key
1128 vaesenc \T_key, \XMM1, \XMM1
1129 vaesenc \T_key, \XMM2, \XMM2
1130 vaesenc \T_key, \XMM3, \XMM3
1131 vaesenc \T_key, \XMM4, \XMM4
1132 vaesenc \T_key, \XMM5, \XMM5
1133 vaesenc \T_key, \XMM6, \XMM6
1134 vaesenc \T_key, \XMM7, \XMM7
1135 vaesenc \T_key, \XMM8, \XMM8
1140 vmovdqa 16*i(arg1), \T_key
1141 vaesenclast \T_key, \XMM1, \XMM1
1142 vaesenclast \T_key, \XMM2, \XMM2
1143 vaesenclast \T_key, \XMM3, \XMM3
1144 vaesenclast \T_key, \XMM4, \XMM4
1145 vaesenclast \T_key, \XMM5, \XMM5
1146 vaesenclast \T_key, \XMM6, \XMM6
1147 vaesenclast \T_key, \XMM7, \XMM7
1148 vaesenclast \T_key, \XMM8, \XMM8
1150 vmovdqu (arg4, %r11), \T1
1151 vpxor \T1, \XMM1, \XMM1
1152 vmovdqu \XMM1, (arg3 , %r11)
1157 vmovdqu 16*1(arg4, %r11), \T1
1158 vpxor \T1, \XMM2, \XMM2
1159 vmovdqu \XMM2, 16*1(arg3 , %r11)
1164 vmovdqu 16*2(arg4, %r11), \T1
1165 vpxor \T1, \XMM3, \XMM3
1166 vmovdqu \XMM3, 16*2(arg3 , %r11)
1171 vmovdqu 16*3(arg4, %r11), \T1
1172 vpxor \T1, \XMM4, \XMM4
1173 vmovdqu \XMM4, 16*3(arg3 , %r11)
1178 vmovdqu 16*4(arg4, %r11), \T1
1179 vpxor \T1, \XMM5, \XMM5
1180 vmovdqu \XMM5, 16*4(arg3 , %r11)
1185 vmovdqu 16*5(arg4, %r11), \T1
1186 vpxor \T1, \XMM6, \XMM6
1187 vmovdqu \XMM6, 16*5(arg3 , %r11)
1192 vmovdqu 16*6(arg4, %r11), \T1
1193 vpxor \T1, \XMM7, \XMM7
1194 vmovdqu \XMM7, 16*6(arg3 , %r11)
1199 vmovdqu 16*7(arg4, %r11), \T1
1200 vpxor \T1, \XMM8, \XMM8
1201 vmovdqu \XMM8, 16*7(arg3 , %r11)
1208 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1209 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
1210 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1211 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1212 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1213 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1214 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1215 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1216 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1218 ###############################################################################
1220 _initial_blocks_done\@:
1224 # encrypt 8 blocks at a time
1225 # ghash the 8 previously encrypted ciphertext blocks
1226 # arg1, arg3, arg4 are used as pointers only, not modified
1227 # r11 is the data offset value
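# The GHASH of the previous 8 ciphertext blocks (saved in TMP1-TMP8) is
# interleaved with the AES rounds of the next 8 counter blocks: each batch
# of vaesenc is followed by one block's worth of vpclmulqdq work, so the
# multiplier latency is hidden behind the AES rounds.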
1228 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1231 vmovdqa \XMM2, TMP2(%rsp)
1232 vmovdqa \XMM3, TMP3(%rsp)
1233 vmovdqa \XMM4, TMP4(%rsp)
1234 vmovdqa \XMM5, TMP5(%rsp)
1235 vmovdqa \XMM6, TMP6(%rsp)
1236 vmovdqa \XMM7, TMP7(%rsp)
1237 vmovdqa \XMM8, TMP8(%rsp)
1239 .if \loop_idx == in_order
1240 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
1241 vpaddd ONE(%rip), \XMM1, \XMM2
1242 vpaddd ONE(%rip), \XMM2, \XMM3
1243 vpaddd ONE(%rip), \XMM3, \XMM4
1244 vpaddd ONE(%rip), \XMM4, \XMM5
1245 vpaddd ONE(%rip), \XMM5, \XMM6
1246 vpaddd ONE(%rip), \XMM6, \XMM7
1247 vpaddd ONE(%rip), \XMM7, \XMM8
1250 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1251 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1252 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1253 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1254 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1255 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1256 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1257 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1259 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
1260 vpaddd ONEf(%rip), \XMM1, \XMM2
1261 vpaddd ONEf(%rip), \XMM2, \XMM3
1262 vpaddd ONEf(%rip), \XMM3, \XMM4
1263 vpaddd ONEf(%rip), \XMM4, \XMM5
1264 vpaddd ONEf(%rip), \XMM5, \XMM6
1265 vpaddd ONEf(%rip), \XMM6, \XMM7
1266 vpaddd ONEf(%rip), \XMM7, \XMM8
1271 #######################################################################
1274 vpxor \T1, \XMM1, \XMM1
1275 vpxor \T1, \XMM2, \XMM2
1276 vpxor \T1, \XMM3, \XMM3
1277 vpxor \T1, \XMM4, \XMM4
1278 vpxor \T1, \XMM5, \XMM5
1279 vpxor \T1, \XMM6, \XMM6
1280 vpxor \T1, \XMM7, \XMM7
1281 vpxor \T1, \XMM8, \XMM8
1283 #######################################################################
1289 vmovdqu 16*1(arg1), \T1
1290 vaesenc \T1, \XMM1, \XMM1
1291 vaesenc \T1, \XMM2, \XMM2
1292 vaesenc \T1, \XMM3, \XMM3
1293 vaesenc \T1, \XMM4, \XMM4
1294 vaesenc \T1, \XMM5, \XMM5
1295 vaesenc \T1, \XMM6, \XMM6
1296 vaesenc \T1, \XMM7, \XMM7
1297 vaesenc \T1, \XMM8, \XMM8
1299 vmovdqu 16*2(arg1), \T1
1300 vaesenc \T1, \XMM1, \XMM1
1301 vaesenc \T1, \XMM2, \XMM2
1302 vaesenc \T1, \XMM3, \XMM3
1303 vaesenc \T1, \XMM4, \XMM4
1304 vaesenc \T1, \XMM5, \XMM5
1305 vaesenc \T1, \XMM6, \XMM6
1306 vaesenc \T1, \XMM7, \XMM7
1307 vaesenc \T1, \XMM8, \XMM8
1310 #######################################################################
1312 vmovdqu HashKey_8(arg2), \T5
1313 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
1314 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
1316 vpshufd $0b01001110, \T2, \T6
1319 vmovdqu HashKey_8_k(arg2), \T5
1320 vpclmulqdq $0x00, \T5, \T6, \T6
1322 vmovdqu 16*3(arg1), \T1
1323 vaesenc \T1, \XMM1, \XMM1
1324 vaesenc \T1, \XMM2, \XMM2
1325 vaesenc \T1, \XMM3, \XMM3
1326 vaesenc \T1, \XMM4, \XMM4
1327 vaesenc \T1, \XMM5, \XMM5
1328 vaesenc \T1, \XMM6, \XMM6
1329 vaesenc \T1, \XMM7, \XMM7
1330 vaesenc \T1, \XMM8, \XMM8
1332 vmovdqa TMP2(%rsp), \T1
1333 vmovdqu HashKey_7(arg2), \T5
1334 vpclmulqdq $0x11, \T5, \T1, \T3
1336 vpclmulqdq $0x00, \T5, \T1, \T3
1339 vpshufd $0b01001110, \T1, \T3
1341 vmovdqu HashKey_7_k(arg2), \T5
1342 vpclmulqdq $0x10, \T5, \T3, \T3
1345 vmovdqu 16*4(arg1), \T1
1346 vaesenc \T1, \XMM1, \XMM1
1347 vaesenc \T1, \XMM2, \XMM2
1348 vaesenc \T1, \XMM3, \XMM3
1349 vaesenc \T1, \XMM4, \XMM4
1350 vaesenc \T1, \XMM5, \XMM5
1351 vaesenc \T1, \XMM6, \XMM6
1352 vaesenc \T1, \XMM7, \XMM7
1353 vaesenc \T1, \XMM8, \XMM8
1355 #######################################################################
1357 vmovdqa TMP3(%rsp), \T1
1358 vmovdqu HashKey_6(arg2), \T5
1359 vpclmulqdq $0x11, \T5, \T1, \T3
1361 vpclmulqdq $0x00, \T5, \T1, \T3
1364 vpshufd $0b01001110, \T1, \T3
1366 vmovdqu HashKey_6_k(arg2), \T5
1367 vpclmulqdq $0x10, \T5, \T3, \T3
1370 vmovdqu 16*5(arg1), \T1
1371 vaesenc \T1, \XMM1, \XMM1
1372 vaesenc \T1, \XMM2, \XMM2
1373 vaesenc \T1, \XMM3, \XMM3
1374 vaesenc \T1, \XMM4, \XMM4
1375 vaesenc \T1, \XMM5, \XMM5
1376 vaesenc \T1, \XMM6, \XMM6
1377 vaesenc \T1, \XMM7, \XMM7
1378 vaesenc \T1, \XMM8, \XMM8
1380 vmovdqa TMP4(%rsp), \T1
1381 vmovdqu HashKey_5(arg2), \T5
1382 vpclmulqdq $0x11, \T5, \T1, \T3
1384 vpclmulqdq $0x00, \T5, \T1, \T3
1387 vpshufd $0b01001110, \T1, \T3
1389 vmovdqu HashKey_5_k(arg2), \T5
1390 vpclmulqdq $0x10, \T5, \T3, \T3
1393 vmovdqu 16*6(arg1), \T1
1394 vaesenc \T1, \XMM1, \XMM1
1395 vaesenc \T1, \XMM2, \XMM2
1396 vaesenc \T1, \XMM3, \XMM3
1397 vaesenc \T1, \XMM4, \XMM4
1398 vaesenc \T1, \XMM5, \XMM5
1399 vaesenc \T1, \XMM6, \XMM6
1400 vaesenc \T1, \XMM7, \XMM7
1401 vaesenc \T1, \XMM8, \XMM8
1404 vmovdqa TMP5(%rsp), \T1
1405 vmovdqu HashKey_4(arg2), \T5
1406 vpclmulqdq $0x11, \T5, \T1, \T3
1408 vpclmulqdq $0x00, \T5, \T1, \T3
1411 vpshufd $0b01001110, \T1, \T3
1413 vmovdqu HashKey_4_k(arg2), \T5
1414 vpclmulqdq $0x10, \T5, \T3, \T3
1417 vmovdqu 16*7(arg1), \T1
1418 vaesenc \T1, \XMM1, \XMM1
1419 vaesenc \T1, \XMM2, \XMM2
1420 vaesenc \T1, \XMM3, \XMM3
1421 vaesenc \T1, \XMM4, \XMM4
1422 vaesenc \T1, \XMM5, \XMM5
1423 vaesenc \T1, \XMM6, \XMM6
1424 vaesenc \T1, \XMM7, \XMM7
1425 vaesenc \T1, \XMM8, \XMM8
1427 vmovdqa TMP6(%rsp), \T1
1428 vmovdqu HashKey_3(arg2), \T5
1429 vpclmulqdq $0x11, \T5, \T1, \T3
1431 vpclmulqdq $0x00, \T5, \T1, \T3
1434 vpshufd $0b01001110, \T1, \T3
1436 vmovdqu HashKey_3_k(arg2), \T5
1437 vpclmulqdq $0x10, \T5, \T3, \T3
1441 vmovdqu 16*8(arg1), \T1
1442 vaesenc \T1, \XMM1, \XMM1
1443 vaesenc \T1, \XMM2, \XMM2
1444 vaesenc \T1, \XMM3, \XMM3
1445 vaesenc \T1, \XMM4, \XMM4
1446 vaesenc \T1, \XMM5, \XMM5
1447 vaesenc \T1, \XMM6, \XMM6
1448 vaesenc \T1, \XMM7, \XMM7
1449 vaesenc \T1, \XMM8, \XMM8
1451 vmovdqa TMP7(%rsp), \T1
1452 vmovdqu HashKey_2(arg2), \T5
1453 vpclmulqdq $0x11, \T5, \T1, \T3
1455 vpclmulqdq $0x00, \T5, \T1, \T3
1458 vpshufd $0b01001110, \T1, \T3
1460 vmovdqu HashKey_2_k(arg2), \T5
1461 vpclmulqdq $0x10, \T5, \T3, \T3
1464 #######################################################################
1466 vmovdqu 16*9(arg1), \T5
1467 vaesenc \T5, \XMM1, \XMM1
1468 vaesenc \T5, \XMM2, \XMM2
1469 vaesenc \T5, \XMM3, \XMM3
1470 vaesenc \T5, \XMM4, \XMM4
1471 vaesenc \T5, \XMM5, \XMM5
1472 vaesenc \T5, \XMM6, \XMM6
1473 vaesenc \T5, \XMM7, \XMM7
1474 vaesenc \T5, \XMM8, \XMM8
1476 vmovdqa TMP8(%rsp), \T1
1477 vmovdqu HashKey(arg2), \T5
1478 vpclmulqdq $0x11, \T5, \T1, \T3
1480 vpclmulqdq $0x00, \T5, \T1, \T3
1483 vpshufd $0b01001110, \T1, \T3
1485 vmovdqu HashKey_k(arg2), \T5
1486 vpclmulqdq $0x10, \T5, \T3, \T3
1492 vmovdqu 16*10(arg1), \T5
1498 vaesenc \T5, \XMM1, \XMM1
1499 vaesenc \T5, \XMM2, \XMM2
1500 vaesenc \T5, \XMM3, \XMM3
1501 vaesenc \T5, \XMM4, \XMM4
1502 vaesenc \T5, \XMM5, \XMM5
1503 vaesenc \T5, \XMM6, \XMM6
1504 vaesenc \T5, \XMM7, \XMM7
1505 vaesenc \T5, \XMM8, \XMM8
1507 vmovdqu 16*i(arg1), \T5
1516 vpxor 16*i(arg4, %r11), \T5, \T2
1518 vaesenclast \T2, reg_j, reg_j
1520 vaesenclast \T2, reg_j, \T3
1521 vmovdqu 16*i(arg4, %r11), reg_j
1522 vmovdqu \T3, 16*i(arg3, %r11)
1528 #######################################################################
1531 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
1534 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
1538 #######################################################################
1539 #first phase of the reduction
1540 #######################################################################
1541 vpslld $31, \T7, \T2 # packed right shifting << 31
1542 vpslld $30, \T7, \T3 # packed right shifting shift << 30
1543 vpslld $25, \T7, \T4 # packed right shifting shift << 25
1545 vpxor \T3, \T2, \T2 # xor the shifted versions
1548 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1550 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1551 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1552 #######################################################################
1554 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
1555 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
1556 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
1557 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
1558 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
1559 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
1560 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
1561 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
1564 #######################################################################
1565 #second phase of the reduction
1566 vpsrld $1, \T7, \T2 # packed left shifting >> 1
1567 vpsrld $2, \T7, \T3 # packed left shifting >> 2
1568 vpsrld $7, \T7, \T4 # packed left shifting >> 7
1569 vpxor \T3, \T2, \T2 # xor the shifted versions
1574 vpxor \T7, \T6, \T6 # the result is in T6
1575 #######################################################################
1577 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1578 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1579 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1580 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1581 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1582 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1583 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1584 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1587 vpxor \T6, \XMM1, \XMM1
# GHASH the last 8 ciphertext blocks.
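# Each block XMM1..XMM8 is multiplied by its matching key power (H^8 down to
# H^1); the high, low and middle partial products are accumulated (in T6, T7
# and XMM1), combined, and reduced once, leaving the result in T6.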
1595 .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1600 vpshufd $0b01001110, \XMM1, \T2
1601 vpxor \XMM1, \T2, \T2
1602 vmovdqu HashKey_8(arg2), \T5
1603 vpclmulqdq $0x11, \T5, \XMM1, \T6
1604 vpclmulqdq $0x00, \T5, \XMM1, \T7
1606 vmovdqu HashKey_8_k(arg2), \T3
1607 vpclmulqdq $0x00, \T3, \T2, \XMM1
1609 ######################
1611 vpshufd $0b01001110, \XMM2, \T2
1612 vpxor \XMM2, \T2, \T2
1613 vmovdqu HashKey_7(arg2), \T5
1614 vpclmulqdq $0x11, \T5, \XMM2, \T4
1617 vpclmulqdq $0x00, \T5, \XMM2, \T4
1620 vmovdqu HashKey_7_k(arg2), \T3
1621 vpclmulqdq $0x00, \T3, \T2, \T2
1622 vpxor \T2, \XMM1, \XMM1
1624 ######################
1626 vpshufd $0b01001110, \XMM3, \T2
1627 vpxor \XMM3, \T2, \T2
1628 vmovdqu HashKey_6(arg2), \T5
1629 vpclmulqdq $0x11, \T5, \XMM3, \T4
1632 vpclmulqdq $0x00, \T5, \XMM3, \T4
1635 vmovdqu HashKey_6_k(arg2), \T3
1636 vpclmulqdq $0x00, \T3, \T2, \T2
1637 vpxor \T2, \XMM1, \XMM1
1639 ######################
1641 vpshufd $0b01001110, \XMM4, \T2
1642 vpxor \XMM4, \T2, \T2
1643 vmovdqu HashKey_5(arg2), \T5
1644 vpclmulqdq $0x11, \T5, \XMM4, \T4
1647 vpclmulqdq $0x00, \T5, \XMM4, \T4
1650 vmovdqu HashKey_5_k(arg2), \T3
1651 vpclmulqdq $0x00, \T3, \T2, \T2
1652 vpxor \T2, \XMM1, \XMM1
1654 ######################
1656 vpshufd $0b01001110, \XMM5, \T2
1657 vpxor \XMM5, \T2, \T2
1658 vmovdqu HashKey_4(arg2), \T5
1659 vpclmulqdq $0x11, \T5, \XMM5, \T4
1662 vpclmulqdq $0x00, \T5, \XMM5, \T4
1665 vmovdqu HashKey_4_k(arg2), \T3
1666 vpclmulqdq $0x00, \T3, \T2, \T2
1667 vpxor \T2, \XMM1, \XMM1
1669 ######################
1671 vpshufd $0b01001110, \XMM6, \T2
1672 vpxor \XMM6, \T2, \T2
1673 vmovdqu HashKey_3(arg2), \T5
1674 vpclmulqdq $0x11, \T5, \XMM6, \T4
1677 vpclmulqdq $0x00, \T5, \XMM6, \T4
1680 vmovdqu HashKey_3_k(arg2), \T3
1681 vpclmulqdq $0x00, \T3, \T2, \T2
1682 vpxor \T2, \XMM1, \XMM1
1684 ######################
1686 vpshufd $0b01001110, \XMM7, \T2
1687 vpxor \XMM7, \T2, \T2
1688 vmovdqu HashKey_2(arg2), \T5
1689 vpclmulqdq $0x11, \T5, \XMM7, \T4
1692 vpclmulqdq $0x00, \T5, \XMM7, \T4
1695 vmovdqu HashKey_2_k(arg2), \T3
1696 vpclmulqdq $0x00, \T3, \T2, \T2
1697 vpxor \T2, \XMM1, \XMM1
1699 ######################
1701 vpshufd $0b01001110, \XMM8, \T2
1702 vpxor \XMM8, \T2, \T2
1703 vmovdqu HashKey(arg2), \T5
1704 vpclmulqdq $0x11, \T5, \XMM8, \T4
1707 vpclmulqdq $0x00, \T5, \XMM8, \T4
1710 vmovdqu HashKey_k(arg2), \T3
1711 vpclmulqdq $0x00, \T3, \T2, \T2
1713 vpxor \T2, \XMM1, \XMM1
1714 vpxor \T6, \XMM1, \XMM1
1715 vpxor \T7, \XMM1, \T2
1720 vpslldq $8, \T2, \T4
1721 vpsrldq $8, \T2, \T2
1724 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
1725 # the accumulated carry-less multiplications
1727 #######################################################################
1728 #first phase of the reduction
1729 vpslld $31, \T7, \T2 # packed right shifting << 31
1730 vpslld $30, \T7, \T3 # packed right shifting shift << 30
1731 vpslld $25, \T7, \T4 # packed right shifting shift << 25
1733 vpxor \T3, \T2, \T2 # xor the shifted versions
1736 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1738 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1739 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1740 #######################################################################
1743 #second phase of the reduction
1744 vpsrld $1, \T7, \T2 # packed left shifting >> 1
1745 vpsrld $2, \T7, \T3 # packed left shifting >> 2
1746 vpsrld $7, \T7, \T4 # packed left shifting >> 7
1747 vpxor \T3, \T2, \T2 # xor the shifted versions
1752 vpxor \T7, \T6, \T6 # the result is in T6
1756 #############################################################
#void aesni_gcm_init_avx_gen2
1758 # (gcm_data *my_ctx_data,
1759 # gcm_context_data *data,
# u8 *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1761 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1762 # (from Security Association) concatenated with 8 byte
1763 # Initialisation Vector (from IPSec ESP Payload)
1764 # concatenated with 0x00000001. 16-byte aligned pointer. */
1765 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1766 # u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1767 #############################################################
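# Typical call sequence for one GCM operation:
#   aesni_gcm_init_avx_gen2()        once (IV, hash subkey, AAD)
#   aesni_gcm_enc_update_avx_gen2()  any number of times (or dec_update)
#   aesni_gcm_finalize_avx_gen2()    once, to emit the authentication tag
# All state carried between calls lives in the gcm_context_data block.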
1768 SYM_FUNC_START(aesni_gcm_init_avx_gen2)
1770 INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
1773 SYM_FUNC_END(aesni_gcm_init_avx_gen2)
1775 ###############################################################################
1776 #void aesni_gcm_enc_update_avx_gen2(
1777 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1778 # gcm_context_data *data,
1779 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1780 # const u8 *in, /* Plaintext input */
1781 # u64 plaintext_len) /* Length of data in Bytes for encryption. */
1782 ###############################################################################
1783 SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
1787 je key_256_enc_update
1789 je key_128_enc_update
1791 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
1795 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
1799 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
1802 SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
1804 ###############################################################################
1805 #void aesni_gcm_dec_update_avx_gen2(
1806 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1807 # gcm_context_data *data,
1808 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1809 # const u8 *in, /* Ciphertext input */
# u64 plaintext_len) /* Length of data in Bytes for decryption. */
1811 ###############################################################################
1812 SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
1816 je key_256_dec_update
1818 je key_128_dec_update
1820 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
1824 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
1828 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
1831 SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
1833 ###############################################################################
1834 #void aesni_gcm_finalize_avx_gen2(
1835 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1836 # gcm_context_data *data,
1837 # u8 *auth_tag, /* Authenticated Tag output. */
# u64 auth_tag_len) /* Authenticated Tag Length in bytes.
1839 # Valid values are 16 (most likely), 12 or 8. */
1840 ###############################################################################
1841 SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
1849 GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
1853 GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
1857 GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
1860 SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
1862 ###############################################################################
1863 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1864 # Input: A and B (128-bits each, bit-reflected)
1865 # Output: C = A*B*x mod poly, (i.e. >>1 )
1866 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1867 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1868 ###############################################################################
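# Unlike GHASH_MUL_AVX, this version computes all four 64x64 partial
# products directly (no Karatsuba) and reduces with two further vpclmulqdq
# against the POLY2 constant, replacing the shift/xor reduction chain.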
1869 .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1871 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
1872 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
1873 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
1874 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
1878 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
1879 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
1884 #######################################################################
1885 #first phase of the reduction
1886 vmovdqa POLY2(%rip), \T3
1888 vpclmulqdq $0x01, \GH, \T3, \T2
1889 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1891 vpxor \T2, \GH, \GH # first phase of the reduction complete
1892 #######################################################################
1893 #second phase of the reduction
1894 vpclmulqdq $0x00, \GH, \T3, \T2
1895 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1897 vpclmulqdq $0x10, \GH, \T3, \GH
1898 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1900 vpxor \T2, \GH, \GH # second phase of the reduction complete
1901 #######################################################################
1902 vpxor \T1, \GH, \GH # the result is in GH
1907 .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
# HashKey_i_k holds XORed values of the low and high parts of HashKey^i
1911 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1912 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
1914 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1915 vmovdqu \T5, HashKey_3(arg2)
1917 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1918 vmovdqu \T5, HashKey_4(arg2)
1920 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1921 vmovdqu \T5, HashKey_5(arg2)
1923 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1924 vmovdqu \T5, HashKey_6(arg2)
1926 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1927 vmovdqu \T5, HashKey_7(arg2)
1929 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
1930 vmovdqu \T5, HashKey_8(arg2)
1934 ## if a = number of total plaintext bytes
## num_initial_blocks = floor(a/16) mod 8
1937 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1938 ## r10, r11, r12, rax are clobbered
1939 ## arg1, arg3, arg4, r14 are used as a pointer only, not modified
1941 .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1942 i = (8-\num_initial_blocks)
1944 vmovdqu AadHash(arg2), reg_i
1946 # start AES for num_initial_blocks blocks
1947 vmovdqu CurCount(arg2), \CTR
1949 i = (9-\num_initial_blocks)
1951 .rep \num_initial_blocks
1952 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1954 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1959 vmovdqa (arg1), \T_key
1960 i = (9-\num_initial_blocks)
1962 .rep \num_initial_blocks
1963 vpxor \T_key, reg_i, reg_i
1971 vmovdqa 16*j(arg1), \T_key
1972 i = (9-\num_initial_blocks)
1974 .rep \num_initial_blocks
1975 vaesenc \T_key, reg_i, reg_i
1985 vmovdqa 16*j(arg1), \T_key
1986 i = (9-\num_initial_blocks)
1988 .rep \num_initial_blocks
1989 vaesenclast \T_key, reg_i, reg_i
1994 i = (9-\num_initial_blocks)
1996 .rep \num_initial_blocks
1997 vmovdqu (arg4, %r11), \T1
1998 vpxor \T1, reg_i, reg_i
1999 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for
2000 # num_initial_blocks blocks
2005 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
2011 i = (8-\num_initial_blocks)
2012 j = (9-\num_initial_blocks)
2015 .rep \num_initial_blocks
2016 vpxor reg_i, reg_j, reg_j
2017 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
2022 # XMM8 has the combined result here
2024 vmovdqa \XMM8, TMP1(%rsp)
2028 jl _initial_blocks_done\@ # no need for precomputed constants
2030 ###############################################################################
# Prepare 8 counter blocks and run them through AES-CTR, so the main
# 8-blocks-at-a-time loop starts with a full pipeline of ciphertext to hash
2032 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2034 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2036 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2038 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2040 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2042 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2044 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2046 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2048 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2050 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2052 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2054 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2056 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2058 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2060 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2062 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2064 vmovdqa (arg1), \T_key
2065 vpxor \T_key, \XMM1, \XMM1
2066 vpxor \T_key, \XMM2, \XMM2
2067 vpxor \T_key, \XMM3, \XMM3
2068 vpxor \T_key, \XMM4, \XMM4
2069 vpxor \T_key, \XMM5, \XMM5
2070 vpxor \T_key, \XMM6, \XMM6
2071 vpxor \T_key, \XMM7, \XMM7
2072 vpxor \T_key, \XMM8, \XMM8
2076 .rep \REP # do REP rounds
2077 vmovdqa 16*i(arg1), \T_key
2078 vaesenc \T_key, \XMM1, \XMM1
2079 vaesenc \T_key, \XMM2, \XMM2
2080 vaesenc \T_key, \XMM3, \XMM3
2081 vaesenc \T_key, \XMM4, \XMM4
2082 vaesenc \T_key, \XMM5, \XMM5
2083 vaesenc \T_key, \XMM6, \XMM6
2084 vaesenc \T_key, \XMM7, \XMM7
2085 vaesenc \T_key, \XMM8, \XMM8
2091 vmovdqa 16*i(arg1), \T_key
2092 vaesenclast \T_key, \XMM1, \XMM1
2093 vaesenclast \T_key, \XMM2, \XMM2
2094 vaesenclast \T_key, \XMM3, \XMM3
2095 vaesenclast \T_key, \XMM4, \XMM4
2096 vaesenclast \T_key, \XMM5, \XMM5
2097 vaesenclast \T_key, \XMM6, \XMM6
2098 vaesenclast \T_key, \XMM7, \XMM7
2099 vaesenclast \T_key, \XMM8, \XMM8
2101 vmovdqu (arg4, %r11), \T1
2102 vpxor \T1, \XMM1, \XMM1
2103 vmovdqu \XMM1, (arg3 , %r11)
2108 vmovdqu 16*1(arg4, %r11), \T1
2109 vpxor \T1, \XMM2, \XMM2
2110 vmovdqu \XMM2, 16*1(arg3 , %r11)
2115 vmovdqu 16*2(arg4, %r11), \T1
2116 vpxor \T1, \XMM3, \XMM3
2117 vmovdqu \XMM3, 16*2(arg3 , %r11)
2122 vmovdqu 16*3(arg4, %r11), \T1
2123 vpxor \T1, \XMM4, \XMM4
2124 vmovdqu \XMM4, 16*3(arg3 , %r11)
2129 vmovdqu 16*4(arg4, %r11), \T1
2130 vpxor \T1, \XMM5, \XMM5
2131 vmovdqu \XMM5, 16*4(arg3 , %r11)
2136 vmovdqu 16*5(arg4, %r11), \T1
2137 vpxor \T1, \XMM6, \XMM6
2138 vmovdqu \XMM6, 16*5(arg3 , %r11)
2143 vmovdqu 16*6(arg4, %r11), \T1
2144 vpxor \T1, \XMM7, \XMM7
2145 vmovdqu \XMM7, 16*6(arg3 , %r11)
2150 vmovdqu 16*7(arg4, %r11), \T1
2151 vpxor \T1, \XMM8, \XMM8
2152 vmovdqu \XMM8, 16*7(arg3 , %r11)
2159 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2160 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
2161 # the corresponding ciphertext
2162 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2163 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2164 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2165 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2166 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2167 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2168 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2170 ###############################################################################
2172 _initial_blocks_done\@:
2179 # encrypt 8 blocks at a time
2180 # ghash the 8 previously encrypted ciphertext blocks
2181 # arg1, arg3, arg4 are used as pointers only, not modified
2182 # r11 is the data offset value
2183 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2186 vmovdqa \XMM2, TMP2(%rsp)
2187 vmovdqa \XMM3, TMP3(%rsp)
2188 vmovdqa \XMM4, TMP4(%rsp)
2189 vmovdqa \XMM5, TMP5(%rsp)
2190 vmovdqa \XMM6, TMP6(%rsp)
2191 vmovdqa \XMM7, TMP7(%rsp)
2192 vmovdqa \XMM8, TMP8(%rsp)
2194 .if \loop_idx == in_order
2195 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
2196 vpaddd ONE(%rip), \XMM1, \XMM2
2197 vpaddd ONE(%rip), \XMM2, \XMM3
2198 vpaddd ONE(%rip), \XMM3, \XMM4
2199 vpaddd ONE(%rip), \XMM4, \XMM5
2200 vpaddd ONE(%rip), \XMM5, \XMM6
2201 vpaddd ONE(%rip), \XMM6, \XMM7
2202 vpaddd ONE(%rip), \XMM7, \XMM8
2205 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2206 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2207 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2208 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2209 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2210 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2211 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2212 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2214 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
2215 vpaddd ONEf(%rip), \XMM1, \XMM2
2216 vpaddd ONEf(%rip), \XMM2, \XMM3
2217 vpaddd ONEf(%rip), \XMM3, \XMM4
2218 vpaddd ONEf(%rip), \XMM4, \XMM5
2219 vpaddd ONEf(%rip), \XMM5, \XMM6
2220 vpaddd ONEf(%rip), \XMM6, \XMM7
2221 vpaddd ONEf(%rip), \XMM7, \XMM8
        #######################################################################

        vmovdqu (arg1), \T1                     # round 0 (whitening) key
        vpxor   \T1, \XMM1, \XMM1
        vpxor   \T1, \XMM2, \XMM2
        vpxor   \T1, \XMM3, \XMM3
        vpxor   \T1, \XMM4, \XMM4
        vpxor   \T1, \XMM5, \XMM5
        vpxor   \T1, \XMM6, \XMM6
        vpxor   \T1, \XMM7, \XMM7
        vpxor   \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqu 16*1(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqu 16*2(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqu         HashKey_8(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T2, \T4    # T4 = a1*b1
        vpclmulqdq      $0x00, \T5, \T2, \T7    # T7 = a0*b0
        vpclmulqdq      $0x01, \T5, \T2, \T6    # T6 = a1*b0
        vpclmulqdq      $0x10, \T5, \T2, \T5    # T5 = a0*b1
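        # Each 128x128-bit carry-less multiply is built from four 64x64-bit
        # VPCLMULQDQ products.  With a = a1*x^64 + a0 and b = b1*x^64 + b0:
        #
        #   a*b = a1*b1*x^128 + (a1*b0 + a0*b1)*x^64 + a0*b0
        #
        # For this first block T4/T7 receive the high/low products and T6/T5
        # the two cross products; the remaining seven blocks are accumulated
        # into the same registers, and the combined middle term is later split
        # across the high and low halves before the modular reduction.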
        vmovdqu 16*3(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP2(%rsp), \T1
        vmovdqu         HashKey_7(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpclmulqdq      $0x10, \T5, \T1, \T3

        vmovdqu 16*4(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqa         TMP3(%rsp), \T1
        vmovdqu         HashKey_6(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpclmulqdq      $0x10, \T5, \T1, \T3

        vmovdqu 16*5(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP4(%rsp), \T1
        vmovdqu         HashKey_5(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpclmulqdq      $0x10, \T5, \T1, \T3

        vmovdqu 16*6(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP5(%rsp), \T1
        vmovdqu         HashKey_4(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpclmulqdq      $0x10, \T5, \T1, \T3

        vmovdqu 16*7(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP6(%rsp), \T1
        vmovdqu         HashKey_3(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpclmulqdq      $0x10, \T5, \T1, \T3

        vmovdqu 16*8(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP7(%rsp), \T1
        vmovdqu         HashKey_2(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpclmulqdq      $0x10, \T5, \T1, \T3

        #######################################################################

        vmovdqu 16*9(arg1), \T5
        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8

        vmovdqa         TMP8(%rsp), \T1
        vmovdqu         HashKey(arg2), \T5

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpclmulqdq      $0x11, \T5, \T1, \T3

        vmovdqu 16*10(arg1), \T5

        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8
        # i and j are assembler counters stepped with setreg inside .rep loops;
        # reg_j selects the j-th block register (\XMM1 .. \XMM8).
        vmovdqu 16*i(arg1), \T5                 # next round key; the last one loaded feeds vaesenclast

        vpxor   16*i(arg4, %r11), \T5, \T2      # fold input block i into the last round key

        vaesenclast     \T2, reg_j, reg_j       # ENC: final round leaves the ciphertext in reg_j

        vaesenclast     \T2, reg_j, \T3         # DEC: final round leaves the plaintext in T3
        vmovdqu 16*i(arg4, %r11), reg_j         # keep the ciphertext block in reg_j for GHASH
        vmovdqu \T3, 16*i(arg3, %r11)           # write the plaintext block out
        #######################################################################

        vpslldq $8, \T6, \T3                    # shift-L T3 2 DWs
        vpsrldq $8, \T6, \T6                    # shift-R T6 2 DWs
        vpxor   \T6, \T1, \T1                   # accumulate the results in T1:T7
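        # The combined middle term (T6) straddles the 64-bit boundary of the
        # 256-bit product, so it is split: its low half is shifted up and
        # folded into the low accumulator (T7), and its high half is shifted
        # down and folded into the high accumulator (T1), leaving the full
        # product in the register pair T1:T7 ready for reduction.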
        #######################################################################
        # first phase of the reduction
        vmovdqa         POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq         $8, \T2, \T2            # shift-L T2 2 DWs

        vpxor           \T2, \T7, \T7           # first phase of the reduction complete
        #######################################################################

        vmovdqu \XMM1, 16*0(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM2, 16*1(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM3, 16*2(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM4, 16*3(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM5, 16*4(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM6, 16*5(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM7, 16*6(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM8, 16*7(arg3,%r11)          # Write to the Ciphertext buffer

        #######################################################################
        # second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq         $4, \T2, \T2            # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq         $4, \T4, \T4            # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           \T2, \T4, \T4           # second phase of the reduction complete
        #######################################################################
        vpxor           \T4, \T1, \T1           # the result is in T1
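        # Reduction: the 256-bit carry-less product (high half in T1, low half
        # in T7) is reduced modulo the GCM polynomial
        #
        #   g(x) = x^128 + x^7 + x^2 + x + 1
        #
        # Because the operands are kept bit-reflected, the low half is folded
        # away with two PCLMULQDQ multiplies against the reflected constant
        # POLY2 instead of explicit shifts; the residue from the second phase
        # (T4) is XORed into T1 above to form the new 128-bit GHASH state.
        # The eight ciphertext stores sit between the two phases to hide the
        # multiplier latency.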
        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap

        vpxor   \T1, \XMM1, \XMM1
# GHASH the last 8 ciphertext blocks.
.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

        vmovdqu         HashKey_8(arg2), \T5

        vpshufd         $0b01001110, \XMM1, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM1, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM1, \T6
        vpclmulqdq      $0x00, \T5, \XMM1, \T7

        vpclmulqdq      $0x00, \T3, \T2, \XMM1
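        # GHASH_LAST_8_AVX2 uses the Karatsuba trick: the two 64-bit halves of
        # each block are folded together (vpshufd + vpxor), and the same is
        # done for the matching hash-key power, so only three 64x64-bit
        # multiplies are needed per block instead of four:
        #
        #   a1*b1,  a0*b0,  (a1 + a0)*(b1 + b0)
        #   middle term = (a1 + a0)*(b1 + b0) + a1*b1 + a0*b0
        #
        # T6/T7 accumulate the high/low products across all eight blocks and
        # XMM1 accumulates the middle terms before the final split and
        # reduction below.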
        ######################

        vmovdqu         HashKey_7(arg2), \T5
        vpshufd         $0b01001110, \XMM2, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM2, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM2, \T4
        vpclmulqdq      $0x00, \T5, \XMM2, \T4
        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu         HashKey_6(arg2), \T5
        vpshufd         $0b01001110, \XMM3, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM3, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM3, \T4
        vpclmulqdq      $0x00, \T5, \XMM3, \T4
        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu         HashKey_5(arg2), \T5
        vpshufd         $0b01001110, \XMM4, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM4, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM4, \T4
        vpclmulqdq      $0x00, \T5, \XMM4, \T4
        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu         HashKey_4(arg2), \T5
        vpshufd         $0b01001110, \XMM5, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM5, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM5, \T4
        vpclmulqdq      $0x00, \T5, \XMM5, \T4
        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu         HashKey_3(arg2), \T5
        vpshufd         $0b01001110, \XMM6, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM6, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM6, \T4
        vpclmulqdq      $0x00, \T5, \XMM6, \T4
        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu         HashKey_2(arg2), \T5
        vpshufd         $0b01001110, \XMM7, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM7, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM7, \T4
        vpclmulqdq      $0x00, \T5, \XMM7, \T4
        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu         HashKey(arg2), \T5
        vpshufd         $0b01001110, \XMM8, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM8, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM8, \T4
        vpclmulqdq      $0x00, \T5, \XMM8, \T4
        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1
        vpxor           \T6, \XMM1, \XMM1
        vpxor           \T7, \XMM1, \T2

        vpslldq $8, \T2, \T4
        vpsrldq $8, \T2, \T2

        vpxor   \T2, \T6, \T6                   # <T6:T7> holds the result of the
                                                # accumulated carry-less multiplications
        #######################################################################
        # first phase of the reduction
        vmovdqa         POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq         $8, \T2, \T2            # shift-L T2 2 DWs

        vpxor           \T2, \T7, \T7           # first phase of the reduction complete
        #######################################################################

        # second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq         $4, \T2, \T2            # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq         $4, \T4, \T4            # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           \T2, \T4, \T4           # second phase of the reduction complete
        #######################################################################
        vpxor           \T4, \T6, \T6           # the result is in T6
#############################################################
#void   aesni_gcm_init_avx_gen4
#        (gcm_data     *my_ctx_data,
#         gcm_context_data *data,
#         u8      *iv, /* Pre-counter block j0: 4 byte salt
#                         (from Security Association) concatenated with 8 byte
#                         Initialisation Vector (from IPSec ESP Payload)
#                         concatenated with 0x00000001. 16-byte aligned pointer. */
#         u8      *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
#         const u8 *aad, /* Additional Authentication Data (AAD) */
#         u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#############################################################
SYM_FUNC_START(aesni_gcm_init_avx_gen4)
        INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
SYM_FUNC_END(aesni_gcm_init_avx_gen4)

###############################################################################
#void   aesni_gcm_enc_update_avx_gen4(
#        gcm_data        *my_ctx_data,      /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed. */
#        const u8 *in, /* Plaintext input */
#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
###############################################################################
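# The three GCM_ENC_DEC expansions below differ only in their last argument:
# 9, 11 and 13 AES rounds for 128-, 192- and 256-bit keys respectively (the
# vaesenclast final round is counted separately).  The je instructions branch
# on the key length stored alongside the expanded key schedule to pick the
# matching expansion.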
SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
        je      key_256_enc_update4
        je      key_128_enc_update4

        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11

key_128_enc_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9

key_256_enc_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
###############################################################################
#void   aesni_gcm_dec_update_avx_gen4(
#        gcm_data        *my_ctx_data,      /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed. */
#        const u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
###############################################################################
SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
        je      key_256_dec_update4
        je      key_128_dec_update4

        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11

key_128_dec_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9

key_256_dec_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
###############################################################################
#void   aesni_gcm_finalize_avx_gen4(
#        gcm_data        *my_ctx_data,      /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
#                                 Valid values are 16 (most likely), 12 or 8. */
###############################################################################
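# Typical calling sequence from C (illustrative sketch; the in-kernel callers
# and the authoritative struct layouts live in aesni-intel_glue.c):
#
#   aesni_gcm_init_avx_gen4(ctx, &data, iv, hash_subkey, aad, aad_len);
#   aesni_gcm_enc_update_avx_gen4(ctx, &data, out, in, plaintext_len);
#   aesni_gcm_finalize_avx_gen4(ctx, &data, auth_tag, auth_tag_len);
#
# i.e. one init, any number of update calls over the data, then a single
# finalize to produce the authentication tag.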
SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
        je      key_256_finalize4
        je      key_128_finalize4

        GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4

key_128_finalize4:
        GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4

key_256_finalize4:
        GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)