2 # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
4 # Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
5 # Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
6 # Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
8 # This code is taken from the OpenSSL project but the author, Andy Polyakov,
9 # has relicensed it under the licenses specified in the SPDX header above.
10 # The original headers, including the original license headers, are
11 # included below for completeness.
13 # ====================================================================
14 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
15 # project. The module is, however, dual licensed under OpenSSL and
16 # CRYPTOGAMS licenses depending on where you obtain it. For further
17 # details see http://www.openssl.org/~appro/cryptogams/.
18 # ====================================================================
20 # This module implements Poly1305 hash for x86_64.
28 # Add AVX512F+VL+BW code path.
32 # Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
33 # executed even on Knights Landing. The trigger for the modification was
34 # the observation that AVX512 code paths can negatively affect overall
35 # Skylake-X system performance. Since we are likely to suppress the
36 # AVX512F capability flag [at least on Skylake-X], the conversion serves
37 # as a kind of "investment protection". Note that the next *lake processor,
38 # Cannonlake, has an AVX512IFMA code path to execute...
40 # Numbers are cycles per processed byte with poly1305_blocks alone,
41 # measured with rdtsc at fixed clock frequency.
43 #               IALU/gcc-4.8(*) AVX(**)   AVX2    AVX-512
46 # Westmere      1.88/+120%      -
47 # Sandy Bridge  1.39/+140%      1.10
48 # Haswell       1.14/+175%      1.11      0.65
49 # Skylake[-X]   1.13/+120%      0.96      0.51    [0.35]
50 # Silvermont    2.83/+95%       -
51 # Knights L     3.60/?          1.65      1.10    0.41(***)
52 # Goldmont      1.70/+180%      -
53 # VIA Nano      1.82/+150%      -
54 # Sledgehammer  1.38/+160%      -
55 # Bulldozer     2.30/+130%      0.97
56 # Ryzen         1.15/+200%      1.08      1.18
58 # (*) improvement coefficients relative to clang are more modest and
59 # are ~50% on most processors; in both cases we are comparing to
60 # __int128 code;
61 # (**) an SSE2 implementation was attempted, but among non-AVX processors
62 # it was faster than integer-only code only on the older Intel P4 and
63 # Core processors, by 30-50% (the newer the processor, the smaller the
64 # gain), yet slower on contemporary ones, for example almost 2x slower
65 # on Atom; as the former are naturally disappearing, SSE2 is deemed unnecessary;
66 # (***) strangely enough, performance seems to vary from core to core;
67 # the listed result is the best case;
71 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
73 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
74 $kernel=0; $kernel=1 if (!$flavour && !$output);
77 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
78 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
79 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
80 die "can't locate x86_64-xlate.pl";
82 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
85 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
86 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
87 $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
90 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
91 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
92 $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
93 $avx += 1 if ($1==2.11 && $2>=8);
96 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
97 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
98 $avx = ($1>=10) + ($1>=11);
101 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
102 $avx = ($2>=3.0) + ($2>3.0);
105 $avx = 4; # The kernel uses ifdefs for this.
108 sub declare_function() {
109 my ($name, $align, $nargs) = @_;
111 $code .= ".align $align\n";
112 $code .= "SYM_FUNC_START($name)\n";
113 $code .= ".L$name:\n";
115 $code .= ".globl $name\n";
116 $code .= ".type $name,\@function,$nargs\n";
117 $code .= ".align $align\n";
125 $code .= "SYM_FUNC_END($name)\n";
127 $code .= ".size $name,.-$name\n";
131 $code.=<<___ if $kernel;
132 #include <linux/linkage.h>
136 $code.=<<___ if $kernel;
143 .long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
145 .long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
147 .long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
149 .long 2,2,2,3,2,0,2,1
151 .long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
154 .long 0,1,1,2,2,3,7,7
158 .quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
166 .quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
167 .quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
169 .quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
170 .quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
173 $code.=<<___ if (!$kernel);
174 .asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
178 my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
179 my ($mac,$nonce)=($inp,$len); # *_emit arguments
180 my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
181 my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
183 sub poly1305_iteration {
184 # input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
185 # output: $h0-$h2 *= $r0-$r1
193 mov %rax,$h0 # future $h0
203 mov $h2,$h1 # borrow $h1
207 imulq $s1,$h1 # h2*s1
212 imulq $r0,$h2 # h2*r0
214 mov \$-4,%rax # mask value
217 and $d3,%rax # last reduction step
228 ########################################################################
229 # The layout of the opaque area is as follows.
231 # unsigned __int64 h[3]; # current hash value base 2^64
232 # unsigned __int64 r[2]; # key value base 2^64
237 $code.=<<___ if (!$kernel);
238 .extern OPENSSL_ia32cap_P
240 .globl poly1305_init_x86_64
241 .hidden poly1305_init_x86_64
242 .globl poly1305_blocks_x86_64
243 .hidden poly1305_blocks_x86_64
244 .globl poly1305_emit_x86_64
245 .hidden poly1305_emit_x86_64
247 &declare_function("poly1305_init_x86_64", 32, 3);
250 mov %rax,0($ctx) # initialize hash value
257 $code.=<<___ if (!$kernel);
258 lea poly1305_blocks_x86_64(%rip),%r10
259 lea poly1305_emit_x86_64(%rip),%r11
261 $code.=<<___ if (!$kernel && $avx);
262 mov OPENSSL_ia32cap_P+4(%rip),%r9
263 lea poly1305_blocks_avx(%rip),%rax
264 lea poly1305_emit_avx(%rip),%rcx
265 bt \$`60-32`,%r9 # AVX?
269 $code.=<<___ if (!$kernel && $avx>1);
270 lea poly1305_blocks_avx2(%rip),%rax
271 bt \$`5+32`,%r9 # AVX2?
274 $code.=<<___ if (!$kernel && $avx>3);
275 mov \$`(1<<31|1<<21|1<<16)`,%rax
282 mov \$0x0ffffffc0fffffff,%rax
283 mov \$0x0ffffffc0ffffffc,%rcx
289 $code.=<<___ if (!$kernel && $flavour !~ /elf32/);
293 $code.=<<___ if (!$kernel && $flavour =~ /elf32/);
302 &end_function("poly1305_init_x86_64");
304 &declare_function("poly1305_blocks_x86_64", 32, 4);
309 jz .Lno_data # too short
325 mov $len,%r15 # reassign $len
327 mov 24($ctx),$r0 # load r
330 mov 0($ctx),$h0 # load hash value
337 add $r1,$s1 # s1 = r1 + (r1 >> 2)
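# A sketch of why s1 = r1 + (r1 >> 2) works (illustrative note, not part
# of the generated code): clamping forces r1 = 0 mod 4, and in the base
# 2^64 schoolbook product the cross term h1*r1 carries weight 2^128.
# Writing r1 = 4*q gives h1*r1*2^128 = h1*q*2^130 = 5*q*h1 (mod 2^130-5),
# and 5*q = r1 + r1/4, so the fold costs one shift-and-add up front
# instead of a multi-word reduction per block.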
342 add 0($inp),$h0 # accumulate input
348 &poly1305_iteration();
358 mov $h0,0($ctx) # store hash value
373 .cfi_adjust_cfa_offset -48
379 &end_function("poly1305_blocks_x86_64");
381 &declare_function("poly1305_emit_x86_64", 32, 3);
384 mov 0($ctx),%r8 # load hash value
389 add \$5,%r8 # compare to modulus
393 shr \$2,%r10 # did 130-bit value overflow?
397 add 0($nonce),%rax # accumulate nonce
399 mov %rax,0($mac) # write result
404 &end_function("poly1305_emit_x86_64");
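# A sketch of the constant-time final reduction above (illustrative): let
# t = h + 5. If t >= 2^130 then h >= 2^130 - 5, so h mod (2^130-5) equals
# h - (2^130-5) = t - 2^130, i.e. t with bit 130 dropped; otherwise the
# result is h itself. The "shr \$2" pulls out exactly that bit-130 carry,
# and the cmov family then selects between h and t without a
# data-dependent branch.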
407 ########################################################################
408 # The layout of the opaque area is as follows.
410 # unsigned __int32 h[5]; # current hash value base 2^26
411 # unsigned __int32 is_base2_26;
412 # unsigned __int64 r[2]; # key value base 2^64
413 # unsigned __int64 pad;
414 # struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
416 # where r^n are the base 2^26 digits of the powers of the multiplier key.
417 # There are 5 digits, but the last four are interleaved with their
418 # multiples of 5, totalling 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
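#
# Illustration of why the multiples of 5 are kept (not part of the
# generated code): with h = Sum h[i]*2^(26*i) and r = Sum r[j]*2^(26*j),
# any product term h[i]*r[j] with i+j >= 5 has weight
# 2^130 * 2^(26*(i+j-5)), and 2^130 = 5 (mod 2^130-5), so it folds back
# in as 5*h[i]*r[j] at digit i+j-5; pre-computing 5*r[1..4] turns each
# such fold into a plain multiply-and-accumulate.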
420 my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
421 map("%xmm$_",(0..15));
424 .type __poly1305_block,\@abi-omnipotent
429 &poly1305_iteration();
433 .size __poly1305_block,.-__poly1305_block
435 .type __poly1305_init_avx,\@abi-omnipotent
444 lea 48+64($ctx),$ctx # size optimization
447 call __poly1305_block # r^2
449 mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
455 mov %eax,`16*0+0-64`($ctx)
457 mov %edx,`16*0+4-64`($ctx)
464 mov %eax,`16*1+0-64`($ctx)
465 lea (%rax,%rax,4),%eax # *5
466 mov %edx,`16*1+4-64`($ctx)
467 lea (%rdx,%rdx,4),%edx # *5
468 mov %eax,`16*2+0-64`($ctx)
470 mov %edx,`16*2+4-64`($ctx)
481 mov %eax,`16*3+0-64`($ctx)
482 lea (%rax,%rax,4),%eax # *5
483 mov %edx,`16*3+4-64`($ctx)
484 lea (%rdx,%rdx,4),%edx # *5
485 mov %eax,`16*4+0-64`($ctx)
487 mov %edx,`16*4+4-64`($ctx)
496 mov %eax,`16*5+0-64`($ctx)
497 lea (%rax,%rax,4),%eax # *5
498 mov %edx,`16*5+4-64`($ctx)
499 lea (%rdx,%rdx,4),%edx # *5
500 mov %eax,`16*6+0-64`($ctx)
502 mov %edx,`16*6+4-64`($ctx)
508 mov $d1#d,`16*7+0-64`($ctx)
509 lea ($d1,$d1,4),$d1 # *5
510 mov $d2#d,`16*7+4-64`($ctx)
511 lea ($d2,$d2,4),$d2 # *5
512 mov $d1#d,`16*8+0-64`($ctx)
513 mov $d2#d,`16*8+4-64`($ctx)
516 call __poly1305_block # r^3
518 mov \$0x3ffffff,%eax # save r^3 base 2^26
522 mov %eax,`16*0+12-64`($ctx)
526 mov %edx,`16*1+12-64`($ctx)
527 lea (%rdx,%rdx,4),%edx # *5
529 mov %edx,`16*2+12-64`($ctx)
535 mov %eax,`16*3+12-64`($ctx)
536 lea (%rax,%rax,4),%eax # *5
538 mov %eax,`16*4+12-64`($ctx)
543 mov %edx,`16*5+12-64`($ctx)
544 lea (%rdx,%rdx,4),%edx # *5
546 mov %edx,`16*6+12-64`($ctx)
551 mov $d1#d,`16*7+12-64`($ctx)
552 lea ($d1,$d1,4),$d1 # *5
553 mov $d1#d,`16*8+12-64`($ctx)
556 call __poly1305_block # r^4
558 mov \$0x3ffffff,%eax # save r^4 base 2^26
562 mov %eax,`16*0+8-64`($ctx)
566 mov %edx,`16*1+8-64`($ctx)
567 lea (%rdx,%rdx,4),%edx # *5
569 mov %edx,`16*2+8-64`($ctx)
575 mov %eax,`16*3+8-64`($ctx)
576 lea (%rax,%rax,4),%eax # *5
578 mov %eax,`16*4+8-64`($ctx)
583 mov %edx,`16*5+8-64`($ctx)
584 lea (%rdx,%rdx,4),%edx # *5
586 mov %edx,`16*6+8-64`($ctx)
591 mov $d1#d,`16*7+8-64`($ctx)
592 lea ($d1,$d1,4),$d1 # *5
593 mov $d1#d,`16*8+8-64`($ctx)
595 lea -48-64($ctx),$ctx # size [de-]optimization
598 .size __poly1305_init_avx,.-__poly1305_init_avx
601 &declare_function("poly1305_blocks_avx", 32, 4);
604 mov 20($ctx),%r8d # is_base2_26
637 mov $len,%r15 # reassign $len
639 mov 0($ctx),$d1 # load hash value
643 mov 24($ctx),$r0 # load r
646 ################################# base 2^26 -> base 2^64
648 and \$`-1*(1<<31)`,$d1
649 mov $d2,$r1 # borrow $r1
651 and \$`-1*(1<<31)`,$d2
665 adc \$0,$h2 # can be partially reduced...
667 mov \$-4,$d2 # ... so reduce
680 add $r1,$s1 # s1 = r1 + (r1 >> 2)
682 add 0($inp),$h0 # accumulate input
687 call __poly1305_block
689 test $padbit,$padbit # if $padbit is zero,
690 jz .Lstore_base2_64_avx # store hash in base 2^64 format
692 ################################# base 2^64 -> base 2^26
699 and \$0x3ffffff,%rax # h[0]
701 and \$0x3ffffff,%rdx # h[1]
705 and \$0x3ffffff,$h0 # h[2]
707 and \$0x3ffffff,$h1 # h[3]
711 jz .Lstore_base2_26_avx
721 .Lstore_base2_64_avx:
724 mov $h2,16($ctx) # note that is_base2_26 is zeroed
728 .Lstore_base2_26_avx:
729 mov %rax#d,0($ctx) # store hash value base 2^26
749 .Lblocks_avx_epilogue:
771 mov $len,%r15 # reassign $len
773 mov 24($ctx),$r0 # load r
776 mov 0($ctx),$h0 # load hash value
783 add $r1,$s1 # s1 = r1 + (r1 >> 2)
788 add 0($inp),$h0 # accumulate input
794 call __poly1305_block
797 ################################# base 2^64 -> base 2^26
804 and \$0x3ffffff,%rax # h[0]
806 and \$0x3ffffff,%rdx # h[1]
810 and \$0x3ffffff,$h0 # h[2]
812 and \$0x3ffffff,$h1 # h[3]
820 movl \$1,20($ctx) # set is_base2_26
822 call __poly1305_init_avx
838 .Lbase2_64_avx_epilogue:
845 vmovd 4*0($ctx),$H0 # load hash value
853 $code.=<<___ if (!$win64);
855 .cfi_def_cfa_register %r10
861 $code.=<<___ if ($win64);
864 vmovdqa %xmm6,0x50(%r11)
865 vmovdqa %xmm7,0x60(%r11)
866 vmovdqa %xmm8,0x70(%r11)
867 vmovdqa %xmm9,0x80(%r11)
868 vmovdqa %xmm10,0x90(%r11)
869 vmovdqa %xmm11,0xa0(%r11)
870 vmovdqa %xmm12,0xb0(%r11)
871 vmovdqa %xmm13,0xc0(%r11)
872 vmovdqa %xmm14,0xd0(%r11)
873 vmovdqa %xmm15,0xe0(%r11)
881 vmovdqu `16*3`($ctx),$D4 # preload r0^2
882 lea `16*3+64`($ctx),$ctx # size optimization
883 lea .Lconst(%rip),%rcx
885 ################################################################
887 vmovdqu 16*2($inp),$T0
888 vmovdqu 16*3($inp),$T1
889 vmovdqa 64(%rcx),$MASK # .Lmask26
891 vpsrldq \$6,$T0,$T2 # splat input
893 vpunpckhqdq $T1,$T0,$T4 # 4
894 vpunpcklqdq $T1,$T0,$T0 # 0:1
895 vpunpcklqdq $T3,$T2,$T3 # 2:3
897 vpsrlq \$40,$T4,$T4 # 4
899 vpand $MASK,$T0,$T0 # 0
901 vpand $MASK,$T1,$T1 # 1
903 vpand $MASK,$T2,$T2 # 2
904 vpand $MASK,$T3,$T3 # 3
905 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
909 # expand and copy pre-calculated table to stack
910 vmovdqu `16*1-64`($ctx),$D1
911 vmovdqu `16*2-64`($ctx),$D2
912 vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
913 vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
914 vmovdqa $D3,-0x90(%r11)
915 vmovdqa $D0,0x00(%rsp)
916 vpshufd \$0xEE,$D1,$D4
917 vmovdqu `16*3-64`($ctx),$D0
918 vpshufd \$0x44,$D1,$D1
919 vmovdqa $D4,-0x80(%r11)
920 vmovdqa $D1,0x10(%rsp)
921 vpshufd \$0xEE,$D2,$D3
922 vmovdqu `16*4-64`($ctx),$D1
923 vpshufd \$0x44,$D2,$D2
924 vmovdqa $D3,-0x70(%r11)
925 vmovdqa $D2,0x20(%rsp)
926 vpshufd \$0xEE,$D0,$D4
927 vmovdqu `16*5-64`($ctx),$D2
928 vpshufd \$0x44,$D0,$D0
929 vmovdqa $D4,-0x60(%r11)
930 vmovdqa $D0,0x30(%rsp)
931 vpshufd \$0xEE,$D1,$D3
932 vmovdqu `16*6-64`($ctx),$D0
933 vpshufd \$0x44,$D1,$D1
934 vmovdqa $D3,-0x50(%r11)
935 vmovdqa $D1,0x40(%rsp)
936 vpshufd \$0xEE,$D2,$D4
937 vmovdqu `16*7-64`($ctx),$D1
938 vpshufd \$0x44,$D2,$D2
939 vmovdqa $D4,-0x40(%r11)
940 vmovdqa $D2,0x50(%rsp)
941 vpshufd \$0xEE,$D0,$D3
942 vmovdqu `16*8-64`($ctx),$D2
943 vpshufd \$0x44,$D0,$D0
944 vmovdqa $D3,-0x30(%r11)
945 vmovdqa $D0,0x60(%rsp)
946 vpshufd \$0xEE,$D1,$D4
947 vpshufd \$0x44,$D1,$D1
948 vmovdqa $D4,-0x20(%r11)
949 vmovdqa $D1,0x70(%rsp)
950 vpshufd \$0xEE,$D2,$D3
951 vmovdqa 0x00(%rsp),$D4 # preload r0^2
952 vpshufd \$0x44,$D2,$D2
953 vmovdqa $D3,-0x10(%r11)
954 vmovdqa $D2,0x80(%rsp)
960 ################################################################
961 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
962 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
963 # \___________________/
964 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
965 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
966 # \___________________/ \____________________/
968 # Note that we start with inp[2:3]*r^2. This is because it
969 # doesn't depend on the reduction in the previous iteration.
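#
# Worked 4-block example of this regrouping (illustrative): the serial
# definition H = (((m1*r + m2)*r + m3)*r + m4) mod 2^130-5, i.e.
# m1*r^4 + m2*r^3 + m3*r^2 + m4*r, regroups as
# (m1*r^2 + m3)*r^2 + (m2*r^2 + m4)*r, so one 64-bit lane walks the odd
# blocks and the other the even ones, each stepping by r^2, and a single
# final multiply by r^2:r^1 merges the two streams.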
970 ################################################################
971 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
972 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
973 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
974 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
975 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
977 # though note that $Tx and $Hx are "reversed" in this section,
978 # and $D4 is preloaded with r0^2...
980 vpmuludq $T0,$D4,$D0 # d0 = h0*r0
981 vpmuludq $T1,$D4,$D1 # d1 = h1*r0
982 vmovdqa $H2,0x20(%r11) # offload hash
983 vpmuludq $T2,$D4,$D2 # d2 = h2*r0
984 vmovdqa 0x10(%rsp),$H2 # r1^2
985 vpmuludq $T3,$D4,$D3 # d3 = h3*r0
986 vpmuludq $T4,$D4,$D4 # d4 = h4*r0
988 vmovdqa $H0,0x00(%r11) #
989 vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
990 vmovdqa $H1,0x10(%r11) #
991 vpmuludq $T3,$H2,$H1 # h3*r1
992 vpaddq $H0,$D0,$D0 # d0 += h4*s1
993 vpaddq $H1,$D4,$D4 # d4 += h3*r1
994 vmovdqa $H3,0x30(%r11) #
995 vpmuludq $T2,$H2,$H0 # h2*r1
996 vpmuludq $T1,$H2,$H1 # h1*r1
997 vpaddq $H0,$D3,$D3 # d3 += h2*r1
998 vmovdqa 0x30(%rsp),$H3 # r2^2
999 vpaddq $H1,$D2,$D2 # d2 += h1*r1
1000 vmovdqa $H4,0x40(%r11) #
1001 vpmuludq $T0,$H2,$H2 # h0*r1
1002 vpmuludq $T2,$H3,$H0 # h2*r2
1003 vpaddq $H2,$D1,$D1 # d1 += h0*r1
1005 vmovdqa 0x40(%rsp),$H4 # s2^2
1006 vpaddq $H0,$D4,$D4 # d4 += h2*r2
1007 vpmuludq $T1,$H3,$H1 # h1*r2
1008 vpmuludq $T0,$H3,$H3 # h0*r2
1009 vpaddq $H1,$D3,$D3 # d3 += h1*r2
1010 vmovdqa 0x50(%rsp),$H2 # r3^2
1011 vpaddq $H3,$D2,$D2 # d2 += h0*r2
1012 vpmuludq $T4,$H4,$H0 # h4*s2
1013 vpmuludq $T3,$H4,$H4 # h3*s2
1014 vpaddq $H0,$D1,$D1 # d1 += h4*s2
1015 vmovdqa 0x60(%rsp),$H3 # s3^2
1016 vpaddq $H4,$D0,$D0 # d0 += h3*s2
1018 vmovdqa 0x80(%rsp),$H4 # s4^2
1019 vpmuludq $T1,$H2,$H1 # h1*r3
1020 vpmuludq $T0,$H2,$H2 # h0*r3
1021 vpaddq $H1,$D4,$D4 # d4 += h1*r3
1022 vpaddq $H2,$D3,$D3 # d3 += h0*r3
1023 vpmuludq $T4,$H3,$H0 # h4*s3
1024 vpmuludq $T3,$H3,$H1 # h3*s3
1025 vpaddq $H0,$D2,$D2 # d2 += h4*s3
1026 vmovdqu 16*0($inp),$H0 # load input
1027 vpaddq $H1,$D1,$D1 # d1 += h3*s3
1028 vpmuludq $T2,$H3,$H3 # h2*s3
1029 vpmuludq $T2,$H4,$T2 # h2*s4
1030 vpaddq $H3,$D0,$D0 # d0 += h2*s3
1032 vmovdqu 16*1($inp),$H1 #
1033 vpaddq $T2,$D1,$D1 # d1 += h2*s4
1034 vpmuludq $T3,$H4,$T3 # h3*s4
1035 vpmuludq $T4,$H4,$T4 # h4*s4
1036 vpsrldq \$6,$H0,$H2 # splat input
1037 vpaddq $T3,$D2,$D2 # d2 += h3*s4
1038 vpaddq $T4,$D3,$D3 # d3 += h4*s4
1039 vpsrldq \$6,$H1,$H3 #
1040 vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
1041 vpmuludq $T1,$H4,$T0 # h1*s4
1042 vpunpckhqdq $H1,$H0,$H4 # 4
1043 vpaddq $T4,$D4,$D4 # d4 += h0*r4
1044 vmovdqa -0x90(%r11),$T4 # r0^4
1045 vpaddq $T0,$D0,$D0 # d0 += h1*s4
1047 vpunpcklqdq $H1,$H0,$H0 # 0:1
1048 vpunpcklqdq $H3,$H2,$H3 # 2:3
1050 #vpsrlq \$40,$H4,$H4 # 4
1051 vpsrldq \$`40/8`,$H4,$H4 # 4
1053 vpand $MASK,$H0,$H0 # 0
1055 vpand $MASK,$H1,$H1 # 1
1056 vpand 0(%rcx),$H4,$H4 # .Lmask24
1058 vpand $MASK,$H2,$H2 # 2
1059 vpand $MASK,$H3,$H3 # 3
1060 vpor 32(%rcx),$H4,$H4 # padbit, yes, always
1062 vpaddq 0x00(%r11),$H0,$H0 # add hash value
1063 vpaddq 0x10(%r11),$H1,$H1
1064 vpaddq 0x20(%r11),$H2,$H2
1065 vpaddq 0x30(%r11),$H3,$H3
1066 vpaddq 0x40(%r11),$H4,$H4
1073 ################################################################
1074 # Now we accumulate (inp[0:1]+hash)*r^4
1075 ################################################################
1076 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1077 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1078 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1079 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1080 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1082 vpmuludq $H0,$T4,$T0 # h0*r0
1083 vpmuludq $H1,$T4,$T1 # h1*r0
1086 vmovdqa -0x80(%r11),$T2 # r1^4
1087 vpmuludq $H2,$T4,$T0 # h2*r0
1088 vpmuludq $H3,$T4,$T1 # h3*r0
1091 vpmuludq $H4,$T4,$T4 # h4*r0
1092 vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
1095 vpaddq $T0,$D0,$D0 # d0 += h4*s1
1096 vpmuludq $H2,$T2,$T1 # h2*r1
1097 vpmuludq $H3,$T2,$T0 # h3*r1
1098 vpaddq $T1,$D3,$D3 # d3 += h2*r1
1099 vmovdqa -0x60(%r11),$T3 # r2^4
1100 vpaddq $T0,$D4,$D4 # d4 += h3*r1
1101 vpmuludq $H1,$T2,$T1 # h1*r1
1102 vpmuludq $H0,$T2,$T2 # h0*r1
1103 vpaddq $T1,$D2,$D2 # d2 += h1*r1
1104 vpaddq $T2,$D1,$D1 # d1 += h0*r1
1106 vmovdqa -0x50(%r11),$T4 # s2^4
1107 vpmuludq $H2,$T3,$T0 # h2*r2
1108 vpmuludq $H1,$T3,$T1 # h1*r2
1109 vpaddq $T0,$D4,$D4 # d4 += h2*r2
1110 vpaddq $T1,$D3,$D3 # d3 += h1*r2
1111 vmovdqa -0x40(%r11),$T2 # r3^4
1112 vpmuludq $H0,$T3,$T3 # h0*r2
1113 vpmuludq $H4,$T4,$T0 # h4*s2
1114 vpaddq $T3,$D2,$D2 # d2 += h0*r2
1115 vpaddq $T0,$D1,$D1 # d1 += h4*s2
1116 vmovdqa -0x30(%r11),$T3 # s3^4
1117 vpmuludq $H3,$T4,$T4 # h3*s2
1118 vpmuludq $H1,$T2,$T1 # h1*r3
1119 vpaddq $T4,$D0,$D0 # d0 += h3*s2
1121 vmovdqa -0x10(%r11),$T4 # s4^4
1122 vpaddq $T1,$D4,$D4 # d4 += h1*r3
1123 vpmuludq $H0,$T2,$T2 # h0*r3
1124 vpmuludq $H4,$T3,$T0 # h4*s3
1125 vpaddq $T2,$D3,$D3 # d3 += h0*r3
1126 vpaddq $T0,$D2,$D2 # d2 += h4*s3
1127 vmovdqu 16*2($inp),$T0 # load input
1128 vpmuludq $H3,$T3,$T2 # h3*s3
1129 vpmuludq $H2,$T3,$T3 # h2*s3
1130 vpaddq $T2,$D1,$D1 # d1 += h3*s3
1131 vmovdqu 16*3($inp),$T1 #
1132 vpaddq $T3,$D0,$D0 # d0 += h2*s3
1134 vpmuludq $H2,$T4,$H2 # h2*s4
1135 vpmuludq $H3,$T4,$H3 # h3*s4
1136 vpsrldq \$6,$T0,$T2 # splat input
1137 vpaddq $H2,$D1,$D1 # d1 += h2*s4
1138 vpmuludq $H4,$T4,$H4 # h4*s4
1139 vpsrldq \$6,$T1,$T3 #
1140 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
1141 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
1142 vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
1143 vpmuludq $H1,$T4,$H0
1144 vpunpckhqdq $T1,$T0,$T4 # 4
1145 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
1146 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
1148 vpunpcklqdq $T1,$T0,$T0 # 0:1
1149 vpunpcklqdq $T3,$T2,$T3 # 2:3
1151 #vpsrlq \$40,$T4,$T4 # 4
1152 vpsrldq \$`40/8`,$T4,$T4 # 4
1154 vmovdqa 0x00(%rsp),$D4 # preload r0^2
1155 vpand $MASK,$T0,$T0 # 0
1157 vpand $MASK,$T1,$T1 # 1
1158 vpand 0(%rcx),$T4,$T4 # .Lmask24
1160 vpand $MASK,$T2,$T2 # 2
1161 vpand $MASK,$T3,$T3 # 3
1162 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1164 ################################################################
1165 # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1170 vpaddq $D3,$H4,$H4 # h3 -> h4
1174 vpaddq $D0,$D1,$H1 # h0 -> h1
1181 vpaddq $D1,$H2,$H2 # h1 -> h2
1185 vpaddq $D0,$H0,$H0 # h4 -> h0
1189 vpaddq $D2,$H3,$H3 # h2 -> h3
1193 vpaddq $D0,$H1,$H1 # h0 -> h1
1197 vpaddq $D3,$H4,$H4 # h3 -> h4
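# One carry step of the chain above, spelled out (illustrative):
#   c = h[k] >> 26;  h[k] &= 0x3ffffff;  h[k+1] += c;
# with the h4 -> h0 wrap-around multiplying the carry by 5, since
# 2^130 = 5 (mod 2^130-5). The trailing h0 -> h1 and h3 -> h4 steps mop
# up the carries those folds can regenerate; limbs are left only slightly
# above 26 bits, which the next round of multiplications tolerates.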
1202 ################################################################
1203 # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1205 vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
1216 vmovdqa $H2,0x20(%r11)
1217 vmovdqa $H0,0x00(%r11)
1218 vmovdqa $H1,0x10(%r11)
1219 vmovdqa $H3,0x30(%r11)
1220 vmovdqa $H4,0x40(%r11)
1222 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1223 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1224 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1225 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1226 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1228 vpmuludq $T2,$D4,$D2 # d2 = h2*r0
1229 vpmuludq $T0,$D4,$D0 # d0 = h0*r0
1230 vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
1231 vpmuludq $T1,$D4,$D1 # d1 = h1*r0
1232 vpmuludq $T3,$D4,$D3 # d3 = h3*r0
1233 vpmuludq $T4,$D4,$D4 # d4 = h4*r0
1235 vpmuludq $T3,$H2,$H0 # h3*r1
1236 vpaddq $H0,$D4,$D4 # d4 += h3*r1
1237 vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
1238 vpmuludq $T2,$H2,$H1 # h2*r1
1239 vpaddq $H1,$D3,$D3 # d3 += h2*r1
1240 vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
1241 vpmuludq $T1,$H2,$H0 # h1*r1
1242 vpaddq $H0,$D2,$D2 # d2 += h1*r1
1243 vpmuludq $T0,$H2,$H2 # h0*r1
1244 vpaddq $H2,$D1,$D1 # d1 += h0*r1
1245 vpmuludq $T4,$H3,$H3 # h4*s1
1246 vpaddq $H3,$D0,$D0 # d0 += h4*s1
1248 vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
1249 vpmuludq $T2,$H4,$H1 # h2*r2
1250 vpaddq $H1,$D4,$D4 # d4 += h2*r2
1251 vpmuludq $T1,$H4,$H0 # h1*r2
1252 vpaddq $H0,$D3,$D3 # d3 += h1*r2
1253 vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
1254 vpmuludq $T0,$H4,$H4 # h0*r2
1255 vpaddq $H4,$D2,$D2 # d2 += h0*r2
1256 vpmuludq $T4,$H2,$H1 # h4*s2
1257 vpaddq $H1,$D1,$D1 # d1 += h4*s2
1258 vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
1259 vpmuludq $T3,$H2,$H2 # h3*s2
1260 vpaddq $H2,$D0,$D0 # d0 += h3*s2
1262 vpmuludq $T1,$H3,$H0 # h1*r3
1263 vpaddq $H0,$D4,$D4 # d4 += h1*r3
1264 vpmuludq $T0,$H3,$H3 # h0*r3
1265 vpaddq $H3,$D3,$D3 # d3 += h0*r3
1266 vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
1267 vpmuludq $T4,$H4,$H1 # h4*s3
1268 vpaddq $H1,$D2,$D2 # d2 += h4*s3
1269 vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
1270 vpmuludq $T3,$H4,$H0 # h3*s3
1271 vpaddq $H0,$D1,$D1 # d1 += h3*s3
1272 vpmuludq $T2,$H4,$H4 # h2*s3
1273 vpaddq $H4,$D0,$D0 # d0 += h2*s3
1275 vpmuludq $T0,$H2,$H2 # h0*r4
1276 vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
1277 vpmuludq $T4,$H3,$H1 # h4*s4
1278 vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
1279 vpmuludq $T3,$H3,$H0 # h3*s4
1280 vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
1281 vpmuludq $T2,$H3,$H1 # h2*s4
1282 vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
1283 vpmuludq $T1,$H3,$H3 # h1*s4
1284 vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
1288 vmovdqu 16*0($inp),$H0 # load input
1289 vmovdqu 16*1($inp),$H1
1291 vpsrldq \$6,$H0,$H2 # splat input
1293 vpunpckhqdq $H1,$H0,$H4 # 4
1294 vpunpcklqdq $H1,$H0,$H0 # 0:1
1295 vpunpcklqdq $H3,$H2,$H3 # 2:3
1297 vpsrlq \$40,$H4,$H4 # 4
1299 vpand $MASK,$H0,$H0 # 0
1301 vpand $MASK,$H1,$H1 # 1
1303 vpand $MASK,$H2,$H2 # 2
1304 vpand $MASK,$H3,$H3 # 3
1305 vpor 32(%rcx),$H4,$H4 # padbit, yes, always
1307 vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
1308 vpaddq 0x00(%r11),$H0,$H0
1309 vpaddq 0x10(%r11),$H1,$H1
1310 vpaddq 0x20(%r11),$H2,$H2
1311 vpaddq 0x30(%r11),$H3,$H3
1312 vpaddq 0x40(%r11),$H4,$H4
1314 ################################################################
1315 # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
1317 vpmuludq $H0,$T4,$T0 # h0*r0
1318 vpaddq $T0,$D0,$D0 # d0 += h0*r0
1319 vpmuludq $H1,$T4,$T1 # h1*r0
1320 vpaddq $T1,$D1,$D1 # d1 += h1*r0
1321 vpmuludq $H2,$T4,$T0 # h2*r0
1322 vpaddq $T0,$D2,$D2 # d2 += h2*r0
1323 vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
1324 vpmuludq $H3,$T4,$T1 # h3*r0
1325 vpaddq $T1,$D3,$D3 # d3 += h3*r0
1326 vpmuludq $H4,$T4,$T4 # h4*r0
1327 vpaddq $T4,$D4,$D4 # d4 += h4*r0
1329 vpmuludq $H3,$T2,$T0 # h3*r1
1330 vpaddq $T0,$D4,$D4 # d4 += h3*r1
1331 vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
1332 vpmuludq $H2,$T2,$T1 # h2*r1
1333 vpaddq $T1,$D3,$D3 # d3 += h2*r1
1334 vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
1335 vpmuludq $H1,$T2,$T0 # h1*r1
1336 vpaddq $T0,$D2,$D2 # d2 += h1*r1
1337 vpmuludq $H0,$T2,$T2 # h0*r1
1338 vpaddq $T2,$D1,$D1 # d1 += h0*r1
1339 vpmuludq $H4,$T3,$T3 # h4*s1
1340 vpaddq $T3,$D0,$D0 # d0 += h4*s1
1342 vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
1343 vpmuludq $H2,$T4,$T1 # h2*r2
1344 vpaddq $T1,$D4,$D4 # d4 += h2*r2
1345 vpmuludq $H1,$T4,$T0 # h1*r2
1346 vpaddq $T0,$D3,$D3 # d3 += h1*r2
1347 vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
1348 vpmuludq $H0,$T4,$T4 # h0*r2
1349 vpaddq $T4,$D2,$D2 # d2 += h0*r2
1350 vpmuludq $H4,$T2,$T1 # h4*s2
1351 vpaddq $T1,$D1,$D1 # d1 += h4*s2
1352 vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
1353 vpmuludq $H3,$T2,$T2 # h3*s2
1354 vpaddq $T2,$D0,$D0 # d0 += h3*s2
1356 vpmuludq $H1,$T3,$T0 # h1*r3
1357 vpaddq $T0,$D4,$D4 # d4 += h1*r3
1358 vpmuludq $H0,$T3,$T3 # h0*r3
1359 vpaddq $T3,$D3,$D3 # d3 += h0*r3
1360 vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
1361 vpmuludq $H4,$T4,$T1 # h4*s3
1362 vpaddq $T1,$D2,$D2 # d2 += h4*s3
1363 vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
1364 vpmuludq $H3,$T4,$T0 # h3*s3
1365 vpaddq $T0,$D1,$D1 # d1 += h3*s3
1366 vpmuludq $H2,$T4,$T4 # h2*s3
1367 vpaddq $T4,$D0,$D0 # d0 += h2*s3
1369 vpmuludq $H0,$T2,$T2 # h0*r4
1370 vpaddq $T2,$D4,$D4 # d4 += h0*r4
1371 vpmuludq $H4,$T3,$T1 # h4*s4
1372 vpaddq $T1,$D3,$D3 # d3 += h4*s4
1373 vpmuludq $H3,$T3,$T0 # h3*s4
1374 vpaddq $T0,$D2,$D2 # d2 += h3*s4
1375 vpmuludq $H2,$T3,$T1 # h2*s4
1376 vpaddq $T1,$D1,$D1 # d1 += h2*s4
1377 vpmuludq $H1,$T3,$T3 # h1*s4
1378 vpaddq $T3,$D0,$D0 # d0 += h1*s4
1381 ################################################################
1382 # horizontal addition
1395 ################################################################
1400 vpaddq $H3,$D4,$D4 # h3 -> h4
1404 vpaddq $H0,$D1,$D1 # h0 -> h1
1411 vpaddq $H1,$D2,$D2 # h1 -> h2
1415 vpaddq $H4,$D0,$D0 # h4 -> h0
1419 vpaddq $H2,$D3,$D3 # h2 -> h3
1423 vpaddq $H0,$D1,$D1 # h0 -> h1
1427 vpaddq $H3,$D4,$D4 # h3 -> h4
1429 vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
1430 vmovd $D1,`4*1-48-64`($ctx)
1431 vmovd $D2,`4*2-48-64`($ctx)
1432 vmovd $D3,`4*3-48-64`($ctx)
1433 vmovd $D4,`4*4-48-64`($ctx)
1435 $code.=<<___ if ($win64);
1436 vmovdqa 0x50(%r11),%xmm6
1437 vmovdqa 0x60(%r11),%xmm7
1438 vmovdqa 0x70(%r11),%xmm8
1439 vmovdqa 0x80(%r11),%xmm9
1440 vmovdqa 0x90(%r11),%xmm10
1441 vmovdqa 0xa0(%r11),%xmm11
1442 vmovdqa 0xb0(%r11),%xmm12
1443 vmovdqa 0xc0(%r11),%xmm13
1444 vmovdqa 0xd0(%r11),%xmm14
1445 vmovdqa 0xe0(%r11),%xmm15
1449 $code.=<<___ if (!$win64);
1451 .cfi_def_cfa_register %rsp
1458 &end_function("poly1305_blocks_avx");
1460 &declare_function("poly1305_emit_avx", 32, 3);
1462 cmpl \$0,20($ctx) # is_base2_26?
1465 mov 0($ctx),%eax # load hash value base 2^26
1471 shl \$26,%rcx # base 2^26 -> base 2^64
1487 mov %r10,%rax # could be partially reduced, so reduce
1498 add \$5,%r8 # compare to modulus
1502 shr \$2,%r10 # did 130-bit value overflow?
1506 add 0($nonce),%rax # accumulate nonce
1508 mov %rax,0($mac) # write result
1513 &end_function("poly1305_emit_avx");
1517 my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1518 map("%ymm$_",(0..15));
1521 sub poly1305_blocks_avxN {
1523 my $suffix = $avx512 ? "_avx512" : "";
1526 mov 20($ctx),%r8d # is_base2_26
1528 jae .Lblocks_avx2$suffix
1532 .Lblocks_avx2$suffix:
1534 jz .Lno_data_avx2$suffix
1539 jz .Lbase2_64_avx2$suffix
1542 jz .Leven_avx2$suffix
1557 .Lblocks_avx2_body$suffix:
1559 mov $len,%r15 # reassign $len
1561 mov 0($ctx),$d1 # load hash value
1565 mov 24($ctx),$r0 # load r
1568 ################################# base 2^26 -> base 2^64
1570 and \$`-1*(1<<31)`,$d1
1571 mov $d2,$r1 # borrow $r1
1573 and \$`-1*(1<<31)`,$d2
1587 adc \$0,$h2 # can be partially reduced...
1589 mov \$-4,$d2 # ... so reduce
1602 add $r1,$s1 # s1 = r1 + (r1 >> 2)
1604 .Lbase2_26_pre_avx2$suffix:
1605 add 0($inp),$h0 # accumulate input
1611 call __poly1305_block
1615 jnz .Lbase2_26_pre_avx2$suffix
1617 test $padbit,$padbit # if $padbit is zero,
1618 jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format
1620 ################################# base 2^64 -> base 2^26
1627 and \$0x3ffffff,%rax # h[0]
1629 and \$0x3ffffff,%rdx # h[1]
1633 and \$0x3ffffff,$h0 # h[2]
1635 and \$0x3ffffff,$h1 # h[3]
1639 jz .Lstore_base2_26_avx2$suffix
1646 jmp .Lproceed_avx2$suffix
1649 .Lstore_base2_64_avx2$suffix:
1652 mov $h2,16($ctx) # note that is_base2_26 is zeroed
1653 jmp .Ldone_avx2$suffix
1656 .Lstore_base2_26_avx2$suffix:
1657 mov %rax#d,0($ctx) # store hash value base 2^26
1676 .Lno_data_avx2$suffix:
1677 .Lblocks_avx2_epilogue$suffix:
1682 .Lbase2_64_avx2$suffix:
1697 .Lbase2_64_avx2_body$suffix:
1699 mov $len,%r15 # reassign $len
1701 mov 24($ctx),$r0 # load r
1704 mov 0($ctx),$h0 # load hash value
1711 add $r1,$s1 # s1 = r1 + (r1 >> 2)
1714 jz .Linit_avx2$suffix
1716 .Lbase2_64_pre_avx2$suffix:
1717 add 0($inp),$h0 # accumulate input
1723 call __poly1305_block
1727 jnz .Lbase2_64_pre_avx2$suffix
1730 ################################# base 2^64 -> base 2^26
1737 and \$0x3ffffff,%rax # h[0]
1739 and \$0x3ffffff,%rdx # h[1]
1743 and \$0x3ffffff,$h0 # h[2]
1745 and \$0x3ffffff,$h1 # h[3]
1753 movl \$1,20($ctx) # set is_base2_26
1755 call __poly1305_init_avx
1757 .Lproceed_avx2$suffix:
1758 mov %r15,$len # restore $len
1760 $code.=<<___ if (!$kernel);
1761 mov OPENSSL_ia32cap_P+8(%rip),%r9d
1762 mov \$`(1<<31|1<<30|1<<16)`,%r11d
1777 .Lbase2_64_avx2_epilogue$suffix:
1778 jmp .Ldo_avx2$suffix
1785 $code.=<<___ if (!$kernel);
1786 mov OPENSSL_ia32cap_P+8(%rip),%r9d
1789 vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
1790 vmovd 4*1($ctx),%x#$H1
1791 vmovd 4*2($ctx),%x#$H2
1792 vmovd 4*3($ctx),%x#$H3
1793 vmovd 4*4($ctx),%x#$H4
1797 $code.=<<___ if (!$kernel && $avx>2);
1801 test \$`1<<16`,%r9d # check for AVX512F
1803 .Lskip_avx512$suffix:
1805 $code.=<<___ if ($avx > 2 && $avx512 && $kernel);
1809 $code.=<<___ if (!$win64);
1811 .cfi_def_cfa_register %r10
1814 $code.=<<___ if ($win64);
1817 vmovdqa %xmm6,-0xb0(%r10)
1818 vmovdqa %xmm7,-0xa0(%r10)
1819 vmovdqa %xmm8,-0x90(%r10)
1820 vmovdqa %xmm9,-0x80(%r10)
1821 vmovdqa %xmm10,-0x70(%r10)
1822 vmovdqa %xmm11,-0x60(%r10)
1823 vmovdqa %xmm12,-0x50(%r10)
1824 vmovdqa %xmm13,-0x40(%r10)
1825 vmovdqa %xmm14,-0x30(%r10)
1826 vmovdqa %xmm15,-0x20(%r10)
1827 .Ldo_avx2_body$suffix:
1830 lea .Lconst(%rip),%rcx
1831 lea 48+64($ctx),$ctx # size optimization
1832 vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
1834 # expand and copy pre-calculated table to stack
1835 vmovdqu `16*0-64`($ctx),%x#$T2
1837 vmovdqu `16*1-64`($ctx),%x#$T3
1838 vmovdqu `16*2-64`($ctx),%x#$T4
1839 vmovdqu `16*3-64`($ctx),%x#$D0
1840 vmovdqu `16*4-64`($ctx),%x#$D1
1841 vmovdqu `16*5-64`($ctx),%x#$D2
1842 lea 0x90(%rsp),%rax # size optimization
1843 vmovdqu `16*6-64`($ctx),%x#$D3
1844 vpermd $T2,$T0,$T2 # 00003412 -> 14243444
1845 vmovdqu `16*7-64`($ctx),%x#$D4
1847 vmovdqu `16*8-64`($ctx),%x#$MASK
1849 vmovdqa $T2,0x00(%rsp)
1851 vmovdqa $T3,0x20-0x90(%rax)
1853 vmovdqa $T4,0x40-0x90(%rax)
1855 vmovdqa $D0,0x60-0x90(%rax)
1857 vmovdqa $D1,0x80-0x90(%rax)
1859 vmovdqa $D2,0xa0-0x90(%rax)
1860 vpermd $MASK,$T0,$MASK
1861 vmovdqa $D3,0xc0-0x90(%rax)
1862 vmovdqa $D4,0xe0-0x90(%rax)
1863 vmovdqa $MASK,0x100-0x90(%rax)
1864 vmovdqa 64(%rcx),$MASK # .Lmask26
1866 ################################################################
1868 vmovdqu 16*0($inp),%x#$T0
1869 vmovdqu 16*1($inp),%x#$T1
1870 vinserti128 \$1,16*2($inp),$T0,$T0
1871 vinserti128 \$1,16*3($inp),$T1,$T1
1874 vpsrldq \$6,$T0,$T2 # splat input
1876 vpunpckhqdq $T1,$T0,$T4 # 4
1877 vpunpcklqdq $T3,$T2,$T2 # 2:3
1878 vpunpcklqdq $T1,$T0,$T0 # 0:1
1883 vpsrlq \$40,$T4,$T4 # 4
1884 vpand $MASK,$T2,$T2 # 2
1885 vpand $MASK,$T0,$T0 # 0
1886 vpand $MASK,$T1,$T1 # 1
1887 vpand $MASK,$T3,$T3 # 3
1888 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1890 vpaddq $H2,$T2,$H2 # accumulate input
1892 jz .Ltail_avx2$suffix
1893 jmp .Loop_avx2$suffix
1897 ################################################################
1898 # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
1899 # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
1900 # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
1901 # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
1902 # \________/\__________/
1903 ################################################################
1904 #vpaddq $H2,$T2,$H2 # accumulate input
1906 vmovdqa `32*0`(%rsp),$T0 # r0^4
1908 vmovdqa `32*1`(%rsp),$T1 # r1^4
1910 vmovdqa `32*3`(%rsp),$T2 # r2^4
1912 vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
1913 vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
1915 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1916 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1917 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1918 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1919 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1921 # however, as h2 is "chronologically" the first one available, we pull
1922 # the corresponding operations up, so it's
1924 # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
1925 # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
1926 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1927 # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
1928 # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
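#
# (illustrative note: h2 is the first limb finalized by the previous
# iteration's carry chain -- see the modulo-scheduled "vpaddq $T2,$H2,$H2"
# below -- so issuing its five products first shortens the critical path.)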
1930 vpmuludq $H2,$T0,$D2 # d2 = h2*r0
1931 vpmuludq $H2,$T1,$D3 # d3 = h2*r1
1932 vpmuludq $H2,$T2,$D4 # d4 = h2*r2
1933 vpmuludq $H2,$T3,$D0 # d0 = h2*s3
1934 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
1936 vpmuludq $H0,$T1,$T4 # h0*r1
1937 vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
1938 vpaddq $T4,$D1,$D1 # d1 += h0*r1
1939 vpaddq $H2,$D2,$D2 # d2 += h1*r1
1940 vpmuludq $H3,$T1,$T4 # h3*r1
1941 vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
1942 vpaddq $T4,$D4,$D4 # d4 += h3*r1
1943 vpaddq $H2,$D0,$D0 # d0 += h4*s1
1944 vmovdqa `32*4-0x90`(%rax),$T1 # s2
1946 vpmuludq $H0,$T0,$T4 # h0*r0
1947 vpmuludq $H1,$T0,$H2 # h1*r0
1948 vpaddq $T4,$D0,$D0 # d0 += h0*r0
1949 vpaddq $H2,$D1,$D1 # d1 += h1*r0
1950 vpmuludq $H3,$T0,$T4 # h3*r0
1951 vpmuludq $H4,$T0,$H2 # h4*r0
1952 vmovdqu 16*0($inp),%x#$T0 # load input
1953 vpaddq $T4,$D3,$D3 # d3 += h3*r0
1954 vpaddq $H2,$D4,$D4 # d4 += h4*r0
1955 vinserti128 \$1,16*2($inp),$T0,$T0
1957 vpmuludq $H3,$T1,$T4 # h3*s2
1958 vpmuludq $H4,$T1,$H2 # h4*s2
1959 vmovdqu 16*1($inp),%x#$T1
1960 vpaddq $T4,$D0,$D0 # d0 += h3*s2
1961 vpaddq $H2,$D1,$D1 # d1 += h4*s2
1962 vmovdqa `32*5-0x90`(%rax),$H2 # r3
1963 vpmuludq $H1,$T2,$T4 # h1*r2
1964 vpmuludq $H0,$T2,$T2 # h0*r2
1965 vpaddq $T4,$D3,$D3 # d3 += h1*r2
1966 vpaddq $T2,$D2,$D2 # d2 += h0*r2
1967 vinserti128 \$1,16*3($inp),$T1,$T1
1970 vpmuludq $H1,$H2,$T4 # h1*r3
1971 vpmuludq $H0,$H2,$H2 # h0*r3
1972 vpsrldq \$6,$T0,$T2 # splat input
1973 vpaddq $T4,$D4,$D4 # d4 += h1*r3
1974 vpaddq $H2,$D3,$D3 # d3 += h0*r3
1975 vpmuludq $H3,$T3,$T4 # h3*s3
1976 vpmuludq $H4,$T3,$H2 # h4*s3
1978 vpaddq $T4,$D1,$D1 # d1 += h3*s3
1979 vpaddq $H2,$D2,$D2 # d2 += h4*s3
1980 vpunpckhqdq $T1,$T0,$T4 # 4
1982 vpmuludq $H3,$S4,$H3 # h3*s4
1983 vpmuludq $H4,$S4,$H4 # h4*s4
1984 vpunpcklqdq $T1,$T0,$T0 # 0:1
1985 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
1986 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
1987 vpunpcklqdq $T3,$T2,$T3 # 2:3
1988 vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
1989 vpmuludq $H1,$S4,$H0 # h1*s4
1990 vmovdqa 64(%rcx),$MASK # .Lmask26
1991 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
1992 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
1994 ################################################################
1995 # lazy reduction (interleaved with tail of input splat)
1999 vpaddq $D3,$H4,$H4 # h3 -> h4
2003 vpaddq $D0,$D1,$H1 # h0 -> h1
2012 vpaddq $D1,$H2,$H2 # h1 -> h2
2016 vpaddq $D4,$H0,$H0 # h4 -> h0
2018 vpand $MASK,$T2,$T2 # 2
2023 vpaddq $D2,$H3,$H3 # h2 -> h3
2025 vpaddq $T2,$H2,$H2 # modulo-scheduled
2030 vpaddq $D0,$H1,$H1 # h0 -> h1
2032 vpsrlq \$40,$T4,$T4 # 4
2036 vpaddq $D3,$H4,$H4 # h3 -> h4
2038 vpand $MASK,$T0,$T0 # 0
2039 vpand $MASK,$T1,$T1 # 1
2040 vpand $MASK,$T3,$T3 # 3
2041 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
2044 jnz .Loop_avx2$suffix
2048 ################################################################
2049 # while the above multiplications were by r^4 in all lanes, in the last
2050 # iteration we multiply the least significant lane by r^4 and the most
2051 # significant one by r, so this is a copy of the above except that
2052 # references to the precomputed table are displaced by 4...
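#
# (illustrative detail: each table row is stored dword-interleaved, in
# memory order 4,4,4,3,4,2,4,1 by power, and vpmuludq reads only the even
# dwords, so the aligned loads in the loop above see r^4 in every lane
# while the +4-byte loads below see the r^4:r^3:r^2:r^1 tail multipliers.)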
2054 #vpaddq $H2,$T2,$H2 # accumulate input
2056 vmovdqu `32*0+4`(%rsp),$T0 # r0^4
2058 vmovdqu `32*1+4`(%rsp),$T1 # r1^4
2060 vmovdqu `32*3+4`(%rsp),$T2 # r2^4
2062 vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
2063 vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
2065 vpmuludq $H2,$T0,$D2 # d2 = h2*r0
2066 vpmuludq $H2,$T1,$D3 # d3 = h2*r1
2067 vpmuludq $H2,$T2,$D4 # d4 = h2*r2
2068 vpmuludq $H2,$T3,$D0 # d0 = h2*s3
2069 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2071 vpmuludq $H0,$T1,$T4 # h0*r1
2072 vpmuludq $H1,$T1,$H2 # h1*r1
2073 vpaddq $T4,$D1,$D1 # d1 += h0*r1
2074 vpaddq $H2,$D2,$D2 # d2 += h1*r1
2075 vpmuludq $H3,$T1,$T4 # h3*r1
2076 vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
2077 vpaddq $T4,$D4,$D4 # d4 += h3*r1
2078 vpaddq $H2,$D0,$D0 # d0 += h4*s1
2080 vpmuludq $H0,$T0,$T4 # h0*r0
2081 vpmuludq $H1,$T0,$H2 # h1*r0
2082 vpaddq $T4,$D0,$D0 # d0 += h0*r0
2083 vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
2084 vpaddq $H2,$D1,$D1 # d1 += h1*r0
2085 vpmuludq $H3,$T0,$T4 # h3*r0
2086 vpmuludq $H4,$T0,$H2 # h4*r0
2087 vpaddq $T4,$D3,$D3 # d3 += h3*r0
2088 vpaddq $H2,$D4,$D4 # d4 += h4*r0
2090 vpmuludq $H3,$T1,$T4 # h3*s2
2091 vpmuludq $H4,$T1,$H2 # h4*s2
2092 vpaddq $T4,$D0,$D0 # d0 += h3*s2
2093 vpaddq $H2,$D1,$D1 # d1 += h4*s2
2094 vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
2095 vpmuludq $H1,$T2,$T4 # h1*r2
2096 vpmuludq $H0,$T2,$T2 # h0*r2
2097 vpaddq $T4,$D3,$D3 # d3 += h1*r2
2098 vpaddq $T2,$D2,$D2 # d2 += h0*r2
2100 vpmuludq $H1,$H2,$T4 # h1*r3
2101 vpmuludq $H0,$H2,$H2 # h0*r3
2102 vpaddq $T4,$D4,$D4 # d4 += h1*r3
2103 vpaddq $H2,$D3,$D3 # d3 += h0*r3
2104 vpmuludq $H3,$T3,$T4 # h3*s3
2105 vpmuludq $H4,$T3,$H2 # h4*s3
2106 vpaddq $T4,$D1,$D1 # d1 += h3*s3
2107 vpaddq $H2,$D2,$D2 # d2 += h4*s3
2109 vpmuludq $H3,$S4,$H3 # h3*s4
2110 vpmuludq $H4,$S4,$H4 # h4*s4
2111 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
2112 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
2113 vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
2114 vpmuludq $H1,$S4,$H0 # h1*s4
2115 vmovdqa 64(%rcx),$MASK # .Lmask26
2116 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
2117 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
2119 ################################################################
2120 # horizontal addition
2133 vpermq \$0x2,$H3,$T3
2134 vpermq \$0x2,$H4,$T4
2135 vpermq \$0x2,$H0,$T0
2136 vpermq \$0x2,$D1,$T1
2137 vpermq \$0x2,$H2,$T2
2144 ################################################################
2149 vpaddq $D3,$H4,$H4 # h3 -> h4
2153 vpaddq $D0,$D1,$H1 # h0 -> h1
2160 vpaddq $D1,$H2,$H2 # h1 -> h2
2164 vpaddq $D4,$H0,$H0 # h4 -> h0
2168 vpaddq $D2,$H3,$H3 # h2 -> h3
2172 vpaddq $D0,$H1,$H1 # h0 -> h1
2176 vpaddq $D3,$H4,$H4 # h3 -> h4
2178 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2179 vmovd %x#$H1,`4*1-48-64`($ctx)
2180 vmovd %x#$H2,`4*2-48-64`($ctx)
2181 vmovd %x#$H3,`4*3-48-64`($ctx)
2182 vmovd %x#$H4,`4*4-48-64`($ctx)
2184 $code.=<<___ if ($win64);
2185 vmovdqa -0xb0(%r10),%xmm6
2186 vmovdqa -0xa0(%r10),%xmm7
2187 vmovdqa -0x90(%r10),%xmm8
2188 vmovdqa -0x80(%r10),%xmm9
2189 vmovdqa -0x70(%r10),%xmm10
2190 vmovdqa -0x60(%r10),%xmm11
2191 vmovdqa -0x50(%r10),%xmm12
2192 vmovdqa -0x40(%r10),%xmm13
2193 vmovdqa -0x30(%r10),%xmm14
2194 vmovdqa -0x20(%r10),%xmm15
2196 .Ldo_avx2_epilogue$suffix:
2198 $code.=<<___ if (!$win64);
2200 .cfi_def_cfa_register %rsp
2207 if($avx > 2 && $avx512) {
2208 my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
2209 my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
2210 my $PADBIT="%zmm30";
2212 map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
2213 map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
2214 map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
2215 map(s/%y/%z/,($MASK));
2223 $code.=<<___ if (!$win64);
2225 .cfi_def_cfa_register %r10
2228 $code.=<<___ if ($win64);
2231 vmovdqa %xmm6,-0xb0(%r10)
2232 vmovdqa %xmm7,-0xa0(%r10)
2233 vmovdqa %xmm8,-0x90(%r10)
2234 vmovdqa %xmm9,-0x80(%r10)
2235 vmovdqa %xmm10,-0x70(%r10)
2236 vmovdqa %xmm11,-0x60(%r10)
2237 vmovdqa %xmm12,-0x50(%r10)
2238 vmovdqa %xmm13,-0x40(%r10)
2239 vmovdqa %xmm14,-0x30(%r10)
2240 vmovdqa %xmm15,-0x20(%r10)
2244 lea .Lconst(%rip),%rcx
2245 lea 48+64($ctx),$ctx # size optimization
2246 vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2
2248 # expand pre-calculated table
2249 vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
2251 vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
2253 vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
2254 vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
2255 vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
2256 vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
2257 vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
2258 vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
2259 vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
2260 vpermd $D0,$T2,$R0 # 00003412 -> 14243444
2261 vpbroadcastq 64(%rcx),$MASK # .Lmask26
2265 vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0
2266 vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
2268 vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
2271 vmovdqa64 $S1,0x40(%rsp){%k2}
2274 vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
2276 vmovdqa64 $S2,0x80(%rsp){%k2}
2277 vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
2278 vmovdqa64 $S3,0xc0(%rsp){%k2}
2279 vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
2280 vmovdqa64 $S4,0x100(%rsp){%k2}
2282 ################################################################
2283 # calculate 5th through 8th powers of the key
2285 # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
2286 # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
2287 # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
2288 # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
2289 # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
2291 vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
2292 vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
2293 vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
2294 vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
2295 vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
2298 vpmuludq $T1,$S4,$M0
2299 vpmuludq $T1,$R0,$M1
2300 vpmuludq $T1,$R1,$M2
2301 vpmuludq $T1,$R2,$M3
2302 vpmuludq $T1,$R3,$M4
2304 vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
2305 vpaddq $M1,$D1,$D1 # d1 += r1'*r0
2306 vpaddq $M2,$D2,$D2 # d2 += r1'*r1
2307 vpaddq $M3,$D3,$D3 # d3 += r1'*r2
2308 vpaddq $M4,$D4,$D4 # d4 += r1'*r3
2310 vpmuludq $T2,$S3,$M0
2311 vpmuludq $T2,$S4,$M1
2312 vpmuludq $T2,$R1,$M3
2313 vpmuludq $T2,$R2,$M4
2314 vpmuludq $T2,$R0,$M2
2316 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
2317 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
2318 vpaddq $M3,$D3,$D3 # d3 += r2'*r1
2319 vpaddq $M4,$D4,$D4 # d4 += r2'*r2
2320 vpaddq $M2,$D2,$D2 # d2 += r2'*r0
2322 vpmuludq $T3,$S2,$M0
2323 vpmuludq $T3,$R0,$M3
2324 vpmuludq $T3,$R1,$M4
2325 vpmuludq $T3,$S3,$M1
2326 vpmuludq $T3,$S4,$M2
2327 vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
2328 vpaddq $M3,$D3,$D3 # d3 += r3'*r0
2329 vpaddq $M4,$D4,$D4 # d4 += r3'*r1
2330 vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
2331 vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
2333 vpmuludq $T4,$S4,$M3
2334 vpmuludq $T4,$R0,$M4
2335 vpmuludq $T4,$S1,$M0
2336 vpmuludq $T4,$S2,$M1
2337 vpmuludq $T4,$S3,$M2
2338 vpaddq $M3,$D3,$D3 # d3 += r4'*5*r4
2339 vpaddq $M4,$D4,$D4 # d4 += r4'*r0
2340 vpaddq $M0,$D0,$D0 # d0 += r4'*5*r1
2341 vpaddq $M1,$D1,$D1 # d1 += r4'*5*r2
2342 vpaddq $M2,$D2,$D2 # d2 += r4'*5*r3
2344 ################################################################
2346 vmovdqu64 16*0($inp),%z#$T3
2347 vmovdqu64 16*4($inp),%z#$T4
2350 ################################################################
2354 vpandq $MASK,$D3,$D3
2355 vpaddq $M3,$D4,$D4 # d3 -> d4
2358 vpandq $MASK,$D0,$D0
2359 vpaddq $M0,$D1,$D1 # d0 -> d1
2362 vpandq $MASK,$D4,$D4
2365 vpandq $MASK,$D1,$D1
2366 vpaddq $M1,$D2,$D2 # d1 -> d2
2370 vpaddq $M4,$D0,$D0 # d4 -> d0
2373 vpandq $MASK,$D2,$D2
2374 vpaddq $M2,$D3,$D3 # d2 -> d3
2377 vpandq $MASK,$D0,$D0
2378 vpaddq $M0,$D1,$D1 # d0 -> d1
2381 vpandq $MASK,$D3,$D3
2382 vpaddq $M3,$D4,$D4 # d3 -> d4
2384 ################################################################
2385 # at this point we have 14243444 in $R0-$S4 and 05060708 in
2386 # $D0-$D4, ...
2388 vpunpcklqdq $T4,$T3,$T0 # transpose input
2389 vpunpckhqdq $T4,$T3,$T4
2391 # ... since input 64-bit lanes are ordered as 73625140, we could
2392 # "vperm" it to 76543210 (here and in each loop iteration), *or*
2393 # we could just flow along, hence the goal for $R0-$S4 is
2394 # 1858286838784888 ...
2396 vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:
2400 vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
2406 vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
2407 vpermd $D1,$M0,${R1}{%k1}
2408 vpermd $D2,$M0,${R2}{%k1}
2409 vpermd $D3,$M0,${R3}{%k1}
2410 vpermd $D4,$M0,${R4}{%k1}
2412 vpslld \$2,$R1,$S1 # *5
2421 vpbroadcastq 32(%rcx),$PADBIT # .L129
2423 vpsrlq \$52,$T0,$T2 # splat input
2428 vpsrlq \$40,$T4,$T4 # 4
2429 vpandq $MASK,$T2,$T2 # 2
2430 vpandq $MASK,$T0,$T0 # 0
2431 #vpandq $MASK,$T1,$T1 # 1
2432 #vpandq $MASK,$T3,$T3 # 3
2433 #vporq $PADBIT,$T4,$T4 # padbit, yes, always
2435 vpaddq $H2,$T2,$H2 # accumulate input
2442 ################################################################
2443 # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
2444 # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
2445 # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
2446 # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
2447 # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
2448 # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
2449 # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
2450 # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
2451 # \________/\___________/
2452 ################################################################
2453 #vpaddq $H2,$T2,$H2 # accumulate input
2455 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
2456 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
2457 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
2458 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
2459 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
2461 # however, as h2 is "chronologically" the first one available, we pull
2462 # the corresponding operations up, so it's
2464 # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
2465 # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
2466 # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
2467 # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
2468 # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
2470 vpmuludq $H2,$R1,$D3 # d3 = h2*r1
2472 vpmuludq $H2,$R2,$D4 # d4 = h2*r2
2473 vpandq $MASK,$T1,$T1 # 1
2474 vpmuludq $H2,$S3,$D0 # d0 = h2*s3
2475 vpandq $MASK,$T3,$T3 # 3
2476 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2477 vporq $PADBIT,$T4,$T4 # padbit, yes, always
2478 vpmuludq $H2,$R0,$D2 # d2 = h2*r0
2479 vpaddq $H1,$T1,$H1 # accumulate input
2483 vmovdqu64 16*0($inp),$T3 # load input
2484 vmovdqu64 16*4($inp),$T4
2486 vpmuludq $H0,$R3,$M3
2487 vpmuludq $H0,$R4,$M4
2488 vpmuludq $H0,$R0,$M0
2489 vpmuludq $H0,$R1,$M1
2490 vpaddq $M3,$D3,$D3 # d3 += h0*r3
2491 vpaddq $M4,$D4,$D4 # d4 += h0*r4
2492 vpaddq $M0,$D0,$D0 # d0 += h0*r0
2493 vpaddq $M1,$D1,$D1 # d1 += h0*r1
2495 vpmuludq $H1,$R2,$M3
2496 vpmuludq $H1,$R3,$M4
2497 vpmuludq $H1,$S4,$M0
2498 vpmuludq $H0,$R2,$M2
2499 vpaddq $M3,$D3,$D3 # d3 += h1*r2
2500 vpaddq $M4,$D4,$D4 # d4 += h1*r3
2501 vpaddq $M0,$D0,$D0 # d0 += h1*s4
2502 vpaddq $M2,$D2,$D2 # d2 += h0*r2
2504 vpunpcklqdq $T4,$T3,$T0 # transpose input
2505 vpunpckhqdq $T4,$T3,$T4
2507 vpmuludq $H3,$R0,$M3
2508 vpmuludq $H3,$R1,$M4
2509 vpmuludq $H1,$R0,$M1
2510 vpmuludq $H1,$R1,$M2
2511 vpaddq $M3,$D3,$D3 # d3 += h3*r0
2512 vpaddq $M4,$D4,$D4 # d4 += h3*r1
2513 vpaddq $M1,$D1,$D1 # d1 += h1*r0
2514 vpaddq $M2,$D2,$D2 # d2 += h1*r1
2516 vpmuludq $H4,$S4,$M3
2517 vpmuludq $H4,$R0,$M4
2518 vpmuludq $H3,$S2,$M0
2519 vpmuludq $H3,$S3,$M1
2520 vpaddq $M3,$D3,$D3 # d3 += h4*s4
2521 vpmuludq $H3,$S4,$M2
2522 vpaddq $M4,$D4,$D4 # d4 += h4*r0
2523 vpaddq $M0,$D0,$D0 # d0 += h3*s2
2524 vpaddq $M1,$D1,$D1 # d1 += h3*s3
2525 vpaddq $M2,$D2,$D2 # d2 += h3*s4
2527 vpmuludq $H4,$S1,$M0
2528 vpmuludq $H4,$S2,$M1
2529 vpmuludq $H4,$S3,$M2
2530 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2531 vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2532 vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
2534 ################################################################
2535 # lazy reduction (interleaved with input splat)
2537 vpsrlq \$52,$T0,$T2 # splat input
2541 vpandq $MASK,$D3,$D3
2542 vpaddq $H3,$D4,$H4 # h3 -> h4
2547 vpandq $MASK,$H0,$H0
2548 vpaddq $D0,$H1,$H1 # h0 -> h1
2550 vpandq $MASK,$T2,$T2 # 2
2553 vpandq $MASK,$H4,$H4
2556 vpandq $MASK,$H1,$H1
2557 vpaddq $D1,$H2,$H2 # h1 -> h2
2561 vpaddq $D4,$H0,$H0 # h4 -> h0
2563 vpaddq $T2,$H2,$H2 # modulo-scheduled
2567 vpandq $MASK,$H2,$H2
2568 vpaddq $D2,$D3,$H3 # h2 -> h3
2573 vpandq $MASK,$H0,$H0
2574 vpaddq $D0,$H1,$H1 # h0 -> h1
2576 vpsrlq \$40,$T4,$T4 # 4
2579 vpandq $MASK,$H3,$H3
2580 vpaddq $D3,$H4,$H4 # h3 -> h4
2582 vpandq $MASK,$T0,$T0 # 0
2583 #vpandq $MASK,$T1,$T1 # 1
2584 #vpandq $MASK,$T3,$T3 # 3
2585 #vporq $PADBIT,$T4,$T4 # padbit, yes, always
2591 ################################################################
2592 # while the above multiplications were by r^8 in all lanes, in the last
2593 # iteration we multiply the least significant lane by r^8 and the most
2594 # significant one by r, which is why the table gets shifted...
2596 vpsrlq \$32,$R0,$R0 # 0105020603070408
2606 ################################################################
2607 # load either next or last 64 byte of input
2608 lea ($inp,$len),$inp
2610 #vpaddq $H2,$T2,$H2 # accumulate input
2613 vpmuludq $H2,$R1,$D3 # d3 = h2*r1
2614 vpmuludq $H2,$R2,$D4 # d4 = h2*r2
2615 vpmuludq $H2,$S3,$D0 # d0 = h2*s3
2616 vpandq $MASK,$T1,$T1 # 1
2617 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2618 vpandq $MASK,$T3,$T3 # 3
2619 vpmuludq $H2,$R0,$D2 # d2 = h2*r0
2620 vporq $PADBIT,$T4,$T4 # padbit, yes, always
2621 vpaddq $H1,$T1,$H1 # accumulate input
2625 vmovdqu 16*0($inp),%x#$T0
2626 vpmuludq $H0,$R3,$M3
2627 vpmuludq $H0,$R4,$M4
2628 vpmuludq $H0,$R0,$M0
2629 vpmuludq $H0,$R1,$M1
2630 vpaddq $M3,$D3,$D3 # d3 += h0*r3
2631 vpaddq $M4,$D4,$D4 # d4 += h0*r4
2632 vpaddq $M0,$D0,$D0 # d0 += h0*r0
2633 vpaddq $M1,$D1,$D1 # d1 += h0*r1
2635 vmovdqu 16*1($inp),%x#$T1
2636 vpmuludq $H1,$R2,$M3
2637 vpmuludq $H1,$R3,$M4
2638 vpmuludq $H1,$S4,$M0
2639 vpmuludq $H0,$R2,$M2
2640 vpaddq $M3,$D3,$D3 # d3 += h1*r2
2641 vpaddq $M4,$D4,$D4 # d4 += h1*r3
2642 vpaddq $M0,$D0,$D0 # d0 += h1*s4
2643 vpaddq $M2,$D2,$D2 # d2 += h0*r2
2645 vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
2646 vpmuludq $H3,$R0,$M3
2647 vpmuludq $H3,$R1,$M4
2648 vpmuludq $H1,$R0,$M1
2649 vpmuludq $H1,$R1,$M2
2650 vpaddq $M3,$D3,$D3 # d3 += h3*r0
2651 vpaddq $M4,$D4,$D4 # d4 += h3*r1
2652 vpaddq $M1,$D1,$D1 # d1 += h1*r0
2653 vpaddq $M2,$D2,$D2 # d2 += h1*r1
2655 vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
2656 vpmuludq $H4,$S4,$M3
2657 vpmuludq $H4,$R0,$M4
2658 vpmuludq $H3,$S2,$M0
2659 vpmuludq $H3,$S3,$M1
2660 vpmuludq $H3,$S4,$M2
2661 vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
2662 vpaddq $M4,$D4,$D4 # d4 += h4*r0
2663 vpaddq $M0,$D0,$D0 # d0 += h3*s2
2664 vpaddq $M1,$D1,$D1 # d1 += h3*s3
2665 vpaddq $M2,$D2,$D2 # d2 += h3*s4
2667 vpmuludq $H4,$S1,$M0
2668 vpmuludq $H4,$S2,$M1
2669 vpmuludq $H4,$S3,$M2
2670 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2671 vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2672 vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
2674 ################################################################
2675 # horizontal addition
2678 vpermq \$0xb1,$H3,$D3
2679 vpermq \$0xb1,$D4,$H4
2680 vpermq \$0xb1,$H0,$D0
2681 vpermq \$0xb1,$H1,$D1
2682 vpermq \$0xb1,$H2,$D2
2690 vpermq \$0x2,$H3,$D3
2691 vpermq \$0x2,$H4,$D4
2692 vpermq \$0x2,$H0,$D0
2693 vpermq \$0x2,$H1,$D1
2694 vpermq \$0x2,$H2,$D2
2701 vextracti64x4 \$0x1,$H3,%y#$D3
2702 vextracti64x4 \$0x1,$H4,%y#$D4
2703 vextracti64x4 \$0x1,$H0,%y#$D0
2704 vextracti64x4 \$0x1,$H1,%y#$D1
2705 vextracti64x4 \$0x1,$H2,%y#$D2
2706 vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
2707 vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
2708 vpaddq $D0,$H0,${H0}{%k3}{z}
2709 vpaddq $D1,$H1,${H1}{%k3}{z}
2710 vpaddq $D2,$H2,${H2}{%k3}{z}
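# Lane folding above, spelled out (illustrative): vpermq \$0xb1 swaps
# neighbouring qwords, vpermq \$0x2 folds the 128-bit halves of each
# 256-bit chunk, and vextracti64x4 brings the upper 256 bits down, so the
# three add rounds sum all eight qword lanes into lane 0; the {%k3}{z}
# masking keeps just that qword (zeroing the rest) in case the result is
# handed to .Ltail_avx2.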
2712 map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
2713 map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
2715 ################################################################
2716 # lazy reduction (interleaved with input splat)
2720 vpsrldq \$6,$T0,$T2 # splat input
2722 vpunpckhqdq $T1,$T0,$T4 # 4
2723 vpaddq $D3,$H4,$H4 # h3 -> h4
2727 vpunpcklqdq $T3,$T2,$T2 # 2:3
2728 vpunpcklqdq $T1,$T0,$T0 # 0:1
2729 vpaddq $D0,$H1,$H1 # h0 -> h1
2738 vpaddq $D1,$H2,$H2 # h1 -> h2
2743 vpsrlq \$40,$T4,$T4 # 4
2744 vpaddq $D4,$H0,$H0 # h4 -> h0
2748 vpand $MASK,$T2,$T2 # 2
2749 vpand $MASK,$T0,$T0 # 0
2750 vpaddq $D2,$H3,$H3 # h2 -> h3
2754 vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
2755 vpand $MASK,$T1,$T1 # 1
2756 vpaddq $D0,$H1,$H1 # h0 -> h1
2760 vpand $MASK,$T3,$T3 # 3
2761 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
2762 vpaddq $D3,$H4,$H4 # h3 -> h4
2764 lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
2766 jnz .Ltail_avx2$suffix
2768 vpsubq $T2,$H2,$H2 # undo input accumulation
2769 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2770 vmovd %x#$H1,`4*1-48-64`($ctx)
2771 vmovd %x#$H2,`4*2-48-64`($ctx)
2772 vmovd %x#$H3,`4*3-48-64`($ctx)
2773 vmovd %x#$H4,`4*4-48-64`($ctx)
2776 $code.=<<___ if ($win64);
2777 movdqa -0xb0(%r10),%xmm6
2778 movdqa -0xa0(%r10),%xmm7
2779 movdqa -0x90(%r10),%xmm8
2780 movdqa -0x80(%r10),%xmm9
2781 movdqa -0x70(%r10),%xmm10
2782 movdqa -0x60(%r10),%xmm11
2783 movdqa -0x50(%r10),%xmm12
2784 movdqa -0x40(%r10),%xmm13
2785 movdqa -0x30(%r10),%xmm14
2786 movdqa -0x20(%r10),%xmm15
2788 .Ldo_avx512_epilogue:
2790 $code.=<<___ if (!$win64);
2792 .cfi_def_cfa_register %rsp
2803 &declare_function("poly1305_blocks_avx2", 32, 4);
2804 poly1305_blocks_avxN(0);
2805 &end_function("poly1305_blocks_avx2");
2807 #######################################################################
2809 # On entry the input length is divisible by 64. But since the inner loop
2810 # processes 128 bytes per iteration, cases when the length is not divisible
2811 # by 128 are handled by passing the tail 64 bytes to .Ltail_avx2. For this
2812 # reason the stack layout is kept identical to poly1305_blocks_avx2. If not
2813 # for this tail, we wouldn't even have to allocate a stack frame...
2816 $code .= "#ifdef CONFIG_AS_AVX512\n";
2819 &declare_function("poly1305_blocks_avx512", 32, 4);
2820 poly1305_blocks_avxN(1);
2821 &end_function("poly1305_blocks_avx512");
2824 $code .= "#endif\n";
2827 if (!$kernel && $avx>3) {
2828 ########################################################################
2829 # VPMADD52 version using 2^44 radix.
2831 # One can argue that base 2^52 would be more natural. Well, even though
2832 # some operations would be more natural, one has to recognize a couple of
2833 # things. Base 2^52 provides no advantage over base 2^44 if you look at
2834 # the number of multiply-and-accumulate operations. Secondly, it makes it
2835 # impossible to pre-compute multiples of 5 [referred to as s[]/sN in
2836 # reference implementations], which means that more such operations would
2837 # have to be performed in the inner loop, which in turn makes the critical
2838 # path longer. In other words, even though base 2^44 reduction might
2839 # look less elegant, the overall critical path is actually shorter...
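#
# Illustration (not part of the generated code): with 44/44/42-bit limbs
# at weights 2^0/2^44/2^88, a product term such as h2*r2 sits at weight
# 2^176 = 2^130 * 2^46, and since 2^130 = 5 (mod 2^130-5) it folds back
# as 5*2^2 = 20 times the product at the 2^44 digit -- hence the stored
# s[] = 20*r[] in the layout below.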
2841 ########################################################################
2842 # The layout of the opaque area is as follows.
2844 # unsigned __int64 h[3]; # current hash value base 2^44
2845 # unsigned __int64 s[2]; # key value*20 base 2^44
2846 # unsigned __int64 r[3]; # key value base 2^44
2847 # struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
2848 # # r^n positions reflect
2849 # # placement in register, not
2850 # # memory, R[3] is R[1]*20
2853 .type poly1305_init_base2_44,\@function,3
2855 poly1305_init_base2_44:
2857 mov %rax,0($ctx) # initialize hash value
2862 lea poly1305_blocks_vpmadd52(%rip),%r10
2863 lea poly1305_emit_base2_44(%rip),%r11
2865 mov \$0x0ffffffc0fffffff,%rax
2866 mov \$0x0ffffffc0ffffffc,%rcx
2868 mov \$0x00000fffffffffff,%r8
2870 mov \$0x00000fffffffffff,%r9
2873 mov %r8,40($ctx) # r0
2876 mov %rax,48($ctx) # r1
2877 lea (%rax,%rax,4),%rax # *5
2878 mov %rcx,56($ctx) # r2
2879 shl \$2,%rax # magic <<2
2880 lea (%rcx,%rcx,4),%rcx # *5
2881 shl \$2,%rcx # magic <<2
2882 mov %rax,24($ctx) # s1
2883 mov %rcx,32($ctx) # s2
2884 movq \$-1,64($ctx) # write impossible value
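	#
	# a hedged C equivalent of the clamping and of the 44/44/42-bit
	# re-split performed above (r_lo/r_hi stand for the two key
	# qwords; the names are illustrative):
	#
	#	uint64_t mask44 = ((uint64_t)1<<44) - 1;
	#	r_lo &= 0x0ffffffc0fffffffULL;	/* standard Poly1305 clamp */
	#	r_hi &= 0x0ffffffc0ffffffcULL;
	#	uint64_t r0 = r_lo & mask44;			/* bits  0..43  */
	#	uint64_t r1 = (r_lo>>44) | ((r_hi&0xffffff)<<20); /* bits 44..87  */
	#	uint64_t r2 = r_hi>>24;				/* bits 88..127 */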
2886 $code.=<<___ if ($flavour !~ /elf32/);
2890 $code.=<<___ if ($flavour =~ /elf32/);
2897 .size poly1305_init_base2_44,.-poly1305_init_base2_44
2900 my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
2901 my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
2902 my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
2905 .type poly1305_blocks_vpmadd52,\@function,4
2907 poly1305_blocks_vpmadd52:
2909 jz .Lno_data_vpmadd52 # too short
	mov	64($ctx),%r8			# peek at the power of the key
	# if powers of the key have not been calculated yet, process up
	# to 3 blocks with this single-block subroutine, otherwise ensure
	# that the length is divisible by 2 blocks and pass the rest down
	# to the next level...
	cmp	\$4,$len			# is input long enough?
2923 test %r8,%r8 # is power value impossible?
2926 and $len,%rax # is input of favourable length?
2927 jz .Lblocks_vpmadd52_4x
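	#
	# in C terms the dispatch above is roughly (a sketch; power_ok
	# stands for the sign test on 64($ctx)):
	#
	#	mask = power_ok ? 1 : (len >= 4 ? 1 : 3);
	#	if ((len & mask) == 0) goto blocks_vpmadd52_4x;
	#	/* otherwise run len & mask blocks through the loop below */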
2933 lea .L2_44_inp_permd(%rip),%r10
2936 vmovq $padbit,%x#$PAD
2937 vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
2938 vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
2939 vpermq \$0xcf,$PAD,$PAD
2940 vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
2942 vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
2943 vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
2944 vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
2945 vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
2947 vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
2948 vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
2954 vmovdqu32 0($inp),%x#$T0 # load input as ----3210
2957 vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
2958 vpsrlvq $inp_shift,$T0,$T0
2959 vpandq $reduc_mask,$T0,$T0
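	#
	# scalar sketch of the splat above: the 16-byte block (as two
	# little-endian qwords b0:b1) is split into base 2^44 limbs, the
	# padbit landing at bit 40 of the top limb (bit 128 overall):
	#
	#	t0 = b0 & mask44;			/* bits  0..43  */
	#	t1 = ((b0>>44) | (b1<<20)) & mask44;	/* bits 44..87  */
	#	t2 = (b1>>24) | (padbit<<40);		/* bits 88..129 */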
2962 vpaddq $T0,$Dlo,$Dlo # accumulate input
2964 vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
2965 vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
2966 vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
2968 vpxord $Dlo,$Dlo,$Dlo
2969 vpxord $Dhi,$Dhi,$Dhi
2971 vpmadd52luq $r2r1r0,$H0,$Dlo
2972 vpmadd52huq $r2r1r0,$H0,$Dhi
2974 vpmadd52luq $r1r0s2,$H1,$Dlo
2975 vpmadd52huq $r1r0s2,$H1,$Dhi
2977 vpmadd52luq $r0s2s1,$H2,$Dlo
2978 vpmadd52huq $r0s2s1,$H2,$Dhi
2980 vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
2981 vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
2982 vpandq $reduc_mask,$Dlo,$Dlo
2984 vpaddq $T0,$Dhi,$Dhi
2986 vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
2988 vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
	vpsrlvq	$reduc_rght,$Dlo,$T0		# 0 in topmost qword
2991 vpandq $reduc_mask,$Dlo,$Dlo
2993 vpermq \$0b10010011,$T0,$T0
2995 vpaddq $T0,$Dlo,$Dlo
2997 vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
2999 vpaddq $T0,$Dlo,$Dlo
3002 vpaddq $T0,$Dlo,$Dlo
3007 vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
3010 jnz .Lblocks_vpmadd52_4x
3014 .size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
3018 ########################################################################
# As implied by its name, the 4x subroutine processes 4 blocks in
# parallel (but it also handles lengths of 4*n+2 blocks). It takes up to
# the 4th key power, and data is handled in 256-bit %ymm registers.
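#
# Schematically, with powers r^1..r^4 pre-computed, each iteration folds
# four message blocks m0..m3 as (a sketch of the math, mod 2^130-5):
#
#	h = (h + m0)*r^4 + m1*r^3 + m2*r^2 + m3*r
#
# so the four lanes stay independent until the final "horizontal
# addition".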
3023 my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
3024 my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
3025 my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
3028 .type poly1305_blocks_vpmadd52_4x,\@function,4
3030 poly1305_blocks_vpmadd52_4x:
3032 jz .Lno_data_vpmadd52_4x # too short
	mov	64($ctx),%r8			# peek at the power of the key
3037 .Lblocks_vpmadd52_4x:
3038 vpbroadcastq $padbit,$PAD
3040 vmovdqa64 .Lx_mask44(%rip),$mask44
3042 vmovdqa64 .Lx_mask42(%rip),$mask42
3043 kmovw %eax,%k1 # used in 2x path
3045 test %r8,%r8 # is power value impossible?
3046 js .Linit_vpmadd52 # if it is, then init R[4]
3048 vmovq 0($ctx),%x#$H0 # load current hash value
3049 vmovq 8($ctx),%x#$H1
3050 vmovq 16($ctx),%x#$H2
3052 test \$3,$len # is length 4*n+2?
3053 jnz .Lblocks_vpmadd52_2x_do
3055 .Lblocks_vpmadd52_4x_do:
3056 vpbroadcastq 64($ctx),$R0 # load 4th power of the key
3057 vpbroadcastq 96($ctx),$R1
3058 vpbroadcastq 128($ctx),$R2
3059 vpbroadcastq 160($ctx),$S1
3061 .Lblocks_vpmadd52_4x_key_loaded:
3062 vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3066 test \$7,$len # is len 8*n?
3067 jz .Lblocks_vpmadd52_8x
3069 vmovdqu64 16*0($inp),$T2 # load data
3070 vmovdqu64 16*2($inp),$T3
3073 vpunpcklqdq $T3,$T2,$T1 # transpose data
3074 vpunpckhqdq $T3,$T2,$T3
3076 # at this point 64-bit lanes are ordered as 3-1-2-0
3078 vpsrlq \$24,$T3,$T2 # splat the data
3080 vpaddq $T2,$H2,$H2 # accumulate input
3081 vpandq $mask44,$T1,$T0
3085 vpandq $mask44,$T1,$T1
3088 jz .Ltail_vpmadd52_4x
3089 jmp .Loop_vpmadd52_4x
3094 vmovq 24($ctx),%x#$S1 # load key
3095 vmovq 56($ctx),%x#$H2
3096 vmovq 32($ctx),%x#$S2
3097 vmovq 40($ctx),%x#$R0
3098 vmovq 48($ctx),%x#$R1
3106 .Lmul_init_vpmadd52:
3107 vpxorq $D0lo,$D0lo,$D0lo
3108 vpmadd52luq $H2,$S1,$D0lo
3109 vpxorq $D0hi,$D0hi,$D0hi
3110 vpmadd52huq $H2,$S1,$D0hi
3111 vpxorq $D1lo,$D1lo,$D1lo
3112 vpmadd52luq $H2,$S2,$D1lo
3113 vpxorq $D1hi,$D1hi,$D1hi
3114 vpmadd52huq $H2,$S2,$D1hi
3115 vpxorq $D2lo,$D2lo,$D2lo
3116 vpmadd52luq $H2,$R0,$D2lo
3117 vpxorq $D2hi,$D2hi,$D2hi
3118 vpmadd52huq $H2,$R0,$D2hi
3120 vpmadd52luq $H0,$R0,$D0lo
3121 vpmadd52huq $H0,$R0,$D0hi
3122 vpmadd52luq $H0,$R1,$D1lo
3123 vpmadd52huq $H0,$R1,$D1hi
3124 vpmadd52luq $H0,$R2,$D2lo
3125 vpmadd52huq $H0,$R2,$D2hi
3127 vpmadd52luq $H1,$S2,$D0lo
3128 vpmadd52huq $H1,$S2,$D0hi
3129 vpmadd52luq $H1,$R0,$D1lo
3130 vpmadd52huq $H1,$R0,$D1hi
3131 vpmadd52luq $H1,$R1,$D2lo
3132 vpmadd52huq $H1,$R1,$D2hi
	################################################################
	# partial reduction
3136 vpsrlq \$44,$D0lo,$tmp
3137 vpsllq \$8,$D0hi,$D0hi
3138 vpandq $mask44,$D0lo,$H0
3139 vpaddq $tmp,$D0hi,$D0hi
3141 vpaddq $D0hi,$D1lo,$D1lo
3143 vpsrlq \$44,$D1lo,$tmp
3144 vpsllq \$8,$D1hi,$D1hi
3145 vpandq $mask44,$D1lo,$H1
3146 vpaddq $tmp,$D1hi,$D1hi
3148 vpaddq $D1hi,$D2lo,$D2lo
3150 vpsrlq \$42,$D2lo,$tmp
3151 vpsllq \$10,$D2hi,$D2hi
3152 vpandq $mask42,$D2lo,$H2
3153 vpaddq $tmp,$D2hi,$D2hi
3155 vpaddq $D2hi,$H0,$H0
3156 vpsllq \$2,$D2hi,$D2hi
3158 vpaddq $D2hi,$H0,$H0
3160 vpsrlq \$44,$H0,$tmp # additional step
3161 vpandq $mask44,$H0,$H0
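	#
	# for reference, a scalar C model of the partial reduction above
	# (d*lo/d*hi are the 52-bit low/high product halves delivered by
	# vpmadd52; names are illustrative):
	#
	#	uint64_t c0 = (d0hi<<8)  + (d0lo>>44); h0 = d0lo & mask44; d1lo += c0;
	#	uint64_t c1 = (d1hi<<8)  + (d1lo>>44); h1 = d1lo & mask44; d2lo += c1;
	#	uint64_t c2 = (d2hi<<10) + (d2lo>>42); h2 = d2lo & mask42;
	#	h0 += c2*5;			/* as c2 + (c2<<2), since 2^130 == 5 */
	#	h1 += h0>>44; h0 &= mask44;	/* additional step */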
3166 jz .Ldone_init_vpmadd52
3168 vpunpcklqdq $R1,$H1,$R1 # 1,2
3169 vpbroadcastq %x#$H1,%x#$H1 # 2,2
3170 vpunpcklqdq $R2,$H2,$R2
3171 vpbroadcastq %x#$H2,%x#$H2
3172 vpunpcklqdq $R0,$H0,$R0
3173 vpbroadcastq %x#$H0,%x#$H0
3175 vpsllq \$2,$R1,$S1 # S1 = R1*5*4
3176 vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3182 jmp .Lmul_init_vpmadd52
3186 .Ldone_init_vpmadd52:
3187 vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
3188 vinserti128 \$1,%x#$R2,$H2,$R2
3189 vinserti128 \$1,%x#$R0,$H0,$R0
3191 vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
3192 vpermq \$0b11011000,$R2,$R2
3193 vpermq \$0b11011000,$R0,$R0
3195 vpsllq \$2,$R1,$S1 # S1 = R1*5*4
3199 vmovq 0($ctx),%x#$H0 # load current hash value
3200 vmovq 8($ctx),%x#$H1
3201 vmovq 16($ctx),%x#$H2
3203 test \$3,$len # is length 4*n+2?
3204 jnz .Ldone_init_vpmadd52_2x
3206 vmovdqu64 $R0,64($ctx) # save key powers
3207 vpbroadcastq %x#$R0,$R0 # broadcast 4th power
3208 vmovdqu64 $R1,96($ctx)
3209 vpbroadcastq %x#$R1,$R1
3210 vmovdqu64 $R2,128($ctx)
3211 vpbroadcastq %x#$R2,$R2
3212 vmovdqu64 $S1,160($ctx)
3213 vpbroadcastq %x#$S1,$S1
3215 jmp .Lblocks_vpmadd52_4x_key_loaded
3219 .Ldone_init_vpmadd52_2x:
3220 vmovdqu64 $R0,64($ctx) # save key powers
3221 vpsrldq \$8,$R0,$R0 # 0-1-0-2
3222 vmovdqu64 $R1,96($ctx)
3224 vmovdqu64 $R2,128($ctx)
3226 vmovdqu64 $S1,160($ctx)
3228 jmp .Lblocks_vpmadd52_2x_key_loaded
3232 .Lblocks_vpmadd52_2x_do:
3233 vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
3234 vmovdqu64 160+8($ctx),${S1}{%k1}{z}
3235 vmovdqu64 64+8($ctx),${R0}{%k1}{z}
3236 vmovdqu64 96+8($ctx),${R1}{%k1}{z}
3238 .Lblocks_vpmadd52_2x_key_loaded:
3239 vmovdqu64 16*0($inp),$T2 # load data
3243 vpunpcklqdq $T3,$T2,$T1 # transpose data
3244 vpunpckhqdq $T3,$T2,$T3
3246 # at this point 64-bit lanes are ordered as x-1-x-0
3248 vpsrlq \$24,$T3,$T2 # splat the data
3250 vpaddq $T2,$H2,$H2 # accumulate input
3251 vpandq $mask44,$T1,$T0
3255 vpandq $mask44,$T1,$T1
3257 jmp .Ltail_vpmadd52_2x
3262 #vpaddq $T2,$H2,$H2 # accumulate input
3266 vpxorq $D0lo,$D0lo,$D0lo
3267 vpmadd52luq $H2,$S1,$D0lo
3268 vpxorq $D0hi,$D0hi,$D0hi
3269 vpmadd52huq $H2,$S1,$D0hi
3270 vpxorq $D1lo,$D1lo,$D1lo
3271 vpmadd52luq $H2,$S2,$D1lo
3272 vpxorq $D1hi,$D1hi,$D1hi
3273 vpmadd52huq $H2,$S2,$D1hi
3274 vpxorq $D2lo,$D2lo,$D2lo
3275 vpmadd52luq $H2,$R0,$D2lo
3276 vpxorq $D2hi,$D2hi,$D2hi
3277 vpmadd52huq $H2,$R0,$D2hi
3279 vmovdqu64 16*0($inp),$T2 # load data
3280 vmovdqu64 16*2($inp),$T3
3282 vpmadd52luq $H0,$R0,$D0lo
3283 vpmadd52huq $H0,$R0,$D0hi
3284 vpmadd52luq $H0,$R1,$D1lo
3285 vpmadd52huq $H0,$R1,$D1hi
3286 vpmadd52luq $H0,$R2,$D2lo
3287 vpmadd52huq $H0,$R2,$D2hi
3289 vpunpcklqdq $T3,$T2,$T1 # transpose data
3290 vpunpckhqdq $T3,$T2,$T3
3291 vpmadd52luq $H1,$S2,$D0lo
3292 vpmadd52huq $H1,$S2,$D0hi
3293 vpmadd52luq $H1,$R0,$D1lo
3294 vpmadd52huq $H1,$R0,$D1hi
3295 vpmadd52luq $H1,$R1,$D2lo
3296 vpmadd52huq $H1,$R1,$D2hi
3298 ################################################################
3299 # partial reduction (interleaved with data splat)
3300 vpsrlq \$44,$D0lo,$tmp
3301 vpsllq \$8,$D0hi,$D0hi
3302 vpandq $mask44,$D0lo,$H0
3303 vpaddq $tmp,$D0hi,$D0hi
3307 vpaddq $D0hi,$D1lo,$D1lo
3309 vpsrlq \$44,$D1lo,$tmp
3310 vpsllq \$8,$D1hi,$D1hi
3311 vpandq $mask44,$D1lo,$H1
3312 vpaddq $tmp,$D1hi,$D1hi
3314 vpandq $mask44,$T1,$T0
3317 vpaddq $D1hi,$D2lo,$D2lo
3319 vpsrlq \$42,$D2lo,$tmp
3320 vpsllq \$10,$D2hi,$D2hi
3321 vpandq $mask42,$D2lo,$H2
3322 vpaddq $tmp,$D2hi,$D2hi
3324 vpaddq $T2,$H2,$H2 # accumulate input
3325 vpaddq $D2hi,$H0,$H0
3326 vpsllq \$2,$D2hi,$D2hi
3328 vpaddq $D2hi,$H0,$H0
3330 vpandq $mask44,$T1,$T1
3332 vpsrlq \$44,$H0,$tmp # additional step
3333 vpandq $mask44,$H0,$H0
3337 sub \$4,$len # len-=64
3338 jnz .Loop_vpmadd52_4x
3341 vmovdqu64 128($ctx),$R2 # load all key powers
3342 vmovdqu64 160($ctx),$S1
3343 vmovdqu64 64($ctx),$R0
3344 vmovdqu64 96($ctx),$R1
3347 vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3351 #vpaddq $T2,$H2,$H2 # accumulate input
3355 vpxorq $D0lo,$D0lo,$D0lo
3356 vpmadd52luq $H2,$S1,$D0lo
3357 vpxorq $D0hi,$D0hi,$D0hi
3358 vpmadd52huq $H2,$S1,$D0hi
3359 vpxorq $D1lo,$D1lo,$D1lo
3360 vpmadd52luq $H2,$S2,$D1lo
3361 vpxorq $D1hi,$D1hi,$D1hi
3362 vpmadd52huq $H2,$S2,$D1hi
3363 vpxorq $D2lo,$D2lo,$D2lo
3364 vpmadd52luq $H2,$R0,$D2lo
3365 vpxorq $D2hi,$D2hi,$D2hi
3366 vpmadd52huq $H2,$R0,$D2hi
3368 vpmadd52luq $H0,$R0,$D0lo
3369 vpmadd52huq $H0,$R0,$D0hi
3370 vpmadd52luq $H0,$R1,$D1lo
3371 vpmadd52huq $H0,$R1,$D1hi
3372 vpmadd52luq $H0,$R2,$D2lo
3373 vpmadd52huq $H0,$R2,$D2hi
3375 vpmadd52luq $H1,$S2,$D0lo
3376 vpmadd52huq $H1,$S2,$D0hi
3377 vpmadd52luq $H1,$R0,$D1lo
3378 vpmadd52huq $H1,$R0,$D1hi
3379 vpmadd52luq $H1,$R1,$D2lo
3380 vpmadd52huq $H1,$R1,$D2hi
3382 ################################################################
3383 # horizontal addition
3387 vpsrldq \$8,$D0lo,$T0
3388 vpsrldq \$8,$D0hi,$H0
3389 vpsrldq \$8,$D1lo,$T1
3390 vpsrldq \$8,$D1hi,$H1
3391 vpaddq $T0,$D0lo,$D0lo
3392 vpaddq $H0,$D0hi,$D0hi
3393 vpsrldq \$8,$D2lo,$T2
3394 vpsrldq \$8,$D2hi,$H2
3395 vpaddq $T1,$D1lo,$D1lo
3396 vpaddq $H1,$D1hi,$D1hi
3397 vpermq \$0x2,$D0lo,$T0
3398 vpermq \$0x2,$D0hi,$H0
3399 vpaddq $T2,$D2lo,$D2lo
3400 vpaddq $H2,$D2hi,$D2hi
3402 vpermq \$0x2,$D1lo,$T1
3403 vpermq \$0x2,$D1hi,$H1
3404 vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
3405 vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
3406 vpermq \$0x2,$D2lo,$T2
3407 vpermq \$0x2,$D2hi,$H2
3408 vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
3409 vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
3410 vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
3411 vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
	################################################################
	# partial reduction
3415 vpsrlq \$44,$D0lo,$tmp
3416 vpsllq \$8,$D0hi,$D0hi
3417 vpandq $mask44,$D0lo,$H0
3418 vpaddq $tmp,$D0hi,$D0hi
3420 vpaddq $D0hi,$D1lo,$D1lo
3422 vpsrlq \$44,$D1lo,$tmp
3423 vpsllq \$8,$D1hi,$D1hi
3424 vpandq $mask44,$D1lo,$H1
3425 vpaddq $tmp,$D1hi,$D1hi
3427 vpaddq $D1hi,$D2lo,$D2lo
3429 vpsrlq \$42,$D2lo,$tmp
3430 vpsllq \$10,$D2hi,$D2hi
3431 vpandq $mask42,$D2lo,$H2
3432 vpaddq $tmp,$D2hi,$D2hi
3434 vpaddq $D2hi,$H0,$H0
3435 vpsllq \$2,$D2hi,$D2hi
3437 vpaddq $D2hi,$H0,$H0
3439 vpsrlq \$44,$H0,$tmp # additional step
3440 vpandq $mask44,$H0,$H0
3443 # at this point $len is
3444 # either 4*n+2 or 0...
3445 sub \$2,$len # len-=32
3446 ja .Lblocks_vpmadd52_4x_do
3448 vmovq %x#$H0,0($ctx)
3449 vmovq %x#$H1,8($ctx)
3450 vmovq %x#$H2,16($ctx)
3453 .Lno_data_vpmadd52_4x:
3455 .size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
3459 ########################################################################
# As implied by its name, the 8x subroutine processes 8 blocks in
# parallel... This is an intermediate version, as it's used only in
# cases where the input length is either 8*n, 8*n+1 or 8*n+2...
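#
# The folding identity is the same as in the 4x case, now with powers
# up to r^8 (a sketch, mod 2^130-5):
#
#	h = (h + m0)*r^8 + m1*r^7 + ... + m7*r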
3464 my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
3465 my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
3466 my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
3467 my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
3470 .type poly1305_blocks_vpmadd52_8x,\@function,4
3472 poly1305_blocks_vpmadd52_8x:
3474 jz .Lno_data_vpmadd52_8x # too short
	mov	64($ctx),%r8			# peek at the power of the key
3479 vmovdqa64 .Lx_mask44(%rip),$mask44
3480 vmovdqa64 .Lx_mask42(%rip),$mask42
3482 test %r8,%r8 # is power value impossible?
3483 js .Linit_vpmadd52 # if it is, then init R[4]
3485 vmovq 0($ctx),%x#$H0 # load current hash value
3486 vmovq 8($ctx),%x#$H1
3487 vmovq 16($ctx),%x#$H2
3489 .Lblocks_vpmadd52_8x:
3490 ################################################################
	# first we calculate more key powers
3493 vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
3494 vmovdqu64 160($ctx),$S1
3495 vmovdqu64 64($ctx),$R0
3496 vmovdqu64 96($ctx),$R1
3498 vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3502 vpbroadcastq %x#$R2,$RR2 # broadcast 4th power
3503 vpbroadcastq %x#$R0,$RR0
3504 vpbroadcastq %x#$R1,$RR1
3506 vpxorq $D0lo,$D0lo,$D0lo
3507 vpmadd52luq $RR2,$S1,$D0lo
3508 vpxorq $D0hi,$D0hi,$D0hi
3509 vpmadd52huq $RR2,$S1,$D0hi
3510 vpxorq $D1lo,$D1lo,$D1lo
3511 vpmadd52luq $RR2,$S2,$D1lo
3512 vpxorq $D1hi,$D1hi,$D1hi
3513 vpmadd52huq $RR2,$S2,$D1hi
3514 vpxorq $D2lo,$D2lo,$D2lo
3515 vpmadd52luq $RR2,$R0,$D2lo
3516 vpxorq $D2hi,$D2hi,$D2hi
3517 vpmadd52huq $RR2,$R0,$D2hi
3519 vpmadd52luq $RR0,$R0,$D0lo
3520 vpmadd52huq $RR0,$R0,$D0hi
3521 vpmadd52luq $RR0,$R1,$D1lo
3522 vpmadd52huq $RR0,$R1,$D1hi
3523 vpmadd52luq $RR0,$R2,$D2lo
3524 vpmadd52huq $RR0,$R2,$D2hi
3526 vpmadd52luq $RR1,$S2,$D0lo
3527 vpmadd52huq $RR1,$S2,$D0hi
3528 vpmadd52luq $RR1,$R0,$D1lo
3529 vpmadd52huq $RR1,$R0,$D1hi
3530 vpmadd52luq $RR1,$R1,$D2lo
3531 vpmadd52huq $RR1,$R1,$D2hi
	################################################################
	# partial reduction
3535 vpsrlq \$44,$D0lo,$tmp
3536 vpsllq \$8,$D0hi,$D0hi
3537 vpandq $mask44,$D0lo,$RR0
3538 vpaddq $tmp,$D0hi,$D0hi
3540 vpaddq $D0hi,$D1lo,$D1lo
3542 vpsrlq \$44,$D1lo,$tmp
3543 vpsllq \$8,$D1hi,$D1hi
3544 vpandq $mask44,$D1lo,$RR1
3545 vpaddq $tmp,$D1hi,$D1hi
3547 vpaddq $D1hi,$D2lo,$D2lo
3549 vpsrlq \$42,$D2lo,$tmp
3550 vpsllq \$10,$D2hi,$D2hi
3551 vpandq $mask42,$D2lo,$RR2
3552 vpaddq $tmp,$D2hi,$D2hi
3554 vpaddq $D2hi,$RR0,$RR0
3555 vpsllq \$2,$D2hi,$D2hi
3557 vpaddq $D2hi,$RR0,$RR0
3559 vpsrlq \$44,$RR0,$tmp # additional step
3560 vpandq $mask44,$RR0,$RR0
3562 vpaddq $tmp,$RR1,$RR1
3564 ################################################################
	# At this point Rx holds the 1-3-2-4 powers, RRx the 5-7-6-8 ones,
	# and the goal is the 1-5-2-6-3-7-4-8 order, which reflects how
	# the data is loaded...
3568 vpunpcklqdq $R2,$RR2,$T2 # 3748
3569 vpunpckhqdq $R2,$RR2,$R2 # 1526
3570 vpunpcklqdq $R0,$RR0,$T0
3571 vpunpckhqdq $R0,$RR0,$R0
3572 vpunpcklqdq $R1,$RR1,$T1
3573 vpunpckhqdq $R1,$RR1,$R1
3575 ######## switch to %zmm
3576 map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3577 map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3578 map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3579 map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
3582 vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748
3583 vshufi64x2 \$0x44,$R0,$T0,$RR0
3584 vshufi64x2 \$0x44,$R1,$T1,$RR1
3586 vmovdqu64 16*0($inp),$T2 # load data
3587 vmovdqu64 16*4($inp),$T3
3590 vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4
3591 vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4
3592 vpaddq $RR2,$SS2,$SS2
3593 vpaddq $RR1,$SS1,$SS1
3594 vpsllq \$2,$SS2,$SS2
3595 vpsllq \$2,$SS1,$SS1
3597 vpbroadcastq $padbit,$PAD
3598 vpbroadcastq %x#$mask44,$mask44
3599 vpbroadcastq %x#$mask42,$mask42
3601 vpbroadcastq %x#$SS1,$S1 # broadcast 8th power
3602 vpbroadcastq %x#$SS2,$S2
3603 vpbroadcastq %x#$RR0,$R0
3604 vpbroadcastq %x#$RR1,$R1
3605 vpbroadcastq %x#$RR2,$R2
3607 vpunpcklqdq $T3,$T2,$T1 # transpose data
3608 vpunpckhqdq $T3,$T2,$T3
3610 # at this point 64-bit lanes are ordered as 73625140
3612 vpsrlq \$24,$T3,$T2 # splat the data
3614 vpaddq $T2,$H2,$H2 # accumulate input
3615 vpandq $mask44,$T1,$T0
3619 vpandq $mask44,$T1,$T1
3622 jz .Ltail_vpmadd52_8x
3623 jmp .Loop_vpmadd52_8x
3627 #vpaddq $T2,$H2,$H2 # accumulate input
3631 vpxorq $D0lo,$D0lo,$D0lo
3632 vpmadd52luq $H2,$S1,$D0lo
3633 vpxorq $D0hi,$D0hi,$D0hi
3634 vpmadd52huq $H2,$S1,$D0hi
3635 vpxorq $D1lo,$D1lo,$D1lo
3636 vpmadd52luq $H2,$S2,$D1lo
3637 vpxorq $D1hi,$D1hi,$D1hi
3638 vpmadd52huq $H2,$S2,$D1hi
3639 vpxorq $D2lo,$D2lo,$D2lo
3640 vpmadd52luq $H2,$R0,$D2lo
3641 vpxorq $D2hi,$D2hi,$D2hi
3642 vpmadd52huq $H2,$R0,$D2hi
3644 vmovdqu64 16*0($inp),$T2 # load data
3645 vmovdqu64 16*4($inp),$T3
3647 vpmadd52luq $H0,$R0,$D0lo
3648 vpmadd52huq $H0,$R0,$D0hi
3649 vpmadd52luq $H0,$R1,$D1lo
3650 vpmadd52huq $H0,$R1,$D1hi
3651 vpmadd52luq $H0,$R2,$D2lo
3652 vpmadd52huq $H0,$R2,$D2hi
3654 vpunpcklqdq $T3,$T2,$T1 # transpose data
3655 vpunpckhqdq $T3,$T2,$T3
3656 vpmadd52luq $H1,$S2,$D0lo
3657 vpmadd52huq $H1,$S2,$D0hi
3658 vpmadd52luq $H1,$R0,$D1lo
3659 vpmadd52huq $H1,$R0,$D1hi
3660 vpmadd52luq $H1,$R1,$D2lo
3661 vpmadd52huq $H1,$R1,$D2hi
3663 ################################################################
3664 # partial reduction (interleaved with data splat)
3665 vpsrlq \$44,$D0lo,$tmp
3666 vpsllq \$8,$D0hi,$D0hi
3667 vpandq $mask44,$D0lo,$H0
3668 vpaddq $tmp,$D0hi,$D0hi
3672 vpaddq $D0hi,$D1lo,$D1lo
3674 vpsrlq \$44,$D1lo,$tmp
3675 vpsllq \$8,$D1hi,$D1hi
3676 vpandq $mask44,$D1lo,$H1
3677 vpaddq $tmp,$D1hi,$D1hi
3679 vpandq $mask44,$T1,$T0
3682 vpaddq $D1hi,$D2lo,$D2lo
3684 vpsrlq \$42,$D2lo,$tmp
3685 vpsllq \$10,$D2hi,$D2hi
3686 vpandq $mask42,$D2lo,$H2
3687 vpaddq $tmp,$D2hi,$D2hi
3689 vpaddq $T2,$H2,$H2 # accumulate input
3690 vpaddq $D2hi,$H0,$H0
3691 vpsllq \$2,$D2hi,$D2hi
3693 vpaddq $D2hi,$H0,$H0
3695 vpandq $mask44,$T1,$T1
3697 vpsrlq \$44,$H0,$tmp # additional step
3698 vpandq $mask44,$H0,$H0
3702 sub \$8,$len # len-=128
3703 jnz .Loop_vpmadd52_8x
3706 #vpaddq $T2,$H2,$H2 # accumulate input
3710 vpxorq $D0lo,$D0lo,$D0lo
3711 vpmadd52luq $H2,$SS1,$D0lo
3712 vpxorq $D0hi,$D0hi,$D0hi
3713 vpmadd52huq $H2,$SS1,$D0hi
3714 vpxorq $D1lo,$D1lo,$D1lo
3715 vpmadd52luq $H2,$SS2,$D1lo
3716 vpxorq $D1hi,$D1hi,$D1hi
3717 vpmadd52huq $H2,$SS2,$D1hi
3718 vpxorq $D2lo,$D2lo,$D2lo
3719 vpmadd52luq $H2,$RR0,$D2lo
3720 vpxorq $D2hi,$D2hi,$D2hi
3721 vpmadd52huq $H2,$RR0,$D2hi
3723 vpmadd52luq $H0,$RR0,$D0lo
3724 vpmadd52huq $H0,$RR0,$D0hi
3725 vpmadd52luq $H0,$RR1,$D1lo
3726 vpmadd52huq $H0,$RR1,$D1hi
3727 vpmadd52luq $H0,$RR2,$D2lo
3728 vpmadd52huq $H0,$RR2,$D2hi
3730 vpmadd52luq $H1,$SS2,$D0lo
3731 vpmadd52huq $H1,$SS2,$D0hi
3732 vpmadd52luq $H1,$RR0,$D1lo
3733 vpmadd52huq $H1,$RR0,$D1hi
3734 vpmadd52luq $H1,$RR1,$D2lo
3735 vpmadd52huq $H1,$RR1,$D2hi
3737 ################################################################
3738 # horizontal addition
3742 vpsrldq \$8,$D0lo,$T0
3743 vpsrldq \$8,$D0hi,$H0
3744 vpsrldq \$8,$D1lo,$T1
3745 vpsrldq \$8,$D1hi,$H1
3746 vpaddq $T0,$D0lo,$D0lo
3747 vpaddq $H0,$D0hi,$D0hi
3748 vpsrldq \$8,$D2lo,$T2
3749 vpsrldq \$8,$D2hi,$H2
3750 vpaddq $T1,$D1lo,$D1lo
3751 vpaddq $H1,$D1hi,$D1hi
3752 vpermq \$0x2,$D0lo,$T0
3753 vpermq \$0x2,$D0hi,$H0
3754 vpaddq $T2,$D2lo,$D2lo
3755 vpaddq $H2,$D2hi,$D2hi
3757 vpermq \$0x2,$D1lo,$T1
3758 vpermq \$0x2,$D1hi,$H1
3759 vpaddq $T0,$D0lo,$D0lo
3760 vpaddq $H0,$D0hi,$D0hi
3761 vpermq \$0x2,$D2lo,$T2
3762 vpermq \$0x2,$D2hi,$H2
3763 vpaddq $T1,$D1lo,$D1lo
3764 vpaddq $H1,$D1hi,$D1hi
3765 vextracti64x4 \$1,$D0lo,%y#$T0
3766 vextracti64x4 \$1,$D0hi,%y#$H0
3767 vpaddq $T2,$D2lo,$D2lo
3768 vpaddq $H2,$D2hi,$D2hi
3770 vextracti64x4 \$1,$D1lo,%y#$T1
3771 vextracti64x4 \$1,$D1hi,%y#$H1
3772 vextracti64x4 \$1,$D2lo,%y#$T2
3773 vextracti64x4 \$1,$D2hi,%y#$H2
3775 ######## switch back to %ymm
3776 map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3777 map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3778 map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3781 vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
3782 vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
3783 vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
3784 vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
3785 vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
3786 vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
	################################################################
	# partial reduction
3790 vpsrlq \$44,$D0lo,$tmp
3791 vpsllq \$8,$D0hi,$D0hi
3792 vpandq $mask44,$D0lo,$H0
3793 vpaddq $tmp,$D0hi,$D0hi
3795 vpaddq $D0hi,$D1lo,$D1lo
3797 vpsrlq \$44,$D1lo,$tmp
3798 vpsllq \$8,$D1hi,$D1hi
3799 vpandq $mask44,$D1lo,$H1
3800 vpaddq $tmp,$D1hi,$D1hi
3802 vpaddq $D1hi,$D2lo,$D2lo
3804 vpsrlq \$42,$D2lo,$tmp
3805 vpsllq \$10,$D2hi,$D2hi
3806 vpandq $mask42,$D2lo,$H2
3807 vpaddq $tmp,$D2hi,$D2hi
3809 vpaddq $D2hi,$H0,$H0
3810 vpsllq \$2,$D2hi,$D2hi
3812 vpaddq $D2hi,$H0,$H0
3814 vpsrlq \$44,$H0,$tmp # additional step
3815 vpandq $mask44,$H0,$H0
	################################################################
	# store hash value
3821 vmovq %x#$H0,0($ctx)
3822 vmovq %x#$H1,8($ctx)
3823 vmovq %x#$H2,16($ctx)
3826 .Lno_data_vpmadd52_8x:
3828 .size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
3832 .type poly1305_emit_base2_44,\@function,3
3834 poly1305_emit_base2_44:
3835 mov 0($ctx),%r8 # load hash value
3851 add \$5,%r8 # compare to modulus
3855 shr \$2,%r10 # did 130-bit value overflow?
3859 add 0($nonce),%rax # accumulate nonce
3861 mov %rax,0($mac) # write result
3865 .size poly1305_emit_base2_44,.-poly1305_emit_base2_44
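#
# The finalization above follows the usual Poly1305 pattern; as a rough
# C model (h is the 130-bit accumulator, nonce the final 128-bit key
# block; illustrative only):
#
#	t = h + 5;			/* compare to p = 2^130-5 */
#	if (t >> 130) h = t;		/* h >= p, so fold: h-p == t mod 2^130 */
#	tag = (h + nonce) mod 2^128;	/* written little-endian */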
3871 { # chacha20-poly1305 helpers
3872 my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
3873 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
3875 .globl xor128_encrypt_n_pad
3876 .type xor128_encrypt_n_pad,\@abi-omnipotent
3878 xor128_encrypt_n_pad:
3881 mov $len,%r10 # put len aside
3882 shr \$4,$len # len / 16
3886 movdqu ($inp,$otp),%xmm0
3888 movdqu %xmm0,($out,$otp)
3894 and \$15,%r10 # len % 16
3920 .size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
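#
# A rough C model of the helper above, assuming the usual
# ChaCha20-Poly1305 meld semantics (xor the input with the one-time
# pad, keep the ciphertext for subsequent MACing, zero-pad to a 16-byte
# boundary; return value and exact buffer handling are simplified):
#
#	size_t i;
#	for (i = 0; i < len; i++)
#		otp[i] = out[i] = inp[i] ^ otp[i];
#	while (i & 15) otp[i++] = 0;
#
# The decrypt variant below is symmetric, except that it is the input
# ciphertext that is preserved for authentication.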
3922 .globl xor128_decrypt_n_pad
3923 .type xor128_decrypt_n_pad,\@abi-omnipotent
3925 xor128_decrypt_n_pad:
3928 mov $len,%r10 # put len aside
3929 shr \$4,$len # len / 16
3933 movdqu ($inp,$otp),%xmm0
3936 movdqu %xmm1,($out,$otp)
3943 and \$15,%r10 # len % 16
3952 mov ($inp,$otp),%r11b
3971 .size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
3975 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3976 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
3984 .extern __imp_RtlVirtualUnwind
3985 .type se_handler,\@abi-omnipotent
3999 mov 120($context),%rax # pull context->Rax
4000 mov 248($context),%rbx # pull context->Rip
4002 mov 8($disp),%rsi # disp->ImageBase
4003 mov 56($disp),%r11 # disp->HandlerData
4005 mov 0(%r11),%r10d # HandlerData[0]
4006 lea (%rsi,%r10),%r10 # prologue label
4007 cmp %r10,%rbx # context->Rip<.Lprologue
4008 jb .Lcommon_seh_tail
4010 mov 152($context),%rax # pull context->Rsp
4012 mov 4(%r11),%r10d # HandlerData[1]
4013 lea (%rsi,%r10),%r10 # epilogue label
4014 cmp %r10,%rbx # context->Rip>=.Lepilogue
4015 jae .Lcommon_seh_tail
4025 mov %rbx,144($context) # restore context->Rbx
4026 mov %rbp,160($context) # restore context->Rbp
4027 mov %r12,216($context) # restore context->R12
4028 mov %r13,224($context) # restore context->R13
4029 mov %r14,232($context) # restore context->R14
	mov	%r15,240($context)	# restore context->R15
4032 jmp .Lcommon_seh_tail
4033 .size se_handler,.-se_handler
4035 .type avx_handler,\@abi-omnipotent
4049 mov 120($context),%rax # pull context->Rax
4050 mov 248($context),%rbx # pull context->Rip
4052 mov 8($disp),%rsi # disp->ImageBase
4053 mov 56($disp),%r11 # disp->HandlerData
4055 mov 0(%r11),%r10d # HandlerData[0]
4056 lea (%rsi,%r10),%r10 # prologue label
4057 cmp %r10,%rbx # context->Rip<prologue label
4058 jb .Lcommon_seh_tail
4060 mov 152($context),%rax # pull context->Rsp
4062 mov 4(%r11),%r10d # HandlerData[1]
4063 lea (%rsi,%r10),%r10 # epilogue label
4064 cmp %r10,%rbx # context->Rip>=epilogue label
4065 jae .Lcommon_seh_tail
4067 mov 208($context),%rax # pull context->R11
4071 lea 512($context),%rdi # &context.Xmm6
4073 .long 0xa548f3fc # cld; rep movsq
4078 mov %rax,152($context) # restore context->Rsp
4079 mov %rsi,168($context) # restore context->Rsi
4080 mov %rdi,176($context) # restore context->Rdi
4082 mov 40($disp),%rdi # disp->ContextRecord
4083 mov $context,%rsi # context
	mov	\$154,%ecx		# sizeof(CONTEXT) in qwords
4085 .long 0xa548f3fc # cld; rep movsq
4088 xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER
4089 mov 8(%rsi),%rdx # arg2, disp->ImageBase
4090 mov 0(%rsi),%r8 # arg3, disp->ControlPc
4091 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
4092 mov 40(%rsi),%r10 # disp->ContextRecord
4093 lea 56(%rsi),%r11 # &disp->HandlerData
4094 lea 24(%rsi),%r12 # &disp->EstablisherFrame
4095 mov %r10,32(%rsp) # arg5
4096 mov %r11,40(%rsp) # arg6
4097 mov %r12,48(%rsp) # arg7
4098 mov %rcx,56(%rsp) # arg8, (NULL)
4099 call *__imp_RtlVirtualUnwind(%rip)
4101 mov \$1,%eax # ExceptionContinueSearch
4113 .size avx_handler,.-avx_handler
4117 .rva .LSEH_begin_poly1305_init_x86_64
4118 .rva .LSEH_end_poly1305_init_x86_64
4119 .rva .LSEH_info_poly1305_init_x86_64
4121 .rva .LSEH_begin_poly1305_blocks_x86_64
4122 .rva .LSEH_end_poly1305_blocks_x86_64
4123 .rva .LSEH_info_poly1305_blocks_x86_64
4125 .rva .LSEH_begin_poly1305_emit_x86_64
4126 .rva .LSEH_end_poly1305_emit_x86_64
4127 .rva .LSEH_info_poly1305_emit_x86_64
4129 $code.=<<___ if ($avx);
4130 .rva .LSEH_begin_poly1305_blocks_avx
4132 .rva .LSEH_info_poly1305_blocks_avx_1
4136 .rva .LSEH_info_poly1305_blocks_avx_2
4139 .rva .LSEH_end_poly1305_blocks_avx
4140 .rva .LSEH_info_poly1305_blocks_avx_3
4142 .rva .LSEH_begin_poly1305_emit_avx
4143 .rva .LSEH_end_poly1305_emit_avx
4144 .rva .LSEH_info_poly1305_emit_avx
4146 $code.=<<___ if ($avx>1);
4147 .rva .LSEH_begin_poly1305_blocks_avx2
4148 .rva .Lbase2_64_avx2
4149 .rva .LSEH_info_poly1305_blocks_avx2_1
4151 .rva .Lbase2_64_avx2
4153 .rva .LSEH_info_poly1305_blocks_avx2_2
4156 .rva .LSEH_end_poly1305_blocks_avx2
4157 .rva .LSEH_info_poly1305_blocks_avx2_3
4159 $code.=<<___ if ($avx>2);
4160 .rva .LSEH_begin_poly1305_blocks_avx512
4161 .rva .LSEH_end_poly1305_blocks_avx512
4162 .rva .LSEH_info_poly1305_blocks_avx512
4167 .LSEH_info_poly1305_init_x86_64:
4170 .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
4172 .LSEH_info_poly1305_blocks_x86_64:
4175 .rva .Lblocks_body,.Lblocks_epilogue
4177 .LSEH_info_poly1305_emit_x86_64:
4180 .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
4182 $code.=<<___ if ($avx);
4183 .LSEH_info_poly1305_blocks_avx_1:
4186 .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
4188 .LSEH_info_poly1305_blocks_avx_2:
4191 .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
4193 .LSEH_info_poly1305_blocks_avx_3:
4196 .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
4198 .LSEH_info_poly1305_emit_avx:
4201 .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
4203 $code.=<<___ if ($avx>1);
4204 .LSEH_info_poly1305_blocks_avx2_1:
4207 .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
4209 .LSEH_info_poly1305_blocks_avx2_2:
4212 .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
4214 .LSEH_info_poly1305_blocks_avx2_3:
4217 .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
4219 $code.=<<___ if ($avx>2);
4220 .LSEH_info_poly1305_blocks_avx512:
4223 .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[]
	last if (!s/^#/\/\// and !/^$/);	# convert leading '#' comments to '//', stop at first code line
foreach (split('\n',$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;	# expand backquoted Perl expressions
	s/%r([a-z]+)#d/%e$1/g;		# 32-bit alias, e.g. %rax#d -> %eax
	s/%r([0-9]+)#d/%r$1d/g;		# 32-bit alias, e.g. %r8#d -> %r8d
	s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;	# narrow SIMD registers, e.g. %x#%ymm0 -> %xmm0
	s/(^\.type.*),[0-9]+$/\1/;	# strip perlasm argument count from .type
	s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;	# map @abi-omnipotent to @function