#ifndef __KERNEL__
# include "arm_arch.h"
.extern OPENSSL_armcap_P
#endif

.text

// forward "declarations" are required for Apple
.globl  poly1305_blocks
.globl  poly1305_emit

.globl  poly1305_init
.type   poly1305_init,%function
.align  5
poly1305_init:
        cmp     x1,xzr
        stp     xzr,xzr,[x0]            // zero hash value
        stp     xzr,xzr,[x0,#16]        // [along with is_base2_26]

        csel    x0,xzr,x0,eq
        b.eq    .Lno_key

#ifndef __KERNEL__
        adrp    x17,OPENSSL_armcap_P
        ldr     w17,[x17,#:lo12:OPENSSL_armcap_P]
#endif

        ldp     x7,x8,[x1]              // load key
        mov     x9,#0xfffffffc0fffffff
        movk    x9,#0x0fff,lsl#48
#ifdef  __AARCH64EB__
        rev     x7,x7                   // flip bytes
        rev     x8,x8
#endif
        and     x7,x7,x9                // &=0ffffffc0fffffff
        and     x9,x9,#-4
        and     x8,x8,x9                // &=0ffffffc0ffffffc
        mov     w9,#-1
        stp     x7,x8,[x0,#32]  // save key value
        str     w9,[x0,#48]     // impossible key power value

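        // The two ANDs implement the usual Poly1305 clamping of r, i.e.
        // r &= 0x0ffffffc0ffffffc0ffffffc0fffffff: the top four bits of
        // every 32-bit word and the bottom two bits of the three upper
        // words are cleared.  The -1 written to [x0,#48] can never occur
        // as a (26-bit) limb of a clamped key power, so poly1305_blocks_neon
        // uses it to tell whether the r^1..r^4 table still has to be set up.
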
#ifndef __KERNEL__
        tst     w17,#ARMV7_NEON

        adr     x12,.Lpoly1305_blocks
        adr     x7,.Lpoly1305_blocks_neon
        adr     x13,.Lpoly1305_emit

        csel    x12,x12,x7,eq

# ifdef __ILP32__
        stp     w12,w13,[x2]
# else
        stp     x12,x13,[x2]
# endif
#endif
        mov     x0,#1
.Lno_key:
        ret
.size   poly1305_init,.-poly1305_init

.type   poly1305_blocks,%function
.align  5
poly1305_blocks:
.Lpoly1305_blocks:
        ands    x2,x2,#-16
        b.eq    .Lno_data

        ldp     x4,x5,[x0]              // load hash value
        ldp     x6,x17,[x0,#16] // [along with is_base2_26]
        ldp     x7,x8,[x0,#32]  // load key value

#ifdef  __AARCH64EB__
        lsr     x12,x4,#32
        mov     w13,w4
        lsr     x14,x5,#32
        mov     w15,w5
        lsr     x16,x6,#32
#else
        mov     w12,w4
        lsr     x13,x4,#32
        mov     w14,w5
        lsr     x15,x5,#32
        mov     w16,w6
#endif

        add     x12,x12,x13,lsl#26      // base 2^26 -> base 2^64
        lsr     x13,x14,#12
        adds    x12,x12,x14,lsl#52
        add     x13,x13,x15,lsl#14
        adc     x13,x13,xzr
        lsr     x14,x16,#24
        adds    x13,x13,x16,lsl#40
        adc     x14,x14,xzr

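        // Radix conversion spelled out: with h = h0 + h1*2^26 + h2*2^52 +
        // h3*2^78 + h4*2^104 (26-bit limbs h0..h4 in w12..w16), the 64-bit
        // limbs computed above are
        //      d0 = h0 + (h1<<26) + (h2<<52)                 (low 64 bits)
        //      d1 = (h2>>12) + (h3<<14) + (h4<<40) + carry(d0)
        //      d2 = (h4>>24) + carry(d1)
        // so that h = d0 + d1*2^64 + d2*2^128.
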
        cmp     x17,#0                  // is_base2_26?
        add     x9,x8,x8,lsr#2  // s1 = r1 + (r1 >> 2)
        csel    x4,x4,x12,eq            // choose between radixes
        csel    x5,x5,x13,eq
        csel    x6,x6,x14,eq

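        // Why s1 = r1 + (r1>>2) works: modulo p = 2^130-5 we have
        // 2^130 = 5, hence 2^128 = 5/4.  Clamping forces r1 = 0 (mod 4),
        // so the term h1*r1*2^128 reduces to h1*(5*r1/4) = h1*s1, an
        // ordinary 128-bit product added at bit position 0 (and likewise
        // h2*r1*2^192 becomes h2*s1 at position 64).
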
.Loop:
        ldp     x10,x11,[x1],#16        // load input
        sub     x2,x2,#16
#ifdef  __AARCH64EB__
        rev     x10,x10
        rev     x11,x11
#endif
        adds    x4,x4,x10               // accumulate input
        adcs    x5,x5,x11

        mul     x12,x4,x7               // h0*r0
        adc     x6,x6,x3
        umulh   x13,x4,x7

        mul     x10,x5,x9               // h1*5*r1
        umulh   x11,x5,x9

        adds    x12,x12,x10
        mul     x10,x4,x8               // h0*r1
        adc     x13,x13,x11
        umulh   x14,x4,x8

        adds    x13,x13,x10
        mul     x10,x5,x7               // h1*r0
        adc     x14,x14,xzr
        umulh   x11,x5,x7

        adds    x13,x13,x10
        mul     x10,x6,x9               // h2*5*r1
        adc     x14,x14,x11
        mul     x11,x6,x7               // h2*r0

        adds    x13,x13,x10
        adc     x14,x14,x11

        and     x10,x14,#-4             // final reduction
        and     x6,x14,#3
        add     x10,x10,x14,lsr#2
        adds    x4,x12,x10
        adcs    x5,x13,xzr
        adc     x6,x6,xzr

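        // Final reduction spelled out: x14 holds h's bits above 127.  The
        // part above bit 129, c = x14>>2, wraps around as 5*c because
        // 2^130 = 5 (mod p); 5*c = 4*c + c = (x14 & -4) + (x14>>2), which
        // is what x10 accumulates, while x14 & 3 stays behind as the new h2.
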
        cbnz    x2,.Loop

        stp     x4,x5,[x0]              // store hash value
        stp     x6,xzr,[x0,#16] // [and clear is_base2_26]

.Lno_data:
        ret
.size   poly1305_blocks,.-poly1305_blocks

.type   poly1305_emit,%function
.align  5
poly1305_emit:
.Lpoly1305_emit:
        ldp     x4,x5,[x0]              // load hash base 2^64
        ldp     x6,x7,[x0,#16]  // [along with is_base2_26]
        ldp     x10,x11,[x2]    // load nonce

#ifdef  __AARCH64EB__
        lsr     x12,x4,#32
        mov     w13,w4
        lsr     x14,x5,#32
        mov     w15,w5
        lsr     x16,x6,#32
#else
        mov     w12,w4
        lsr     x13,x4,#32
        mov     w14,w5
        lsr     x15,x5,#32
        mov     w16,w6
#endif

        add     x12,x12,x13,lsl#26      // base 2^26 -> base 2^64
        lsr     x13,x14,#12
        adds    x12,x12,x14,lsl#52
        add     x13,x13,x15,lsl#14
        adc     x13,x13,xzr
        lsr     x14,x16,#24
        adds    x13,x13,x16,lsl#40
        adc     x14,x14,xzr

        cmp     x7,#0                   // is_base2_26?
        csel    x4,x4,x12,eq            // choose between radixes
        csel    x5,x5,x13,eq
        csel    x6,x6,x14,eq

        adds    x12,x4,#5               // compare to modulus
        adcs    x13,x5,xzr
        adc     x14,x6,xzr

        tst     x14,#-4                 // see if it's carried/borrowed

        csel    x4,x4,x12,eq
        csel    x5,x5,x13,eq

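        // h is compared to p = 2^130-5 by computing h+5: h >= p exactly
        // when h+5 reaches 2^130, i.e. when any of bits 2 and up of x14
        // (bits 130 and up of h+5) are set, which is what tst x14,#-4
        // checks.  In that case h-p and h+5 agree in their low 128 bits,
        // so selecting x12:x13 already gives the value the nonce is added
        // to below.
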
#ifdef  __AARCH64EB__
        ror     x10,x10,#32             // flip nonce words
        ror     x11,x11,#32
#endif
        adds    x4,x4,x10               // accumulate nonce
        adc     x5,x5,x11
#ifdef  __AARCH64EB__
        rev     x4,x4                   // flip output bytes
        rev     x5,x5
#endif
        stp     x4,x5,[x1]              // write result

        ret
.size   poly1305_emit,.-poly1305_emit
.type   poly1305_mult,%function
.align  5
poly1305_mult:
        mul     x12,x4,x7               // h0*r0
        umulh   x13,x4,x7

        mul     x10,x5,x9               // h1*5*r1
        umulh   x11,x5,x9

        adds    x12,x12,x10
        mul     x10,x4,x8               // h0*r1
        adc     x13,x13,x11
        umulh   x14,x4,x8

        adds    x13,x13,x10
        mul     x10,x5,x7               // h1*r0
        adc     x14,x14,xzr
        umulh   x11,x5,x7

        adds    x13,x13,x10
        mul     x10,x6,x9               // h2*5*r1
        adc     x14,x14,x11
        mul     x11,x6,x7               // h2*r0

        adds    x13,x13,x10
        adc     x14,x14,x11

        and     x10,x14,#-4             // final reduction
        and     x6,x14,#3
        add     x10,x10,x14,lsr#2
        adds    x4,x12,x10
        adcs    x5,x13,xzr
        adc     x6,x6,xzr

        ret
.size   poly1305_mult,.-poly1305_mult

.type   poly1305_splat,%function
.align  4
poly1305_splat:
        and     x12,x4,#0x03ffffff      // base 2^64 -> base 2^26
        ubfx    x13,x4,#26,#26
        extr    x14,x5,x4,#52
        and     x14,x14,#0x03ffffff
        ubfx    x15,x5,#14,#26
        extr    x16,x6,x5,#40

        str     w12,[x0,#16*0]  // r0
        add     w12,w13,w13,lsl#2       // r1*5
        str     w13,[x0,#16*1]  // r1
        add     w13,w14,w14,lsl#2       // r2*5
        str     w12,[x0,#16*2]  // s1
        str     w14,[x0,#16*3]  // r2
        add     w14,w15,w15,lsl#2       // r3*5
        str     w13,[x0,#16*4]  // s2
        str     w15,[x0,#16*5]  // r3
        add     w15,w16,w16,lsl#2       // r4*5
        str     w14,[x0,#16*6]  // s3
        str     w16,[x0,#16*7]  // r4
        str     w15,[x0,#16*8]  // s4

        ret
.size   poly1305_splat,.-poly1305_splat

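// Table layout: each call to poly1305_splat writes the nine base-2^26
// values {r0,r1,s1,r2,s2,r3,s3,r4,s4} of one key power (si = 5*ri) into
// a single 32-bit lane of nine consecutive 16-byte slots.  The caller
// decrements x0 by 4 between calls, so once r^1..r^4 have been stored,
// lanes 0..3 hold r^4,r^3,r^2,r^1 and the slots load straight into
// v0..v8 for the NEON loop.
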
#ifdef  __KERNEL__
.globl  poly1305_blocks_neon
#endif
.type   poly1305_blocks_neon,%function
.align  5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
        ldr     x17,[x0,#24]
        cmp     x2,#128
        b.lo    .Lpoly1305_blocks

        .inst   0xd503233f              // paciasp
        stp     x29,x30,[sp,#-80]!
        add     x29,sp,#0

        stp     d8,d9,[sp,#16]          // meet ABI requirements
        stp     d10,d11,[sp,#32]
        stp     d12,d13,[sp,#48]
        stp     d14,d15,[sp,#64]

        cbz     x17,.Lbase2_64_neon

        ldp     w10,w11,[x0]            // load hash value base 2^26
        ldp     w12,w13,[x0,#8]
        ldr     w14,[x0,#16]

        tst     x2,#31
        b.eq    .Leven_neon

        ldp     x7,x8,[x0,#32]  // load key value

        add     x4,x10,x11,lsl#26       // base 2^26 -> base 2^64
        lsr     x5,x12,#12
        adds    x4,x4,x12,lsl#52
        add     x5,x5,x13,lsl#14
        adc     x5,x5,xzr
        lsr     x6,x14,#24
        adds    x5,x5,x14,lsl#40
        adc     x14,x6,xzr              // can be partially reduced...

        ldp     x12,x13,[x1],#16        // load input
        sub     x2,x2,#16
        add     x9,x8,x8,lsr#2  // s1 = r1 + (r1 >> 2)

        and     x10,x14,#-4             // ... so reduce
        and     x6,x14,#3
        add     x10,x10,x14,lsr#2
        adds    x4,x4,x10
        adcs    x5,x5,xzr
        adc     x6,x6,xzr

#ifdef  __AARCH64EB__
        rev     x12,x12
        rev     x13,x13
#endif
        adds    x4,x4,x12               // accumulate input
        adcs    x5,x5,x13
        adc     x6,x6,x3

        bl      poly1305_mult

        and     x10,x4,#0x03ffffff      // base 2^64 -> base 2^26
        ubfx    x11,x4,#26,#26
        extr    x12,x5,x4,#52
        and     x12,x12,#0x03ffffff
        ubfx    x13,x5,#14,#26
        extr    x14,x6,x5,#40

        b       .Leven_neon

.align  4
.Lbase2_64_neon:
        ldp     x7,x8,[x0,#32]  // load key value

        ldp     x4,x5,[x0]              // load hash value base 2^64
        ldr     x6,[x0,#16]

        tst     x2,#31
        b.eq    .Linit_neon

        ldp     x12,x13,[x1],#16        // load input
        sub     x2,x2,#16
        add     x9,x8,x8,lsr#2  // s1 = r1 + (r1 >> 2)
#ifdef  __AARCH64EB__
        rev     x12,x12
        rev     x13,x13
#endif
        adds    x4,x4,x12               // accumulate input
        adcs    x5,x5,x13
        adc     x6,x6,x3

        bl      poly1305_mult

.Linit_neon:
        ldr     w17,[x0,#48]            // first table element
        and     x10,x4,#0x03ffffff      // base 2^64 -> base 2^26
        ubfx    x11,x4,#26,#26
        extr    x12,x5,x4,#52
        and     x12,x12,#0x03ffffff
        ubfx    x13,x5,#14,#26
        extr    x14,x6,x5,#40

        cmp     w17,#-1                 // is value impossible?
        b.ne    .Leven_neon

        fmov    d24,x10
        fmov    d25,x11
        fmov    d26,x12
        fmov    d27,x13
        fmov    d28,x14

        ////////////////////////////////// initialize r^n table
        mov     x4,x7                   // r^1
        add     x9,x8,x8,lsr#2  // s1 = r1 + (r1 >> 2)
        mov     x5,x8
        mov     x6,xzr
        add     x0,x0,#48+12
        bl      poly1305_splat

        bl      poly1305_mult           // r^2
        sub     x0,x0,#4
        bl      poly1305_splat

        bl      poly1305_mult           // r^3
        sub     x0,x0,#4
        bl      poly1305_splat

        bl      poly1305_mult           // r^4
        sub     x0,x0,#4
        bl      poly1305_splat
        sub     x0,x0,#48               // restore original x0
        b       .Ldo_neon

.align  4
.Leven_neon:
        fmov    d24,x10
        fmov    d25,x11
        fmov    d26,x12
        fmov    d27,x13
        fmov    d28,x14

.Ldo_neon:
        ldp     x8,x12,[x1,#32] // inp[2:3]
        subs    x2,x2,#64
        ldp     x9,x13,[x1,#48]
        add     x16,x1,#96
        adr     x17,.Lzeros

        lsl     x3,x3,#24
        add     x15,x0,#48

#ifdef  __AARCH64EB__
        rev     x8,x8
        rev     x12,x12
        rev     x9,x9
        rev     x13,x13
#endif
        and     x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
        and     x5,x9,#0x03ffffff
        ubfx    x6,x8,#26,#26
        ubfx    x7,x9,#26,#26
        add     x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
        extr    x8,x12,x8,#52
        extr    x9,x13,x9,#52
        add     x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
        fmov    d14,x4
        and     x8,x8,#0x03ffffff
        and     x9,x9,#0x03ffffff
        ubfx    x10,x12,#14,#26
        ubfx    x11,x13,#14,#26
        add     x12,x3,x12,lsr#40
        add     x13,x3,x13,lsr#40
        add     x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
        fmov    d15,x6
        add     x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
        add     x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
        fmov    d16,x8
        fmov    d17,x10
        fmov    d18,x12

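        // Each of d14..d18 now holds one 26-bit limb of inp[2] and inp[3],
        // one block per 32-bit lane (d9..d13 get inp[0] and inp[1] below).
        // The padbit, pre-shifted to bit 24, lands in the top limb because
        // 128 = 4*26 + 24.
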
        ldp     x8,x12,[x1],#16 // inp[0:1]
        ldp     x9,x13,[x1],#48

        ld1     {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
        ld1     {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
        ld1     {v8.4s},[x15]

#ifdef  __AARCH64EB__
        rev     x8,x8
        rev     x12,x12
        rev     x9,x9
        rev     x13,x13
#endif
        and     x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
        and     x5,x9,#0x03ffffff
        ubfx    x6,x8,#26,#26
        ubfx    x7,x9,#26,#26
        add     x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
        extr    x8,x12,x8,#52
        extr    x9,x13,x9,#52
        add     x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
        fmov    d9,x4
        and     x8,x8,#0x03ffffff
        and     x9,x9,#0x03ffffff
        ubfx    x10,x12,#14,#26
        ubfx    x11,x13,#14,#26
        add     x12,x3,x12,lsr#40
        add     x13,x3,x13,lsr#40
        add     x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
        fmov    d10,x6
        add     x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
        add     x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
        movi    v31.2d,#-1
        fmov    d11,x8
        fmov    d12,x10
        fmov    d13,x12
        ushr    v31.2d,v31.2d,#38

        b.ls    .Lskip_loop

.align  4
.Loop_neon:
        ////////////////////////////////////////////////////////////////
        // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
        // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
        //   ___________________/
        // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
        // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
        //   ___________________/ ____________________/
        //
        // Note that we start with inp[2:3]*r^2. This is because it
        // doesn't depend on reduction in previous iteration.
        ////////////////////////////////////////////////////////////////
        // d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
        // d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
        // d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
        // d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
        // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

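        // The scalar instructions interleaved below (the staggered
        // ldp/ubfx/extr/fmov chain on x8..x13) convert the next four
        // input blocks to base 2^26 while the vector multiply-accumulate
        // is in flight; x16 is redirected to .Lzeros once fewer than 64
        // bytes of input remain.
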
        subs    x2,x2,#64
        umull   v23.2d,v14.2s,v7.s[2]
        csel    x16,x17,x16,lo
        umull   v22.2d,v14.2s,v5.s[2]
        umull   v21.2d,v14.2s,v3.s[2]
         ldp    x8,x12,[x16],#16        // inp[2:3] (or zero)
        umull   v20.2d,v14.2s,v1.s[2]
         ldp    x9,x13,[x16],#48
        umull   v19.2d,v14.2s,v0.s[2]
#ifdef  __AARCH64EB__
         rev    x8,x8
         rev    x12,x12
         rev    x9,x9
         rev    x13,x13
#endif

        umlal   v23.2d,v15.2s,v5.s[2]
         and    x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
        umlal   v22.2d,v15.2s,v3.s[2]
         and    x5,x9,#0x03ffffff
        umlal   v21.2d,v15.2s,v1.s[2]
         ubfx   x6,x8,#26,#26
        umlal   v20.2d,v15.2s,v0.s[2]
         ubfx   x7,x9,#26,#26
        umlal   v19.2d,v15.2s,v8.s[2]
         add    x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32

        umlal   v23.2d,v16.2s,v3.s[2]
         extr   x8,x12,x8,#52
        umlal   v22.2d,v16.2s,v1.s[2]
         extr   x9,x13,x9,#52
        umlal   v21.2d,v16.2s,v0.s[2]
         add    x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
        umlal   v20.2d,v16.2s,v8.s[2]
         fmov   d14,x4
        umlal   v19.2d,v16.2s,v6.s[2]
         and    x8,x8,#0x03ffffff

        umlal   v23.2d,v17.2s,v1.s[2]
         and    x9,x9,#0x03ffffff
        umlal   v22.2d,v17.2s,v0.s[2]
         ubfx   x10,x12,#14,#26
        umlal   v21.2d,v17.2s,v8.s[2]
         ubfx   x11,x13,#14,#26
        umlal   v20.2d,v17.2s,v6.s[2]
         add    x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
        umlal   v19.2d,v17.2s,v4.s[2]
         fmov   d15,x6

        add     v11.2s,v11.2s,v26.2s
         add    x12,x3,x12,lsr#40
        umlal   v23.2d,v18.2s,v0.s[2]
         add    x13,x3,x13,lsr#40
        umlal   v22.2d,v18.2s,v8.s[2]
         add    x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
        umlal   v21.2d,v18.2s,v6.s[2]
         add    x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
        umlal   v20.2d,v18.2s,v4.s[2]
         fmov   d16,x8
        umlal   v19.2d,v18.2s,v2.s[2]
         fmov   d17,x10

        ////////////////////////////////////////////////////////////////
        // (hash+inp[0:1])*r^4 and accumulate

        add     v9.2s,v9.2s,v24.2s
         fmov   d18,x12
        umlal   v22.2d,v11.2s,v1.s[0]
         ldp    x8,x12,[x1],#16 // inp[0:1]
        umlal   v19.2d,v11.2s,v6.s[0]
         ldp    x9,x13,[x1],#48
        umlal   v23.2d,v11.2s,v3.s[0]
        umlal   v20.2d,v11.2s,v8.s[0]
        umlal   v21.2d,v11.2s,v0.s[0]
#ifdef  __AARCH64EB__
         rev    x8,x8
         rev    x12,x12
         rev    x9,x9
         rev    x13,x13
#endif

        add     v10.2s,v10.2s,v25.2s
        umlal   v22.2d,v9.2s,v5.s[0]
        umlal   v23.2d,v9.2s,v7.s[0]
         and    x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
        umlal   v21.2d,v9.2s,v3.s[0]
         and    x5,x9,#0x03ffffff
        umlal   v19.2d,v9.2s,v0.s[0]
         ubfx   x6,x8,#26,#26
        umlal   v20.2d,v9.2s,v1.s[0]
         ubfx   x7,x9,#26,#26

        add     v12.2s,v12.2s,v27.2s
         add    x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
        umlal   v22.2d,v10.2s,v3.s[0]
         extr   x8,x12,x8,#52
        umlal   v23.2d,v10.2s,v5.s[0]
         extr   x9,x13,x9,#52
        umlal   v19.2d,v10.2s,v8.s[0]
         add    x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
        umlal   v21.2d,v10.2s,v1.s[0]
         fmov   d9,x4
        umlal   v20.2d,v10.2s,v0.s[0]
         and    x8,x8,#0x03ffffff

        add     v13.2s,v13.2s,v28.2s
         and    x9,x9,#0x03ffffff
        umlal   v22.2d,v12.2s,v0.s[0]
         ubfx   x10,x12,#14,#26
        umlal   v19.2d,v12.2s,v4.s[0]
         ubfx   x11,x13,#14,#26
        umlal   v23.2d,v12.2s,v1.s[0]
         add    x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
        umlal   v20.2d,v12.2s,v6.s[0]
         fmov   d10,x6
        umlal   v21.2d,v12.2s,v8.s[0]
         add    x12,x3,x12,lsr#40

        umlal   v22.2d,v13.2s,v8.s[0]
         add    x13,x3,x13,lsr#40
        umlal   v19.2d,v13.2s,v2.s[0]
         add    x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
        umlal   v23.2d,v13.2s,v0.s[0]
         add    x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
        umlal   v20.2d,v13.2s,v4.s[0]
         fmov   d11,x8
        umlal   v21.2d,v13.2s,v6.s[0]
         fmov   d12,x10
         fmov   d13,x12

        /////////////////////////////////////////////////////////////////
        // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
        // and P. Schwabe
        //
        // [see discussion in poly1305-armv4 module]

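        // Wrap-around: a carry out of the top limb h4 re-enters at h0
        // multiplied by 5 (2^130 = 5 mod p), which is why v29 is added
        // once as-is and once shifted left by 2 (5*c = 4*c + c).  Limbs
        // may be left marginally above 26 bits; the 64-bit accumulators
        // of the next iteration have ample headroom for that.
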
        ushr    v29.2d,v22.2d,#26
        xtn     v27.2s,v22.2d
         ushr   v30.2d,v19.2d,#26
         and    v19.16b,v19.16b,v31.16b
        add     v23.2d,v23.2d,v29.2d    // h3 -> h4
        bic     v27.2s,#0xfc,lsl#24     // &=0x03ffffff
         add    v20.2d,v20.2d,v30.2d    // h0 -> h1

        ushr    v29.2d,v23.2d,#26
        xtn     v28.2s,v23.2d
         ushr   v30.2d,v20.2d,#26
         xtn    v25.2s,v20.2d
        bic     v28.2s,#0xfc,lsl#24
         add    v21.2d,v21.2d,v30.2d    // h1 -> h2

        add     v19.2d,v19.2d,v29.2d
        shl     v29.2d,v29.2d,#2
         shrn   v30.2s,v21.2d,#26
         xtn    v26.2s,v21.2d
        add     v19.2d,v19.2d,v29.2d    // h4 -> h0
         bic    v25.2s,#0xfc,lsl#24
         add    v27.2s,v27.2s,v30.2s            // h2 -> h3
         bic    v26.2s,#0xfc,lsl#24

        shrn    v29.2s,v19.2d,#26
        xtn     v24.2s,v19.2d
         ushr   v30.2s,v27.2s,#26
         bic    v27.2s,#0xfc,lsl#24
         bic    v24.2s,#0xfc,lsl#24
        add     v25.2s,v25.2s,v29.2s            // h0 -> h1
         add    v28.2s,v28.2s,v30.2s            // h3 -> h4

        b.hi    .Loop_neon

.Lskip_loop:
        dup     v16.2d,v16.d[0]
        add     v11.2s,v11.2s,v26.2s

        ////////////////////////////////////////////////////////////////
        // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

        adds    x2,x2,#32
        b.ne    .Long_tail

        dup     v16.2d,v11.d[0]
        add     v14.2s,v9.2s,v24.2s
        add     v17.2s,v12.2s,v27.2s
        add     v15.2s,v10.2s,v25.2s
        add     v18.2s,v13.2s,v28.2s

.Long_tail:
        dup     v14.2d,v14.d[0]
        umull2  v19.2d,v16.4s,v6.4s
        umull2  v22.2d,v16.4s,v1.4s
        umull2  v23.2d,v16.4s,v3.4s
        umull2  v21.2d,v16.4s,v0.4s
        umull2  v20.2d,v16.4s,v8.4s

        dup     v15.2d,v15.d[0]
        umlal2  v19.2d,v14.4s,v0.4s
        umlal2  v21.2d,v14.4s,v3.4s
        umlal2  v22.2d,v14.4s,v5.4s
        umlal2  v23.2d,v14.4s,v7.4s
        umlal2  v20.2d,v14.4s,v1.4s

        dup     v17.2d,v17.d[0]
        umlal2  v19.2d,v15.4s,v8.4s
        umlal2  v22.2d,v15.4s,v3.4s
        umlal2  v21.2d,v15.4s,v1.4s
        umlal2  v23.2d,v15.4s,v5.4s
        umlal2  v20.2d,v15.4s,v0.4s

        dup     v18.2d,v18.d[0]
        umlal2  v22.2d,v17.4s,v0.4s
        umlal2  v23.2d,v17.4s,v1.4s
        umlal2  v19.2d,v17.4s,v4.4s
        umlal2  v20.2d,v17.4s,v6.4s
        umlal2  v21.2d,v17.4s,v8.4s

        umlal2  v22.2d,v18.4s,v8.4s
        umlal2  v19.2d,v18.4s,v2.4s
        umlal2  v23.2d,v18.4s,v0.4s
        umlal2  v20.2d,v18.4s,v4.4s
        umlal2  v21.2d,v18.4s,v6.4s

        b.eq    .Lshort_tail

        ////////////////////////////////////////////////////////////////
        // (hash+inp[0:1])*r^4:r^3 and accumulate

        add     v9.2s,v9.2s,v24.2s
        umlal   v22.2d,v11.2s,v1.2s
        umlal   v19.2d,v11.2s,v6.2s
        umlal   v23.2d,v11.2s,v3.2s
        umlal   v20.2d,v11.2s,v8.2s
        umlal   v21.2d,v11.2s,v0.2s

        add     v10.2s,v10.2s,v25.2s
        umlal   v22.2d,v9.2s,v5.2s
        umlal   v19.2d,v9.2s,v0.2s
        umlal   v23.2d,v9.2s,v7.2s
        umlal   v20.2d,v9.2s,v1.2s
        umlal   v21.2d,v9.2s,v3.2s

        add     v12.2s,v12.2s,v27.2s
        umlal   v22.2d,v10.2s,v3.2s
        umlal   v19.2d,v10.2s,v8.2s
        umlal   v23.2d,v10.2s,v5.2s
        umlal   v20.2d,v10.2s,v0.2s
        umlal   v21.2d,v10.2s,v1.2s

        add     v13.2s,v13.2s,v28.2s
        umlal   v22.2d,v12.2s,v0.2s
        umlal   v19.2d,v12.2s,v4.2s
        umlal   v23.2d,v12.2s,v1.2s
        umlal   v20.2d,v12.2s,v6.2s
        umlal   v21.2d,v12.2s,v8.2s

        umlal   v22.2d,v13.2s,v8.2s
        umlal   v19.2d,v13.2s,v2.2s
        umlal   v23.2d,v13.2s,v0.2s
        umlal   v20.2d,v13.2s,v4.2s
        umlal   v21.2d,v13.2s,v6.2s

.Lshort_tail:
        ////////////////////////////////////////////////////////////////
        // horizontal add

        addp    v22.2d,v22.2d,v22.2d
         ldp    d8,d9,[sp,#16]          // meet ABI requirements
        addp    v19.2d,v19.2d,v19.2d
         ldp    d10,d11,[sp,#32]
        addp    v23.2d,v23.2d,v23.2d
         ldp    d12,d13,[sp,#48]
        addp    v20.2d,v20.2d,v20.2d
         ldp    d14,d15,[sp,#64]
        addp    v21.2d,v21.2d,v21.2d
         ldr    x30,[sp,#8]

        ////////////////////////////////////////////////////////////////
        // lazy reduction, but without narrowing

        ushr    v29.2d,v22.2d,#26
        and     v22.16b,v22.16b,v31.16b
         ushr   v30.2d,v19.2d,#26
         and    v19.16b,v19.16b,v31.16b

        add     v23.2d,v23.2d,v29.2d    // h3 -> h4
         add    v20.2d,v20.2d,v30.2d    // h0 -> h1

        ushr    v29.2d,v23.2d,#26
        and     v23.16b,v23.16b,v31.16b
         ushr   v30.2d,v20.2d,#26
         and    v20.16b,v20.16b,v31.16b
         add    v21.2d,v21.2d,v30.2d    // h1 -> h2

        add     v19.2d,v19.2d,v29.2d
        shl     v29.2d,v29.2d,#2
         ushr   v30.2d,v21.2d,#26
         and    v21.16b,v21.16b,v31.16b
        add     v19.2d,v19.2d,v29.2d    // h4 -> h0
         add    v22.2d,v22.2d,v30.2d    // h2 -> h3

        ushr    v29.2d,v19.2d,#26
        and     v19.16b,v19.16b,v31.16b
         ushr   v30.2d,v22.2d,#26
         and    v22.16b,v22.16b,v31.16b
        add     v20.2d,v20.2d,v29.2d    // h0 -> h1
         add    v23.2d,v23.2d,v30.2d    // h3 -> h4

        ////////////////////////////////////////////////////////////////
        // write the result, can be partially reduced

        st4     {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
        mov     x4,#1
        st1     {v23.s}[0],[x0]
        str     x4,[x0,#8]              // set is_base2_26

        ldr     x29,[sp],#80
         .inst  0xd50323bf              // autiasp
        ret
.size   poly1305_blocks_neon,.-poly1305_blocks_neon

.align  5
.Lzeros:
.long   0,0,0,0,0,0,0,0
.asciz  "Poly1305 for ARMv8, CRYPTOGAMS by @dot-asm"
.align  2
#if !defined(__KERNEL__) && !defined(_WIN64)
.comm   OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif