1 @ SPDX-License-Identifier: GPL-2.0
2
3 @ This code is taken from the OpenSSL project but the author (Andy Polyakov)
4 @ has relicensed it under the GPLv2. Therefore this program is free software;
5 @ you can redistribute it and/or modify it under the terms of the GNU General
6 @ Public License version 2 as published by the Free Software Foundation.
7 @
8 @ The original headers, including the original license headers, are
9 @ included below for completeness.
10
11 @ ====================================================================
12 @ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
13 @ project. The module is, however, dual licensed under OpenSSL and
14 @ CRYPTOGAMS licenses depending on where you obtain it. For further
15 @ details see https://www.openssl.org/~appro/cryptogams/.
16 @ ====================================================================
17
18 @ SHA512 block procedure for ARMv4. September 2007.
19
20 @ This code is ~4.5 (four and a half) times faster than code generated
21 @ by gcc 3.4 and it spends ~72 clock cycles per byte [on a single-issue
22 @ Xscale PXA250 core].
23 @
24 @ July 2010.
25 @
26 @ Rescheduling for the dual-issue pipeline resulted in a 6% improvement on
27 @ the Cortex-A8 core and ~40 cycles per processed byte.
28
29 @ February 2011.
30 @
31 @ Profiler-assisted and platform-specific optimization resulted in a 7%
32 @ improvement on the Cortex-A8 core and ~38 cycles per byte.
33
34 @ March 2011.
35 @
36 @ Add NEON implementation. On Cortex-A8 it was measured to process
37 @ one byte in 23.3 cycles, or ~60% faster than the integer-only code.
38
39 @ August 2012.
40 @
41 @ Improve NEON performance by 12% on Snapdragon S4. In absolute
42 @ terms it's 22.6 cycles per byte, which is a disappointing result.
43 @ Technical writers asserted that the 3-way S4 pipeline can sustain
44 @ multiple NEON instructions per cycle, but dual NEON issue could
45 @ not be observed; see https://www.openssl.org/~appro/Snapdragon-S4.html
46 @ for further details. On a side note, Cortex-A15 processes one byte in
47 @ 16 cycles.
48
49 @ Byte order [in]dependence. =========================================
50 @
51 @ Originally the caller was expected to maintain a specific *dword* order in
52 @ h[0-7], namely with the most significant dword at the *lower* address, which
53 @ was reflected in the two parameters below as 0 and 4. Now the caller is
54 @ expected to maintain native byte order for whole 64-bit values.
55 #ifndef __KERNEL__
56 # include "arm_arch.h"
57 # define VFP_ABI_PUSH   vstmdb  sp!,{d8-d15}
58 # define VFP_ABI_POP    vldmia  sp!,{d8-d15}
59 #else
60 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
61 # define __ARM_MAX_ARCH__ 7
62 # define VFP_ABI_PUSH
63 # define VFP_ABI_POP
64 #endif
65
66 #ifdef __ARMEL__
67 # define LO 0
68 # define HI 4
69 # define WORD64(hi0,lo0,hi1,lo1)        .word   lo0,hi0, lo1,hi1
70 #else
71 # define HI 0
72 # define LO 4
73 # define WORD64(hi0,lo0,hi1,lo1)        .word   hi0,lo0, hi1,lo1
74 #endif
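@
@ To illustrate the layout: with __ARMEL__ defined, a 64-bit state word
@ such as h[0]=0x0123456789abcdef keeps its low word 0x89abcdef at byte
@ offset 0 (LO) and its high word 0x01234567 at byte offset 4 (HI); on
@ big-endian the two offsets swap. WORD64 emits the table entries the
@ same way, e.g. WORD64(0x428a2f98,0xd728ae22,...) becomes
@ ".word 0xd728ae22,0x428a2f98,..." on little-endian and
@ ".word 0x428a2f98,0xd728ae22,..." otherwise, so a native 64-bit load
@ sees 0x428a2f98d728ae22 in either case.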
75
76 .text
77 #if __ARM_ARCH__<7
78 .code   32
79 #else
80 .syntax unified
81 # ifdef __thumb2__
82 .thumb
83 # else
84 .code   32
85 # endif
86 #endif
87
88 .type   K512,%object
89 .align  5
90 K512:
91 WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
92 WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
93 WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
94 WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
95 WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
96 WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
97 WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
98 WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
99 WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
100 WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
101 WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
102 WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
103 WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
104 WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
105 WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
106 WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
107 WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
108 WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
109 WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
110 WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
111 WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
112 WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
113 WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
114 WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
115 WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
116 WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
117 WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
118 WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
119 WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
120 WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
121 WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
122 WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
123 WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
124 WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
125 WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
126 WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
127 WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
128 WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
129 WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
130 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
131 .size   K512,.-K512
132 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
133 .LOPENSSL_armcap:
134 .word   OPENSSL_armcap_P-sha512_block_data_order
135 .skip   32-4
136 #else
137 .skip   32
138 #endif
139
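@ Calling convention (per the OpenSSL code this file was generated
@ from): r0 points at the eight 64-bit state words h[0-7], r1 at the
@ input and r2 holds the number of 128-byte blocks; the
@ "add r2,r1,r2,lsl#7" below scales that count by 2^7=128 bytes to
@ form an end-of-input pointer.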
140 .global sha512_block_data_order
141 .type   sha512_block_data_order,%function
142 sha512_block_data_order:
143 .Lsha512_block_data_order:
144 #if __ARM_ARCH__<7
145         sub     r3,pc,#8                @ sha512_block_data_order
146 #else
147         adr     r3,.Lsha512_block_data_order
148 #endif
149 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
150         ldr     r12,.LOPENSSL_armcap
151         ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
152         tst     r12,#1
153         bne     .LNEON
154 #endif
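@ Outside the kernel the NEON path is chosen at run time: bit 0 of
@ OPENSSL_armcap_P is tested above and, if set, control branches to
@ .LNEON. Kernel builds compile the probe out and instead use the
@ separate sha512_block_data_order_neon entry point further down.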
155         add     r2,r1,r2,lsl#7  @ len to point at the end of inp
156         stmdb   sp!,{r4-r12,lr}
157         sub     r14,r3,#672             @ K512
158         sub     sp,sp,#9*8
159
160         ldr     r7,[r0,#32+LO]
161         ldr     r8,[r0,#32+HI]
162         ldr     r9, [r0,#48+LO]
163         ldr     r10, [r0,#48+HI]
164         ldr     r11, [r0,#56+LO]
165         ldr     r12, [r0,#56+HI]
166 .Loop:
167         str     r9, [sp,#48+0]
168         str     r10, [sp,#48+4]
169         str     r11, [sp,#56+0]
170         str     r12, [sp,#56+4]
171         ldr     r5,[r0,#0+LO]
172         ldr     r6,[r0,#0+HI]
173         ldr     r3,[r0,#8+LO]
174         ldr     r4,[r0,#8+HI]
175         ldr     r9, [r0,#16+LO]
176         ldr     r10, [r0,#16+HI]
177         ldr     r11, [r0,#24+LO]
178         ldr     r12, [r0,#24+HI]
179         str     r3,[sp,#8+0]
180         str     r4,[sp,#8+4]
181         str     r9, [sp,#16+0]
182         str     r10, [sp,#16+4]
183         str     r11, [sp,#24+0]
184         str     r12, [sp,#24+4]
185         ldr     r3,[r0,#40+LO]
186         ldr     r4,[r0,#40+HI]
187         str     r3,[sp,#40+0]
188         str     r4,[sp,#40+4]
189
190 .L00_15:
191 #if __ARM_ARCH__<7
192         ldrb    r3,[r1,#7]
193         ldrb    r9, [r1,#6]
194         ldrb    r10, [r1,#5]
195         ldrb    r11, [r1,#4]
196         ldrb    r4,[r1,#3]
197         ldrb    r12, [r1,#2]
198         orr     r3,r3,r9,lsl#8
199         ldrb    r9, [r1,#1]
200         orr     r3,r3,r10,lsl#16
201         ldrb    r10, [r1],#8
202         orr     r3,r3,r11,lsl#24
203         orr     r4,r4,r12,lsl#8
204         orr     r4,r4,r9,lsl#16
205         orr     r4,r4,r10,lsl#24
206 #else
207         ldr     r3,[r1,#4]
208         ldr     r4,[r1],#8
209 #ifdef __ARMEL__
210         rev     r3,r3
211         rev     r4,r4
212 #endif
213 #endif
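@ On pre-ARMv7 cores the 64-bit big-endian input word is assembled a
@ byte at a time with ldrb/orr, which also sidesteps any alignment
@ restrictions; on ARMv7+ two word loads (plus rev on little-endian)
@ produce the same two 32-bit halves.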
214         @ Sigma1(x)     (ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
215         @ LO            lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
216         @ HI            hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
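        @ (Each 64-bit rotate is decomposed into 32-bit shifts: for n<32 the
        @ low half of ROTR(x,n) is (lo>>n)|(hi<<(32-n)), hence the 14/18 and
        @ 18/14 pairs; for n>32 the halves swap roles, so ROTR(x,41) uses
        @ 41-32=9 and 32-9=23, giving hi>>9^lo<<23 for the low half.)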
217         mov     r9,r7,lsr#14
218         str     r3,[sp,#64+0]
219         mov     r10,r8,lsr#14
220         str     r4,[sp,#64+4]
221         eor     r9,r9,r8,lsl#18
222         ldr     r11,[sp,#56+0]  @ h.lo
223         eor     r10,r10,r7,lsl#18
224         ldr     r12,[sp,#56+4]  @ h.hi
225         eor     r9,r9,r7,lsr#18
226         eor     r10,r10,r8,lsr#18
227         eor     r9,r9,r8,lsl#14
228         eor     r10,r10,r7,lsl#14
229         eor     r9,r9,r8,lsr#9
230         eor     r10,r10,r7,lsr#9
231         eor     r9,r9,r7,lsl#23
232         eor     r10,r10,r8,lsl#23       @ Sigma1(e)
233         adds    r3,r3,r9
234         ldr     r9,[sp,#40+0]   @ f.lo
235         adc     r4,r4,r10               @ T += Sigma1(e)
236         ldr     r10,[sp,#40+4]  @ f.hi
237         adds    r3,r3,r11
238         ldr     r11,[sp,#48+0]  @ g.lo
239         adc     r4,r4,r12               @ T += h
240         ldr     r12,[sp,#48+4]  @ g.hi
241
242         eor     r9,r9,r11
243         str     r7,[sp,#32+0]
244         eor     r10,r10,r12
245         str     r8,[sp,#32+4]
246         and     r9,r9,r7
247         str     r5,[sp,#0+0]
248         and     r10,r10,r8
249         str     r6,[sp,#0+4]
250         eor     r9,r9,r11
251         ldr     r11,[r14,#LO]   @ K[i].lo
252         eor     r10,r10,r12             @ Ch(e,f,g)
253         ldr     r12,[r14,#HI]   @ K[i].hi
254
255         adds    r3,r3,r9
256         ldr     r7,[sp,#24+0]   @ d.lo
257         adc     r4,r4,r10               @ T += Ch(e,f,g)
258         ldr     r8,[sp,#24+4]   @ d.hi
259         adds    r3,r3,r11
260         and     r9,r11,#0xff
261         adc     r4,r4,r12               @ T += K[i]
262         adds    r7,r7,r3
263         ldr     r11,[sp,#8+0]   @ b.lo
264         adc     r8,r8,r4                @ d += T
265         teq     r9,#148
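        @ Loop control: r9 is the low byte of K[i], and 148 (0x94) is the low
        @ byte of 0xc19bf174cf692694, the last of the first sixteen constants,
        @ so equality marks the end of the .L00_15 rounds. The .L16_79 loop
        @ below tests for 23 (0x17), the low byte of the final constant
        @ 0x6c44198c4a475817.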
266
267         ldr     r12,[sp,#16+0]  @ c.lo
268 #if __ARM_ARCH__>=7
269         it      eq                      @ Thumb2 thing, sanity check in ARM
270 #endif
271         orreq   r14,r14,#1
272         @ Sigma0(x)     (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
273         @ LO            lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
274         @ HI            hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
275         mov     r9,r5,lsr#28
276         mov     r10,r6,lsr#28
277         eor     r9,r9,r6,lsl#4
278         eor     r10,r10,r5,lsl#4
279         eor     r9,r9,r6,lsr#2
280         eor     r10,r10,r5,lsr#2
281         eor     r9,r9,r5,lsl#30
282         eor     r10,r10,r6,lsl#30
283         eor     r9,r9,r6,lsr#7
284         eor     r10,r10,r5,lsr#7
285         eor     r9,r9,r5,lsl#25
286         eor     r10,r10,r6,lsl#25       @ Sigma0(a)
287         adds    r3,r3,r9
288         and     r9,r5,r11
289         adc     r4,r4,r10               @ T += Sigma0(a)
290
291         ldr     r10,[sp,#8+4]   @ b.hi
292         orr     r5,r5,r11
293         ldr     r11,[sp,#16+4]  @ c.hi
294         and     r5,r5,r12
295         and     r12,r6,r10
296         orr     r6,r6,r10
297         orr     r5,r5,r9                @ Maj(a,b,c).lo
298         and     r6,r6,r11
299         adds    r5,r5,r3
300         orr     r6,r6,r12               @ Maj(a,b,c).hi
301         sub     sp,sp,#8
302         adc     r6,r6,r4                @ h += T
303         tst     r14,#1
304         add     r14,r14,#8
305         tst     r14,#1
306         beq     .L00_15
307         ldr     r9,[sp,#184+0]
308         ldr     r10,[sp,#184+4]
309         bic     r14,r14,#1
310 .L16_79:
311         @ sigma0(x)     (ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
312         @ LO            lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
313         @ HI            hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
314         mov     r3,r9,lsr#1
315         ldr     r11,[sp,#80+0]
316         mov     r4,r10,lsr#1
317         ldr     r12,[sp,#80+4]
318         eor     r3,r3,r10,lsl#31
319         eor     r4,r4,r9,lsl#31
320         eor     r3,r3,r9,lsr#8
321         eor     r4,r4,r10,lsr#8
322         eor     r3,r3,r10,lsl#24
323         eor     r4,r4,r9,lsl#24
324         eor     r3,r3,r9,lsr#7
325         eor     r4,r4,r10,lsr#7
326         eor     r3,r3,r10,lsl#25
327
328         @ sigma1(x)     (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
329         @ LO            lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
330         @ HI            hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
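        @ (Unlike the rotates, the plain shifts x>>7 and x>>6 do not wrap: the
        @ low half still receives hi<<25 resp. hi<<26 from the high word, but
        @ the high half is just hi>>7 resp. hi>>6, which is why the HI lines
        @ above have one term fewer than the LO lines.)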
331         mov     r9,r11,lsr#19
332         mov     r10,r12,lsr#19
333         eor     r9,r9,r12,lsl#13
334         eor     r10,r10,r11,lsl#13
335         eor     r9,r9,r12,lsr#29
336         eor     r10,r10,r11,lsr#29
337         eor     r9,r9,r11,lsl#3
338         eor     r10,r10,r12,lsl#3
339         eor     r9,r9,r11,lsr#6
340         eor     r10,r10,r12,lsr#6
341         ldr     r11,[sp,#120+0]
342         eor     r9,r9,r12,lsl#26
343
344         ldr     r12,[sp,#120+4]
345         adds    r3,r3,r9
346         ldr     r9,[sp,#192+0]
347         adc     r4,r4,r10
348
349         ldr     r10,[sp,#192+4]
350         adds    r3,r3,r11
351         adc     r4,r4,r12
352         adds    r3,r3,r9
353         adc     r4,r4,r10
354         @ Sigma1(x)     (ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
355         @ LO            lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
356         @ HI            hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
357         mov     r9,r7,lsr#14
358         str     r3,[sp,#64+0]
359         mov     r10,r8,lsr#14
360         str     r4,[sp,#64+4]
361         eor     r9,r9,r8,lsl#18
362         ldr     r11,[sp,#56+0]  @ h.lo
363         eor     r10,r10,r7,lsl#18
364         ldr     r12,[sp,#56+4]  @ h.hi
365         eor     r9,r9,r7,lsr#18
366         eor     r10,r10,r8,lsr#18
367         eor     r9,r9,r8,lsl#14
368         eor     r10,r10,r7,lsl#14
369         eor     r9,r9,r8,lsr#9
370         eor     r10,r10,r7,lsr#9
371         eor     r9,r9,r7,lsl#23
372         eor     r10,r10,r8,lsl#23       @ Sigma1(e)
373         adds    r3,r3,r9
374         ldr     r9,[sp,#40+0]   @ f.lo
375         adc     r4,r4,r10               @ T += Sigma1(e)
376         ldr     r10,[sp,#40+4]  @ f.hi
377         adds    r3,r3,r11
378         ldr     r11,[sp,#48+0]  @ g.lo
379         adc     r4,r4,r12               @ T += h
380         ldr     r12,[sp,#48+4]  @ g.hi
381
382         eor     r9,r9,r11
383         str     r7,[sp,#32+0]
384         eor     r10,r10,r12
385         str     r8,[sp,#32+4]
386         and     r9,r9,r7
387         str     r5,[sp,#0+0]
388         and     r10,r10,r8
389         str     r6,[sp,#0+4]
390         eor     r9,r9,r11
391         ldr     r11,[r14,#LO]   @ K[i].lo
392         eor     r10,r10,r12             @ Ch(e,f,g)
393         ldr     r12,[r14,#HI]   @ K[i].hi
394
395         adds    r3,r3,r9
396         ldr     r7,[sp,#24+0]   @ d.lo
397         adc     r4,r4,r10               @ T += Ch(e,f,g)
398         ldr     r8,[sp,#24+4]   @ d.hi
399         adds    r3,r3,r11
400         and     r9,r11,#0xff
401         adc     r4,r4,r12               @ T += K[i]
402         adds    r7,r7,r3
403         ldr     r11,[sp,#8+0]   @ b.lo
404         adc     r8,r8,r4                @ d += T
405         teq     r9,#23
406
407         ldr     r12,[sp,#16+0]  @ c.lo
408 #if __ARM_ARCH__>=7
409         it      eq                      @ Thumb2 thing, sanity check in ARM
410 #endif
411         orreq   r14,r14,#1
412         @ Sigma0(x)     (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
413         @ LO            lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
414         @ HI            hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
415         mov     r9,r5,lsr#28
416         mov     r10,r6,lsr#28
417         eor     r9,r9,r6,lsl#4
418         eor     r10,r10,r5,lsl#4
419         eor     r9,r9,r6,lsr#2
420         eor     r10,r10,r5,lsr#2
421         eor     r9,r9,r5,lsl#30
422         eor     r10,r10,r6,lsl#30
423         eor     r9,r9,r6,lsr#7
424         eor     r10,r10,r5,lsr#7
425         eor     r9,r9,r5,lsl#25
426         eor     r10,r10,r6,lsl#25       @ Sigma0(a)
427         adds    r3,r3,r9
428         and     r9,r5,r11
429         adc     r4,r4,r10               @ T += Sigma0(a)
430
431         ldr     r10,[sp,#8+4]   @ b.hi
432         orr     r5,r5,r11
433         ldr     r11,[sp,#16+4]  @ c.hi
434         and     r5,r5,r12
435         and     r12,r6,r10
436         orr     r6,r6,r10
437         orr     r5,r5,r9                @ Maj(a,b,c).lo
438         and     r6,r6,r11
439         adds    r5,r5,r3
440         orr     r6,r6,r12               @ Maj(a,b,c).hi
441         sub     sp,sp,#8
442         adc     r6,r6,r4                @ h += T
443         tst     r14,#1
444         add     r14,r14,#8
445 #if __ARM_ARCH__>=7
446         ittt    eq                      @ Thumb2 thing, sanity check in ARM
447 #endif
448         ldreq   r9,[sp,#184+0]
449         ldreq   r10,[sp,#184+4]
450         beq     .L16_79
451         bic     r14,r14,#1
452
453         ldr     r3,[sp,#8+0]
454         ldr     r4,[sp,#8+4]
455         ldr     r9, [r0,#0+LO]
456         ldr     r10, [r0,#0+HI]
457         ldr     r11, [r0,#8+LO]
458         ldr     r12, [r0,#8+HI]
459         adds    r9,r5,r9
460         str     r9, [r0,#0+LO]
461         adc     r10,r6,r10
462         str     r10, [r0,#0+HI]
463         adds    r11,r3,r11
464         str     r11, [r0,#8+LO]
465         adc     r12,r4,r12
466         str     r12, [r0,#8+HI]
467
468         ldr     r5,[sp,#16+0]
469         ldr     r6,[sp,#16+4]
470         ldr     r3,[sp,#24+0]
471         ldr     r4,[sp,#24+4]
472         ldr     r9, [r0,#16+LO]
473         ldr     r10, [r0,#16+HI]
474         ldr     r11, [r0,#24+LO]
475         ldr     r12, [r0,#24+HI]
476         adds    r9,r5,r9
477         str     r9, [r0,#16+LO]
478         adc     r10,r6,r10
479         str     r10, [r0,#16+HI]
480         adds    r11,r3,r11
481         str     r11, [r0,#24+LO]
482         adc     r12,r4,r12
483         str     r12, [r0,#24+HI]
484
485         ldr     r3,[sp,#40+0]
486         ldr     r4,[sp,#40+4]
487         ldr     r9, [r0,#32+LO]
488         ldr     r10, [r0,#32+HI]
489         ldr     r11, [r0,#40+LO]
490         ldr     r12, [r0,#40+HI]
491         adds    r7,r7,r9
492         str     r7,[r0,#32+LO]
493         adc     r8,r8,r10
494         str     r8,[r0,#32+HI]
495         adds    r11,r3,r11
496         str     r11, [r0,#40+LO]
497         adc     r12,r4,r12
498         str     r12, [r0,#40+HI]
499
500         ldr     r5,[sp,#48+0]
501         ldr     r6,[sp,#48+4]
502         ldr     r3,[sp,#56+0]
503         ldr     r4,[sp,#56+4]
504         ldr     r9, [r0,#48+LO]
505         ldr     r10, [r0,#48+HI]
506         ldr     r11, [r0,#56+LO]
507         ldr     r12, [r0,#56+HI]
508         adds    r9,r5,r9
509         str     r9, [r0,#48+LO]
510         adc     r10,r6,r10
511         str     r10, [r0,#48+HI]
512         adds    r11,r3,r11
513         str     r11, [r0,#56+LO]
514         adc     r12,r4,r12
515         str     r12, [r0,#56+HI]
516
517         add     sp,sp,#640
518         sub     r14,r14,#640
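        @ 640 = 80 rounds * 8 bytes: each round dropped sp by 8 and advanced
        @ r14 through K512 by 8, so both are rewound here before looping back
        @ for the next block.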
519
520         teq     r1,r2
521         bne     .Loop
522
523         add     sp,sp,#8*9              @ destroy frame
524 #if __ARM_ARCH__>=5
525         ldmia   sp!,{r4-r12,pc}
526 #else
527         ldmia   sp!,{r4-r12,lr}
528         tst     lr,#1
529         moveq   pc,lr                   @ be binary compatible with V4, yet
530         .word   0xe12fff1e                      @ interoperable with Thumb ISA:-)
531 #endif
532 .size   sha512_block_data_order,.-sha512_block_data_order
533 #if __ARM_MAX_ARCH__>=7
534 .arch   armv7-a
535 .fpu    neon
536
537 .global sha512_block_data_order_neon
538 .type   sha512_block_data_order_neon,%function
539 .align  4
540 sha512_block_data_order_neon:
541 .LNEON:
542         dmb                             @ errata #451034 on early Cortex A8
543         add     r2,r1,r2,lsl#7  @ len to point at the end of inp
544         VFP_ABI_PUSH
545         adr     r3,.Lsha512_block_data_order
546         sub     r3,r3,.Lsha512_block_data_order-K512
547         vldmia  r0,{d16-d23}            @ load context
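@ d16-d23 now hold the state a-h as whole 64-bit lanes, so no lo/hi
@ splitting is needed. Each ROTR(x,n) below is built from a vshr.u64
@ by #n followed by a vsli.64 by #(64-n) into the same register (the
@ #14/#50, #18/#46 and #41/#23 pairs), and vbsl computes the Ch and
@ Maj bit-selects directly.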
548 .Loop_neon:
549         vshr.u64        d24,d20,#14     @ 0
550 #if 0<16
551         vld1.64         {d0},[r1]!      @ handles unaligned
552 #endif
553         vshr.u64        d25,d20,#18
554 #if 0>0
555          vadd.i64       d16,d30                 @ h+=Maj from the past
556 #endif
557         vshr.u64        d26,d20,#41
558         vld1.64         {d28},[r3,:64]! @ K[i++]
559         vsli.64         d24,d20,#50
560         vsli.64         d25,d20,#46
561         vmov            d29,d20
562         vsli.64         d26,d20,#23
563 #if 0<16 && defined(__ARMEL__)
564         vrev64.8        d0,d0
565 #endif
566         veor            d25,d24
567         vbsl            d29,d21,d22             @ Ch(e,f,g)
568         vshr.u64        d24,d16,#28
569         veor            d26,d25                 @ Sigma1(e)
570         vadd.i64        d27,d29,d23
571         vshr.u64        d25,d16,#34
572         vsli.64         d24,d16,#36
573         vadd.i64        d27,d26
574         vshr.u64        d26,d16,#39
575         vadd.i64        d28,d0
576         vsli.64         d25,d16,#30
577         veor            d30,d16,d17
578         vsli.64         d26,d16,#25
579         veor            d23,d24,d25
580         vadd.i64        d27,d28
581         vbsl            d30,d18,d17             @ Maj(a,b,c)
582         veor            d23,d26                 @ Sigma0(a)
583         vadd.i64        d19,d27
584         vadd.i64        d30,d27
585         @ vadd.i64      d23,d30
586         vshr.u64        d24,d19,#14     @ 1
587 #if 1<16
588         vld1.64         {d1},[r1]!      @ handles unaligned
589 #endif
590         vshr.u64        d25,d19,#18
591 #if 1>0
592          vadd.i64       d23,d30                 @ h+=Maj from the past
593 #endif
594         vshr.u64        d26,d19,#41
595         vld1.64         {d28},[r3,:64]! @ K[i++]
596         vsli.64         d24,d19,#50
597         vsli.64         d25,d19,#46
598         vmov            d29,d19
599         vsli.64         d26,d19,#23
600 #if 1<16 && defined(__ARMEL__)
601         vrev64.8        d1,d1
602 #endif
603         veor            d25,d24
604         vbsl            d29,d20,d21             @ Ch(e,f,g)
605         vshr.u64        d24,d23,#28
606         veor            d26,d25                 @ Sigma1(e)
607         vadd.i64        d27,d29,d22
608         vshr.u64        d25,d23,#34
609         vsli.64         d24,d23,#36
610         vadd.i64        d27,d26
611         vshr.u64        d26,d23,#39
612         vadd.i64        d28,d1
613         vsli.64         d25,d23,#30
614         veor            d30,d23,d16
615         vsli.64         d26,d23,#25
616         veor            d22,d24,d25
617         vadd.i64        d27,d28
618         vbsl            d30,d17,d16             @ Maj(a,b,c)
619         veor            d22,d26                 @ Sigma0(a)
620         vadd.i64        d18,d27
621         vadd.i64        d30,d27
622         @ vadd.i64      d22,d30
623         vshr.u64        d24,d18,#14     @ 2
624 #if 2<16
625         vld1.64         {d2},[r1]!      @ handles unaligned
626 #endif
627         vshr.u64        d25,d18,#18
628 #if 2>0
629          vadd.i64       d22,d30                 @ h+=Maj from the past
630 #endif
631         vshr.u64        d26,d18,#41
632         vld1.64         {d28},[r3,:64]! @ K[i++]
633         vsli.64         d24,d18,#50
634         vsli.64         d25,d18,#46
635         vmov            d29,d18
636         vsli.64         d26,d18,#23
637 #if 2<16 && defined(__ARMEL__)
638         vrev64.8        d2,d2
639 #endif
640         veor            d25,d24
641         vbsl            d29,d19,d20             @ Ch(e,f,g)
642         vshr.u64        d24,d22,#28
643         veor            d26,d25                 @ Sigma1(e)
644         vadd.i64        d27,d29,d21
645         vshr.u64        d25,d22,#34
646         vsli.64         d24,d22,#36
647         vadd.i64        d27,d26
648         vshr.u64        d26,d22,#39
649         vadd.i64        d28,d2
650         vsli.64         d25,d22,#30
651         veor            d30,d22,d23
652         vsli.64         d26,d22,#25
653         veor            d21,d24,d25
654         vadd.i64        d27,d28
655         vbsl            d30,d16,d23             @ Maj(a,b,c)
656         veor            d21,d26                 @ Sigma0(a)
657         vadd.i64        d17,d27
658         vadd.i64        d30,d27
659         @ vadd.i64      d21,d30
660         vshr.u64        d24,d17,#14     @ 3
661 #if 3<16
662         vld1.64         {d3},[r1]!      @ handles unaligned
663 #endif
664         vshr.u64        d25,d17,#18
665 #if 3>0
666          vadd.i64       d21,d30                 @ h+=Maj from the past
667 #endif
668         vshr.u64        d26,d17,#41
669         vld1.64         {d28},[r3,:64]! @ K[i++]
670         vsli.64         d24,d17,#50
671         vsli.64         d25,d17,#46
672         vmov            d29,d17
673         vsli.64         d26,d17,#23
674 #if 3<16 && defined(__ARMEL__)
675         vrev64.8        d3,d3
676 #endif
677         veor            d25,d24
678         vbsl            d29,d18,d19             @ Ch(e,f,g)
679         vshr.u64        d24,d21,#28
680         veor            d26,d25                 @ Sigma1(e)
681         vadd.i64        d27,d29,d20
682         vshr.u64        d25,d21,#34
683         vsli.64         d24,d21,#36
684         vadd.i64        d27,d26
685         vshr.u64        d26,d21,#39
686         vadd.i64        d28,d3
687         vsli.64         d25,d21,#30
688         veor            d30,d21,d22
689         vsli.64         d26,d21,#25
690         veor            d20,d24,d25
691         vadd.i64        d27,d28
692         vbsl            d30,d23,d22             @ Maj(a,b,c)
693         veor            d20,d26                 @ Sigma0(a)
694         vadd.i64        d16,d27
695         vadd.i64        d30,d27
696         @ vadd.i64      d20,d30
697         vshr.u64        d24,d16,#14     @ 4
698 #if 4<16
699         vld1.64         {d4},[r1]!      @ handles unaligned
700 #endif
701         vshr.u64        d25,d16,#18
702 #if 4>0
703          vadd.i64       d20,d30                 @ h+=Maj from the past
704 #endif
705         vshr.u64        d26,d16,#41
706         vld1.64         {d28},[r3,:64]! @ K[i++]
707         vsli.64         d24,d16,#50
708         vsli.64         d25,d16,#46
709         vmov            d29,d16
710         vsli.64         d26,d16,#23
711 #if 4<16 && defined(__ARMEL__)
712         vrev64.8        d4,d4
713 #endif
714         veor            d25,d24
715         vbsl            d29,d17,d18             @ Ch(e,f,g)
716         vshr.u64        d24,d20,#28
717         veor            d26,d25                 @ Sigma1(e)
718         vadd.i64        d27,d29,d19
719         vshr.u64        d25,d20,#34
720         vsli.64         d24,d20,#36
721         vadd.i64        d27,d26
722         vshr.u64        d26,d20,#39
723         vadd.i64        d28,d4
724         vsli.64         d25,d20,#30
725         veor            d30,d20,d21
726         vsli.64         d26,d20,#25
727         veor            d19,d24,d25
728         vadd.i64        d27,d28
729         vbsl            d30,d22,d21             @ Maj(a,b,c)
730         veor            d19,d26                 @ Sigma0(a)
731         vadd.i64        d23,d27
732         vadd.i64        d30,d27
733         @ vadd.i64      d19,d30
734         vshr.u64        d24,d23,#14     @ 5
735 #if 5<16
736         vld1.64         {d5},[r1]!      @ handles unaligned
737 #endif
738         vshr.u64        d25,d23,#18
739 #if 5>0
740          vadd.i64       d19,d30                 @ h+=Maj from the past
741 #endif
742         vshr.u64        d26,d23,#41
743         vld1.64         {d28},[r3,:64]! @ K[i++]
744         vsli.64         d24,d23,#50
745         vsli.64         d25,d23,#46
746         vmov            d29,d23
747         vsli.64         d26,d23,#23
748 #if 5<16 && defined(__ARMEL__)
749         vrev64.8        d5,d5
750 #endif
751         veor            d25,d24
752         vbsl            d29,d16,d17             @ Ch(e,f,g)
753         vshr.u64        d24,d19,#28
754         veor            d26,d25                 @ Sigma1(e)
755         vadd.i64        d27,d29,d18
756         vshr.u64        d25,d19,#34
757         vsli.64         d24,d19,#36
758         vadd.i64        d27,d26
759         vshr.u64        d26,d19,#39
760         vadd.i64        d28,d5
761         vsli.64         d25,d19,#30
762         veor            d30,d19,d20
763         vsli.64         d26,d19,#25
764         veor            d18,d24,d25
765         vadd.i64        d27,d28
766         vbsl            d30,d21,d20             @ Maj(a,b,c)
767         veor            d18,d26                 @ Sigma0(a)
768         vadd.i64        d22,d27
769         vadd.i64        d30,d27
770         @ vadd.i64      d18,d30
771         vshr.u64        d24,d22,#14     @ 6
772 #if 6<16
773         vld1.64         {d6},[r1]!      @ handles unaligned
774 #endif
775         vshr.u64        d25,d22,#18
776 #if 6>0
777          vadd.i64       d18,d30                 @ h+=Maj from the past
778 #endif
779         vshr.u64        d26,d22,#41
780         vld1.64         {d28},[r3,:64]! @ K[i++]
781         vsli.64         d24,d22,#50
782         vsli.64         d25,d22,#46
783         vmov            d29,d22
784         vsli.64         d26,d22,#23
785 #if 6<16 && defined(__ARMEL__)
786         vrev64.8        d6,d6
787 #endif
788         veor            d25,d24
789         vbsl            d29,d23,d16             @ Ch(e,f,g)
790         vshr.u64        d24,d18,#28
791         veor            d26,d25                 @ Sigma1(e)
792         vadd.i64        d27,d29,d17
793         vshr.u64        d25,d18,#34
794         vsli.64         d24,d18,#36
795         vadd.i64        d27,d26
796         vshr.u64        d26,d18,#39
797         vadd.i64        d28,d6
798         vsli.64         d25,d18,#30
799         veor            d30,d18,d19
800         vsli.64         d26,d18,#25
801         veor            d17,d24,d25
802         vadd.i64        d27,d28
803         vbsl            d30,d20,d19             @ Maj(a,b,c)
804         veor            d17,d26                 @ Sigma0(a)
805         vadd.i64        d21,d27
806         vadd.i64        d30,d27
807         @ vadd.i64      d17,d30
808         vshr.u64        d24,d21,#14     @ 7
809 #if 7<16
810         vld1.64         {d7},[r1]!      @ handles unaligned
811 #endif
812         vshr.u64        d25,d21,#18
813 #if 7>0
814          vadd.i64       d17,d30                 @ h+=Maj from the past
815 #endif
816         vshr.u64        d26,d21,#41
817         vld1.64         {d28},[r3,:64]! @ K[i++]
818         vsli.64         d24,d21,#50
819         vsli.64         d25,d21,#46
820         vmov            d29,d21
821         vsli.64         d26,d21,#23
822 #if 7<16 && defined(__ARMEL__)
823         vrev64.8        d7,d7
824 #endif
825         veor            d25,d24
826         vbsl            d29,d22,d23             @ Ch(e,f,g)
827         vshr.u64        d24,d17,#28
828         veor            d26,d25                 @ Sigma1(e)
829         vadd.i64        d27,d29,d16
830         vshr.u64        d25,d17,#34
831         vsli.64         d24,d17,#36
832         vadd.i64        d27,d26
833         vshr.u64        d26,d17,#39
834         vadd.i64        d28,d7
835         vsli.64         d25,d17,#30
836         veor            d30,d17,d18
837         vsli.64         d26,d17,#25
838         veor            d16,d24,d25
839         vadd.i64        d27,d28
840         vbsl            d30,d19,d18             @ Maj(a,b,c)
841         veor            d16,d26                 @ Sigma0(a)
842         vadd.i64        d20,d27
843         vadd.i64        d30,d27
844         @ vadd.i64      d16,d30
845         vshr.u64        d24,d20,#14     @ 8
846 #if 8<16
847         vld1.64         {d8},[r1]!      @ handles unaligned
848 #endif
849         vshr.u64        d25,d20,#18
850 #if 8>0
851          vadd.i64       d16,d30                 @ h+=Maj from the past
852 #endif
853         vshr.u64        d26,d20,#41
854         vld1.64         {d28},[r3,:64]! @ K[i++]
855         vsli.64         d24,d20,#50
856         vsli.64         d25,d20,#46
857         vmov            d29,d20
858         vsli.64         d26,d20,#23
859 #if 8<16 && defined(__ARMEL__)
860         vrev64.8        d8,d8
861 #endif
862         veor            d25,d24
863         vbsl            d29,d21,d22             @ Ch(e,f,g)
864         vshr.u64        d24,d16,#28
865         veor            d26,d25                 @ Sigma1(e)
866         vadd.i64        d27,d29,d23
867         vshr.u64        d25,d16,#34
868         vsli.64         d24,d16,#36
869         vadd.i64        d27,d26
870         vshr.u64        d26,d16,#39
871         vadd.i64        d28,d8
872         vsli.64         d25,d16,#30
873         veor            d30,d16,d17
874         vsli.64         d26,d16,#25
875         veor            d23,d24,d25
876         vadd.i64        d27,d28
877         vbsl            d30,d18,d17             @ Maj(a,b,c)
878         veor            d23,d26                 @ Sigma0(a)
879         vadd.i64        d19,d27
880         vadd.i64        d30,d27
881         @ vadd.i64      d23,d30
882         vshr.u64        d24,d19,#14     @ 9
883 #if 9<16
884         vld1.64         {d9},[r1]!      @ handles unaligned
885 #endif
886         vshr.u64        d25,d19,#18
887 #if 9>0
888          vadd.i64       d23,d30                 @ h+=Maj from the past
889 #endif
890         vshr.u64        d26,d19,#41
891         vld1.64         {d28},[r3,:64]! @ K[i++]
892         vsli.64         d24,d19,#50
893         vsli.64         d25,d19,#46
894         vmov            d29,d19
895         vsli.64         d26,d19,#23
896 #if 9<16 && defined(__ARMEL__)
897         vrev64.8        d9,d9
898 #endif
899         veor            d25,d24
900         vbsl            d29,d20,d21             @ Ch(e,f,g)
901         vshr.u64        d24,d23,#28
902         veor            d26,d25                 @ Sigma1(e)
903         vadd.i64        d27,d29,d22
904         vshr.u64        d25,d23,#34
905         vsli.64         d24,d23,#36
906         vadd.i64        d27,d26
907         vshr.u64        d26,d23,#39
908         vadd.i64        d28,d9
909         vsli.64         d25,d23,#30
910         veor            d30,d23,d16
911         vsli.64         d26,d23,#25
912         veor            d22,d24,d25
913         vadd.i64        d27,d28
914         vbsl            d30,d17,d16             @ Maj(a,b,c)
915         veor            d22,d26                 @ Sigma0(a)
916         vadd.i64        d18,d27
917         vadd.i64        d30,d27
918         @ vadd.i64      d22,d30
919         vshr.u64        d24,d18,#14     @ 10
920 #if 10<16
921         vld1.64         {d10},[r1]!     @ handles unaligned
922 #endif
923         vshr.u64        d25,d18,#18
924 #if 10>0
925          vadd.i64       d22,d30                 @ h+=Maj from the past
926 #endif
927         vshr.u64        d26,d18,#41
928         vld1.64         {d28},[r3,:64]! @ K[i++]
929         vsli.64         d24,d18,#50
930         vsli.64         d25,d18,#46
931         vmov            d29,d18
932         vsli.64         d26,d18,#23
933 #if 10<16 && defined(__ARMEL__)
934         vrev64.8        d10,d10
935 #endif
936         veor            d25,d24
937         vbsl            d29,d19,d20             @ Ch(e,f,g)
938         vshr.u64        d24,d22,#28
939         veor            d26,d25                 @ Sigma1(e)
940         vadd.i64        d27,d29,d21
941         vshr.u64        d25,d22,#34
942         vsli.64         d24,d22,#36
943         vadd.i64        d27,d26
944         vshr.u64        d26,d22,#39
945         vadd.i64        d28,d10
946         vsli.64         d25,d22,#30
947         veor            d30,d22,d23
948         vsli.64         d26,d22,#25
949         veor            d21,d24,d25
950         vadd.i64        d27,d28
951         vbsl            d30,d16,d23             @ Maj(a,b,c)
952         veor            d21,d26                 @ Sigma0(a)
953         vadd.i64        d17,d27
954         vadd.i64        d30,d27
955         @ vadd.i64      d21,d30
956         vshr.u64        d24,d17,#14     @ 11
957 #if 11<16
958         vld1.64         {d11},[r1]!     @ handles unaligned
959 #endif
960         vshr.u64        d25,d17,#18
961 #if 11>0
962          vadd.i64       d21,d30                 @ h+=Maj from the past
963 #endif
964         vshr.u64        d26,d17,#41
965         vld1.64         {d28},[r3,:64]! @ K[i++]
966         vsli.64         d24,d17,#50
967         vsli.64         d25,d17,#46
968         vmov            d29,d17
969         vsli.64         d26,d17,#23
970 #if 11<16 && defined(__ARMEL__)
971         vrev64.8        d11,d11
972 #endif
973         veor            d25,d24
974         vbsl            d29,d18,d19             @ Ch(e,f,g)
975         vshr.u64        d24,d21,#28
976         veor            d26,d25                 @ Sigma1(e)
977         vadd.i64        d27,d29,d20
978         vshr.u64        d25,d21,#34
979         vsli.64         d24,d21,#36
980         vadd.i64        d27,d26
981         vshr.u64        d26,d21,#39
982         vadd.i64        d28,d11
983         vsli.64         d25,d21,#30
984         veor            d30,d21,d22
985         vsli.64         d26,d21,#25
986         veor            d20,d24,d25
987         vadd.i64        d27,d28
988         vbsl            d30,d23,d22             @ Maj(a,b,c)
989         veor            d20,d26                 @ Sigma0(a)
990         vadd.i64        d16,d27
991         vadd.i64        d30,d27
992         @ vadd.i64      d20,d30
993         vshr.u64        d24,d16,#14     @ 12
994 #if 12<16
995         vld1.64         {d12},[r1]!     @ handles unaligned
996 #endif
997         vshr.u64        d25,d16,#18
998 #if 12>0
999          vadd.i64       d20,d30                 @ h+=Maj from the past
1000 #endif
1001         vshr.u64        d26,d16,#41
1002         vld1.64         {d28},[r3,:64]! @ K[i++]
1003         vsli.64         d24,d16,#50
1004         vsli.64         d25,d16,#46
1005         vmov            d29,d16
1006         vsli.64         d26,d16,#23
1007 #if 12<16 && defined(__ARMEL__)
1008         vrev64.8        d12,d12
1009 #endif
1010         veor            d25,d24
1011         vbsl            d29,d17,d18             @ Ch(e,f,g)
1012         vshr.u64        d24,d20,#28
1013         veor            d26,d25                 @ Sigma1(e)
1014         vadd.i64        d27,d29,d19
1015         vshr.u64        d25,d20,#34
1016         vsli.64         d24,d20,#36
1017         vadd.i64        d27,d26
1018         vshr.u64        d26,d20,#39
1019         vadd.i64        d28,d12
1020         vsli.64         d25,d20,#30
1021         veor            d30,d20,d21
1022         vsli.64         d26,d20,#25
1023         veor            d19,d24,d25
1024         vadd.i64        d27,d28
1025         vbsl            d30,d22,d21             @ Maj(a,b,c)
1026         veor            d19,d26                 @ Sigma0(a)
1027         vadd.i64        d23,d27
1028         vadd.i64        d30,d27
1029         @ vadd.i64      d19,d30
1030         vshr.u64        d24,d23,#14     @ 13
1031 #if 13<16
1032         vld1.64         {d13},[r1]!     @ handles unaligned
1033 #endif
1034         vshr.u64        d25,d23,#18
1035 #if 13>0
1036          vadd.i64       d19,d30                 @ h+=Maj from the past
1037 #endif
1038         vshr.u64        d26,d23,#41
1039         vld1.64         {d28},[r3,:64]! @ K[i++]
1040         vsli.64         d24,d23,#50
1041         vsli.64         d25,d23,#46
1042         vmov            d29,d23
1043         vsli.64         d26,d23,#23
1044 #if 13<16 && defined(__ARMEL__)
1045         vrev64.8        d13,d13
1046 #endif
1047         veor            d25,d24
1048         vbsl            d29,d16,d17             @ Ch(e,f,g)
1049         vshr.u64        d24,d19,#28
1050         veor            d26,d25                 @ Sigma1(e)
1051         vadd.i64        d27,d29,d18
1052         vshr.u64        d25,d19,#34
1053         vsli.64         d24,d19,#36
1054         vadd.i64        d27,d26
1055         vshr.u64        d26,d19,#39
1056         vadd.i64        d28,d13
1057         vsli.64         d25,d19,#30
1058         veor            d30,d19,d20
1059         vsli.64         d26,d19,#25
1060         veor            d18,d24,d25
1061         vadd.i64        d27,d28
1062         vbsl            d30,d21,d20             @ Maj(a,b,c)
1063         veor            d18,d26                 @ Sigma0(a)
1064         vadd.i64        d22,d27
1065         vadd.i64        d30,d27
1066         @ vadd.i64      d18,d30
1067         vshr.u64        d24,d22,#14     @ 14
1068 #if 14<16
1069         vld1.64         {d14},[r1]!     @ handles unaligned
1070 #endif
1071         vshr.u64        d25,d22,#18
1072 #if 14>0
1073          vadd.i64       d18,d30                 @ h+=Maj from the past
1074 #endif
1075         vshr.u64        d26,d22,#41
1076         vld1.64         {d28},[r3,:64]! @ K[i++]
1077         vsli.64         d24,d22,#50
1078         vsli.64         d25,d22,#46
1079         vmov            d29,d22
1080         vsli.64         d26,d22,#23
1081 #if 14<16 && defined(__ARMEL__)
1082         vrev64.8        d14,d14
1083 #endif
1084         veor            d25,d24
1085         vbsl            d29,d23,d16             @ Ch(e,f,g)
1086         vshr.u64        d24,d18,#28
1087         veor            d26,d25                 @ Sigma1(e)
1088         vadd.i64        d27,d29,d17
1089         vshr.u64        d25,d18,#34
1090         vsli.64         d24,d18,#36
1091         vadd.i64        d27,d26
1092         vshr.u64        d26,d18,#39
1093         vadd.i64        d28,d14
1094         vsli.64         d25,d18,#30
1095         veor            d30,d18,d19
1096         vsli.64         d26,d18,#25
1097         veor            d17,d24,d25
1098         vadd.i64        d27,d28
1099         vbsl            d30,d20,d19             @ Maj(a,b,c)
1100         veor            d17,d26                 @ Sigma0(a)
1101         vadd.i64        d21,d27
1102         vadd.i64        d30,d27
1103         @ vadd.i64      d17,d30
1104         vshr.u64        d24,d21,#14     @ 15
1105 #if 15<16
1106         vld1.64         {d15},[r1]!     @ handles unaligned
1107 #endif
1108         vshr.u64        d25,d21,#18
1109 #if 15>0
1110          vadd.i64       d17,d30                 @ h+=Maj from the past
1111 #endif
1112         vshr.u64        d26,d21,#41
1113         vld1.64         {d28},[r3,:64]! @ K[i++]
1114         vsli.64         d24,d21,#50
1115         vsli.64         d25,d21,#46
1116         vmov            d29,d21
1117         vsli.64         d26,d21,#23
1118 #if 15<16 && defined(__ARMEL__)
1119         vrev64.8        d15,d15
1120 #endif
1121         veor            d25,d24
1122         vbsl            d29,d22,d23             @ Ch(e,f,g)
1123         vshr.u64        d24,d17,#28
1124         veor            d26,d25                 @ Sigma1(e)
1125         vadd.i64        d27,d29,d16
1126         vshr.u64        d25,d17,#34
1127         vsli.64         d24,d17,#36
1128         vadd.i64        d27,d26
1129         vshr.u64        d26,d17,#39
1130         vadd.i64        d28,d15
1131         vsli.64         d25,d17,#30
1132         veor            d30,d17,d18
1133         vsli.64         d26,d17,#25
1134         veor            d16,d24,d25
1135         vadd.i64        d27,d28
1136         vbsl            d30,d19,d18             @ Maj(a,b,c)
1137         veor            d16,d26                 @ Sigma0(a)
1138         vadd.i64        d20,d27
1139         vadd.i64        d30,d27
1140         @ vadd.i64      d16,d30
1141         mov             r12,#4
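@ Rounds 16-79 run as four passes of sixteen rounds each, hence the
@ counter of 4. Within a pass the message schedule is updated two
@ 64-bit words at a time in q-registers, with vext.8 supplying the
@ X[i+1] and X[i+9] terms and the same vshr/vsli pairing forming
@ sigma0 and sigma1.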
1142 .L16_79_neon:
1143         subs            r12,#1
1144         vshr.u64        q12,q7,#19
1145         vshr.u64        q13,q7,#61
1146          vadd.i64       d16,d30                 @ h+=Maj from the past
1147         vshr.u64        q15,q7,#6
1148         vsli.64         q12,q7,#45
1149         vext.8          q14,q0,q1,#8    @ X[i+1]
1150         vsli.64         q13,q7,#3
1151         veor            q15,q12
1152         vshr.u64        q12,q14,#1
1153         veor            q15,q13                         @ sigma1(X[i+14])
1154         vshr.u64        q13,q14,#8
1155         vadd.i64        q0,q15
1156         vshr.u64        q15,q14,#7
1157         vsli.64         q12,q14,#63
1158         vsli.64         q13,q14,#56
1159         vext.8          q14,q4,q5,#8    @ X[i+9]
1160         veor            q15,q12
1161         vshr.u64        d24,d20,#14             @ from NEON_00_15
1162         vadd.i64        q0,q14
1163         vshr.u64        d25,d20,#18             @ from NEON_00_15
1164         veor            q15,q13                         @ sigma0(X[i+1])
1165         vshr.u64        d26,d20,#41             @ from NEON_00_15
1166         vadd.i64        q0,q15
1167         vld1.64         {d28},[r3,:64]! @ K[i++]
1168         vsli.64         d24,d20,#50
1169         vsli.64         d25,d20,#46
1170         vmov            d29,d20
1171         vsli.64         d26,d20,#23
1172 #if 16<16 && defined(__ARMEL__)
1173         vrev64.8        ,
1174 #endif
1175         veor            d25,d24
1176         vbsl            d29,d21,d22             @ Ch(e,f,g)
1177         vshr.u64        d24,d16,#28
1178         veor            d26,d25                 @ Sigma1(e)
1179         vadd.i64        d27,d29,d23
1180         vshr.u64        d25,d16,#34
1181         vsli.64         d24,d16,#36
1182         vadd.i64        d27,d26
1183         vshr.u64        d26,d16,#39
1184         vadd.i64        d28,d0
1185         vsli.64         d25,d16,#30
1186         veor            d30,d16,d17
1187         vsli.64         d26,d16,#25
1188         veor            d23,d24,d25
1189         vadd.i64        d27,d28
1190         vbsl            d30,d18,d17             @ Maj(a,b,c)
1191         veor            d23,d26                 @ Sigma0(a)
1192         vadd.i64        d19,d27
1193         vadd.i64        d30,d27
1194         @ vadd.i64      d23,d30
1195         vshr.u64        d24,d19,#14     @ 17
1196 #if 17<16
1197         vld1.64         {d1},[r1]!      @ handles unaligned
1198 #endif
1199         vshr.u64        d25,d19,#18
1200 #if 17>0
1201          vadd.i64       d23,d30                 @ h+=Maj from the past
1202 #endif
1203         vshr.u64        d26,d19,#41
1204         vld1.64         {d28},[r3,:64]! @ K[i++]
1205         vsli.64         d24,d19,#50
1206         vsli.64         d25,d19,#46
1207         vmov            d29,d19
1208         vsli.64         d26,d19,#23
1209 #if 17<16 && defined(__ARMEL__)
1210         vrev64.8        ,
1211 #endif
1212         veor            d25,d24
1213         vbsl            d29,d20,d21             @ Ch(e,f,g)
1214         vshr.u64        d24,d23,#28
1215         veor            d26,d25                 @ Sigma1(e)
1216         vadd.i64        d27,d29,d22
1217         vshr.u64        d25,d23,#34
1218         vsli.64         d24,d23,#36
1219         vadd.i64        d27,d26
1220         vshr.u64        d26,d23,#39
1221         vadd.i64        d28,d1
1222         vsli.64         d25,d23,#30
1223         veor            d30,d23,d16
1224         vsli.64         d26,d23,#25
1225         veor            d22,d24,d25
1226         vadd.i64        d27,d28
1227         vbsl            d30,d17,d16             @ Maj(a,b,c)
1228         veor            d22,d26                 @ Sigma0(a)
1229         vadd.i64        d18,d27
1230         vadd.i64        d30,d27
1231         @ vadd.i64      d22,d30
1232         vshr.u64        q12,q0,#19
1233         vshr.u64        q13,q0,#61
1234          vadd.i64       d22,d30                 @ h+=Maj from the past
1235         vshr.u64        q15,q0,#6
1236         vsli.64         q12,q0,#45
1237         vext.8          q14,q1,q2,#8    @ X[i+1]
1238         vsli.64         q13,q0,#3
1239         veor            q15,q12
1240         vshr.u64        q12,q14,#1
1241         veor            q15,q13                         @ sigma1(X[i+14])
1242         vshr.u64        q13,q14,#8
1243         vadd.i64        q1,q15
1244         vshr.u64        q15,q14,#7
1245         vsli.64         q12,q14,#63
1246         vsli.64         q13,q14,#56
1247         vext.8          q14,q5,q6,#8    @ X[i+9]
1248         veor            q15,q12
1249         vshr.u64        d24,d18,#14             @ from NEON_00_15
1250         vadd.i64        q1,q14
1251         vshr.u64        d25,d18,#18             @ from NEON_00_15
1252         veor            q15,q13                         @ sigma0(X[i+1])
1253         vshr.u64        d26,d18,#41             @ from NEON_00_15
1254         vadd.i64        q1,q15
1255         vld1.64         {d28},[r3,:64]! @ K[i++]
1256         vsli.64         d24,d18,#50
1257         vsli.64         d25,d18,#46
1258         vmov            d29,d18
1259         vsli.64         d26,d18,#23
1260 #if 18<16 && defined(__ARMEL__)
1261         vrev64.8        ,
1262 #endif
1263         veor            d25,d24
1264         vbsl            d29,d19,d20             @ Ch(e,f,g)
1265         vshr.u64        d24,d22,#28
1266         veor            d26,d25                 @ Sigma1(e)
1267         vadd.i64        d27,d29,d21
1268         vshr.u64        d25,d22,#34
1269         vsli.64         d24,d22,#36
1270         vadd.i64        d27,d26
1271         vshr.u64        d26,d22,#39
1272         vadd.i64        d28,d2
1273         vsli.64         d25,d22,#30
1274         veor            d30,d22,d23
1275         vsli.64         d26,d22,#25
1276         veor            d21,d24,d25
1277         vadd.i64        d27,d28
1278         vbsl            d30,d16,d23             @ Maj(a,b,c)
1279         veor            d21,d26                 @ Sigma0(a)
1280         vadd.i64        d17,d27
1281         vadd.i64        d30,d27
1282         @ vadd.i64      d21,d30
1283         vshr.u64        d24,d17,#14     @ 19
1284 #if 19<16
1285         vld1.64         {d3},[r1]!      @ handles unaligned
1286 #endif
1287         vshr.u64        d25,d17,#18
1288 #if 19>0
1289          vadd.i64       d21,d30                 @ h+=Maj from the past
1290 #endif
1291         vshr.u64        d26,d17,#41
1292         vld1.64         {d28},[r3,:64]! @ K[i++]
1293         vsli.64         d24,d17,#50
1294         vsli.64         d25,d17,#46
1295         vmov            d29,d17
1296         vsli.64         d26,d17,#23
1297 #if 19<16 && defined(__ARMEL__)
1298         vrev64.8        ,
1299 #endif
1300         veor            d25,d24
1301         vbsl            d29,d18,d19             @ Ch(e,f,g)
1302         vshr.u64        d24,d21,#28
1303         veor            d26,d25                 @ Sigma1(e)
1304         vadd.i64        d27,d29,d20
1305         vshr.u64        d25,d21,#34
1306         vsli.64         d24,d21,#36
1307         vadd.i64        d27,d26
1308         vshr.u64        d26,d21,#39
1309         vadd.i64        d28,d3
1310         vsli.64         d25,d21,#30
1311         veor            d30,d21,d22
1312         vsli.64         d26,d21,#25
1313         veor            d20,d24,d25
1314         vadd.i64        d27,d28
1315         vbsl            d30,d23,d22             @ Maj(a,b,c)
1316         veor            d20,d26                 @ Sigma0(a)
1317         vadd.i64        d16,d27
1318         vadd.i64        d30,d27
1319         @ vadd.i64      d20,d30
1320         vshr.u64        q12,q1,#19
1321         vshr.u64        q13,q1,#61
1322          vadd.i64       d20,d30                 @ h+=Maj from the past
1323         vshr.u64        q15,q1,#6
1324         vsli.64         q12,q1,#45
1325         vext.8          q14,q2,q3,#8    @ X[i+1]
1326         vsli.64         q13,q1,#3
1327         veor            q15,q12
1328         vshr.u64        q12,q14,#1
1329         veor            q15,q13                         @ sigma1(X[i+14])
1330         vshr.u64        q13,q14,#8
1331         vadd.i64        q2,q15
1332         vshr.u64        q15,q14,#7
1333         vsli.64         q12,q14,#63
1334         vsli.64         q13,q14,#56
1335         vext.8          q14,q6,q7,#8    @ X[i+9]
1336         veor            q15,q12
1337         vshr.u64        d24,d16,#14             @ from NEON_00_15
1338         vadd.i64        q2,q14
1339         vshr.u64        d25,d16,#18             @ from NEON_00_15
1340         veor            q15,q13                         @ sigma0(X[i+1])
1341         vshr.u64        d26,d16,#41             @ from NEON_00_15
1342         vadd.i64        q2,q15
1343         vld1.64         {d28},[r3,:64]! @ K[i++]
1344         vsli.64         d24,d16,#50
1345         vsli.64         d25,d16,#46
1346         vmov            d29,d16
1347         vsli.64         d26,d16,#23
1348 #if 20<16 && defined(__ARMEL__)
1349         vrev64.8        ,
1350 #endif
1351         veor            d25,d24
1352         vbsl            d29,d17,d18             @ Ch(e,f,g)
1353         vshr.u64        d24,d20,#28
1354         veor            d26,d25                 @ Sigma1(e)
1355         vadd.i64        d27,d29,d19
1356         vshr.u64        d25,d20,#34
1357         vsli.64         d24,d20,#36
1358         vadd.i64        d27,d26
1359         vshr.u64        d26,d20,#39
1360         vadd.i64        d28,d4
1361         vsli.64         d25,d20,#30
1362         veor            d30,d20,d21
1363         vsli.64         d26,d20,#25
1364         veor            d19,d24,d25
1365         vadd.i64        d27,d28
1366         vbsl            d30,d22,d21             @ Maj(a,b,c)
1367         veor            d19,d26                 @ Sigma0(a)
1368         vadd.i64        d23,d27
1369         vadd.i64        d30,d27
1370         @ vadd.i64      d19,d30
1371         vshr.u64        d24,d23,#14     @ 21
1372 #if 21<16
1373         vld1.64         {d5},[r1]!      @ handles unaligned
1374 #endif
1375         vshr.u64        d25,d23,#18
1376 #if 21>0
1377          vadd.i64       d19,d30                 @ h+=Maj from the past
1378 #endif
1379         vshr.u64        d26,d23,#41
1380         vld1.64         {d28},[r3,:64]! @ K[i++]
1381         vsli.64         d24,d23,#50
1382         vsli.64         d25,d23,#46
1383         vmov            d29,d23
1384         vsli.64         d26,d23,#23
1385 #if 21<16 && defined(__ARMEL__)
1386         vrev64.8        ,
1387 #endif
1388         veor            d25,d24
1389         vbsl            d29,d16,d17             @ Ch(e,f,g)
1390         vshr.u64        d24,d19,#28
1391         veor            d26,d25                 @ Sigma1(e)
1392         vadd.i64        d27,d29,d18
1393         vshr.u64        d25,d19,#34
1394         vsli.64         d24,d19,#36
1395         vadd.i64        d27,d26
1396         vshr.u64        d26,d19,#39
1397         vadd.i64        d28,d5
1398         vsli.64         d25,d19,#30
1399         veor            d30,d19,d20
1400         vsli.64         d26,d19,#25
1401         veor            d18,d24,d25
1402         vadd.i64        d27,d28
1403         vbsl            d30,d21,d20             @ Maj(a,b,c)
1404         veor            d18,d26                 @ Sigma0(a)
1405         vadd.i64        d22,d27
1406         vadd.i64        d30,d27
1407         @ vadd.i64      d18,d30
        vshr.u64        q12,q2,#19
        vshr.u64        q13,q2,#61
         vadd.i64       d18,d30                 @ h+=Maj from the past
        vshr.u64        q15,q2,#6
        vsli.64         q12,q2,#45
        vext.8          q14,q3,q4,#8    @ X[i+1]
        vsli.64         q13,q2,#3
        veor            q15,q12
        vshr.u64        q12,q14,#1
        veor            q15,q13                         @ sigma1(X[i+14])
        vshr.u64        q13,q14,#8
        vadd.i64        q3,q15
        vshr.u64        q15,q14,#7
        vsli.64         q12,q14,#63
        vsli.64         q13,q14,#56
        vext.8          q14,q7,q0,#8    @ X[i+9]
        veor            q15,q12
        vshr.u64        d24,d22,#14             @ from NEON_00_15
        vadd.i64        q3,q14
        vshr.u64        d25,d22,#18             @ from NEON_00_15
        veor            q15,q13                         @ sigma0(X[i+1])
        vshr.u64        d26,d22,#41             @ from NEON_00_15
        vadd.i64        q3,q15
        vld1.64         {d28},[r3,:64]! @ K[i++]
        vsli.64         d24,d22,#50
        vsli.64         d25,d22,#46
        vmov            d29,d22
        vsli.64         d26,d22,#23
#if 22<16 && defined(__ARMEL__)
        vrev64.8        ,
#endif
        veor            d25,d24
        vbsl            d29,d23,d16             @ Ch(e,f,g)
        vshr.u64        d24,d18,#28
        veor            d26,d25                 @ Sigma1(e)
        vadd.i64        d27,d29,d17
        vshr.u64        d25,d18,#34
        vsli.64         d24,d18,#36
        vadd.i64        d27,d26
        vshr.u64        d26,d18,#39
        vadd.i64        d28,d6
        vsli.64         d25,d18,#30
        veor            d30,d18,d19
        vsli.64         d26,d18,#25
        veor            d17,d24,d25
        vadd.i64        d27,d28
        vbsl            d30,d20,d19             @ Maj(a,b,c)
        veor            d17,d26                 @ Sigma0(a)
        vadd.i64        d21,d27
        vadd.i64        d30,d27
        @ vadd.i64      d17,d30
        vshr.u64        d24,d21,#14     @ 23
#if 23<16
        vld1.64         {d7},[r1]!      @ handles unaligned
#endif
        vshr.u64        d25,d21,#18
#if 23>0
         vadd.i64       d17,d30                 @ h+=Maj from the past
#endif
        vshr.u64        d26,d21,#41
        vld1.64         {d28},[r3,:64]! @ K[i++]
        vsli.64         d24,d21,#50
        vsli.64         d25,d21,#46
        vmov            d29,d21
        vsli.64         d26,d21,#23
#if 23<16 && defined(__ARMEL__)
        vrev64.8        ,
#endif
        veor            d25,d24
        vbsl            d29,d22,d23             @ Ch(e,f,g)
        vshr.u64        d24,d17,#28
        veor            d26,d25                 @ Sigma1(e)
        vadd.i64        d27,d29,d16
        vshr.u64        d25,d17,#34
        vsli.64         d24,d17,#36
        vadd.i64        d27,d26
        vshr.u64        d26,d17,#39
        vadd.i64        d28,d7
        vsli.64         d25,d17,#30
        veor            d30,d17,d18
        vsli.64         d26,d17,#25
        veor            d16,d24,d25
        vadd.i64        d27,d28
        vbsl            d30,d19,d18             @ Maj(a,b,c)
        veor            d16,d26                 @ Sigma0(a)
        vadd.i64        d20,d27
        vadd.i64        d30,d27
        @ vadd.i64      d16,d30
        vshr.u64        q12,q3,#19
        vshr.u64        q13,q3,#61
         vadd.i64       d16,d30                 @ h+=Maj from the past
        vshr.u64        q15,q3,#6
        vsli.64         q12,q3,#45
        vext.8          q14,q4,q5,#8    @ X[i+1]
        vsli.64         q13,q3,#3
        veor            q15,q12
        vshr.u64        q12,q14,#1
        veor            q15,q13                         @ sigma1(X[i+14])
        vshr.u64        q13,q14,#8
        vadd.i64        q4,q15
        vshr.u64        q15,q14,#7
        vsli.64         q12,q14,#63
        vsli.64         q13,q14,#56
        vext.8          q14,q0,q1,#8    @ X[i+9]
        veor            q15,q12
        vshr.u64        d24,d20,#14             @ from NEON_00_15
        vadd.i64        q4,q14
        vshr.u64        d25,d20,#18             @ from NEON_00_15
        veor            q15,q13                         @ sigma0(X[i+1])
        vshr.u64        d26,d20,#41             @ from NEON_00_15
        vadd.i64        q4,q15
        vld1.64         {d28},[r3,:64]! @ K[i++]
        vsli.64         d24,d20,#50
        vsli.64         d25,d20,#46
        vmov            d29,d20
        vsli.64         d26,d20,#23
#if 24<16 && defined(__ARMEL__)
        vrev64.8        ,
#endif
        veor            d25,d24
        vbsl            d29,d21,d22             @ Ch(e,f,g)
        vshr.u64        d24,d16,#28
        veor            d26,d25                 @ Sigma1(e)
        vadd.i64        d27,d29,d23
        vshr.u64        d25,d16,#34
        vsli.64         d24,d16,#36
        vadd.i64        d27,d26
        vshr.u64        d26,d16,#39
        vadd.i64        d28,d8
        vsli.64         d25,d16,#30
        veor            d30,d16,d17
        vsli.64         d26,d16,#25
        veor            d23,d24,d25
        vadd.i64        d27,d28
        vbsl            d30,d18,d17             @ Maj(a,b,c)
        veor            d23,d26                 @ Sigma0(a)
        vadd.i64        d19,d27
        vadd.i64        d30,d27
        @ vadd.i64      d23,d30
        vshr.u64        d24,d19,#14     @ 25
#if 25<16
        vld1.64         {d9},[r1]!      @ handles unaligned
#endif
        vshr.u64        d25,d19,#18
#if 25>0
         vadd.i64       d23,d30                 @ h+=Maj from the past
#endif
        vshr.u64        d26,d19,#41
        vld1.64         {d28},[r3,:64]! @ K[i++]
        vsli.64         d24,d19,#50
        vsli.64         d25,d19,#46
        vmov            d29,d19
        vsli.64         d26,d19,#23
#if 25<16 && defined(__ARMEL__)
        vrev64.8        ,
#endif
        veor            d25,d24
        vbsl            d29,d20,d21             @ Ch(e,f,g)
        vshr.u64        d24,d23,#28
        veor            d26,d25                 @ Sigma1(e)
        vadd.i64        d27,d29,d22
        vshr.u64        d25,d23,#34
        vsli.64         d24,d23,#36
        vadd.i64        d27,d26
        vshr.u64        d26,d23,#39
        vadd.i64        d28,d9
        vsli.64         d25,d23,#30
        veor            d30,d23,d16
        vsli.64         d26,d23,#25
        veor            d22,d24,d25
        vadd.i64        d27,d28
        vbsl            d30,d17,d16             @ Maj(a,b,c)
        veor            d22,d26                 @ Sigma0(a)
        vadd.i64        d18,d27
        vadd.i64        d30,d27
        @ vadd.i64      d22,d30
        vshr.u64        q12,q4,#19
        vshr.u64        q13,q4,#61
         vadd.i64       d22,d30                 @ h+=Maj from the past
        vshr.u64        q15,q4,#6
        vsli.64         q12,q4,#45
        vext.8          q14,q5,q6,#8    @ X[i+1]
        vsli.64         q13,q4,#3
        veor            q15,q12
        vshr.u64        q12,q14,#1
        veor            q15,q13                         @ sigma1(X[i+14])
        vshr.u64        q13,q14,#8
        vadd.i64        q5,q15
        vshr.u64        q15,q14,#7
        vsli.64         q12,q14,#63
        vsli.64         q13,q14,#56
        vext.8          q14,q1,q2,#8    @ X[i+9]
        veor            q15,q12
        vshr.u64        d24,d18,#14             @ from NEON_00_15
        vadd.i64        q5,q14
        vshr.u64        d25,d18,#18             @ from NEON_00_15
        veor            q15,q13                         @ sigma0(X[i+1])
        vshr.u64        d26,d18,#41             @ from NEON_00_15
        vadd.i64        q5,q15
        vld1.64         {d28},[r3,:64]! @ K[i++]
        vsli.64         d24,d18,#50
        vsli.64         d25,d18,#46
        vmov            d29,d18
        vsli.64         d26,d18,#23
#if 26<16 && defined(__ARMEL__)
        vrev64.8        ,
#endif
        veor            d25,d24
        vbsl            d29,d19,d20             @ Ch(e,f,g)
        vshr.u64        d24,d22,#28
        veor            d26,d25                 @ Sigma1(e)
        vadd.i64        d27,d29,d21
        vshr.u64        d25,d22,#34
        vsli.64         d24,d22,#36
        vadd.i64        d27,d26
        vshr.u64        d26,d22,#39
        vadd.i64        d28,d10
        vsli.64         d25,d22,#30
        veor            d30,d22,d23
        vsli.64         d26,d22,#25
        veor            d21,d24,d25
        vadd.i64        d27,d28
        vbsl            d30,d16,d23             @ Maj(a,b,c)
        veor            d21,d26                 @ Sigma0(a)
        vadd.i64        d17,d27
        vadd.i64        d30,d27
        @ vadd.i64      d21,d30
        vshr.u64        d24,d17,#14     @ 27
#if 27<16
        vld1.64         {d11},[r1]!     @ handles unaligned
#endif
        vshr.u64        d25,d17,#18
#if 27>0
         vadd.i64       d21,d30                 @ h+=Maj from the past
#endif
        vshr.u64        d26,d17,#41
        vld1.64         {d28},[r3,:64]! @ K[i++]
        vsli.64         d24,d17,#50
        vsli.64         d25,d17,#46
        vmov            d29,d17
        vsli.64         d26,d17,#23
#if 27<16 && defined(__ARMEL__)
        vrev64.8        ,
#endif
        veor            d25,d24
        vbsl            d29,d18,d19             @ Ch(e,f,g)
        vshr.u64        d24,d21,#28
        veor            d26,d25                 @ Sigma1(e)
        vadd.i64        d27,d29,d20
        vshr.u64        d25,d21,#34
        vsli.64         d24,d21,#36
        vadd.i64        d27,d26
        vshr.u64        d26,d21,#39
        vadd.i64        d28,d11
        vsli.64         d25,d21,#30
        veor            d30,d21,d22
        vsli.64         d26,d21,#25
        veor            d20,d24,d25
        vadd.i64        d27,d28
        vbsl            d30,d23,d22             @ Maj(a,b,c)
        veor            d20,d26                 @ Sigma0(a)
        vadd.i64        d16,d27
        vadd.i64        d30,d27
        @ vadd.i64      d20,d30
        vshr.u64        q12,q5,#19
        vshr.u64        q13,q5,#61
         vadd.i64       d20,d30                 @ h+=Maj from the past
        vshr.u64        q15,q5,#6
        vsli.64         q12,q5,#45
        vext.8          q14,q6,q7,#8    @ X[i+1]
        vsli.64         q13,q5,#3
        veor            q15,q12
        vshr.u64        q12,q14,#1
        veor            q15,q13                         @ sigma1(X[i+14])
        vshr.u64        q13,q14,#8
        vadd.i64        q6,q15
        vshr.u64        q15,q14,#7
        vsli.64         q12,q14,#63
        vsli.64         q13,q14,#56
        vext.8          q14,q2,q3,#8    @ X[i+9]
        veor            q15,q12
        vshr.u64        d24,d16,#14             @ from NEON_00_15
        vadd.i64        q6,q14
        vshr.u64        d25,d16,#18             @ from NEON_00_15
        veor            q15,q13                         @ sigma0(X[i+1])
        vshr.u64        d26,d16,#41             @ from NEON_00_15
        vadd.i64        q6,q15
        vld1.64         {d28},[r3,:64]! @ K[i++]
        vsli.64         d24,d16,#50
        vsli.64         d25,d16,#46
        vmov            d29,d16
        vsli.64         d26,d16,#23
#if 28<16 && defined(__ARMEL__)
        vrev64.8        ,
#endif
        veor            d25,d24
        vbsl            d29,d17,d18             @ Ch(e,f,g)
        vshr.u64        d24,d20,#28
        veor            d26,d25                 @ Sigma1(e)
        vadd.i64        d27,d29,d19
        vshr.u64        d25,d20,#34
        vsli.64         d24,d20,#36
        vadd.i64        d27,d26
        vshr.u64        d26,d20,#39
        vadd.i64        d28,d12
        vsli.64         d25,d20,#30
        veor            d30,d20,d21
        vsli.64         d26,d20,#25
        veor            d19,d24,d25
        vadd.i64        d27,d28
        vbsl            d30,d22,d21             @ Maj(a,b,c)
        veor            d19,d26                 @ Sigma0(a)
        vadd.i64        d23,d27
        vadd.i64        d30,d27
        @ vadd.i64      d19,d30
        vshr.u64        d24,d23,#14     @ 29
#if 29<16
        vld1.64         {d13},[r1]!     @ handles unaligned
#endif
        vshr.u64        d25,d23,#18
#if 29>0
         vadd.i64       d19,d30                 @ h+=Maj from the past
#endif
        vshr.u64        d26,d23,#41
        vld1.64         {d28},[r3,:64]! @ K[i++]
        vsli.64         d24,d23,#50
        vsli.64         d25,d23,#46
        vmov            d29,d23
        vsli.64         d26,d23,#23
#if 29<16 && defined(__ARMEL__)
        vrev64.8        ,
#endif
        veor            d25,d24
        vbsl            d29,d16,d17             @ Ch(e,f,g)
        vshr.u64        d24,d19,#28
        veor            d26,d25                 @ Sigma1(e)
        vadd.i64        d27,d29,d18
        vshr.u64        d25,d19,#34
        vsli.64         d24,d19,#36
        vadd.i64        d27,d26
        vshr.u64        d26,d19,#39
        vadd.i64        d28,d13
        vsli.64         d25,d19,#30
        veor            d30,d19,d20
        vsli.64         d26,d19,#25
        veor            d18,d24,d25
        vadd.i64        d27,d28
        vbsl            d30,d21,d20             @ Maj(a,b,c)
        veor            d18,d26                 @ Sigma0(a)
        vadd.i64        d22,d27
        vadd.i64        d30,d27
        @ vadd.i64      d18,d30
        vshr.u64        q12,q6,#19
        vshr.u64        q13,q6,#61
         vadd.i64       d18,d30                 @ h+=Maj from the past
        vshr.u64        q15,q6,#6
        vsli.64         q12,q6,#45
        vext.8          q14,q7,q0,#8    @ X[i+1]
        vsli.64         q13,q6,#3
        veor            q15,q12
        vshr.u64        q12,q14,#1
        veor            q15,q13                         @ sigma1(X[i+14])
        vshr.u64        q13,q14,#8
        vadd.i64        q7,q15
        vshr.u64        q15,q14,#7
        vsli.64         q12,q14,#63
        vsli.64         q13,q14,#56
        vext.8          q14,q3,q4,#8    @ X[i+9]
        veor            q15,q12
        vshr.u64        d24,d22,#14             @ from NEON_00_15
        vadd.i64        q7,q14
        vshr.u64        d25,d22,#18             @ from NEON_00_15
        veor            q15,q13                         @ sigma0(X[i+1])
        vshr.u64        d26,d22,#41             @ from NEON_00_15
        vadd.i64        q7,q15
        vld1.64         {d28},[r3,:64]! @ K[i++]
        vsli.64         d24,d22,#50
        vsli.64         d25,d22,#46
        vmov            d29,d22
        vsli.64         d26,d22,#23
#if 30<16 && defined(__ARMEL__)
        vrev64.8        ,
#endif
        veor            d25,d24
        vbsl            d29,d23,d16             @ Ch(e,f,g)
        vshr.u64        d24,d18,#28
        veor            d26,d25                 @ Sigma1(e)
        vadd.i64        d27,d29,d17
        vshr.u64        d25,d18,#34
        vsli.64         d24,d18,#36
        vadd.i64        d27,d26
        vshr.u64        d26,d18,#39
        vadd.i64        d28,d14
        vsli.64         d25,d18,#30
        veor            d30,d18,d19
        vsli.64         d26,d18,#25
        veor            d17,d24,d25
        vadd.i64        d27,d28
        vbsl            d30,d20,d19             @ Maj(a,b,c)
        veor            d17,d26                 @ Sigma0(a)
        vadd.i64        d21,d27
        vadd.i64        d30,d27
        @ vadd.i64      d17,d30
        vshr.u64        d24,d21,#14     @ 31
#if 31<16
        vld1.64         {d15},[r1]!     @ handles unaligned
#endif
        vshr.u64        d25,d21,#18
#if 31>0
         vadd.i64       d17,d30                 @ h+=Maj from the past
#endif
        vshr.u64        d26,d21,#41
        vld1.64         {d28},[r3,:64]! @ K[i++]
        vsli.64         d24,d21,#50
        vsli.64         d25,d21,#46
        vmov            d29,d21
        vsli.64         d26,d21,#23
#if 31<16 && defined(__ARMEL__)
        vrev64.8        ,
#endif
        veor            d25,d24
        vbsl            d29,d22,d23             @ Ch(e,f,g)
        vshr.u64        d24,d17,#28
        veor            d26,d25                 @ Sigma1(e)
        vadd.i64        d27,d29,d16
        vshr.u64        d25,d17,#34
        vsli.64         d24,d17,#36
        vadd.i64        d27,d26
        vshr.u64        d26,d17,#39
        vadd.i64        d28,d15
        vsli.64         d25,d17,#30
        veor            d30,d17,d18
        vsli.64         d26,d17,#25
        veor            d16,d24,d25
        vadd.i64        d27,d28
        vbsl            d30,d19,d18             @ Maj(a,b,c)
        veor            d16,d26                 @ Sigma0(a)
        vadd.i64        d20,d27
        vadd.i64        d30,d27
        @ vadd.i64      d16,d30
        bne             .L16_79_neon

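@ End of one 128-byte block: fold in the last deferred Maj term, add the
@ working variables (d16-d23) to the hash state kept at [r0], rewind the
@ K512 pointer (80 entries x 8 bytes = 640), and loop while input remains
@ (r1 != r2).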
         vadd.i64       d16,d30         @ h+=Maj from the past
        vldmia          r0,{d24-d31}    @ load context to temp
        vadd.i64        q8,q12          @ vectorized accumulate
        vadd.i64        q9,q13
        vadd.i64        q10,q14
        vadd.i64        q11,q15
        vstmia          r0,{d16-d23}    @ save context
        teq             r1,r2
        sub             r3,#640 @ rewind K512
        bne             .Loop_neon

        VFP_ABI_POP
        bx      lr                              @ .word 0xe12fff1e
.size   sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
.asciz  "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align  2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
#endif