@ Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost
@ [linux-2.6-microblaze.git] / arch / arm / crypto / aes-ce-core.S
1 /* SPDX-License-Identifier: GPL-2.0-only */
2 /*
3  * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions
4  *
5  * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
6  */
7
8 #include <linux/linkage.h>
9 #include <asm/assembler.h>
10
11         .text
12         .arch           armv8-a
13         .fpu            crypto-neon-fp-armv8
14         .align          3
15
16         .macro          enc_round, state, key
17         aese.8          \state, \key
18         aesmc.8         \state, \state
19         .endm
20
21         .macro          dec_round, state, key
22         aesd.8          \state, \key
23         aesimc.8        \state, \state
24         .endm
25
26         .macro          enc_dround, key1, key2
27         enc_round       q0, \key1
28         enc_round       q0, \key2
29         .endm
30
31         .macro          dec_dround, key1, key2
32         dec_round       q0, \key1
33         dec_round       q0, \key2
34         .endm
35
36         .macro          enc_fround, key1, key2, key3
37         enc_round       q0, \key1
38         aese.8          q0, \key2
39         veor            q0, q0, \key3
40         .endm
41
42         .macro          dec_fround, key1, key2, key3
43         dec_round       q0, \key1
44         aesd.8          q0, \key2
45         veor            q0, q0, \key3
46         .endm
47
48         .macro          enc_dround_4x, key1, key2
49         enc_round       q0, \key1
50         enc_round       q1, \key1
51         enc_round       q2, \key1
52         enc_round       q3, \key1
53         enc_round       q0, \key2
54         enc_round       q1, \key2
55         enc_round       q2, \key2
56         enc_round       q3, \key2
57         .endm
58
59         .macro          dec_dround_4x, key1, key2
60         dec_round       q0, \key1
61         dec_round       q1, \key1
62         dec_round       q2, \key1
63         dec_round       q3, \key1
64         dec_round       q0, \key2
65         dec_round       q1, \key2
66         dec_round       q2, \key2
67         dec_round       q3, \key2
68         .endm
69
70         .macro          enc_fround_4x, key1, key2, key3
71         enc_round       q0, \key1
72         enc_round       q1, \key1
73         enc_round       q2, \key1
74         enc_round       q3, \key1
75         aese.8          q0, \key2
76         aese.8          q1, \key2
77         aese.8          q2, \key2
78         aese.8          q3, \key2
79         veor            q0, q0, \key3
80         veor            q1, q1, \key3
81         veor            q2, q2, \key3
82         veor            q3, q3, \key3
83         .endm
84
85         .macro          dec_fround_4x, key1, key2, key3
86         dec_round       q0, \key1
87         dec_round       q1, \key1
88         dec_round       q2, \key1
89         dec_round       q3, \key1
90         aesd.8          q0, \key2
91         aesd.8          q1, \key2
92         aesd.8          q2, \key2
93         aesd.8          q3, \key2
94         veor            q0, q0, \key3
95         veor            q1, q1, \key3
96         veor            q2, q2, \key3
97         veor            q3, q3, \key3
98         .endm
99
100         .macro          do_block, dround, fround
101         cmp             r3, #12                 @ which key size?
102         vld1.32         {q10-q11}, [ip]!
103         \dround         q8, q9
104         vld1.32         {q12-q13}, [ip]!
105         \dround         q10, q11
106         vld1.32         {q10-q11}, [ip]!
107         \dround         q12, q13
108         vld1.32         {q12-q13}, [ip]!
109         \dround         q10, q11
110         blo             0f                      @ AES-128: 10 rounds
111         vld1.32         {q10-q11}, [ip]!
112         \dround         q12, q13
113         beq             1f                      @ AES-192: 12 rounds
114         vld1.32         {q12-q13}, [ip]
115         \dround         q10, q11
116 0:      \fround         q12, q13, q14
117         bx              lr
118
119 1:      \fround         q10, q11, q14
120         bx              lr
121         .endm
122
123         /*
124          * Internal, non-AAPCS compliant functions that implement the core AES
125          * transforms. These should preserve all registers except q0 - q3 and ip
126          * Arguments:
127          *   q0        : first in/output block
128          *   q1        : second in/output block (_4x version only)
129          *   q2        : third in/output block (_4x version only)
130          *   q3        : fourth in/output block (_4x version only)
131          *   q8        : first round key
132          *   q9        : second round key
133          *   q14       : final round key
134          *   r2        : address of round key array
135          *   r3        : number of rounds
136          */
137         .align          6
138 aes_encrypt:
139         add             ip, r2, #32             @ 3rd round key
140 .Laes_encrypt_tweak:
141         do_block        enc_dround, enc_fround
142 ENDPROC(aes_encrypt)
143
144         .align          6
145 aes_decrypt:
146         add             ip, r2, #32             @ 3rd round key
147         do_block        dec_dround, dec_fround
148 ENDPROC(aes_decrypt)
149
150         .align          6
151 aes_encrypt_4x:
152         add             ip, r2, #32             @ 3rd round key
153         do_block        enc_dround_4x, enc_fround_4x
154 ENDPROC(aes_encrypt_4x)
155
156         .align          6
157 aes_decrypt_4x:
158         add             ip, r2, #32             @ 3rd round key
159         do_block        dec_dround_4x, dec_fround_4x
160 ENDPROC(aes_decrypt_4x)
161
162         .macro          prepare_key, rk, rounds
163         add             ip, \rk, \rounds, lsl #4
164         vld1.32         {q8-q9}, [\rk]          @ load first 2 round keys
165         vld1.32         {q14}, [ip]             @ load last round key
166         .endm
167
168         /*
169          * aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
170          *                 int blocks)
171          * aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
172          *                 int blocks)
173          */
174 ENTRY(ce_aes_ecb_encrypt)
175         push            {r4, lr}
176         ldr             r4, [sp, #8]
177         prepare_key     r2, r3
178 .Lecbencloop4x:
179         subs            r4, r4, #4
180         bmi             .Lecbenc1x
181         vld1.8          {q0-q1}, [r1]!
182         vld1.8          {q2-q3}, [r1]!
183         bl              aes_encrypt_4x
184         vst1.8          {q0-q1}, [r0]!
185         vst1.8          {q2-q3}, [r0]!
186         b               .Lecbencloop4x
187 .Lecbenc1x:
188         adds            r4, r4, #4
189         beq             .Lecbencout
190 .Lecbencloop:
191         vld1.8          {q0}, [r1]!
192         bl              aes_encrypt
193         vst1.8          {q0}, [r0]!
194         subs            r4, r4, #1
195         bne             .Lecbencloop
196 .Lecbencout:
197         pop             {r4, pc}
198 ENDPROC(ce_aes_ecb_encrypt)
199
200 ENTRY(ce_aes_ecb_decrypt)
201         push            {r4, lr}
202         ldr             r4, [sp, #8]
203         prepare_key     r2, r3
204 .Lecbdecloop4x:
205         subs            r4, r4, #4
206         bmi             .Lecbdec1x
207         vld1.8          {q0-q1}, [r1]!
208         vld1.8          {q2-q3}, [r1]!
209         bl              aes_decrypt_4x
210         vst1.8          {q0-q1}, [r0]!
211         vst1.8          {q2-q3}, [r0]!
212         b               .Lecbdecloop4x
213 .Lecbdec1x:
214         adds            r4, r4, #4
215         beq             .Lecbdecout
216 .Lecbdecloop:
217         vld1.8          {q0}, [r1]!
218         bl              aes_decrypt
219         vst1.8          {q0}, [r0]!
220         subs            r4, r4, #1
221         bne             .Lecbdecloop
222 .Lecbdecout:
223         pop             {r4, pc}
224 ENDPROC(ce_aes_ecb_decrypt)
225
226         /*
227          * aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
228          *                 int blocks, u8 iv[])
229          * aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
230          *                 int blocks, u8 iv[])
231          */
232 ENTRY(ce_aes_cbc_encrypt)
233         push            {r4-r6, lr}
234         ldrd            r4, r5, [sp, #16]
235         vld1.8          {q0}, [r5]
236         prepare_key     r2, r3
237 .Lcbcencloop:
238         vld1.8          {q1}, [r1]!             @ get next pt block
239         veor            q0, q0, q1              @ ..and xor with iv
240         bl              aes_encrypt
241         vst1.8          {q0}, [r0]!
242         subs            r4, r4, #1
243         bne             .Lcbcencloop
244         vst1.8          {q0}, [r5]
245         pop             {r4-r6, pc}
246 ENDPROC(ce_aes_cbc_encrypt)
247
248 ENTRY(ce_aes_cbc_decrypt)
249         push            {r4-r6, lr}
250         ldrd            r4, r5, [sp, #16]
251         vld1.8          {q15}, [r5]             @ keep iv in q15
252         prepare_key     r2, r3
253 .Lcbcdecloop4x:
254         subs            r4, r4, #4
255         bmi             .Lcbcdec1x
256         vld1.8          {q0-q1}, [r1]!
257         vld1.8          {q2-q3}, [r1]!
258         vmov            q4, q0
259         vmov            q5, q1
260         vmov            q6, q2
261         vmov            q7, q3
262         bl              aes_decrypt_4x
263         veor            q0, q0, q15
264         veor            q1, q1, q4
265         veor            q2, q2, q5
266         veor            q3, q3, q6
267         vmov            q15, q7
268         vst1.8          {q0-q1}, [r0]!
269         vst1.8          {q2-q3}, [r0]!
270         b               .Lcbcdecloop4x
271 .Lcbcdec1x:
272         adds            r4, r4, #4
273         beq             .Lcbcdecout
274         vmov            q6, q14                 @ preserve last round key
275 .Lcbcdecloop:
276         vld1.8          {q0}, [r1]!             @ get next ct block
277         veor            q14, q15, q6            @ combine prev ct with last key
278         vmov            q15, q0
279         bl              aes_decrypt
280         vst1.8          {q0}, [r0]!
281         subs            r4, r4, #1
282         bne             .Lcbcdecloop
283 .Lcbcdecout:
284         vst1.8          {q15}, [r5]             @ keep iv in q15
285         pop             {r4-r6, pc}
286 ENDPROC(ce_aes_cbc_decrypt)
287
288
289         /*
290          * ce_aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
291          *                        int rounds, int bytes, u8 const iv[])
292          * ce_aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
293          *                        int rounds, int bytes, u8 const iv[])
294          */
295
296 ENTRY(ce_aes_cbc_cts_encrypt)
297         push            {r4-r6, lr}
298         ldrd            r4, r5, [sp, #16]
299
300         movw            ip, :lower16:.Lcts_permute_table
301         movt            ip, :upper16:.Lcts_permute_table
302         sub             r4, r4, #16
303         add             lr, ip, #32
304         add             ip, ip, r4
305         sub             lr, lr, r4
306         vld1.8          {q5}, [ip]
307         vld1.8          {q6}, [lr]
308
309         add             ip, r1, r4
310         vld1.8          {q0}, [r1]                      @ overlapping loads
311         vld1.8          {q3}, [ip]
312
313         vld1.8          {q1}, [r5]                      @ get iv
314         prepare_key     r2, r3
315
316         veor            q0, q0, q1                      @ xor with iv
317         bl              aes_encrypt
318
319         vtbl.8          d4, {d0-d1}, d10
320         vtbl.8          d5, {d0-d1}, d11
321         vtbl.8          d2, {d6-d7}, d12
322         vtbl.8          d3, {d6-d7}, d13
323
324         veor            q0, q0, q1
325         bl              aes_encrypt
326
327         add             r4, r0, r4
328         vst1.8          {q2}, [r4]                      @ overlapping stores
329         vst1.8          {q0}, [r0]
330
331         pop             {r4-r6, pc}
332 ENDPROC(ce_aes_cbc_cts_encrypt)
333
334 ENTRY(ce_aes_cbc_cts_decrypt)
335         push            {r4-r6, lr}
336         ldrd            r4, r5, [sp, #16]
337
338         movw            ip, :lower16:.Lcts_permute_table
339         movt            ip, :upper16:.Lcts_permute_table
340         sub             r4, r4, #16
341         add             lr, ip, #32
342         add             ip, ip, r4
343         sub             lr, lr, r4
344         vld1.8          {q5}, [ip]
345         vld1.8          {q6}, [lr]
346
347         add             ip, r1, r4
348         vld1.8          {q0}, [r1]                      @ overlapping loads
349         vld1.8          {q1}, [ip]
350
351         vld1.8          {q3}, [r5]                      @ get iv
352         prepare_key     r2, r3
353
354         bl              aes_decrypt
355
356         vtbl.8          d4, {d0-d1}, d10
357         vtbl.8          d5, {d0-d1}, d11
358         vtbx.8          d0, {d2-d3}, d12
359         vtbx.8          d1, {d2-d3}, d13
360
361         veor            q1, q1, q2
362         bl              aes_decrypt
363         veor            q0, q0, q3                      @ xor with iv
364
365         add             r4, r0, r4
366         vst1.8          {q1}, [r4]                      @ overlapping stores
367         vst1.8          {q0}, [r0]
368
369         pop             {r4-r6, pc}
370 ENDPROC(ce_aes_cbc_cts_decrypt)
371
372
373         /*
374          * aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
375          *                 int blocks, u8 ctr[])
376          */
377 ENTRY(ce_aes_ctr_encrypt)
378         push            {r4-r6, lr}
379         ldrd            r4, r5, [sp, #16]
380         vld1.8          {q7}, [r5]              @ load ctr
381         prepare_key     r2, r3
382         vmov            r6, s31                 @ keep swabbed ctr in r6
383         rev             r6, r6
384         cmn             r6, r4                  @ 32 bit overflow?
385         bcs             .Lctrloop
386 .Lctrloop4x:
387         subs            r4, r4, #4
388         bmi             .Lctr1x
389
390         /*
391          * NOTE: the sequence below has been carefully tweaked to avoid
392          * a silicon erratum that exists in Cortex-A57 (#1742098) and
393          * Cortex-A72 (#1655431) cores, where AESE/AESMC instruction pairs
394          * may produce an incorrect result if they take their input from a
395          * register of which a single 32-bit lane has been updated the last
396          * time it was modified. To work around this, the lanes of registers
397          * q0-q3 below are not manipulated individually, and the different
398          * counter values are prepared by successive manipulations of q7.
399          */
400         add             ip, r6, #1
401         vmov            q0, q7
402         rev             ip, ip
403         add             lr, r6, #2
404         vmov            s31, ip                 @ set lane 3 of q1 via q7
405         add             ip, r6, #3
406         rev             lr, lr
407         vmov            q1, q7
408         vmov            s31, lr                 @ set lane 3 of q2 via q7
409         rev             ip, ip
410         vmov            q2, q7
411         vmov            s31, ip                 @ set lane 3 of q3 via q7
412         add             r6, r6, #4
413         vmov            q3, q7
414
415         vld1.8          {q4-q5}, [r1]!
416         vld1.8          {q6}, [r1]!
417         vld1.8          {q15}, [r1]!
418         bl              aes_encrypt_4x
419         veor            q0, q0, q4
420         veor            q1, q1, q5
421         veor            q2, q2, q6
422         veor            q3, q3, q15
423         rev             ip, r6
424         vst1.8          {q0-q1}, [r0]!
425         vst1.8          {q2-q3}, [r0]!
426         vmov            s31, ip
427         b               .Lctrloop4x
428 .Lctr1x:
429         adds            r4, r4, #4
430         beq             .Lctrout
431 .Lctrloop:
432         vmov            q0, q7
433         bl              aes_encrypt
434
435         adds            r6, r6, #1              @ increment BE ctr
436         rev             ip, r6
437         vmov            s31, ip
438         bcs             .Lctrcarry
439
440 .Lctrcarrydone:
441         subs            r4, r4, #1
442         bmi             .Lctrtailblock          @ blocks < 0 means tail block
443         vld1.8          {q3}, [r1]!
444         veor            q3, q0, q3
445         vst1.8          {q3}, [r0]!
446         bne             .Lctrloop
447
448 .Lctrout:
449         vst1.8          {q7}, [r5]              @ return next CTR value
450         pop             {r4-r6, pc}
451
452 .Lctrtailblock:
453         vst1.8          {q0}, [r0, :64]         @ return the key stream
454         b               .Lctrout
455
456 .Lctrcarry:
457         .irp            sreg, s30, s29, s28
458         vmov            ip, \sreg               @ load next word of ctr
459         rev             ip, ip                  @ ... to handle the carry
460         adds            ip, ip, #1
461         rev             ip, ip
462         vmov            \sreg, ip
463         bcc             .Lctrcarrydone
464         .endr
465         b               .Lctrcarrydone
466 ENDPROC(ce_aes_ctr_encrypt)
467
468         /*
469          * aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
470          *                 int bytes, u8 iv[], u32 const rk2[], int first)
471          * aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
472          *                 int bytes, u8 iv[], u32 const rk2[], int first)
473          */
474
475         .macro          next_tweak, out, in, const, tmp
476         vshr.s64        \tmp, \in, #63
477         vand            \tmp, \tmp, \const
478         vadd.u64        \out, \in, \in
479         vext.8          \tmp, \tmp, \tmp, #8
480         veor            \out, \out, \tmp
481         .endm
482
483 ce_aes_xts_init:
484         vmov.i32        d30, #0x87              @ compose tweak mask vector
485         vmovl.u32       q15, d30
486         vshr.u64        d30, d31, #7
487
488         ldrd            r4, r5, [sp, #16]       @ load args
489         ldr             r6, [sp, #28]
490         vld1.8          {q0}, [r5]              @ load iv
491         teq             r6, #1                  @ start of a block?
492         bxne            lr
493
494         @ Encrypt the IV in q0 with the second AES key. This should only
495         @ be done at the start of a block.
496         ldr             r6, [sp, #24]           @ load AES key 2
497         prepare_key     r6, r3
498         add             ip, r6, #32             @ 3rd round key of key 2
499         b               .Laes_encrypt_tweak     @ tail call
500 ENDPROC(ce_aes_xts_init)
501
502 ENTRY(ce_aes_xts_encrypt)
503         push            {r4-r6, lr}
504
505         bl              ce_aes_xts_init         @ run shared prologue
506         prepare_key     r2, r3
507         vmov            q4, q0
508
509         teq             r6, #0                  @ start of a block?
510         bne             .Lxtsenc4x
511
512 .Lxtsencloop4x:
513         next_tweak      q4, q4, q15, q10
514 .Lxtsenc4x:
515         subs            r4, r4, #64
516         bmi             .Lxtsenc1x
517         vld1.8          {q0-q1}, [r1]!          @ get 4 pt blocks
518         vld1.8          {q2-q3}, [r1]!
519         next_tweak      q5, q4, q15, q10
520         veor            q0, q0, q4
521         next_tweak      q6, q5, q15, q10
522         veor            q1, q1, q5
523         next_tweak      q7, q6, q15, q10
524         veor            q2, q2, q6
525         veor            q3, q3, q7
526         bl              aes_encrypt_4x
527         veor            q0, q0, q4
528         veor            q1, q1, q5
529         veor            q2, q2, q6
530         veor            q3, q3, q7
531         vst1.8          {q0-q1}, [r0]!          @ write 4 ct blocks
532         vst1.8          {q2-q3}, [r0]!
533         vmov            q4, q7
534         teq             r4, #0
535         beq             .Lxtsencret
536         b               .Lxtsencloop4x
537 .Lxtsenc1x:
538         adds            r4, r4, #64
539         beq             .Lxtsencout
540         subs            r4, r4, #16
541         bmi             .LxtsencctsNx
542 .Lxtsencloop:
543         vld1.8          {q0}, [r1]!
544 .Lxtsencctsout:
545         veor            q0, q0, q4
546         bl              aes_encrypt
547         veor            q0, q0, q4
548         teq             r4, #0
549         beq             .Lxtsencout
550         subs            r4, r4, #16
551         next_tweak      q4, q4, q15, q6
552         bmi             .Lxtsenccts
553         vst1.8          {q0}, [r0]!
554         b               .Lxtsencloop
555 .Lxtsencout:
556         vst1.8          {q0}, [r0]
557 .Lxtsencret:
558         vst1.8          {q4}, [r5]
559         pop             {r4-r6, pc}
560
561 .LxtsencctsNx:
562         vmov            q0, q3
563         sub             r0, r0, #16
564 .Lxtsenccts:
565         movw            ip, :lower16:.Lcts_permute_table
566         movt            ip, :upper16:.Lcts_permute_table
567
568         add             r1, r1, r4              @ rewind input pointer
569         add             r4, r4, #16             @ # bytes in final block
570         add             lr, ip, #32
571         add             ip, ip, r4
572         sub             lr, lr, r4
573         add             r4, r0, r4              @ output address of final block
574
575         vld1.8          {q1}, [r1]              @ load final partial block
576         vld1.8          {q2}, [ip]
577         vld1.8          {q3}, [lr]
578
579         vtbl.8          d4, {d0-d1}, d4
580         vtbl.8          d5, {d0-d1}, d5
581         vtbx.8          d0, {d2-d3}, d6
582         vtbx.8          d1, {d2-d3}, d7
583
584         vst1.8          {q2}, [r4]              @ overlapping stores
585         mov             r4, #0
586         b               .Lxtsencctsout
587 ENDPROC(ce_aes_xts_encrypt)
588
589
590 ENTRY(ce_aes_xts_decrypt)
591         push            {r4-r6, lr}
592
593         bl              ce_aes_xts_init         @ run shared prologue
594         prepare_key     r2, r3
595         vmov            q4, q0
596
597         /* subtract 16 bytes if we are doing CTS */
598         tst             r4, #0xf
599         subne           r4, r4, #0x10
600
601         teq             r6, #0                  @ start of a block?
602         bne             .Lxtsdec4x
603
604 .Lxtsdecloop4x:
605         next_tweak      q4, q4, q15, q10
606 .Lxtsdec4x:
607         subs            r4, r4, #64
608         bmi             .Lxtsdec1x
609         vld1.8          {q0-q1}, [r1]!          @ get 4 ct blocks
610         vld1.8          {q2-q3}, [r1]!
611         next_tweak      q5, q4, q15, q10
612         veor            q0, q0, q4
613         next_tweak      q6, q5, q15, q10
614         veor            q1, q1, q5
615         next_tweak      q7, q6, q15, q10
616         veor            q2, q2, q6
617         veor            q3, q3, q7
618         bl              aes_decrypt_4x
619         veor            q0, q0, q4
620         veor            q1, q1, q5
621         veor            q2, q2, q6
622         veor            q3, q3, q7
623         vst1.8          {q0-q1}, [r0]!          @ write 4 pt blocks
624         vst1.8          {q2-q3}, [r0]!
625         vmov            q4, q7
626         teq             r4, #0
627         beq             .Lxtsdecout
628         b               .Lxtsdecloop4x
629 .Lxtsdec1x:
630         adds            r4, r4, #64
631         beq             .Lxtsdecout
632         subs            r4, r4, #16
633 .Lxtsdecloop:
634         vld1.8          {q0}, [r1]!
635         bmi             .Lxtsdeccts
636 .Lxtsdecctsout:
637         veor            q0, q0, q4
638         bl              aes_decrypt
639         veor            q0, q0, q4
640         vst1.8          {q0}, [r0]!
641         teq             r4, #0
642         beq             .Lxtsdecout
643         subs            r4, r4, #16
644         next_tweak      q4, q4, q15, q6
645         b               .Lxtsdecloop
646 .Lxtsdecout:
647         vst1.8          {q4}, [r5]
648         pop             {r4-r6, pc}
649
650 .Lxtsdeccts:
651         movw            ip, :lower16:.Lcts_permute_table
652         movt            ip, :upper16:.Lcts_permute_table
653
654         add             r1, r1, r4              @ rewind input pointer
655         add             r4, r4, #16             @ # bytes in final block
656         add             lr, ip, #32
657         add             ip, ip, r4
658         sub             lr, lr, r4
659         add             r4, r0, r4              @ output address of final block
660
661         next_tweak      q5, q4, q15, q6
662
663         vld1.8          {q1}, [r1]              @ load final partial block
664         vld1.8          {q2}, [ip]
665         vld1.8          {q3}, [lr]
666
667         veor            q0, q0, q5
668         bl              aes_decrypt
669         veor            q0, q0, q5
670
671         vtbl.8          d4, {d0-d1}, d4
672         vtbl.8          d5, {d0-d1}, d5
673         vtbx.8          d0, {d2-d3}, d6
674         vtbx.8          d1, {d2-d3}, d7
675
676         vst1.8          {q2}, [r4]              @ overlapping stores
677         mov             r4, #0
678         b               .Lxtsdecctsout
679 ENDPROC(ce_aes_xts_decrypt)
680
681         /*
682          * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
683          *                             AES sbox substitution on each byte in
684          *                             'input'
685          */
686 ENTRY(ce_aes_sub)
687         vdup.32         q1, r0
688         veor            q0, q0, q0
689         aese.8          q0, q1
690         vmov            r0, s0
691         bx              lr
692 ENDPROC(ce_aes_sub)
693
694         /*
695          * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
696          *                                        operation on round key *src
697          */
698 ENTRY(ce_aes_invert)
699         vld1.32         {q0}, [r1]
700         aesimc.8        q0, q0
701         vst1.32         {q0}, [r0]
702         bx              lr
703 ENDPROC(ce_aes_invert)
704
705         .section        ".rodata", "a"
706         .align          6
707 .Lcts_permute_table:
708         .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
709         .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
710         .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
711         .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
712         .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
713         .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff