Merge tag 'v5.13-rc2' into spi-5.13
[linux-2.6-microblaze.git] / arch / arm64 / crypto / aes-modes.S
1 /* SPDX-License-Identifier: GPL-2.0-only */
2 /*
3  * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
4  *
5  * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
6  */
7
8 /* included by aes-ce.S and aes-neon.S */
9
10         .text
11         .align          4
12
/* MAX_STRIDE = number of AES blocks processed per interleaved loop pass. */
13 #ifndef MAX_STRIDE
14 #define MAX_STRIDE      4
15 #endif
16
/*
 * ST4()/ST5() emit their argument only when MAX_STRIDE is 4 or 5
 * respectively, letting both stride variants live in the same code.
 */
17 #if MAX_STRIDE == 4
18 #define ST4(x...) x
19 #define ST5(x...)
20 #else
21 #define ST4(x...)
22 #define ST5(x...) x
23 #endif
24
/* Encrypt v0..v3 in place; w3 = #rounds, x2 = round keys, x8/w7 scratch. */
25 SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
26         encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
27         ret
28 SYM_FUNC_END(aes_encrypt_block4x)
29
/* Decrypt v0..v3 in place; w3 = #rounds, x2 = round keys, x8/w7 scratch. */
30 SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
31         decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
32         ret
33 SYM_FUNC_END(aes_decrypt_block4x)
34
/* 5-way variants, only built when the including file interleaves 5 blocks. */
35 #if MAX_STRIDE == 5
/* Encrypt v0..v4 in place; w3 = #rounds, x2 = round keys, x8/w7 scratch. */
36 SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
37         encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
38         ret
39 SYM_FUNC_END(aes_encrypt_block5x)
40
/* Decrypt v0..v4 in place; w3 = #rounds, x2 = round keys, x8/w7 scratch. */
41 SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
42         decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
43         ret
44 SYM_FUNC_END(aes_decrypt_block5x)
45 #endif
46
47         /*
48          * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
49          *                 int blocks)
50          * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
51          *                 int blocks)
52          */
53
/*
 * ECB encrypt. Per the prototype above: x0 = out, x1 = in, x2 = round
 * keys, w3 = #rounds, w4 = #16-byte blocks. Frame is pushed because the
 * interleaved helpers are reached via bl (x30 must survive).
 */
54 AES_FUNC_START(aes_ecb_encrypt)
55         stp             x29, x30, [sp, #-16]!
56         mov             x29, sp
57
58         enc_prepare     w3, x2, x5
59
/* Bulk loop: MAX_STRIDE blocks per pass while at least that many remain. */
60 .LecbencloopNx:
61         subs            w4, w4, #MAX_STRIDE
62         bmi             .Lecbenc1x
63         ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
64 ST4(    bl              aes_encrypt_block4x             )
65 ST5(    ld1             {v4.16b}, [x1], #16             )
66 ST5(    bl              aes_encrypt_block5x             )
67         st1             {v0.16b-v3.16b}, [x0], #64
68 ST5(    st1             {v4.16b}, [x0], #16             )
69         b               .LecbencloopNx
/* Tail: undo the over-subtract, then process one block at a time. */
70 .Lecbenc1x:
71         adds            w4, w4, #MAX_STRIDE
72         beq             .Lecbencout
73 .Lecbencloop:
74         ld1             {v0.16b}, [x1], #16             /* get next pt block */
75         encrypt_block   v0, w3, x2, x5, w6
76         st1             {v0.16b}, [x0], #16
77         subs            w4, w4, #1
78         bne             .Lecbencloop
79 .Lecbencout:
80         ldp             x29, x30, [sp], #16
81         ret
82 AES_FUNC_END(aes_ecb_encrypt)
83
84
/*
 * ECB decrypt; mirror of aes_ecb_encrypt. x0 = out, x1 = in,
 * x2 = round keys, w3 = #rounds, w4 = #16-byte blocks.
 */
85 AES_FUNC_START(aes_ecb_decrypt)
86         stp             x29, x30, [sp, #-16]!
87         mov             x29, sp
88
89         dec_prepare     w3, x2, x5
90
/* Bulk loop: MAX_STRIDE blocks per pass while at least that many remain. */
91 .LecbdecloopNx:
92         subs            w4, w4, #MAX_STRIDE
93         bmi             .Lecbdec1x
94         ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
95 ST4(    bl              aes_decrypt_block4x             )
96 ST5(    ld1             {v4.16b}, [x1], #16             )
97 ST5(    bl              aes_decrypt_block5x             )
98         st1             {v0.16b-v3.16b}, [x0], #64
99 ST5(    st1             {v4.16b}, [x0], #16             )
100         b               .LecbdecloopNx
/* Tail: undo the over-subtract, then process one block at a time. */
101 .Lecbdec1x:
102         adds            w4, w4, #MAX_STRIDE
103         beq             .Lecbdecout
104 .Lecbdecloop:
105         ld1             {v0.16b}, [x1], #16             /* get next ct block */
106         decrypt_block   v0, w3, x2, x6, w7
107         st1             {v0.16b}, [x0], #16
108         subs            w4, w4, #1
109         bne             .Lecbdecloop
110 .Lecbdecout:
111         ldp             x29, x30, [sp], #16
112         ret
113 AES_FUNC_END(aes_ecb_decrypt)
114
115
116         /*
117          * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
118          *                 int blocks, u8 iv[])
119          * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
120          *                 int blocks, u8 iv[])
121          * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
122          *                       int rounds, int blocks, u8 iv[],
123          *                       u32 const rk2[]);
124          * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
125          *                       int rounds, int blocks, u8 iv[],
126          *                       u32 const rk2[]);
127          */
128
/*
 * ESSIV-CBC encrypt: derive the actual IV by encrypting the sector IV
 * (x5) with the second key (x6, AES-256 => fixed 14 rounds), then fall
 * into the plain CBC loop below with the bulk key in x2.
 * CBC is inherently serial, so blocks are chained one encrypt_block at
 * a time; the 4x loop only batches the loads/stores.
 */
129 AES_FUNC_START(aes_essiv_cbc_encrypt)
130         ld1             {v4.16b}, [x5]                  /* get iv */
131
132         mov             w8, #14                         /* AES-256: 14 rounds */
133         enc_prepare     w8, x6, x7
134         encrypt_block   v4, w8, x6, x7, w9
135         enc_switch_key  w3, x2, x6
136         b               .Lcbcencloop4x
137
/*
 * CBC encrypt: x0 = out, x1 = in, x2 = round keys, w3 = #rounds,
 * w4 = #blocks, x5 = iv (read, and updated on return). v4 carries the
 * running IV / previous ciphertext block throughout.
 */
138 AES_FUNC_START(aes_cbc_encrypt)
139         ld1             {v4.16b}, [x5]                  /* get iv */
140         enc_prepare     w3, x2, x6
141
142 .Lcbcencloop4x:
143         subs            w4, w4, #4
144         bmi             .Lcbcenc1x
145         ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
146         eor             v0.16b, v0.16b, v4.16b          /* ..and xor with iv */
147         encrypt_block   v0, w3, x2, x6, w7
148         eor             v1.16b, v1.16b, v0.16b
149         encrypt_block   v1, w3, x2, x6, w7
150         eor             v2.16b, v2.16b, v1.16b
151         encrypt_block   v2, w3, x2, x6, w7
152         eor             v3.16b, v3.16b, v2.16b
153         encrypt_block   v3, w3, x2, x6, w7
154         st1             {v0.16b-v3.16b}, [x0], #64
155         mov             v4.16b, v3.16b                  /* last ct = next iv */
156         b               .Lcbcencloop4x
/* Tail: fewer than 4 blocks left. */
157 .Lcbcenc1x:
158         adds            w4, w4, #4
159         beq             .Lcbcencout
160 .Lcbcencloop:
161         ld1             {v0.16b}, [x1], #16             /* get next pt block */
162         eor             v4.16b, v4.16b, v0.16b          /* ..and xor with iv */
163         encrypt_block   v4, w3, x2, x6, w7
164         st1             {v4.16b}, [x0], #16
165         subs            w4, w4, #1
166         bne             .Lcbcencloop
167 .Lcbcencout:
168         st1             {v4.16b}, [x5]                  /* return iv */
169         ret
170 AES_FUNC_END(aes_cbc_encrypt)
171 AES_FUNC_END(aes_essiv_cbc_encrypt)
172
/*
 * ESSIV-CBC decrypt: encrypt the sector IV (x5) with the second key
 * (x6, AES-256 => 14 rounds) to derive the real IV, then join the
 * common CBC-decrypt path. cbciv (register alias from the including
 * file) tracks the IV / previous ciphertext block.
 */
173 AES_FUNC_START(aes_essiv_cbc_decrypt)
174         stp             x29, x30, [sp, #-16]!
175         mov             x29, sp
176
177         ld1             {cbciv.16b}, [x5]               /* get iv */
178
179         mov             w8, #14                         /* AES-256: 14 rounds */
180         enc_prepare     w8, x6, x7
181         encrypt_block   cbciv, w8, x6, x7, w9
182         b               .Lessivcbcdecstart
183
/*
 * CBC decrypt: x0 = out, x1 = in, x2 = round keys, w3 = #rounds,
 * w4 = #blocks, x5 = iv (read, and updated on return). Decryption can
 * be parallelised, so MAX_STRIDE blocks are decrypted per pass; the
 * ciphertext blocks needed for the final XOR are saved in spare regs
 * and reloaded from [x1] where registers run out.
 */
184 AES_FUNC_START(aes_cbc_decrypt)
185         stp             x29, x30, [sp, #-16]!
186         mov             x29, sp
187
188         ld1             {cbciv.16b}, [x5]               /* get iv */
189 .Lessivcbcdecstart:
190         dec_prepare     w3, x2, x6
191
192 .LcbcdecloopNx:
193         subs            w4, w4, #MAX_STRIDE
194         bmi             .Lcbcdec1x
195         ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
196 #if MAX_STRIDE == 5
197         ld1             {v4.16b}, [x1], #16             /* get 1 ct block */
198         mov             v5.16b, v0.16b                  /* stash ct for xor */
199         mov             v6.16b, v1.16b
200         mov             v7.16b, v2.16b
201         bl              aes_decrypt_block5x
202         sub             x1, x1, #32
203         eor             v0.16b, v0.16b, cbciv.16b
204         eor             v1.16b, v1.16b, v5.16b
205         ld1             {v5.16b}, [x1], #16             /* reload 1 ct block */
206         ld1             {cbciv.16b}, [x1], #16          /* reload 1 ct block */
207         eor             v2.16b, v2.16b, v6.16b
208         eor             v3.16b, v3.16b, v7.16b
209         eor             v4.16b, v4.16b, v5.16b
210 #else
211         mov             v4.16b, v0.16b                  /* stash ct for xor */
212         mov             v5.16b, v1.16b
213         mov             v6.16b, v2.16b
214         bl              aes_decrypt_block4x
215         sub             x1, x1, #16
216         eor             v0.16b, v0.16b, cbciv.16b
217         eor             v1.16b, v1.16b, v4.16b
218         ld1             {cbciv.16b}, [x1], #16          /* reload 1 ct block */
219         eor             v2.16b, v2.16b, v5.16b
220         eor             v3.16b, v3.16b, v6.16b
221 #endif
222         st1             {v0.16b-v3.16b}, [x0], #64
223 ST5(    st1             {v4.16b}, [x0], #16             )
224         b               .LcbcdecloopNx
/* Tail: fewer than MAX_STRIDE blocks left; one block at a time. */
225 .Lcbcdec1x:
226         adds            w4, w4, #MAX_STRIDE
227         beq             .Lcbcdecout
228 .Lcbcdecloop:
229         ld1             {v1.16b}, [x1], #16             /* get next ct block */
230         mov             v0.16b, v1.16b                  /* ...and copy to v0 */
231         decrypt_block   v0, w3, x2, x6, w7
232         eor             v0.16b, v0.16b, cbciv.16b       /* xor with iv => pt */
233         mov             cbciv.16b, v1.16b               /* ct is next iv */
234         st1             {v0.16b}, [x0], #16
235         subs            w4, w4, #1
236         bne             .Lcbcdecloop
237 .Lcbcdecout:
238         st1             {cbciv.16b}, [x5]               /* return iv */
239         ldp             x29, x30, [sp], #16
240         ret
241 AES_FUNC_END(aes_cbc_decrypt)
242 AES_FUNC_END(aes_essiv_cbc_decrypt)
243
244
245         /*
246          * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
247          *                     int rounds, int bytes, u8 const iv[])
248          * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
249          *                     int rounds, int bytes, u8 const iv[])
250          */
251
/*
 * CBC with ciphertext stealing, encrypt side. Exactly two blocks'
 * worth of data is handled here: x4 = total bytes (16 < x4 <= 32),
 * x5 = iv. v3/v4 are tbl permute vectors picked from
 * .Lcts_permute_table based on the size of the final partial block.
 */
252 AES_FUNC_START(aes_cbc_cts_encrypt)
253         adr_l           x8, .Lcts_permute_table
254         sub             x4, x4, #16                     /* x4 = bytes in final block */
255         add             x9, x8, #32
256         add             x8, x8, x4
257         sub             x9, x9, x4
258         ld1             {v3.16b}, [x8]
259         ld1             {v4.16b}, [x9]
260
261         ld1             {v0.16b}, [x1], x4              /* overlapping loads */
262         ld1             {v1.16b}, [x1]
263
264         ld1             {v5.16b}, [x5]                  /* get iv */
265         enc_prepare     w3, x2, x6
266
267         eor             v0.16b, v0.16b, v5.16b          /* xor with iv */
268         tbl             v1.16b, {v1.16b}, v4.16b        /* shift/pad final partial block */
269         encrypt_block   v0, w3, x2, x6, w7
270
271         eor             v1.16b, v1.16b, v0.16b
272         tbl             v0.16b, {v0.16b}, v3.16b        /* truncate first ct block */
273         encrypt_block   v1, w3, x2, x6, w7
274
/* Swapped order on output: full block first, stolen tail last. */
275         add             x4, x0, x4
276         st1             {v0.16b}, [x4]                  /* overlapping stores */
277         st1             {v1.16b}, [x0]
278         ret
279 AES_FUNC_END(aes_cbc_cts_encrypt)
280
/*
 * CBC with ciphertext stealing, decrypt side; inverse of the routine
 * above. x4 = total bytes (16 < x4 <= 32), x5 = iv. tbl/tbx with the
 * permute vectors reassemble the stolen bytes before the second
 * decryption.
 */
281 AES_FUNC_START(aes_cbc_cts_decrypt)
282         adr_l           x8, .Lcts_permute_table
283         sub             x4, x4, #16                     /* x4 = bytes in final block */
284         add             x9, x8, #32
285         add             x8, x8, x4
286         sub             x9, x9, x4
287         ld1             {v3.16b}, [x8]
288         ld1             {v4.16b}, [x9]
289
290         ld1             {v0.16b}, [x1], x4              /* overlapping loads */
291         ld1             {v1.16b}, [x1]
292
293         ld1             {v5.16b}, [x5]                  /* get iv */
294         dec_prepare     w3, x2, x6
295
296         decrypt_block   v0, w3, x2, x6, w7
297         tbl             v2.16b, {v0.16b}, v3.16b        /* tail of penultimate pt */
298         eor             v2.16b, v2.16b, v1.16b
299
300         tbx             v0.16b, {v1.16b}, v4.16b        /* merge stolen ct bytes */
301         decrypt_block   v0, w3, x2, x6, w7
302         eor             v0.16b, v0.16b, v5.16b          /* xor with iv */
303
304         add             x4, x0, x4
305         st1             {v2.16b}, [x4]                  /* overlapping stores */
306         st1             {v0.16b}, [x0]
307         ret
308 AES_FUNC_END(aes_cbc_cts_decrypt)
309
310         .section        ".rodata", "a"
311         .align          6
/*
 * Sliding window for tbl/tbx: indexing at offset n yields a permute
 * vector whose 0xff entries produce zero lanes (tbl) or leave the
 * destination untouched (tbx). Used by the CTS/CTR tail handling above.
 */
312 .Lcts_permute_table:
313         .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
314         .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
315         .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
316         .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
317         .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
318         .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
319         .previous
320
321
322         /*
323          * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
324          *                 int bytes, u8 ctr[], u8 finalbuf[])
325          */
326
/*
 * CTR encrypt: x0 = out, x1 = in, x2 = round keys, w3 = #rounds,
 * w4 = bytes, x5 = big-endian 128-bit counter (updated on return),
 * x6 = finalbuf for a trailing sub-block. vctr (alias from the
 * including file) holds the counter; x12 mirrors its low 64 bits in
 * CPU byte order so counter arithmetic can be done with rev/adds.
 */
327 AES_FUNC_START(aes_ctr_encrypt)
328         stp             x29, x30, [sp, #-16]!
329         mov             x29, sp
330
331         enc_prepare     w3, x2, x12
332         ld1             {vctr.16b}, [x5]
333
334         umov            x12, vctr.d[1]          /* keep swabbed ctr in reg */
335         rev             x12, x12
336
/*
 * Per pass: w7 = min(blocks remaining (rounded up), MAX_STRIDE);
 * w4 goes negative on the final, possibly partial, pass.
 */
337 .LctrloopNx:
338         add             w7, w4, #15
339         sub             w4, w4, #MAX_STRIDE << 4
340         lsr             w7, w7, #4
341         mov             w8, #MAX_STRIDE
342         cmp             w7, w8
343         csel            w7, w7, w8, lt
344         adds            x12, x12, x7            /* C set => low ctr word wrapped */
345
346         mov             v0.16b, vctr.16b
347         mov             v1.16b, vctr.16b
348         mov             v2.16b, vctr.16b
349         mov             v3.16b, vctr.16b
350 ST5(    mov             v4.16b, vctr.16b                )
351         bcs             0f                      /* handle wrap out of line */
352
353         .subsection     1
354         /* apply carry to outgoing counter */
355 0:      umov            x8, vctr.d[0]
356         rev             x8, x8
357         add             x8, x8, #1
358         rev             x8, x8
359         ins             vctr.d[0], x8
360
361         /* apply carry to N counter blocks for N := x12 */
/*
 * Computed branch into the list of 8-byte (bti, mov) pairs below so
 * that only the last x12 blocks (those past the wrap point) receive
 * the incremented high word.
 */
362         cbz             x12, 2f
363         adr             x16, 1f
364         sub             x16, x16, x12, lsl #3
365         br              x16
366         hint            34                      // bti c
367         mov             v0.d[0], vctr.d[0]
368         hint            34                      // bti c
369         mov             v1.d[0], vctr.d[0]
370         hint            34                      // bti c
371         mov             v2.d[0], vctr.d[0]
372         hint            34                      // bti c
373         mov             v3.d[0], vctr.d[0]
374 ST5(    hint            34                              )
375 ST5(    mov             v4.d[0], vctr.d[0]              )
376 1:      b               2f
377         .previous
378
/* Give blocks 1..N-1 their successive low counter words (x12 - k). */
379 2:      rev             x7, x12
380         ins             vctr.d[1], x7
381         sub             x7, x12, #MAX_STRIDE - 1
382         sub             x8, x12, #MAX_STRIDE - 2
383         sub             x9, x12, #MAX_STRIDE - 3
384         rev             x7, x7
385         rev             x8, x8
386         mov             v1.d[1], x7
387         rev             x9, x9
388 ST5(    sub             x10, x12, #MAX_STRIDE - 4       )
389         mov             v2.d[1], x8
390 ST5(    rev             x10, x10                        )
391         mov             v3.d[1], x9
392 ST5(    mov             v4.d[1], x10                    )
393         tbnz            w4, #31, .Lctrtail      /* < MAX_STRIDE*16 bytes left */
394         ld1             {v5.16b-v7.16b}, [x1], #48
395 ST4(    bl              aes_encrypt_block4x             )
396 ST5(    bl              aes_encrypt_block5x             )
/* XOR the keystream blocks with the input. */
397         eor             v0.16b, v5.16b, v0.16b
398 ST4(    ld1             {v5.16b}, [x1], #16             )
399         eor             v1.16b, v6.16b, v1.16b
400 ST5(    ld1             {v5.16b-v6.16b}, [x1], #32      )
401         eor             v2.16b, v7.16b, v2.16b
402         eor             v3.16b, v5.16b, v3.16b
403 ST5(    eor             v4.16b, v6.16b, v4.16b          )
404         st1             {v0.16b-v3.16b}, [x0], #64
405 ST5(    st1             {v4.16b}, [x0], #16             )
406         cbz             w4, .Lctrout
407         b               .LctrloopNx
408
409 .Lctrout:
410         st1             {vctr.16b}, [x5]        /* return next CTR value */
411         ldp             x29, x30, [sp], #16
412         ret
413
414 .Lctrtail:
415         /* XOR up to MAX_STRIDE * 16 - 1 bytes of in/output with v0 ... v3/v4 */
/*
 * x13 = size of the final (partial or full) block; x14..x16 become 16
 * or 0 depending on which whole blocks are still present, and are used
 * as post-increments so absent blocks degenerate into no-op loads.
 */
416         mov             x16, #16
417         ands            x13, x4, #0xf
418         csel            x13, x13, x16, ne
419
420 ST5(    cmp             w4, #64 - (MAX_STRIDE << 4)     )
421 ST5(    csel            x14, x16, xzr, gt               )
422         cmp             w4, #48 - (MAX_STRIDE << 4)
423         csel            x15, x16, xzr, gt
424         cmp             w4, #32 - (MAX_STRIDE << 4)
425         csel            x16, x16, xzr, gt
426         cmp             w4, #16 - (MAX_STRIDE << 4)
427         ble             .Lctrtail1x
428
429         adr_l           x12, .Lcts_permute_table
430         add             x12, x12, x13
431
432 ST5(    ld1             {v5.16b}, [x1], x14             )
433         ld1             {v6.16b}, [x1], x15
434         ld1             {v7.16b}, [x1], x16
435
436 ST4(    bl              aes_encrypt_block4x             )
437 ST5(    bl              aes_encrypt_block5x             )
438
439         ld1             {v8.16b}, [x1], x13
440         ld1             {v9.16b}, [x1]
441         ld1             {v10.16b}, [x12]        /* permute mask for last block */
442
443 ST4(    eor             v6.16b, v6.16b, v0.16b          )
444 ST4(    eor             v7.16b, v7.16b, v1.16b          )
445 ST4(    tbl             v3.16b, {v3.16b}, v10.16b       )
446 ST4(    eor             v8.16b, v8.16b, v2.16b          )
447 ST4(    eor             v9.16b, v9.16b, v3.16b          )
448
449 ST5(    eor             v5.16b, v5.16b, v0.16b          )
450 ST5(    eor             v6.16b, v6.16b, v1.16b          )
451 ST5(    tbl             v4.16b, {v4.16b}, v10.16b       )
452 ST5(    eor             v7.16b, v7.16b, v2.16b          )
453 ST5(    eor             v8.16b, v8.16b, v3.16b          )
454 ST5(    eor             v9.16b, v9.16b, v4.16b          )
455
456 ST5(    st1             {v5.16b}, [x0], x14             )
457         st1             {v6.16b}, [x0], x15
458         st1             {v7.16b}, [x0], x16
459         add             x13, x13, x0
460         st1             {v9.16b}, [x13]         // overlapping stores
461         st1             {v8.16b}, [x0]
462         b               .Lctrout
463
/* Exactly one (possibly partial) block left. */
464 .Lctrtail1x:
465         csel            x0, x0, x6, eq          // use finalbuf if less than a full block
466         ld1             {v5.16b}, [x1]
467 ST5(    mov             v3.16b, v4.16b                  )
468         encrypt_block   v3, w3, x2, x8, w7
469         eor             v5.16b, v5.16b, v3.16b
470         st1             {v5.16b}, [x0]
471         b               .Lctrout
472 AES_FUNC_END(aes_ctr_encrypt)
473
474
475         /*
476          * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
477          *                 int bytes, u8 const rk2[], u8 iv[], int first)
478          * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
479          *                 int bytes, u8 const rk2[], u8 iv[], int first)
480          */
481
/*
 * next_tweak: out = in * x in GF(2^128). sshr broadcasts each lane's
 * sign bit, the add doubles the tweak, and the masked/rotated sign
 * pattern folds the carry back in via the reduction constant 0x87
 * held in xtsmask (built by xts_load_mask below).
 */
482         .macro          next_tweak, out, in, tmp
483         sshr            \tmp\().2d,  \in\().2d,   #63
484         and             \tmp\().16b, \tmp\().16b, xtsmask.16b
485         add             \out\().2d,  \in\().2d,   \in\().2d
486         ext             \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
487         eor             \out\().16b, \out\().16b, \tmp\().16b
488         .endm
489
/* Materialise the {0x1, 0x87} carry/reduction constant into xtsmask. */
490         .macro          xts_load_mask, tmp
491         movi            xtsmask.2s, #0x1
492         movi            \tmp\().2s, #0x87
493         uzp1            xtsmask.4s, xtsmask.4s, \tmp\().4s
494         .endm
495
/*
 * XTS encrypt: x0 = out, x1 = in, x2 = data key (rk1), w3 = #rounds,
 * w4 = bytes, x5 = tweak key (rk2), x6 = iv/tweak (updated on return),
 * w7 = first-call flag. On the first call the tweak is derived by
 * encrypting the IV with rk2; v4 carries the running tweak.
 */
496 AES_FUNC_START(aes_xts_encrypt)
497         stp             x29, x30, [sp, #-16]!
498         mov             x29, sp
499
500         ld1             {v4.16b}, [x6]
501         xts_load_mask   v8
502         cbz             w7, .Lxtsencnotfirst
503
504         enc_prepare     w3, x5, x8
505         xts_cts_skip_tw w7, .LxtsencNx
506         encrypt_block   v4, w3, x5, x8, w7              /* first tweak */
507         enc_switch_key  w3, x2, x8
508         b               .LxtsencNx
509
510 .Lxtsencnotfirst:
511         enc_prepare     w3, x2, x8
512 .LxtsencloopNx:
513         next_tweak      v4, v4, v8
514 .LxtsencNx:
/* Bulk loop: 4 blocks per pass, tweaks v4..v7 computed on the fly. */
515         subs            w4, w4, #64
516         bmi             .Lxtsenc1x
517         ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
518         next_tweak      v5, v4, v8
519         eor             v0.16b, v0.16b, v4.16b
520         next_tweak      v6, v5, v8
521         eor             v1.16b, v1.16b, v5.16b
522         eor             v2.16b, v2.16b, v6.16b
523         next_tweak      v7, v6, v8
524         eor             v3.16b, v3.16b, v7.16b
525         bl              aes_encrypt_block4x
526         eor             v3.16b, v3.16b, v7.16b
527         eor             v0.16b, v0.16b, v4.16b
528         eor             v1.16b, v1.16b, v5.16b
529         eor             v2.16b, v2.16b, v6.16b
530         st1             {v0.16b-v3.16b}, [x0], #64
531         mov             v4.16b, v7.16b
532         cbz             w4, .Lxtsencret
533         xts_reload_mask v8                      /* block4x may clobber v8 */
534         b               .LxtsencloopNx
/* Tail: < 64 bytes; single blocks, then CTS for a trailing partial. */
535 .Lxtsenc1x:
536         adds            w4, w4, #64
537         beq             .Lxtsencout
538         subs            w4, w4, #16
539         bmi             .LxtsencctsNx
540 .Lxtsencloop:
541         ld1             {v0.16b}, [x1], #16
542 .Lxtsencctsout:
543         eor             v0.16b, v0.16b, v4.16b
544         encrypt_block   v0, w3, x2, x8, w7
545         eor             v0.16b, v0.16b, v4.16b
546         cbz             w4, .Lxtsencout
547         subs            w4, w4, #16
548         next_tweak      v4, v4, v8
549         bmi             .Lxtsenccts
550         st1             {v0.16b}, [x0], #16
551         b               .Lxtsencloop
552 .Lxtsencout:
553         st1             {v0.16b}, [x0]
554 .Lxtsencret:
555         st1             {v4.16b}, [x6]                  /* return tweak */
556         ldp             x29, x30, [sp], #16
557         ret
558
/* Ciphertext stealing: rewind so the last full ct block is v0. */
559 .LxtsencctsNx:
560         mov             v0.16b, v3.16b
561         sub             x0, x0, #16
562 .Lxtsenccts:
563         adr_l           x8, .Lcts_permute_table
564
565         add             x1, x1, w4, sxtw        /* rewind input pointer */
566         add             w4, w4, #16             /* # bytes in final block */
567         add             x9, x8, #32
568         add             x8, x8, x4
569         sub             x9, x9, x4
570         add             x4, x0, x4              /* output address of final block */
571
572         ld1             {v1.16b}, [x1]          /* load final block */
573         ld1             {v2.16b}, [x8]
574         ld1             {v3.16b}, [x9]
575
/* Steal the tail of the previous ct block, then loop back to encrypt. */
576         tbl             v2.16b, {v0.16b}, v2.16b
577         tbx             v0.16b, {v1.16b}, v3.16b
578         st1             {v2.16b}, [x4]                  /* overlapping stores */
579         mov             w4, wzr
580         b               .Lxtsencctsout
581 AES_FUNC_END(aes_xts_encrypt)
582
/*
 * XTS decrypt; register contract matches aes_xts_encrypt above.
 * For CTS the last full block must use the *next* tweak (v5) while the
 * stolen partial block uses the current one — hence the w4 adjustment
 * up front to hold back one block when the length is not a multiple
 * of 16.
 */
583 AES_FUNC_START(aes_xts_decrypt)
584         stp             x29, x30, [sp, #-16]!
585         mov             x29, sp
586
587         /* subtract 16 bytes if we are doing CTS */
588         sub             w8, w4, #0x10
589         tst             w4, #0xf
590         csel            w4, w4, w8, eq
591
592         ld1             {v4.16b}, [x6]
593         xts_load_mask   v8
594         xts_cts_skip_tw w7, .Lxtsdecskiptw
595         cbz             w7, .Lxtsdecnotfirst
596
597         enc_prepare     w3, x5, x8
598         encrypt_block   v4, w3, x5, x8, w7              /* first tweak */
599 .Lxtsdecskiptw:
600         dec_prepare     w3, x2, x8
601         b               .LxtsdecNx
602
603 .Lxtsdecnotfirst:
604         dec_prepare     w3, x2, x8
605 .LxtsdecloopNx:
606         next_tweak      v4, v4, v8
607 .LxtsdecNx:
/* Bulk loop: 4 blocks per pass, tweaks v4..v7 computed on the fly. */
608         subs            w4, w4, #64
609         bmi             .Lxtsdec1x
610         ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
611         next_tweak      v5, v4, v8
612         eor             v0.16b, v0.16b, v4.16b
613         next_tweak      v6, v5, v8
614         eor             v1.16b, v1.16b, v5.16b
615         eor             v2.16b, v2.16b, v6.16b
616         next_tweak      v7, v6, v8
617         eor             v3.16b, v3.16b, v7.16b
618         bl              aes_decrypt_block4x
619         eor             v3.16b, v3.16b, v7.16b
620         eor             v0.16b, v0.16b, v4.16b
621         eor             v1.16b, v1.16b, v5.16b
622         eor             v2.16b, v2.16b, v6.16b
623         st1             {v0.16b-v3.16b}, [x0], #64
624         mov             v4.16b, v7.16b
625         cbz             w4, .Lxtsdecout
626         xts_reload_mask v8                      /* block4x may clobber v8 */
627         b               .LxtsdecloopNx
/* Tail: < 64 bytes; single blocks, CTS branch on the final partial. */
628 .Lxtsdec1x:
629         adds            w4, w4, #64
630         beq             .Lxtsdecout
631         subs            w4, w4, #16
632 .Lxtsdecloop:
633         ld1             {v0.16b}, [x1], #16
634         bmi             .Lxtsdeccts
635 .Lxtsdecctsout:
636         eor             v0.16b, v0.16b, v4.16b
637         decrypt_block   v0, w3, x2, x8, w7
638         eor             v0.16b, v0.16b, v4.16b
639         st1             {v0.16b}, [x0], #16
640         cbz             w4, .Lxtsdecout
641         subs            w4, w4, #16
642         next_tweak      v4, v4, v8
643         b               .Lxtsdecloop
644 .Lxtsdecout:
645         st1             {v4.16b}, [x6]                  /* return tweak */
646         ldp             x29, x30, [sp], #16
647         ret
648
/* Ciphertext stealing for the final 17..31 bytes. */
649 .Lxtsdeccts:
650         adr_l           x8, .Lcts_permute_table
651
652         add             x1, x1, w4, sxtw        /* rewind input pointer */
653         add             w4, w4, #16             /* # bytes in final block */
654         add             x9, x8, #32
655         add             x8, x8, x4
656         sub             x9, x9, x4
657         add             x4, x0, x4              /* output address of final block */
658
659         next_tweak      v5, v4, v8              /* tweak for last full block */
660
661         ld1             {v1.16b}, [x1]          /* load final block */
662         ld1             {v2.16b}, [x8]
663         ld1             {v3.16b}, [x9]
664
665         eor             v0.16b, v0.16b, v5.16b
666         decrypt_block   v0, w3, x2, x8, w7
667         eor             v0.16b, v0.16b, v5.16b
668
/* Merge stolen ct bytes, then loop back to decrypt with tweak v4. */
669         tbl             v2.16b, {v0.16b}, v2.16b
670         tbx             v0.16b, {v1.16b}, v3.16b
671
672         st1             {v2.16b}, [x4]                  /* overlapping stores */
673         mov             w4, wzr
674         b               .Lxtsdecctsout
675 AES_FUNC_END(aes_xts_decrypt)
676
677         /*
678          * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
679          *                int blocks, u8 dg[], int enc_before, int enc_after)
680          */
/*
 * CBC-MAC style digest update. Per the prototype above: x0 = in,
 * x1 = round keys, w2 = #rounds, w3 = #blocks, x4 = dg (digest,
 * in/out), w5 = enc_before, w6 = enc_after. Returns (in w0) the number
 * of blocks still unprocessed when cond_yield forces an early exit.
 */
681 AES_FUNC_START(aes_mac_update)
682         ld1             {v0.16b}, [x4]                  /* get dg */
683         enc_prepare     w2, x1, x7
684         cbz             w5, .Lmacloop4x
685
/* enc_before != 0: encrypt the digest once before absorbing data. */
686         encrypt_block   v0, w2, x1, x7, w8
687
688 .Lmacloop4x:
689         subs            w3, w3, #4
690         bmi             .Lmac1x
691         ld1             {v1.16b-v4.16b}, [x0], #64      /* get next pt block */
692         eor             v0.16b, v0.16b, v1.16b          /* ..and xor with dg */
693         encrypt_block   v0, w2, x1, x7, w8
694         eor             v0.16b, v0.16b, v2.16b
695         encrypt_block   v0, w2, x1, x7, w8
696         eor             v0.16b, v0.16b, v3.16b
697         encrypt_block   v0, w2, x1, x7, w8
698         eor             v0.16b, v0.16b, v4.16b
/* Last block overall? then honour enc_after; otherwise always encrypt. */
699         cmp             w3, wzr
700         csinv           x5, x6, xzr, eq
701         cbz             w5, .Lmacout
702         encrypt_block   v0, w2, x1, x7, w8
703         st1             {v0.16b}, [x4]                  /* return dg */
704         cond_yield      .Lmacout, x7, x8        /* bail out if preemption is due */
705         b               .Lmacloop4x
/* Tail: fewer than 4 blocks remain. */
706 .Lmac1x:
707         add             w3, w3, #4
708 .Lmacloop:
709         cbz             w3, .Lmacout
710         ld1             {v1.16b}, [x0], #16             /* get next pt block */
711         eor             v0.16b, v0.16b, v1.16b          /* ..and xor with dg */
712
/* On the final block, skip the encryption unless enc_after != 0. */
713         subs            w3, w3, #1
714         csinv           x5, x6, xzr, eq
715         cbz             w5, .Lmacout
716
717 .Lmacenc:
718         encrypt_block   v0, w2, x1, x7, w8
719         b               .Lmacloop
720
721 .Lmacout:
722         st1             {v0.16b}, [x4]                  /* return dg */
723         mov             w0, w3                          /* #blocks not processed */
724         ret
725 AES_FUNC_END(aes_mac_update)