/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/* included by aes-ce.S and aes-neon.S */

        .text
        .align          4

#ifndef MAX_STRIDE
#define MAX_STRIDE      4
#endif

#if MAX_STRIDE == 4
#define ST4(x...) x
#define ST5(x...)
#else
#define ST4(x...)
#define ST5(x...) x
#endif
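/*
 * The including file sets MAX_STRIDE to the number of AES blocks that are
 * processed in parallel. Lines wrapped in ST4() are assembled only in the
 * 4-way interleaved build, lines wrapped in ST5() only in the 5-way build:
 * exactly one of the two macros expands to its argument, the other to
 * nothing.
 */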

SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
        encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
        ret
SYM_FUNC_END(aes_encrypt_block4x)

SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
        decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
        ret
SYM_FUNC_END(aes_decrypt_block4x)

#if MAX_STRIDE == 5
SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
        encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
        ret
SYM_FUNC_END(aes_encrypt_block5x)

SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
        decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
        ret
SYM_FUNC_END(aes_decrypt_block5x)
#endif

        /*
         * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks)
         * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks)
         */
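        /*
         * A minimal caller sketch (not part of this file): the C glue
         * code is expected to bracket calls with kernel_neon_begin()/
         * kernel_neon_end() and to pass the expanded key schedule along
         * with the round count (10/12/14 for AES-128/192/256). The ctx
         * layout and names here are assumptions:
         *
         *      kernel_neon_begin();
         *      aes_ecb_encrypt(dst, src, ctx->key_enc, ctx->rounds, blocks);
         *      kernel_neon_end();
         */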

AES_FUNC_START(aes_ecb_encrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        enc_prepare     w3, x2, x5

.LecbencloopNx:
        subs            w4, w4, #MAX_STRIDE
        bmi             .Lecbenc1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
ST4(    bl              aes_encrypt_block4x             )
ST5(    ld1             {v4.16b}, [x1], #16             )
ST5(    bl              aes_encrypt_block5x             )
        st1             {v0.16b-v3.16b}, [x0], #64
ST5(    st1             {v4.16b}, [x0], #16             )
        b               .LecbencloopNx
.Lecbenc1x:
        adds            w4, w4, #MAX_STRIDE
        beq             .Lecbencout
.Lecbencloop:
        ld1             {v0.16b}, [x1], #16             /* get next pt block */
        encrypt_block   v0, w3, x2, x5, w6
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lecbencloop
.Lecbencout:
        ldp             x29, x30, [sp], #16
        ret
AES_FUNC_END(aes_ecb_encrypt)


AES_FUNC_START(aes_ecb_decrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        dec_prepare     w3, x2, x5

.LecbdecloopNx:
        subs            w4, w4, #MAX_STRIDE
        bmi             .Lecbdec1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
ST4(    bl              aes_decrypt_block4x             )
ST5(    ld1             {v4.16b}, [x1], #16             )
ST5(    bl              aes_decrypt_block5x             )
        st1             {v0.16b-v3.16b}, [x0], #64
ST5(    st1             {v4.16b}, [x0], #16             )
        b               .LecbdecloopNx
.Lecbdec1x:
        adds            w4, w4, #MAX_STRIDE
        beq             .Lecbdecout
.Lecbdecloop:
        ld1             {v0.16b}, [x1], #16             /* get next ct block */
        decrypt_block   v0, w3, x2, x5, w6
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lecbdecloop
.Lecbdecout:
        ldp             x29, x30, [sp], #16
        ret
AES_FUNC_END(aes_ecb_decrypt)


        /*
         * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks, u8 iv[])
         * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks, u8 iv[])
         * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
         *                       int rounds, int blocks, u8 iv[],
         *                       u32 const rk2[]);
         * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
         *                       int rounds, int blocks, u8 iv[],
         *                       u32 const rk2[]);
         */
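        /*
         * ESSIV note: for the essiv variants the IV loaded from x5 is
         * first encrypted with the second key schedule rk2, and only
         * then used as the CBC IV. rk2 is always an AES-256 schedule,
         * hence the hardcoded 14 rounds below; in the usual ESSIV
         * construction it is derived by hashing the data key, roughly
         * (pseudo-C, hypothetical names):
         *
         *      sha256(key1, key1_len, salt);           // 256-bit salt
         *      aes_expandkey(&essiv_ctx, salt, 32);    // -> rk2
         *      // CBC then runs with IV = E(rk2, iv)
         */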

AES_FUNC_START(aes_essiv_cbc_encrypt)
        ld1             {v4.16b}, [x5]                  /* get iv */

        mov             w8, #14                         /* AES-256: 14 rounds */
        enc_prepare     w8, x6, x7
        encrypt_block   v4, w8, x6, x7, w9
        enc_switch_key  w3, x2, x6
        b               .Lcbcencloop4x

AES_FUNC_START(aes_cbc_encrypt)
        ld1             {v4.16b}, [x5]                  /* get iv */
        enc_prepare     w3, x2, x6

.Lcbcencloop4x:
        subs            w4, w4, #4
        bmi             .Lcbcenc1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
        eor             v0.16b, v0.16b, v4.16b          /* ..and xor with iv */
        encrypt_block   v0, w3, x2, x6, w7
        eor             v1.16b, v1.16b, v0.16b
        encrypt_block   v1, w3, x2, x6, w7
        eor             v2.16b, v2.16b, v1.16b
        encrypt_block   v2, w3, x2, x6, w7
        eor             v3.16b, v3.16b, v2.16b
        encrypt_block   v3, w3, x2, x6, w7
        st1             {v0.16b-v3.16b}, [x0], #64
        mov             v4.16b, v3.16b
        b               .Lcbcencloop4x
.Lcbcenc1x:
        adds            w4, w4, #4
        beq             .Lcbcencout
.Lcbcencloop:
        ld1             {v0.16b}, [x1], #16             /* get next pt block */
        eor             v4.16b, v4.16b, v0.16b          /* ..and xor with iv */
        encrypt_block   v4, w3, x2, x6, w7
        st1             {v4.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lcbcencloop
.Lcbcencout:
        st1             {v4.16b}, [x5]                  /* return iv */
        ret
AES_FUNC_END(aes_cbc_encrypt)
AES_FUNC_END(aes_essiv_cbc_encrypt)

AES_FUNC_START(aes_essiv_cbc_decrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        ld1             {cbciv.16b}, [x5]               /* get iv */

        mov             w8, #14                         /* AES-256: 14 rounds */
        enc_prepare     w8, x6, x7
        encrypt_block   cbciv, w8, x6, x7, w9
        b               .Lessivcbcdecstart

AES_FUNC_START(aes_cbc_decrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        ld1             {cbciv.16b}, [x5]               /* get iv */
.Lessivcbcdecstart:
        dec_prepare     w3, x2, x6

.LcbcdecloopNx:
        subs            w4, w4, #MAX_STRIDE
        bmi             .Lcbcdec1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
#if MAX_STRIDE == 5
        ld1             {v4.16b}, [x1], #16             /* get 1 ct block */
        mov             v5.16b, v0.16b
        mov             v6.16b, v1.16b
        mov             v7.16b, v2.16b
        bl              aes_decrypt_block5x
        sub             x1, x1, #32
        eor             v0.16b, v0.16b, cbciv.16b
        eor             v1.16b, v1.16b, v5.16b
        ld1             {v5.16b}, [x1], #16             /* reload 1 ct block */
        ld1             {cbciv.16b}, [x1], #16          /* reload 1 ct block */
        eor             v2.16b, v2.16b, v6.16b
        eor             v3.16b, v3.16b, v7.16b
        eor             v4.16b, v4.16b, v5.16b
#else
        mov             v4.16b, v0.16b
        mov             v5.16b, v1.16b
        mov             v6.16b, v2.16b
        bl              aes_decrypt_block4x
        sub             x1, x1, #16
        eor             v0.16b, v0.16b, cbciv.16b
        eor             v1.16b, v1.16b, v4.16b
        ld1             {cbciv.16b}, [x1], #16          /* reload 1 ct block */
        eor             v2.16b, v2.16b, v5.16b
        eor             v3.16b, v3.16b, v6.16b
#endif
        st1             {v0.16b-v3.16b}, [x0], #64
ST5(    st1             {v4.16b}, [x0], #16             )
        b               .LcbcdecloopNx
.Lcbcdec1x:
        adds            w4, w4, #MAX_STRIDE
        beq             .Lcbcdecout
.Lcbcdecloop:
        ld1             {v1.16b}, [x1], #16             /* get next ct block */
        mov             v0.16b, v1.16b                  /* ...and copy to v0 */
        decrypt_block   v0, w3, x2, x6, w7
        eor             v0.16b, v0.16b, cbciv.16b       /* xor with iv => pt */
        mov             cbciv.16b, v1.16b               /* ct is next iv */
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lcbcdecloop
.Lcbcdecout:
        st1             {cbciv.16b}, [x5]               /* return iv */
        ldp             x29, x30, [sp], #16
        ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)


        /*
         * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
         *                     int rounds, int bytes, u8 const iv[])
         * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
         *                     int rounds, int bytes, u8 const iv[])
         */
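        /*
         * Ciphertext stealing: these helpers process only the final pair
         * of blocks of a message whose length is not a multiple of the
         * block size. The overlapping loads below pick up the last full
         * block and the partial tail, and the index vectors fetched from
         * .Lcts_permute_table let tbl/tbx shift, zero-pad or merge the
         * partial block entirely inside NEON registers. On output, the
         * last two ciphertext blocks swap places, with the truncated
         * block written last via overlapping stores.
         */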

AES_FUNC_START(aes_cbc_cts_encrypt)
        adr_l           x8, .Lcts_permute_table
        sub             x4, x4, #16
        add             x9, x8, #32
        add             x8, x8, x4
        sub             x9, x9, x4
        ld1             {v3.16b}, [x8]
        ld1             {v4.16b}, [x9]

        ld1             {v0.16b}, [x1], x4              /* overlapping loads */
        ld1             {v1.16b}, [x1]

        ld1             {v5.16b}, [x5]                  /* get iv */
        enc_prepare     w3, x2, x6

        eor             v0.16b, v0.16b, v5.16b          /* xor with iv */
        tbl             v1.16b, {v1.16b}, v4.16b
        encrypt_block   v0, w3, x2, x6, w7

        eor             v1.16b, v1.16b, v0.16b
        tbl             v0.16b, {v0.16b}, v3.16b
        encrypt_block   v1, w3, x2, x6, w7

        add             x4, x0, x4
        st1             {v0.16b}, [x4]                  /* overlapping stores */
        st1             {v1.16b}, [x0]
        ret
AES_FUNC_END(aes_cbc_cts_encrypt)

AES_FUNC_START(aes_cbc_cts_decrypt)
        adr_l           x8, .Lcts_permute_table
        sub             x4, x4, #16
        add             x9, x8, #32
        add             x8, x8, x4
        sub             x9, x9, x4
        ld1             {v3.16b}, [x8]
        ld1             {v4.16b}, [x9]

        ld1             {v0.16b}, [x1], x4              /* overlapping loads */
        ld1             {v1.16b}, [x1]

        ld1             {v5.16b}, [x5]                  /* get iv */
        dec_prepare     w3, x2, x6

        decrypt_block   v0, w3, x2, x6, w7
        tbl             v2.16b, {v0.16b}, v3.16b
        eor             v2.16b, v2.16b, v1.16b

        tbx             v0.16b, {v1.16b}, v4.16b
        decrypt_block   v0, w3, x2, x6, w7
        eor             v0.16b, v0.16b, v5.16b          /* xor with iv */

        add             x4, x0, x4
        st1             {v2.16b}, [x4]                  /* overlapping stores */
        st1             {v0.16b}, [x0]
        ret
AES_FUNC_END(aes_cbc_cts_decrypt)

        .section        ".rodata", "a"
        .align          6
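/*
 * Index vectors loaded from a sliding window into this table drive the
 * tbl/tbx permutes in the CTS and tail handling above and below: a 0xff
 * entry makes tbl produce 0 and makes tbx leave the destination byte
 * untouched, while entries 0x0-0xf select the corresponding source byte.
 * Offsetting the window by the tail length thus yields a mask that moves
 * a partial block into place.
 */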
.Lcts_permute_table:
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .previous


        /*
         * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int bytes, u8 ctr[], u8 finalbuf[])
         */
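        /*
         * CTR note: this routine takes a byte count, not a block count,
         * and writes the incremented big-endian counter back to ctr[]
         * on return. When the request ends in a partial block, the
         * final output block is written to finalbuf rather than out[],
         * so the caller can copy out only the bytes it needs. A
         * hypothetical glue-side sketch (names assumed, not taken from
         * this file):
         *
         *      kernel_neon_begin();
         *      aes_ctr_encrypt(dst, src, ctx->key_enc, rounds, bytes,
         *                      walk.iv, tailbuf);
         *      kernel_neon_end();
         *      if (bytes % AES_BLOCK_SIZE)
         *              memcpy(dst + bytes - bytes % AES_BLOCK_SIZE,
         *                     tailbuf, bytes % AES_BLOCK_SIZE);
         */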

AES_FUNC_START(aes_ctr_encrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        enc_prepare     w3, x2, x12
        ld1             {vctr.16b}, [x5]

        umov            x12, vctr.d[1]          /* keep swabbed ctr in reg */
        rev             x12, x12

.LctrloopNx:
        add             w7, w4, #15
        sub             w4, w4, #MAX_STRIDE << 4
        lsr             w7, w7, #4
        mov             w8, #MAX_STRIDE
        cmp             w7, w8
        csel            w7, w7, w8, lt
        adds            x12, x12, x7

        mov             v0.16b, vctr.16b
        mov             v1.16b, vctr.16b
        mov             v2.16b, vctr.16b
        mov             v3.16b, vctr.16b
ST5(    mov             v4.16b, vctr.16b                )
        bcs             0f

        .subsection     1
        /* apply carry to outgoing counter */
0:      umov            x8, vctr.d[0]
        rev             x8, x8
        add             x8, x8, #1
        rev             x8, x8
        ins             vctr.d[0], x8

        /* apply carry to N counter blocks for N := x12 */
        adr             x16, 1f
        sub             x16, x16, x12, lsl #3
        br              x16
        hint            34                      // bti c
        mov             v0.d[0], vctr.d[0]
        hint            34                      // bti c
        mov             v1.d[0], vctr.d[0]
        hint            34                      // bti c
        mov             v2.d[0], vctr.d[0]
        hint            34                      // bti c
        mov             v3.d[0], vctr.d[0]
ST5(    hint            34                              )
ST5(    mov             v4.d[0], vctr.d[0]              )
1:      b               2f
        .previous

2:      rev             x7, x12
        ins             vctr.d[1], x7
        sub             x7, x12, #MAX_STRIDE - 1
        sub             x8, x12, #MAX_STRIDE - 2
        sub             x9, x12, #MAX_STRIDE - 3
        rev             x7, x7
        rev             x8, x8
        mov             v1.d[1], x7
        rev             x9, x9
ST5(    sub             x10, x12, #MAX_STRIDE - 4       )
        mov             v2.d[1], x8
ST5(    rev             x10, x10                        )
        mov             v3.d[1], x9
ST5(    mov             v4.d[1], x10                    )
        tbnz            w4, #31, .Lctrtail
        ld1             {v5.16b-v7.16b}, [x1], #48
ST4(    bl              aes_encrypt_block4x             )
ST5(    bl              aes_encrypt_block5x             )
        eor             v0.16b, v5.16b, v0.16b
ST4(    ld1             {v5.16b}, [x1], #16             )
        eor             v1.16b, v6.16b, v1.16b
ST5(    ld1             {v5.16b-v6.16b}, [x1], #32      )
        eor             v2.16b, v7.16b, v2.16b
        eor             v3.16b, v5.16b, v3.16b
ST5(    eor             v4.16b, v6.16b, v4.16b          )
        st1             {v0.16b-v3.16b}, [x0], #64
ST5(    st1             {v4.16b}, [x0], #16             )
        cbz             w4, .Lctrout
        b               .LctrloopNx

.Lctrout:
        st1             {vctr.16b}, [x5]        /* return next CTR value */
        ldp             x29, x30, [sp], #16
        ret

.Lctrtail:
        /* XOR up to MAX_STRIDE * 16 - 1 bytes of in/output with v0 ... v3/v4 */
        mov             x16, #16
        ands            x13, x4, #0xf
        csel            x13, x13, x16, ne

ST5(    cmp             w4, #64 - (MAX_STRIDE << 4)     )
ST5(    csel            x14, x16, xzr, gt               )
        cmp             w4, #48 - (MAX_STRIDE << 4)
        csel            x15, x16, xzr, gt
        cmp             w4, #32 - (MAX_STRIDE << 4)
        csel            x16, x16, xzr, gt
        cmp             w4, #16 - (MAX_STRIDE << 4)
        ble             .Lctrtail1x

        adr_l           x12, .Lcts_permute_table
        add             x12, x12, x13

ST5(    ld1             {v5.16b}, [x1], x14             )
        ld1             {v6.16b}, [x1], x15
        ld1             {v7.16b}, [x1], x16

ST4(    bl              aes_encrypt_block4x             )
ST5(    bl              aes_encrypt_block5x             )

        ld1             {v8.16b}, [x1], x13
        ld1             {v9.16b}, [x1]
        ld1             {v10.16b}, [x12]

ST4(    eor             v6.16b, v6.16b, v0.16b          )
ST4(    eor             v7.16b, v7.16b, v1.16b          )
ST4(    tbl             v3.16b, {v3.16b}, v10.16b       )
ST4(    eor             v8.16b, v8.16b, v2.16b          )
ST4(    eor             v9.16b, v9.16b, v3.16b          )

ST5(    eor             v5.16b, v5.16b, v0.16b          )
ST5(    eor             v6.16b, v6.16b, v1.16b          )
ST5(    tbl             v4.16b, {v4.16b}, v10.16b       )
ST5(    eor             v7.16b, v7.16b, v2.16b          )
ST5(    eor             v8.16b, v8.16b, v3.16b          )
ST5(    eor             v9.16b, v9.16b, v4.16b          )

ST5(    st1             {v5.16b}, [x0], x14             )
        st1             {v6.16b}, [x0], x15
        st1             {v7.16b}, [x0], x16
        add             x13, x13, x0
        st1             {v9.16b}, [x13]         // overlapping stores
        st1             {v8.16b}, [x0]
        b               .Lctrout

.Lctrtail1x:
        csel            x0, x0, x6, eq          // use finalbuf if less than a full block
        ld1             {v5.16b}, [x1]
ST5(    mov             v3.16b, v4.16b                  )
        encrypt_block   v3, w3, x2, x8, w7
        eor             v5.16b, v5.16b, v3.16b
        st1             {v5.16b}, [x0]
        b               .Lctrout
AES_FUNC_END(aes_ctr_encrypt)


        /*
         * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
         *                 int bytes, u8 const rk2[], u8 iv[], int first)
         * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
         *                 int bytes, u8 const rk2[], u8 iv[], int first)
         */
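        /*
         * XTS note: rk2 is used only when 'first' is nonzero, to turn
         * the IV into the initial tweak by encrypting it; the running
         * tweak is kept in iv[] across calls, so a continuation call on
         * the same message passes first == 0. The byte count need not
         * be a multiple of 16: a trailing partial block is handled by
         * ciphertext stealing, reusing .Lcts_permute_table above.
         */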

        .macro          next_tweak, out, in, tmp
        sshr            \tmp\().2d,  \in\().2d,   #63
        and             \tmp\().16b, \tmp\().16b, xtsmask.16b
        add             \out\().2d,  \in\().2d,   \in\().2d
        ext             \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
        eor             \out\().16b, \out\().16b, \tmp\().16b
        .endm
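        /*
         * The macro above multiplies the 128-bit tweak by x in GF(2^128)
         * with the XTS reduction polynomial x^7 + x^2 + x + 1 (0x87):
         * each 64-bit lane is doubled, the carry out of the low lane
         * moves into bit 0 of the high lane, and a carry out of the high
         * lane wraps around into the low lane as 0x87. The sshr/and pair
         * turns each lane's sign bit into the right constant (xtsmask
         * holds { 0x1, 0x87 }), and ext swaps the two lanes so each
         * carry lands in the other half. Equivalent C sketch
         * (hypothetical helper, little-endian tweak as two u64 halves):
         *
         *      void next_tweak(u64 t[2])
         *      {
         *              u64 wrap = (u64)((s64)t[1] >> 63) & 0x87;
         *
         *              t[1] = (t[1] << 1) | (t[0] >> 63);
         *              t[0] = (t[0] << 1) ^ wrap;
         *      }
         */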

        .macro          xts_load_mask, tmp
        movi            xtsmask.2s, #0x1
        movi            \tmp\().2s, #0x87
        uzp1            xtsmask.4s, xtsmask.4s, \tmp\().4s
        .endm

AES_FUNC_START(aes_xts_encrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        ld1             {v4.16b}, [x6]
        xts_load_mask   v8
        cbz             w7, .Lxtsencnotfirst

        enc_prepare     w3, x5, x8
        xts_cts_skip_tw w7, .LxtsencNx
        encrypt_block   v4, w3, x5, x8, w7              /* first tweak */
        enc_switch_key  w3, x2, x8
        b               .LxtsencNx

.Lxtsencnotfirst:
        enc_prepare     w3, x2, x8
.LxtsencloopNx:
        next_tweak      v4, v4, v8
.LxtsencNx:
        subs            w4, w4, #64
        bmi             .Lxtsenc1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
        next_tweak      v5, v4, v8
        eor             v0.16b, v0.16b, v4.16b
        next_tweak      v6, v5, v8
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        next_tweak      v7, v6, v8
        eor             v3.16b, v3.16b, v7.16b
        bl              aes_encrypt_block4x
        eor             v3.16b, v3.16b, v7.16b
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        st1             {v0.16b-v3.16b}, [x0], #64
        mov             v4.16b, v7.16b
        cbz             w4, .Lxtsencret
        xts_reload_mask v8
        b               .LxtsencloopNx
.Lxtsenc1x:
        adds            w4, w4, #64
        beq             .Lxtsencout
        subs            w4, w4, #16
        bmi             .LxtsencctsNx
.Lxtsencloop:
        ld1             {v0.16b}, [x1], #16
.Lxtsencctsout:
        eor             v0.16b, v0.16b, v4.16b
        encrypt_block   v0, w3, x2, x8, w7
        eor             v0.16b, v0.16b, v4.16b
        cbz             w4, .Lxtsencout
        subs            w4, w4, #16
        next_tweak      v4, v4, v8
        bmi             .Lxtsenccts
        st1             {v0.16b}, [x0], #16
        b               .Lxtsencloop
.Lxtsencout:
        st1             {v0.16b}, [x0]
.Lxtsencret:
        st1             {v4.16b}, [x6]
        ldp             x29, x30, [sp], #16
        ret

.LxtsencctsNx:
        mov             v0.16b, v3.16b
        sub             x0, x0, #16
.Lxtsenccts:
        adr_l           x8, .Lcts_permute_table

        add             x1, x1, w4, sxtw        /* rewind input pointer */
        add             w4, w4, #16             /* # bytes in final block */
        add             x9, x8, #32
        add             x8, x8, x4
        sub             x9, x9, x4
        add             x4, x0, x4              /* output address of final block */

        ld1             {v1.16b}, [x1]          /* load final block */
        ld1             {v2.16b}, [x8]
        ld1             {v3.16b}, [x9]

        tbl             v2.16b, {v0.16b}, v2.16b
        tbx             v0.16b, {v1.16b}, v3.16b
        st1             {v2.16b}, [x4]                  /* overlapping stores */
        mov             w4, wzr
        b               .Lxtsencctsout
AES_FUNC_END(aes_xts_encrypt)

AES_FUNC_START(aes_xts_decrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        /* subtract 16 bytes if we are doing CTS */
        sub             w8, w4, #0x10
        tst             w4, #0xf
        csel            w4, w4, w8, eq

        ld1             {v4.16b}, [x6]
        xts_load_mask   v8
        xts_cts_skip_tw w7, .Lxtsdecskiptw
        cbz             w7, .Lxtsdecnotfirst

        enc_prepare     w3, x5, x8
        encrypt_block   v4, w3, x5, x8, w7              /* first tweak */
.Lxtsdecskiptw:
        dec_prepare     w3, x2, x8
        b               .LxtsdecNx

.Lxtsdecnotfirst:
        dec_prepare     w3, x2, x8
.LxtsdecloopNx:
        next_tweak      v4, v4, v8
.LxtsdecNx:
        subs            w4, w4, #64
        bmi             .Lxtsdec1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
        next_tweak      v5, v4, v8
        eor             v0.16b, v0.16b, v4.16b
        next_tweak      v6, v5, v8
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        next_tweak      v7, v6, v8
        eor             v3.16b, v3.16b, v7.16b
        bl              aes_decrypt_block4x
        eor             v3.16b, v3.16b, v7.16b
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        st1             {v0.16b-v3.16b}, [x0], #64
        mov             v4.16b, v7.16b
        cbz             w4, .Lxtsdecout
        xts_reload_mask v8
        b               .LxtsdecloopNx
.Lxtsdec1x:
        adds            w4, w4, #64
        beq             .Lxtsdecout
        subs            w4, w4, #16
.Lxtsdecloop:
        ld1             {v0.16b}, [x1], #16
        bmi             .Lxtsdeccts
.Lxtsdecctsout:
        eor             v0.16b, v0.16b, v4.16b
        decrypt_block   v0, w3, x2, x8, w7
        eor             v0.16b, v0.16b, v4.16b
        st1             {v0.16b}, [x0], #16
        cbz             w4, .Lxtsdecout
        subs            w4, w4, #16
        next_tweak      v4, v4, v8
        b               .Lxtsdecloop
.Lxtsdecout:
        st1             {v4.16b}, [x6]
        ldp             x29, x30, [sp], #16
        ret

.Lxtsdeccts:
        adr_l           x8, .Lcts_permute_table

        add             x1, x1, w4, sxtw        /* rewind input pointer */
        add             w4, w4, #16             /* # bytes in final block */
        add             x9, x8, #32
        add             x8, x8, x4
        sub             x9, x9, x4
        add             x4, x0, x4              /* output address of final block */

        next_tweak      v5, v4, v8

        ld1             {v1.16b}, [x1]          /* load final block */
        ld1             {v2.16b}, [x8]
        ld1             {v3.16b}, [x9]

        eor             v0.16b, v0.16b, v5.16b
        decrypt_block   v0, w3, x2, x8, w7
        eor             v0.16b, v0.16b, v5.16b

        tbl             v2.16b, {v0.16b}, v2.16b
        tbx             v0.16b, {v1.16b}, v3.16b

        st1             {v2.16b}, [x4]                  /* overlapping stores */
        mov             w4, wzr
        b               .Lxtsdecctsout
AES_FUNC_END(aes_xts_decrypt)

        /*
         * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
         *                int blocks, u8 dg[], int enc_before, int enc_after)
         */
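        /*
         * CBC-MAC core: dg[] holds the running digest. enc_before forces
         * one extra encryption of the digest before any input is
         * absorbed, and enc_after decides whether the digest is
         * encrypted again after the last block, so a non-final call can
         * leave the tail unencrypted for a later update. In pseudo-C:
         *
         *      if (enc_before)
         *              dg = E(rk, dg);
         *      for (i = 0; i < blocks; i++) {
         *              dg ^= in[i];
         *              if (i < blocks - 1 || enc_after)
         *                      dg = E(rk, dg);
         *      }
         *
         * The routine may yield before all blocks are processed; the
         * number of blocks still to do is returned in w0 so the caller
         * can resume.
         */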
AES_FUNC_START(aes_mac_update)
        ld1             {v0.16b}, [x4]                  /* get dg */
        enc_prepare     w2, x1, x7
        cbz             w5, .Lmacloop4x

        encrypt_block   v0, w2, x1, x7, w8

.Lmacloop4x:
        subs            w3, w3, #4
        bmi             .Lmac1x
        ld1             {v1.16b-v4.16b}, [x0], #64      /* get next 4 pt blocks */
        eor             v0.16b, v0.16b, v1.16b          /* ..and xor with dg */
        encrypt_block   v0, w2, x1, x7, w8
        eor             v0.16b, v0.16b, v2.16b
        encrypt_block   v0, w2, x1, x7, w8
        eor             v0.16b, v0.16b, v3.16b
        encrypt_block   v0, w2, x1, x7, w8
        eor             v0.16b, v0.16b, v4.16b
        cmp             w3, wzr
        csinv           x5, x6, xzr, eq
        cbz             w5, .Lmacout
        encrypt_block   v0, w2, x1, x7, w8
        st1             {v0.16b}, [x4]                  /* return dg */
        cond_yield      .Lmacout, x7
        b               .Lmacloop4x
.Lmac1x:
        add             w3, w3, #4
.Lmacloop:
        cbz             w3, .Lmacout
        ld1             {v1.16b}, [x0], #16             /* get next pt block */
        eor             v0.16b, v0.16b, v1.16b          /* ..and xor with dg */

        subs            w3, w3, #1
        csinv           x5, x6, xzr, eq
        cbz             w5, .Lmacout

.Lmacenc:
        encrypt_block   v0, w2, x1, x7, w8
        b               .Lmacloop

.Lmacout:
        st1             {v0.16b}, [x4]                  /* return dg */
        mov             w0, w3
        ret
AES_FUNC_END(aes_mac_update)