/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/* included by aes-ce.S and aes-neon.S */

        .text
        .align          4

#ifndef MAX_STRIDE
#define MAX_STRIDE      4
#endif

#if MAX_STRIDE == 4
#define ST4(x...) x
#define ST5(x...)
#else
#define ST4(x...)
#define ST5(x...) x
#endif

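/*
 * Helpers wrapping the 4-way/5-way interleaved cipher cores: they
 * transform v0-v3 (plus v4 for the 5-way variants) in place, taking
 * the round count in w3 and the round key schedule in x2.
 */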
SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
        encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
        ret
SYM_FUNC_END(aes_encrypt_block4x)

SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
        decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
        ret
SYM_FUNC_END(aes_decrypt_block4x)

#if MAX_STRIDE == 5
SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
        encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
        ret
SYM_FUNC_END(aes_encrypt_block5x)

SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
        decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
        ret
SYM_FUNC_END(aes_decrypt_block5x)
#endif

        /*
         * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks)
         * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks)
         */

AES_FUNC_START(aes_ecb_encrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        enc_prepare     w3, x2, x5

.LecbencloopNx:
        subs            w4, w4, #MAX_STRIDE
        bmi             .Lecbenc1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
ST4(    bl              aes_encrypt_block4x             )
ST5(    ld1             {v4.16b}, [x1], #16             )
ST5(    bl              aes_encrypt_block5x             )
        st1             {v0.16b-v3.16b}, [x0], #64
ST5(    st1             {v4.16b}, [x0], #16             )
        b               .LecbencloopNx
.Lecbenc1x:
        adds            w4, w4, #MAX_STRIDE
        beq             .Lecbencout
.Lecbencloop:
        ld1             {v0.16b}, [x1], #16             /* get next pt block */
        encrypt_block   v0, w3, x2, x5, w6
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lecbencloop
.Lecbencout:
        ldp             x29, x30, [sp], #16
        ret
AES_FUNC_END(aes_ecb_encrypt)


AES_FUNC_START(aes_ecb_decrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        dec_prepare     w3, x2, x5

.LecbdecloopNx:
        subs            w4, w4, #MAX_STRIDE
        bmi             .Lecbdec1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
ST4(    bl              aes_decrypt_block4x             )
ST5(    ld1             {v4.16b}, [x1], #16             )
ST5(    bl              aes_decrypt_block5x             )
        st1             {v0.16b-v3.16b}, [x0], #64
ST5(    st1             {v4.16b}, [x0], #16             )
        b               .LecbdecloopNx
.Lecbdec1x:
        adds            w4, w4, #MAX_STRIDE
        beq             .Lecbdecout
.Lecbdecloop:
        ld1             {v0.16b}, [x1], #16             /* get next ct block */
        decrypt_block   v0, w3, x2, x5, w6
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lecbdecloop
.Lecbdecout:
        ldp             x29, x30, [sp], #16
        ret
AES_FUNC_END(aes_ecb_decrypt)


        /*
         * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks, u8 iv[])
         * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks, u8 iv[])
         * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
         *                       int rounds, int blocks, u8 iv[],
         *                       u32 const rk2[]);
         * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
         *                       int rounds, int blocks, u8 iv[],
         *                       u32 const rk2[]);
         */

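        /*
         * The ESSIV entry points first encrypt the IV with the second
         * key schedule (rk2), which is always an AES-256 key (hence the
         * hardcoded 14 rounds), to derive the per-sector IV, and then
         * fall through to the regular CBC code using the first key (rk1).
         */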
AES_FUNC_START(aes_essiv_cbc_encrypt)
        ld1             {v4.16b}, [x5]                  /* get iv */

        mov             w8, #14                         /* AES-256: 14 rounds */
        enc_prepare     w8, x6, x7
        encrypt_block   v4, w8, x6, x7, w9
        enc_switch_key  w3, x2, x6
        b               .Lcbcencloop4x

AES_FUNC_START(aes_cbc_encrypt)
        ld1             {v4.16b}, [x5]                  /* get iv */
        enc_prepare     w3, x2, x6

.Lcbcencloop4x:
        subs            w4, w4, #4
        bmi             .Lcbcenc1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
        eor             v0.16b, v0.16b, v4.16b          /* ..and xor with iv */
        encrypt_block   v0, w3, x2, x6, w7
        eor             v1.16b, v1.16b, v0.16b
        encrypt_block   v1, w3, x2, x6, w7
        eor             v2.16b, v2.16b, v1.16b
        encrypt_block   v2, w3, x2, x6, w7
        eor             v3.16b, v3.16b, v2.16b
        encrypt_block   v3, w3, x2, x6, w7
        st1             {v0.16b-v3.16b}, [x0], #64
        mov             v4.16b, v3.16b
        b               .Lcbcencloop4x
.Lcbcenc1x:
        adds            w4, w4, #4
        beq             .Lcbcencout
.Lcbcencloop:
        ld1             {v0.16b}, [x1], #16             /* get next pt block */
        eor             v4.16b, v4.16b, v0.16b          /* ..and xor with iv */
        encrypt_block   v4, w3, x2, x6, w7
        st1             {v4.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lcbcencloop
.Lcbcencout:
        st1             {v4.16b}, [x5]                  /* return iv */
        ret
AES_FUNC_END(aes_cbc_encrypt)
AES_FUNC_END(aes_essiv_cbc_encrypt)

AES_FUNC_START(aes_essiv_cbc_decrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        ld1             {cbciv.16b}, [x5]               /* get iv */

        mov             w8, #14                         /* AES-256: 14 rounds */
        enc_prepare     w8, x6, x7
        encrypt_block   cbciv, w8, x6, x7, w9
        b               .Lessivcbcdecstart

AES_FUNC_START(aes_cbc_decrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        ld1             {cbciv.16b}, [x5]               /* get iv */
.Lessivcbcdecstart:
        dec_prepare     w3, x2, x6

.LcbcdecloopNx:
        subs            w4, w4, #MAX_STRIDE
        bmi             .Lcbcdec1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
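        /*
         * The interleaved decrypt helpers clobber v0-v4, so stash copies
         * of the ciphertext blocks first: they are needed afterwards as
         * the CBC chaining values.  Blocks that cannot be kept in spare
         * registers are simply reloaded from the source buffer.
         */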
#if MAX_STRIDE == 5
        ld1             {v4.16b}, [x1], #16             /* get 1 ct block */
        mov             v5.16b, v0.16b
        mov             v6.16b, v1.16b
        mov             v7.16b, v2.16b
        bl              aes_decrypt_block5x
        sub             x1, x1, #32
        eor             v0.16b, v0.16b, cbciv.16b
        eor             v1.16b, v1.16b, v5.16b
        ld1             {v5.16b}, [x1], #16             /* reload 1 ct block */
        ld1             {cbciv.16b}, [x1], #16          /* reload 1 ct block */
        eor             v2.16b, v2.16b, v6.16b
        eor             v3.16b, v3.16b, v7.16b
        eor             v4.16b, v4.16b, v5.16b
#else
        mov             v4.16b, v0.16b
        mov             v5.16b, v1.16b
        mov             v6.16b, v2.16b
        bl              aes_decrypt_block4x
        sub             x1, x1, #16
        eor             v0.16b, v0.16b, cbciv.16b
        eor             v1.16b, v1.16b, v4.16b
        ld1             {cbciv.16b}, [x1], #16          /* reload 1 ct block */
        eor             v2.16b, v2.16b, v5.16b
        eor             v3.16b, v3.16b, v6.16b
#endif
        st1             {v0.16b-v3.16b}, [x0], #64
ST5(    st1             {v4.16b}, [x0], #16             )
        b               .LcbcdecloopNx
.Lcbcdec1x:
        adds            w4, w4, #MAX_STRIDE
        beq             .Lcbcdecout
.Lcbcdecloop:
        ld1             {v1.16b}, [x1], #16             /* get next ct block */
        mov             v0.16b, v1.16b                  /* ...and copy to v0 */
        decrypt_block   v0, w3, x2, x6, w7
        eor             v0.16b, v0.16b, cbciv.16b       /* xor with iv => pt */
        mov             cbciv.16b, v1.16b               /* ct is next iv */
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lcbcdecloop
.Lcbcdecout:
        st1             {cbciv.16b}, [x5]               /* return iv */
        ldp             x29, x30, [sp], #16
        ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)


        /*
         * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
         *                     int rounds, int bytes, u8 const iv[])
         * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
         *                     int rounds, int bytes, u8 const iv[])
         */

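        /*
         * Ciphertext stealing (CS3 style) for the final two blocks: the
         * input is read with overlapping loads and permuted via
         * .Lcts_permute_table so that the partial block is zero-padded,
         * and the last two ciphertext blocks are emitted in swapped
         * order through overlapping stores.
         */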
AES_FUNC_START(aes_cbc_cts_encrypt)
        adr_l           x8, .Lcts_permute_table
        sub             x4, x4, #16
        add             x9, x8, #32
        add             x8, x8, x4
        sub             x9, x9, x4
        ld1             {v3.16b}, [x8]
        ld1             {v4.16b}, [x9]

        ld1             {v0.16b}, [x1], x4              /* overlapping loads */
        ld1             {v1.16b}, [x1]

        ld1             {v5.16b}, [x5]                  /* get iv */
        enc_prepare     w3, x2, x6

        eor             v0.16b, v0.16b, v5.16b          /* xor with iv */
        tbl             v1.16b, {v1.16b}, v4.16b
        encrypt_block   v0, w3, x2, x6, w7

        eor             v1.16b, v1.16b, v0.16b
        tbl             v0.16b, {v0.16b}, v3.16b
        encrypt_block   v1, w3, x2, x6, w7

        add             x4, x0, x4
        st1             {v0.16b}, [x4]                  /* overlapping stores */
        st1             {v1.16b}, [x0]
        ret
AES_FUNC_END(aes_cbc_cts_encrypt)

AES_FUNC_START(aes_cbc_cts_decrypt)
        adr_l           x8, .Lcts_permute_table
        sub             x4, x4, #16
        add             x9, x8, #32
        add             x8, x8, x4
        sub             x9, x9, x4
        ld1             {v3.16b}, [x8]
        ld1             {v4.16b}, [x9]

        ld1             {v0.16b}, [x1], x4              /* overlapping loads */
        ld1             {v1.16b}, [x1]

        ld1             {v5.16b}, [x5]                  /* get iv */
        dec_prepare     w3, x2, x6

        decrypt_block   v0, w3, x2, x6, w7
        tbl             v2.16b, {v0.16b}, v3.16b
        eor             v2.16b, v2.16b, v1.16b

        tbx             v0.16b, {v1.16b}, v4.16b
        decrypt_block   v0, w3, x2, x6, w7
        eor             v0.16b, v0.16b, v5.16b          /* xor with iv */

        add             x4, x0, x4
        st1             {v2.16b}, [x4]                  /* overlapping stores */
        st1             {v0.16b}, [x0]
        ret
AES_FUNC_END(aes_cbc_cts_decrypt)

        .section        ".rodata", "a"
        .align          6
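        /*
         * Sliding window of byte indexes: a 16-byte vector loaded at
         * offset n (0 < n <= 16) yields 16 - n leading 0xff bytes
         * followed by the indexes 0 .. n-1, while offset 32 - n shifts
         * the indexes the other way.  Since tbl/tbx treat index 0xff as
         * "produce zero" / "leave the destination byte", this gives the
         * byte shifts and zero padding needed for CTS and for partial
         * final blocks in CTR/XCTR mode.
         */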
.Lcts_permute_table:
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .previous

        /*
         * This macro generates the code for CTR and XCTR mode.  CTR
         * treats the whole 16-byte IV as a big-endian counter, while
         * XCTR (the CTR variant used by HCTR2) XORs a little-endian
         * 64-bit block counter into the low half of the IV.
         */
.macro ctr_encrypt xctr
        // Arguments
        OUT             .req x0
        IN              .req x1
        KEY             .req x2
        ROUNDS_W        .req w3
        BYTES_W         .req w4
        IV              .req x5
        BYTE_CTR_W      .req w6         // XCTR only
        // Intermediate values
        CTR_W           .req w11        // XCTR only
        CTR             .req x11        // XCTR only
        IV_PART         .req x12
        BLOCKS          .req x13
        BLOCKS_W        .req w13

        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        enc_prepare     ROUNDS_W, KEY, IV_PART
        ld1             {vctr.16b}, [IV]

        /*
         * Keep 64 bits of the IV in a register.  For CTR mode this lets us
         * easily increment the IV.  For XCTR mode this lets us efficiently XOR
         * the 64-bit counter with the IV.
         */
        .if \xctr
                umov            IV_PART, vctr.d[0]
                lsr             CTR_W, BYTE_CTR_W, #4
        .else
                umov            IV_PART, vctr.d[1]
                rev             IV_PART, IV_PART
        .endif

.LctrloopNx\xctr:
        add             BLOCKS_W, BYTES_W, #15
        sub             BYTES_W, BYTES_W, #MAX_STRIDE << 4
        lsr             BLOCKS_W, BLOCKS_W, #4
        mov             w8, #MAX_STRIDE
        cmp             BLOCKS_W, w8
        csel            BLOCKS_W, BLOCKS_W, w8, lt
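        /* i.e. BLOCKS = min(DIV_ROUND_UP(bytes, 16), MAX_STRIDE) */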

        /*
         * Set up the counter values in v0-v{MAX_STRIDE-1}.
         *
         * If we are encrypting less than MAX_STRIDE blocks, the tail block
         * handling code expects the last keystream block to be in
         * v{MAX_STRIDE-1}.  For example: if encrypting two blocks with
         * MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks.
         */
        .if \xctr
                add             CTR, CTR, BLOCKS
        .else
                adds            IV_PART, IV_PART, BLOCKS
        .endif
        mov             v0.16b, vctr.16b
        mov             v1.16b, vctr.16b
        mov             v2.16b, vctr.16b
        mov             v3.16b, vctr.16b
ST5(    mov             v4.16b, vctr.16b                )
        .if \xctr
                sub             x6, CTR, #MAX_STRIDE - 1
                sub             x7, CTR, #MAX_STRIDE - 2
                sub             x8, CTR, #MAX_STRIDE - 3
                sub             x9, CTR, #MAX_STRIDE - 4
ST5(            sub             x10, CTR, #MAX_STRIDE - 5       )
                eor             x6, x6, IV_PART
                eor             x7, x7, IV_PART
                eor             x8, x8, IV_PART
                eor             x9, x9, IV_PART
ST5(            eor             x10, x10, IV_PART               )
                mov             v0.d[0], x6
                mov             v1.d[0], x7
                mov             v2.d[0], x8
                mov             v3.d[0], x9
ST5(            mov             v4.d[0], x10                    )
        .else
                bcs             0f
                .subsection     1
                /*
                 * This subsection handles carries.
                 *
                 * Conditional branching here is allowed with respect to time
                 * invariance since the branches are dependent on the IV instead
                 * of the plaintext or key.  This code is rarely executed in
                 * practice anyway.
                 */

                /* Apply carry to outgoing counter. */
0:              umov            x8, vctr.d[0]
                rev             x8, x8
                add             x8, x8, #1
                rev             x8, x8
                ins             vctr.d[0], x8

                /*
                 * Apply carry to counter blocks if needed.
                 *
                 * Since the carry flag was set, we know 0 <= IV_PART <
                 * MAX_STRIDE.  Using the value of IV_PART we can determine how
                 * many counter blocks need to be updated.
                 */
                cbz             IV_PART, 2f
                adr             x16, 1f
                sub             x16, x16, IV_PART, lsl #3
                br              x16
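                /*
                 * Each (bti, mov) pair below is 8 bytes, so the computed
                 * branch above lands such that exactly the last IV_PART
                 * pairs execute, giving the incremented upper counter
                 * word to precisely the blocks whose lower 64 counter
                 * bits wrapped.
                 */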
                bti             c
                mov             v0.d[0], vctr.d[0]
                bti             c
                mov             v1.d[0], vctr.d[0]
                bti             c
                mov             v2.d[0], vctr.d[0]
                bti             c
                mov             v3.d[0], vctr.d[0]
ST5(            bti             c                               )
ST5(            mov             v4.d[0], vctr.d[0]              )
1:              b               2f
                .previous

2:              rev             x7, IV_PART
                ins             vctr.d[1], x7
                sub             x7, IV_PART, #MAX_STRIDE - 1
                sub             x8, IV_PART, #MAX_STRIDE - 2
                sub             x9, IV_PART, #MAX_STRIDE - 3
                rev             x7, x7
                rev             x8, x8
                mov             v1.d[1], x7
                rev             x9, x9
ST5(            sub             x10, IV_PART, #MAX_STRIDE - 4   )
                mov             v2.d[1], x8
ST5(            rev             x10, x10                        )
                mov             v3.d[1], x9
ST5(            mov             v4.d[1], x10                    )
        .endif

        /*
         * If there are at least MAX_STRIDE blocks left, XOR the data with
         * keystream and store.  Otherwise jump to tail handling.
         */
        tbnz            BYTES_W, #31, .Lctrtail\xctr
        ld1             {v5.16b-v7.16b}, [IN], #48
ST4(    bl              aes_encrypt_block4x             )
ST5(    bl              aes_encrypt_block5x             )
        eor             v0.16b, v5.16b, v0.16b
ST4(    ld1             {v5.16b}, [IN], #16             )
        eor             v1.16b, v6.16b, v1.16b
ST5(    ld1             {v5.16b-v6.16b}, [IN], #32      )
        eor             v2.16b, v7.16b, v2.16b
        eor             v3.16b, v5.16b, v3.16b
ST5(    eor             v4.16b, v6.16b, v4.16b          )
        st1             {v0.16b-v3.16b}, [OUT], #64
ST5(    st1             {v4.16b}, [OUT], #16            )
        cbz             BYTES_W, .Lctrout\xctr
        b               .LctrloopNx\xctr

.Lctrout\xctr:
        .if !\xctr
                st1             {vctr.16b}, [IV] /* return next CTR value */
        .endif
        ldp             x29, x30, [sp], #16
        ret

.Lctrtail\xctr:
        /*
         * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
         *
         * This code expects the last keystream block to be in v{MAX_STRIDE-1}.
         * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and
         * v4 should have the next two counter blocks.
         *
         * This allows us to store the ciphertext by writing to overlapping
         * regions of memory.  Any invalid ciphertext blocks get overwritten by
         * correctly computed blocks.  This approach greatly simplifies the
         * logic for storing the ciphertext.
         */
        mov             x16, #16
        ands            w7, BYTES_W, #0xf
        csel            x13, x7, x16, ne

ST5(    cmp             BYTES_W, #64 - (MAX_STRIDE << 4))
ST5(    csel            x14, x16, xzr, gt               )
        cmp             BYTES_W, #48 - (MAX_STRIDE << 4)
        csel            x15, x16, xzr, gt
        cmp             BYTES_W, #32 - (MAX_STRIDE << 4)
        csel            x16, x16, xzr, gt
        cmp             BYTES_W, #16 - (MAX_STRIDE << 4)
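        /*
         * x13 is now the size of the final (possibly partial) block,
         * and x14-x16 are 16 or 0 depending on whether a full block
         * remains for the corresponding slot, so the loads and stores
         * below only advance past blocks that actually exist.
         */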

        adr_l           x9, .Lcts_permute_table
        add             x9, x9, x13
        ble             .Lctrtail1x\xctr

ST5(    ld1             {v5.16b}, [IN], x14             )
        ld1             {v6.16b}, [IN], x15
        ld1             {v7.16b}, [IN], x16

ST4(    bl              aes_encrypt_block4x             )
ST5(    bl              aes_encrypt_block5x             )

        ld1             {v8.16b}, [IN], x13
        ld1             {v9.16b}, [IN]
        ld1             {v10.16b}, [x9]

ST4(    eor             v6.16b, v6.16b, v0.16b          )
ST4(    eor             v7.16b, v7.16b, v1.16b          )
ST4(    tbl             v3.16b, {v3.16b}, v10.16b       )
ST4(    eor             v8.16b, v8.16b, v2.16b          )
ST4(    eor             v9.16b, v9.16b, v3.16b          )

ST5(    eor             v5.16b, v5.16b, v0.16b          )
ST5(    eor             v6.16b, v6.16b, v1.16b          )
ST5(    tbl             v4.16b, {v4.16b}, v10.16b       )
ST5(    eor             v7.16b, v7.16b, v2.16b          )
ST5(    eor             v8.16b, v8.16b, v3.16b          )
ST5(    eor             v9.16b, v9.16b, v4.16b          )

ST5(    st1             {v5.16b}, [OUT], x14            )
        st1             {v6.16b}, [OUT], x15
        st1             {v7.16b}, [OUT], x16
        add             x13, x13, OUT
        st1             {v9.16b}, [x13]         // overlapping stores
        st1             {v8.16b}, [OUT]
        b               .Lctrout\xctr

.Lctrtail1x\xctr:
        /*
         * Handle <= 16 bytes of plaintext
         *
         * This code always reads and writes 16 bytes.  To avoid out of bounds
         * accesses, XCTR and CTR modes must use a temporary buffer when
         * encrypting/decrypting less than 16 bytes.
         *
         * This code is unusual in that it loads the input and stores the output
         * relative to the end of the buffers rather than relative to the start.
         * This causes unusual behaviour when encrypting/decrypting less than 16
         * bytes; the end of the data is expected to be at the end of the
         * temporary buffer rather than the start of the data being at the start
         * of the temporary buffer.
         */
        sub             x8, x7, #16
        csel            x7, x7, x8, eq
        add             IN, IN, x7
        add             OUT, OUT, x7
        ld1             {v5.16b}, [IN]
        ld1             {v6.16b}, [OUT]
ST5(    mov             v3.16b, v4.16b                  )
        encrypt_block   v3, ROUNDS_W, KEY, x8, w7
        ld1             {v10.16b-v11.16b}, [x9]
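        /*
         * v10 aligns the keystream with the data sitting at the end of
         * the 16-byte window; v11, arithmetic-shifted into all-ones /
         * all-zeroes bytes, becomes the mask that makes bif preserve
         * the bytes preceding the data.
         */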
        tbl             v3.16b, {v3.16b}, v10.16b
        sshr            v11.16b, v11.16b, #7
        eor             v5.16b, v5.16b, v3.16b
        bif             v5.16b, v6.16b, v11.16b
        st1             {v5.16b}, [OUT]
        b               .Lctrout\xctr

        // Arguments
        .unreq OUT
        .unreq IN
        .unreq KEY
        .unreq ROUNDS_W
        .unreq BYTES_W
        .unreq IV
        .unreq BYTE_CTR_W       // XCTR only
        // Intermediate values
        .unreq CTR_W            // XCTR only
        .unreq CTR              // XCTR only
        .unreq IV_PART
        .unreq BLOCKS
        .unreq BLOCKS_W
.endm

        /*
         * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int bytes, u8 ctr[])
         *
         * The input and output buffers must always be at least 16 bytes even if
         * encrypting/decrypting less than 16 bytes.  Otherwise out of bounds
         * accesses will occur.  The data to be encrypted/decrypted is expected
         * to be at the end of this 16-byte temporary buffer rather than the
         * start.
         */

AES_FUNC_START(aes_ctr_encrypt)
        ctr_encrypt 0
AES_FUNC_END(aes_ctr_encrypt)

        /*
         * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int bytes, u8 const iv[], int byte_ctr)
         *
         * The input and output buffers must always be at least 16 bytes even if
         * encrypting/decrypting less than 16 bytes.  Otherwise out of bounds
         * accesses will occur.  The data to be encrypted/decrypted is expected
         * to be at the end of this 16-byte temporary buffer rather than the
         * start.
         */

AES_FUNC_START(aes_xctr_encrypt)
        ctr_encrypt 1
AES_FUNC_END(aes_xctr_encrypt)


        /*
         * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
         *                 int bytes, u8 const rk2[], u8 iv[], int first)
         * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
         *                 int bytes, u8 const rk2[], u8 iv[], int first)
         */

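        /*
         * next_tweak computes the next XTS tweak, i.e. multiplies the
         * 128-bit tweak by x in GF(2^128) with reduction polynomial
         * x^128 + x^7 + x^2 + x + 1: both 64-bit halves are doubled,
         * the bit shifted out of the low half is carried into the high
         * half, and the bit shifted out of the high half is folded back
         * into the low byte as 0x87.  xtsmask (set up by xts_load_mask)
         * holds the {0x1, 0x87} constants this requires.
         */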
        .macro          next_tweak, out, in, tmp
        sshr            \tmp\().2d,  \in\().2d,   #63
        and             \tmp\().16b, \tmp\().16b, xtsmask.16b
        add             \out\().2d,  \in\().2d,   \in\().2d
        ext             \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
        eor             \out\().16b, \out\().16b, \tmp\().16b
        .endm

        .macro          xts_load_mask, tmp
        movi            xtsmask.2s, #0x1
        movi            \tmp\().2s, #0x87
        uzp1            xtsmask.4s, xtsmask.4s, \tmp\().4s
        .endm

AES_FUNC_START(aes_xts_encrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        ld1             {v4.16b}, [x6]
        xts_load_mask   v8
        cbz             w7, .Lxtsencnotfirst

        enc_prepare     w3, x5, x8
        xts_cts_skip_tw w7, .LxtsencNx
        encrypt_block   v4, w3, x5, x8, w7              /* first tweak */
        enc_switch_key  w3, x2, x8
        b               .LxtsencNx

.Lxtsencnotfirst:
        enc_prepare     w3, x2, x8
.LxtsencloopNx:
        next_tweak      v4, v4, v8
.LxtsencNx:
        subs            w4, w4, #64
        bmi             .Lxtsenc1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
        next_tweak      v5, v4, v8
        eor             v0.16b, v0.16b, v4.16b
        next_tweak      v6, v5, v8
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        next_tweak      v7, v6, v8
        eor             v3.16b, v3.16b, v7.16b
        bl              aes_encrypt_block4x
        eor             v3.16b, v3.16b, v7.16b
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        st1             {v0.16b-v3.16b}, [x0], #64
        mov             v4.16b, v7.16b
        cbz             w4, .Lxtsencret
        xts_reload_mask v8
        b               .LxtsencloopNx
.Lxtsenc1x:
        adds            w4, w4, #64
        beq             .Lxtsencout
        subs            w4, w4, #16
        bmi             .LxtsencctsNx
.Lxtsencloop:
        ld1             {v0.16b}, [x1], #16
.Lxtsencctsout:
        eor             v0.16b, v0.16b, v4.16b
        encrypt_block   v0, w3, x2, x8, w7
        eor             v0.16b, v0.16b, v4.16b
        cbz             w4, .Lxtsencout
        subs            w4, w4, #16
        next_tweak      v4, v4, v8
        bmi             .Lxtsenccts
        st1             {v0.16b}, [x0], #16
        b               .Lxtsencloop
.Lxtsencout:
        st1             {v0.16b}, [x0]
.Lxtsencret:
        st1             {v4.16b}, [x6]
        ldp             x29, x30, [sp], #16
        ret

.LxtsencctsNx:
        mov             v0.16b, v3.16b
        sub             x0, x0, #16
.Lxtsenccts:
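        /*
         * Ciphertext stealing: v0 holds the previous ciphertext block.
         * Its leading bytes are emitted as the final partial block, its
         * stolen tail is merged (tbx) with the remaining plaintext
         * bytes, and the merged block is encrypted with the last tweak
         * via .Lxtsencctsout.
         */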
        adr_l           x8, .Lcts_permute_table

        add             x1, x1, w4, sxtw        /* rewind input pointer */
        add             w4, w4, #16             /* # bytes in final block */
        add             x9, x8, #32
        add             x8, x8, x4
        sub             x9, x9, x4
        add             x4, x0, x4              /* output address of final block */

        ld1             {v1.16b}, [x1]          /* load final block */
        ld1             {v2.16b}, [x8]
        ld1             {v3.16b}, [x9]

        tbl             v2.16b, {v0.16b}, v2.16b
        tbx             v0.16b, {v1.16b}, v3.16b
        st1             {v2.16b}, [x4]                  /* overlapping stores */
        mov             w4, wzr
        b               .Lxtsencctsout
AES_FUNC_END(aes_xts_encrypt)

AES_FUNC_START(aes_xts_decrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        /* subtract 16 bytes if we are doing CTS */
        sub             w8, w4, #0x10
        tst             w4, #0xf
        csel            w4, w4, w8, eq

        ld1             {v4.16b}, [x6]
        xts_load_mask   v8
        xts_cts_skip_tw w7, .Lxtsdecskiptw
        cbz             w7, .Lxtsdecnotfirst

        enc_prepare     w3, x5, x8
        encrypt_block   v4, w3, x5, x8, w7              /* first tweak */
.Lxtsdecskiptw:
        dec_prepare     w3, x2, x8
        b               .LxtsdecNx

.Lxtsdecnotfirst:
        dec_prepare     w3, x2, x8
.LxtsdecloopNx:
        next_tweak      v4, v4, v8
.LxtsdecNx:
        subs            w4, w4, #64
        bmi             .Lxtsdec1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
        next_tweak      v5, v4, v8
        eor             v0.16b, v0.16b, v4.16b
        next_tweak      v6, v5, v8
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        next_tweak      v7, v6, v8
        eor             v3.16b, v3.16b, v7.16b
        bl              aes_decrypt_block4x
        eor             v3.16b, v3.16b, v7.16b
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        st1             {v0.16b-v3.16b}, [x0], #64
        mov             v4.16b, v7.16b
        cbz             w4, .Lxtsdecout
        xts_reload_mask v8
        b               .LxtsdecloopNx
.Lxtsdec1x:
        adds            w4, w4, #64
        beq             .Lxtsdecout
        subs            w4, w4, #16
.Lxtsdecloop:
        ld1             {v0.16b}, [x1], #16
        bmi             .Lxtsdeccts
.Lxtsdecctsout:
        eor             v0.16b, v0.16b, v4.16b
        decrypt_block   v0, w3, x2, x8, w7
        eor             v0.16b, v0.16b, v4.16b
        st1             {v0.16b}, [x0], #16
        cbz             w4, .Lxtsdecout
        subs            w4, w4, #16
        next_tweak      v4, v4, v8
        b               .Lxtsdecloop
.Lxtsdecout:
        st1             {v4.16b}, [x6]
        ldp             x29, x30, [sp], #16
        ret

.Lxtsdeccts:
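        /*
         * Ciphertext stealing: the tweak order is swapped.  The last
         * complete ciphertext block (v0) is decrypted with the final
         * tweak (v5), ciphertext is stolen from it to pad the partial
         * block, and the merged block is then decrypted with the
         * preceding tweak (v4) via .Lxtsdecctsout.
         */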
        adr_l           x8, .Lcts_permute_table

        add             x1, x1, w4, sxtw        /* rewind input pointer */
        add             w4, w4, #16             /* # bytes in final block */
        add             x9, x8, #32
        add             x8, x8, x4
        sub             x9, x9, x4
        add             x4, x0, x4              /* output address of final block */

        next_tweak      v5, v4, v8

        ld1             {v1.16b}, [x1]          /* load final block */
        ld1             {v2.16b}, [x8]
        ld1             {v3.16b}, [x9]

        eor             v0.16b, v0.16b, v5.16b
        decrypt_block   v0, w3, x2, x8, w7
        eor             v0.16b, v0.16b, v5.16b

        tbl             v2.16b, {v0.16b}, v2.16b
        tbx             v0.16b, {v1.16b}, v3.16b

        st1             {v2.16b}, [x4]                  /* overlapping stores */
        mov             w4, wzr
        b               .Lxtsdecctsout
AES_FUNC_END(aes_xts_decrypt)

        /*
         * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
         *                int blocks, u8 dg[], int enc_before, int enc_after)
         */
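        /*
         * CBC-MAC core: dg is XORed with each input block and encrypted.
         * enc_before forces an extra encryption of dg on entry; when
         * enc_after is zero, encryption of the final XOR result is left
         * to the caller (as needed for CMAC/XCBC finalization).  Returns
         * the number of blocks still to be processed if the loop had to
         * yield early.
         */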
AES_FUNC_START(aes_mac_update)
        ld1             {v0.16b}, [x4]                  /* get dg */
        enc_prepare     w2, x1, x7
        cbz             w5, .Lmacloop4x

        encrypt_block   v0, w2, x1, x7, w8

.Lmacloop4x:
        subs            w3, w3, #4
        bmi             .Lmac1x
        ld1             {v1.16b-v4.16b}, [x0], #64      /* get next 4 pt blocks */
        eor             v0.16b, v0.16b, v1.16b          /* ..and xor with dg */
        encrypt_block   v0, w2, x1, x7, w8
        eor             v0.16b, v0.16b, v2.16b
        encrypt_block   v0, w2, x1, x7, w8
        eor             v0.16b, v0.16b, v3.16b
        encrypt_block   v0, w2, x1, x7, w8
        eor             v0.16b, v0.16b, v4.16b
        cmp             w3, wzr
        csinv           x5, x6, xzr, eq
        cbz             w5, .Lmacout
        encrypt_block   v0, w2, x1, x7, w8
        st1             {v0.16b}, [x4]                  /* return dg */
        cond_yield      .Lmacout, x7, x8
        b               .Lmacloop4x
.Lmac1x:
        add             w3, w3, #4
.Lmacloop:
        cbz             w3, .Lmacout
        ld1             {v1.16b}, [x0], #16             /* get next pt block */
        eor             v0.16b, v0.16b, v1.16b          /* ..and xor with dg */

        subs            w3, w3, #1
        csinv           x5, x6, xzr, eq
        cbz             w5, .Lmacout

.Lmacenc:
        encrypt_block   v0, w2, x1, x7, w8
        b               .Lmacloop

.Lmacout:
        st1             {v0.16b}, [x4]                  /* return dg */
        mov             w0, w3
        ret
AES_FUNC_END(aes_mac_update)