/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/* included by aes-ce.S and aes-neon.S */
SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block4x)

SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block4x)

SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block5x)

SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block5x)
/*
 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks)
 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks)
 */
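
/*
 * Reference model (not part of the build): a minimal C sketch of what the
 * ECB routines below compute, assuming a hypothetical single block helper
 * aes_encrypt_one(rk, rounds, src, dst).  Every 16-byte block is processed
 * independently, which is what permits the 4-way/5-way interleaved helpers
 * above.
 *
 *	static void ecb_encrypt_model(u8 *out, const u8 *in, const u8 *rk,
 *				      int rounds, int blocks)
 *	{
 *		while (blocks--) {
 *			aes_encrypt_one(rk, rounds, in, out);	// out = E_k(in)
 *			in  += 16;
 *			out += 16;
 *		}
 *	}
 */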
AES_FUNC_START(aes_ecb_encrypt)
	stp x29, x30, [sp, #-16]!

	enc_prepare w3, x2, x5

	subs w4, w4, #MAX_STRIDE
	ld1 {v0.16b-v3.16b}, [x1], #64		/* get 4 pt blocks */
ST4(	bl aes_encrypt_block4x )
ST5(	ld1 {v4.16b}, [x1], #16 )
ST5(	bl aes_encrypt_block5x )
	st1 {v0.16b-v3.16b}, [x0], #64
ST5(	st1 {v4.16b}, [x0], #16 )

	adds w4, w4, #MAX_STRIDE
	ld1 {v0.16b}, [x1], #16			/* get next pt block */
	encrypt_block v0, w3, x2, x5, w6
	st1 {v0.16b}, [x0], #16

	ldp x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_ecb_encrypt)


AES_FUNC_START(aes_ecb_decrypt)
	stp x29, x30, [sp, #-16]!

	dec_prepare w3, x2, x5

	subs w4, w4, #MAX_STRIDE
	ld1 {v0.16b-v3.16b}, [x1], #64		/* get 4 ct blocks */
ST4(	bl aes_decrypt_block4x )
ST5(	ld1 {v4.16b}, [x1], #16 )
ST5(	bl aes_decrypt_block5x )
	st1 {v0.16b-v3.16b}, [x0], #64
ST5(	st1 {v4.16b}, [x0], #16 )

	adds w4, w4, #MAX_STRIDE
	ld1 {v0.16b}, [x1], #16			/* get next ct block */
	decrypt_block v0, w3, x2, x5, w6
	st1 {v0.16b}, [x0], #16

	ldp x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_ecb_decrypt)
/*
 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks, u8 iv[])
 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks, u8 iv[])
 * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
 *			 int rounds, int blocks, u8 iv[],
 *			 u32 const rk2[])
 * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
 *			 int rounds, int blocks, u8 iv[],
 *			 u32 const rk2[])
 */
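
/*
 * Reference model (not part of the build): a minimal C sketch of the CBC
 * chaining implemented below, assuming hypothetical helpers
 * aes_encrypt_one(rk, rounds, src, dst) and xor_block().  The ESSIV variants
 * only differ in how the IV is formed: the caller-provided IV is first
 * encrypted with the separate AES-256 key schedule rk2 before ordinary CBC
 * processing begins.
 *
 *	static void xor_block(u8 dst[16], const u8 a[16], const u8 b[16])
 *	{
 *		for (int i = 0; i < 16; i++)
 *			dst[i] = a[i] ^ b[i];
 *	}
 *
 *	static void cbc_encrypt_model(u8 *out, const u8 *in, const u8 *rk,
 *				      int rounds, int blocks, u8 iv[16])
 *	{
 *		u8 buf[16];
 *
 *		while (blocks--) {
 *			xor_block(buf, in, iv);			// pt ^ iv
 *			aes_encrypt_one(rk, rounds, buf, out);	// ct = E_k(pt ^ iv)
 *			memcpy(iv, out, 16);			// ct is the next iv
 *			in  += 16;
 *			out += 16;
 *		}
 *	}
 *
 * Decryption is the mirror image, pt[i] = D_k(ct[i]) ^ ct[i-1]: each output
 * block depends only on ciphertext, which is why the decrypt path below can
 * push 4 or 5 blocks through the interleaved helpers at once.
 */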
AES_FUNC_START(aes_essiv_cbc_encrypt)
	ld1 {v4.16b}, [x5]			/* get iv */

	mov w8, #14				/* AES-256: 14 rounds */
	enc_prepare w8, x6, x7
	encrypt_block v4, w8, x6, x7, w9
	enc_switch_key w3, x2, x6

AES_FUNC_START(aes_cbc_encrypt)
	ld1 {v4.16b}, [x5]			/* get iv */
	enc_prepare w3, x2, x6

	ld1 {v0.16b-v3.16b}, [x1], #64		/* get 4 pt blocks */
	eor v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
	encrypt_block v0, w3, x2, x6, w7
	eor v1.16b, v1.16b, v0.16b
	encrypt_block v1, w3, x2, x6, w7
	eor v2.16b, v2.16b, v1.16b
	encrypt_block v2, w3, x2, x6, w7
	eor v3.16b, v3.16b, v2.16b
	encrypt_block v3, w3, x2, x6, w7
	st1 {v0.16b-v3.16b}, [x0], #64

	ld1 {v0.16b}, [x1], #16			/* get next pt block */
	eor v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
	encrypt_block v4, w3, x2, x6, w7
	st1 {v4.16b}, [x0], #16

	st1 {v4.16b}, [x5]			/* return iv */
	ret
AES_FUNC_END(aes_cbc_encrypt)
AES_FUNC_END(aes_essiv_cbc_encrypt)

AES_FUNC_START(aes_essiv_cbc_decrypt)
	stp x29, x30, [sp, #-16]!

	ld1 {cbciv.16b}, [x5]			/* get iv */

	mov w8, #14				/* AES-256: 14 rounds */
	enc_prepare w8, x6, x7
	encrypt_block cbciv, w8, x6, x7, w9

AES_FUNC_START(aes_cbc_decrypt)
	stp x29, x30, [sp, #-16]!

	ld1 {cbciv.16b}, [x5]			/* get iv */

	dec_prepare w3, x2, x6

	subs w4, w4, #MAX_STRIDE
	ld1 {v0.16b-v3.16b}, [x1], #64		/* get 4 ct blocks */

	ld1 {v4.16b}, [x1], #16			/* get 1 ct block */
	bl aes_decrypt_block5x
	eor v0.16b, v0.16b, cbciv.16b
	eor v1.16b, v1.16b, v5.16b
	ld1 {v5.16b}, [x1], #16			/* reload 1 ct block */
	ld1 {cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor v2.16b, v2.16b, v6.16b
	eor v3.16b, v3.16b, v7.16b
	eor v4.16b, v4.16b, v5.16b

	bl aes_decrypt_block4x
	eor v0.16b, v0.16b, cbciv.16b
	eor v1.16b, v1.16b, v4.16b
	ld1 {cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor v2.16b, v2.16b, v5.16b
	eor v3.16b, v3.16b, v6.16b

	st1 {v0.16b-v3.16b}, [x0], #64
ST5(	st1 {v4.16b}, [x0], #16 )

	adds w4, w4, #MAX_STRIDE
	ld1 {v1.16b}, [x1], #16			/* get next ct block */
	mov v0.16b, v1.16b			/* ...and copy to v0 */
	decrypt_block v0, w3, x2, x6, w7
	eor v0.16b, v0.16b, cbciv.16b		/* xor with iv => pt */
	mov cbciv.16b, v1.16b			/* ct is next iv */
	st1 {v0.16b}, [x0], #16

	st1 {cbciv.16b}, [x5]			/* return iv */
	ldp x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)
/*
 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
 *		       int rounds, int bytes, u8 const iv[])
 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
 *		       int rounds, int bytes, u8 const iv[])
 */
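
/*
 * Reference model (not part of the build): a minimal C sketch of CBC
 * ciphertext stealing for the trailing chunk, assuming the hypothetical
 * aes_encrypt_one() and xor_block() helpers named in the CBC sketch above
 * and a tail of 1-15 bytes after the last full block.  The bulk of the data
 * goes through aes_cbc_encrypt(); only the last full block plus the partial
 * block take this path, and the last two ciphertext blocks come out swapped
 * so that no padding bytes are ever written.  The asm below achieves the
 * same result with overlapping loads/stores and TBL/TBX shuffles instead of
 * memcpy().
 *
 *	static void cbc_cts_encrypt_model(u8 *out, const u8 *in, const u8 *rk,
 *					  int rounds, int bytes,
 *					  const u8 iv[16])
 *	{
 *		int tail = bytes - 16;			// 1..15 bytes
 *		u8 x[16], y[16] = { };
 *
 *		xor_block(x, in, iv);			// P[n-1] ^ iv
 *		aes_encrypt_one(rk, rounds, x, x);
 *
 *		memcpy(y, in + 16, tail);		// zero padded P[n]
 *		xor_block(y, y, x);
 *		aes_encrypt_one(rk, rounds, y, out);	// C[n-1], full block
 *		memcpy(out + 16, x, tail);		// C[n], the stolen bytes
 *	}
 */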
AES_FUNC_START(aes_cbc_cts_encrypt)
	adr_l x8, .Lcts_permute_table

	ld1 {v0.16b}, [x1], x4			/* overlapping loads */

	ld1 {v5.16b}, [x5]			/* get iv */
	enc_prepare w3, x2, x6

	eor v0.16b, v0.16b, v5.16b		/* xor with iv */
	tbl v1.16b, {v1.16b}, v4.16b
	encrypt_block v0, w3, x2, x6, w7

	eor v1.16b, v1.16b, v0.16b
	tbl v0.16b, {v0.16b}, v3.16b
	encrypt_block v1, w3, x2, x6, w7

	st1 {v0.16b}, [x4]			/* overlapping stores */
AES_FUNC_END(aes_cbc_cts_encrypt)

AES_FUNC_START(aes_cbc_cts_decrypt)
	adr_l x8, .Lcts_permute_table

	ld1 {v0.16b}, [x1], x4			/* overlapping loads */

	ld1 {v5.16b}, [x5]			/* get iv */
	dec_prepare w3, x2, x6

	decrypt_block v0, w3, x2, x6, w7
	tbl v2.16b, {v0.16b}, v3.16b
	eor v2.16b, v2.16b, v1.16b

	tbx v0.16b, {v1.16b}, v4.16b
	decrypt_block v0, w3, x2, x6, w7
	eor v0.16b, v0.16b, v5.16b		/* xor with iv */

	st1 {v2.16b}, [x4]			/* overlapping stores */
AES_FUNC_END(aes_cbc_cts_decrypt)
	.section	".rodata", "a"
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	.byte		0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.previous
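
/*
 * Usage note (an observation, not taken verbatim from the sources): reading
 * 16 bytes from .Lcts_permute_table at a variable offset yields a shifted
 * identity permutation padded with 0xff.  With TBL an out-of-range index
 * produces a zero byte and with TBX it leaves the destination byte
 * untouched, so a single shuffle can align a partial block or merge it into
 * an existing one, which is how the CTS and CTR tail paths avoid
 * byte-by-byte copies.
 */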
/*
 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int bytes, u8 ctr[], u8 finalbuf[])
 */
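
/*
 * Reference model (not part of the build): a minimal C sketch of CTR mode,
 * assuming the hypothetical aes_encrypt_one() helper named above.  The
 * counter is a 128-bit big-endian value; the asm below keeps the byte
 * swapped low 64 bits in a general purpose register and only takes a slow
 * path when those 64 bits overflow, and a final partial block is written to
 * finalbuf rather than out[].
 *
 *	static void ctr_encrypt_model(u8 *out, const u8 *in, const u8 *rk,
 *				      int rounds, int bytes, u8 ctr[16])
 *	{
 *		u8 ks[16];
 *
 *		while (bytes > 0) {
 *			int i, n = bytes < 16 ? bytes : 16;
 *
 *			aes_encrypt_one(rk, rounds, ctr, ks);	// keystream block
 *			for (i = 0; i < n; i++)
 *				out[i] = in[i] ^ ks[i];
 *			for (i = 15; i >= 0; i--)		// big-endian increment
 *				if (++ctr[i])
 *					break;
 *			in += n;
 *			out += n;
 *			bytes -= n;
 *		}
 *	}
 */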
AES_FUNC_START(aes_ctr_encrypt)
	stp x29, x30, [sp, #-16]!

	enc_prepare w3, x2, x12

	umov x12, vctr.d[1]			/* keep swabbed ctr in reg */

	sub w4, w4, #MAX_STRIDE << 4
ST5(	mov v4.16b, vctr.16b )

	/* apply carry to outgoing counter */
0:	umov x8, vctr.d[0]

	/* apply carry to N counter blocks for N := x12 */
	sub x16, x16, x12, lsl #3

	mov v0.d[0], vctr.d[0]
	mov v1.d[0], vctr.d[0]
	mov v2.d[0], vctr.d[0]
	mov v3.d[0], vctr.d[0]
ST5(	mov v4.d[0], vctr.d[0] )

	sub x7, x12, #MAX_STRIDE - 1
	sub x8, x12, #MAX_STRIDE - 2
	sub x9, x12, #MAX_STRIDE - 3
ST5(	sub x10, x12, #MAX_STRIDE - 4 )
ST5(	mov v4.d[1], x10 )

	tbnz w4, #31, .Lctrtail
	ld1 {v5.16b-v7.16b}, [x1], #48
ST4(	bl aes_encrypt_block4x )
ST5(	bl aes_encrypt_block5x )
	eor v0.16b, v5.16b, v0.16b
ST4(	ld1 {v5.16b}, [x1], #16 )
	eor v1.16b, v6.16b, v1.16b
ST5(	ld1 {v5.16b-v6.16b}, [x1], #32 )
	eor v2.16b, v7.16b, v2.16b
	eor v3.16b, v5.16b, v3.16b
ST5(	eor v4.16b, v6.16b, v4.16b )
	st1 {v0.16b-v3.16b}, [x0], #64
ST5(	st1 {v4.16b}, [x0], #16 )

	st1 {vctr.16b}, [x5]			/* return next CTR value */
	ldp x29, x30, [sp], #16
	ret

.Lctrtail:
	/* XOR up to MAX_STRIDE * 16 - 1 bytes of in/output with v0 ... v3/v4 */
	csel x13, x13, x16, ne

ST5(	cmp w4, #64 - (MAX_STRIDE << 4) )
ST5(	csel x14, x16, xzr, gt )
	cmp w4, #48 - (MAX_STRIDE << 4)
	csel x15, x16, xzr, gt
	cmp w4, #32 - (MAX_STRIDE << 4)
	csel x16, x16, xzr, gt
	cmp w4, #16 - (MAX_STRIDE << 4)

	adr_l x12, .Lcts_permute_table

ST5(	ld1 {v5.16b}, [x1], x14 )
	ld1 {v6.16b}, [x1], x15
	ld1 {v7.16b}, [x1], x16

ST4(	bl aes_encrypt_block4x )
ST5(	bl aes_encrypt_block5x )

	ld1 {v8.16b}, [x1], x13

ST4(	eor v6.16b, v6.16b, v0.16b )
ST4(	eor v7.16b, v7.16b, v1.16b )
ST4(	tbl v3.16b, {v3.16b}, v10.16b )
ST4(	eor v8.16b, v8.16b, v2.16b )
ST4(	eor v9.16b, v9.16b, v3.16b )

ST5(	eor v5.16b, v5.16b, v0.16b )
ST5(	eor v6.16b, v6.16b, v1.16b )
ST5(	tbl v4.16b, {v4.16b}, v10.16b )
ST5(	eor v7.16b, v7.16b, v2.16b )
ST5(	eor v8.16b, v8.16b, v3.16b )
ST5(	eor v9.16b, v9.16b, v4.16b )

ST5(	st1 {v5.16b}, [x0], x14 )
	st1 {v6.16b}, [x0], x15
	st1 {v7.16b}, [x0], x16
	st1 {v9.16b}, [x13]			// overlapping stores

	csel x0, x0, x6, eq			// use finalbuf if less than a full block

ST5(	mov v3.16b, v4.16b )
	encrypt_block v3, w3, x2, x8, w7
	eor v5.16b, v5.16b, v3.16b
AES_FUNC_END(aes_ctr_encrypt)
/*
 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
 *		   int bytes, u8 const rk2[], u8 iv[], int first)
 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
 *		   int bytes, u8 const rk2[], u8 iv[], int first)
 */
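
/*
 * Reference model (not part of the build): a minimal C sketch of XTS over
 * whole blocks, assuming the hypothetical aes_encrypt_one() and xor_block()
 * helpers named above plus a gf128mul_x() implementing the tweak doubling
 * of the next_tweak macro below.  rk2 is only used to turn the IV into the
 * first tweak (when 'first' is set); rk1 processes the data blocks.  The
 * asm additionally implements ciphertext stealing for a trailing partial
 * block, which is omitted here.
 *
 *	static void xts_encrypt_model(u8 *out, const u8 *in, const u8 *rk1,
 *				      int rounds, int blocks, const u8 *rk2,
 *				      u8 iv[16])
 *	{
 *		u8 t[16], buf[16];
 *
 *		aes_encrypt_one(rk2, rounds, iv, t);	// T = E_k2(iv)
 *		while (blocks--) {
 *			xor_block(buf, in, t);		// P ^ T
 *			aes_encrypt_one(rk1, rounds, buf, buf);
 *			xor_block(out, buf, t);		// C = E_k1(P ^ T) ^ T
 *			gf128mul_x(t);			// next tweak
 *			in  += 16;
 *			out += 16;
 *		}
 *	}
 */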
	.macro		next_tweak, out, in, tmp
	sshr		\tmp\().2d, \in\().2d, #63
	and		\tmp\().16b, \tmp\().16b, xtsmask.16b
	add		\out\().2d, \in\().2d, \in\().2d
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm

	.macro		xts_load_mask, tmp
	movi		xtsmask.2s, #0x1
	movi		\tmp\().2s, #0x87
	uzp1		xtsmask.4s, xtsmask.4s, \tmp\().4s
	.endm
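
/*
 * The two macros above multiply the tweak by x in GF(2^128) with the XTS
 * polynomial x^128 + x^7 + x^2 + x + 1: each 64-bit half is shifted left by
 * one bit, the bit shifted out of the low half carries into the high half,
 * and the bit shifted out of the high half (bit 127) is folded back into
 * the low byte as 0x87.  A byte-wise C sketch of the same operation, using
 * the hypothetical gf128mul_x() name from the XTS model above, with the
 * tweak held as the little-endian byte array it occupies in memory:
 *
 *	static void gf128mul_x(u8 t[16])
 *	{
 *		int i, carry = t[15] >> 7;		// bit 127
 *
 *		for (i = 15; i > 0; i--)
 *			t[i] = (t[i] << 1) | (t[i - 1] >> 7);
 *		t[0] <<= 1;
 *		if (carry)
 *			t[0] ^= 0x87;
 *	}
 */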
AES_FUNC_START(aes_xts_encrypt)
	stp x29, x30, [sp, #-16]!

	cbz w7, .Lxtsencnotfirst

	enc_prepare w3, x5, x8
	xts_cts_skip_tw w7, .LxtsencNx
	encrypt_block v4, w3, x5, x8, w7	/* first tweak */
	enc_switch_key w3, x2, x8

.Lxtsencnotfirst:
	enc_prepare w3, x2, x8
	next_tweak v4, v4, v8

	ld1 {v0.16b-v3.16b}, [x1], #64		/* get 4 pt blocks */
	next_tweak v5, v4, v8
	eor v0.16b, v0.16b, v4.16b
	next_tweak v6, v5, v8
	eor v1.16b, v1.16b, v5.16b
	eor v2.16b, v2.16b, v6.16b
	next_tweak v7, v6, v8
	eor v3.16b, v3.16b, v7.16b
	bl aes_encrypt_block4x
	eor v3.16b, v3.16b, v7.16b
	eor v0.16b, v0.16b, v4.16b
	eor v1.16b, v1.16b, v5.16b
	eor v2.16b, v2.16b, v6.16b
	st1 {v0.16b-v3.16b}, [x0], #64

	ld1 {v0.16b}, [x1], #16
	eor v0.16b, v0.16b, v4.16b
	encrypt_block v0, w3, x2, x8, w7
	eor v0.16b, v0.16b, v4.16b
	next_tweak v4, v4, v8
	st1 {v0.16b}, [x0], #16

	ldp x29, x30, [sp], #16
	ret

	adr_l x8, .Lcts_permute_table

	add x1, x1, w4, sxtw			/* rewind input pointer */
	add w4, w4, #16				/* # bytes in final block */
	add x4, x0, x4				/* output address of final block */

	ld1 {v1.16b}, [x1]			/* load final block */

	tbl v2.16b, {v0.16b}, v2.16b
	tbx v0.16b, {v1.16b}, v3.16b

	st1 {v2.16b}, [x4]			/* overlapping stores */
AES_FUNC_END(aes_xts_encrypt)


AES_FUNC_START(aes_xts_decrypt)
	stp x29, x30, [sp, #-16]!

	/* subtract 16 bytes if we are doing CTS */
	xts_cts_skip_tw w7, .Lxtsdecskiptw
	cbz w7, .Lxtsdecnotfirst

	enc_prepare w3, x5, x8
	encrypt_block v4, w3, x5, x8, w7	/* first tweak */
.Lxtsdecskiptw:
	dec_prepare w3, x2, x8

.Lxtsdecnotfirst:
	dec_prepare w3, x2, x8
	next_tweak v4, v4, v8

	ld1 {v0.16b-v3.16b}, [x1], #64		/* get 4 ct blocks */
	next_tweak v5, v4, v8
	eor v0.16b, v0.16b, v4.16b
	next_tweak v6, v5, v8
	eor v1.16b, v1.16b, v5.16b
	eor v2.16b, v2.16b, v6.16b
	next_tweak v7, v6, v8
	eor v3.16b, v3.16b, v7.16b
	bl aes_decrypt_block4x
	eor v3.16b, v3.16b, v7.16b
	eor v0.16b, v0.16b, v4.16b
	eor v1.16b, v1.16b, v5.16b
	eor v2.16b, v2.16b, v6.16b
	st1 {v0.16b-v3.16b}, [x0], #64

	ld1 {v0.16b}, [x1], #16
	eor v0.16b, v0.16b, v4.16b
	decrypt_block v0, w3, x2, x8, w7
	eor v0.16b, v0.16b, v4.16b
	st1 {v0.16b}, [x0], #16
	next_tweak v4, v4, v8

	ldp x29, x30, [sp], #16
	ret

	adr_l x8, .Lcts_permute_table

	add x1, x1, w4, sxtw			/* rewind input pointer */
	add w4, w4, #16				/* # bytes in final block */
	add x4, x0, x4				/* output address of final block */

	next_tweak v5, v4, v8

	ld1 {v1.16b}, [x1]			/* load final block */

	eor v0.16b, v0.16b, v5.16b
	decrypt_block v0, w3, x2, x8, w7
	eor v0.16b, v0.16b, v5.16b

	tbl v2.16b, {v0.16b}, v2.16b
	tbx v0.16b, {v1.16b}, v3.16b

	st1 {v2.16b}, [x4]			/* overlapping stores */
AES_FUNC_END(aes_xts_decrypt)
/*
 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
 *		  int blocks, u8 dg[], int enc_before, int enc_after)
 */
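
/*
 * Reference model (not part of the build): a minimal C sketch of the
 * CBC-MAC style digest update performed below, assuming the hypothetical
 * aes_encrypt_one() and xor_block() helpers named above.  enc_before asks
 * for the running digest to be encrypted once before any data is absorbed,
 * and enc_after controls whether the final block's encryption is performed
 * now or deferred to a later call.
 *
 *	static void mac_update_model(const u8 *in, const u8 *rk, int rounds,
 *				     int blocks, u8 dg[16], int enc_before,
 *				     int enc_after)
 *	{
 *		if (enc_before)
 *			aes_encrypt_one(rk, rounds, dg, dg);
 *
 *		while (blocks--) {
 *			xor_block(dg, dg, in);		// dg ^= next block
 *			if (blocks || enc_after)	// defer only the last one
 *				aes_encrypt_one(rk, rounds, dg, dg);
 *			in += 16;
 *		}
 *	}
 */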
AES_FUNC_START(aes_mac_update)
	ld1 {v0.16b}, [x4]			/* get dg */
	enc_prepare w2, x1, x7

	encrypt_block v0, w2, x1, x7, w8

	ld1 {v1.16b-v4.16b}, [x0], #64		/* get next pt block */
	eor v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
	encrypt_block v0, w2, x1, x7, w8
	eor v0.16b, v0.16b, v2.16b
	encrypt_block v0, w2, x1, x7, w8
	eor v0.16b, v0.16b, v3.16b
	encrypt_block v0, w2, x1, x7, w8
	eor v0.16b, v0.16b, v4.16b

	csinv x5, x6, xzr, eq
	encrypt_block v0, w2, x1, x7, w8
	st1 {v0.16b}, [x4]			/* return dg */
	cond_yield .Lmacout, x7

	ld1 {v1.16b}, [x0], #16			/* get next pt block */
	eor v0.16b, v0.16b, v1.16b		/* ..and xor with dg */

	csinv x5, x6, xzr, eq

	encrypt_block v0, w2, x1, x7, w8

.Lmacout:
	st1 {v0.16b}, [x4]			/* return dg */
AES_FUNC_END(aes_mac_update)