/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/* included by aes-ce.S and aes-neon.S */

	.text
	.align		4

#ifndef MAX_STRIDE
#define MAX_STRIDE	4
#endif

#if MAX_STRIDE == 4
#define ST4(x...) x
#define ST5(x...)
#else
#define ST4(x...)
#define ST5(x...) x
#endif
SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block4x)

SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block4x)

#if MAX_STRIDE == 5
SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block5x)

SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block5x)
#endif
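/*
 * Processing four or five blocks per call keeps several independent AES
 * computations in flight, which hides the multi-cycle latency of the
 * per-round instructions.  MAX_STRIDE selects the interleave factor, and
 * the ST4()/ST5() wrappers emit only the code for the configured stride.
 */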
	/*
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 */
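/*
 * As a rough C model of the control flow below (helper names are
 * illustrative, not part of the kernel API):
 *
 *	void ecb_encrypt(u8 out[], const u8 in[], const u8 rk[], int rounds,
 *			 int blocks)
 *	{
 *		while (blocks >= MAX_STRIDE) {	// interleaved bulk path
 *			encrypt_blocks(out, in, rk, rounds, MAX_STRIDE);
 *			in += 16 * MAX_STRIDE;
 *			out += 16 * MAX_STRIDE;
 *			blocks -= MAX_STRIDE;
 *		}
 *		while (blocks--) {		// one block at a time
 *			encrypt_one(out, in, rk, rounds);
 *			in += 16;
 *			out += 16;
 *		}
 *	}
 */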
AES_FUNC_START(aes_ecb_encrypt)
	stp	x29, x30, [sp, #-16]!
	mov	x29, sp

	enc_prepare	w3, x2, x5

.LecbencloopNx:
	subs	w4, w4, #MAX_STRIDE
	bmi	.Lecbenc1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
ST4(	bl	aes_encrypt_block4x		)
ST5(	ld1	{v4.16b}, [x1], #16		)
ST5(	bl	aes_encrypt_block5x		)
	st1	{v0.16b-v3.16b}, [x0], #64
ST5(	st1	{v4.16b}, [x0], #16		)
	b	.LecbencloopNx
.Lecbenc1x:
	adds	w4, w4, #MAX_STRIDE
	beq	.Lecbencout
.Lecbencloop:
	ld1	{v0.16b}, [x1], #16		/* get next pt block */
	encrypt_block	v0, w3, x2, x5, w6
	st1	{v0.16b}, [x0], #16
	subs	w4, w4, #1
	bne	.Lecbencloop
.Lecbencout:
	ldp	x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_ecb_encrypt)
AES_FUNC_START(aes_ecb_decrypt)
	stp	x29, x30, [sp, #-16]!
	mov	x29, sp

	dec_prepare	w3, x2, x5

.LecbdecloopNx:
	subs	w4, w4, #MAX_STRIDE
	bmi	.Lecbdec1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
ST4(	bl	aes_decrypt_block4x		)
ST5(	ld1	{v4.16b}, [x1], #16		)
ST5(	bl	aes_decrypt_block5x		)
	st1	{v0.16b-v3.16b}, [x0], #64
ST5(	st1	{v4.16b}, [x0], #16		)
	b	.LecbdecloopNx
.Lecbdec1x:
	adds	w4, w4, #MAX_STRIDE
	beq	.Lecbdecout
.Lecbdecloop:
	ld1	{v0.16b}, [x1], #16		/* get next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1	{v0.16b}, [x0], #16
	subs	w4, w4, #1
	bne	.Lecbdecloop
.Lecbdecout:
	ldp	x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_ecb_decrypt)
	/*
	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 */
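/*
 * CBC chains each block into the next: C[i] = E_K(P[i] ^ C[i-1]) with
 * C[-1] = IV, which is why the encrypt path below cannot be parallelised.
 * The ESSIV entry points differ only in IV derivation: the caller's IV
 * (typically a sector number) is first encrypted with the second key rk2,
 * always expanded as AES-256 (hence the hard-coded 14 rounds), giving
 * IV' = E_rk2(IV) before ordinary CBC proceeds with rk1.
 */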
AES_FUNC_START(aes_essiv_cbc_encrypt)
	ld1	{v4.16b}, [x5]			/* get iv */

	mov	w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	v4, w8, x6, x7, w9
	enc_switch_key	w3, x2, x6
	b	.Lcbcencloop4x
AES_FUNC_START(aes_cbc_encrypt)
	ld1	{v4.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

.Lcbcencloop4x:
	subs	w4, w4, #4
	bmi	.Lcbcenc1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	eor	v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
	encrypt_block	v0, w3, x2, x6, w7
	eor	v1.16b, v1.16b, v0.16b
	encrypt_block	v1, w3, x2, x6, w7
	eor	v2.16b, v2.16b, v1.16b
	encrypt_block	v2, w3, x2, x6, w7
	eor	v3.16b, v3.16b, v2.16b
	encrypt_block	v3, w3, x2, x6, w7
	st1	{v0.16b-v3.16b}, [x0], #64
	mov	v4.16b, v3.16b
	b	.Lcbcencloop4x
.Lcbcenc1x:
	adds	w4, w4, #4
	beq	.Lcbcencout
.Lcbcencloop:
	ld1	{v0.16b}, [x1], #16		/* get next pt block */
	eor	v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
	encrypt_block	v4, w3, x2, x6, w7
	st1	{v4.16b}, [x0], #16
	subs	w4, w4, #1
	bne	.Lcbcencloop
.Lcbcencout:
	st1	{v4.16b}, [x5]			/* return iv */
	ret
AES_FUNC_END(aes_cbc_encrypt)
AES_FUNC_END(aes_essiv_cbc_encrypt)
AES_FUNC_START(aes_essiv_cbc_decrypt)
	stp	x29, x30, [sp, #-16]!
	mov	x29, sp

	ld1	{cbciv.16b}, [x5]		/* get iv */

	mov	w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	cbciv, w8, x6, x7, w9
	b	.Lessivcbcdecstart

AES_FUNC_START(aes_cbc_decrypt)
	stp	x29, x30, [sp, #-16]!
	mov	x29, sp

	ld1	{cbciv.16b}, [x5]		/* get iv */
.Lessivcbcdecstart:
	dec_prepare	w3, x2, x6

.LcbcdecloopNx:
	subs	w4, w4, #MAX_STRIDE
	bmi	.Lcbcdec1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
#if MAX_STRIDE == 5
	ld1	{v4.16b}, [x1], #16		/* get 1 ct block */
	mov	v5.16b, v0.16b
	mov	v6.16b, v1.16b
	mov	v7.16b, v2.16b
	bl	aes_decrypt_block5x
	sub	x1, x1, #32
	eor	v0.16b, v0.16b, cbciv.16b
	eor	v1.16b, v1.16b, v5.16b
	ld1	{v5.16b}, [x1], #16		/* reload 1 ct block */
	ld1	{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor	v2.16b, v2.16b, v6.16b
	eor	v3.16b, v3.16b, v7.16b
	eor	v4.16b, v4.16b, v5.16b
#else
	mov	v4.16b, v0.16b
	mov	v5.16b, v1.16b
	mov	v6.16b, v2.16b
	bl	aes_decrypt_block4x
	sub	x1, x1, #16
	eor	v0.16b, v0.16b, cbciv.16b
	eor	v1.16b, v1.16b, v4.16b
	ld1	{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor	v2.16b, v2.16b, v5.16b
	eor	v3.16b, v3.16b, v6.16b
#endif
	st1	{v0.16b-v3.16b}, [x0], #64
ST5(	st1	{v4.16b}, [x0], #16		)
	b	.LcbcdecloopNx
.Lcbcdec1x:
	adds	w4, w4, #MAX_STRIDE
	beq	.Lcbcdecout
.Lcbcdecloop:
	ld1	{v1.16b}, [x1], #16		/* get next ct block */
	mov	v0.16b, v1.16b			/* ...and copy to v0 */
	decrypt_block	v0, w3, x2, x6, w7
	eor	v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
	mov	cbciv.16b, v1.16b		/* ct is next iv */
	st1	{v0.16b}, [x0], #16
	subs	w4, w4, #1
	bne	.Lcbcdecloop
.Lcbcdecout:
	st1	{cbciv.16b}, [x5]		/* return iv */
	ldp	x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)
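/*
 * Unlike encryption, CBC decryption parallelises: P[i] = D_K(C[i]) ^ C[i-1]
 * and every C[i] is already known, so the code above decrypts MAX_STRIDE
 * blocks at once and only the trailing XORs depend on neighbouring blocks.
 */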
	/*
	 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 */
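/*
 * Ciphertext stealing lets CBC handle lengths that are not a multiple of 16
 * without padding: the final two blocks are encrypted, swapped, and the
 * output truncated to the input length.  E.g. a 25-byte message produces the
 * full final CBC block followed by the first 9 bytes of the penultimate one.
 */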
AES_FUNC_START(aes_cbc_cts_encrypt)
	adr_l	x8, .Lcts_permute_table
	sub	x4, x4, #16
	add	x9, x8, #32
	add	x8, x8, x4
	sub	x9, x9, x4
	ld1	{v3.16b}, [x8]
	ld1	{v4.16b}, [x9]

	ld1	{v0.16b}, [x1], x4		/* overlapping loads */
	ld1	{v1.16b}, [x1]

	ld1	{v5.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

	eor	v0.16b, v0.16b, v5.16b		/* xor with iv */
	tbl	v1.16b, {v1.16b}, v4.16b
	encrypt_block	v0, w3, x2, x6, w7

	eor	v1.16b, v1.16b, v0.16b
	tbl	v0.16b, {v0.16b}, v3.16b
	encrypt_block	v1, w3, x2, x6, w7

	add	x4, x0, x4
	st1	{v0.16b}, [x4]			/* overlapping stores */
	st1	{v1.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_encrypt)
AES_FUNC_START(aes_cbc_cts_decrypt)
	adr_l	x8, .Lcts_permute_table
	sub	x4, x4, #16
	add	x9, x8, #32
	add	x8, x8, x4
	sub	x9, x9, x4
	ld1	{v3.16b}, [x8]
	ld1	{v4.16b}, [x9]

	ld1	{v0.16b}, [x1], x4		/* overlapping loads */
	ld1	{v1.16b}, [x1]

	ld1	{v5.16b}, [x5]			/* get iv */
	dec_prepare	w3, x2, x6

	decrypt_block	v0, w3, x2, x6, w7
	tbl	v2.16b, {v0.16b}, v3.16b
	eor	v2.16b, v2.16b, v1.16b

	tbx	v0.16b, {v1.16b}, v4.16b
	decrypt_block	v0, w3, x2, x6, w7
	eor	v0.16b, v0.16b, v5.16b		/* xor with iv */

	add	x4, x0, x4
	st1	{v2.16b}, [x4]			/* overlapping stores */
	st1	{v0.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_decrypt)
	.section	".rodata", "a"
	.align		6
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.previous
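/*
 * Sliding windows into this table drive the tbl/tbx permutes used for the
 * partial-block cases: offsetting the load address by the number of valid
 * bytes yields indices that shift data into place, while 0xff entries make
 * tbl produce zero bytes and tbx leave the destination byte untouched.
 */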
	/*
	 * This macro generates the code for CTR and XCTR mode.
	 */
.macro ctr_encrypt xctr
	// Arguments
	OUT		.req x0
	IN		.req x1
	KEY		.req x2
	ROUNDS_W	.req w3
	BYTES_W		.req w4
	IV		.req x5
	BYTE_CTR_W	.req w6		// XCTR only
	// Intermediate values
	CTR_W		.req w11	// XCTR only
	CTR		.req x11	// XCTR only
	IV_PART		.req x12
	BLOCKS		.req x13
	BLOCKS_W	.req w13
	stp	x29, x30, [sp, #-16]!
	mov	x29, sp

	enc_prepare	ROUNDS_W, KEY, IV_PART
	ld1	{vctr.16b}, [IV]

	/*
	 * Keep 64 bits of the IV in a register.  For CTR mode this lets us
	 * easily increment the IV.  For XCTR mode this lets us efficiently XOR
	 * the 64-bit counter with the IV.
	 */
	.if \xctr
		umov	IV_PART, vctr.d[0]
		lsr	CTR_W, BYTE_CTR_W, #4
	.else
		umov	IV_PART, vctr.d[1]
		rev	IV_PART, IV_PART
	.endif
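	/*
	 * In outline, keystream block i is built as
	 *
	 *	CTR:  S[i] = E_K(IV.hi64 || be64(ctr + i))
	 *	XCTR: S[i] = E_K(IV ^ le64(ctr + i)), ctr = byte_ctr / 16
	 *
	 * vctr keeps the reference counter block; the carry out of the low
	 * 64 bits into IV.hi64 for CTR is handled out of line below.
	 */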
.LctrloopNx\xctr:
	add	BLOCKS_W, BYTES_W, #15
	sub	BYTES_W, BYTES_W, #MAX_STRIDE << 4
	lsr	BLOCKS_W, BLOCKS_W, #4
	mov	w8, #MAX_STRIDE
	cmp	BLOCKS_W, w8
	csel	BLOCKS_W, BLOCKS_W, w8, lt

	/*
	 * Set up the counter values in v0-v{MAX_STRIDE-1}.
	 *
	 * If we are encrypting less than MAX_STRIDE blocks, the tail block
	 * handling code expects the last keystream block to be in
	 * v{MAX_STRIDE-1}.  For example: if encrypting two blocks with
	 * MAX_STRIDE=5, then v3 and v4 should have the next two counter
	 * blocks.
	 */
	.if \xctr
		add	CTR, CTR, BLOCKS
	.else
		adds	IV_PART, IV_PART, BLOCKS
	.endif
	mov	v0.16b, vctr.16b
	mov	v1.16b, vctr.16b
	mov	v2.16b, vctr.16b
	mov	v3.16b, vctr.16b
ST5(	mov	v4.16b, vctr.16b		)
	.if \xctr
		sub	x6, CTR, #MAX_STRIDE - 1
		sub	x7, CTR, #MAX_STRIDE - 2
		sub	x8, CTR, #MAX_STRIDE - 3
		sub	x9, CTR, #MAX_STRIDE - 4
ST5(		sub	x10, CTR, #MAX_STRIDE - 5	)
		eor	x6, x6, IV_PART
		eor	x7, x7, IV_PART
		eor	x8, x8, IV_PART
		eor	x9, x9, IV_PART
ST5(		eor	x10, x10, IV_PART	)
		mov	v0.d[0], x6
		mov	v1.d[0], x7
		mov	v2.d[0], x8
		mov	v3.d[0], x9
ST5(		mov	v4.d[0], x10	)
	.else
		bcs	0f
		.subsection	1
		/*
		 * This subsection handles carries.
		 *
		 * Conditional branching here is allowed with respect to time
		 * invariance since the branches are dependent on the IV instead
		 * of the plaintext or key.  This code is rarely executed in
		 * practice anyway.
		 */

		/* Apply carry to outgoing counter. */
0:		umov	x8, vctr.d[0]
		rev	x8, x8
		add	x8, x8, #1
		rev	x8, x8
		ins	vctr.d[0], x8

		/*
		 * Apply carry to counter blocks if needed.
		 *
		 * Since the carry flag was set, we know 0 <= IV_PART <
		 * MAX_STRIDE.  Using the value of IV_PART we can determine how
		 * many counter blocks need to be updated.
		 */
		cbz	IV_PART, 2f
		adr	x16, 1f
		sub	x16, x16, IV_PART, lsl #3
		br	x16
		bti	c
		mov	v0.d[0], vctr.d[0]
		bti	c
		mov	v1.d[0], vctr.d[0]
		bti	c
		mov	v2.d[0], vctr.d[0]
		bti	c
		mov	v3.d[0], vctr.d[0]
ST5(		bti	c			)
ST5(		mov	v4.d[0], vctr.d[0]	)
1:		b	2f
		.previous

2:		rev	x7, IV_PART
		ins	vctr.d[1], x7
		sub	x7, IV_PART, #MAX_STRIDE - 1
		sub	x8, IV_PART, #MAX_STRIDE - 2
		sub	x9, IV_PART, #MAX_STRIDE - 3
		rev	x7, x7
		rev	x8, x8
		rev	x9, x9
		mov	v1.d[1], x7
ST5(		sub	x10, IV_PART, #MAX_STRIDE - 4	)
		mov	v2.d[1], x8
ST5(		rev	x10, x10	)
		mov	v3.d[1], x9
ST5(		mov	v4.d[1], x10	)
	.endif
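	/*
	 * Worked example (MAX_STRIDE=5): if the incoming low counter half is
	 * 0x...fffd, the five blocks use counters ...fffd, ...fffe, ...ffff,
	 * 0 and 1, and the add of 5 sets the carry flag with IV_PART = 2.
	 * The jump table then lands on its last two entries, so only v3 and
	 * v4 (plus the outgoing vctr) receive the incremented upper half.
	 */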
	/*
	 * If there are at least MAX_STRIDE blocks left, XOR the data with
	 * keystream and store.  Otherwise jump to tail handling.
	 */
	tbnz	BYTES_W, #31, .Lctrtail\xctr
	ld1	{v5.16b-v7.16b}, [IN], #48
ST4(	bl	aes_encrypt_block4x		)
ST5(	bl	aes_encrypt_block5x		)
	eor	v0.16b, v5.16b, v0.16b
ST4(	ld1	{v5.16b}, [IN], #16		)
	eor	v1.16b, v6.16b, v1.16b
ST5(	ld1	{v5.16b-v6.16b}, [IN], #32	)
	eor	v2.16b, v7.16b, v2.16b
	eor	v3.16b, v5.16b, v3.16b
ST5(	eor	v4.16b, v6.16b, v4.16b		)
	st1	{v0.16b-v3.16b}, [OUT], #64
ST5(	st1	{v4.16b}, [OUT], #16		)
	cbz	BYTES_W, .Lctrout\xctr
	b	.LctrloopNx\xctr

.Lctrout\xctr:
	.if !\xctr
		st1	{vctr.16b}, [IV]	/* return next CTR value */
	.endif
	ldp	x29, x30, [sp], #16
	ret
.Lctrtail\xctr:
	/*
	 * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
	 *
	 * This code expects the last keystream block to be in v{MAX_STRIDE-1}.
	 * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and
	 * v4 should have the next two counter blocks.
	 *
	 * This allows us to store the ciphertext by writing to overlapping
	 * regions of memory.  Any invalid ciphertext blocks get overwritten by
	 * correctly computed blocks.  This approach greatly simplifies the
	 * logic for storing the ciphertext.
	 */
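	/*
	 * For example, 60 tail bytes with MAX_STRIDE=5 become three full
	 * blocks stored at their final addresses plus two 16-byte stores
	 * that overlap by four bytes: the last store is placed so it ends
	 * exactly at the end of the data, overwriting the invalid tail of
	 * its predecessor.
	 */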
	mov	x16, #16
	ands	w7, BYTES_W, #0xf
	csel	x13, x7, x16, ne

ST5(	cmp	BYTES_W, #64 - (MAX_STRIDE << 4)	)
ST5(	csel	x14, x16, xzr, gt		)
	cmp	BYTES_W, #48 - (MAX_STRIDE << 4)
	csel	x15, x16, xzr, gt
	cmp	BYTES_W, #32 - (MAX_STRIDE << 4)
	csel	x16, x16, xzr, gt
	cmp	BYTES_W, #16 - (MAX_STRIDE << 4)

	adr_l	x9, .Lcts_permute_table
	add	x9, x9, x13
	ble	.Lctrtail1x\xctr

ST5(	ld1	{v5.16b}, [IN], x14		)
	ld1	{v6.16b}, [IN], x15
	ld1	{v7.16b}, [IN], x16

ST4(	bl	aes_encrypt_block4x		)
ST5(	bl	aes_encrypt_block5x		)

	ld1	{v8.16b}, [IN], x13
	ld1	{v9.16b}, [IN]
	ld1	{v10.16b}, [x9]

ST4(	eor	v6.16b, v6.16b, v0.16b		)
ST4(	eor	v7.16b, v7.16b, v1.16b		)
ST4(	tbl	v3.16b, {v3.16b}, v10.16b	)
ST4(	eor	v8.16b, v8.16b, v2.16b		)
ST4(	eor	v9.16b, v9.16b, v3.16b		)

ST5(	eor	v5.16b, v5.16b, v0.16b		)
ST5(	eor	v6.16b, v6.16b, v1.16b		)
ST5(	tbl	v4.16b, {v4.16b}, v10.16b	)
ST5(	eor	v7.16b, v7.16b, v2.16b		)
ST5(	eor	v8.16b, v8.16b, v3.16b		)
ST5(	eor	v9.16b, v9.16b, v4.16b		)

ST5(	st1	{v5.16b}, [OUT], x14		)
	st1	{v6.16b}, [OUT], x15
	st1	{v7.16b}, [OUT], x16
	add	x13, x13, OUT
	st1	{v9.16b}, [x13]			// overlapping stores
	st1	{v8.16b}, [OUT]
	b	.Lctrout\xctr
.Lctrtail1x\xctr:
	/*
	 * Handle <= 16 bytes of plaintext
	 *
	 * This code always reads and writes 16 bytes.  To avoid out of bounds
	 * accesses, XCTR and CTR modes must use a temporary buffer when
	 * encrypting/decrypting less than 16 bytes.
	 *
	 * This code is unusual in that it loads the input and stores the output
	 * relative to the end of the buffers rather than relative to the start.
	 * This causes unusual behaviour when encrypting/decrypting less than 16
	 * bytes; the end of the data is expected to be at the end of the
	 * temporary buffer rather than the start of the data being at the start
	 * of the temporary buffer.
	 */
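	/*
	 * Concretely: for n < 16 the caller supplies a 16-byte scratch buffer
	 * with the n data bytes aligned to its end, so the single 16-byte
	 * load/store here stays in bounds, and the bif below preserves the
	 * buffer bytes that do not belong to the message.
	 */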
	sub	x8, x7, #16
	csel	x7, x7, x8, eq
	add	IN, IN, x7
	add	OUT, OUT, x7
	ld1	{v5.16b}, [IN]
	ld1	{v6.16b}, [OUT]
ST5(	mov	v3.16b, v4.16b			)
	encrypt_block	v3, ROUNDS_W, KEY, x8, w7
	ld1	{v10.16b-v11.16b}, [x9]
	tbl	v3.16b, {v3.16b}, v10.16b
	sshr	v11.16b, v11.16b, #7
	eor	v5.16b, v5.16b, v3.16b
	bif	v5.16b, v6.16b, v11.16b
	st1	{v5.16b}, [OUT]
	b	.Lctrout\xctr

	// Arguments
	.unreq OUT
	.unreq IN
	.unreq KEY
	.unreq ROUNDS_W
	.unreq BYTES_W
	.unreq IV
	.unreq BYTE_CTR_W	// XCTR only
	// Intermediate values
	.unreq CTR_W		// XCTR only
	.unreq CTR		// XCTR only
	.unreq IV_PART
	.unreq BLOCKS
	.unreq BLOCKS_W
.endm
	/*
	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int bytes, u8 ctr[])
	 *
	 * The input and output buffers must always be at least 16 bytes even if
	 * encrypting/decrypting less than 16 bytes.  Otherwise out of bounds
	 * accesses will occur.  The data to be encrypted/decrypted is expected
	 * to be at the end of this 16-byte temporary buffer rather than the
	 * start.
	 */

AES_FUNC_START(aes_ctr_encrypt)
	ctr_encrypt 0
AES_FUNC_END(aes_ctr_encrypt)
	/*
	 * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		    int bytes, u8 const iv[], int byte_ctr)
	 *
	 * The input and output buffers must always be at least 16 bytes even if
	 * encrypting/decrypting less than 16 bytes.  Otherwise out of bounds
	 * accesses will occur.  The data to be encrypted/decrypted is expected
	 * to be at the end of this 16-byte temporary buffer rather than the
	 * start.
	 */
AES_FUNC_START(aes_xctr_encrypt)
	ctr_encrypt 1
AES_FUNC_END(aes_xctr_encrypt)
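/*
 * Hypothetical caller sketch (C) for a short request of n < 16 bytes,
 * mirroring the scratch-buffer rule above; names and layout are illustrative
 * and not the actual kernel glue code:
 *
 *	u8 buf[16];
 *	u8 *p = memcpy(buf + sizeof(buf) - n, src, n);	// data at the end
 *
 *	aes_ctr_encrypt(p, p, rk, rounds, n, iv);	// stays within buf
 *	memcpy(dst, p, n);
 */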
	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 */
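/*
 * XTS in brief: with tweak T[0] = E_rk2(iv) and T[i+1] = T[i] * x over
 * GF(2^128), each block is processed as C[i] = E_rk1(P[i] ^ T[i]) ^ T[i].
 * The two helper macros below implement the tweak update: next_tweak
 * doubles the tweak and xts_load_mask materialises the reduction constant.
 */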
	.macro		next_tweak, out, in, tmp
	sshr		\tmp\().2d,  \in\().2d,   #63
	and		\tmp\().16b, \tmp\().16b, xtsmask.16b
	add		\out\().2d,  \in\().2d,   \in\().2d
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm
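/*
 * A C model of next_tweak (illustrative only): multiply the tweak, taken as
 * a little-endian 128-bit value, by x and reduce modulo
 * x^128 + x^7 + x^2 + x + 1:
 *
 *	void next_tweak(u8 t[16])
 *	{
 *		int carry = t[15] >> 7;		// bit shifted out of the top
 *		int i;
 *
 *		for (i = 15; i > 0; i--)
 *			t[i] = (t[i] << 1) | (t[i - 1] >> 7);
 *		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
 *	}
 */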
	.macro		xts_load_mask, tmp
	movi		xtsmask.2s, #0x1
	movi		\tmp\().2s, #0x87
	uzp1		xtsmask.4s, xtsmask.4s, \tmp\().4s
	.endm
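/*
 * The mask built here is {0x1, 0, 0x87, 0} in 32-bit lanes, i.e. 0x1 in the
 * low 64-bit half and 0x87 in the high one.  In next_tweak, sshr #63
 * broadcasts each qword's sign bit, the and/ext pair then routes 0x1 into
 * the high qword (the ordinary carry out of bit 63) and 0x87 into the low
 * qword (the GF(2^128) reduction of the bit carried out of bit 127), and
 * the final eor applies both.
 */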
AES_FUNC_START(aes_xts_encrypt)
	stp	x29, x30, [sp, #-16]!
	mov	x29, sp

	ld1	{v4.16b}, [x6]
	xts_load_mask	v8
	cbz	w7, .Lxtsencnotfirst

	enc_prepare	w3, x5, x8
	xts_cts_skip_tw	w7, .LxtsencNx
	encrypt_block	v4, w3, x5, x8, w7	/* first tweak */
	enc_switch_key	w3, x2, x8
	b	.LxtsencNx

.Lxtsencnotfirst:
	enc_prepare	w3, x2, x8
.LxtsencloopNx:
	next_tweak	v4, v4, v8
.LxtsencNx:
	subs	w4, w4, #64
	bmi	.Lxtsenc1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v8
	eor	v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor	v1.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor	v3.16b, v3.16b, v7.16b
	bl	aes_encrypt_block4x
	eor	v3.16b, v3.16b, v7.16b
	eor	v0.16b, v0.16b, v4.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	st1	{v0.16b-v3.16b}, [x0], #64
	mov	v4.16b, v7.16b
	cbz	w4, .Lxtsencret
	xts_reload_mask	v8
	b	.LxtsencloopNx
.Lxtsenc1x:
	adds	w4, w4, #64
	beq	.Lxtsencout
	subs	w4, w4, #16
	bmi	.LxtsencctsNx
.Lxtsencloop:
	ld1	{v0.16b}, [x1], #16
.Lxtsencctsout:
	eor	v0.16b, v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x8, w7
	eor	v0.16b, v0.16b, v4.16b
	cbz	w4, .Lxtsencout
	subs	w4, w4, #16
	next_tweak	v4, v4, v8
	bmi	.Lxtsenccts
	st1	{v0.16b}, [x0], #16
	b	.Lxtsencloop
.Lxtsencout:
	st1	{v0.16b}, [x0]
.Lxtsencret:
	st1	{v4.16b}, [x6]
	ldp	x29, x30, [sp], #16
	ret

.LxtsencctsNx:
	mov	v0.16b, v3.16b
	sub	x0, x0, #16
.Lxtsenccts:
	adr_l	x8, .Lcts_permute_table

	add	x1, x1, w4, sxtw	/* rewind input pointer */
	add	w4, w4, #16		/* # bytes in final block */
	add	x9, x8, #32
	add	x8, x8, x4
	sub	x9, x9, x4
	add	x4, x0, x4		/* output address of final block */

	ld1	{v1.16b}, [x1]		/* load final block */
	ld1	{v2.16b}, [x8]
	ld1	{v3.16b}, [x9]

	tbl	v2.16b, {v0.16b}, v2.16b
	tbx	v0.16b, {v1.16b}, v3.16b
	st1	{v2.16b}, [x4]			/* overlapping stores */
	mov	w4, wzr
	b	.Lxtsencctsout
AES_FUNC_END(aes_xts_encrypt)
AES_FUNC_START(aes_xts_decrypt)
	stp	x29, x30, [sp, #-16]!
	mov	x29, sp

	/* subtract 16 bytes if we are doing CTS */
	sub	w8, w4, #0x10
	tst	w4, #0xf
	csel	w4, w4, w8, eq

	ld1	{v4.16b}, [x6]
	xts_load_mask	v8
	xts_cts_skip_tw	w7, .Lxtsdecskiptw
	cbz	w7, .Lxtsdecnotfirst

	enc_prepare	w3, x5, x8
	encrypt_block	v4, w3, x5, x8, w7	/* first tweak */
.Lxtsdecskiptw:
	dec_prepare	w3, x2, x8
	b	.LxtsdecNx

.Lxtsdecnotfirst:
	dec_prepare	w3, x2, x8
.LxtsdecloopNx:
	next_tweak	v4, v4, v8
.LxtsdecNx:
	subs	w4, w4, #64
	bmi	.Lxtsdec1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v8
	eor	v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor	v1.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor	v3.16b, v3.16b, v7.16b
	bl	aes_decrypt_block4x
	eor	v3.16b, v3.16b, v7.16b
	eor	v0.16b, v0.16b, v4.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	st1	{v0.16b-v3.16b}, [x0], #64
	mov	v4.16b, v7.16b
	cbz	w4, .Lxtsdecout
	xts_reload_mask	v8
	b	.LxtsdecloopNx
.Lxtsdec1x:
	adds	w4, w4, #64
	beq	.Lxtsdecout
	subs	w4, w4, #16
.Lxtsdecloop:
	ld1	{v0.16b}, [x1], #16
	bmi	.Lxtsdeccts
.Lxtsdecctsout:
	eor	v0.16b, v0.16b, v4.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor	v0.16b, v0.16b, v4.16b
	st1	{v0.16b}, [x0], #16
	cbz	w4, .Lxtsdecout
	subs	w4, w4, #16
	next_tweak	v4, v4, v8
	b	.Lxtsdecloop
.Lxtsdecout:
	st1	{v4.16b}, [x6]
	ldp	x29, x30, [sp], #16
	ret

.Lxtsdeccts:
	adr_l	x8, .Lcts_permute_table

	add	x1, x1, w4, sxtw	/* rewind input pointer */
	add	w4, w4, #16		/* # bytes in final block */
	add	x9, x8, #32
	add	x8, x8, x4
	sub	x9, x9, x4
	add	x4, x0, x4		/* output address of final block */

	next_tweak	v5, v4, v8

	ld1	{v1.16b}, [x1]		/* load final block */
	ld1	{v2.16b}, [x8]
	ld1	{v3.16b}, [x9]

	eor	v0.16b, v0.16b, v5.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor	v0.16b, v0.16b, v5.16b

	tbl	v2.16b, {v0.16b}, v2.16b
	tbx	v0.16b, {v1.16b}, v3.16b

	st1	{v2.16b}, [x4]			/* overlapping stores */
	mov	w4, wzr
	b	.Lxtsdecctsout
AES_FUNC_END(aes_xts_decrypt)
	/*
	 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
	 */
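/*
 * This is the CBC-MAC core shared by the CMAC/XCBC/cbcmac glue: for each
 * block, dg = E_K(dg ^ in[i]).  enc_before forces an initial encryption of
 * the previous digest before any data is absorbed, and enc_after decides
 * whether the final XORed block is encrypted before dg is stored back; the
 * csinv/cbz pairs below skip that last encryption when more data follows.
 */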
AES_FUNC_START(aes_mac_update)
	ld1	{v0.16b}, [x4]			/* get dg */
	enc_prepare	w2, x1, x7
	cbz	w5, .Lmacloop4x

	encrypt_block	v0, w2, x1, x7, w8

.Lmacloop4x:
	subs	w3, w3, #4
	bmi	.Lmac1x
	ld1	{v1.16b-v4.16b}, [x0], #64	/* get next pt block */
	eor	v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
	encrypt_block	v0, w2, x1, x7, w8
	eor	v0.16b, v0.16b, v2.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor	v0.16b, v0.16b, v3.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor	v0.16b, v0.16b, v4.16b
	cmp	w3, wzr
	csinv	x5, x6, xzr, eq
	cbz	w5, .Lmacout
	encrypt_block	v0, w2, x1, x7, w8
	st1	{v0.16b}, [x4]			/* return dg */
	cond_yield	.Lmacout, x7, x8
	b	.Lmacloop4x
.Lmac1x:
	add	w3, w3, #4
.Lmacloop:
	cbz	w3, .Lmacout
	ld1	{v1.16b}, [x0], #16		/* get next pt block */
	eor	v0.16b, v0.16b, v1.16b		/* ..and xor with dg */

	subs	w3, w3, #1
	csinv	x5, x6, xzr, eq
	cbz	w5, .Lmacout

.Lmacenc:
	encrypt_block	v0, w2, x1, x7, w8
	b	.Lmacloop

.Lmacout:
	st1	{v0.16b}, [x4]			/* return dg */
	mov	w0, w3
	ret
AES_FUNC_END(aes_mac_update)