1 /* SPDX-License-Identifier: GPL-2.0-only */
// Accelerated GHASH implementation with ARMv8 PMULL instructions.
// Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
8 #include <linux/linkage.h>
9 #include <asm/assembler.h>
// 64x64 -> 128-bit carryless (polynomial) multiply of the LOW doubleword
// lanes, using the ARMv8 Crypto Extensions PMULL instruction.
// NOTE(review): the closing .endm lines are not visible in this extract.
62 .macro __pmull_p64, rd, rn, rm
63 pmull \rd\().1q, \rn\().1d, \rm\().1d
// Same multiply, but on the HIGH doubleword lanes (PMULL2).
66 .macro __pmull2_p64, rd, rn, rm
67 pmull2 \rd\().1q, \rn\().2d, \rm\().2d
// Fallback multiply (low half) for CPUs lacking the 64x64 PMULL form:
// build byte-rotated copies A1..A3 of operand \ad so the 64x64 product can
// be assembled from 8x8-bit polynomial multiplies in __pmull_p8_tail.
70 .macro __pmull_p8, rq, ad, bd
71 ext t3.8b, \ad\().8b, \ad\().8b, #1 // A1
72 ext t5.8b, \ad\().8b, \ad\().8b, #2 // A2
73 ext t7.8b, \ad\().8b, \ad\().8b, #3 // A3
// dispatch on the B operand's name (e.g. SHASH, SHASH2) to pick its
// precomputed rotations — expands to __pmull_p8_SHASH etc. below
75 __pmull_p8_\bd \rq, \ad
78 .macro __pmull2_p8, rq, ad, bd
79 tbl t3.16b, {\ad\().16b}, perm1.16b // A1
80 tbl t5.16b, {\ad\().16b}, perm2.16b // A2
81 tbl t7.16b, {\ad\().16b}, perm3.16b // A3
83 __pmull2_p8_\bd \rq, \ad
// Thin wrappers binding __pmull_p8_tail to a concrete B operand (SHASH or
// SHASH2) together with its precomputed rotations (sh1..sh4 / ss1..ss4),
// for the low (.8b) and high (.16b + "2" suffix) halves respectively.
86 .macro __pmull_p8_SHASH, rq, ad
87 __pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
90 .macro __pmull_p8_SHASH2, rq, ad
91 __pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
94 .macro __pmull2_p8_SHASH, rq, ad
95 __pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
// Emulate a 64x64 -> 128-bit polynomial multiply \rq = \ad * \bd out of
// 8x8-bit PMULL instructions: compute partial products against the
// byte-rotated copies of A (t3/t5/t7, set up by the caller) and of B
// (\b1..\b4), mask off the bits that wrapped around during rotation,
// then shift-align and XOR everything into the result.
// \t is empty or "2" to select pmull vs pmull2; \nb is the lane arrangement.
98 .macro __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
99 pmull\t t3.8h, t3.\nb, \bd // F = A1*B
100 pmull\t t4.8h, \ad, \b1\().\nb // E = A*B1
101 pmull\t t5.8h, t5.\nb, \bd // H = A2*B
102 pmull\t t6.8h, \ad, \b2\().\nb // G = A*B2
103 pmull\t t7.8h, t7.\nb, \bd // J = A3*B
104 pmull\t t8.8h, \ad, \b3\().\nb // I = A*B3
105 pmull\t t9.8h, \ad, \b4\().\nb // K = A*B4
106 pmull\t \rq\().8h, \ad, \bd // D = A*B
// fold symmetric partial products pairwise
108 eor t3.16b, t3.16b, t4.16b // L = E + F
109 eor t5.16b, t5.16b, t6.16b // M = G + H
110 eor t7.16b, t7.16b, t8.16b // N = I + J
// separate even/odd doublewords so each pair can be masked together
112 uzp1 t4.2d, t3.2d, t5.2d
113 uzp2 t3.2d, t3.2d, t5.2d
114 uzp1 t6.2d, t7.2d, t9.2d
115 uzp2 t7.2d, t7.2d, t9.2d
117 // t3 = (L) (P0 + P1) << 8
118 // t5 = (M) (P2 + P3) << 16
119 eor t4.16b, t4.16b, t3.16b
// mask off bits that wrapped around in the rotated copies
120 and t3.16b, t3.16b, k32_48.16b
122 // t7 = (N) (P4 + P5) << 24
123 // t9 = (K) (P6 + P7) << 32
124 eor t6.16b, t6.16b, t7.16b
125 and t7.16b, t7.16b, k00_16.16b
127 eor t4.16b, t4.16b, t3.16b
128 eor t6.16b, t6.16b, t7.16b
// re-interleave the masked halves
130 zip2 t5.2d, t4.2d, t3.2d
131 zip1 t3.2d, t4.2d, t3.2d
132 zip2 t9.2d, t6.2d, t7.2d
133 zip1 t7.2d, t6.2d, t7.2d
// align each partial product to its byte position (<< 8/16/24/32)
135 ext t3.16b, t3.16b, t3.16b, #15
136 ext t5.16b, t5.16b, t5.16b, #14
137 ext t7.16b, t7.16b, t7.16b, #13
138 ext t9.16b, t9.16b, t9.16b, #12
// accumulate everything into the 128-bit result \rq
140 eor t3.16b, t3.16b, t5.16b
141 eor t7.16b, t7.16b, t9.16b
142 eor \rq\().16b, \rq\().16b, t3.16b
143 eor \rq\().16b, \rq\().16b, t7.16b
// Precomputation for the p64 path: load the higher powers of the hash key
// (HH..HH4, i.e. presumably H^2..H^5 — the load of SHASH itself is on a
// line not visible in this extract) and fold the high/low 64-bit halves of
// the key powers together (trn1/trn2 + eor) into SHASH2 and HH34, which are
// used for the Karatsuba middle products (a1+a0)(b1+b0).
146 .macro __pmull_pre_p64
148 ld1 {HH.2d-HH4.2d}, [x8]
150 trn1 SHASH2.2d, SHASH.2d, HH.2d
151 trn2 T1.2d, SHASH.2d, HH.2d
152 eor SHASH2.16b, SHASH2.16b, T1.16b
154 trn1 HH34.2d, HH3.2d, HH4.2d
155 trn2 T1.2d, HH3.2d, HH4.2d
156 eor HH34.16b, HH34.16b, T1.16b
// build the GHASH reduction constant in MASK; NOTE(review): MASK's initial
// value is loaded on a line not visible in this extract
159 shl MASK.2d, MASK.2d, #57
// Precomputation for the p8 (no 64x64 PMULL) fallback path: derive
// SHASH2 = SHASH ^ (SHASH rotated by 8 bytes), build the bit masks and
// TBL permutation vectors used by __pmull_p8_tail, and precompute the
// rotated copies of SHASH/SHASH2 (loop invariants).
162 .macro __pmull_pre_p8
163 ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
164 eor SHASH2.16b, SHASH2.16b, SHASH.16b
166 // k00_16 := 0x0000000000000000_000000000000ffff
167 // k32_48 := 0x00000000ffffffff_0000ffffffffffff
168 movi k32_48.2d, #0xffffffff
169 mov k32_48.h[2], k32_48.h[0]
170 ushr k00_16.2d, k32_48.2d, #32
172 // prepare the permutation vectors
// NOTE(review): the instructions that move x5 into perm1/T1 are on lines
// not visible in this extract
173 mov_q x5, 0x080f0e0d0c0b0a09
176 eor perm1.16b, perm1.16b, T1.16b
// perm2/perm3/T1 = perm1 rotated by 1, 2, 3 bytes (ushr + sli pairs)
177 ushr perm2.2d, perm1.2d, #8
178 ushr perm3.2d, perm1.2d, #16
179 ushr T1.2d, perm1.2d, #24
180 sli perm2.2d, perm1.2d, #56
181 sli perm3.2d, perm1.2d, #48
182 sli T1.2d, perm1.2d, #40
184 // precompute loop invariants
185 tbl sh1.16b, {SHASH.16b}, perm1.16b
186 tbl sh2.16b, {SHASH.16b}, perm2.16b
187 tbl sh3.16b, {SHASH.16b}, perm3.16b
188 tbl sh4.16b, {SHASH.16b}, T1.16b
189 ext ss1.8b, SHASH2.8b, SHASH2.8b, #1
190 ext ss2.8b, SHASH2.8b, SHASH2.8b, #2
191 ext ss3.8b, SHASH2.8b, SHASH2.8b, #3
192 ext ss4.8b, SHASH2.8b, SHASH2.8b, #4
196 // PMULL (64x64->128) based reduction for CPUs that can do
197 // it in a single instruction.
// Folds the 256-bit product in XH:XM:XL back to 128 bits modulo the GHASH
// polynomial, using two carryless multiplies by the constant in MASK.
// NOTE(review): several interior lines of this macro are not visible in
// this extract.
199 .macro __pmull_reduce_p64
200 pmull T2.1q, XL.1d, MASK.1d
201 eor XM.16b, XM.16b, T1.16b
206 eor XL.16b, XM.16b, T2.16b
207 ext T2.16b, XL.16b, XL.16b, #8
208 pmull XL.1q, XL.1d, MASK.1d
212 // Alternative reduction for CPUs that lack support for the
213 // 64x64->128 PMULL instruction
// Same modular reduction as __pmull_reduce_p64, but the multiplies by the
// reduction constant are expanded into shift-and-XOR sequences
// (shifts by 57/62/63 and 1/2/7 correspond to the polynomial's terms).
// NOTE(review): several interior lines of this macro are not visible in
// this extract.
215 .macro __pmull_reduce_p8
216 eor XM.16b, XM.16b, T1.16b
221 shl T1.2d, XL.2d, #57
222 shl T2.2d, XL.2d, #62
223 eor T2.16b, T2.16b, T1.16b
224 shl T1.2d, XL.2d, #63
225 eor T2.16b, T2.16b, T1.16b
226 ext T1.16b, XL.16b, XH.16b, #8
227 eor T2.16b, T2.16b, T1.16b
232 ushr T2.2d, XL.2d, #1
233 eor XH.16b, XH.16b, XL.16b
234 eor XL.16b, XL.16b, T2.16b
235 ushr T2.2d, T2.2d, #6
236 ushr XL.2d, XL.2d, #1
// Main GHASH update loop, parameterised on \pn (p64 or p8 multiply
// flavour).  Processes 4 blocks per iteration using the key powers
// H..H^4 (aggregated/Karatsuba products folded into one reduction),
// with a single-block tail loop at label 3.
// NOTE(review): many interleaving lines (branches, loads, reductions,
// .endm) are not visible in this extract.
239 .macro __pmull_ghash, pn
245 /* do the head block first, if supplied */
252 tbnz w0, #0, 2f // skip until #blocks is a
253 tbnz w0, #1, 2f // round multiple of 4
// 4x path: load 64 bytes of input and byte-swap each block to BE order
255 1: ld1 {XM3.16b-TT4.16b}, [x2], #64
259 rev64 T1.16b, XM3.16b
260 rev64 T2.16b, XH3.16b
261 rev64 TT4.16b, TT4.16b
262 rev64 TT3.16b, TT3.16b
// block 4 * H
264 ext IN1.16b, TT4.16b, TT4.16b, #8
265 ext XL3.16b, TT3.16b, TT3.16b, #8
267 eor TT4.16b, TT4.16b, IN1.16b
268 pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1
269 pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0
270 pmull XM2.1q, SHASH2.1d, TT4.1d // (a1 + a0)(b1 + b0)
// block 3 * H^2
272 eor TT3.16b, TT3.16b, XL3.16b
273 pmull2 XH3.1q, HH.2d, XL3.2d // a1 * b1
274 pmull XL3.1q, HH.1d, XL3.1d // a0 * b0
275 pmull2 XM3.1q, SHASH2.2d, TT3.2d // (a1 + a0)(b1 + b0)
277 ext IN1.16b, T2.16b, T2.16b, #8
278 eor XL2.16b, XL2.16b, XL3.16b
279 eor XH2.16b, XH2.16b, XH3.16b
280 eor XM2.16b, XM2.16b, XM3.16b
// block 2 * H^3
282 eor T2.16b, T2.16b, IN1.16b
283 pmull2 XH3.1q, HH3.2d, IN1.2d // a1 * b1
284 pmull XL3.1q, HH3.1d, IN1.1d // a0 * b0
285 pmull XM3.1q, HH34.1d, T2.1d // (a1 + a0)(b1 + b0)
287 eor XL2.16b, XL2.16b, XL3.16b
288 eor XH2.16b, XH2.16b, XH3.16b
289 eor XM2.16b, XM2.16b, XM3.16b
// (block 1 + running digest XL) * H^4
291 ext IN1.16b, T1.16b, T1.16b, #8
292 ext TT3.16b, XL.16b, XL.16b, #8
293 eor XL.16b, XL.16b, IN1.16b
294 eor T1.16b, T1.16b, TT3.16b
296 pmull2 XH.1q, HH4.2d, XL.2d // a1 * b1
297 eor T1.16b, T1.16b, XL.16b
298 pmull XL.1q, HH4.1d, XL.1d // a0 * b0
299 pmull2 XM.1q, HH34.2d, T1.2d // (a1 + a0)(b1 + b0)
// fold all four partial products into XH:XM:XL
301 eor XL.16b, XL.16b, XL2.16b
302 eor XH.16b, XH.16b, XH2.16b
303 eor XM.16b, XM.16b, XM2.16b
// Karatsuba fix-up before the modular reduction
305 eor T2.16b, XL.16b, XH.16b
306 ext T1.16b, XL.16b, XH.16b, #8
307 eor XM.16b, XM.16b, T2.16b
311 eor T2.16b, T2.16b, XH.16b
312 eor XL.16b, XL.16b, T2.16b
// single-block tail loop
318 2: ld1 {T1.2d}, [x2], #16
321 3: /* multiply XL by SHASH in GF(2^128) */
322 CPU_LE( rev64 T1.16b, T1.16b )
324 ext T2.16b, XL.16b, XL.16b, #8
325 ext IN1.16b, T1.16b, T1.16b, #8
326 eor T1.16b, T1.16b, T2.16b
327 eor XL.16b, XL.16b, IN1.16b
329 __pmull2_\pn XH, XL, SHASH // a1 * b1
330 eor T1.16b, T1.16b, XL.16b
331 __pmull_\pn XL, XL, SHASH // a0 * b0
332 __pmull_\pn XM, T1, SHASH2 // (a1 + a0)(b1 + b0)
334 4: eor T2.16b, XL.16b, XH.16b
335 ext T1.16b, XL.16b, XH.16b, #8
336 eor XM.16b, XM.16b, T2.16b
340 eor T2.16b, T2.16b, XH.16b
341 eor XL.16b, XL.16b, T2.16b
350 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
351 * struct ghash_key const *k, const char *head)
// Entry points for the two multiply flavours; their bodies (presumably
// expansions of __pmull_ghash for p64 and p8 — TODO confirm) are on
// lines not visible in this extract.
353 SYM_FUNC_START(pmull_ghash_update_p64)
355 SYM_FUNC_END(pmull_ghash_update_p64)
357 SYM_FUNC_START(pmull_ghash_update_p8)
359 SYM_FUNC_END(pmull_ghash_update_p8)
// Load the AES round keys: K0..K5 from \rk (plus \tmp — NOTE(review):
// \tmp's initialisation before its first use at line 388, and the
// adjustment after line 389, are on lines not visible in this extract),
// and the final keys KK..KM from the end of the schedule (\rounds * 16).
385 .macro load_round_keys, rounds, rk, tmp
387 ld1 {K0.4s-K3.4s}, [\rk]
388 ld1 {K4.4s-K5.4s}, [\tmp]
389 add \tmp, \rk, \rounds, lsl #4
391 ld1 {KK.4s-KM.4s}, [\tmp]
// One AES encryption round on a single block: AESE (SubBytes/ShiftRows +
// AddRoundKey) followed by AESMC (MixColumns).
394 .macro enc_round, state, key
395 aese \state\().16b, \key\().16b
396 aesmc \state\().16b, \state\().16b
// One AES round applied to four blocks s0..s3 in parallel; the macro body
// is on lines not visible in this extract.
399 .macro enc_qround, s0, s1, s2, s3, key
// AES-encrypt the single block in \state using the round keys loaded by
// load_round_keys, branching on \rounds bits to handle 128/192/256-bit
// key schedules.  NOTE(review): several interior lines (.endr, labels,
// extra rounds, .endm) are not visible in this extract.
406 .macro enc_block, state, rounds, rk, tmp
408 ld1 {K6.4s-K7.4s}, [\tmp], #32
// Fixed: the original read "K4 K5" — GAS .irp values are comma-separated,
// so the missing comma made "K4 K5" a single value and the expansion
// "enc_round \state, K4 K5" malformed (cf. the correct list at line 721).
409 .irp key, K0, K1, K2, K3, K4, K5
410 enc_round \state, \key
// bit 2 of \rounds set -> more than 10 rounds (not AES-128)
413 tbnz \rounds, #2, .Lnot128_\@
// final round: AESE without MixColumns, then add the last round key
420 aese \state\().16b, KL.16b
421 eor \state\().16b, \state\().16b, KM.16b
425 ld1 {K8.4s-K9.4s}, [\tmp], #32
428 ld1 {K6.4s-K7.4s}, [\tmp]
// bit 1 of \rounds clear -> AES-192, skip the extra AES-256 rounds
431 tbz \rounds, #1, .Lout192_\@
// Combined AES-GCM encrypt/decrypt body (parameterised by \enc), used by
// the pmull_gcm_encrypt/pmull_gcm_decrypt entry points.  Loads the key
// schedule and hash key powers, processes input 4 blocks at a time via
// pmull_gcm_enc_4x/pmull_gcm_ghash_4x, handles a sub-64-byte tail with
// overlapping loads/stores driven by .Lpermute_table, and finally
// computes/verifies the authentication tag.
// NOTE(review): a large number of interleaving lines (branches, counter
// maintenance, .endm) are not visible in this extract.
437 .macro pmull_gcm_do_crypt, enc
438 stp x29, x30, [sp, #-32]!
442 load_round_keys x7, x6, x8
// load H and its higher powers, and fold halves for Karatsuba middle terms
444 ld1 {SHASH.2d}, [x3], #16
445 ld1 {HH.2d-HH4.2d}, [x3]
447 trn1 SHASH2.2d, SHASH.2d, HH.2d
448 trn2 T1.2d, SHASH.2d, HH.2d
449 eor SHASH2.16b, SHASH2.16b, T1.16b
451 trn1 HH34.2d, HH3.2d, HH4.2d
452 trn2 T1.2d, HH3.2d, HH4.2d
453 eor HH34.16b, HH34.16b, T1.16b
457 cbz x0, 3f // tag only?
459 ldr w8, [x5, #12] // load lower counter
460 0: mov w9, #4 // max blocks per round
// NOTE: the embedded number on the line above is part of this extract's
// formatting; see line 464 below for the block count
462 0: mov w9, #4 // max blocks per round
464 lsr x10, x10, #4 // remaining blocks
// full 4-block iteration: bulk 64-byte load
471 ld1 {INP0.16b-INP3.16b}, [x2], #64
474 * Populate the four input registers right to left with up to 63 bytes
475 * of data, using overlapping loads to avoid branches.
477 * INP0 INP1 INP2 INP3
479 * 16 bytes | | | |xxxxxxxx|
480 * 17 bytes | | |xxxxxxxx|x |
481 * 47 bytes | |xxxxxxxx|xxxxxxxx|xxxxxxx |
484 * Note that this code may read up to 15 bytes before the start of
485 * the input. It is up to the calling code to ensure this is safe if
486 * this happens in the first iteration of the loop (i.e., when the
487 * input size is < 16 bytes)
491 csel x19, x19, x15, ne
492 adr_l x17, .Lpermute_table + 16
// x14/x15/x16 become per-register load strides (0 when that slot is empty)
502 csel x14, x15, xzr, gt
504 csel x15, x15, xzr, gt
506 csel x16, x19, xzr, gt
510 ld1 {INP0.16b}, [x2], x14
511 ld1 {INP1.16b}, [x2], x15
512 ld1 {INP2.16b}, [x2], x16
514 tbl INP3.16b, {INP3.16b}, T1.16b
519 bl pmull_gcm_ghash_4x
525 st1 {INP0.16b-INP3.16b}, [x1], #64
527 bl pmull_gcm_ghash_4x
// tag path: fold lengths[] into the digest, then encrypt it with the
// counter block to produce the GCM tag
531 3: ldp x19, x10, [sp, #24]
532 cbz x10, 5f // output tag?
534 ld1 {INP3.16b}, [x10] // load lengths[]
536 bl pmull_gcm_ghash_4x
538 mov w11, #(0x1 << 24) // BE '1U'
542 enc_block KS0, x7, x6, x12
544 ext XL.16b, XL.16b, XL.16b, #8
546 eor XL.16b, XL.16b, KS0.16b
549 st1 {XL.16b}, [x10] // store tag
// decrypt path: constant-time comparison of computed vs supplied tag,
// truncated to authsize; returns 0 on match, -1 on mismatch (in w0)
551 ldp x11, x12, [sp, #40] // load tag pointer and authsize
552 adr_l x17, .Lpermute_table
553 ld1 {KS0.16b}, [x11] // load supplied tag
555 ld1 {KS1.16b}, [x17] // load permute vector
557 cmeq XL.16b, XL.16b, KS0.16b // compare tags
558 mvn XL.16b, XL.16b // -1 for fail, 0 for pass
559 tbl XL.16b, {XL.16b}, KS1.16b // keep authsize bytes only
560 sminv b0, XL.16b // signed minimum across XL
561 smov w0, v0.b[0] // return b0
564 4: ldp x29, x30, [sp], #32
569 str w8, [x5, #12] // store lower counter
// partial-block tail: use permute vectors for overlapping loads/stores
573 6: ld1 {T1.16b-T2.16b}, [x17], #32 // permute vectors
574 sub x17, x17, x19, lsl #1
579 7: ld1 {INP2.16b}, [x1]
580 tbx INP2.16b, {INP3.16b}, T1.16b
581 mov INP3.16b, INP2.16b
585 st1 {INP0.16b}, [x1], x14
586 st1 {INP1.16b}, [x1], x15
587 st1 {INP2.16b}, [x1], x16
588 tbl INP3.16b, {INP3.16b}, T1.16b
589 tbx INP3.16b, {INP2.16b}, T2.16b
590 8: st1 {INP3.16b}, [x1]
594 tbl INP3.16b, {INP3.16b}, T1.16b // clear non-data bits
595 bl pmull_gcm_ghash_4x
601 * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
602 * struct ghash_key const *k, u64 dg[], u8 ctr[],
603 * int rounds, u8 tag)
// Thin entry points; their bodies (presumably pmull_gcm_do_crypt
// expansions for encrypt/decrypt — TODO confirm) are on lines not visible
// in this extract.
605 SYM_FUNC_START(pmull_gcm_encrypt)
607 SYM_FUNC_END(pmull_gcm_encrypt)
610 * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
611 * struct ghash_key const *k, u64 dg[], u8 ctr[],
612 * int rounds, u8 tag)
614 SYM_FUNC_START(pmull_gcm_decrypt)
616 SYM_FUNC_END(pmull_gcm_decrypt)
// GHASH 1..4 blocks held in INP0..INP3 into the running digest XL,
// using the key powers H..H^4.  w9 selects how many blocks are valid;
// the .Lgh1/.Lgh2/.Lgh3 entry labels let the <4-block cases skip the
// higher-power products.
// NOTE(review): several interleaving lines (branches, reduction, ret,
// and the code between labels) are not visible in this extract.
618 SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
// rebuild the reduction constant; MASK's initial load is not visible here
620 shl MASK.2d, MASK.2d, #57
// byte-swap the input blocks to BE bit order for GF(2^128) arithmetic
622 rev64 T1.16b, INP0.16b
623 rev64 T2.16b, INP1.16b
624 rev64 TT3.16b, INP2.16b
625 rev64 TT4.16b, INP3.16b
627 ext XL.16b, XL.16b, XL.16b, #8
629 tbz w9, #2, 0f // <4 blocks?
635 tbz w9, #0, 1f // 2 blocks?
636 tbz w9, #1, 2f // 1 block?
// <4-block cases: fold the digest into the first valid block
638 eor T2.16b, T2.16b, XL.16b
639 ext T1.16b, T2.16b, T2.16b, #8
642 1: eor TT3.16b, TT3.16b, XL.16b
643 ext T2.16b, TT3.16b, TT3.16b, #8
646 2: eor TT4.16b, TT4.16b, XL.16b
647 ext IN1.16b, TT4.16b, TT4.16b, #8
// 4-block case: (block 1 + digest) * H^4
651 eor T1.16b, T1.16b, XL.16b
652 ext IN1.16b, T1.16b, T1.16b, #8
654 pmull2 XH2.1q, HH4.2d, IN1.2d // a1 * b1
655 eor T1.16b, T1.16b, IN1.16b
656 pmull XL2.1q, HH4.1d, IN1.1d // a0 * b0
657 pmull2 XM2.1q, HH34.2d, T1.2d // (a1 + a0)(b1 + b0)
// block 2 * H^3
659 ext T1.16b, T2.16b, T2.16b, #8
660 .Lgh3: eor T2.16b, T2.16b, T1.16b
661 pmull2 XH.1q, HH3.2d, T1.2d // a1 * b1
662 pmull XL.1q, HH3.1d, T1.1d // a0 * b0
663 pmull XM.1q, HH34.1d, T2.1d // (a1 + a0)(b1 + b0)
665 eor XH2.16b, XH2.16b, XH.16b
666 eor XL2.16b, XL2.16b, XL.16b
667 eor XM2.16b, XM2.16b, XM.16b
// block 3 * H^2
669 ext T2.16b, TT3.16b, TT3.16b, #8
670 .Lgh2: eor TT3.16b, TT3.16b, T2.16b
671 pmull2 XH.1q, HH.2d, T2.2d // a1 * b1
672 pmull XL.1q, HH.1d, T2.1d // a0 * b0
673 pmull2 XM.1q, SHASH2.2d, TT3.2d // (a1 + a0)(b1 + b0)
675 eor XH2.16b, XH2.16b, XH.16b
676 eor XL2.16b, XL2.16b, XL.16b
677 eor XM2.16b, XM2.16b, XM.16b
// block 4 * H
679 ext IN1.16b, TT4.16b, TT4.16b, #8
680 .Lgh1: eor TT4.16b, TT4.16b, IN1.16b
681 pmull XL.1q, SHASH.1d, IN1.1d // a0 * b0
682 pmull2 XH.1q, SHASH.2d, IN1.2d // a1 * b1
683 pmull XM.1q, SHASH2.1d, TT4.1d // (a1 + a0)(b1 + b0)
// accumulate and perform the Karatsuba fix-up before reduction
685 eor XH.16b, XH.16b, XH2.16b
686 eor XL.16b, XL.16b, XL2.16b
687 eor XM.16b, XM.16b, XM2.16b
689 eor T2.16b, XL.16b, XH.16b
690 ext T1.16b, XL.16b, XH.16b, #8
691 eor XM.16b, XM.16b, T2.16b
695 eor T2.16b, T2.16b, XH.16b
696 eor XL.16b, XL.16b, T2.16b
699 SYM_FUNC_END(pmull_gcm_ghash_4x)
// Generate 4 counter-mode keystream blocks (KS0..KS3) with AES and XOR
// them into INP0..INP3.  Branches on the round count in x7 to handle
// 128/192/256-bit key schedules.
// NOTE(review): many interleaving lines (counter increments, .irp/.endr
// pairs, labels, final rounds, ret) are not visible in this extract.
701 SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
702 ld1 {KS0.16b}, [x5] // load upper counter
714 ins KS0.s[3], w10 // set lower counter
719 add x10, x6, #96 // round key pointer
720 ld1 {K6.4s-K7.4s}, [x10], #32
721 .irp key, K0, K1, K2, K3, K4, K5
722 enc_qround KS0, KS1, KS2, KS3, \key
725 tbnz x7, #2, .Lnot128 // not AES-128?
728 ld1 {K8.4s-K9.4s}, [x10], #32
730 enc_qround KS0, KS1, KS2, KS3, \key
732 ld1 {K6.4s-K7.4s}, [x10]
734 enc_qround KS0, KS1, KS2, KS3, \key
742 enc_qround KS0, KS1, KS2, KS3, \key
746 enc_qround KS0, KS1, KS2, KS3, KK
// add the final round key to each keystream block
753 eor KS0.16b, KS0.16b, KM.16b
754 eor KS1.16b, KS1.16b, KM.16b
755 eor KS2.16b, KS2.16b, KM.16b
756 eor KS3.16b, KS3.16b, KM.16b
// XOR the keystream into the data blocks (CTR-mode encryption)
758 eor INP0.16b, INP0.16b, KS0.16b
759 eor INP1.16b, INP1.16b, KS1.16b
760 eor INP2.16b, INP2.16b, KS2.16b
761 eor INP3.16b, INP3.16b, KS3.16b
764 SYM_FUNC_END(pmull_gcm_enc_4x)
// Read-only permutation table (presumably .Lpermute_table, referenced via
// adr_l at lines 492/552 — the label line itself is not visible in this
// extract): runs of 0xff padding followed by the identity byte sequence
// 0x00..0x0f, used with TBL/TBX for partial-block loads, stores, and
// tag truncation.
766 .section ".rodata", "a"
769 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
770 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
771 .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
772 .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
773 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
774 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
775 .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
776 .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf