/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm, using ARMv8 NEON
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
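
/*
 * Register macros.  v0-v7 carry block data and v16-v31 hold the S-box,
 * leaving only v8-v15 for scratch, so RX0/RX1/RIV/RKEY must alias
 * RTMP4-RTMP7; the two groups are never live at the same time.  The
 * concrete assignment below is one consistent choice (RIV and RKEY are
 * kept clear of the registers ROUND4/ROUND8 clobber), not necessarily
 * the upstream one.
 */

#define RTMP0	v8
#define RTMP1	v9
#define RTMP2	v10
#define RTMP3	v11
#define RTMP4	v12
#define RTMP5	v13
#define RTMP6	v14
#define RTMP7	v15

#define RX0	v12
#define RX1	v13
#define RIV	v14
#define RKEY	v15

/* Helper macros. */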
#define SM4_PREPARE()                                       \
	adr_l		x5, crypto_sm4_sbox;                \
	ld1		{v16.16b-v19.16b}, [x5], #64;       \
	ld1		{v20.16b-v23.16b}, [x5], #64;       \
	ld1		{v24.16b-v27.16b}, [x5], #64;       \
	ld1		{v28.16b-v31.16b}, [x5];
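
/*
 * SM4_PREPARE() pins the whole 256-byte crypto_sm4_sbox in v16-v31, so
 * the ROUND macros can evaluate the non-linear layer as four chained
 * tbl/tbx lookups (64 table bytes each) without touching memory.
 */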
#define transpose_4x4(s0, s1, s2, s3)                       \
	zip1		RTMP0.4s, s0.4s, s1.4s;             \
	zip1		RTMP1.4s, s2.4s, s3.4s;             \
	zip2		RTMP2.4s, s0.4s, s1.4s;             \
	zip2		RTMP3.4s, s2.4s, s3.4s;             \
	zip1		s0.2d, RTMP0.2d, RTMP1.2d;          \
	zip2		s1.2d, RTMP0.2d, RTMP1.2d;          \
	zip1		s2.2d, RTMP2.2d, RTMP3.2d;          \
	zip2		s3.2d, RTMP2.2d, RTMP3.2d;
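
/*
 * transpose_4x4 turns four blocks held one-per-register into column
 * form: zip1/zip2 on .4s interleave 32-bit words, the following
 * zip1/zip2 on .2d interleave 64-bit halves, together transposing the
 * 4x4 word matrix in s0-s3.  The _2x variant below repeats this for
 * two independent four-block groups.
 */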
#define transpose_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7)    \
	zip1		RTMP0.4s, s0.4s, s1.4s;             \
	zip1		RTMP1.4s, s2.4s, s3.4s;             \
	zip2		RTMP2.4s, s0.4s, s1.4s;             \
	zip2		RTMP3.4s, s2.4s, s3.4s;             \
	zip1		RTMP4.4s, s4.4s, s5.4s;             \
	zip1		RTMP5.4s, s6.4s, s7.4s;             \
	zip2		RTMP6.4s, s4.4s, s5.4s;             \
	zip2		RTMP7.4s, s6.4s, s7.4s;             \
	zip1		s0.2d, RTMP0.2d, RTMP1.2d;          \
	zip2		s1.2d, RTMP0.2d, RTMP1.2d;          \
	zip1		s2.2d, RTMP2.2d, RTMP3.2d;          \
	zip2		s3.2d, RTMP2.2d, RTMP3.2d;          \
	zip1		s4.2d, RTMP4.2d, RTMP5.2d;          \
	zip2		s5.2d, RTMP4.2d, RTMP5.2d;          \
	zip1		s6.2d, RTMP6.2d, RTMP7.2d;          \
	zip2		s7.2d, RTMP6.2d, RTMP7.2d;
#define rotate_clockwise_4x4(s0, s1, s2, s3)                \
	zip1		RTMP0.4s, s1.4s, s0.4s;             \
	zip2		RTMP1.4s, s1.4s, s0.4s;             \
	zip1		RTMP2.4s, s3.4s, s2.4s;             \
	zip2		RTMP3.4s, s3.4s, s2.4s;             \
	zip1		s0.2d, RTMP2.2d, RTMP0.2d;          \
	zip2		s1.2d, RTMP2.2d, RTMP0.2d;          \
	zip1		s2.2d, RTMP3.2d, RTMP1.2d;          \
	zip2		s3.2d, RTMP3.2d, RTMP1.2d;
#define rotate_clockwise_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7) \
	zip1		RTMP0.4s, s1.4s, s0.4s;             \
	zip1		RTMP2.4s, s3.4s, s2.4s;             \
	zip2		RTMP1.4s, s1.4s, s0.4s;             \
	zip2		RTMP3.4s, s3.4s, s2.4s;             \
	zip1		RTMP4.4s, s5.4s, s4.4s;             \
	zip1		RTMP6.4s, s7.4s, s6.4s;             \
	zip2		RTMP5.4s, s5.4s, s4.4s;             \
	zip2		RTMP7.4s, s7.4s, s6.4s;             \
	zip1		s0.2d, RTMP2.2d, RTMP0.2d;          \
	zip2		s1.2d, RTMP2.2d, RTMP0.2d;          \
	zip1		s2.2d, RTMP3.2d, RTMP1.2d;          \
	zip2		s3.2d, RTMP3.2d, RTMP1.2d;          \
	zip1		s4.2d, RTMP6.2d, RTMP4.2d;          \
	zip2		s5.2d, RTMP6.2d, RTMP4.2d;          \
	zip1		s6.2d, RTMP7.2d, RTMP5.2d;          \
	zip2		s7.2d, RTMP7.2d, RTMP5.2d;
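
/*
 * rotate_clockwise_4x4 is the transpose with both zip operand orders
 * swapped, i.e. a 90-degree clockwise rotation of the 4x4 word matrix.
 * Combined with the final rev32, it returns the state to block order
 * while also applying SM4's reversed output word order
 * (X35, X34, X33, X32).  The _2x variant again does two groups at once.
 */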
#define ROUND4(round, s0, s1, s2, s3)                       \
	dup		RX0.4s, RKEY.s[round];              \
	/* rk ^ s1 ^ s2 ^ s3 */                             \
	eor		RTMP1.16b, s2.16b, s3.16b;          \
	eor		RX0.16b, RX0.16b, s1.16b;           \
	eor		RX0.16b, RX0.16b, RTMP1.16b;        \
                                                            \
	/* sbox, non-linear part */                         \
	movi		RTMP3.16b, #64;	/* sizeof(sbox) / 4 */ \
	tbl		RTMP0.16b, {v16.16b-v19.16b}, RX0.16b; \
	sub		RX0.16b, RX0.16b, RTMP3.16b;        \
	tbx		RTMP0.16b, {v20.16b-v23.16b}, RX0.16b; \
	sub		RX0.16b, RX0.16b, RTMP3.16b;        \
	tbx		RTMP0.16b, {v24.16b-v27.16b}, RX0.16b; \
	sub		RX0.16b, RX0.16b, RTMP3.16b;        \
	tbx		RTMP0.16b, {v28.16b-v31.16b}, RX0.16b; \
                                                            \
	/* linear part */                                   \
	shl		RTMP1.4s, RTMP0.4s, #8;             \
	shl		RTMP2.4s, RTMP0.4s, #16;            \
	shl		RTMP3.4s, RTMP0.4s, #24;            \
	sri		RTMP1.4s, RTMP0.4s, #(32-8);        \
	sri		RTMP2.4s, RTMP0.4s, #(32-16);       \
	sri		RTMP3.4s, RTMP0.4s, #(32-24);       \
	/* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */        \
	eor		RTMP1.16b, RTMP1.16b, RTMP0.16b;    \
	eor		RTMP1.16b, RTMP1.16b, RTMP2.16b;    \
	/* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */    \
	eor		RTMP3.16b, RTMP3.16b, RTMP0.16b;    \
	shl		RTMP2.4s, RTMP1.4s, #2;             \
	sri		RTMP2.4s, RTMP1.4s, #(32-2);        \
	eor		RTMP3.16b, RTMP3.16b, RTMP2.16b;    \
	/* s0 ^= RTMP3 */                                   \
	eor		s0.16b, s0.16b, RTMP3.16b;
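
/*
 * One SM4 round computes s0 ^= L(tau(rk ^ s1 ^ s2 ^ s3)), where tau is
 * the byte-wise S-box and L(x) = x ^ rol32(x, 2) ^ rol32(x, 10) ^
 * rol32(x, 18) ^ rol32(x, 24).  ROUND4 evaluates L from two partial
 * sums: with t = x ^ rol32(x, 8) ^ rol32(x, 16) it holds that
 * L(x) = x ^ rol32(x, 24) ^ rol32(t, 2), saving two rotates.
 */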
#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3)                   \
	mov		x6, 8;                              \
4:	ld1		{RKEY.4s}, [x0], #16;               \
	subs		x6, x6, #1;                         \
                                                            \
	ROUND4(0, b0, b1, b2, b3);                          \
	ROUND4(1, b1, b2, b3, b0);                          \
	ROUND4(2, b2, b3, b0, b1);                          \
	ROUND4(3, b3, b0, b1, b2);                          \
                                                            \
	bne		4b;                                 \
                                                            \
	rev32		b0.16b, b0.16b;                     \
	rev32		b1.16b, b1.16b;                     \
	rev32		b2.16b, b2.16b;                     \
	rev32		b3.16b, b3.16b;                     \
                                                            \
	rotate_clockwise_4x4(b0, b1, b2, b3);               \
                                                            \
	/* repoint to rkey */                               \
	sub		x0, x0, #128;
#define SM4_CRYPT_BLK4(b0, b1, b2, b3)                      \
	rev32		b0.16b, b0.16b;                     \
	rev32		b1.16b, b1.16b;                     \
	rev32		b2.16b, b2.16b;                     \
	rev32		b3.16b, b3.16b;                     \
	SM4_CRYPT_BLK4_BE(b0, b1, b2, b3);
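
/*
 * SM4_CRYPT_BLK4 byte-swaps little-endian input first; the _BE variant
 * is entered directly when the caller already holds big-endian words
 * (freshly built CTR values or rev32'ed ciphertext).  Either way the
 * key loop runs 8 x 4 = 32 rounds, then rewinds x0 past the 128 bytes
 * of round keys it consumed.
 */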
#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3)       \
	/* rk ^ s1 ^ s2 ^ s3 */                             \
	dup		RX0.4s, RKEY.s[round];              \
	eor		RTMP0.16b, s2.16b, s3.16b;          \
	mov		RX1.16b, RX0.16b;                   \
	eor		RTMP1.16b, t2.16b, t3.16b;          \
	eor		RX0.16b, RX0.16b, s1.16b;           \
	eor		RX1.16b, RX1.16b, t1.16b;           \
	eor		RX0.16b, RX0.16b, RTMP0.16b;        \
	eor		RX1.16b, RX1.16b, RTMP1.16b;        \
                                                            \
	/* sbox, non-linear part */                         \
	movi		RTMP3.16b, #64;	/* sizeof(sbox) / 4 */ \
	tbl		RTMP0.16b, {v16.16b-v19.16b}, RX0.16b; \
	tbl		RTMP1.16b, {v16.16b-v19.16b}, RX1.16b; \
	sub		RX0.16b, RX0.16b, RTMP3.16b;        \
	sub		RX1.16b, RX1.16b, RTMP3.16b;        \
	tbx		RTMP0.16b, {v20.16b-v23.16b}, RX0.16b; \
	tbx		RTMP1.16b, {v20.16b-v23.16b}, RX1.16b; \
	sub		RX0.16b, RX0.16b, RTMP3.16b;        \
	sub		RX1.16b, RX1.16b, RTMP3.16b;        \
	tbx		RTMP0.16b, {v24.16b-v27.16b}, RX0.16b; \
	tbx		RTMP1.16b, {v24.16b-v27.16b}, RX1.16b; \
	sub		RX0.16b, RX0.16b, RTMP3.16b;        \
	sub		RX1.16b, RX1.16b, RTMP3.16b;        \
	tbx		RTMP0.16b, {v28.16b-v31.16b}, RX0.16b; \
	tbx		RTMP1.16b, {v28.16b-v31.16b}, RX1.16b; \
                                                            \
	/* linear part */                                   \
	shl		RX0.4s, RTMP0.4s, #8;               \
	shl		RX1.4s, RTMP1.4s, #8;               \
	shl		RTMP2.4s, RTMP0.4s, #16;            \
	shl		RTMP3.4s, RTMP1.4s, #16;            \
	sri		RX0.4s, RTMP0.4s, #(32 - 8);        \
	sri		RX1.4s, RTMP1.4s, #(32 - 8);        \
	sri		RTMP2.4s, RTMP0.4s, #(32 - 16);     \
	sri		RTMP3.4s, RTMP1.4s, #(32 - 16);     \
	/* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */           \
	eor		RX0.16b, RX0.16b, RTMP0.16b;        \
	eor		RX1.16b, RX1.16b, RTMP1.16b;        \
	eor		RX0.16b, RX0.16b, RTMP2.16b;        \
	eor		RX1.16b, RX1.16b, RTMP3.16b;        \
	/* RTMP0/1 ^= x ^ rol32(x, 24) ^ rol32(RX, 2) */    \
	shl		RTMP2.4s, RTMP0.4s, #24;            \
	shl		RTMP3.4s, RTMP1.4s, #24;            \
	sri		RTMP2.4s, RTMP0.4s, #(32 - 24);     \
	sri		RTMP3.4s, RTMP1.4s, #(32 - 24);     \
	eor		RTMP0.16b, RTMP0.16b, RTMP2.16b;    \
	eor		RTMP1.16b, RTMP1.16b, RTMP3.16b;    \
	shl		RTMP2.4s, RX0.4s, #2;               \
	shl		RTMP3.4s, RX1.4s, #2;               \
	sri		RTMP2.4s, RX0.4s, #(32 - 2);        \
	sri		RTMP3.4s, RX1.4s, #(32 - 2);        \
	eor		RTMP0.16b, RTMP0.16b, RTMP2.16b;    \
	eor		RTMP1.16b, RTMP1.16b, RTMP3.16b;    \
	/* s0/t0 ^= RTMP0/1 */                              \
	eor		s0.16b, s0.16b, RTMP0.16b;          \
	eor		t0.16b, t0.16b, RTMP1.16b;
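
/*
 * ROUND8 is ROUND4 applied to two independent four-block groups
 * (s0-s3 and t0-t3) with the instruction streams interleaved, so the
 * second group's tbl/shl/eor work hides the result latency of the
 * first.
 */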
#define SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7) \
	rev32		b0.16b, b0.16b;                     \
	rev32		b1.16b, b1.16b;                     \
	rev32		b2.16b, b2.16b;                     \
	rev32		b3.16b, b3.16b;                     \
	rev32		b4.16b, b4.16b;                     \
	rev32		b5.16b, b5.16b;                     \
	rev32		b6.16b, b6.16b;                     \
	rev32		b7.16b, b7.16b;                     \
                                                            \
	mov		x6, 8;                              \
8:	ld1		{RKEY.4s}, [x0], #16;               \
	subs		x6, x6, #1;                         \
                                                            \
	ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7);          \
	ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4);          \
	ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5);          \
	ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6);          \
                                                            \
	bne		8b;                                 \
                                                            \
	rev32		b0.16b, b0.16b;                     \
	rev32		b1.16b, b1.16b;                     \
	rev32		b2.16b, b2.16b;                     \
	rev32		b3.16b, b3.16b;                     \
	rev32		b4.16b, b4.16b;                     \
	rev32		b5.16b, b5.16b;                     \
	rev32		b6.16b, b6.16b;                     \
	rev32		b7.16b, b7.16b;                     \
                                                            \
	/* repoint to rkey */                               \
	sub		x0, x0, #128;
#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7)      \
	SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7); \
	rotate_clockwise_4x4_2x(b0, b1, b2, b3, b4, b5, b6, b7);
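
/*
 * The _norotate variant leaves the state in transposed order.  CBC
 * decryption uses it so the final rotation can be done with two
 * rotate_clockwise_4x4 calls that touch only RTMP0-RTMP3, keeping RIV
 * intact (see the comment in sm4_neon_cbc_dec below).
 */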
.align 3
SYM_FUNC_START(sm4_neon_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 */
	SM4_PREPARE()

.Lcrypt_loop_8x:
	sub		w3, w3, #8
	tbnz		w3, #31, .Lcrypt_4x
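
	/*
	 * Bulk path: 8 blocks per iteration.  ld4 de-interleaves the four
	 * 32-bit words of each block across v0-v3 (and v4-v7), which is
	 * already the transposed layout the ROUND macros expect, so no
	 * explicit transpose_4x4 is needed on input.
	 */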
	ld4		{v0.4s-v3.4s}, [x2], #64
	ld4		{v4.4s-v7.4s}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w3, .Lcrypt_end
	b		.Lcrypt_loop_8x

.Lcrypt_4x:
	add		w3, w3, #8
	cmp		w3, #4
	blt		.Lcrypt_tail

	sub		w3, w3, #4

	ld4		{v0.4s-v3.4s}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w3, .Lcrypt_end

.Lcrypt_tail:
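	/*
	 * Tail: 1-3 remaining blocks.  Only the blocks that exist are
	 * loaded and stored; the cmp against 2 sets flags that the
	 * blt/beq pairs reuse around both the loads and the stores, and
	 * any uninitialised registers are encrypted but never written out.
	 */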
	cmp		w3, #2
	ld1		{v0.16b}, [x2], #16
	blt		.Lcrypt_tail_load_done
	ld1		{v1.16b}, [x2], #16
	beq		.Lcrypt_tail_load_done
	ld1		{v2.16b}, [x2], #16

.Lcrypt_tail_load_done:
	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	cmp		w3, #2
	st1		{v0.16b}, [x1], #16
	blt		.Lcrypt_end
	st1		{v1.16b}, [x1], #16
	beq		.Lcrypt_end
	st1		{v2.16b}, [x1], #16

.Lcrypt_end:
	ret
SYM_FUNC_END(sm4_neon_crypt)
.align 3
SYM_FUNC_START(sm4_neon_cbc_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE()

	ld1		{RIV.16b}, [x3]

.Lcbc_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcbc_dec_4x
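
	/*
	 * CBC decryption: P[i] = D(C[i]) ^ C[i-1].  All eight ciphertext
	 * blocks are decrypted first, then the source is re-read so each
	 * result can be XORed with the preceding ciphertext block; RIV
	 * carries C[-1] across loop iterations and calls.
	 */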
	ld4		{v0.4s-v3.4s}, [x2], #64
	ld4		{v4.4s-v7.4s}, [x2]

	SM4_CRYPT_BLK8_norotate(v0, v1, v2, v3, v4, v5, v6, v7)

	/* Avoid overwriting the RIV register */
	rotate_clockwise_4x4(v0, v1, v2, v3)
	rotate_clockwise_4x4(v4, v5, v6, v7)

	sub		x2, x2, #64

	eor		v0.16b, v0.16b, RIV.16b

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1		{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor		v1.16b, v1.16b, RTMP0.16b
	eor		v2.16b, v2.16b, RTMP1.16b
	eor		v3.16b, v3.16b, RTMP2.16b
	eor		v4.16b, v4.16b, RTMP3.16b
	eor		v5.16b, v5.16b, RTMP4.16b
	eor		v6.16b, v6.16b, RTMP5.16b
	eor		v7.16b, v7.16b, RTMP6.16b

	mov		RIV.16b, RTMP7.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lcbc_dec_end
	b		.Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcbc_dec_tail

	sub		w4, w4, #4
	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v4.16b, v0.16b
	rev32		v5.16b, v1.16b
	rev32		v6.16b, v2.16b
	rev32		v7.16b, v3.16b

	transpose_4x4(v4, v5, v6, v7)

	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

	eor		v4.16b, v4.16b, RIV.16b
	eor		v5.16b, v5.16b, v0.16b
	eor		v6.16b, v6.16b, v1.16b
	eor		v7.16b, v7.16b, v2.16b

	mov		RIV.16b, v3.16b

	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lcbc_dec_end
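
	/*
	 * The 4x and tail paths decrypt byte-swapped copies in v4-v7 so
	 * that v0-v3 keep the raw ciphertext needed for the chaining XOR
	 * and for the next IV.
	 */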
.Lcbc_dec_tail:
	cmp		w4, #2
	ld1		{v0.16b}, [x2], #16
	blt		.Lcbc_dec_tail_load_done
	ld1		{v1.16b}, [x2], #16
	beq		.Lcbc_dec_tail_load_done
	ld1		{v2.16b}, [x2], #16

.Lcbc_dec_tail_load_done:
	rev32		v4.16b, v0.16b
	rev32		v5.16b, v1.16b
	rev32		v6.16b, v2.16b

	transpose_4x4(v4, v5, v6, v7)

	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

	cmp		w4, #2
	eor		v4.16b, v4.16b, RIV.16b
	mov		RIV.16b, v0.16b
	st1		{v4.16b}, [x1], #16
	blt		.Lcbc_dec_end

	eor		v5.16b, v5.16b, v0.16b
	mov		RIV.16b, v1.16b
	st1		{v5.16b}, [x1], #16
	beq		.Lcbc_dec_end

	eor		v6.16b, v6.16b, v1.16b
	mov		RIV.16b, v2.16b
	st1		{v6.16b}, [x1], #16

.Lcbc_dec_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]
	ret
SYM_FUNC_END(sm4_neon_cbc_dec)
.align 3
SYM_FUNC_START(sm4_neon_cfb_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE()

	ld1		{v0.16b}, [x3]

.Lcfb_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcfb_dec_4x
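
	/*
	 * CFB decryption: P[i] = E(C[i-1]) ^ C[i].  The keystream inputs
	 * are the IV followed by the first seven ciphertext blocks, so
	 * the IV sits in v0 while v1-v3 and v4-v7 are loaded from the
	 * source; the source pointer is then rewound to re-read all
	 * eight ciphertext blocks for the XOR step.
	 */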
	ld1		{v1.16b-v3.16b}, [x2], #48
	ld4		{v4.4s-v7.4s}, [x2]

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	sub		x2, x2, #48
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1		{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	eor		v4.16b, v4.16b, RTMP4.16b
	eor		v5.16b, v5.16b, RTMP5.16b
	eor		v6.16b, v6.16b, RTMP6.16b
	eor		v7.16b, v7.16b, RTMP7.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	mov		v0.16b, RTMP7.16b

	cbz		w4, .Lcfb_dec_end
	b		.Lcfb_dec_loop_8x

.Lcfb_dec_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcfb_dec_tail

	sub		w4, w4, #4
	ld1		{v4.16b-v7.16b}, [x2], #64

	rev32		v0.16b, v0.16b		/* v0 is IV register */
	rev32		v1.16b, v4.16b
	rev32		v2.16b, v5.16b
	rev32		v3.16b, v6.16b

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4_BE(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	mov		v0.16b, v7.16b

	cbz		w4, .Lcfb_dec_end
.Lcfb_dec_tail:
	cmp		w4, #2
	ld1		{v4.16b}, [x2], #16
	blt		.Lcfb_dec_tail_load_done
	ld1		{v5.16b}, [x2], #16
	beq		.Lcfb_dec_tail_load_done
	ld1		{v6.16b}, [x2], #16

.Lcfb_dec_tail_load_done:
	rev32		v0.16b, v0.16b		/* v0 is IV register */
	rev32		v1.16b, v4.16b
	rev32		v2.16b, v5.16b

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4_BE(v0, v1, v2, v3)

	cmp		w4, #2
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x1], #16
	mov		v0.16b, v4.16b
	blt		.Lcfb_dec_end

	eor		v1.16b, v1.16b, v5.16b
	st1		{v1.16b}, [x1], #16
	mov		v0.16b, v5.16b
	beq		.Lcfb_dec_end

	eor		v2.16b, v2.16b, v6.16b
	st1		{v2.16b}, [x1], #16
	mov		v0.16b, v6.16b

.Lcfb_dec_end:
	/* store new IV */
	st1		{v0.16b}, [x3]
	ret
SYM_FUNC_END(sm4_neon_cfb_dec)
.align 3
SYM_FUNC_START(sm4_neon_ctr_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE()

	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

.Lctr_crypt_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lctr_crypt_4x

#define inc_le128(vctr)                             \
		mov		vctr.d[1], x8;      \
		mov		vctr.d[0], x7;      \
		adds		x8, x8, #1;         \
		rev64		vctr.16b, vctr.16b; \
		adc		x7, x7, xzr;
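
	/*
	 * The 128-bit counter is kept native-endian in x7 (high half) and
	 * x8 (low half); inc_le128 materialises the current value
	 * big-endian in a vector register and post-increments x8/x7 with
	 * adds/adc.  Since adds clobbers the flags, the tail code below
	 * must re-issue cmp after each inc_le128.
	 */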
	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */
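
	/*
	 * The counter blocks are built in normal block order, so unlike
	 * the ld4-loaded paths they still need an explicit transpose
	 * before the ROUND macros can treat v0-v3/v4-v7 as word columns.
	 */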
	transpose_4x4_2x(v0, v1, v2, v3, v4, v5, v6, v7)

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1		{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	eor		v4.16b, v4.16b, RTMP4.16b
	eor		v5.16b, v5.16b, RTMP5.16b
	eor		v6.16b, v6.16b, RTMP6.16b
	eor		v7.16b, v7.16b, RTMP7.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lctr_crypt_end
	b		.Lctr_crypt_loop_8x

.Lctr_crypt_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lctr_crypt_tail

	sub		w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1		{v4.16b-v7.16b}, [x2], #64

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lctr_crypt_end

.Lctr_crypt_tail:
	/* inc_le128 will change the sign bit */
	ld1		{v4.16b}, [x2], #16
	inc_le128(v0)
	cmp		w4, #2
	blt		.Lctr_crypt_tail_load_done

	ld1		{v5.16b}, [x2], #16
	inc_le128(v1)
	cmp		w4, #2
	beq		.Lctr_crypt_tail_load_done

	ld1		{v6.16b}, [x2], #16
	inc_le128(v2)

.Lctr_crypt_tail_load_done:
	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	cmp		w4, #2
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x1], #16
	blt		.Lctr_crypt_end

	eor		v1.16b, v1.16b, v5.16b
	st1		{v1.16b}, [x1], #16
	beq		.Lctr_crypt_end

	eor		v2.16b, v2.16b, v6.16b
	st1		{v2.16b}, [x1], #16

.Lctr_crypt_end:
	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

	ret
SYM_FUNC_END(sm4_neon_ctr_crypt)