/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

        SHASH           .req    v0
        SHASH2          .req    v1
        T1              .req    v2
        T2              .req    v3
        MASK            .req    v4
        XM              .req    v5
        XL              .req    v6
        XH              .req    v7
        IN1             .req    v7

        k00_16          .req    v8
        k32_48          .req    v9

        t3              .req    v10
        t4              .req    v11
        t5              .req    v12
        t6              .req    v13
        t7              .req    v14
        t8              .req    v15
        t9              .req    v16

        perm1           .req    v17
        perm2           .req    v18
        perm3           .req    v19

        sh1             .req    v20
        sh2             .req    v21
        sh3             .req    v22
        sh4             .req    v23

        ss1             .req    v24
        ss2             .req    v25
        ss3             .req    v26
        ss4             .req    v27

        XL2             .req    v8
        XM2             .req    v9
        XH2             .req    v10
        XL3             .req    v11
        XM3             .req    v12
        XH3             .req    v13
        TT3             .req    v14
        TT4             .req    v15
        HH              .req    v16
        HH3             .req    v17
        HH4             .req    v18
        HH34            .req    v19

        .text
        .arch           armv8-a+crypto

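        /*
         * Two flavours of 64x64 -> 128 bit carryless multiply are provided
         * below: the __pmull*_p64 macros use the Crypto Extensions PMULL
         * instruction on 64-bit elements directly, while the __pmull*_p8
         * macros synthesize the same product from 8-bit polynomial
         * multiplies for CPUs that only implement PMULL on 8-bit elements.
         */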
        .macro          __pmull_p64, rd, rn, rm
        pmull           \rd\().1q, \rn\().1d, \rm\().1d
        .endm

        .macro          __pmull2_p64, rd, rn, rm
        pmull2          \rd\().1q, \rn\().2d, \rm\().2d
        .endm

        .macro          __pmull_p8, rq, ad, bd
        ext             t3.8b, \ad\().8b, \ad\().8b, #1         // A1
        ext             t5.8b, \ad\().8b, \ad\().8b, #2         // A2
        ext             t7.8b, \ad\().8b, \ad\().8b, #3         // A3

        __pmull_p8_\bd  \rq, \ad
        .endm

        .macro          __pmull2_p8, rq, ad, bd
        tbl             t3.16b, {\ad\().16b}, perm1.16b         // A1
        tbl             t5.16b, {\ad\().16b}, perm2.16b         // A2
        tbl             t7.16b, {\ad\().16b}, perm3.16b         // A3

        __pmull2_p8_\bd \rq, \ad
        .endm

        .macro          __pmull_p8_SHASH, rq, ad
        __pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
        .endm

        .macro          __pmull_p8_SHASH2, rq, ad
        __pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
        .endm

        .macro          __pmull2_p8_SHASH, rq, ad
        __pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
        .endm

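        /*
         * Shared multiply tail for the p8 paths: it combines the partial
         * products of the byte-rotated copies of A (t3/t5/t7) and B
         * (\b1-\b4) with the full product D = A*B, masks and realigns them
         * with AND/EXT, and accumulates everything into \rq so that \rq
         * ends up holding the 64x64 carryless product.
         */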
        .macro          __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
        pmull\t         t3.8h, t3.\nb, \bd                      // F = A1*B
        pmull\t         t4.8h, \ad, \b1\().\nb                  // E = A*B1
        pmull\t         t5.8h, t5.\nb, \bd                      // H = A2*B
        pmull\t         t6.8h, \ad, \b2\().\nb                  // G = A*B2
        pmull\t         t7.8h, t7.\nb, \bd                      // J = A3*B
        pmull\t         t8.8h, \ad, \b3\().\nb                  // I = A*B3
        pmull\t         t9.8h, \ad, \b4\().\nb                  // K = A*B4
        pmull\t         \rq\().8h, \ad, \bd                     // D = A*B

        eor             t3.16b, t3.16b, t4.16b                  // L = E + F
        eor             t5.16b, t5.16b, t6.16b                  // M = G + H
        eor             t7.16b, t7.16b, t8.16b                  // N = I + J

        uzp1            t4.2d, t3.2d, t5.2d
        uzp2            t3.2d, t3.2d, t5.2d
        uzp1            t6.2d, t7.2d, t9.2d
        uzp2            t7.2d, t7.2d, t9.2d

        // t3 = (L) (P0 + P1) << 8
        // t5 = (M) (P2 + P3) << 16
        eor             t4.16b, t4.16b, t3.16b
        and             t3.16b, t3.16b, k32_48.16b

        // t7 = (N) (P4 + P5) << 24
        // t9 = (K) (P6 + P7) << 32
        eor             t6.16b, t6.16b, t7.16b
        and             t7.16b, t7.16b, k00_16.16b

        eor             t4.16b, t4.16b, t3.16b
        eor             t6.16b, t6.16b, t7.16b

        zip2            t5.2d, t4.2d, t3.2d
        zip1            t3.2d, t4.2d, t3.2d
        zip2            t9.2d, t6.2d, t7.2d
        zip1            t7.2d, t6.2d, t7.2d

        ext             t3.16b, t3.16b, t3.16b, #15
        ext             t5.16b, t5.16b, t5.16b, #14
        ext             t7.16b, t7.16b, t7.16b, #13
        ext             t9.16b, t9.16b, t9.16b, #12

        eor             t3.16b, t3.16b, t5.16b
        eor             t7.16b, t7.16b, t9.16b
        eor             \rq\().16b, \rq\().16b, t3.16b
        eor             \rq\().16b, \rq\().16b, t7.16b
        .endm

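        /*
         * p64 precomputation: load the precomputed powers of H (H^2..H^4,
         * stored at x3 + 16), fold each pair of 64-bit halves into the
         * Karatsuba operands SHASH2 and HH34, and load MASK with the
         * constant used by __pmull_reduce_p64.
         */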
        .macro          __pmull_pre_p64
        add             x8, x3, #16
        ld1             {HH.2d-HH4.2d}, [x8]

        trn1            SHASH2.2d, SHASH.2d, HH.2d
        trn2            T1.2d, SHASH.2d, HH.2d
        eor             SHASH2.16b, SHASH2.16b, T1.16b

        trn1            HH34.2d, HH3.2d, HH4.2d
        trn2            T1.2d, HH3.2d, HH4.2d
        eor             HH34.16b, HH34.16b, T1.16b

        movi            MASK.16b, #0xe1
        shl             MASK.2d, MASK.2d, #57
        .endm

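        /*
         * p8 precomputation: everything here is input independent - the
         * folded SHASH2 operand, the k00_16/k32_48 masks applied to the
         * partial products, the TBL permutation vectors used to byte-rotate
         * both 64-bit halves of an operand, and the rotated copies of
         * SHASH/SHASH2 (sh1-sh4, ss1-ss4).
         */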
        .macro          __pmull_pre_p8
        ext             SHASH2.16b, SHASH.16b, SHASH.16b, #8
        eor             SHASH2.16b, SHASH2.16b, SHASH.16b

        // k00_16 := 0x0000000000000000_000000000000ffff
        // k32_48 := 0x00000000ffffffff_0000ffffffffffff
        movi            k32_48.2d, #0xffffffff
        mov             k32_48.h[2], k32_48.h[0]
        ushr            k00_16.2d, k32_48.2d, #32

        // prepare the permutation vectors
        mov_q           x5, 0x080f0e0d0c0b0a09
        movi            T1.8b, #8
        dup             perm1.2d, x5
        eor             perm1.16b, perm1.16b, T1.16b
        ushr            perm2.2d, perm1.2d, #8
        ushr            perm3.2d, perm1.2d, #16
        ushr            T1.2d, perm1.2d, #24
        sli             perm2.2d, perm1.2d, #56
        sli             perm3.2d, perm1.2d, #48
        sli             T1.2d, perm1.2d, #40

        // precompute loop invariants
        tbl             sh1.16b, {SHASH.16b}, perm1.16b
        tbl             sh2.16b, {SHASH.16b}, perm2.16b
        tbl             sh3.16b, {SHASH.16b}, perm3.16b
        tbl             sh4.16b, {SHASH.16b}, T1.16b
        ext             ss1.8b, SHASH2.8b, SHASH2.8b, #1
        ext             ss2.8b, SHASH2.8b, SHASH2.8b, #2
        ext             ss3.8b, SHASH2.8b, SHASH2.8b, #3
        ext             ss4.8b, SHASH2.8b, SHASH2.8b, #4
        .endm

        //
        // PMULL (64x64->128) based reduction for CPUs that can do
        // it in a single instruction.
        //
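        // The value to be reduced is the double-width product produced by
        // the multiplication stage (low half in XL, high half in XH,
        // Karatsuba middle term in XM); both reduction flavours fold it
        // back to 128 bits modulo the GHASH polynomial
        // x^128 + x^7 + x^2 + x + 1.
        //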
        .macro          __pmull_reduce_p64
        pmull           T2.1q, XL.1d, MASK.1d
        eor             XM.16b, XM.16b, T1.16b

        mov             XH.d[0], XM.d[1]
        mov             XM.d[1], XL.d[0]

        eor             XL.16b, XM.16b, T2.16b
        ext             T2.16b, XL.16b, XL.16b, #8
        pmull           XL.1q, XL.1d, MASK.1d
        .endm

        //
        // Alternative reduction for CPUs that lack support for the
        // 64x64->128 PMULL instruction
        //
        .macro          __pmull_reduce_p8
        eor             XM.16b, XM.16b, T1.16b

        mov             XL.d[1], XM.d[0]
        mov             XH.d[0], XM.d[1]

        shl             T1.2d, XL.2d, #57
        shl             T2.2d, XL.2d, #62
        eor             T2.16b, T2.16b, T1.16b
        shl             T1.2d, XL.2d, #63
        eor             T2.16b, T2.16b, T1.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             T2.16b, T2.16b, T1.16b

        mov             XL.d[1], T2.d[0]
        mov             XH.d[0], T2.d[1]

        ushr            T2.2d, XL.2d, #1
        eor             XH.16b, XH.16b, XL.16b
        eor             XL.16b, XL.16b, T2.16b
        ushr            T2.2d, T2.2d, #6
        ushr            XL.2d, XL.2d, #1
        .endm

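        /*
         * GHASH update core shared by both entry points below:
         *   w0: number of blocks, x1: digest, x2: source,
         *   x3: key, x4: optional partial head block
         * The p64 variant processes four blocks per iteration (using the
         * powers of H loaded by __pmull_pre_p64) once the remaining block
         * count is a multiple of four.
         */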
        .macro          __pmull_ghash, pn
        ld1             {SHASH.2d}, [x3]
        ld1             {XL.2d}, [x1]

        __pmull_pre_\pn

        /* do the head block first, if supplied */
        cbz             x4, 0f
        ld1             {T1.2d}, [x4]
        mov             x4, xzr
        b               3f

0:      .ifc            \pn, p64
        tbnz            w0, #0, 2f              // skip until #blocks is a
        tbnz            w0, #1, 2f              // round multiple of 4

1:      ld1             {XM3.16b-TT4.16b}, [x2], #64

        sub             w0, w0, #4

        rev64           T1.16b, XM3.16b
        rev64           T2.16b, XH3.16b
        rev64           TT4.16b, TT4.16b
        rev64           TT3.16b, TT3.16b

        ext             IN1.16b, TT4.16b, TT4.16b, #8
        ext             XL3.16b, TT3.16b, TT3.16b, #8

        eor             TT4.16b, TT4.16b, IN1.16b
        pmull2          XH2.1q, SHASH.2d, IN1.2d        // a1 * b1
        pmull           XL2.1q, SHASH.1d, IN1.1d        // a0 * b0
        pmull           XM2.1q, SHASH2.1d, TT4.1d       // (a1 + a0)(b1 + b0)

        eor             TT3.16b, TT3.16b, XL3.16b
        pmull2          XH3.1q, HH.2d, XL3.2d           // a1 * b1
        pmull           XL3.1q, HH.1d, XL3.1d           // a0 * b0
        pmull2          XM3.1q, SHASH2.2d, TT3.2d       // (a1 + a0)(b1 + b0)

        ext             IN1.16b, T2.16b, T2.16b, #8
        eor             XL2.16b, XL2.16b, XL3.16b
        eor             XH2.16b, XH2.16b, XH3.16b
        eor             XM2.16b, XM2.16b, XM3.16b

        eor             T2.16b, T2.16b, IN1.16b
        pmull2          XH3.1q, HH3.2d, IN1.2d          // a1 * b1
        pmull           XL3.1q, HH3.1d, IN1.1d          // a0 * b0
        pmull           XM3.1q, HH34.1d, T2.1d          // (a1 + a0)(b1 + b0)

        eor             XL2.16b, XL2.16b, XL3.16b
        eor             XH2.16b, XH2.16b, XH3.16b
        eor             XM2.16b, XM2.16b, XM3.16b

        ext             IN1.16b, T1.16b, T1.16b, #8
        ext             TT3.16b, XL.16b, XL.16b, #8
        eor             XL.16b, XL.16b, IN1.16b
        eor             T1.16b, T1.16b, TT3.16b

        pmull2          XH.1q, HH4.2d, XL.2d            // a1 * b1
        eor             T1.16b, T1.16b, XL.16b
        pmull           XL.1q, HH4.1d, XL.1d            // a0 * b0
        pmull2          XM.1q, HH34.2d, T1.2d           // (a1 + a0)(b1 + b0)

        eor             XL.16b, XL.16b, XL2.16b
        eor             XH.16b, XH.16b, XH2.16b
        eor             XM.16b, XM.16b, XM2.16b

        eor             T2.16b, XL.16b, XH.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b

        __pmull_reduce_p64

        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b

        cbz             w0, 5f
        b               1b
        .endif

2:      ld1             {T1.2d}, [x2], #16
        sub             w0, w0, #1

3:      /* multiply XL by SHASH in GF(2^128) */
CPU_LE( rev64           T1.16b, T1.16b  )

        ext             T2.16b, XL.16b, XL.16b, #8
        ext             IN1.16b, T1.16b, T1.16b, #8
        eor             T1.16b, T1.16b, T2.16b
        eor             XL.16b, XL.16b, IN1.16b

        __pmull2_\pn    XH, XL, SHASH                   // a1 * b1
        eor             T1.16b, T1.16b, XL.16b
        __pmull_\pn     XL, XL, SHASH                   // a0 * b0
        __pmull_\pn     XM, T1, SHASH2                  // (a1 + a0)(b1 + b0)

4:      eor             T2.16b, XL.16b, XH.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b

        __pmull_reduce_\pn

        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b

        cbnz            w0, 0b

5:      st1             {XL.2d}, [x1]
        ret
        .endm

        /*
         * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
         *                         struct ghash_key const *k, const char *head)
         */
SYM_FUNC_START(pmull_ghash_update_p64)
        __pmull_ghash   p64
SYM_FUNC_END(pmull_ghash_update_p64)

SYM_FUNC_START(pmull_ghash_update_p8)
        __pmull_ghash   p8
SYM_FUNC_END(pmull_ghash_update_p8)

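        /*
         * Register aliases for the AES-GCM code below: KS0-KS3 hold the
         * CTR keystream blocks, INP0-INP3 the data blocks, and K0-K9 plus
         * KK/KL/KM the AES round keys.
         */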
        KS0             .req    v8
        KS1             .req    v9
        KS2             .req    v10
        KS3             .req    v11

        INP0            .req    v21
        INP1            .req    v22
        INP2            .req    v23
        INP3            .req    v24

        K0              .req    v25
        K1              .req    v26
        K2              .req    v27
        K3              .req    v28
        K4              .req    v12
        K5              .req    v13
        K6              .req    v4
        K7              .req    v5
        K8              .req    v14
        K9              .req    v15
        KK              .req    v29
        KL              .req    v30
        KM              .req    v31

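        /*
         * Load the first six round keys into K0-K5 and the last three
         * (at \rk + \rounds * 16 - 32) into KK-KM; the middle round keys
         * are loaded on demand by enc_block and pmull_gcm_enc_4x.
         */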
        .macro          load_round_keys, rounds, rk, tmp
        add             \tmp, \rk, #64
        ld1             {K0.4s-K3.4s}, [\rk]
        ld1             {K4.4s-K5.4s}, [\tmp]
        add             \tmp, \rk, \rounds, lsl #4
        sub             \tmp, \tmp, #32
        ld1             {KK.4s-KM.4s}, [\tmp]
        .endm

        .macro          enc_round, state, key
        aese            \state\().16b, \key\().16b
        aesmc           \state\().16b, \state\().16b
        .endm

        .macro          enc_qround, s0, s1, s2, s3, key
        enc_round       \s0, \key
        enc_round       \s1, \key
        enc_round       \s2, \key
        enc_round       \s3, \key
        .endm

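        /*
         * Encrypt a single block with the round keys loaded above. Bit 2
         * of \rounds distinguishes AES-128 (10 rounds) from the larger key
         * sizes, and bit 1 then separates AES-192 (12) from AES-256 (14).
         */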
        .macro          enc_block, state, rounds, rk, tmp
        add             \tmp, \rk, #96
        ld1             {K6.4s-K7.4s}, [\tmp], #32
        .irp            key, K0, K1, K2, K3, K4, K5
        enc_round       \state, \key
        .endr

        tbnz            \rounds, #2, .Lnot128_\@
.Lout256_\@:
        enc_round       \state, K6
        enc_round       \state, K7

.Lout192_\@:
        enc_round       \state, KK
        aese            \state\().16b, KL.16b
        eor             \state\().16b, \state\().16b, KM.16b

        .subsection     1
.Lnot128_\@:
        ld1             {K8.4s-K9.4s}, [\tmp], #32
        enc_round       \state, K6
        enc_round       \state, K7
        ld1             {K6.4s-K7.4s}, [\tmp]
        enc_round       \state, K8
        enc_round       \state, K9
        tbz             \rounds, #1, .Lout192_\@
        b               .Lout256_\@
        .previous
        .endm

        .align          6
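        /*
         * Bulk AES-GCM transform: each iteration generates four blocks of
         * CTR keystream (pmull_gcm_enc_4x), XORs them into the data, and
         * folds the ciphertext into the GHASH state (pmull_gcm_ghash_4x) -
         * before the XOR when decrypting, after it when encrypting. The
         * tail path uses .Lpermute_table to handle a final partial block,
         * and the x0 == 0 path deals with the tag only.
         */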
        .macro          pmull_gcm_do_crypt, enc
        stp             x29, x30, [sp, #-32]!
        mov             x29, sp
        str             x19, [sp, #24]

        load_round_keys x7, x6, x8

        ld1             {SHASH.2d}, [x3], #16
        ld1             {HH.2d-HH4.2d}, [x3]

        trn1            SHASH2.2d, SHASH.2d, HH.2d
        trn2            T1.2d, SHASH.2d, HH.2d
        eor             SHASH2.16b, SHASH2.16b, T1.16b

        trn1            HH34.2d, HH3.2d, HH4.2d
        trn2            T1.2d, HH3.2d, HH4.2d
        eor             HH34.16b, HH34.16b, T1.16b

        ld1             {XL.2d}, [x4]

        cbz             x0, 3f                          // tag only?

        ldr             w8, [x5, #12]                   // load lower counter
CPU_LE( rev             w8, w8          )

0:      mov             w9, #4                          // max blocks per round
        add             x10, x0, #0xf
        lsr             x10, x10, #4                    // remaining blocks

        subs            x0, x0, #64
        csel            w9, w10, w9, mi
        add             w8, w8, w9

        bmi             1f
        ld1             {INP0.16b-INP3.16b}, [x2], #64
        .subsection     1
        /*
         * Populate the four input registers right to left with up to 63 bytes
         * of data, using overlapping loads to avoid branches.
         *
         *                INP0     INP1     INP2     INP3
         *  1 byte     |        |        |        |x       |
         * 16 bytes    |        |        |        |xxxxxxxx|
         * 17 bytes    |        |        |xxxxxxxx|x       |
         * 47 bytes    |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
         * etc etc
         *
         * Note that this code may read up to 15 bytes before the start of
         * the input. It is up to the calling code to ensure this is safe if
         * this happens in the first iteration of the loop (i.e., when the
         * input size is < 16 bytes)
         */
1:      mov             x15, #16
        ands            x19, x0, #0xf
        csel            x19, x19, x15, ne
        adr_l           x17, .Lpermute_table + 16

        sub             x11, x15, x19
        add             x12, x17, x11
        sub             x17, x17, x11
        ld1             {T1.16b}, [x12]
        sub             x10, x1, x11
        sub             x11, x2, x11

        cmp             x0, #-16
        csel            x14, x15, xzr, gt
        cmp             x0, #-32
        csel            x15, x15, xzr, gt
        cmp             x0, #-48
        csel            x16, x19, xzr, gt
        csel            x1, x1, x10, gt
        csel            x2, x2, x11, gt

        ld1             {INP0.16b}, [x2], x14
        ld1             {INP1.16b}, [x2], x15
        ld1             {INP2.16b}, [x2], x16
        ld1             {INP3.16b}, [x2]
        tbl             INP3.16b, {INP3.16b}, T1.16b
        b               2f
        .previous

2:      .if             \enc == 0
        bl              pmull_gcm_ghash_4x
        .endif

        bl              pmull_gcm_enc_4x

        tbnz            x0, #63, 6f
        st1             {INP0.16b-INP3.16b}, [x1], #64
        .if             \enc == 1
        bl              pmull_gcm_ghash_4x
        .endif
        bne             0b

3:      ldp             x19, x10, [sp, #24]
        cbz             x10, 5f                         // output tag?

        ld1             {INP3.16b}, [x10]               // load lengths[]
        mov             w9, #1
        bl              pmull_gcm_ghash_4x

        mov             w11, #(0x1 << 24)               // BE '1U'
        ld1             {KS0.16b}, [x5]
        mov             KS0.s[3], w11

        enc_block       KS0, x7, x6, x12

        ext             XL.16b, XL.16b, XL.16b, #8
        rev64           XL.16b, XL.16b
        eor             XL.16b, XL.16b, KS0.16b

        .if             \enc == 1
        st1             {XL.16b}, [x10]                 // store tag
        .else
        ldp             x11, x12, [sp, #40]             // load tag pointer and authsize
        adr_l           x17, .Lpermute_table
        ld1             {KS0.16b}, [x11]                // load supplied tag
        add             x17, x17, x12
        ld1             {KS1.16b}, [x17]                // load permute vector

        cmeq            XL.16b, XL.16b, KS0.16b         // compare tags
        mvn             XL.16b, XL.16b                  // -1 for fail, 0 for pass
        tbl             XL.16b, {XL.16b}, KS1.16b       // keep authsize bytes only
        sminv           b0, XL.16b                      // signed minimum across XL
        smov            w0, v0.b[0]                     // return b0
        .endif

4:      ldp             x29, x30, [sp], #32
        ret

5:
CPU_LE( rev             w8, w8          )
        str             w8, [x5, #12]                   // store lower counter
        st1             {XL.2d}, [x4]
        b               4b

6:      ld1             {T1.16b-T2.16b}, [x17], #32     // permute vectors
        sub             x17, x17, x19, lsl #1

        cmp             w9, #1
        beq             7f
        .subsection     1
7:      ld1             {INP2.16b}, [x1]
        tbx             INP2.16b, {INP3.16b}, T1.16b
        mov             INP3.16b, INP2.16b
        b               8f
        .previous

        st1             {INP0.16b}, [x1], x14
        st1             {INP1.16b}, [x1], x15
        st1             {INP2.16b}, [x1], x16
        tbl             INP3.16b, {INP3.16b}, T1.16b
        tbx             INP3.16b, {INP2.16b}, T2.16b
8:      st1             {INP3.16b}, [x1]

        .if             \enc == 1
        ld1             {T1.16b}, [x17]
        tbl             INP3.16b, {INP3.16b}, T1.16b    // clear non-data bits
        bl              pmull_gcm_ghash_4x
        .endif
        b               3b
        .endm

        /*
         * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
         *                        struct ghash_key const *k, u64 dg[], u8 ctr[],
         *                        int rounds, u8 tag)
         */
SYM_FUNC_START(pmull_gcm_encrypt)
        pmull_gcm_do_crypt      1
SYM_FUNC_END(pmull_gcm_encrypt)

        /*
         * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
         *                        struct ghash_key const *k, u64 dg[], u8 ctr[],
         *                        int rounds, u8 tag)
         */
SYM_FUNC_START(pmull_gcm_decrypt)
        pmull_gcm_do_crypt      0
SYM_FUNC_END(pmull_gcm_decrypt)

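        /*
         * Fold the blocks held in INP0-INP3 into the GHASH accumulator XL.
         * w9 holds the number of blocks (1-4); fewer than four blocks take
         * the out-of-line path, which enters the multiply chain partway so
         * that only the required multiplies are performed.
         */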
SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
        movi            MASK.16b, #0xe1
        shl             MASK.2d, MASK.2d, #57

        rev64           T1.16b, INP0.16b
        rev64           T2.16b, INP1.16b
        rev64           TT3.16b, INP2.16b
        rev64           TT4.16b, INP3.16b

        ext             XL.16b, XL.16b, XL.16b, #8

        tbz             w9, #2, 0f                      // <4 blocks?
        .subsection     1
0:      movi            XH2.16b, #0
        movi            XM2.16b, #0
        movi            XL2.16b, #0

        tbz             w9, #0, 1f                      // 2 blocks?
        tbz             w9, #1, 2f                      // 1 block?

        eor             T2.16b, T2.16b, XL.16b
        ext             T1.16b, T2.16b, T2.16b, #8
        b               .Lgh3

1:      eor             TT3.16b, TT3.16b, XL.16b
        ext             T2.16b, TT3.16b, TT3.16b, #8
        b               .Lgh2

2:      eor             TT4.16b, TT4.16b, XL.16b
        ext             IN1.16b, TT4.16b, TT4.16b, #8
        b               .Lgh1
        .previous

        eor             T1.16b, T1.16b, XL.16b
        ext             IN1.16b, T1.16b, T1.16b, #8

        pmull2          XH2.1q, HH4.2d, IN1.2d          // a1 * b1
        eor             T1.16b, T1.16b, IN1.16b
        pmull           XL2.1q, HH4.1d, IN1.1d          // a0 * b0
        pmull2          XM2.1q, HH34.2d, T1.2d          // (a1 + a0)(b1 + b0)

        ext             T1.16b, T2.16b, T2.16b, #8
.Lgh3:  eor             T2.16b, T2.16b, T1.16b
        pmull2          XH.1q, HH3.2d, T1.2d            // a1 * b1
        pmull           XL.1q, HH3.1d, T1.1d            // a0 * b0
        pmull           XM.1q, HH34.1d, T2.1d           // (a1 + a0)(b1 + b0)

        eor             XH2.16b, XH2.16b, XH.16b
        eor             XL2.16b, XL2.16b, XL.16b
        eor             XM2.16b, XM2.16b, XM.16b

        ext             T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:  eor             TT3.16b, TT3.16b, T2.16b
        pmull2          XH.1q, HH.2d, T2.2d             // a1 * b1
        pmull           XL.1q, HH.1d, T2.1d             // a0 * b0
        pmull2          XM.1q, SHASH2.2d, TT3.2d        // (a1 + a0)(b1 + b0)

        eor             XH2.16b, XH2.16b, XH.16b
        eor             XL2.16b, XL2.16b, XL.16b
        eor             XM2.16b, XM2.16b, XM.16b

        ext             IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:  eor             TT4.16b, TT4.16b, IN1.16b
        pmull           XL.1q, SHASH.1d, IN1.1d         // a0 * b0
        pmull2          XH.1q, SHASH.2d, IN1.2d         // a1 * b1
        pmull           XM.1q, SHASH2.1d, TT4.1d        // (a1 + a0)(b1 + b0)

        eor             XH.16b, XH.16b, XH2.16b
        eor             XL.16b, XL.16b, XL2.16b
        eor             XM.16b, XM.16b, XM2.16b

        eor             T2.16b, XL.16b, XH.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b

        __pmull_reduce_p64

        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b

        ret
SYM_FUNC_END(pmull_gcm_ghash_4x)

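        /*
         * Generate four blocks of CTR keystream from the counter block at
         * [x5], using w8 - 4 .. w8 - 1 as the (big-endian) lower counter
         * words, and XOR the keystream into INP0-INP3.
         */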
SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
        ld1             {KS0.16b}, [x5]                 // load upper counter
        sub             w10, w8, #4
        sub             w11, w8, #3
        sub             w12, w8, #2
        sub             w13, w8, #1
        rev             w10, w10
        rev             w11, w11
        rev             w12, w12
        rev             w13, w13
        mov             KS1.16b, KS0.16b
        mov             KS2.16b, KS0.16b
        mov             KS3.16b, KS0.16b
        ins             KS0.s[3], w10                   // set lower counter
        ins             KS1.s[3], w11
        ins             KS2.s[3], w12
        ins             KS3.s[3], w13

        add             x10, x6, #96                    // round key pointer
        ld1             {K6.4s-K7.4s}, [x10], #32
        .irp            key, K0, K1, K2, K3, K4, K5
        enc_qround      KS0, KS1, KS2, KS3, \key
        .endr

        tbnz            x7, #2, .Lnot128
        .subsection     1
.Lnot128:
        ld1             {K8.4s-K9.4s}, [x10], #32
        .irp            key, K6, K7
        enc_qround      KS0, KS1, KS2, KS3, \key
        .endr
        ld1             {K6.4s-K7.4s}, [x10]
        .irp            key, K8, K9
        enc_qround      KS0, KS1, KS2, KS3, \key
        .endr
        tbz             x7, #1, .Lout192
        b               .Lout256
        .previous

.Lout256:
        .irp            key, K6, K7
        enc_qround      KS0, KS1, KS2, KS3, \key
        .endr

.Lout192:
        enc_qround      KS0, KS1, KS2, KS3, KK

        aese            KS0.16b, KL.16b
        aese            KS1.16b, KL.16b
        aese            KS2.16b, KL.16b
        aese            KS3.16b, KL.16b

        eor             KS0.16b, KS0.16b, KM.16b
        eor             KS1.16b, KS1.16b, KM.16b
        eor             KS2.16b, KS2.16b, KM.16b
        eor             KS3.16b, KS3.16b, KM.16b

        eor             INP0.16b, INP0.16b, KS0.16b
        eor             INP1.16b, INP1.16b, KS1.16b
        eor             INP2.16b, INP2.16b, KS2.16b
        eor             INP3.16b, INP3.16b, KS3.16b

        ret
SYM_FUNC_END(pmull_gcm_enc_4x)

        .section        ".rodata", "a"
        .align          6
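        /*
         * Two copies of 16 bytes of 0xff followed by the identity
         * permutation 0x0-0xf: loading 16 bytes from a variable offset into
         * this table yields a TBL/TBX vector that shifts a partial block
         * into place, with the 0xff entries selecting zero bytes (TBL) or
         * leaving the destination untouched (TBX).
         */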
.Lpermute_table:
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
        .previous