arch/arm/crypto/ghash-ce-core.S (linux-2.6-microblaze.git)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

        .arch           armv8-a
        .fpu            crypto-neon-fp-armv8

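        @ Register roles: SHASH holds the hash key H, HH/HH3/HH4 the
        @ precomputed powers H^2/H^3/H^4, and XL the running digest.
        @ XL/XM/XH accumulate the low/middle/high Karatsuba partial
        @ products during each multiplication; the t and s registers
        @ are only used by the vmull.p8 fallback path.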
        SHASH           .req    q0
        T1              .req    q1
        XL              .req    q2
        XM              .req    q3
        XH              .req    q4
        IN1             .req    q4

        SHASH_L         .req    d0
        SHASH_H         .req    d1
        T1_L            .req    d2
        T1_H            .req    d3
        XL_L            .req    d4
        XL_H            .req    d5
        XM_L            .req    d6
        XM_H            .req    d7
        XH_L            .req    d8

        t0l             .req    d10
        t0h             .req    d11
        t1l             .req    d12
        t1h             .req    d13
        t2l             .req    d14
        t2h             .req    d15
        t3l             .req    d16
        t3h             .req    d17
        t4l             .req    d18
        t4h             .req    d19

        t0q             .req    q5
        t1q             .req    q6
        t2q             .req    q7
        t3q             .req    q8
        t4q             .req    q9
        T2              .req    q9

        s1l             .req    d20
        s1h             .req    d21
        s2l             .req    d22
        s2h             .req    d23
        s3l             .req    d24
        s3h             .req    d25
        s4l             .req    d26
        s4h             .req    d27

        MASK            .req    d28
        SHASH2_p8       .req    d28

        k16             .req    d29
        k32             .req    d30
        k48             .req    d31
        SHASH2_p64      .req    d31

        HH              .req    q10
        HH3             .req    q11
        HH4             .req    q12
        HH34            .req    q13

        HH_L            .req    d20
        HH_H            .req    d21
        HH3_L           .req    d22
        HH3_H           .req    d23
        HH4_L           .req    d24
        HH4_H           .req    d25
        HH34_L          .req    d26
        HH34_H          .req    d27
        SHASH2_H        .req    d29

        XL2             .req    q5
        XM2             .req    q6
        XH2             .req    q7
        T3              .req    q8

        XL2_L           .req    d10
        XL2_H           .req    d11
        XM2_L           .req    d12
        XM2_H           .req    d13
        T3_L            .req    d16
        T3_H            .req    d17

        .text

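        @ Single-instruction 64x64 -> 128 bit carry-less multiply.  The
        @ b1-b4 arguments are ignored; they exist only so that call sites
        @ can use the same __pmull_\pn invocation for both variants.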
        .macro          __pmull_p64, rd, rn, rm, b1, b2, b3, b4
        vmull.p64       \rd, \rn, \rm
        .endm

        /*
         * This implementation of 64x64 -> 128 bit polynomial multiplication
         * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
         * "Fast Software Polynomial Multiplication on ARM Processors Using
         * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
         * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
         *
         * It has been slightly tweaked for in-order performance, and to allow
         * 'rq' to overlap with 'ad' or 'bd'.
         */
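        /*
         * In the comments below, A1..A4 and B1..B4 denote the operand
         * rotated by 1..4 bytes; the partial products are realigned with
         * vext.8 and XORed into the plain A*B product at the end.
         */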
        .macro          __pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
        vext.8          t0l, \ad, \ad, #1       @ A1
        .ifc            \b1, t4l
        vext.8          t4l, \bd, \bd, #1       @ B1
        .endif
        vmull.p8        t0q, t0l, \bd           @ F = A1*B
        vext.8          t1l, \ad, \ad, #2       @ A2
        vmull.p8        t4q, \ad, \b1           @ E = A*B1
        .ifc            \b2, t3l
        vext.8          t3l, \bd, \bd, #2       @ B2
        .endif
        vmull.p8        t1q, t1l, \bd           @ H = A2*B
        vext.8          t2l, \ad, \ad, #3       @ A3
        vmull.p8        t3q, \ad, \b2           @ G = A*B2
        veor            t0q, t0q, t4q           @ L = E + F
        .ifc            \b3, t4l
        vext.8          t4l, \bd, \bd, #3       @ B3
        .endif
        vmull.p8        t2q, t2l, \bd           @ J = A3*B
        veor            t0l, t0l, t0h           @ t0 = (L) (P0 + P1) << 8
        veor            t1q, t1q, t3q           @ M = G + H
        .ifc            \b4, t3l
        vext.8          t3l, \bd, \bd, #4       @ B4
        .endif
        vmull.p8        t4q, \ad, \b3           @ I = A*B3
        veor            t1l, t1l, t1h           @ t1 = (M) (P2 + P3) << 16
        vmull.p8        t3q, \ad, \b4           @ K = A*B4
        vand            t0h, t0h, k48
        vand            t1h, t1h, k32
        veor            t2q, t2q, t4q           @ N = I + J
        veor            t0l, t0l, t0h
        veor            t1l, t1l, t1h
        veor            t2l, t2l, t2h           @ t2 = (N) (P4 + P5) << 24
        vand            t2h, t2h, k16
        veor            t3l, t3l, t3h           @ t3 = (K) (P6 + P7) << 32
        vmov.i64        t3h, #0
        vext.8          t0q, t0q, t0q, #15
        veor            t2l, t2l, t2h
        vext.8          t1q, t1q, t1q, #14
        vmull.p8        \rq, \ad, \bd           @ D = A*B
        vext.8          t2q, t2q, t2q, #13
        vext.8          t3q, t3q, t3q, #12
        veor            t0q, t0q, t1q
        veor            t2q, t2q, t3q
        veor            \rq, \rq, t0q
        veor            \rq, \rq, t2q
        .endm

        //
        // PMULL (64x64->128) based reduction for CPUs that can do
        // it in a single instruction.
        //
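        // MASK is expected to hold the GHASH reduction constant
        // (0xe1 << 57, i.e. bits 57, 62 and 63 set in each 64-bit lane);
        // the two carry-less multiplies by MASK fold the 256-bit product
        // back into 128 bits modulo x^128 + x^7 + x^2 + x + 1.
        //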
        .macro          __pmull_reduce_p64
        vmull.p64       T1, XL_L, MASK

        veor            XH_L, XH_L, XM_H
        vext.8          T1, T1, T1, #8
        veor            XL_H, XL_H, XM_L
        veor            T1, T1, XL

        vmull.p64       XL, T1_H, MASK
        .endm

        //
        // Alternative reduction for CPUs that lack support for the
        // 64x64->128 PMULL instruction
        //
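        // This is the usual shift-and-XOR form of the same reduction:
        // the left shifts by 57/62/63 stand in for the first multiply by
        // the reduction constant, and the right shifts (by 1, 2 and 7 in
        // total) for the second.
        //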
        .macro          __pmull_reduce_p8
        veor            XL_H, XL_H, XM_L
        veor            XH_L, XH_L, XM_H

        vshl.i64        T1, XL, #57
        vshl.i64        T2, XL, #62
        veor            T1, T1, T2
        vshl.i64        T2, XL, #63
        veor            T1, T1, T2
        veor            XL_H, XL_H, T1_L
        veor            XH_L, XH_L, T1_H

        vshr.u64        T1, XL, #1
        veor            XH, XH, XL
        veor            XL, XL, T1
        vshr.u64        T1, T1, #6
        vshr.u64        XL, XL, #1
        .endm

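        //
        // Core update loop.  On entry, r0 holds the number of blocks,
        // r1 points to the 128-bit digest, r2 to the source data and
        // [sp] to an optional partial head block; the key material has
        // already been loaded into SHASH/HH/... by the callers below.
        // \pn selects the p64 or p8 multiply/reduce variant.
        //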
        .macro          ghash_update, pn
        vld1.64         {XL}, [r1]

        /* do the head block first, if supplied */
        ldr             ip, [sp]
        teq             ip, #0
        beq             0f
        vld1.64         {T1}, [ip]
        teq             r0, #0
        b               3f

0:      .ifc            \pn, p64
        tst             r0, #3                  // skip until #blocks is a
        bne             2f                      // round multiple of 4

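        // Aggregated 4-block path: the four blocks are multiplied by
        // H^4, H^3, H^2 and H respectively and the partial products are
        // XORed together, so only one reduction is needed per iteration.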
        vld1.8          {XL2-XM2}, [r2]!
1:      vld1.8          {T3-T2}, [r2]!
        vrev64.8        XL2, XL2
        vrev64.8        XM2, XM2

        subs            r0, r0, #4

        vext.8          T1, XL2, XL2, #8
        veor            XL2_H, XL2_H, XL_L
        veor            XL, XL, T1

        vrev64.8        T3, T3
        vrev64.8        T1, T2

        vmull.p64       XH, HH4_H, XL_H                 // a1 * b1
        veor            XL2_H, XL2_H, XL_H
        vmull.p64       XL, HH4_L, XL_L                 // a0 * b0
        vmull.p64       XM, HH34_H, XL2_H               // (a1 + a0)(b1 + b0)

        vmull.p64       XH2, HH3_H, XM2_L               // a1 * b1
        veor            XM2_L, XM2_L, XM2_H
        vmull.p64       XL2, HH3_L, XM2_H               // a0 * b0
        vmull.p64       XM2, HH34_L, XM2_L              // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        vmull.p64       XH2, HH_H, T3_L                 // a1 * b1
        veor            T3_L, T3_L, T3_H
        vmull.p64       XL2, HH_L, T3_H                 // a0 * b0
        vmull.p64       XM2, SHASH2_H, T3_L             // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        vmull.p64       XH2, SHASH_H, T1_L              // a1 * b1
        veor            T1_L, T1_L, T1_H
        vmull.p64       XL2, SHASH_L, T1_H              // a0 * b0
        vmull.p64       XM2, SHASH2_p64, T1_L           // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        beq             4f

        vld1.8          {XL2-XM2}, [r2]!

        veor            T1, XL, XH
        veor            XM, XM, T1

        __pmull_reduce_p64

        veor            T1, T1, XH
        veor            XL, XL, T1

        b               1b
        .endif

2:      vld1.64         {T1}, [r2]!
        subs            r0, r0, #1

3:      /* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
        vrev64.8        T1, T1
#endif
        vext.8          IN1, T1, T1, #8
        veor            T1_L, T1_L, XL_H
        veor            XL, XL, IN1

        __pmull_\pn     XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h   @ a1 * b1
        veor            T1, T1, XL
        __pmull_\pn     XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l   @ a0 * b0
        __pmull_\pn     XM, T1_L, SHASH2_\pn                    @ (a1+a0)(b1+b0)

4:      veor            T1, XL, XH
        veor            XM, XM, T1

        __pmull_reduce_\pn

        veor            T1, T1, XH
        veor            XL, XL, T1

        bne             0b

        vst1.64         {XL}, [r1]
        bx              lr
        .endm

        /*
         * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
         *                         struct ghash_key const *k, const char *head)
         */
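        /*
         * Per AAPCS: r0 = blocks, r1 = dg, r2 = src, r3 = k, [sp] = head.
         * The p64 variant expects the key to supply four consecutive
         * 128-bit values (the powers H, H^2, H^3 and H^4); the p8 variant
         * only needs H itself.
         */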
ENTRY(pmull_ghash_update_p64)
        vld1.64         {SHASH}, [r3]!
        vld1.64         {HH}, [r3]!
        vld1.64         {HH3-HH4}, [r3]

        veor            SHASH2_p64, SHASH_L, SHASH_H
        veor            SHASH2_H, HH_L, HH_H
        veor            HH34_L, HH3_L, HH3_H
        veor            HH34_H, HH4_L, HH4_H

        vmov.i8         MASK, #0xe1
        vshl.u64        MASK, MASK, #57

        ghash_update    p64
ENDPROC(pmull_ghash_update_p64)

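        @ The p8 variant precomputes byte-rotated copies of the two key
        @ halves (s1l..s4h) to serve as the B1..B4 operands of __pmull_p8,
        @ plus the k16/k32/k48 masks used by that macro.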
ENTRY(pmull_ghash_update_p8)
        vld1.64         {SHASH}, [r3]
        veor            SHASH2_p8, SHASH_L, SHASH_H

        vext.8          s1l, SHASH_L, SHASH_L, #1
        vext.8          s2l, SHASH_L, SHASH_L, #2
        vext.8          s3l, SHASH_L, SHASH_L, #3
        vext.8          s4l, SHASH_L, SHASH_L, #4
        vext.8          s1h, SHASH_H, SHASH_H, #1
        vext.8          s2h, SHASH_H, SHASH_H, #2
        vext.8          s3h, SHASH_H, SHASH_H, #3
        vext.8          s4h, SHASH_H, SHASH_H, #4

        vmov.i64        k16, #0xffff
        vmov.i64        k32, #0xffffffff
        vmov.i64        k48, #0xffffffffffff

        ghash_update    p8
ENDPROC(pmull_ghash_update_p8)