arm64: fpsimd: run kernel mode NEON with softirqs disabled
[linux-2.6-microblaze.git] / arch / arm64 / crypto / sha3-ce-core.S
1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * sha3-ce-core.S - core SHA-3 transform using v8.2 Crypto Extensions
4  *
5  * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License version 2 as
9  * published by the Free Software Foundation.
10  */
11
12 #include <linux/linkage.h>
13 #include <asm/assembler.h>
14
15         .irp    b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
           /*
            * For every register number N in the list above, define two
            * assembler-local symbols, .LvN.2d and .LvN.16b, both with value N.
            * The instruction macros in this file paste ".L" in front of a
            * register operand such as v29.16b and use the resulting symbol as
            * the register number, so only the .2d and .16b spellings resolve.
            */
16         .set    .Lv\b\().2d, \b
17         .set    .Lv\b\().16b, \b
18         .endr
19
20         /*
21          * ARMv8.2 Crypto Extensions instructions
22          */
23         .macro  eor3, rd, rn, rm, ra
           // EOR3: three-way exclusive OR, rd = rn ^ rm ^ ra.
           // Emitted as a raw opcode word instead of a mnemonic (presumably so
           // the file also builds with assemblers lacking these instructions).
           // Fields: rd at bit 0, rn at bit 5, ra at bit 10, rm at bit 16; each
           // register number is looked up through the .Lv<N>.2d / .Lv<N>.16b
           // symbols, so operands must be written as vN.2d or vN.16b.
24         .inst   0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
25         .endm
26
27         .macro  rax1, rd, rn, rm
           // RAX1: rotate and exclusive OR, rd = rn ^ rol64(rm, 1) in each
           // 64-bit lane. Emitted as a raw opcode word with rd at bit 0, rn at
           // bit 5 and rm at bit 16; operands must be written as vN.2d or
           // vN.16b so that the .Lv<N> register-number symbols resolve.
28         .inst   0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
29         .endm
30
31         .macro  bcax, rd, rn, rm, ra
           // BCAX: bit clear and exclusive OR, rd = rn ^ (rm & ~ra).
           // Emitted as a raw opcode word with rd at bit 0, rn at bit 5, ra at
           // bit 10 and rm at bit 16; operands must be written as vN.2d or
           // vN.16b so that the .Lv<N> register-number symbols resolve.
32         .inst   0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
33         .endm
34
35         .macro  xar, rd, rn, rm, imm6
           // XAR: exclusive OR and rotate, rd = ror64(rn ^ rm, imm6) in each
           // 64-bit lane. The rotation is to the right, which is why callers
           // in this file pass (64 - n) to rotate left by n. imm6 is placed at
           // bit 10 without masking, so it must fit in 6 bits (0..63).
           // rd is at bit 0, rn at bit 5, rm at bit 16; register operands must
           // be written as vN.2d or vN.16b.
36         .inst   0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16)
37         .endm
38
39         /*
40          * int sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size)
            *
            * For each input block: XOR the block into the leading words of the
            * 25-word state, then run 24 rounds over the whole state.
            *
            *   x0  st       25 x 64-bit state words; read on entry, written
            *                back on exit
            *   x1  data     input bytes; advanced past every block consumed
            *   w2  blocks   blocks to process. It is decremented before it is
            *                first tested, so the code relies on blocks >= 1
            *                (presumably guaranteed by the caller - confirm)
            *   x3  dg_size  only bits 6, 4 and 2 are examined, to tell the four
            *                variants apart and hence how many bytes make up one
            *                block: SHA3-512 72, SHA3-384 104, SHA3-256 136,
            *                SHA3-224 144
            *
            * Returns (w0) the number of blocks not yet processed: 0 when all
            * input was consumed, non-zero if the cond_yield exit was taken.
            *
            * Register use: v0-v24 hold state words 0-24 (only the low 64 bits
            * of each are loaded and stored), v25-v31 are scratch, w8 counts
            * rounds and x9 points at the next round constant.
41          */
42         .text
43 SYM_FUNC_START(sha3_ce_transform)
44         /* load state: words 0-3 through x0, the rest through x8, so x0 still points at st */
45         add     x8, x0, #32
46         ld1     { v0.1d- v3.1d}, [x0]
47         ld1     { v4.1d- v7.1d}, [x8], #32
48         ld1     { v8.1d-v11.1d}, [x8], #32
49         ld1     {v12.1d-v15.1d}, [x8], #32
50         ld1     {v16.1d-v19.1d}, [x8], #32
51         ld1     {v20.1d-v23.1d}, [x8], #32
52         ld1     {v24.1d}, [x8]
53
           /* per-block loop */
54 0:      sub     w2, w2, #1              // w2 = blocks left after this one
55         mov     w8, #24                 // w8 = rounds left for this block
56         adr_l   x9, .Lsha3_rcon         // x9 = first round constant
57
58         /* load input: the first 56 bytes (words 0-6) are absorbed for every variant */
59         ld1     {v25.8b-v28.8b}, [x1], #32
60         ld1     {v29.8b-v31.8b}, [x1], #24
61         eor     v0.8b, v0.8b, v25.8b
62         eor     v1.8b, v1.8b, v26.8b
63         eor     v2.8b, v2.8b, v27.8b
64         eor     v3.8b, v3.8b, v28.8b
65         eor     v4.8b, v4.8b, v29.8b
66         eor     v5.8b, v5.8b, v30.8b
67         eor     v6.8b, v6.8b, v31.8b
68
69         tbnz    x3, #6, 2f              // SHA3-512
70
           // all other variants: 48 more bytes into words 7-12 (104 so far)
71         ld1     {v25.8b-v28.8b}, [x1], #32
72         ld1     {v29.8b-v30.8b}, [x1], #16
73         eor      v7.8b,  v7.8b, v25.8b
74         eor      v8.8b,  v8.8b, v26.8b
75         eor      v9.8b,  v9.8b, v27.8b
76         eor     v10.8b, v10.8b, v28.8b
77         eor     v11.8b, v11.8b, v29.8b
78         eor     v12.8b, v12.8b, v30.8b
79
80         tbnz    x3, #4, 1f              // SHA3-384 or SHA3-224
81
82         // SHA3-256: 32 more bytes into words 13-16 (136 in total)
83         ld1     {v25.8b-v28.8b}, [x1], #32
84         eor     v13.8b, v13.8b, v25.8b
85         eor     v14.8b, v14.8b, v26.8b
86         eor     v15.8b, v15.8b, v27.8b
87         eor     v16.8b, v16.8b, v28.8b
88         b       3f
89
90 1:      tbz     x3, #2, 3f              // bit 2 cleared? SHA3-384: nothing more to absorb
91
92         // SHA3-224: 40 more bytes into words 13-17 (144 in total)
93         ld1     {v25.8b-v28.8b}, [x1], #32
94         ld1     {v29.8b}, [x1], #8
95         eor     v13.8b, v13.8b, v25.8b
96         eor     v14.8b, v14.8b, v26.8b
97         eor     v15.8b, v15.8b, v27.8b
98         eor     v16.8b, v16.8b, v28.8b
99         eor     v17.8b, v17.8b, v29.8b
100         b       3f
101
102         // SHA3-512: 16 more bytes into words 7-8 (72 in total)
103 2:      ld1     {v25.8b-v26.8b}, [x1], #16
104         eor      v7.8b,  v7.8b, v25.8b
105         eor      v8.8b,  v8.8b, v26.8b
106
            /* round loop: runs 24 times per block */
107 3:      sub     w8, w8, #1
108
            // v25..v29 = XOR of the five state words in columns 0..4
            // (column c = words c, c+5, c+10, c+15, c+20), two eor3 steps each
109         eor3    v29.16b,  v4.16b,  v9.16b, v14.16b
110         eor3    v26.16b,  v1.16b,  v6.16b, v11.16b
111         eor3    v28.16b,  v3.16b,  v8.16b, v13.16b
112         eor3    v25.16b,  v0.16b,  v5.16b, v10.16b
113         eor3    v27.16b,  v2.16b,  v7.16b, v12.16b
114         eor3    v29.16b, v29.16b, v19.16b, v24.16b
115         eor3    v26.16b, v26.16b, v16.16b, v21.16b
116         eor3    v28.16b, v28.16b, v18.16b, v23.16b
117         eor3    v25.16b, v25.16b, v15.16b, v20.16b
118         eor3    v27.16b, v27.16b, v17.16b, v22.16b
119
            // bc[i] = parity of column i-1 ^ rol(parity of column i+1, 1),
            // indices mod 5; afterwards bc[0..4] live in v30, v25, v26, v27, v28
120         rax1    v30.2d, v29.2d, v26.2d  // bc[0]
121         rax1    v26.2d, v26.2d, v28.2d  // bc[2]
122         rax1    v28.2d, v28.2d, v25.2d  // bc[4]
123         rax1    v25.2d, v25.2d, v27.2d  // bc[1]
124         rax1    v27.2d, v27.2d, v29.2d  // bc[3]
125
            // XOR bc[column] into every word, rotate it left by n (written as
            // a right rotation by 64 - n) and move it towards its new position.
            // Word 0 is only XORed. v25-v31 double as temporaries: each bc[]
            // register is overwritten only by the last xar that reads it.
            // After this group the words not held in their own register are:
            //   3 in v27, 4 in v28, 7 in v30, 8 in v4, 10 in v29, 11 in v26,
            //   17 in v25, 18 in v3, 20 in v31, 21 in v8
126         eor      v0.16b,  v0.16b, v30.16b
127         xar      v29.2d,   v1.2d,  v25.2d, (64 - 1)
128         xar       v1.2d,   v6.2d,  v25.2d, (64 - 44)
129         xar       v6.2d,   v9.2d,  v28.2d, (64 - 20)
130         xar       v9.2d,  v22.2d,  v26.2d, (64 - 61)
131         xar      v22.2d,  v14.2d,  v28.2d, (64 - 39)
132         xar      v14.2d,  v20.2d,  v30.2d, (64 - 18)
133         xar      v31.2d,   v2.2d,  v26.2d, (64 - 62)
134         xar       v2.2d,  v12.2d,  v26.2d, (64 - 43)
135         xar      v12.2d,  v13.2d,  v27.2d, (64 - 25)
136         xar      v13.2d,  v19.2d,  v28.2d, (64 - 8)
137         xar      v19.2d,  v23.2d,  v27.2d, (64 - 56)
138         xar      v23.2d,  v15.2d,  v30.2d, (64 - 41)
139         xar      v15.2d,   v4.2d,  v28.2d, (64 - 27)
140         xar      v28.2d,  v24.2d,  v28.2d, (64 - 14)
141         xar      v24.2d,  v21.2d,  v25.2d, (64 - 2)
142         xar       v8.2d,   v8.2d,  v27.2d, (64 - 55)
143         xar       v4.2d,  v16.2d,  v25.2d, (64 - 45)
144         xar      v16.2d,   v5.2d,  v30.2d, (64 - 36)
145         xar       v5.2d,   v3.2d,  v27.2d, (64 - 28)
146         xar      v27.2d,  v18.2d,  v27.2d, (64 - 21)
147         xar       v3.2d,  v17.2d,  v26.2d, (64 - 15)
148         xar      v25.2d,  v11.2d,  v25.2d, (64 - 10)
149         xar      v26.2d,   v7.2d,  v26.2d, (64 - 6)
150         xar      v30.2d,  v10.2d,  v30.2d, (64 - 3)
151
            // Each group of five bcax rewrites one row of five words:
            //   word[i] ^= ~word[i+1] & word[i+2]  (positions within the row, mod 5)
            // and leaves every word of the row back in its own register.
            // Rows go from words 20-24 down to 0-4, so v8, v3 and v4, which
            // still hold relocated words, are read before they are written.
152         bcax    v20.16b, v31.16b, v22.16b,  v8.16b
153         bcax    v21.16b,  v8.16b, v23.16b, v22.16b
154         bcax    v22.16b, v22.16b, v24.16b, v23.16b
155         bcax    v23.16b, v23.16b, v31.16b, v24.16b
156         bcax    v24.16b, v24.16b,  v8.16b, v31.16b
157
            // v31 is free now: fetch this round's constant (copied into both
            // 64-bit lanes) and step x9 to the next one
158         ld1r    {v31.2d}, [x9], #8
159
160         bcax    v17.16b, v25.16b, v19.16b,  v3.16b
161         bcax    v18.16b,  v3.16b, v15.16b, v19.16b
162         bcax    v19.16b, v19.16b, v16.16b, v15.16b
163         bcax    v15.16b, v15.16b, v25.16b, v16.16b
164         bcax    v16.16b, v16.16b,  v3.16b, v25.16b
165
166         bcax    v10.16b, v29.16b, v12.16b, v26.16b
167         bcax    v11.16b, v26.16b, v13.16b, v12.16b
168         bcax    v12.16b, v12.16b, v14.16b, v13.16b
169         bcax    v13.16b, v13.16b, v29.16b, v14.16b
170         bcax    v14.16b, v14.16b, v26.16b, v29.16b
171
172         bcax     v7.16b, v30.16b,  v9.16b,  v4.16b
173         bcax     v8.16b,  v4.16b,  v5.16b,  v9.16b
174         bcax     v9.16b,  v9.16b,  v6.16b,  v5.16b
175         bcax     v5.16b,  v5.16b, v30.16b,  v6.16b
176         bcax     v6.16b,  v6.16b,  v4.16b, v30.16b
177
178         bcax     v3.16b, v27.16b,  v0.16b, v28.16b
179         bcax     v4.16b, v28.16b,  v1.16b,  v0.16b
180         bcax     v0.16b,  v0.16b,  v2.16b,  v1.16b
181         bcax     v1.16b,  v1.16b, v27.16b,  v2.16b
182         bcax     v2.16b,  v2.16b, v28.16b, v27.16b
183
            // XOR the round constant into word 0
184         eor      v0.16b,  v0.16b, v31.16b
185
186         cbnz    w8, 3b                  // more rounds for this block
            // cond_yield is defined outside this file; presumably it branches
            // to 4f when the code should yield (confirm in asm/assembler.h).
            // x8 and x9 are handed over as scratch: both are reloaded at 0:.
187         cond_yield 4f, x8, x9
188         cbnz    w2, 0b                  // more blocks to absorb
189
190         /* save state: x0 is advanced while storing, then reused for the return value */
191 4:      st1     { v0.1d- v3.1d}, [x0], #32
192         st1     { v4.1d- v7.1d}, [x0], #32
193         st1     { v8.1d-v11.1d}, [x0], #32
194         st1     {v12.1d-v15.1d}, [x0], #32
195         st1     {v16.1d-v19.1d}, [x0], #32
196         st1     {v20.1d-v23.1d}, [x0], #32
197         st1     {v24.1d}, [x0]
198         mov     w0, w2                  // return the number of blocks left
199         ret
200 SYM_FUNC_END(sha3_ce_transform)
201
202         .section        ".rodata", "a"
            // NOTE(review): GNU as on AArch64 treats the .align argument as a
            // power-of-two exponent, so this asks for 256-byte alignment of a
            // 192-byte table - confirm that this much alignment is intended.
203         .align          8
204 .Lsha3_rcon:
            // 24 round constants of 64 bits each, in round order. The round
            // loop in sha3_ce_transform reads one per round through x9 with a
            // post-incremented ld1r and XORs it into state word 0. The values
            // are the Keccak-f[1600] round constants listed in FIPS 202.
205         .quad   0x0000000000000001, 0x0000000000008082, 0x800000000000808a
206         .quad   0x8000000080008000, 0x000000000000808b, 0x0000000080000001
207         .quad   0x8000000080008081, 0x8000000000008009, 0x000000000000008a
208         .quad   0x0000000000000088, 0x0000000080008009, 0x000000008000000a
209         .quad   0x000000008000808b, 0x800000000000008b, 0x8000000000008089
210         .quad   0x8000000000008003, 0x8000000000008002, 0x8000000000000080
211         .quad   0x000000000000800a, 0x800000008000000a, 0x8000000080008081
212         .quad   0x8000000000008080, 0x0000000080000001, 0x8000000080008008