powerpc/64s: Make NMI record implicitly soft-masked code as irqs disabled
[linux-2.6-microblaze.git] / arch / x86 / crypto / blowfish-x86_64-asm_64.S
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3  * Blowfish Cipher Algorithm (x86_64)
4  *
5  * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
6  */
7
8 #include <linux/linkage.h>
9
10 .file "blowfish-x86_64-asm.S"
11 .text
12
13 /* structure of crypto context
 *
 * Byte offsets from the context base pointer (CTX):
 *   p       - start of the round-key words; the code reads them at
 *             p+4*(n), i.e. 4 bytes per word. The (16 + 2) term below
 *             places 18 such words ahead of the first table.
 *   s0..s3  - four lookup tables laid out back to back, each spaced
 *             256 entries of 4 bytes from the previous one.
 *
 * NOTE(review): presumably mirrors the C-side context structure; confirm
 * against its definition before changing any offset.
 */
14 #define p       0
15 #define s0      ((16 + 2) * 4)
16 #define s1      ((16 + 2 + (1 * 256)) * 4)
17 #define s2      ((16 + 2 + (2 * 256)) * 4)
18 #define s3      ((16 + 2 + (3 * 256)) * 4)
19
20 /* register macros
 *
 * CTX    - base pointer of the crypto context for every p/s0..s3 access.
 * RIO    - pointer used by the read/write/xor block macros.
 * RX0-3  - 64-bit block state, one register per block. They map to
 *          %rax/%rbx/%rcx/%rdx, the registers whose low-byte (bl) and
 *          high-byte (bh) forms are defined below and used by F()/F4().
 * RT0-3  - scratch registers for table indices and the F result; the
 *          "d" names are their 32-bit views.
 * RKEY   - round key preloaded by the 4-way macros.
 *
 * RIO and RT1 are both %rsi. F() and F4() overwrite RT1, so RIO is only
 * valid before the first round and after it has been reloaded.
 */
21 #define CTX %r12
22 #define RIO %rsi
23
24 #define RX0 %rax
25 #define RX1 %rbx
26 #define RX2 %rcx
27 #define RX3 %rdx
28
29 #define RX0d %eax
30 #define RX1d %ebx
31 #define RX2d %ecx
32 #define RX3d %edx
33
34 #define RX0bl %al
35 #define RX1bl %bl
36 #define RX2bl %cl
37 #define RX3bl %dl
38
39 #define RX0bh %ah
40 #define RX1bh %bh
41 #define RX2bh %ch
42 #define RX3bh %dh
43
44 #define RT0 %rdi
45 #define RT1 %rsi
46 #define RT2 %r8
47 #define RT3 %r9
48
49 #define RT0d %edi
50 #define RT1d %esi
51 #define RT2d %r8d
52 #define RT3d %r9d
53
54 #define RKEY %r10
55
56 /***********************************************************************
57  * 1-way blowfish
58  ***********************************************************************/
/*
 * F() - one F-function step on the 64-bit state in RX0 (1-way path).
 *
 * Takes the low 32 bits of RX0 as bytes a (most significant) .. d (least
 * significant) and computes, with 32-bit arithmetic,
 *
 *      ((s0[a] + s1[b]) ^ s2[c]) + s3[d]
 *
 * The first rorq/rolq pair temporarily brings bytes a and b into the
 * bh/bl position; the final rolq $32 swaps the two 32-bit halves of RX0.
 * The result is then XORed into the new low half (RT0 was written with
 * 32-bit operations, so its upper half is zero and the high half of RX0
 * is left untouched).
 *
 * Clobbers RT0, RT1 (= %rsi, which is also RIO) and RT2.
 */
59 #define F() \
60         rorq $16,               RX0; \
61         movzbl RX0bh,           RT0d; \
62         movzbl RX0bl,           RT1d; \
63         rolq $16,               RX0; \
64         movl s0(CTX,RT0,4),     RT0d; \
65         addl s1(CTX,RT1,4),     RT0d; \
66         movzbl RX0bh,           RT1d; \
67         movzbl RX0bl,           RT2d; \
68         rolq $32,               RX0; \
69         xorl s2(CTX,RT1,4),     RT0d; \
70         addl s3(CTX,RT2,4),     RT0d; \
71         xorq RT0,               RX0;
72
/*
 * add_roundkey_enc(n) - XOR round-key words n and n+1 into RX0.
 *
 * A single 64-bit access at p+4*(n) fetches both 4-byte words: word n
 * lands on the low half of RX0 and word n+1 on the high half.
 */
73 #define add_roundkey_enc(n) \
74         xorq p+4*(n)(CTX),      RX0;
75
/*
 * round_enc(n) - apply round-key words n and n+1 to RX0, then run F()
 * twice. Clobbers whatever F() clobbers (RT0, RT1/RIO, RT2).
 */
76 #define round_enc(n) \
77         add_roundkey_enc(n); \
78         \
79         F(); \
80         F();
81
/*
 * add_roundkey_dec(n) - XOR round-key words n and n-1 into RX0.
 *
 * One 64-bit load at p+4*((n)-1) fetches words n-1 (low half) and n
 * (high half); the 32-bit rotate swaps them so that word n is applied
 * to the low half of RX0 and word n-1 to the high half.
 *
 * The argument is parenthesised so that any expression may be passed.
 * Clobbers RT0.
 */
82 #define add_roundkey_dec(n) \
83         movq p+4*((n)-1)(CTX),  RT0; \
84         rorq $32,               RT0; \
85         xorq RT0,               RX0;
86
/*
 * round_dec(n) - apply round-key words n and n-1 to RX0 (see
 * add_roundkey_dec), then run F() twice. Clobbers RT0, RT1/RIO and RT2.
 *
 * The last line deliberately carries no line continuation, so the macro
 * body ends with the second F().
 */
87 #define round_dec(n) \
88         add_roundkey_dec(n); \
89         \
90         F(); \
91         F();
92
/*
 * read_block() - load the 8 bytes at (RIO) into RX0.
 *
 * The 32-bit rotate followed by the 64-bit byte swap leaves the first
 * four bytes, read as a big-endian 32-bit word, in the low half of RX0
 * and the next four bytes, also big-endian, in the high half.
 */
93 #define read_block() \
94         movq (RIO),             RX0; \
95         rorq $32,               RX0; \
96         bswapq                  RX0;
97
/*
 * write_block() - byte-reverse RX0 and store it to the 8 bytes at (RIO).
 *
 * Because of the full 64-bit byte swap, the high half of RX0 is written
 * first and the low half second, each as a big-endian 32-bit word.
 * RX0 is left byte-reversed.
 */
98 #define write_block() \
99         bswapq                  RX0; \
100         movq RX0,               (RIO);
101
/*
 * xor_block() - like write_block(), but XORs the byte-reversed RX0 into
 * the 8 bytes already at (RIO) instead of overwriting them.
 */
102 #define xor_block() \
103         bswapq                  RX0; \
104         xorq RX0,               (RIO);
105
106 SYM_FUNC_START(__blowfish_enc_blk)
107         /* input:
108          *      %rdi: ctx
109          *      %rsi: dst
110          *      %rdx: src
111          *      %rcx: bool, if true: xor output
             *
             * Encrypts the 8-byte block at src with the 1-way macros and
             * either stores the result to dst or, if the flag in %cl is
             * non-zero, XORs it into dst.
             *
             * Overwrites %rax, %rdi, %rsi, %r8, %r10, %r11 and the flags.
             * %r12 (CTX) is restored before returning.
112          */
113         movq %r12, %r11;                /* keep the caller's %r12 while it serves as CTX */
114
115         movq %rdi, CTX;
116         movq %rsi, %r10;                /* park dst: %rsi is RIO and also RT1, which F() overwrites */
117         movq %rdx, RIO;                 /* read the input block from src */
118
119         read_block();
120
121         round_enc(0);
122         round_enc(2);
123         round_enc(4);
124         round_enc(6);
125         round_enc(8);
126         round_enc(10);
127         round_enc(12);
128         round_enc(14);
129         add_roundkey_enc(16);           /* last two key words, 16 and 17 */
130
131         movq %r11, %r12;                /* no more context accesses: restore %r12 */
132
133         movq %r10, RIO;                 /* output goes to dst */
134         test %cl, %cl;                  /* %rcx is not used by the 1-way macros, flag still intact */
135         jnz .L__enc_xor;
136
137         write_block();
138         ret;
139 .L__enc_xor:
140         xor_block();
141         ret;
142 SYM_FUNC_END(__blowfish_enc_blk)
143
144 SYM_FUNC_START(blowfish_dec_blk)
145         /* input:
146          *      %rdi: ctx
147          *      %rsi: dst
148          *      %rdx: src
             *
             * Decrypts the 8-byte block at src with the 1-way macros,
             * walking the round-key words from 17 down to 0, and stores
             * the result to dst.
             *
             * Overwrites %rax, %rdi, %rsi, %r8, %r10, %r11 and the flags.
             * %r12 (CTX) is restored before returning.
149          */
150         movq %r12, %r11;                /* keep the caller's %r12 while it serves as CTX */
151
152         movq %rdi, CTX;
153         movq %rsi, %r10;                /* park dst: %rsi is RIO and also RT1, which F() overwrites */
154         movq %rdx, RIO;                 /* read the input block from src */
155
156         read_block();
157
158         round_dec(17);
159         round_dec(15);
160         round_dec(13);
161         round_dec(11);
162         round_dec(9);
163         round_dec(7);
164         round_dec(5);
165         round_dec(3);
166         add_roundkey_dec(1);            /* last two key words, 1 and 0 */
167
168         movq %r10, RIO;                 /* output goes to dst */
169         write_block();
170
171         movq %r11, %r12;                /* restore the caller's %r12 */
172
173         ret;
174 SYM_FUNC_END(blowfish_dec_blk)
175
176 /**********************************************************************
177   4-way blowfish, four blocks in parallel
178  **********************************************************************/
179
180 /* F4(x) - the F() step for the 4-way path, applied to block register x.
 *
 * Slower when used alone/1-way, but faster when used in parallel/4-way
 * (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * x must be one of RX0..RX3: the macro pastes "bh"/"bl" onto the argument
 * name to select that register's byte sub-registers.
 *
 * Takes the low 32 bits of x as bytes a (most significant) .. d (least
 * significant) and computes, with 32-bit arithmetic,
 *
 *      ((s0[a] + s1[b]) ^ s2[c]) + s3[d]
 *
 * Bytes c and d are picked up first; after a 16-bit rotate bytes a and b
 * are available as bh/bl. The second 16-bit rotate completes a 32-bit
 * rotation, i.e. the halves of x are swapped, and the result is XORed
 * into the new low half.
 *
 * Clobbers RT0, RT1 (= %rsi, which is also RIO), RT2 and RT3.
 */
183 #define F4(x) \
184         movzbl x ## bh,         RT1d; \
185         movzbl x ## bl,         RT3d; \
186         rorq $16,               x; \
187         movzbl x ## bh,         RT0d; \
188         movzbl x ## bl,         RT2d; \
189         rorq $16,               x; \
190         movl s0(CTX,RT0,4),     RT0d; \
191         addl s1(CTX,RT2,4),     RT0d; \
192         xorl s2(CTX,RT1,4),     RT0d; \
193         addl s3(CTX,RT3,4),     RT0d; \
194         xorq RT0,               x;
195
/*
 * add_preloaded_roundkey4() - XOR the 64-bit value currently held in
 * RKEY into all four block registers RX0..RX3. RKEY itself is unchanged.
 */
196 #define add_preloaded_roundkey4() \
197         xorq RKEY,              RX0; \
198         xorq RKEY,              RX1; \
199         xorq RKEY,              RX2; \
200         xorq RKEY,              RX3;
201
/*
 * preload_roundkey_enc(n) - load round-key words n (low half) and n+1
 * (high half) into RKEY with one 64-bit read at p+4*(n).
 */
202 #define preload_roundkey_enc(n) \
203         movq p+4*(n)(CTX),      RKEY;
204
/*
 * add_roundkey_enc4(n) - apply the round key already sitting in RKEY to
 * RX0..RX3, then preload the key for step n + 2.
 *
 * RKEY must have been loaded for step n beforehand, either by the
 * previous add_roundkey_enc4() or by an explicit preload_roundkey_enc().
 */
205 #define add_roundkey_enc4(n) \
206         add_preloaded_roundkey4(); \
207         preload_roundkey_enc(n + 2);
208
/*
 * round_enc4(n) - 4-way encryption step: apply the preloaded round key
 * to all four blocks (preloading the one for n + 2), then run F4() over
 * RX0..RX3 twice, one block after the other in each pass.
 *
 * Clobbers RT0..RT3 and updates RKEY.
 */
209 #define round_enc4(n) \
210         add_roundkey_enc4(n); \
211         \
212         F4(RX0); \
213         F4(RX1); \
214         F4(RX2); \
215         F4(RX3); \
216         \
217         F4(RX0); \
218         F4(RX1); \
219         F4(RX2); \
220         F4(RX3);
221
/*
 * preload_roundkey_dec(n) - load round-key words n-1 and n with one
 * 64-bit read at p+4*((n)-1), then swap the halves so that RKEY holds
 * word n in its low half and word n-1 in its high half.
 */
222 #define preload_roundkey_dec(n) \
223         movq p+4*((n)-1)(CTX),  RKEY; \
224         rorq $32,               RKEY;
225
/*
 * add_roundkey_dec4(n) - apply the round key already sitting in RKEY to
 * RX0..RX3, then preload the key for step n - 2.
 *
 * RKEY must have been loaded for step n beforehand, either by the
 * previous add_roundkey_dec4() or by an explicit preload_roundkey_dec().
 */
226 #define add_roundkey_dec4(n) \
227         add_preloaded_roundkey4(); \
228         preload_roundkey_dec(n - 2);
229
/*
 * round_dec4(n) - 4-way decryption step: apply the preloaded round key
 * to all four blocks (preloading the one for n - 2), then run F4() over
 * RX0..RX3 twice, one block after the other in each pass.
 *
 * Clobbers RT0..RT3 and updates RKEY.
 */
230 #define round_dec4(n) \
231         add_roundkey_dec4(n); \
232         \
233         F4(RX0); \
234         F4(RX1); \
235         F4(RX2); \
236         F4(RX3); \
237         \
238         F4(RX0); \
239         F4(RX1); \
240         F4(RX2); \
241         F4(RX3);
242
/*
 * read_block4() - load four consecutive 8-byte blocks from (RIO),
 * 8(RIO), 16(RIO) and 24(RIO) into RX0..RX3.
 *
 * Each block gets the same rotate + byte-swap treatment as in
 * read_block(): its first four bytes end up as a big-endian word in the
 * low half of the register, the next four in the high half.
 */
243 #define read_block4() \
244         movq (RIO),             RX0; \
245         rorq $32,               RX0; \
246         bswapq                  RX0; \
247         \
248         movq 8(RIO),            RX1; \
249         rorq $32,               RX1; \
250         bswapq                  RX1; \
251         \
252         movq 16(RIO),           RX2; \
253         rorq $32,               RX2; \
254         bswapq                  RX2; \
255         \
256         movq 24(RIO),           RX3; \
257         rorq $32,               RX3; \
258         bswapq                  RX3;
259
/*
 * write_block4() - byte-reverse RX0..RX3 and store them to the four
 * consecutive 8-byte slots at (RIO), 8(RIO), 16(RIO) and 24(RIO).
 * The registers are left byte-reversed.
 */
260 #define write_block4() \
261         bswapq                  RX0; \
262         movq RX0,               (RIO); \
263         \
264         bswapq                  RX1; \
265         movq RX1,               8(RIO); \
266         \
267         bswapq                  RX2; \
268         movq RX2,               16(RIO); \
269         \
270         bswapq                  RX3; \
271         movq RX3,               24(RIO);
272
/*
 * xor_block4() - like write_block4(), but XORs each byte-reversed
 * register into the 8 bytes already present at (RIO), 8(RIO), 16(RIO)
 * and 24(RIO) instead of overwriting them.
 */
273 #define xor_block4() \
274         bswapq                  RX0; \
275         xorq RX0,               (RIO); \
276         \
277         bswapq                  RX1; \
278         xorq RX1,               8(RIO); \
279         \
280         bswapq                  RX2; \
281         xorq RX2,               16(RIO); \
282         \
283         bswapq                  RX3; \
284         xorq RX3,               24(RIO);
285
286 SYM_FUNC_START(__blowfish_enc_blk_4way)
287         /* input:
288          *      %rdi: ctx
289          *      %rsi: dst
290          *      %rdx: src
291          *      %rcx: bool, if true: xor output
             *
             * Encrypts the four consecutive 8-byte blocks at src with the
             * 4-way macros and either stores the results to dst or, if
             * the flag is non-zero, XORs them into dst.
             *
             * Overwrites %rax, %rcx, %rdx, %rdi, %rsi, %r8-%r11 and the
             * flags. %rbx and %r12 are saved on the stack and restored.
292          */
293         pushq %r12;                     /* callee-saved, used as CTX */
294         pushq %rbx;                     /* callee-saved, used as RX1 */
295         pushq %rcx;                     /* xor flag: %rcx is about to become RX2 */
296
297         movq %rdi, CTX;
298         movq %rsi, %r11;                /* park dst: %rsi is RIO and also RT1, which F4() overwrites */
299         movq %rdx, RIO;                 /* read the input blocks from src */
300
301         preload_roundkey_enc(0);
302
303         read_block4();
304
305         round_enc4(0);
306         round_enc4(2);
307         round_enc4(4);
308         round_enc4(6);
309         round_enc4(8);
310         round_enc4(10);
311         round_enc4(12);
312         round_enc4(14);
313         add_preloaded_roundkey4();      /* key words 16 and 17, preloaded by round_enc4(14) */
314
315         popq %r12;                      /* CTX no longer needed: fetch the saved xor flag */
316         movq %r11, RIO;                 /* output goes to dst */
317
318         test %r12b, %r12b;
319         jnz .L__enc_xor4;
320
321         write_block4();
322
323         popq %rbx;
324         popq %r12;
325         ret;
326
327 .L__enc_xor4:
328         xor_block4();
329
330         popq %rbx;
331         popq %r12;
332         ret;
333 SYM_FUNC_END(__blowfish_enc_blk_4way)
334
335 SYM_FUNC_START(blowfish_dec_blk_4way)
336         /* input:
337          *      %rdi: ctx
338          *      %rsi: dst
339          *      %rdx: src
             *
             * Decrypts the four consecutive 8-byte blocks at src with the
             * 4-way macros, walking the round-key words from 17 down to
             * 0, and stores the results to dst.
             *
             * Overwrites %rax, %rcx, %rdx, %rdi, %rsi, %r8-%r11 and the
             * flags. %rbx and %r12 are saved on the stack and restored.
340          */
341         pushq %r12;                     /* callee-saved, used as CTX */
342         pushq %rbx;                     /* callee-saved, used as RX1 */
343
344         movq %rdi, CTX;
345         movq %rsi, %r11;                /* park dst: %rsi is RIO and also RT1, which F4() overwrites */
346         movq %rdx, RIO;                 /* read the input blocks from src */
347
348         preload_roundkey_dec(17);
349         read_block4();
350
351         round_dec4(17);
352         round_dec4(15);
353         round_dec4(13);
354         round_dec4(11);
355         round_dec4(9);
356         round_dec4(7);
357         round_dec4(5);
358         round_dec4(3);
359         add_preloaded_roundkey4();      /* key words 1 and 0, preloaded by round_dec4(3) */
360
361         movq %r11, RIO;                 /* output goes to dst */
362         write_block4();
363
364         popq %rbx;
365         popq %r12;
366
367         ret;
368 SYM_FUNC_END(blowfish_dec_blk_4way)