#ifndef _ASM_X86_XOR_H
#define _ASM_X86_XOR_H

/*
 * Optimized RAID-5 checksumming functions for SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * Cache-avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be had from x86-64 here anyway.
 */

#include <asm/fpu/api.h>

#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif
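
/*
 * On 32-bit x86 the "i" constraint forces the 256-byte loop increment to
 * be encoded as an immediate, so it does not tie up one of the few
 * general-purpose registers; on x86-64 "re" lets the compiler use either
 * a register or a sign-extended 32-bit immediate ("e"), since registers
 * are plentiful there.
 */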

#define OFFS(x)         "16*("#x")"
#define PF_OFFS(x)      "256+16*("#x")"
#define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
#define LD(x, y)        "       movaps "OFFS(x)"(%[p1]), %%xmm"#y"      ;\n"
#define ST(x, y)        "       movaps %%xmm"#y", "OFFS(x)"(%[p1])      ;\n"
#define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
#define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
#define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
#define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
#define XO1(x, y)       "       xorps "OFFS(x)"(%[p2]), %%xmm"#y"       ;\n"
#define XO2(x, y)       "       xorps "OFFS(x)"(%[p3]), %%xmm"#y"       ;\n"
#define XO3(x, y)       "       xorps "OFFS(x)"(%[p4]), %%xmm"#y"       ;\n"
#define XO4(x, y)       "       xorps "OFFS(x)"(%[p5]), %%xmm"#y"       ;\n"
#define NOP(x)

#define BLK64(pf, op, i)                                \
                pf(i)                                   \
                op(i, 0)                                \
                        op(i + 1, 1)                    \
                                op(i + 2, 2)            \
                                        op(i + 3, 3)
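
/*
 * Layout of the helpers above: OFFS(x) addresses the x-th 16-byte %xmm
 * slot and PF_OFFS(x) prefetches the same slot 256 bytes ahead, i.e. one
 * full loop iteration in advance.  BLK64 covers one 64-byte span (a
 * cache line) with a single prefetch plus four 16-byte operations
 * staggered across %xmm0-%xmm3.  Each xor_sse_N() below unrolls four
 * BLOCKs of four slots, so one pass of its "1:" loop consumes 256 bytes
 * per buffer; hence lines = bytes >> 8 and the [inc] operand of 256UL.
 * kernel_fpu_begin()/kernel_fpu_end() bracket the asm because it
 * clobbers %xmm state.
 */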

static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                        \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                PF1(i)                                  \
                                PF1(i + 2)              \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
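
/*
 * The _pf64 variants issue exactly one prefetchnta per 64-byte BLK64
 * chunk, rather than a pair of prefetches per 32 bytes as above, on the
 * assumption that one prefetch per cache line is enough.
 */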

static void
xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(NOP, ST, i)       \

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
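
/*
 * xor_sse_3/4/5 extend the same scheme to three, four and five buffers:
 * p1 serves as both a source and the destination, and every additional
 * source gets its own prefetch stream (PF2..PF4) and xorps stage
 * (XO2..XO4).
 */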

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
               unsigned long *p3)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(PF2, XO2, i)      \
                BLK64(NOP, ST, i)       \

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                PF3(i)                                  \
                                PF3(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                XO3(i, 0)                               \
                        XO3(i + 1, 1)                   \
                                XO3(i + 2, 2)           \
                                        XO3(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1),
          [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
               unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(PF2, XO2, i)      \
                BLK64(PF3, XO3, i)      \
                BLK64(NOP, ST, i)       \

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1),
          [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                PF3(i)                                  \
                                PF3(i + 2)              \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                PF4(i)                                  \
                                PF4(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO3(i, 0)                               \
                        XO3(i + 1, 1)                   \
                                XO3(i + 2, 2)           \
                                        XO3(i + 3, 3)   \
                XO4(i, 0)                               \
                        XO4(i + 1, 1)                   \
                                XO4(i + 2, 2)           \
                                        XO4(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       add %[inc], %[p5]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
          [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
               unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(PF2, XO2, i)      \
                BLK64(PF3, XO3, i)      \
                BLK64(PF4, XO4, i)      \
                BLK64(NOP, ST, i)       \

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       add %[inc], %[p5]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
          [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
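
/*
 * Only the prefetch64 flavours are exported from this file.  The plain
 * xor_sse_* variants above are picked up by the templates defined in
 * <asm/xor_32.h> and <asm/xor_64.h> (historically named "pIII_sse" and
 * "generic_sse"), whose XOR_TRY_TEMPLATES blocks register all of them
 * with the boot-time benchmark.
 */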

static struct xor_block_template xor_block_sse_pf64 = {
        .name = "prefetch64-sse",
        .do_2 = xor_sse_2_pf64,
        .do_3 = xor_sse_3_pf64,
        .do_4 = xor_sse_4_pf64,
        .do_5 = xor_sse_5_pf64,
};
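
/*
 * A minimal sketch of how the generic layer consumes a template like the
 * one above (hypothetical caller; the real selection and dispatch live in
 * crypto/xor.c, which benchmarks every registered template at boot and
 * keeps the winner in active_template):
 *
 *	static struct xor_block_template *active_template;
 *
 *	static void xor_two_blocks(unsigned long *dest, unsigned long *src)
 *	{
 *		// dest ^= src over one page, via the calibrated routine
 *		active_template->do_2(PAGE_SIZE, dest, src);
 *	}
 */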

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

#define XOR_SELECT_TEMPLATE(FASTEST) \
        AVX_SELECT(FASTEST)
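
/*
 * AVX_SELECT() comes from <asm/xor_avx.h> (pulled in via xor_32.h /
 * xor_64.h) and substitutes the AVX template for FASTEST when the CPU
 * supports AVX, so the benchmarked winner serves only as a fallback.
 */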

#endif /* _ASM_X86_XOR_H */