io_uring: don't assume mm is constant across submits
[linux-2.6-microblaze.git] / lib / raid6 / avx512.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* -*- linux-c -*- --------------------------------------------------------
3  *
4  *   Copyright (C) 2016 Intel Corporation
5  *
6  *   Author: Gayatri Kammela <gayatri.kammela@intel.com>
7  *   Author: Megha Dey <megha.dey@linux.intel.com>
8  *
9  *   Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
10  *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
11  *
12  * -----------------------------------------------------------------------
13  */
14
15 /*
16  * AVX512 implementation of RAID-6 syndrome functions
17  *
18  */
19
20 #ifdef CONFIG_AS_AVX512
21
22 #include <linux/raid/pq.h>
23 #include "x86.h"
24
25 static const struct raid6_avx512_constants {
26         u64 x1d[8];
27 } raid6_avx512_constants __aligned(512/8) = {
28         { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
29           0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
30           0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
31           0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
32 };
33
34 static int raid6_have_avx512(void)
35 {
36         return boot_cpu_has(X86_FEATURE_AVX2) &&
37                 boot_cpu_has(X86_FEATURE_AVX) &&
38                 boot_cpu_has(X86_FEATURE_AVX512F) &&
39                 boot_cpu_has(X86_FEATURE_AVX512BW) &&
40                 boot_cpu_has(X86_FEATURE_AVX512VL) &&
41                 boot_cpu_has(X86_FEATURE_AVX512DQ);
42 }
43
/*
 * Compute the P (XOR parity) and Q (RS syndrome) blocks from scratch,
 * processing 64 bytes (one zmm register) per outer-loop iteration.
 *
 * @disks: total number of entries in @ptrs; the last two are P and Q,
 *         the rest are data blocks
 * @bytes: number of bytes to process in each block (stepped in units of 64)
 * @ptrs:  array of block pointers: data blocks 0..disks-3, then P, then Q
 *
 * Register usage across the asm statements:
 *   zmm0 = 0x1d in every byte, zmm1 = zero,
 *   zmm2 = running P, zmm4 = running Q,
 *   zmm5 = scratch mask, zmm6 = most recently loaded data block.
 *
 * NOTE(review): vmovdqa64/vmovntdq are aligned-access instructions, so the
 * buffers are presumably 64-byte aligned - confirm against callers.
 * NOTE(review): dptr[z0-1] is read unconditionally, so this appears to
 * assume at least two data disks (disks >= 4) - confirm.
 */
static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = disks - 3;         /* Highest data disk */
        p = dptr[z0+1];         /* XOR parity */
        q = dptr[z0+2];         /* RS syndrome */

        kernel_fpu_begin();

        /* Load the 0x1d constant into zmm0 and clear zmm1 once, up front */
        asm volatile("vmovdqa64 %0,%%zmm0\n\t"
                     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
                     :
                     : "m" (raid6_avx512_constants.x1d[0]));

        for (d = 0; d < bytes; d += 64) {
                /*
                 * Seed P and Q with the highest data disk and preload the
                 * next lower disk into zmm6.
                 */
                asm volatile("prefetchnta %0\n\t"
                             "vmovdqa64 %0,%%zmm2\n\t"     /* P[0] */
                             "prefetchnta %1\n\t"
                             "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
                             "vmovdqa64 %1,%%zmm6"
                             :
                             : "m" (dptr[z0][d]), "m" (dptr[z0-1][d]));
                /*
                 * For each remaining disk: double every byte of Q, XOR 0x1d
                 * into the bytes whose top bit was set before doubling
                 * (vpcmpgtb against zero builds that mask), then fold the
                 * previously loaded data (zmm6) into P and Q and load the
                 * next disk into zmm6.
                 */
                for (z = z0-2; z >= 0; z--) {
                        asm volatile("prefetchnta %0\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
                                     "vmovdqa64 %0,%%zmm6"
                                     :
                                     : "m" (dptr[z][d]));
                }
                /*
                 * Final round for the data still pending in zmm6, then
                 * write P and Q with non-temporal stores and clear the
                 * accumulators.
                 */
                asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
                             "vpmovm2b %%k1,%%zmm5\n\t"
                             "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                             "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                             "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                             "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
                             "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
                             "vmovntdq %%zmm2,%0\n\t"
                             "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
                             "vmovntdq %%zmm4,%1\n\t"
                             "vpxorq %%zmm4,%%zmm4,%%zmm4"
                             :
                             : "m" (p[d]), "m" (q[d]));
        }

        /* Store fence (with compiler memory barrier) after the non-temporal stores */
        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}
100
/*
 * Fold the contribution of data disks start..stop into the existing P and
 * Q blocks, processing 64 bytes per outer-loop iteration.
 *
 * @disks: total number of entries in @ptrs; P is dptr[disks-2] and Q is
 *         dptr[disks-1]
 * @start: lowest data disk index whose data is read
 * @stop:  highest data disk index whose data is read
 * @bytes: number of bytes to process in each block (stepped in units of 64)
 * @ptrs:  array of block pointers
 *
 * Register usage: zmm0 = 0x1d in every byte, zmm2 = P accumulator,
 * zmm4 = Q accumulator, zmm5 = scratch (mask, then loaded data).
 *
 * NOTE(review): vmovdqa64 is an aligned-access instruction, so the buffers
 * are presumably 64-byte aligned - confirm against callers.
 */
static void raid6_avx5121_xor_syndrome(int disks, int start, int stop,
                                       size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = stop;              /* P/Q right side optimization */
        p = dptr[disks-2];      /* XOR parity */
        q = dptr[disks-1];      /* RS syndrome */

        kernel_fpu_begin();

        /* Load the 0x1d constant into zmm0 */
        asm volatile("vmovdqa64 %0,%%zmm0"
                     : : "m" (raid6_avx512_constants.x1d[0]));

        for (d = 0 ; d < bytes ; d += 64) {
                /*
                 * zmm4 starts as the data of disk 'stop'; zmm2 is the
                 * current P XORed with that data.
                 */
                asm volatile("vmovdqa64 %0,%%zmm4\n\t"
                             "vmovdqa64 %1,%%zmm2\n\t"
                             "vpxorq %%zmm4,%%zmm2,%%zmm2"
                             :
                             : "m" (dptr[z0][d]),  "m" (p[d]));
                /* P/Q data pages */
                /*
                 * Double every byte of zmm4, XOR 0x1d into the bytes whose
                 * top bit was set before doubling, then XOR this disk's
                 * data into both accumulators.
                 */
                for (z = z0-1 ; z >= start ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vmovdqa64 %0,%%zmm5\n\t"
                                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4"
                                     :
                                     : "m" (dptr[z][d]));
                }
                /* P/Q left side optimization */
                /*
                 * Disks below 'start' contribute no data: only the
                 * doubling/reduction step is applied to zmm4, once per disk.
                 */
                for (z = start-1 ; z >= 0 ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4"
                                     :
                                     : );
                }
                /*
                 * XOR the existing Q into zmm4, then write the new Q and P
                 * back with ordinary aligned stores.
                 */
                asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
                /* Don't use movntdq for r/w memory area < cache line */
                             "vmovdqa64 %%zmm4,%0\n\t"
                             "vmovdqa64 %%zmm2,%1"
                             :
                             : "m" (q[d]), "m" (p[d]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}
159
/*
 * Method table for the non-unrolled (64 bytes per iteration) AVX512
 * implementation. The initializer is positional; the field meanings are
 * those of struct raid6_calls (declared outside this file, presumably in
 * <linux/raid/pq.h>).
 */
const struct raid6_calls raid6_avx512x1 = {
        raid6_avx5121_gen_syndrome,     /* full P/Q generation routine */
        raid6_avx5121_xor_syndrome,     /* partial P/Q update routine */
        raid6_have_avx512,              /* CPU feature check */
        "avx512x1",                     /* name of this implementation */
        1                       /* Has cache hints */
};
167
168 /*
169  * Unrolled-by-2 AVX512 implementation
170  */
/*
 * Compute the P (XOR parity) and Q (RS syndrome) blocks from scratch,
 * processing 128 bytes (two zmm registers) per outer-loop iteration.
 *
 * @disks: total number of entries in @ptrs; the last two are P and Q,
 *         the rest are data blocks
 * @bytes: number of bytes to process in each block (stepped in units of 128)
 * @ptrs:  array of block pointers: data blocks 0..disks-3, then P, then Q
 *
 * Register usage across the asm statements:
 *   zmm0 = 0x1d in every byte, zmm1 = zero,
 *   zmm2/zmm3 = running P for the two 64-byte lanes,
 *   zmm4/zmm6 = running Q for the two lanes,
 *   zmm5/zmm7 = scratch (mask, then loaded data).
 *
 * NOTE(review): vmovdqa64/vmovntdq are aligned-access instructions, so the
 * buffers are presumably 64-byte aligned - confirm against callers.
 */
static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = disks - 3;         /* Highest data disk */
        p = dptr[z0+1];         /* XOR parity */
        q = dptr[z0+2];         /* RS syndrome */

        kernel_fpu_begin();

        /* Load the 0x1d constant into zmm0 and clear zmm1 once, up front */
        asm volatile("vmovdqa64 %0,%%zmm0\n\t"
                     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
                     :
                     : "m" (raid6_avx512_constants.x1d[0]));

        /* We uniformly assume a single prefetch covers at least 64 bytes */
        for (d = 0; d < bytes; d += 128) {
                /* Seed both P and Q lanes with the highest data disk */
                asm volatile("prefetchnta %0\n\t"
                             "prefetchnta %1\n\t"
                             "vmovdqa64 %0,%%zmm2\n\t"      /* P[0] */
                             "vmovdqa64 %1,%%zmm3\n\t"      /* P[1] */
                             "vmovdqa64 %%zmm2,%%zmm4\n\t"  /* Q[0] */
                             "vmovdqa64 %%zmm3,%%zmm6"      /* Q[1] */
                             :
                             : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
                /*
                 * For each lower disk: double every byte of Q, XOR 0x1d
                 * into the bytes whose top bit was set before doubling,
                 * then load this disk's data and XOR it into P and Q.
                 */
                for (z = z0-1; z >= 0; z--) {
                        asm volatile("prefetchnta %0\n\t"
                                     "prefetchnta %1\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
                                     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpmovm2b %%k2,%%zmm7\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vmovdqa64 %0,%%zmm5\n\t"
                                     "vmovdqa64 %1,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6"
                                     :
                                     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
                }
                /* Write both lanes of P and Q with non-temporal stores */
                asm volatile("vmovntdq %%zmm2,%0\n\t"
                             "vmovntdq %%zmm3,%1\n\t"
                             "vmovntdq %%zmm4,%2\n\t"
                             "vmovntdq %%zmm6,%3"
                             :
                             : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
                               "m" (q[d+64]));
        }

        /* Store fence (with compiler memory barrier) after the non-temporal stores */
        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}
232
/*
 * Fold the contribution of data disks start..stop into the existing P and
 * Q blocks, processing 128 bytes (two zmm registers) per outer-loop
 * iteration.
 *
 * @disks: total number of entries in @ptrs; P is dptr[disks-2] and Q is
 *         dptr[disks-1]
 * @start: lowest data disk index whose data is read
 * @stop:  highest data disk index whose data is read
 * @bytes: number of bytes to process in each block (stepped in units of 128)
 * @ptrs:  array of block pointers
 *
 * Register usage: zmm0 = 0x1d in every byte, zmm2/zmm3 = P accumulators,
 * zmm4/zmm6 = Q accumulators, zmm5/zmm7 = scratch (mask, then loaded data).
 *
 * NOTE(review): vmovdqa64 is an aligned-access instruction, so the buffers
 * are presumably 64-byte aligned - confirm against callers.
 */
static void raid6_avx5122_xor_syndrome(int disks, int start, int stop,
                                       size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = stop;              /* P/Q right side optimization */
        p = dptr[disks-2];      /* XOR parity */
        q = dptr[disks-1];      /* RS syndrome */

        kernel_fpu_begin();

        /* Load the 0x1d constant into zmm0 */
        asm volatile("vmovdqa64 %0,%%zmm0"
                     : : "m" (raid6_avx512_constants.x1d[0]));

        for (d = 0 ; d < bytes ; d += 128) {
                /*
                 * zmm4/zmm6 start as the data of disk 'stop'; zmm2/zmm3 are
                 * the current P XORed with that data.
                 */
                asm volatile("vmovdqa64 %0,%%zmm4\n\t"
                             "vmovdqa64 %1,%%zmm6\n\t"
                             "vmovdqa64 %2,%%zmm2\n\t"
                             "vmovdqa64 %3,%%zmm3\n\t"
                             "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
                             "vpxorq %%zmm6,%%zmm3,%%zmm3"
                             :
                             : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
                               "m" (p[d]), "m" (p[d+64]));
                /* P/Q data pages */
                /*
                 * Double every byte of the Q accumulators, XOR 0x1d into the
                 * bytes whose top bit was set before doubling, then XOR this
                 * disk's data into the P and Q accumulators.
                 */
                for (z = z0-1 ; z >= start ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpmovm2b %%k2,%%zmm7\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vmovdqa64 %0,%%zmm5\n\t"
                                     "vmovdqa64 %1,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6"
                                     :
                                     : "m" (dptr[z][d]),  "m" (dptr[z][d+64]));
                }
                /* P/Q left side optimization */
                /*
                 * Disks below 'start' contribute no data: only the
                 * doubling/reduction step is applied, once per disk.
                 */
                for (z = start-1 ; z >= 0 ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpmovm2b %%k2,%%zmm7\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6"
                                     :
                                     : );
                }
                /*
                 * XOR the existing Q into the accumulators, then write the
                 * new Q and P back with ordinary aligned stores.
                 */
                asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
                             "vpxorq %1,%%zmm6,%%zmm6\n\t"
                             /* Don't use movntdq for r/w
                              * memory area < cache line
                              */
                             "vmovdqa64 %%zmm4,%0\n\t"
                             "vmovdqa64 %%zmm6,%1\n\t"
                             "vmovdqa64 %%zmm2,%2\n\t"
                             "vmovdqa64 %%zmm3,%3"
                             :
                             : "m" (q[d]), "m" (q[d+64]), "m" (p[d]),
                               "m" (p[d+64]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}
316
/*
 * Method table for the unrolled-by-2 (128 bytes per iteration) AVX512
 * implementation. The initializer is positional; the field meanings are
 * those of struct raid6_calls (declared outside this file, presumably in
 * <linux/raid/pq.h>).
 */
const struct raid6_calls raid6_avx512x2 = {
        raid6_avx5122_gen_syndrome,     /* full P/Q generation routine */
        raid6_avx5122_xor_syndrome,     /* partial P/Q update routine */
        raid6_have_avx512,              /* CPU feature check */
        "avx512x2",                     /* name of this implementation */
        1                       /* Has cache hints */
};
324
325 #ifdef CONFIG_X86_64
326
327 /*
 * Unrolled-by-4 AVX512 implementation
329  */
330 static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
331 {
332         u8 **dptr = (u8 **)ptrs;
333         u8 *p, *q;
334         int d, z, z0;
335
336         z0 = disks - 3;         /* Highest data disk */
337         p = dptr[z0+1];         /* XOR parity */
338         q = dptr[z0+2];         /* RS syndrome */
339
340         kernel_fpu_begin();
341
342         asm volatile("vmovdqa64 %0,%%zmm0\n\t"
343                      "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t"       /* Zero temp */
344                      "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"       /* P[0] */
345                      "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"       /* P[1] */
346                      "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"       /* Q[0] */
347                      "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"       /* Q[1] */
348                      "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"    /* P[2] */
349                      "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"    /* P[3] */
350                      "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"    /* Q[2] */
351                      "vpxorq %%zmm14,%%zmm14,%%zmm14"        /* Q[3] */
352                      :
353                      : "m" (raid6_avx512_constants.x1d[0]));
354
355         for (d = 0; d < bytes; d += 256) {
356                 for (z = z0; z >= 0; z--) {
357                 asm volatile("prefetchnta %0\n\t"
358                              "prefetchnta %1\n\t"
359                              "prefetchnta %2\n\t"
360                              "prefetchnta %3\n\t"
361                              "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
362                              "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
363                              "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
364                              "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
365                              "vpmovm2b %%k1,%%zmm5\n\t"
366                              "vpmovm2b %%k2,%%zmm7\n\t"
367                              "vpmovm2b %%k3,%%zmm13\n\t"
368                              "vpmovm2b %%k4,%%zmm15\n\t"
369                              "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
370                              "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
371                              "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
372                              "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
373                              "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
374                              "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
375                              "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
376                              "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
377                              "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
378                              "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
379                              "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
380                              "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
381                              "vmovdqa64 %0,%%zmm5\n\t"
382                              "vmovdqa64 %1,%%zmm7\n\t"
383                              "vmovdqa64 %2,%%zmm13\n\t"
384                              "vmovdqa64 %3,%%zmm15\n\t"
385                              "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
386                              "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
387                              "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
388                              "vpxorq %%zmm15,%%zmm11,%%zmm11\n"
389                              "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
390                              "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
391                              "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
392                              "vpxorq %%zmm15,%%zmm14,%%zmm14"
393                              :
394                              : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
395                                "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
396                 }
397                 asm volatile("vmovntdq %%zmm2,%0\n\t"
398                              "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
399                              "vmovntdq %%zmm3,%1\n\t"
400                              "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
401                              "vmovntdq %%zmm10,%2\n\t"
402                              "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
403                              "vmovntdq %%zmm11,%3\n\t"
404                              "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
405                              "vmovntdq %%zmm4,%4\n\t"
406                              "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
407                              "vmovntdq %%zmm6,%5\n\t"
408                              "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
409                              "vmovntdq %%zmm12,%6\n\t"
410                              "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
411                              "vmovntdq %%zmm14,%7\n\t"
412                              "vpxorq %%zmm14,%%zmm14,%%zmm14"
413                              :
414                              : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
415                                "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
416                                "m" (q[d+128]), "m" (q[d+192]));
417         }
418
419         asm volatile("sfence" : : : "memory");
420         kernel_fpu_end();
421 }
422
423 static void raid6_avx5124_xor_syndrome(int disks, int start, int stop,
424                                        size_t bytes, void **ptrs)
425 {
426         u8 **dptr = (u8 **)ptrs;
427         u8 *p, *q;
428         int d, z, z0;
429
430         z0 = stop;              /* P/Q right side optimization */
431         p = dptr[disks-2];      /* XOR parity */
432         q = dptr[disks-1];      /* RS syndrome */
433
434         kernel_fpu_begin();
435
436         asm volatile("vmovdqa64 %0,%%zmm0"
437                      :: "m" (raid6_avx512_constants.x1d[0]));
438
439         for (d = 0 ; d < bytes ; d += 256) {
440                 asm volatile("vmovdqa64 %0,%%zmm4\n\t"
441                              "vmovdqa64 %1,%%zmm6\n\t"
442                              "vmovdqa64 %2,%%zmm12\n\t"
443                              "vmovdqa64 %3,%%zmm14\n\t"
444                              "vmovdqa64 %4,%%zmm2\n\t"
445                              "vmovdqa64 %5,%%zmm3\n\t"
446                              "vmovdqa64 %6,%%zmm10\n\t"
447                              "vmovdqa64 %7,%%zmm11\n\t"
448                              "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
449                              "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t"
450                              "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t"
451                              "vpxorq %%zmm14,%%zmm11,%%zmm11"
452                              :
453                              : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
454                                "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]),
455                                "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
456                                "m" (p[d+192]));
457                 /* P/Q data pages */
458                 for (z = z0-1 ; z >= start ; z--) {
459                         asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
460                                      "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
461                                      "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
462                                      "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
463                                      "prefetchnta %0\n\t"
464                                      "prefetchnta %2\n\t"
465                                      "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
466                                      "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
467                                      "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
468                                      "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
469                                      "vpmovm2b %%k1,%%zmm5\n\t"
470                                      "vpmovm2b %%k2,%%zmm7\n\t"
471                                      "vpmovm2b %%k3,%%zmm13\n\t"
472                                      "vpmovm2b %%k4,%%zmm15\n\t"
473                                      "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
474                                      "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
475                                      "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
476                                      "vpaddb %%Zmm14,%%zmm14,%%zmm14\n\t"
477                                      "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
478                                      "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
479                                      "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
480                                      "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
481                                      "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
482                                      "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
483                                      "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
484                                      "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
485                                      "vmovdqa64 %0,%%zmm5\n\t"
486                                      "vmovdqa64 %1,%%zmm7\n\t"
487                                      "vmovdqa64 %2,%%zmm13\n\t"
488                                      "vmovdqa64 %3,%%zmm15\n\t"
489                                      "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
490                                      "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
491                                      "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
492                                      "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
493                                      "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
494                                      "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
495                                      "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
496                                      "vpxorq %%zmm15,%%zmm14,%%zmm14"
497                                      :
498                                      : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
499                                        "m" (dptr[z][d+128]),
500                                        "m" (dptr[z][d+192]));
501                 }
502                 asm volatile("prefetchnta %0\n\t"
503                              "prefetchnta %1\n\t"
504                              :
505                              : "m" (q[d]), "m" (q[d+128]));
506                 /* P/Q left side optimization */
507                 for (z = start-1 ; z >= 0 ; z--) {
508                         asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
509                                      "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
510                                      "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
511                                      "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
512                                      "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
513                                      "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
514                                      "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
515                                      "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
516                                      "vpmovm2b %%k1,%%zmm5\n\t"
517                                      "vpmovm2b %%k2,%%zmm7\n\t"
518                                      "vpmovm2b %%k3,%%zmm13\n\t"
519                                      "vpmovm2b %%k4,%%zmm15\n\t"
520                                      "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
521                                      "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
522                                      "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
523                                      "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
524                                      "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
525                                      "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
526                                      "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
527                                      "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
528                                      "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
529                                      "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
530                                      "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
531                                      "vpxorq %%zmm15,%%zmm14,%%zmm14"
532                                      :
533                                      : );
534                 }
535                 asm volatile("vmovntdq %%zmm2,%0\n\t"
536                              "vmovntdq %%zmm3,%1\n\t"
537                              "vmovntdq %%zmm10,%2\n\t"
538                              "vmovntdq %%zmm11,%3\n\t"
539                              "vpxorq %4,%%zmm4,%%zmm4\n\t"
540                              "vpxorq %5,%%zmm6,%%zmm6\n\t"
541                              "vpxorq %6,%%zmm12,%%zmm12\n\t"
542                              "vpxorq %7,%%zmm14,%%zmm14\n\t"
543                              "vmovntdq %%zmm4,%4\n\t"
544                              "vmovntdq %%zmm6,%5\n\t"
545                              "vmovntdq %%zmm12,%6\n\t"
546                              "vmovntdq %%zmm14,%7"
547                              :
548                              : "m" (p[d]),  "m" (p[d+64]), "m" (p[d+128]),
549                                "m" (p[d+192]), "m" (q[d]),  "m" (q[d+64]),
550                                "m" (q[d+128]), "m" (q[d+192]));
551         }
552         asm volatile("sfence" : : : "memory");
553         kernel_fpu_end();
554 }
/*
 * Method table for the unrolled-by-4 (256 bytes per iteration) AVX512
 * implementation. The initializer is positional; the field meanings are
 * those of struct raid6_calls (declared outside this file, presumably in
 * <linux/raid/pq.h>).
 */
const struct raid6_calls raid6_avx512x4 = {
        raid6_avx5124_gen_syndrome,     /* full P/Q generation routine */
        raid6_avx5124_xor_syndrome,     /* partial P/Q update routine */
        raid6_have_avx512,              /* CPU feature check */
        "avx512x4",                     /* name of this implementation */
        1                       /* Has cache hints */
};
562 #endif
563
564 #endif /* CONFIG_AS_AVX512 */