arch/x86/include/asm/xor_avx.h

   1 /* SPDX-License-Identifier: GPL-2.0-only */
   2 #ifndef _ASM_X86_XOR_AVX_H
   3 #define _ASM_X86_XOR_AVX_H
   4
   5 /*
   6  * Optimized RAID-5 checksumming functions for AVX
   7  *
   8  * Copyright (C) 2012 Intel Corporation
   9  * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
  10  *
  11  * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
  12  */
  13
  14 #include <linux/compiler.h>
  15 #include <asm/fpu/api.h>
  16
  17 #define BLOCK4(i) \
  18                 BLOCK(32 * i, 0) \
  19                 BLOCK(32 * (i + 1), 1) \
  20                 BLOCK(32 * (i + 2), 2) \
  21                 BLOCK(32 * (i + 3), 3)
  22
  23 #define BLOCK16() \
  24                 BLOCK4(0) \
  25                 BLOCK4(4) \
  26                 BLOCK4(8) \
  27                 BLOCK4(12)
  28
  29 static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
  30 {
  31         unsigned long lines = bytes >> 9;
  32
  33         kernel_fpu_begin();
  34
  35         while (lines--) {
  36 #undef BLOCK
  37 #define BLOCK(i, reg) \
  38 do { \
  39         asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
  40         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm"  #reg : : \
  41                 "m" (p0[i / sizeof(*p0)])); \
  42         asm volatile("vmovdqa %%ymm" #reg ", %0" : \
  43                 "=m" (p0[i / sizeof(*p0)])); \
  44 } while (0);
  45
  46                 BLOCK16()
  47
  48                 p0 = (unsigned long *)((uintptr_t)p0 + 512);
  49                 p1 = (unsigned long *)((uintptr_t)p1 + 512);
  50         }
  51
  52         kernel_fpu_end();
  53 }
  54
  55 static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
  56         unsigned long *p2)
  57 {
  58         unsigned long lines = bytes >> 9;
  59
  60         kernel_fpu_begin();
  61
  62         while (lines--) {
  63 #undef BLOCK
  64 #define BLOCK(i, reg) \
  65 do { \
  66         asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
  67         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
  68                 "m" (p1[i / sizeof(*p1)])); \
  69         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
  70                 "m" (p0[i / sizeof(*p0)])); \
  71         asm volatile("vmovdqa %%ymm" #reg ", %0" : \
  72                 "=m" (p0[i / sizeof(*p0)])); \
  73 } while (0);
  74
  75                 BLOCK16()
  76
  77                 p0 = (unsigned long *)((uintptr_t)p0 + 512);
  78                 p1 = (unsigned long *)((uintptr_t)p1 + 512);
  79                 p2 = (unsigned long *)((uintptr_t)p2 + 512);
  80         }
  81
  82         kernel_fpu_end();
  83 }
  84
  85 static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
  86         unsigned long *p2, unsigned long *p3)
  87 {
  88         unsigned long lines = bytes >> 9;
  89
  90         kernel_fpu_begin();
  91
  92         while (lines--) {
  93 #undef BLOCK
  94 #define BLOCK(i, reg) \
  95 do { \
  96         asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
  97         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
  98                 "m" (p2[i / sizeof(*p2)])); \
  99         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 100                 "m" (p1[i / sizeof(*p1)])); \
 101         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 102                 "m" (p0[i / sizeof(*p0)])); \
 103         asm volatile("vmovdqa %%ymm" #reg ", %0" : \
 104                 "=m" (p0[i / sizeof(*p0)])); \
 105 } while (0);
 106
 107                 BLOCK16();
 108
 109                 p0 = (unsigned long *)((uintptr_t)p0 + 512);
 110                 p1 = (unsigned long *)((uintptr_t)p1 + 512);
 111                 p2 = (unsigned long *)((uintptr_t)p2 + 512);
 112                 p3 = (unsigned long *)((uintptr_t)p3 + 512);
 113         }
 114
 115         kernel_fpu_end();
 116 }
 117
 118 static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
 119         unsigned long *p2, unsigned long *p3, unsigned long *p4)
 120 {
 121         unsigned long lines = bytes >> 9;
 122
 123         kernel_fpu_begin();
 124
 125         while (lines--) {
 126 #undef BLOCK
 127 #define BLOCK(i, reg) \
 128 do { \
 129         asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
 130         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 131                 "m" (p3[i / sizeof(*p3)])); \
 132         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 133                 "m" (p2[i / sizeof(*p2)])); \
 134         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 135                 "m" (p1[i / sizeof(*p1)])); \
 136         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 137                 "m" (p0[i / sizeof(*p0)])); \
 138         asm volatile("vmovdqa %%ymm" #reg ", %0" : \
 139                 "=m" (p0[i / sizeof(*p0)])); \
 140 } while (0);
 141
 142                 BLOCK16()
 143
 144                 p0 = (unsigned long *)((uintptr_t)p0 + 512);
 145                 p1 = (unsigned long *)((uintptr_t)p1 + 512);
 146                 p2 = (unsigned long *)((uintptr_t)p2 + 512);
 147                 p3 = (unsigned long *)((uintptr_t)p3 + 512);
 148                 p4 = (unsigned long *)((uintptr_t)p4 + 512);
 149         }
 150
 151         kernel_fpu_end();
 152 }
 153
 154 static struct xor_block_template xor_block_avx = {
 155         .name = "avx",
 156         .do_2 = xor_avx_2,
 157         .do_3 = xor_avx_3,
 158         .do_4 = xor_avx_4,
 159         .do_5 = xor_avx_5,
 160 };
 161
 162 #define AVX_XOR_SPEED \
 163 do { \
 164         if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
 165                 xor_speed(&xor_block_avx); \
 166 } while (0)
 167
 168 #define AVX_SELECT(FASTEST) \
 169         (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)
 170
 171 #endif