/*
 * arch/arm64/lib/xor-neon.c
 *
 * Authors: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
12 #include <linux/raid/xor.h>
13 #include <linux/module.h>
14 #include <asm/neon-intrinsics.h>
16 void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
19 uint64_t *dp1 = (uint64_t *)p1;
20 uint64_t *dp2 = (uint64_t *)p2;
22 register uint64x2_t v0, v1, v2, v3;
23 long lines = bytes / (sizeof(uint64x2_t) * 4);
27 v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
28 v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
29 v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
30 v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
33 vst1q_u64(dp1 + 0, v0);
34 vst1q_u64(dp1 + 2, v1);
35 vst1q_u64(dp1 + 4, v2);
36 vst1q_u64(dp1 + 6, v3);
40 } while (--lines > 0);
43 void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
44 unsigned long *p2, unsigned long *p3)
46 uint64_t *dp1 = (uint64_t *)p1;
47 uint64_t *dp2 = (uint64_t *)p2;
48 uint64_t *dp3 = (uint64_t *)p3;
50 register uint64x2_t v0, v1, v2, v3;
51 long lines = bytes / (sizeof(uint64x2_t) * 4);
55 v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
56 v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
57 v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
58 v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
61 v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
62 v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
63 v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
64 v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));
67 vst1q_u64(dp1 + 0, v0);
68 vst1q_u64(dp1 + 2, v1);
69 vst1q_u64(dp1 + 4, v2);
70 vst1q_u64(dp1 + 6, v3);
75 } while (--lines > 0);
78 void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
79 unsigned long *p2, unsigned long *p3, unsigned long *p4)
81 uint64_t *dp1 = (uint64_t *)p1;
82 uint64_t *dp2 = (uint64_t *)p2;
83 uint64_t *dp3 = (uint64_t *)p3;
84 uint64_t *dp4 = (uint64_t *)p4;
86 register uint64x2_t v0, v1, v2, v3;
87 long lines = bytes / (sizeof(uint64x2_t) * 4);
91 v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
92 v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
93 v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
94 v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
97 v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
98 v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
99 v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
100 v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));
103 v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
104 v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
105 v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
106 v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
109 vst1q_u64(dp1 + 0, v0);
110 vst1q_u64(dp1 + 2, v1);
111 vst1q_u64(dp1 + 4, v2);
112 vst1q_u64(dp1 + 6, v3);
118 } while (--lines > 0);
121 void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
122 unsigned long *p2, unsigned long *p3,
123 unsigned long *p4, unsigned long *p5)
125 uint64_t *dp1 = (uint64_t *)p1;
126 uint64_t *dp2 = (uint64_t *)p2;
127 uint64_t *dp3 = (uint64_t *)p3;
128 uint64_t *dp4 = (uint64_t *)p4;
129 uint64_t *dp5 = (uint64_t *)p5;
131 register uint64x2_t v0, v1, v2, v3;
132 long lines = bytes / (sizeof(uint64x2_t) * 4);
136 v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
137 v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
138 v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
139 v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
142 v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
143 v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
144 v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
145 v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));
148 v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
149 v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
150 v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
151 v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
154 v0 = veorq_u64(v0, vld1q_u64(dp5 + 0));
155 v1 = veorq_u64(v1, vld1q_u64(dp5 + 2));
156 v2 = veorq_u64(v2, vld1q_u64(dp5 + 4));
157 v3 = veorq_u64(v3, vld1q_u64(dp5 + 6));
160 vst1q_u64(dp1 + 0, v0);
161 vst1q_u64(dp1 + 2, v1);
162 vst1q_u64(dp1 + 4, v2);
163 vst1q_u64(dp1 + 6, v3);
170 } while (--lines > 0);
173 struct xor_block_template const xor_block_inner_neon = {
174 .name = "__inner_neon__",
175 .do_2 = xor_arm64_neon_2,
176 .do_3 = xor_arm64_neon_3,
177 .do_4 = xor_arm64_neon_4,
178 .do_5 = xor_arm64_neon_5,
180 EXPORT_SYMBOL(xor_block_inner_neon);
182 MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
183 MODULE_DESCRIPTION("ARMv8 XOR Extensions");
184 MODULE_LICENSE("GPL");