1 /* U3memcpy.S: UltraSparc-III optimized memcpy.
3 * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
 */
7 #include <asm/visasm.h>
9 #define GLOBAL_SPARE %g7
11 #define ASI_BLK_P 0xf0
/* NOTE(review): VISEntryHalf/VISExitHalf and GLOBAL_SPARE are each defined
 * twice below with conflicting bodies.  Such pairs are normally selected by
 * preprocessor conditionals (kernel vs. non-kernel build of this file) --
 * confirm the #ifdef/#else/#endif guards were not lost from this copy.
 */
14 #define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
15 clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
16 #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
18 #define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
19 #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
21 #define GLOBAL_SPARE %g5
/* EX_RETVAL(x): identity wrapper around the return value here; presumably
 * the exception-table build of this file redefines it -- confirm. */
39 #define EX_RETVAL(x) x
/* LOAD/STORE expand to a plain SPARC load/store of the given opcode type. */
43 #define LOAD(type,addr,dest) type [addr], dest
47 #define STORE(type,src,addr) type src, [addr]
/* 64-byte block store (stda) through the ASI_BLK_P address-space id. */
51 #define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_P
55 #define FUNC_NAME U3memcpy
/* SPARC V9: declare application registers %g2/%g3 as scratch so the
 * assembler accepts their use without warnings. */
66 .register %g2,#scratch
67 .register %g3,#scratch
69 /* Special/non-trivial issues of this code:
71 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
72 * 2) Only low 32 FPU registers are used so that only the
73 * lower half of the FPU register set is dirtied by this
74 * code. This is especially important in the kernel.
75 * 3) This code never prefetches cachelines past the end
76 * of the source buffer.
 */
82 /* The cheetah's flexible spine, oversized liver, enlarged heart,
83 * slender muscular body, and claws make it the swiftest hunter
84 * in Africa and the fastest animal on land. Can reach speeds
85 * of up to 2.4GB per second.
 */
/*
 * U3memcpy: copy %o2 bytes from %o1 (src) to %o0 (dst), returning dst.
 *
 * Bulk copies go through VIS: alignaddr/faligndata to handle arbitrary
 * source alignment, and 64-byte block stores (STORE_BLK / ASI_BLK_P) for
 * the cacheline-sized middle.  Short or nearly-aligned copies fall through
 * to plain integer load/store paths (labels 70:/80: below).
 *
 * NOTE(review): this excerpt is missing many branch/delay-slot/label lines
 * (the embedded original line numbers are discontinuous); the comments
 * below describe only the instructions that are visible here.
 */
89 .type FUNC_NAME,#function
90 FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
107 /* Clobbers o5/g1/g2/g3/g7/icc/xcc. We must preserve
108 * o5 from here until we hit VISExitHalf.
 */
112 /* Is 'dst' already aligned on an 64-byte boundary? */
116 /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number
117 * of bytes to copy to make 'dst' 64-byte aligned. We pre-
118 * subtract this from 'len'.
 */
/* GLOBAL_SPARE = dst - src, so stores below can address dst as
 * (src + GLOBAL_SPARE) while only %o1 is advanced. */
120 sub %o0, %o1, GLOBAL_SPARE
/* Byte-at-a-time loop (count in %g1) bringing dst up to 64-byte alignment. */
128 1: subcc %g1, 0x1, %g1
129 EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3))
130 EX_ST_FP(STORE(stb, %o3, %o1 + GLOBAL_SPARE))
/* Recompute the dst pointer from the advanced src pointer. */
134 add %o1, GLOBAL_SPARE, %o0
/* Program the GSR alignment for faligndata; rounds %o1 down to 8 bytes. */
139 alignaddr %o1, %g0, %o1
141 EX_LD_FP(LOAD(ldd, %o1, %f4))
/* 8-bytes-per-iteration loop, alternating %f4/%f6 as the "previous"
 * double so each faligndata merges two adjacent source doubles. */
142 1: EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6))
145 faligndata %f4, %f6, %f0
146 EX_ST_FP(STORE(std, %f0, %o0))
150 EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4))
153 faligndata %f6, %f4, %f2
154 EX_ST_FP(STORE(std, %f2, %o0))
/* Main block path: prefetch well ahead, prime %f0-%f14 with the first
 * cacheline and pre-align it into %f16-%f26 before the unrolled loop. */
158 3: LOAD(prefetch, %o1 + 0x000, #one_read)
159 LOAD(prefetch, %o1 + 0x040, #one_read)
/* GLOBAL_SPARE = len rounded down to a multiple of 64. */
160 andn %o2, (0x40 - 1), GLOBAL_SPARE
161 LOAD(prefetch, %o1 + 0x080, #one_read)
162 LOAD(prefetch, %o1 + 0x0c0, #one_read)
163 LOAD(prefetch, %o1 + 0x100, #one_read)
164 EX_LD_FP(LOAD(ldd, %o1 + 0x000, %f0))
165 LOAD(prefetch, %o1 + 0x140, #one_read)
166 EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2))
167 LOAD(prefetch, %o1 + 0x180, #one_read)
168 EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4))
169 LOAD(prefetch, %o1 + 0x1c0, #one_read)
170 faligndata %f0, %f2, %f16
171 EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6))
172 faligndata %f2, %f4, %f18
173 EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8))
174 faligndata %f4, %f6, %f20
175 EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10))
176 faligndata %f6, %f8, %f22
178 EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12))
179 faligndata %f8, %f10, %f24
180 EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14))
181 faligndata %f10, %f12, %f26
182 EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0))
/* Take 0x80 (two 64-byte blocks) off the block count up front -- one block
 * is already in flight above and one is finished after the loop. */
184 subcc GLOBAL_SPARE, 0x80, GLOBAL_SPARE
/* %o3 = number of 64-byte blocks for the steady-state loop. */
187 srl GLOBAL_SPARE, 6, %o3
/* Steady-state unrolled loop: load the next cacheline into %f0-%f14,
 * faligndata it into %f16-%f30, and block-store the previous 64 bytes. */
193 EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2))
194 faligndata %f12, %f14, %f28
195 EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4))
196 faligndata %f14, %f0, %f30
197 EX_ST_FP(STORE_BLK(%f16, %o0))
198 EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6))
199 faligndata %f0, %f2, %f16
202 EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8))
203 faligndata %f2, %f4, %f18
204 EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10))
205 faligndata %f4, %f6, %f20
206 EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12))
208 faligndata %f6, %f8, %f22
209 EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14))
211 faligndata %f8, %f10, %f24
212 EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0))
213 LOAD(prefetch, %o1 + 0x1c0, #one_read)
214 faligndata %f10, %f12, %f26
218 /* Finally we copy the last full 64-byte block. */
220 EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2))
221 faligndata %f12, %f14, %f28
222 EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4))
223 faligndata %f14, %f0, %f30
224 EX_ST_FP(STORE_BLK(%f16, %o0))
225 EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6))
226 faligndata %f0, %f2, %f16
227 EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8))
228 faligndata %f2, %f4, %f18
229 EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10))
230 faligndata %f4, %f6, %f20
231 EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12))
232 faligndata %f6, %f8, %f22
233 EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14))
234 faligndata %f8, %f10, %f24
238 EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0))
/* Align and block-store the final pending cacheline (%f16-%f30). */
239 1: faligndata %f10, %f12, %f26
240 faligndata %f12, %f14, %f28
241 faligndata %f14, %f0, %f30
242 EX_ST_FP(STORE_BLK(%f16, %o0))
247 /* Now we copy the (len modulo 64) bytes at the end.
248 * Note how we borrow the %f0 loaded above.
250 * Also notice how this code is careful not to perform a
251 * load past the end of the src buffer.
 */
262 EX_LD_FP(LOAD(ldd, %o1 + 0x00, %f0))
/* 8-byte tail loop; same alternating %f0/%f2 trick as the loop above. */
264 1: EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f2))
267 faligndata %f0, %f2, %f8
268 EX_ST_FP(STORE(std, %f8, %o0))
271 EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f0))
274 faligndata %f2, %f0, %f8
275 EX_ST_FP(STORE(std, %f8, %o0))
279 /* If anything is left, we copy it one byte at a time.
280 * Note that %g1 is (src & 0x3) saved above before the
281 * alignaddr was performed.
 */
/* Integer tail: %o3 presumably holds (dst - src) here so stores address
 * the destination as (src + %o3) -- the setup is not visible in this
 * excerpt, confirm against the full file.  Remaining length bits of %o2
 * are tested 8/4/2/1 in turn. */
295 EX_LD(LOAD(ldx, %o1, %o5))
296 EX_ST(STORE(stx, %o5, %o1 + %o3))
299 1: andcc %o2, 0x4, %g0
302 EX_LD(LOAD(lduw, %o1, %o5))
303 EX_ST(STORE(stw, %o5, %o1 + %o3))
306 1: andcc %o2, 0x2, %g0
309 EX_LD(LOAD(lduh, %o1, %o5))
310 EX_ST(STORE(sth, %o5, %o1 + %o3))
313 1: andcc %o2, 0x1, %g0
316 EX_LD(LOAD(ldub, %o1, %o5))
318 EX_ST(STORE(stb, %o5, %o1 + %o3))
321 70: /* 16 < len <= 64 */
/* Copy 16 bytes per iteration with two 8-byte integer loads/stores. */
326 andn %o2, 0xf, GLOBAL_SPARE
328 1: subcc GLOBAL_SPARE, 0x10, GLOBAL_SPARE
329 EX_LD(LOAD(ldx, %o1 + 0x00, %o5))
330 EX_LD(LOAD(ldx, %o1 + 0x08, %g1))
331 EX_ST(STORE(stx, %o5, %o1 + %o3))
333 EX_ST(STORE(stx, %g1, %o1 + %o3))
/* Then the 8-, 4-byte remainders, tested bit by bit in %o2. */
336 73: andcc %o2, 0x8, %g0
340 EX_LD(LOAD(ldx, %o1, %o5))
341 EX_ST(STORE(stx, %o5, %o1 + %o3))
343 1: andcc %o2, 0x4, %g0
347 EX_LD(LOAD(lduw, %o1, %o5))
348 EX_ST(STORE(stw, %o5, %o1 + %o3))
364 EX_LD(LOAD(ldub, %o1, %o5))
365 EX_ST(STORE(stb, %o5, %o1 + %o3))
/* Unaligned 8-byte path: merges consecutive doubles %g2/%g3 into %o5;
 * the shift/or instructions producing %o5 are not visible here. */
381 EX_LD(LOAD(ldx, %o1, %g2))
383 andn %o2, 0x7, GLOBAL_SPARE
385 1: EX_LD(LOAD(ldx, %o1 + 0x8, %g3))
386 subcc GLOBAL_SPARE, 0x8, GLOBAL_SPARE
390 EX_ST(STORE(stx, %o5, %o0))
403 80: /* 0 < len <= 16 */
/* Word-at-a-time loop for short mutually word-aligned copies. */
410 EX_LD(LOAD(lduw, %o1, %g1))
411 EX_ST(STORE(stw, %g1, %o1 + %o3))
/* Return value: %o4 presumably holds the original dst saved on entry
 * (the save instruction is not visible in this excerpt) -- confirm. */
416 mov EX_RETVAL(%o4), %o0
/* Final byte-at-a-time fallback loop, then the same return sequence. */
421 EX_LD(LOAD(ldub, %o1, %g1))
422 EX_ST(STORE(stb, %g1, %o1 + %o3))
426 mov EX_RETVAL(%o4), %o0
428 .size FUNC_NAME, .-FUNC_NAME