26ea02b7311fd0c1560825d4584027af63948c95
[linux-2.6-microblaze.git] / arch / powerpc / lib / memcpy_64.S
1 /*
2  * Copyright (C) 2002 Paul Mackerras, IBM Corp.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version
7  * 2 of the License, or (at your option) any later version.
8  */
9 #include <asm/processor.h>
10 #include <asm/ppc_asm.h>
11 #include <asm/export.h>
12 #include <asm/asm-compat.h>
13
/*
 * memcpy: copy r5 bytes from the source at r4 to the destination at r3.
 *
 * Register use, as read from the code below:
 *   r3  destination pointer; on return it holds the original destination
 *       (the little-endian loop never writes r3, the big-endian code saves
 *       it on entry and reloads it before every blr)
 *   r4  source pointer
 *   r5  byte count
 *
 * __LITTLE_ENDIAN__ selects one of two bodies:
 *   - little-endian: a plain byte-at-a-time loop;
 *   - big-endian:    a doubleword-oriented copy with separate paths for
 *                    short copies (count < 16), a destination that is not
 *                    8-byte aligned, and a source that is not 8-byte aligned.
 *
 * Registers written by the big-endian body: r0, r3 (restored), r4-r12, CTR,
 * cr0, cr1, cr6, cr7.  The little-endian body writes r4, r9, r10, CTR, cr7.
 * r1 is never modified; the big-endian body stores r3 at displacement
 * -STACKFRAMESIZE+STK_REG(R31) from r1 without allocating a frame.
 *
 * NOTE(review): _GLOBAL_TOC, the *FTR_SECTION* macros, PPC_MTOCRF,
 * STACKFRAMESIZE, STK_REG, EXPORT_SYMBOL and memcpy_power7 are defined
 * outside this file.  Statements about them below are based on their names
 * and on the comments already present here; confirm against the definitions.
 */
14         .align  7
15 _GLOBAL_TOC(memcpy)
/*
 * The first instruction is one side of a feature-section alternative.  Going
 * by the macro name, the instruction before FTR_SECTION_ELSE is the one used
 * when CPU_FTR_VMX_COPY is clear, and the branch to memcpy_power7 is used
 * otherwise (TODO confirm against the macro definitions).
 */
16 BEGIN_FTR_SECTION
17 #ifdef __LITTLE_ENDIAN__
18         cmpdi   cr7,r5,0        /* cr7 = count vs 0; tested by the beqlr below */
19 #else
20         std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* save destination pointer for return value */
21 #endif
22 FTR_SECTION_ELSE
23 #ifdef CONFIG_PPC_BOOK3S_64
24 #ifndef SELFTEST
25         b       memcpy_power7
26 #endif
27 #endif
28 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
29 #ifdef __LITTLE_ENDIAN__
30         /* dumb little-endian memcpy that will get replaced at runtime */
31         addi r9,r3,-1           /* r9 = dest - 1: write cursor, r3 stays intact */
32         addi r4,r4,-1           /* r4 = src - 1: pre-bias for the update-form load */
33         beqlr cr7               /* count == 0: return at once */
34         mtctr r5                /* CTR = byte count */
35 1:      lbzu r10,1(r4)          /* r4 += 1 and load the byte at the new r4 */
36         stbu r10,1(r9)          /* r9 += 1 and store the byte at the new r9 */
37         bdnz 1b                 /* decrement CTR, loop while it is non-zero */
38         blr                     /* r3 still holds the original destination */
39 #else
40         PPC_MTOCRF(0x01,r5)     /* cr7 = low 4 bits of count: bits 0-3 flag 8/4/2/1 bytes */
41         cmpldi  cr1,r5,16       /* cr1 = count vs 16 (unsigned) */
42         neg     r6,r3           # LS 3 bits = # bytes to 8-byte dest bdry
43         andi.   r6,r6,7         /* r6 = (-dest) & 7; cr0.eq set when dest is 8-byte aligned */
44         dcbt    0,r4            /* cache touch hint on the source address */
45         blt     cr1,.Lshort_copy        /* count < 16 */
46 /* Below we want to nop out the bne if we're on a CPU that has the
47    CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
48    cleared.
49    At the time of writing the only CPU that has this combination of bits
50    set is Power6. */
51 BEGIN_FTR_SECTION
52         nop
53 FTR_SECTION_ELSE
54         bne     .Ldst_unaligned /* cr0 from the andi. above: dest not 8-byte aligned */
55 ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
56                     CPU_FTR_UNALIGNED_LD_STD)
/*
 * Reached by fall-through or from .Ldst_unaligned.  In both cases cr7 holds
 * the low 4 bits of the count still to copy (in r5) and cr1 the comparison
 * of that count with 16.
 */
57 .Ldst_aligned:
58         addi    r3,r3,-16       /* bias dest by -16 to suit the 8(r3) / stdu 16(r3) stores */
/*
 * Source alignment test.  Going by the macro name this pair is kept only
 * when CPU_FTR_UNALIGNED_LD_STD is clear (TODO confirm).
 */
59 BEGIN_FTR_SECTION
60         andi.   r0,r4,7         /* r0 = src & 7 */
61         bne     .Lsrc_unaligned
62 END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
63         srdi    r7,r5,4         /* r7 = count / 16 = iterations of the 16-byte loop */
64         ld      r9,0(r4)        /* preload the first source doubleword */
65         addi    r4,r4,-8        /* bias src by -8 to suit ld 8(r4) / ldu 16(r4) */
66         mtctr   r7
67         andi.   r5,r5,7         /* r5 = count & 7 (tail bytes); cr0 is tested by the beq after the loop */
68         bf      cr7*4+0,2f      /* (count & 8) == 0: even number of doublewords, enter the loop at 2 */
69         addi    r3,r3,8         /* odd doubleword: move both biases up by 8 ... */
70         addi    r4,r4,8
71         mr      r8,r9           /* ... and make the preloaded doubleword the pending store */
72         blt     cr1,3f          /* count < 16: that doubleword is the only one */
/*
 * Main loop: 16 bytes per iteration, each load issued before the store of
 * the doubleword loaded previously.
 */
73 1:      ld      r9,8(r4)
74         std     r8,8(r3)
75 2:      ldu     r8,16(r4)
76         stdu    r9,16(r3)
77         bdnz    1b
78 3:      std     r8,8(r3)        /* store the last pending doubleword */
79         beq     3f              /* cr0 from "andi. r5,r5,7": no tail bytes, go return */
80         addi    r3,r3,16        /* drop the dest bias: r3 = next byte to write */
/*
 * Copy the last 1-7 bytes as word / halfword / byte pieces selected by cr7
 * bits 1-3.  r4 still carries the -8 bias, hence the 8(r4) displacements.
 */
81 .Ldo_tail:
82         bf      cr7*4+1,1f      /* count & 4 */
83         lwz     r9,8(r4)
84         addi    r4,r4,4
85         stw     r9,0(r3)
86         addi    r3,r3,4
87 1:      bf      cr7*4+2,2f      /* count & 2 */
88         lhz     r9,8(r4)
89         addi    r4,r4,2
90         sth     r9,0(r3)
91         addi    r3,r3,2
92 2:      bf      cr7*4+3,3f      /* count & 1 */
93         lbz     r9,8(r4)
94         stb     r9,0(r3)
95 3:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
96         blr
97
/*
 * Source is not 8-byte aligned (r0 = src & 7, non-zero); r3 is the
 * destination biased by -16.  The source is read as aligned doublewords
 * starting at src rounded down to 8 bytes, and each destination doubleword
 * is assembled from two neighbouring source doublewords as
 * (s[i] << r10) | (s[i+1] >> r11).  In the register notes below, sN is the
 * N-th aligned source doubleword, "sN<<" / "sN>>" its shifted forms and dN
 * the N-th destination doubleword.
 */
98 .Lsrc_unaligned:
99         srdi    r6,r5,3         /* r6 = count / 8 */
100         addi    r5,r5,-16
101         subf    r4,r0,r4        /* r4 = src rounded down to an 8-byte boundary */
102         srdi    r7,r5,4         /* r7 = (count - 16) / 16, used as the loop count */
103         sldi    r10,r0,3        /* r10 = 8 * (src & 7): left shift in bits */
104         cmpdi   cr6,r6,3        /* cr6 = (count / 8) vs 3, selects the short exits below */
105         andi.   r5,r5,7         /* r5 = (count - 16) & 7, i.e. count & 7; cr0 is tested by "beq 4f" below */
106         mtctr   r7
107         subfic  r11,r10,64      /* r11 = 64 - r10: right shift in bits */
108         add     r5,r5,r0        /* r5 = tail bytes + (src & 7), compared with 8 below */
109
110         bt      cr7*4+0,0f      /* (count & 8) set: odd number of doublewords, use the path at 0 */
111
112         ld      r9,0(r4)        # 3+2n loads, 2+2n stores
113         ld      r0,8(r4)
114         sld     r6,r9,r10
115         ldu     r9,16(r4)
116         srd     r7,r0,r11
117         sld     r8,r0,r10
118         or      r7,r7,r6
119         blt     cr6,4f          /* count / 8 < 3: skip the loop and store what is pending */
120         ld      r0,8(r4)
121         # s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
122         b       2f
123
124 0:      ld      r0,0(r4)        # 4+2n loads, 3+2n stores
125         ldu     r9,8(r4)
126         sld     r8,r0,r10
127         addi    r3,r3,-8        /* dest bias is now -24, so 24(r3) under label 5 is the first doubleword */
128         blt     cr6,5f          /* count / 8 < 3: a single doubleword to store */
129         ld      r0,8(r4)
130         srd     r12,r9,r11
131         sld     r6,r9,r10
132         ldu     r9,16(r4)
133         or      r12,r8,r12
134         srd     r7,r0,r11
135         sld     r8,r0,r10
136         addi    r3,r3,16        /* dest bias is now -8, so 8(r3) is the first doubleword */
137         beq     cr6,3f          /* count / 8 == 3: skip the loop */
138
139         # d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
140 1:      or      r7,r7,r6
141         ld      r0,8(r4)
142         std     r12,8(r3)
143 2:      srd     r12,r9,r11
144         sld     r6,r9,r10
145         ldu     r9,16(r4)
146         or      r12,r8,r12
147         stdu    r7,16(r3)
148         srd     r7,r0,r11
149         sld     r8,r0,r10
150         bdnz    1b
151
/* Drain the doublewords still pending after (or instead of) the loop. */
152 3:      std     r12,8(r3)
153         or      r7,r7,r6
154 4:      std     r7,16(r3)
155 5:      srd     r12,r9,r11
156         or      r12,r8,r12
157         std     r12,24(r3)      /* last full destination doubleword */
158         beq     4f              /* cr0 from "andi. r5,r5,7": no tail bytes; the next "4:" is the return sequence ending .Lshort_copy */
159         cmpwi   cr1,r5,8        /* cr1 = (tail bytes + (src & 7)) vs 8 */
160         addi    r3,r3,32        /* r3 = next destination byte to write */
161         sld     r9,r9,r10       /* left-justify the unused bytes of the last doubleword loaded */
162         ble     cr1,6f          /* <= 8: no further source doubleword is loaded */
163         ld      r0,8(r4)        /* otherwise load one more source doubleword ... */
164         srd     r7,r0,r11
165         or      r9,r7,r9        /* ... and merge its leading bytes into r9 */
/*
 * r9 holds the tail bytes left-justified.  Each rotate brings the next
 * group of bytes into the low-order end, which is what stw/sth/stb store.
 */
166 6:
167         bf      cr7*4+1,1f      /* count & 4 */
168         rotldi  r9,r9,32
169         stw     r9,0(r3)
170         addi    r3,r3,4
171 1:      bf      cr7*4+2,2f      /* count & 2 */
172         rotldi  r9,r9,16
173         sth     r9,0(r3)
174         addi    r3,r3,2
175 2:      bf      cr7*4+3,3f      /* count & 1 */
176         rotldi  r9,r9,8
177         stb     r9,0(r3)
178 3:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
179         blr
180
/*
 * Destination is not 8-byte aligned: copy r6 (1-7) bytes as byte / halfword /
 * word pieces to reach an 8-byte boundary, then rejoin .Ldst_aligned.
 * r7 is the running offset applied to both r4 and r3.
 */
181 .Ldst_unaligned:
182         PPC_MTOCRF(0x01,r6)             # put #bytes to 8B bdry into cr7
183         subf    r5,r6,r5        /* count -= bytes copied here */
184         li      r7,0
185         cmpldi  cr1,r5,16       /* refresh cr1 for the reduced count */
186         bf      cr7*4+3,1f      /* r6 & 1 */
187         lbz     r0,0(r4)
188         stb     r0,0(r3)
189         addi    r7,r7,1
190 1:      bf      cr7*4+2,2f      /* r6 & 2 */
191         lhzx    r0,r7,r4
192         sthx    r0,r7,r3
193         addi    r7,r7,2
194 2:      bf      cr7*4+1,3f      /* r6 & 4 */
195         lwzx    r0,r7,r4
196         stwx    r0,r7,r3
197 3:      PPC_MTOCRF(0x01,r5)     /* cr7 = low 4 bits of the remaining count */
198         add     r4,r6,r4        /* advance src and dest past the bytes just copied */
199         add     r3,r6,r3
200         b       .Ldst_aligned
201
/*
 * count < 16: copy 8 (as two words), 4, 2 and 1 bytes as selected by cr7
 * bits 0-3, which hold the low 4 bits of count.
 */
202 .Lshort_copy:
203         bf      cr7*4+0,1f      /* count & 8 */
204         lwz     r0,0(r4)
205         lwz     r9,4(r4)
206         addi    r4,r4,8
207         stw     r0,0(r3)
208         stw     r9,4(r3)
209         addi    r3,r3,8
210 1:      bf      cr7*4+1,2f      /* count & 4 */
211         lwz     r0,0(r4)
212         addi    r4,r4,4
213         stw     r0,0(r3)
214         addi    r3,r3,4
215 2:      bf      cr7*4+2,3f      /* count & 2 */
216         lhz     r0,0(r4)
217         addi    r4,r4,2
218         sth     r0,0(r3)
219         addi    r3,r3,2
220 3:      bf      cr7*4+3,4f      /* count & 1 */
221         lbz     r0,0(r4)
222         stb     r0,0(r3)
223 4:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
224         blr
225 #endif
226 EXPORT_SYMBOL(memcpy)