/* GNU Linux-libre 4.19.286-gnu1
 * [releases.git] / arch / powerpc / lib / memcpy_64.S
 */
1 /*
2  * Copyright (C) 2002 Paul Mackerras, IBM Corp.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version
7  * 2 of the License, or (at your option) any later version.
8  */
9 #include <asm/processor.h>
10 #include <asm/ppc_asm.h>
11 #include <asm/export.h>
12 #include <asm/asm-compat.h>
13 #include <asm/feature-fixups.h>
14
15 #ifndef SELFTEST_CASE
16 /* For big-endian, 0 == most CPUs, 1 == POWER6, 2 == Cell */
17 #define SELFTEST_CASE   0
18 #endif
19
20         .align  7
/*
 * void *memcpy(void *dest, const void *src, size_t n)
 *
 * ELFv1/ELFv2 PPC64 ABI: r3 = dest, r4 = src, r5 = n; returns dest in r3.
 * The big-endian path saves r3 below the stack pointer up front and
 * reloads it before every blr; the little-endian stub never clobbers r3.
 *
 * On CPUs with CPU_FTR_VMX_COPY the FTR_SECTION_ELSE alternative below is
 * patched in at boot and the whole routine becomes a branch to
 * memcpy_power7.
 */
21 _GLOBAL_TOC(memcpy)
22 BEGIN_FTR_SECTION
23 #ifdef __LITTLE_ENDIAN__
24         cmpdi   cr7,r5,0                /* remember n == 0 for the beqlr below */
25 #else
26         std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* save destination pointer for return value */
27 #endif
28 FTR_SECTION_ELSE
29 #ifdef CONFIG_PPC_BOOK3S_64
30         b       memcpy_power7
31 #endif
32 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
33 #ifdef __LITTLE_ENDIAN__
34         /* dumb little-endian memcpy that will get replaced at runtime */
35         addi r9,r3,-1                   /* copy via r9 so r3 (return value) survives */
36         addi r4,r4,-1                   /* pre-bias src for lbzu's pre-increment */
37         beqlr cr7                       /* n == 0: nothing to copy */
38         mtctr r5                        /* CTR = byte count */
39 1:      lbzu r10,1(r4)
40         stbu r10,1(r9)
41         bdnz 1b
42         blr
43 #else
        /*
         * Big-endian path.  PPC_MTOCRF(0x01,r5) loads cr7 from the low
         * nibble of the length, so the tail code can test individual
         * length bits:  cr7*4+0 = bit 8, +1 = bit 4, +2 = bit 2, +3 = bit 1
         * (matching the std/lwz/lhz/lbz tail stores below).
         */
44         PPC_MTOCRF(0x01,r5)
45         cmpldi  cr1,r5,16               /* cr1: is n < 16? (also used after .Ldst_aligned) */
46         neg     r6,r3           # LS 3 bits = # bytes to 8-byte dest bdry
47         andi.   r6,r6,7
48         dcbt    0,r4                    /* prefetch the first source line */
49         blt     cr1,.Lshort_copy
50 /* Below we want to nop out the bne if we're on a CPU that has the
51    CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
52    cleared.
53    At the time of writing the only CPU that has this combination of bits
54    set is Power6. */
55 test_feature = (SELFTEST_CASE == 1)
56 BEGIN_FTR_SECTION
57         nop
58 FTR_SECTION_ELSE
59         bne     .Ldst_unaligned
60 ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
61                     CPU_FTR_UNALIGNED_LD_STD)
        /*
         * Destination is 8-byte aligned here (or the CPU does unaligned
         * std efficiently).  Main loop moves 16 bytes per iteration.
         */
62 .Ldst_aligned:
63         addi    r3,r3,-16               /* pre-bias dest for the 8(r3)/16(r3) stores below */
64 test_feature = (SELFTEST_CASE == 0)
65 BEGIN_FTR_SECTION
66         andi.   r0,r4,7                 /* r0 = src & 7; only matters without UNALIGNED_LD_STD */
67         bne     .Lsrc_unaligned
68 END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
69         srdi    r7,r5,4                 /* r7 = number of 16-byte chunks */
70         ld      r9,0(r4)
71         addi    r4,r4,-8                /* pre-bias src for the 8(r4)/16(r4) loads */
72         mtctr   r7
73         andi.   r5,r5,7                 /* r5 = tail byte count (0-7); sets cr0 for beq 3f below */
74         bf      cr7*4+0,2f              /* length bit 8 clear: start at the even chunk */
75         addi    r3,r3,8
76         addi    r4,r4,8
77         mr      r8,r9
78         blt     cr1,3f                  /* n < 16: only the single doubleword loaded above */
        /* Software-pipelined: each iteration stores the pair loaded previously. */
79 1:      ld      r9,8(r4)
80         std     r8,8(r3)
81 2:      ldu     r8,16(r4)
82         stdu    r9,16(r3)
83         bdnz    1b
84 3:      std     r8,8(r3)                /* drain the last in-flight doubleword */
85         beq     3f                      /* cr0 from andi. above: no tail bytes -> return */
86         addi    r3,r3,16                /* step dest past the stored data; tail uses 0(r3) */
        /* Copy the final 1-7 bytes: 4, then 2, then 1, as flagged in cr7. */
87 .Ldo_tail:
88         bf      cr7*4+1,1f
89         lwz     r9,8(r4)
90         addi    r4,r4,4
91         stw     r9,0(r3)
92         addi    r3,r3,4
93 1:      bf      cr7*4+2,2f
94         lhz     r9,8(r4)
95         addi    r4,r4,2
96         sth     r9,0(r3)
97         addi    r3,r3,2
98 2:      bf      cr7*4+3,3f
99         lbz     r9,8(r4)
100         stb     r9,0(r3)
101 3:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
102         blr
103
        /*
         * Source not doubleword aligned (r0 = src & 7, dest already is).
         * Round src down to an 8-byte boundary, then build each output
         * doubleword from two aligned loads with shift-and-or:
         *   r10 = 8*(src&7)  = left-shift count
         *   r11 = 64 - r10   = right-shift count
         */
104 .Lsrc_unaligned:
105         srdi    r6,r5,3                 /* r6 = whole doublewords in n */
106         addi    r5,r5,-16
107         subf    r4,r0,r4                /* align src down: r4 -= src & 7 */
108         srdi    r7,r5,4                 /* r7 = main loop iterations */
109         sldi    r10,r0,3                /* misalignment in bits */
110         cmpdi   cr6,r6,3
111         andi.   r5,r5,7                 /* sets cr0, consumed by beq 4f after the loop */
112         mtctr   r7
113         subfic  r11,r10,64
114         add     r5,r5,r0                /* r5 = tail byte count incl. the realignment */
115
116         bt      cr7*4+0,0f              /* odd doubleword count? use the 4+2n-load prelude */
117
118         ld      r9,0(r4)        # 3+2n loads, 2+2n stores
119         ld      r0,8(r4)
120         sld     r6,r9,r10
121         ldu     r9,16(r4)
122         srd     r7,r0,r11
123         sld     r8,r0,r10
124         or      r7,r7,r6                /* first merged output doubleword */
125         blt     cr6,4f
126         ld      r0,8(r4)
127         # s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
128         b       2f
129
130 0:      ld      r0,0(r4)        # 4+2n loads, 3+2n stores
131         ldu     r9,8(r4)
132         sld     r8,r0,r10
133         addi    r3,r3,-8                /* odd prelude shifts the store schedule back one dword */
134         blt     cr6,5f
135         ld      r0,8(r4)
136         srd     r12,r9,r11
137         sld     r6,r9,r10
138         ldu     r9,16(r4)
139         or      r12,r8,r12
140         srd     r7,r0,r11
141         sld     r8,r0,r10
142         addi    r3,r3,16
143         beq     cr6,3f
144
145         # d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
        /* Steady state: two merged doublewords stored per iteration. */
146 1:      or      r7,r7,r6
147         ld      r0,8(r4)
148         std     r12,8(r3)
149 2:      srd     r12,r9,r11
150         sld     r6,r9,r10
151         ldu     r9,16(r4)
152         or      r12,r8,r12
153         stdu    r7,16(r3)
154         srd     r7,r0,r11
155         sld     r8,r0,r10
156         bdnz    1b
157
        /* Drain the pipeline: store the doublewords still in flight. */
158 3:      std     r12,8(r3)
159         or      r7,r7,r6
160 4:      std     r7,16(r3)
161 5:      srd     r12,r9,r11
162         or      r12,r8,r12
163         std     r12,24(r3)
164         beq     4f                      /* cr0 from the andi. r5,r5,7 above: no tail -> return */
165         cmpwi   cr1,r5,8                /* does the tail span a second source doubleword? */
166         addi    r3,r3,32
167         sld     r9,r9,r10               /* left-justify the remaining source bytes in r9 */
168         ble     cr1,6f
169         ld      r0,8(r4)
170         srd     r7,r0,r11
171         or      r9,r7,r9                /* fold in bytes from the next source doubleword */
        /*
         * Tail stores: r9 holds the remaining bytes left-justified
         * (big-endian), so rotate each piece down before storing it.
         */
172 6:
173         bf      cr7*4+1,1f
174         rotldi  r9,r9,32                /* top word -> low word for stw */
175         stw     r9,0(r3)
176         addi    r3,r3,4
177 1:      bf      cr7*4+2,2f
178         rotldi  r9,r9,16
179         sth     r9,0(r3)
180         addi    r3,r3,2
181 2:      bf      cr7*4+3,3f
182         rotldi  r9,r9,8
183         stb     r9,0(r3)
184 3:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
185         blr
186
        /*
         * Copy the 1-7 bytes needed to bring dest up to an 8-byte
         * boundary (count in r6), then rejoin the aligned path.
         */
187 .Ldst_unaligned:
188         PPC_MTOCRF(0x01,r6)             # put #bytes to 8B bdry into cr7
189         subf    r5,r6,r5                /* shrink remaining length by the alignment bytes */
190         li      r7,0                    /* r7 = running byte offset into src/dest */
191         cmpldi  cr1,r5,16               /* refresh cr1 for the blt cr1 tests after .Ldst_aligned */
192         bf      cr7*4+3,1f              /* 1 byte? */
193         lbz     r0,0(r4)
194         stb     r0,0(r3)
195         addi    r7,r7,1
196 1:      bf      cr7*4+2,2f              /* 2 bytes? */
197         lhzx    r0,r7,r4
198         sthx    r0,r7,r3
199         addi    r7,r7,2
200 2:      bf      cr7*4+1,3f              /* 4 bytes? */
201         lwzx    r0,r7,r4
202         stwx    r0,r7,r3
203 3:      PPC_MTOCRF(0x01,r5)             /* reload cr7 with the low bits of the new length */
204         add     r4,r6,r4
205         add     r3,r6,r3
206         b       .Ldst_aligned
207
        /* n < 16: copy 8/4/2/1 bytes as flagged by the length bits in cr7. */
208 .Lshort_copy:
209         bf      cr7*4+0,1f
210         lwz     r0,0(r4)        /* 8 bytes as two word ops: src/dest may be unaligned here */
211         lwz     r9,4(r4)
212         addi    r4,r4,8
213         stw     r0,0(r3)
214         stw     r9,4(r3)
215         addi    r3,r3,8
216 1:      bf      cr7*4+1,2f
217         lwz     r0,0(r4)
218         addi    r4,r4,4
219         stw     r0,0(r3)
220         addi    r3,r3,4
221 2:      bf      cr7*4+2,3f
222         lhz     r0,0(r4)
223         addi    r4,r4,2
224         sth     r0,0(r3)
225         addi    r3,r3,2
226 3:      bf      cr7*4+3,4f
227         lbz     r0,0(r4)
228         stb     r0,0(r3)
229 4:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
230         blr
231 #endif
232 EXPORT_SYMBOL(memcpy)