/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>
/*
 * Move a buffer from src to dest (alignment handled by the hardware).
 * If dest <= src, call memcpy, otherwise copy in reverse order.
 */
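/*
 * Rough C-level sketch of the dispatch below (illustrative only; the
 * real code works on registers and branches, not a C prototype):
 *
 *	if (dest < src || dest >= src + n)
 *		return memcpy(dest, src, n);	// forward copy is safe
 *	// otherwise dest overlaps the tail of src: copy backwards
 */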
	b.hs	__memcpy		/* No overlap. */
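	/*
	 * Overlapping case: the copy runs from the end of the buffers
	 * towards the start, which is why the loads and stores below use
	 * negative offsets, most of them pre-indexed so that src and dst
	 * walk downwards as data is moved.
	 */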
	b.lo	.Ltail15	/* probably unaligned accesses */
	ands	tmp2, src, #15	/* Bytes to reach alignment. */
	sub	count, count, tmp2
	/*
	 * Process the unaligned offset first so that src becomes aligned.
	 * The cost of these extra instructions is acceptable, and it means
	 * the accesses that follow all use aligned addresses.
	 */
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
	tbz	tmp2, #3, .LSrcAligned
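	/*
	 * The byte, halfword and word copies above peel off the low bits
	 * of tmp2 one power of two at a time; the tbz on bit 3 skips ahead
	 * to .LSrcAligned when no 8-byte step is needed, leaving src
	 * 16-byte aligned for the bulk copy.
	 */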
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 *
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
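	/*
	 * tmp1 = count & 0x30 is how much whole-16-byte work is still
	 * pending here: 0x00, 0x10, 0x20 or 0x30, i.e. zero to three of
	 * the ldp/stp pairs below.
	 */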
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
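	/*
	 * Each ldp/stp pair above moves 16 bytes backwards with writeback;
	 * together they cover the 48-, 32- and 16-byte remainders before
	 * the sub-16-byte tail takes over.
	 */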
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
	tbz	count, #0, .Lexitfunc
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]
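	/*
	 * The 8-, 4-, 2- and 1-byte copies above consume the low four bits
	 * of count. The final byte is copied without writeback because
	 * neither pointer is needed once the tbz on bit 0 has fallen
	 * through.
	 */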
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 bytes here and then
	 * jump to the tail.
	 */
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	D_l, D_h, [src, #-64]!
	stp	D_l, D_h, [dst, #-64]!
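	/*
	 * 64 bytes have now been copied in one go; whatever is left in
	 * count (at most 0x3f bytes) is finished off by the tail code.
	 */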
	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
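	/*
	 * Quick arithmetic for the claim above: with 64-byte lines, the
	 * four pre-load instructions (16 bytes) plus the copy loop body
	 * (about ten instructions, roughly 40 bytes) come to under 64
	 * bytes, so everything fits in the single cache line that the
	 * .p2align establishes.
	 */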
	/* Pre-load 64 bytes of data. */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
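	/*
	 * count was already biased by -128 before branching here: 64 bytes
	 * for the block just pre-loaded and 64 for the block the loop
	 * keeps in flight, so the loop only needs to subtract 64 per
	 * iteration.
	 */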
	/*
	 * Interleave the load of the next 64-byte block with the store of
	 * the 64 bytes loaded on the previous iteration.
	 */
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!
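	/*
	 * The loop exits with the last 64 bytes still sitting in A..D; the
	 * four stores above flush them. Any remaining bytes in count
	 * (fewer than 64) are handled by the tail code.
	 */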