GNU Linux-libre 4.19.264-gnu1
arch/powerpc/lib/copyuser_64.S
/*
 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/asm-compat.h>
#include <asm/feature-fixups.h>

#ifndef SELFTEST_CASE
/* 0 == most CPUs, 1 == POWER6, 2 == Cell */
#define SELFTEST_CASE	0
#endif
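
/*
 * (SELFTEST_CASE is presumably provided by the userspace copyloops
 * selftests, which assemble this file with a fixed value so that each
 * "test_feature = ..." assignment below statically selects one side
 * of its feature section.)
 */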

#ifdef __BIG_ENDIAN__
#define sLd sld		/* Shift towards low-numbered address. */
#define sHd srd		/* Shift towards high-numbered address. */
#else
#define sLd srd		/* Shift towards low-numbered address. */
#define sHd sld		/* Shift towards high-numbered address. */
#endif
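
/*
 * Worked example (illustrative, little-endian case): for a source that
 * is off = 3 bytes past an 8-byte boundary, r10 = 8*off = 24 and
 * r11 = 64 - r10 = 40.  Each aligned destination doubleword is then
 * assembled from two consecutive source doublewords w0, w1 as
 *
 *	sLd	rA,w0,r10	# rA = w0 >> 24 (bytes 3..7 of w0)
 *	sHd	rB,w1,r11	# rB = w1 << 40 (bytes 0..2 of w1)
 *	or	rA,rA,rB	# one aligned 8-byte destination word
 *
 * On big-endian the shift directions swap, which is all the sLd/sHd
 * macros above encode.
 */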

/*
 * These macros are used to generate exception table entries.
 * The exception handlers below use the original arguments
 * (stored on the stack) and the point where we're up to in
 * the destination buffer, i.e. the address of the first
 * unmodified byte.  Generally r3 points into the destination
 * buffer, but the first unmodified byte is at a variable
 * offset from r3.  In the code below, the symbol r3_offset
 * is set to indicate the current offset at each point in
 * the code.  This offset is then used as a negative offset
 * from the exception handler code, and those instructions
 * before the exception handlers are addi instructions that
 * adjust r3 to point to the correct place.
 */
	.macro	lex		/* exception handler for load */
100:	EX_TABLE(100b, .Lld_exc - r3_offset)
	.endm

	.macro	stex		/* exception handler for store */
100:	EX_TABLE(100b, .Lst_exc - r3_offset)
	.endm
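
/*
 * For example, with r3_offset = 16 in effect,
 *
 *	lex;	ld	r7,0(r4)
 *
 * assembles to roughly
 *
 * 100:	ld	r7,0(r4)
 *	EX_TABLE(100b, .Lld_exc - 16)
 *
 * so a fault on that load resumes 16 bytes before .Lld_exc, in the
 * addi ladder that adds 16 to r3 before falling into the handler.
 */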

	.align	7
_GLOBAL_TOC(__copy_tofrom_user)
#ifdef CONFIG_PPC_BOOK3S_64
BEGIN_FTR_SECTION
	nop
FTR_SECTION_ELSE
	b	__copy_tofrom_user_power7
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
#endif
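/*
 * On Book3S 64 CPUs with CPU_FTR_VMX_COPY, the feature fixup above
 * patches in a branch to the VMX-assisted __copy_tofrom_user_power7;
 * otherwise the nop stands and we fall through to the integer copy
 * below.
 */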
_GLOBAL(__copy_tofrom_user_base)
	/* first check for a 4kB copy on a 4kB boundary */
	cmpldi	cr1,r5,16
	cmpdi	cr6,r5,4096
	or	r0,r3,r4
	neg	r6,r3		/* LS 3 bits = # bytes to 8-byte dest bdry */
	andi.	r0,r0,4095
	std	r3,-24(r1)
	crand	cr0*4+2,cr0*4+2,cr6*4+2
	std	r4,-16(r1)
	std	r5,-8(r1)
	dcbt	0,r4
	beq	.Lcopy_page_4K
	andi.	r6,r6,7
	PPC_MTOCRF(0x01,r5)
	blt	cr1,.Lshort_copy
/* Below we want to nop out the bne if we're on a CPU that has the
 * CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
 * cleared.
 * At the time of writing the only CPU that has this combination of bits
 * set is Power6.
 */
test_feature = (SELFTEST_CASE == 1)
BEGIN_FTR_SECTION
	nop
FTR_SECTION_ELSE
	bne	.Ldst_unaligned
ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
		    CPU_FTR_UNALIGNED_LD_STD)
.Ldst_aligned:
	addi	r3,r3,-16
r3_offset = 16
test_feature = (SELFTEST_CASE == 0)
BEGIN_FTR_SECTION
	andi.	r0,r4,7
	bne	.Lsrc_unaligned
END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
	blt	cr1,.Ldo_tail		/* if < 16 bytes to copy */
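/*
 * The 32-bytes-per-iteration loop below is software-pipelined: the
 * loads at 21:/22: run one iteration ahead of the stores, so load
 * latency is hidden behind the stores of the previous 16 bytes.
 */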
	srdi	r0,r5,5
	cmpdi	cr1,r0,0
lex;	ld	r7,0(r4)
lex;	ld	r6,8(r4)
	addi	r4,r4,16
	mtctr	r0
	andi.	r0,r5,0x10
	beq	22f
	addi	r3,r3,16
r3_offset = 0
	addi	r4,r4,-16
	mr	r9,r7
	mr	r8,r6
	beq	cr1,72f
21:
lex;	ld	r7,16(r4)
lex;	ld	r6,24(r4)
	addi	r4,r4,32
stex;	std	r9,0(r3)
r3_offset = 8
stex;	std	r8,8(r3)
r3_offset = 16
22:
lex;	ld	r9,0(r4)
lex;	ld	r8,8(r4)
stex;	std	r7,16(r3)
r3_offset = 24
stex;	std	r6,24(r3)
	addi	r3,r3,32
r3_offset = 0
	bdnz	21b
72:
stex;	std	r9,0(r3)
r3_offset = 8
stex;	std	r8,8(r3)
r3_offset = 16
	andi.	r5,r5,0xf
	beq+	3f
	addi	r4,r4,16
.Ldo_tail:
	addi	r3,r3,16
r3_offset = 0
	bf	cr7*4+0,246f
lex;	ld	r9,0(r4)
	addi	r4,r4,8
stex;	std	r9,0(r3)
	addi	r3,r3,8
246:	bf	cr7*4+1,1f
lex;	lwz	r9,0(r4)
	addi	r4,r4,4
stex;	stw	r9,0(r3)
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
lex;	lhz	r9,0(r4)
	addi	r4,r4,2
stex;	sth	r9,0(r3)
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
lex;	lbz	r9,0(r4)
stex;	stb	r9,0(r3)
3:	li	r3,0
	blr

.Lsrc_unaligned:
r3_offset = 16
	srdi	r6,r5,3
	addi	r5,r5,-16
	subf	r4,r0,r4
	srdi	r7,r5,4
	sldi	r10,r0,3
	cmpldi	cr6,r6,3
	andi.	r5,r5,7
	mtctr	r7
	subfic	r11,r10,64
	add	r5,r5,r0
	bt	cr7*4+0,28f

lex;	ld	r9,0(r4)	/* 3+2n loads, 2+2n stores */
lex;	ld	r0,8(r4)
	sLd	r6,r9,r10
lex;	ldu	r9,16(r4)
	sHd	r7,r0,r11
	sLd	r8,r0,r10
	or	r7,r7,r6
	blt	cr6,79f
lex;	ld	r0,8(r4)
	b	2f

28:
lex;	ld	r0,0(r4)	/* 4+2n loads, 3+2n stores */
lex;	ldu	r9,8(r4)
	sLd	r8,r0,r10
	addi	r3,r3,-8
r3_offset = 24
	blt	cr6,5f
lex;	ld	r0,8(r4)
	sHd	r12,r9,r11
	sLd	r6,r9,r10
lex;	ldu	r9,16(r4)
	or	r12,r8,r12
	sHd	r7,r0,r11
	sLd	r8,r0,r10
	addi	r3,r3,16
r3_offset = 8
	beq	cr6,78f

1:	or	r7,r7,r6
lex;	ld	r0,8(r4)
stex;	std	r12,8(r3)
r3_offset = 16
2:	sHd	r12,r9,r11
	sLd	r6,r9,r10
lex;	ldu	r9,16(r4)
	or	r12,r8,r12
stex;	stdu	r7,16(r3)
r3_offset = 8
	sHd	r7,r0,r11
	sLd	r8,r0,r10
	bdnz	1b

78:
stex;	std	r12,8(r3)
r3_offset = 16
	or	r7,r7,r6
79:
stex;	std	r7,16(r3)
r3_offset = 24
5:	sHd	r12,r9,r11
	or	r12,r8,r12
stex;	std	r12,24(r3)
r3_offset = 32
	bne	6f
	li	r3,0
	blr
6:	cmpwi	cr1,r5,8
	addi	r3,r3,32
r3_offset = 0
	sLd	r9,r9,r10
	ble	cr1,7f
lex;	ld	r0,8(r4)
	sHd	r7,r0,r11
	or	r9,r7,r9
7:
	bf	cr7*4+1,1f
#ifdef __BIG_ENDIAN__
	rotldi	r9,r9,32
#endif
stex;	stw	r9,0(r3)
#ifdef __LITTLE_ENDIAN__
	rotrdi	r9,r9,32
#endif
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
#ifdef __BIG_ENDIAN__
	rotldi	r9,r9,16
#endif
stex;	sth	r9,0(r3)
#ifdef __LITTLE_ENDIAN__
	rotrdi	r9,r9,16
#endif
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
#ifdef __BIG_ENDIAN__
	rotldi	r9,r9,8
#endif
stex;	stb	r9,0(r3)
#ifdef __LITTLE_ENDIAN__
	rotrdi	r9,r9,8
#endif
3:	li	r3,0
	blr

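/*
 * While aligning the destination below, r3 still points at the start
 * of the buffer and r7 counts the bytes copied so far, so these
 * entries use dedicated handlers (.Lld_exc_r7/.Lst_exc_r7) that add
 * r7 to r3 before running the common exception code.
 */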
.Ldst_unaligned:
r3_offset = 0
	PPC_MTOCRF(0x01,r6)		/* put #bytes to 8B bdry into cr7 */
	subf	r5,r6,r5
	li	r7,0
	cmpldi	cr1,r5,16
	bf	cr7*4+3,1f
100:	EX_TABLE(100b, .Lld_exc_r7)
	lbz	r0,0(r4)
100:	EX_TABLE(100b, .Lst_exc_r7)
	stb	r0,0(r3)
	addi	r7,r7,1
1:	bf	cr7*4+2,2f
100:	EX_TABLE(100b, .Lld_exc_r7)
	lhzx	r0,r7,r4
100:	EX_TABLE(100b, .Lst_exc_r7)
	sthx	r0,r7,r3
	addi	r7,r7,2
2:	bf	cr7*4+1,3f
100:	EX_TABLE(100b, .Lld_exc_r7)
	lwzx	r0,r7,r4
100:	EX_TABLE(100b, .Lst_exc_r7)
	stwx	r0,r7,r3
3:	PPC_MTOCRF(0x01,r5)
	add	r4,r6,r4
	add	r3,r6,r3
	b	.Ldst_aligned

.Lshort_copy:
r3_offset = 0
	bf	cr7*4+0,1f
lex;	lwz	r0,0(r4)
lex;	lwz	r9,4(r4)
	addi	r4,r4,8
stex;	stw	r0,0(r3)
stex;	stw	r9,4(r3)
	addi	r3,r3,8
1:	bf	cr7*4+1,2f
lex;	lwz	r0,0(r4)
	addi	r4,r4,4
stex;	stw	r0,0(r3)
	addi	r3,r3,4
2:	bf	cr7*4+2,3f
lex;	lhz	r0,0(r4)
	addi	r4,r4,2
stex;	sth	r0,0(r3)
	addi	r3,r3,2
3:	bf	cr7*4+3,4f
lex;	lbz	r0,0(r4)
stex;	stb	r0,0(r3)
4:	li	r3,0
	blr

/*
 * exception handlers follow
 * we have to return the number of bytes not copied
 * for an exception on a load we don't zero the tail; instead we keep
 * copying byte-by-byte (see .Lld_exc) to salvage as much as possible
 * Note that the number of bytes of instructions for adjusting r3 needs
 * to equal the amount of the adjustment, due to the trick of using
 * .Lld_exc - r3_offset as the handler address.
 */
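
/*
 * Worked example: suppose a load faults at a point where r3_offset = 16.
 * Its exception entry sends us to .Lld_exc - 16, i.e. 16 bytes (two
 * addi/nop pairs) before .Lld_exc, so execution does
 *
 *	addi	r3,r3,8
 *	nop
 *	addi	r3,r3,8
 *	nop
 *
 * and falls into .Lld_exc with r3 advanced by exactly r3_offset,
 * pointing at the first unmodified destination byte.
 */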

.Lld_exc_r7:
	add	r3,r3,r7
	b	.Lld_exc

	/* adjust by 24 */
	addi	r3,r3,8
	nop
	/* adjust by 16 */
	addi	r3,r3,8
	nop
	/* adjust by 8 */
	addi	r3,r3,8
	nop

/*
 * Here we have had a fault on a load and r3 points to the first
 * unmodified byte of the destination.  We use the original arguments
 * and r3 to work out how much wasn't copied.  Since we load some
 * distance ahead of the stores, we continue copying byte-by-byte until
 * we hit the load fault again in order to copy as much as possible.
 */
.Lld_exc:
	ld	r6,-24(r1)
	ld	r4,-16(r1)
	ld	r5,-8(r1)
	subf	r6,r6,r3
	add	r4,r4,r6
	subf	r5,r6,r5	/* #bytes left to go */

/*
 * first see if we can copy any more bytes before hitting another exception
 */
	mtctr	r5
r3_offset = 0
100:	EX_TABLE(100b, .Ldone)
43:	lbz	r0,0(r4)
	addi	r4,r4,1
stex;	stb	r0,0(r3)
	addi	r3,r3,1
	bdnz	43b
	li	r3,0		/* all bytes copied before another fault */
	blr

/*
 * here we have trapped again, amount remaining is in ctr.
 */
.Ldone:
	mfctr	r3
	blr

/*
 * exception handlers for stores: we need to work out how many bytes
 * weren't copied, and we may need to copy some more.
 * Note that the number of bytes of instructions for adjusting r3 needs
 * to equal the amount of the adjustment, due to the trick of using
 * .Lst_exc - r3_offset as the handler address.
 */
.Lst_exc_r7:
	add	r3,r3,r7
	b	.Lst_exc

	/* adjust by 24 */
	addi	r3,r3,8
	nop
	/* adjust by 16 */
	addi	r3,r3,8
	nop
	/* adjust by 8 */
	addi	r3,r3,4
	/* adjust by 4 */
	addi	r3,r3,4
.Lst_exc:
	ld	r6,-24(r1)	/* original destination pointer */
	ld	r4,-16(r1)	/* original source pointer */
	ld	r5,-8(r1)	/* original number of bytes */
	add	r7,r6,r5
	/*
	 * If the destination pointer isn't 8-byte aligned,
	 * we may have got the exception as a result of a
	 * store that overlapped a page boundary, so we may be
	 * able to copy a few more bytes.
	 */
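	/*
	 * For example, if a std starting 5 bytes before a page
	 * boundary crosses into an unwritable page, the store faults
	 * as a whole even though its first five bytes lie in a
	 * writable page; the byte loop below recovers them.
	 */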
17:	andi.	r0,r3,7
	beq	19f
	subf	r8,r6,r3	/* #bytes copied */
100:	EX_TABLE(100b,19f)
	lbzx	r0,r8,r4
100:	EX_TABLE(100b,19f)
	stb	r0,0(r3)
	addi	r3,r3,1
	cmpld	r3,r7
	blt	17b
19:	subf	r3,r3,r7	/* #bytes not copied in r3 */
	blr

/*
 * Routine to copy a whole page of data, optimized for POWER4.
 * On POWER4 it is more than 50% faster than the simple loop
 * above (following the .Ldst_aligned label).
 */
	.macro	exc
100:	EX_TABLE(100b, .Labort)
	.endm
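
/*
 * The loop below appears to be scheduled so that loads are issued from
 * six streams 128 bytes apart (offsets 0, 128, ..., 640), keeping
 * several cache-line fetches in flight at once.  Each outer iteration
 * copies 768 bytes (8 in the preamble, 5 x 24 in the inner loop, then
 * a 640-byte advance), matching the "addi r5,r5,-24" chunk count of
 * 24 x 32 bytes.
 */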
.Lcopy_page_4K:
	std	r31,-32(1)
	std	r30,-40(1)
	std	r29,-48(1)
	std	r28,-56(1)
	std	r27,-64(1)
	std	r26,-72(1)
	std	r25,-80(1)
	std	r24,-88(1)
	std	r23,-96(1)
	std	r22,-104(1)
	std	r21,-112(1)
	std	r20,-120(1)
	li	r5,4096/32 - 1
	addi	r3,r3,-8
	li	r0,5
0:	addi	r5,r5,-24
	mtctr	r0
exc;	ld	r22,640(4)
exc;	ld	r21,512(4)
exc;	ld	r20,384(4)
exc;	ld	r11,256(4)
exc;	ld	r9,128(4)
exc;	ld	r7,0(4)
exc;	ld	r25,648(4)
exc;	ld	r24,520(4)
exc;	ld	r23,392(4)
exc;	ld	r10,264(4)
exc;	ld	r8,136(4)
exc;	ldu	r6,8(4)
	cmpwi	r5,24
1:
exc;	std	r22,648(3)
exc;	std	r21,520(3)
exc;	std	r20,392(3)
exc;	std	r11,264(3)
exc;	std	r9,136(3)
exc;	std	r7,8(3)
exc;	ld	r28,648(4)
exc;	ld	r27,520(4)
exc;	ld	r26,392(4)
exc;	ld	r31,264(4)
exc;	ld	r30,136(4)
exc;	ld	r29,8(4)
exc;	std	r25,656(3)
exc;	std	r24,528(3)
exc;	std	r23,400(3)
exc;	std	r10,272(3)
exc;	std	r8,144(3)
exc;	std	r6,16(3)
exc;	ld	r22,656(4)
exc;	ld	r21,528(4)
exc;	ld	r20,400(4)
exc;	ld	r11,272(4)
exc;	ld	r9,144(4)
exc;	ld	r7,16(4)
exc;	std	r28,664(3)
exc;	std	r27,536(3)
exc;	std	r26,408(3)
exc;	std	r31,280(3)
exc;	std	r30,152(3)
exc;	stdu	r29,24(3)
exc;	ld	r25,664(4)
exc;	ld	r24,536(4)
exc;	ld	r23,408(4)
exc;	ld	r10,280(4)
exc;	ld	r8,152(4)
exc;	ldu	r6,24(4)
	bdnz	1b
exc;	std	r22,648(3)
exc;	std	r21,520(3)
exc;	std	r20,392(3)
exc;	std	r11,264(3)
exc;	std	r9,136(3)
exc;	std	r7,8(3)
	addi	r4,r4,640
	addi	r3,r3,648
	bge	0b
	mtctr	r5
exc;	ld	r7,0(4)
exc;	ld	r8,8(4)
exc;	ldu	r9,16(4)
3:
exc;	ld	r10,8(4)
exc;	std	r7,8(3)
exc;	ld	r7,16(4)
exc;	std	r8,16(3)
exc;	ld	r8,24(4)
exc;	std	r9,24(3)
exc;	ldu	r9,32(4)
exc;	stdu	r10,32(3)
	bdnz	3b
4:
exc;	ld	r10,8(4)
exc;	std	r7,8(3)
exc;	std	r8,16(3)
exc;	std	r9,24(3)
exc;	std	r10,32(3)
9:	ld	r20,-120(1)
	ld	r21,-112(1)
	ld	r22,-104(1)
	ld	r23,-96(1)
	ld	r24,-88(1)
	ld	r25,-80(1)
	ld	r26,-72(1)
	ld	r27,-64(1)
	ld	r28,-56(1)
	ld	r29,-48(1)
	ld	r30,-40(1)
	ld	r31,-32(1)
	li	r3,0
	blr

/*
 * on an exception, reset to the beginning and jump back into the
 * standard __copy_tofrom_user
 */
.Labort:
	ld	r20,-120(1)
	ld	r21,-112(1)
	ld	r22,-104(1)
	ld	r23,-96(1)
	ld	r24,-88(1)
	ld	r25,-80(1)
	ld	r26,-72(1)
	ld	r27,-64(1)
	ld	r28,-56(1)
	ld	r29,-48(1)
	ld	r30,-40(1)
	ld	r31,-32(1)
	ld	r3,-24(r1)
	ld	r4,-16(r1)
	li	r5,4096
	b	.Ldst_aligned
EXPORT_SYMBOL(__copy_tofrom_user)