arch/powerpc/lib/checksum_64.S

   1 /*
   2  * This file contains assembly-language implementations
   3  * of IP-style 1's complement checksum routines.
   4  *
   5  *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
   6  *
   7  *  This program is free software; you can redistribute it and/or
   8  *  modify it under the terms of the GNU General Public License
   9  *  as published by the Free Software Foundation; either version
  10  *  2 of the License, or (at your option) any later version.
  11  *
  12  * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
  13  */
  14
  15 #include <linux/sys.h>
  16 #include <asm/processor.h>
  17 #include <asm/errno.h>
  18 #include <asm/ppc_asm.h>
  19 #include <asm/export.h>
  20
  21 /*
  22  * Computes the checksum of a memory block at buff, length len,
  23  * and adds in "sum" (32-bit).
  24  *
  25  * __csum_partial(r3=buff, r4=len, r5=sum)
      *
      * Returns in r3 the 64-bit running sum folded down to 32 bits.
      *
      * Register usage:
      *   r0          64-bit running sum; it is accumulated with adde so the
      *               carry out of each addition is fed into the next one,
      *               and the last carry is added by addze at .Lcsum_finish
      *   r3          current position in the buffer
      *   r4          bytes remaining (brought up to date after each phase)
      *   r6, r9-r12  data loaded from the buffer (r6, r7 and r9 also serve
      *               as scratch)
      *   r14-r16     extra data registers for the unrolled loop; they are
      *               saved in a temporary stack frame and restored afterwards
      *   ctr         loop counter
  26  */
  27 _GLOBAL(__csum_partial)
  28         addic   r0,r5,0                 /* r0 = sum, clear carry */
  29
  30         srdi.   r6,r4,3                 /* less than 8 bytes? */
  31         beq     .Lcsum_tail_word        /* yes: only the tail code is needed */
  32
  33         /*
  34          * If only halfword aligned, align to a double word. Since odd
  35          * aligned addresses should be rare and they would require more
  36          * work to calculate the correct checksum, we ignore that case
  37          * and take the potential slowdown of unaligned loads.
  38          */
  39         rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
  40         beq     .Lcsum_aligned          /* r6 == 0: skip the alignment loop */
  41
  42         li      r7,4
  43         sub     r6,r7,r6                /* r6 = 4 - r6 halfwords to consume */
  44         mtctr   r6
  45
  46 1:
  47         lhz     r6,0(r3)                /* align to doubleword */
  48         subi    r4,r4,2                 /* len -= 2 */
  49         addi    r3,r3,2                 /* buff += 2 */
  50         adde    r0,r0,r6                /* sum += halfword + carry */
  51         bdnz    1b
  52
  53 .Lcsum_aligned:
  54         /*
  55          * We unroll the loop such that each iteration is 64 bytes with an
  56          * entry and exit limb of 64 bytes, meaning a minimum size of
  57          * 128 bytes.
  58          */
  59         srdi.   r6,r4,7                 /* r6 = len / 128 */
  60         beq     .Lcsum_tail_doublewords         /* len < 128 */
  61
  62         srdi    r6,r4,6
  63         subi    r6,r6,1                 /* len / 64 - 1: the exit limb sums the last block */
  64         mtctr   r6
  65
  66         stdu    r1,-STACKFRAMESIZE(r1)  /* new stack frame to hold r14-r16 */
  67         std     r14,STK_REG(R14)(r1)
  68         std     r15,STK_REG(R15)(r1)
  69         std     r16,STK_REG(R16)(r1)
  70
  71         ld      r6,0(r3)                /* entry limb: preload bytes 0-31 */
  72         ld      r9,8(r3)
  73
  74         ld      r10,16(r3)
  75         ld      r11,24(r3)
  76
  77         /*
  78          * On POWER6 and POWER7 back to back adde instructions take 2 cycles
  79          * because of the XER dependency. This means the fastest this loop can
  80          * go is 16 cycles per iteration. The scheduling of the loop below has
  81          * been shown to hit this on both POWER6 and POWER7.
  82          */
  83         .align 5
  84 2:
  85         adde    r0,r0,r6
  86         ld      r12,32(r3)              /* load bytes 32-63 of the current block */
  87         ld      r14,40(r3)
  88
  89         adde    r0,r0,r9
  90         ld      r15,48(r3)
  91         ld      r16,56(r3)
  92         addi    r3,r3,64                /* r3 -> next 64-byte block */
  93
  94         adde    r0,r0,r10
  95
  96         adde    r0,r0,r11
  97
  98         adde    r0,r0,r12
  99
 100         adde    r0,r0,r14
 101
 102         adde    r0,r0,r15
 103         ld      r6,0(r3)                /* preload bytes 0-31 of the next block */
 104         ld      r9,8(r3)
 105
 106         adde    r0,r0,r16
 107         ld      r10,16(r3)
 108         ld      r11,24(r3)
 109         bdnz    2b
 110
 111
 112         adde    r0,r0,r6                /* exit limb: last 64-byte block, nothing preloaded */
 113         ld      r12,32(r3)
 114         ld      r14,40(r3)
 115
 116         adde    r0,r0,r9
 117         ld      r15,48(r3)
 118         ld      r16,56(r3)
 119         addi    r3,r3,64
 120
 121         adde    r0,r0,r10
 122         adde    r0,r0,r11
 123         adde    r0,r0,r12
 124         adde    r0,r0,r14
 125         adde    r0,r0,r15
 126         adde    r0,r0,r16
 127
 128         ld      r14,STK_REG(R14)(r1)    /* restore r14-r16 and pop the frame */
 129         ld      r15,STK_REG(R15)(r1)
 130         ld      r16,STK_REG(R16)(r1)
 131         addi    r1,r1,STACKFRAMESIZE
 132
 133         andi.   r4,r4,63                /* len %= 64: bytes left after the 64-byte blocks */
 134
 135 .Lcsum_tail_doublewords:                /* Up to 127 bytes to go */
 136         srdi.   r6,r4,3                 /* r6 = number of whole doublewords left */
 137         beq     .Lcsum_tail_word        /* none */
 138
 139         mtctr   r6
 140 3:
 141         ld      r6,0(r3)
 142         addi    r3,r3,8
 143         adde    r0,r0,r6                /* sum += doubleword + carry */
 144         bdnz    3b
 145
 146         andi.   r4,r4,7                 /* len %= 8 */
 147
 148 .Lcsum_tail_word:                       /* Up to 7 bytes to go */
 149         srdi.   r6,r4,2                 /* at least 4 bytes left? */
 150         beq     .Lcsum_tail_halfword
 151
 152         lwz     r6,0(r3)
 153         addi    r3,r3,4
 154         adde    r0,r0,r6
 155         subi    r4,r4,4
 156
 157 .Lcsum_tail_halfword:                   /* Up to 3 bytes to go */
 158         srdi.   r6,r4,1                 /* at least 2 bytes left? */
 159         beq     .Lcsum_tail_byte
 160
 161         lhz     r6,0(r3)
 162         addi    r3,r3,2
 163         adde    r0,r0,r6
 164         subi    r4,r4,2
 165
 166 .Lcsum_tail_byte:                       /* Up to 1 byte to go */
 167         andi.   r6,r4,1                 /* one byte left? */
 168         beq     .Lcsum_finish
 169
 170         lbz     r6,0(r3)
 171 #ifdef __BIG_ENDIAN__
 172         sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
 173         adde    r0,r0,r9
 174 #else
 175         adde    r0,r0,r6                /* little endian: the byte is added unshifted */
 176 #endif
 177
 178 .Lcsum_finish:
 179         addze   r0,r0                   /* add in final carry */
 180         rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
 181         add     r3,r4,r0                /* r3 = sum + (sum rotated by 32 bits) */
 182         srdi    r3,r3,32                /* result = upper 32 bits of that addition */
 183         blr
 184 EXPORT_SYMBOL(__csum_partial)
 185
 186
 187         .macro srcnr                    /* tags a source access made while no stack frame is active */
 188 100:                                    /* labels the instruction written after the macro */
 189         EX_TABLE(100b,.Lsrc_error_nr)   /* EX_TABLE is defined outside this file; it is given that instruction and .Lsrc_error_nr as handler */
 190         .endm
 191
 192         .macro source                   /* tags a source access inside the unrolled loop (r14-r16 saved on the stack) */
 193 150:
 194         EX_TABLE(150b,.Lsrc_error)      /* .Lsrc_error restores r14-r16 and pops the frame first */
 195         .endm
 196
 197         .macro dstnr                    /* tags a destination access made while no stack frame is active */
 198 200:
 199         EX_TABLE(200b,.Ldest_error_nr)
 200         .endm
 201
 202         .macro dest                     /* tags a destination access inside the unrolled loop (r14-r16 saved on the stack) */
 203 250:
 204         EX_TABLE(250b,.Ldest_error)     /* .Ldest_error restores r14-r16 and pops the frame first */
 205         .endm
 206
 207 /*
 208  * Computes the checksum of a memory block at src, length len,
 209  * and adds in "sum" (32-bit), while copying the block to dst.
 210  * If an access exception occurs on src or dst, it stores -EFAULT
 211  * to *src_err or *dst_err respectively. The caller must take any action
 212  * required in this case (zeroing memory, recalculating partial checksum etc).
 213  *
 214  * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
      *
      * On the normal path r3 returns the 64-bit running sum folded down to
      * 32 bits. The error handlers at the end of the function return without
      * writing a checksum to r3, and they store -EFAULT only if the
      * corresponding error pointer (r7 or r8) is non-NULL.
      *
      * Register usage:
      *   r0          64-bit running sum; it is accumulated with adde so the
      *               carry out of each addition is fed into the next one,
      *               and the last carry is added by addze at .Lcopy_finish
      *   r3, r4      current source / destination position
      *   r5          bytes remaining (brought up to date after each phase)
      *   r6, r9-r12  data in flight (r6 and r9 also serve as scratch)
      *   r7, r8      src_err / dst_err; only read by the error handlers
      *   r14-r16     extra data registers for the unrolled loop; they are
      *               saved in a temporary stack frame and restored afterwards
      *   ctr         loop counter
      *
      * Every load from src and every store to dst is prefixed with one of the
      * srcnr/source/dstnr/dest macros. source and dest are used only between
      * the creation and the removal of the stack frame, because their
      * handlers restore r14-r16 and pop that frame; srcnr and dstnr are used
      * everywhere else.
 215  */
 216 _GLOBAL(csum_partial_copy_generic)
 217         addic   r0,r6,0                 /* r0 = sum, clear carry */
 218
 219         srdi.   r6,r5,3                 /* less than 8 bytes? */
 220         beq     .Lcopy_tail_word        /* yes: only the tail code is needed */
 221
 222         /*
 223          * If only halfword aligned, align to a double word. Since odd
 224          * aligned addresses should be rare and they would require more
 225          * work to calculate the correct checksum, we ignore that case
 226          * and take the potential slowdown of unaligned loads.
 227          *
 228          * If the source and destination are relatively unaligned we only
 229          * align the source. This keeps things simple.
 230          */
 231         rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
 232         beq     .Lcopy_aligned          /* r6 == 0: skip the alignment loop */
 233
 234         li      r9,4                    /* r9 is used because r7 holds src_err */
 235         sub     r6,r9,r6                /* r6 = 4 - r6 halfwords to consume */
 236         mtctr   r6
 237
 238 1:
 239 srcnr;  lhz     r6,0(r3)                /* align to doubleword */
 240         subi    r5,r5,2                 /* len -= 2 */
 241         addi    r3,r3,2                 /* src += 2 */
 242         adde    r0,r0,r6                /* sum += halfword + carry */
 243 dstnr;  sth     r6,0(r4)                /* copy the halfword to dst */
 244         addi    r4,r4,2                 /* dst += 2 */
 245         bdnz    1b
 246
 247 .Lcopy_aligned:
 248         /*
 249          * We unroll the loop such that each iteration is 64 bytes with an
 250          * entry and exit limb of 64 bytes, meaning a minimum size of
 251          * 128 bytes.
 252          */
 253         srdi.   r6,r5,7                 /* r6 = len / 128 */
 254         beq     .Lcopy_tail_doublewords         /* len < 128 */
 255
 256         srdi    r6,r5,6
 257         subi    r6,r6,1                 /* len / 64 - 1: the exit limb handles the last block */
 258         mtctr   r6
 259
 260         stdu    r1,-STACKFRAMESIZE(r1)  /* new stack frame to hold r14-r16 */
 261         std     r14,STK_REG(R14)(r1)
 262         std     r15,STK_REG(R15)(r1)
 263         std     r16,STK_REG(R16)(r1)
 264
 265 source; ld      r6,0(r3)                /* entry limb: preload bytes 0-31 */
 266 source; ld      r9,8(r3)
 267
 268 source; ld      r10,16(r3)
 269 source; ld      r11,24(r3)
 270
 271         /*
 272          * On POWER6 and POWER7 back to back adde instructions take 2 cycles
 273          * because of the XER dependency. This means the fastest this loop can
 274          * go is 16 cycles per iteration. The scheduling of the loop below has
 275          * been shown to hit this on both POWER6 and POWER7.
 276          */
 277         .align 5
 278 2:
 279         adde    r0,r0,r6
 280 source; ld      r12,32(r3)              /* load bytes 32-63 of the current block */
 281 source; ld      r14,40(r3)
 282
 283         adde    r0,r0,r9
 284 source; ld      r15,48(r3)
 285 source; ld      r16,56(r3)
 286         addi    r3,r3,64                /* r3 -> next 64-byte source block */
 287
 288         adde    r0,r0,r10
 289 dest;   std     r6,0(r4)                /* store the current block to dst */
 290 dest;   std     r9,8(r4)
 291
 292         adde    r0,r0,r11
 293 dest;   std     r10,16(r4)
 294 dest;   std     r11,24(r4)
 295
 296         adde    r0,r0,r12
 297 dest;   std     r12,32(r4)
 298 dest;   std     r14,40(r4)
 299
 300         adde    r0,r0,r14
 301 dest;   std     r15,48(r4)
 302 dest;   std     r16,56(r4)
 303         addi    r4,r4,64                /* r4 -> next 64-byte destination block */
 304
 305         adde    r0,r0,r15
 306 source; ld      r6,0(r3)                /* preload bytes 0-31 of the next block */
 307 source; ld      r9,8(r3)
 308
 309         adde    r0,r0,r16
 310 source; ld      r10,16(r3)
 311 source; ld      r11,24(r3)
 312         bdnz    2b
 313
 314
 315         adde    r0,r0,r6                /* exit limb: last 64-byte block, nothing preloaded */
 316 source; ld      r12,32(r3)
 317 source; ld      r14,40(r3)
 318
 319         adde    r0,r0,r9
 320 source; ld      r15,48(r3)
 321 source; ld      r16,56(r3)
 322         addi    r3,r3,64
 323
 324         adde    r0,r0,r10
 325 dest;   std     r6,0(r4)
 326 dest;   std     r9,8(r4)
 327
 328         adde    r0,r0,r11
 329 dest;   std     r10,16(r4)
 330 dest;   std     r11,24(r4)
 331
 332         adde    r0,r0,r12
 333 dest;   std     r12,32(r4)
 334 dest;   std     r14,40(r4)
 335
 336         adde    r0,r0,r14
 337 dest;   std     r15,48(r4)
 338 dest;   std     r16,56(r4)
 339         addi    r4,r4,64
 340
 341         adde    r0,r0,r15
 342         adde    r0,r0,r16
 343
 344         ld      r14,STK_REG(R14)(r1)    /* restore r14-r16 and pop the frame */
 345         ld      r15,STK_REG(R15)(r1)
 346         ld      r16,STK_REG(R16)(r1)
 347         addi    r1,r1,STACKFRAMESIZE
 348
 349         andi.   r5,r5,63                /* len %= 64: bytes left after the 64-byte blocks */
 350
 351 .Lcopy_tail_doublewords:                /* Up to 127 bytes to go */
 352         srdi.   r6,r5,3                 /* r6 = number of whole doublewords left */
 353         beq     .Lcopy_tail_word        /* none */
 354
 355         mtctr   r6
 356 3:
 357 srcnr;  ld      r6,0(r3)
 358         addi    r3,r3,8
 359         adde    r0,r0,r6                /* sum += doubleword + carry */
 360 dstnr;  std     r6,0(r4)
 361         addi    r4,r4,8
 362         bdnz    3b
 363
 364         andi.   r5,r5,7                 /* len %= 8 */
 365
 366 .Lcopy_tail_word:                       /* Up to 7 bytes to go */
 367         srdi.   r6,r5,2                 /* at least 4 bytes left? */
 368         beq     .Lcopy_tail_halfword
 369
 370 srcnr;  lwz     r6,0(r3)
 371         addi    r3,r3,4
 372         adde    r0,r0,r6
 373 dstnr;  stw     r6,0(r4)
 374         addi    r4,r4,4
 375         subi    r5,r5,4
 376
 377 .Lcopy_tail_halfword:                   /* Up to 3 bytes to go */
 378         srdi.   r6,r5,1                 /* at least 2 bytes left? */
 379         beq     .Lcopy_tail_byte
 380
 381 srcnr;  lhz     r6,0(r3)
 382         addi    r3,r3,2
 383         adde    r0,r0,r6
 384 dstnr;  sth     r6,0(r4)
 385         addi    r4,r4,2
 386         subi    r5,r5,2
 387
 388 .Lcopy_tail_byte:                       /* Up to 1 byte to go */
 389         andi.   r6,r5,1                 /* one byte left? */
 390         beq     .Lcopy_finish
 391
 392 srcnr;  lbz     r6,0(r3)
 393 #ifdef __BIG_ENDIAN__
 394         sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
 395         adde    r0,r0,r9
 396 #else
 397         adde    r0,r0,r6                /* little endian: the byte is added unshifted */
 398 #endif
 399 dstnr;  stb     r6,0(r4)                /* r6 still holds the unshifted byte */
 400
 401 .Lcopy_finish:
 402         addze   r0,r0                   /* add in final carry */
 403         rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
 404         add     r3,r4,r0                /* r3 = sum + (sum rotated by 32 bits) */
 405         srdi    r3,r3,32                /* result = upper 32 bits of that addition */
 406         blr
 407
 408 .Lsrc_error:                            /* handler named by the "source" macro */
 409         ld      r14,STK_REG(R14)(r1)    /* undo the unrolled loop's frame first */
 410         ld      r15,STK_REG(R15)(r1)
 411         ld      r16,STK_REG(R16)(r1)
 412         addi    r1,r1,STACKFRAMESIZE
 413 .Lsrc_error_nr:                         /* handler named by the "srcnr" macro */
 414         cmpdi   0,r7,0                  /* src_err == NULL? */
 415         beqlr                           /* yes: return without reporting */
 416         li      r6,-EFAULT
 417         stw     r6,0(r7)                /* *src_err = -EFAULT (32-bit store) */
 418         blr
 419
 420 .Ldest_error:                           /* handler named by the "dest" macro */
 421         ld      r14,STK_REG(R14)(r1)    /* undo the unrolled loop's frame first */
 422         ld      r15,STK_REG(R15)(r1)
 423         ld      r16,STK_REG(R16)(r1)
 424         addi    r1,r1,STACKFRAMESIZE
 425 .Ldest_error_nr:                        /* handler named by the "dstnr" macro */
 426         cmpdi   0,r8,0                  /* dst_err == NULL? */
 427         beqlr                           /* yes: return without reporting */
 428         li      r6,-EFAULT
 429         stw     r6,0(r8)                /* *dst_err = -EFAULT (32-bit store) */
 430         blr
 431 EXPORT_SYMBOL(csum_partial_copy_generic)