arch/sparc/lib/checksum_64.S

   1 /* checksum.S: Sparc V9 optimized checksum code.
   2  *
   3  *  Copyright(C) 1995 Linus Torvalds
   4  *  Copyright(C) 1995 Miguel de Icaza
   5  *  Copyright(C) 1996, 2000 David S. Miller
   6  *  Copyright(C) 1997 Jakub Jelinek
   7  *
   8  * derived from:
   9  *      Linux/Alpha checksum c-code
  10  *      Linux/ix86 inline checksum assembly
  11  *      RFC1071 Computing the Internet Checksum (esp. Jacobson's m68k code)
  12  *      David Mosberger-Tang for optimized reference c-code
  13  *      BSD4.4 portable checksum routine
  14  */
  15
  16 #include <asm/export.h>
  17         .text
  18
  19 csum_partial_fix_alignment:
             /* Alignment fix-up path of csum_partial, entered when
              * (%o0 & 3) != 0.  On entry %icc and %g7 reflect
              * (%o0 & 1), as set by the andcc in the delay slot of
              * the branch that jumps here, and %o4 has been cleared.
              *
              * Consumes at most one leading byte and one leading
              * halfword into %o4 (advancing %o0 and shrinking %o1),
              * then rejoins csum_partial at csum_partial_post_align,
              * or at csum_partial_end_cruft when fewer than two
              * bytes remain.
              */
  20         /* We checked for zero length already, so there must be
  21          * at least one byte.
  22          */
             /* Bit 0 of the address clear: no odd leading byte. */
  23         be,pt           %icc, 1f
  24          nop
             /* Odd address: start the sum with the single leading byte. */
  25         ldub            [%o0 + 0x00], %o4
  26         add             %o0, 1, %o0
  27         sub             %o1, 1, %o1
             /* %o0 is now even.  If bit 1 is clear it is already
              * 4-byte aligned.  The cmp sits in the delay slot of a
              * non-annulled branch, so it executes either way and
              * feeds the blu below on the fall-through path.
              */
  28 1:      andcc           %o0, 0x2, %g0
  29         be,pn           %icc, csum_partial_post_align
  30          cmp            %o1, 2
             /* Fewer than 2 bytes left: let the tail code finish. */
  31         blu,pn          %icc, csum_partial_end_cruft
  32          nop
             /* Consume one halfword to reach 4-byte alignment; the
              * delay-slot add folds it into the running sum.
              */
  33         lduh            [%o0 + 0x00], %o5
  34         add             %o0, 2, %o0
  35         sub             %o1, 2, %o1
  36         ba,pt           %xcc, csum_partial_post_align
  37          add            %o5, %o4, %o4
  38
  39         .align          32
  40         .globl          csum_partial
  41         EXPORT_SYMBOL(csum_partial)
             /* csum_partial: fold the 16-bit end-around-carry sum of
              * a buffer into a running 32-bit sum.
              *
              * In:   %o0 = buff, %o1 = len in bytes, %o2 = sum
              * Out:  %o0 = (%o2 + 16-bit buffer sum) with the 32-bit
              *       carry added back in, zero-extended to 64 bits.
              *       For len == 0 this is just %o2 zero-extended.
              * Uses: %o3, %o4, %o5, %g1, %g2, %g3, %g7 and the
              *       condition codes as scratch.  Returns with retl;
              *       no register window is allocated.
              *
              * Register roles: %o4 = accumulator, %o3 = bytes left
              * for the current loop, %o1 = bytes left after it,
              * %g7 = (buff & 1).
              *
              * NOTE(review): no .type/.size directive for
              * csum_partial is visible in this span - confirm
              * whether symbol annotations are wanted.
              */
  42 csum_partial:           /* %o0=buff, %o1=len, %o2=sum */
  43         prefetch        [%o0 + 0x000], #n_reads
  44         clr             %o4
  45         prefetch        [%o0 + 0x040], #n_reads
             /* Zero length: return the incoming sum.  The delay-slot
              * andcc tests 4-byte alignment for the bne below.
              */
  46         brz,pn          %o1, csum_partial_finish
  47          andcc          %o0, 0x3, %g0
  48
  49         /* We "remember" in %g7 whether the lowest bit of the
  50          * address was set, because if it is we have to swap the
  51          * upper and lower 8 bit fields of the sum we calculate.
              * The andcc runs in the branch delay slot, so %g7 is
              * set on both paths and %icc carries the same test into
              * csum_partial_fix_alignment.
  52          */
  53         bne,pn          %icc, csum_partial_fix_alignment
  54          andcc          %o0, 0x1, %g7
  55
  56 csum_partial_post_align:
             /* Reached by fall-through or from the alignment fix-up.
              * Split the length: %o3 = len & ~0x3f is consumed by the
              * 64-byte loop, %o1 keeps the remaining 0..63 bytes.
              */
  57         prefetch        [%o0 + 0x080], #n_reads
  58         andncc          %o1, 0x3f, %o3
  59
  60         prefetch        [%o0 + 0x0c0], #n_reads
  61         sub             %o1, %o3, %o1
             /* No full 64-byte chunk: skip the unrolled loop. */
  62         brz,pn          %o3, 2f
  63          prefetch       [%o0 + 0x100], #n_reads
  64
  65         /* So that we don't need to use the non-pairing
  66          * add-with-carry instructions we accumulate 32-bit
  67          * values into a 64-bit register.  At the end of the
  68          * loop we fold it down to 32-bits and so on.
  69          */
  70         prefetch        [%o0 + 0x140], #n_reads
             /* Unrolled loop: sixteen 32-bit loads (64 bytes) per
              * iteration, rotating through %o5/%g1/%g2/%g3 so each
              * add trails its load by a few instructions.
              */
  71 1:      lduw            [%o0 + 0x00], %o5
  72         lduw            [%o0 + 0x04], %g1
  73         lduw            [%o0 + 0x08], %g2
  74         add             %o4, %o5, %o4
  75         lduw            [%o0 + 0x0c], %g3
  76         add             %o4, %g1, %o4
  77         lduw            [%o0 + 0x10], %o5
  78         add             %o4, %g2, %o4
  79         lduw            [%o0 + 0x14], %g1
  80         add             %o4, %g3, %o4
  81         lduw            [%o0 + 0x18], %g2
  82         add             %o4, %o5, %o4
  83         lduw            [%o0 + 0x1c], %g3
  84         add             %o4, %g1, %o4
  85         lduw            [%o0 + 0x20], %o5
  86         add             %o4, %g2, %o4
  87         lduw            [%o0 + 0x24], %g1
  88         add             %o4, %g3, %o4
  89         lduw            [%o0 + 0x28], %g2
  90         add             %o4, %o5, %o4
  91         lduw            [%o0 + 0x2c], %g3
  92         add             %o4, %g1, %o4
  93         lduw            [%o0 + 0x30], %o5
  94         add             %o4, %g2, %o4
  95         lduw            [%o0 + 0x34], %g1
  96         add             %o4, %g3, %o4
  97         lduw            [%o0 + 0x38], %g2
  98         add             %o4, %o5, %o4
  99         lduw            [%o0 + 0x3c], %g3
 100         add             %o4, %g1, %o4
 101         prefetch        [%o0 + 0x180], #n_reads
 102         add             %o4, %g2, %o4
             /* Count down the chunk bytes and advance the pointer.
              * The last add of the iteration sits in the branch
              * delay slot and executes on loop exit as well.
              */
 103         subcc           %o3, 0x40, %o3
 104         add             %o0, 0x40, %o0
 105         bne,pt          %icc, 1b
 106          add            %o4, %g3, %o4
 107
             /* Whole 32-bit words left over: %o3 = %o1 & 0x3c bytes.
              * The delay-slot sub leaves only the final 0..3 bytes
              * in %o1, whether or not the word loop runs.
              */
 108 2:      and             %o1, 0x3c, %o3
 109         brz,pn          %o3, 2f
 110          sub            %o1, %o3, %o1
             /* One 32-bit word per iteration. */
 111 1:      lduw            [%o0 + 0x00], %o5
 112         subcc           %o3, 0x4, %o3
 113         add             %o0, 0x4, %o0
 114         bne,pt          %icc, 1b
 115          add            %o4, %o5, %o4
 116
 117 2:
 118         /* fold 64-->32 */
             /* Add the upper 32 bits into the lower 32 bits.  The
              * srl by 0 clears bits 63:32.  The second pass absorbs
              * a carry produced by the first.
              */
 119         srlx            %o4, 32, %o5
 120         srl             %o4, 0, %o4
 121         add             %o4, %o5, %o4
 122         srlx            %o4, 32, %o5
 123         srl             %o4, 0, %o4
 124         add             %o4, %o5, %o4
 125
 126         /* fold 32-->16 */
             /* %g1 = 0xffff0000, so andn keeps the low 16 bits and
              * srl 16 extracts the high 16 bits.  Done twice so the
              * carry out of the first add is folded back in.
              */
 127         sethi           %hi(0xffff0000), %g1
 128         srl             %o4, 16, %o5
 129         andn            %o4, %g1, %g2
 130         add             %o5, %g2, %o4
 131         srl             %o4, 16, %o5
 132         andn            %o4, %g1, %g2
 133         add             %o5, %g2, %o4
 134
 135 csum_partial_end_cruft:
             /* Also entered from csum_partial_fix_alignment when
              * fewer than two bytes remain after the leading byte.
              * Handles the trailing halfword and/or byte in %o1.
              */
 136         /* %o4 has the 16-bit sum we have calculated so-far.  */
 137         cmp             %o1, 2
 138         blu,pt          %icc, 1f
 139          nop
             /* At least two bytes left: add one halfword. */
 140         lduh            [%o0 + 0x00], %o5
 141         sub             %o1, 2, %o1
 142         add             %o0, 2, %o0
 143         add             %o4, %o5, %o4
 144 1:      brz,pt          %o1, 1f
 145          nop
             /* One byte left: shift it into bits 15:8 so it counts
              * as the high-order byte of a halfword whose low byte
              * is zero.
              */
 146         ldub            [%o0 + 0x00], %o5
 147         sub             %o1, 1, %o1
 148         add             %o0, 1, %o0
 149         sllx            %o5, 8, %o5
 150         add             %o4, %o5, %o4
 151 1:
 152         /* fold 32-->16 */
             /* Same two-pass fold as above, applied after the tail
              * additions, leaving a 16-bit value in %o4.
              */
 153         sethi           %hi(0xffff0000), %g1
 154         srl             %o4, 16, %o5
 155         andn            %o4, %g1, %g2
 156         add             %o5, %g2, %o4
 157         srl             %o4, 16, %o5
 158         andn            %o4, %g1, %g2
 159         add             %o5, %g2, %o4
 160
             /* %g7 == 0 means buff was even: no byte-swap needed.
              * No branch in this span targets the "1:" label on the
              * next line.
              */
 161 1:      brz,pt          %g7, 1f
 162          nop
 163
 164         /* We started with an odd byte, byte-swap the result.  */
 165         srl             %o4, 8, %o5
 166         and             %o4, 0xff, %g1
 167         sll             %g1, 8, %g1
 168         or              %o5, %g1, %o4
 169
             /* Add the buffer sum to the caller's sum.  addcc sets
              * the 32-bit carry in %icc and addc adds it back in.
              */
 170 1:      addcc           %o2, %o4, %o2
 171         addc            %g0, %o2, %o2
 172
 173 csum_partial_finish:
             /* Leaf return; the delay slot zero-extends the 32-bit
              * result from %o2 into the return register %o0.
              */
 174         retl
 175          srl            %o2, 0, %o0