#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for AES instructions as per PowerISA
# specification version 2.07, first implemented by the POWER8 processor.
# The module is endian-agnostic in the sense that it supports both big-
# and little-endian cases. Data alignment in parallelizable modes is
# handled with VSX loads and stores, which implies the MSR.VSX flag
# being set. It should also be noted that the ISA specification doesn't
# prohibit alignment exceptions for these instructions on page
# boundaries. Initially alignment was handled in a pure AltiVec/VMX way
# [where data is aligned programmatically, which in turn guarantees
# exception-free execution], but that turned out to hamper performance
# when vcipher instructions are interleaved. It's reckoned that the
# eventual misalignment penalties at page boundaries are on average
# lower than the additional overhead of the pure AltiVec approach.
#
# May 2016
#
# Added XTS subroutine; a 9x improvement on little-endian and a 12x
# improvement on big-endian systems were measured.
#
######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#               CBC en-/decrypt CTR     XTS
# POWER8[le]    3.96/0.72       0.74    1.1
# POWER8[be]    3.75/0.65       0.66    1.0

$flavour = shift;

if ($flavour =~ /64/) {
        $SIZE_T =8;
        $LRSAVE =2*$SIZE_T;
        $STU    ="stdu";
        $POP    ="ld";
        $PUSH   ="std";
        $UCMP   ="cmpld";
        $SHL    ="sldi";
} elsif ($flavour =~ /32/) {
        $SIZE_T =4;
        $LRSAVE =$SIZE_T;
        $STU    ="stwu";
        $POP    ="lwz";
        $PUSH   ="stw";
        $UCMP   ="cmplw";
        $SHL    ="slwi";
} else { die "nonsense $flavour"; }

$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
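# Illustrative invocation (editorial assumption, not taken from this
# file's build system): the first argument selects the ABI/endianness
# flavour handled above, the second names the output file, e.g.
#   perl aesp8-ppc.pl linux-ppc64le aesp8-ppc.s
# Output is piped through ppc-xlate.pl, which expands the ?-prefixed
# and *_u mnemonics used below for the selected flavour.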

$FRAME=8*$SIZE_T;
$prefix="aes_p8";

$sp="r1";
$vrsave="r12";

#########################################################################
{{{     # Key setup procedures                                          #
my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
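# Sketch of what Loop128/Loop192/Loop256 below compute (editorial note):
# AES key expansion derives each round-key word from the previous ones as
#   w[i] = w[i-4] ^ (i % Nk == 0 ? SubWord(RotWord(w[i-1])) ^ rcon
#                                : w[i-1])
# The vperm "rotate-n-splat" broadcasts RotWord(w[i-1]) into all four
# lanes, so ShiftRows inside vcipherlast degenerates to a no-op and
# vcipherlast yields SubWord(...) ^ rcon in every lane; the vsldoi/vxor
# chains then compute the running XOR across four words at once.
# (AES-256 additionally applies SubWord alone mid-schedule, via vsbox.)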

$code.=<<___;
.machine        "any"

.text

.align  7
rcon:
.long   0x01000000, 0x01000000, 0x01000000, 0x01000000  ?rev
.long   0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000  ?rev
.long   0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c  ?rev
.long   0,0,0,0                                         ?asis
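# Lconsts returns the run-time address of rcon: bcl 20,31 to the very
# next instruction loads the link register with the current address,
# and the -0x48 displacement reaches back to the table above. This
# keeps the code position-independent. (Editorial comment.)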
Lconsts:
        mflr    r0
        bcl     20,31,\$+4
        mflr    $ptr                            # distance between . and rcon
        addi    $ptr,$ptr,-0x48
        mtlr    r0
        blr
        .long   0
        .byte   0,12,0x14,0,0,0,0,0
.asciz  "AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"

.globl  .${prefix}_set_encrypt_key
Lset_encrypt_key:
        mflr            r11
        $PUSH           r11,$LRSAVE($sp)

        li              $ptr,-1
        ${UCMP}i        $inp,0
        beq-            Lenc_key_abort          # if ($inp==0) return -1;
        ${UCMP}i        $out,0
        beq-            Lenc_key_abort          # if ($out==0) return -1;
        li              $ptr,-2
        cmpwi           $bits,128
        blt-            Lenc_key_abort
        cmpwi           $bits,256
        bgt-            Lenc_key_abort
        andi.           r0,$bits,0x3f
        bne-            Lenc_key_abort

        lis             r0,0xfff0
        mfspr           $vrsave,256
        mtspr           256,r0

        bl              Lconsts
        mtlr            r11

        neg             r9,$inp
        lvx             $in0,0,$inp
        addi            $inp,$inp,15            # 15 is not typo
        lvsr            $key,0,r9               # borrow $key
        li              r8,0x20
        cmpwi           $bits,192
        lvx             $in1,0,$inp
        le?vspltisb     $mask,0x0f              # borrow $mask
        lvx             $rcon,0,$ptr
        le?vxor         $key,$key,$mask         # adjust for byte swap
        lvx             $mask,r8,$ptr
        addi            $ptr,$ptr,0x10
        vperm           $in0,$in0,$in1,$key     # align [and byte swap in LE]
        li              $cnt,8
        vxor            $zero,$zero,$zero
        mtctr           $cnt

        ?lvsr           $outperm,0,$out
        vspltisb        $outmask,-1
        lvx             $outhead,0,$out
        ?vperm          $outmask,$zero,$outmask,$outperm
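# The four registers just set up implement a streaming unaligned store
# (editorial comment): each 16-byte result is rotated into output
# alignment, merged with the previously stored tail via vsel, and
# flushed; the final partial quadword is patched into memory at Ldone.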

        blt             Loop128
        addi            $inp,$inp,8
        beq             L192
        addi            $inp,$inp,8
        b               L256

.align  4
Loop128:
        vperm           $key,$in0,$in0,$mask    # rotate-n-splat
        vsldoi          $tmp,$zero,$in0,12      # >>32
         vperm          $outtail,$in0,$in0,$outperm     # rotate
         vsel           $stage,$outhead,$outtail,$outmask
         vmr            $outhead,$outtail
        vcipherlast     $key,$key,$rcon
         stvx           $stage,0,$out
         addi           $out,$out,16

        vxor            $in0,$in0,$tmp
        vsldoi          $tmp,$zero,$tmp,12      # >>32
        vxor            $in0,$in0,$tmp
        vsldoi          $tmp,$zero,$tmp,12      # >>32
        vxor            $in0,$in0,$tmp
         vadduwm        $rcon,$rcon,$rcon
        vxor            $in0,$in0,$key
        bdnz            Loop128

        lvx             $rcon,0,$ptr            # last two round keys

        vperm           $key,$in0,$in0,$mask    # rotate-n-splat
        vsldoi          $tmp,$zero,$in0,12      # >>32
         vperm          $outtail,$in0,$in0,$outperm     # rotate
         vsel           $stage,$outhead,$outtail,$outmask
         vmr            $outhead,$outtail
        vcipherlast     $key,$key,$rcon
         stvx           $stage,0,$out
         addi           $out,$out,16

        vxor            $in0,$in0,$tmp
        vsldoi          $tmp,$zero,$tmp,12      # >>32
        vxor            $in0,$in0,$tmp
        vsldoi          $tmp,$zero,$tmp,12      # >>32
        vxor            $in0,$in0,$tmp
         vadduwm        $rcon,$rcon,$rcon
        vxor            $in0,$in0,$key

        vperm           $key,$in0,$in0,$mask    # rotate-n-splat
        vsldoi          $tmp,$zero,$in0,12      # >>32
         vperm          $outtail,$in0,$in0,$outperm     # rotate
         vsel           $stage,$outhead,$outtail,$outmask
         vmr            $outhead,$outtail
        vcipherlast     $key,$key,$rcon
         stvx           $stage,0,$out
         addi           $out,$out,16

        vxor            $in0,$in0,$tmp
        vsldoi          $tmp,$zero,$tmp,12      # >>32
        vxor            $in0,$in0,$tmp
        vsldoi          $tmp,$zero,$tmp,12      # >>32
        vxor            $in0,$in0,$tmp
        vxor            $in0,$in0,$key
         vperm          $outtail,$in0,$in0,$outperm     # rotate
         vsel           $stage,$outhead,$outtail,$outmask
         vmr            $outhead,$outtail
         stvx           $stage,0,$out

        addi            $inp,$out,15            # 15 is not typo
        addi            $out,$out,0x50

        li              $rounds,10
        b               Ldone

.align  4
L192:
        lvx             $tmp,0,$inp
        li              $cnt,4
         vperm          $outtail,$in0,$in0,$outperm     # rotate
         vsel           $stage,$outhead,$outtail,$outmask
         vmr            $outhead,$outtail
         stvx           $stage,0,$out
         addi           $out,$out,16
        vperm           $in1,$in1,$tmp,$key     # align [and byte swap in LE]
        vspltisb        $key,8                  # borrow $key
        mtctr           $cnt
        vsububm         $mask,$mask,$key        # adjust the mask

Loop192:
        vperm           $key,$in1,$in1,$mask    # rotate-n-splat
        vsldoi          $tmp,$zero,$in0,12      # >>32
        vcipherlast     $key,$key,$rcon

        vxor            $in0,$in0,$tmp
        vsldoi          $tmp,$zero,$tmp,12      # >>32
        vxor            $in0,$in0,$tmp
        vsldoi          $tmp,$zero,$tmp,12      # >>32
        vxor            $in0,$in0,$tmp

         vsldoi         $stage,$zero,$in1,8
        vspltw          $tmp,$in0,3
        vxor            $tmp,$tmp,$in1
        vsldoi          $in1,$zero,$in1,12      # >>32
         vadduwm        $rcon,$rcon,$rcon
        vxor            $in1,$in1,$tmp
        vxor            $in0,$in0,$key
        vxor            $in1,$in1,$key
         vsldoi         $stage,$stage,$in0,8

        vperm           $key,$in1,$in1,$mask    # rotate-n-splat
        vsldoi          $tmp,$zero,$in0,12      # >>32
         vperm          $outtail,$stage,$stage,$outperm # rotate
         vsel           $stage,$outhead,$outtail,$outmask
         vmr            $outhead,$outtail
        vcipherlast     $key,$key,$rcon
         stvx           $stage,0,$out
         addi           $out,$out,16

         vsldoi         $stage,$in0,$in1,8
        vxor            $in0,$in0,$tmp
        vsldoi          $tmp,$zero,$tmp,12      # >>32
         vperm          $outtail,$stage,$stage,$outperm # rotate
         vsel           $stage,$outhead,$outtail,$outmask
         vmr            $outhead,$outtail
        vxor            $in0,$in0,$tmp
        vsldoi          $tmp,$zero,$tmp,12      # >>32
        vxor            $in0,$in0,$tmp
         stvx           $stage,0,$out
         addi           $out,$out,16

        vspltw          $tmp,$in0,3
        vxor            $tmp,$tmp,$in1
        vsldoi          $in1,$zero,$in1,12      # >>32
         vadduwm        $rcon,$rcon,$rcon
        vxor            $in1,$in1,$tmp
        vxor            $in0,$in0,$key
        vxor            $in1,$in1,$key
         vperm          $outtail,$in0,$in0,$outperm     # rotate
         vsel           $stage,$outhead,$outtail,$outmask
         vmr            $outhead,$outtail
         stvx           $stage,0,$out
         addi           $inp,$out,15            # 15 is not typo
         addi           $out,$out,16
        bdnz            Loop192

        li              $rounds,12
        addi            $out,$out,0x20
        b               Ldone

.align  4
L256:
        lvx             $tmp,0,$inp
        li              $cnt,7
        li              $rounds,14
         vperm          $outtail,$in0,$in0,$outperm     # rotate
         vsel           $stage,$outhead,$outtail,$outmask
         vmr            $outhead,$outtail
         stvx           $stage,0,$out
         addi           $out,$out,16
        vperm           $in1,$in1,$tmp,$key     # align [and byte swap in LE]
        mtctr           $cnt

Loop256:
        vperm           $key,$in1,$in1,$mask    # rotate-n-splat
        vsldoi          $tmp,$zero,$in0,12      # >>32
         vperm          $outtail,$in1,$in1,$outperm     # rotate
         vsel           $stage,$outhead,$outtail,$outmask
         vmr            $outhead,$outtail
        vcipherlast     $key,$key,$rcon
         stvx           $stage,0,$out
         addi           $out,$out,16

        vxor            $in0,$in0,$tmp
        vsldoi          $tmp,$zero,$tmp,12      # >>32
        vxor            $in0,$in0,$tmp
        vsldoi          $tmp,$zero,$tmp,12      # >>32
        vxor            $in0,$in0,$tmp
         vadduwm        $rcon,$rcon,$rcon
        vxor            $in0,$in0,$key
         vperm          $outtail,$in0,$in0,$outperm     # rotate
         vsel           $stage,$outhead,$outtail,$outmask
         vmr            $outhead,$outtail
         stvx           $stage,0,$out
         addi           $inp,$out,15            # 15 is not typo
         addi           $out,$out,16
        bdz             Ldone

        vspltw          $key,$in0,3             # just splat
        vsldoi          $tmp,$zero,$in1,12      # >>32
        vsbox           $key,$key

        vxor            $in1,$in1,$tmp
        vsldoi          $tmp,$zero,$tmp,12      # >>32
        vxor            $in1,$in1,$tmp
        vsldoi          $tmp,$zero,$tmp,12      # >>32
        vxor            $in1,$in1,$tmp

        vxor            $in1,$in1,$key
        b               Loop256

.align  4
Ldone:
        lvx             $in1,0,$inp             # redundant in aligned case
        vsel            $in1,$outhead,$in1,$outmask
        stvx            $in1,0,$inp
        li              $ptr,0
        mtspr           256,$vrsave
        stw             $rounds,0($out)

Lenc_key_abort:
        mr              r3,$ptr
        blr
        .long           0
        .byte           0,12,0x14,1,0,0,3,0
        .long           0
.size   .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key

.globl  .${prefix}_set_decrypt_key
        $STU            $sp,-$FRAME($sp)
        mflr            r10
        $PUSH           r10,$FRAME+$LRSAVE($sp)
        bl              Lset_encrypt_key
        mtlr            r10

        cmpwi           r3,0
        bne-            Ldec_key_abort

        slwi            $cnt,$rounds,4
        subi            $inp,$out,240           # first round key
        srwi            $rounds,$rounds,1
        add             $out,$inp,$cnt          # last round key
        mtctr           $rounds
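# The loop below reverses the schedule in place: 16-byte round keys are
# swapped end for end (first with last, second with second-to-last, and
# so on), so decryption can walk the keys in the same direction as
# encryption does. (Editorial comment.)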
383
384 Ldeckey:
385         lwz             r0, 0($inp)
386         lwz             r6, 4($inp)
387         lwz             r7, 8($inp)
388         lwz             r8, 12($inp)
389         addi            $inp,$inp,16
390         lwz             r9, 0($out)
391         lwz             r10,4($out)
392         lwz             r11,8($out)
393         lwz             r12,12($out)
394         stw             r0, 0($out)
395         stw             r6, 4($out)
396         stw             r7, 8($out)
397         stw             r8, 12($out)
398         subi            $out,$out,16
399         stw             r9, -16($inp)
400         stw             r10,-12($inp)
401         stw             r11,-8($inp)
402         stw             r12,-4($inp)
403         bdnz            Ldeckey
404
405         xor             r3,r3,r3                # return value
406 Ldec_key_abort:
407         addi            $sp,$sp,$FRAME
408         blr
409         .long           0
410         .byte           0,12,4,1,0x80,0,3,0
411         .long           0
412 .size   .${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
413 ___
414 }}}
415 #########################################################################
416 {{{     # Single block en- and decrypt procedures                       #
417 sub gen_block () {
418 my $dir = shift;
419 my $n   = $dir eq "de" ? "n" : "";
420 my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));
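# gen_block emits both single-block entry points from one template
# (editorial note): for $dir eq "de" the ${n} placeholder expands the
# AES instructions to their vncipher/vncipherlast forms, otherwise to
# vcipher/vcipherlast. Rounds are consumed two at a time, which is why
# $rounds is halved in the emitted code below.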

$code.=<<___;
.globl  .${prefix}_${dir}crypt
        lwz             $rounds,240($key)
        lis             r0,0xfc00
        mfspr           $vrsave,256
        li              $idx,15                 # 15 is not typo
        mtspr           256,r0

        lvx             v0,0,$inp
        neg             r11,$out
        lvx             v1,$idx,$inp
        lvsl            v2,0,$inp               # inpperm
        le?vspltisb     v4,0x0f
        ?lvsl           v3,0,r11                # outperm
        le?vxor         v2,v2,v4
        li              $idx,16
        vperm           v0,v0,v1,v2             # align [and byte swap in LE]
        lvx             v1,0,$key
        ?lvsl           v5,0,$key               # keyperm
        srwi            $rounds,$rounds,1
        lvx             v2,$idx,$key
        addi            $idx,$idx,16
        subi            $rounds,$rounds,1
        ?vperm          v1,v1,v2,v5             # align round key

        vxor            v0,v0,v1
        lvx             v1,$idx,$key
        addi            $idx,$idx,16
        mtctr           $rounds

Loop_${dir}c:
        ?vperm          v2,v2,v1,v5
        v${n}cipher     v0,v0,v2
        lvx             v2,$idx,$key
        addi            $idx,$idx,16
        ?vperm          v1,v1,v2,v5
        v${n}cipher     v0,v0,v1
        lvx             v1,$idx,$key
        addi            $idx,$idx,16
        bdnz            Loop_${dir}c

        ?vperm          v2,v2,v1,v5
        v${n}cipher     v0,v0,v2
        lvx             v2,$idx,$key
        ?vperm          v1,v1,v2,v5
        v${n}cipherlast v0,v0,v1

        vspltisb        v2,-1
        vxor            v1,v1,v1
        li              $idx,15                 # 15 is not typo
        ?vperm          v2,v1,v2,v3             # outmask
        le?vxor         v3,v3,v4
        lvx             v1,0,$out               # outhead
        vperm           v0,v0,v0,v3             # rotate [and byte swap in LE]
        vsel            v1,v1,v0,v2
        lvx             v4,$idx,$out
        stvx            v1,0,$out
        vsel            v0,v0,v4,v2
        stvx            v0,$idx,$out

        mtspr           256,$vrsave
        blr
        .long           0
        .byte           0,12,0x14,0,0,0,3,0
        .long           0
.size   .${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
#########################################################################
{{{     # CBC en- and decrypt procedures                                #
my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout,$tmp)=             map("v$_",(0..3));
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
                                                map("v$_",(4..10));
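# CBC chaining recap (editorial): encryption computes
#   C[i] = E(P[i] ^ C[i-1])
# which is inherently serial, so the encrypt path below handles one
# block per pass; decryption computes
#   P[i] = D(C[i]) ^ C[i-1]
# where every D(C[i]) is independent, which is what the 8x-interleaved
# _aesp8_cbc_decrypt8x path further down exploits.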
$code.=<<___;
.globl  .${prefix}_cbc_encrypt
        ${UCMP}i        $len,16
        bltlr-

        cmpwi           $enc,0                  # test direction
        lis             r0,0xffe0
        mfspr           $vrsave,256
        mtspr           256,r0

        li              $idx,15
        vxor            $rndkey0,$rndkey0,$rndkey0
        le?vspltisb     $tmp,0x0f

        lvx             $ivec,0,$ivp            # load [unaligned] iv
        lvsl            $inpperm,0,$ivp
        lvx             $inptail,$idx,$ivp
        le?vxor         $inpperm,$inpperm,$tmp
        vperm           $ivec,$ivec,$inptail,$inpperm

        neg             r11,$inp
        ?lvsl           $keyperm,0,$key         # prepare for unaligned key
        lwz             $rounds,240($key)

        lvsr            $inpperm,0,r11          # prepare for unaligned load
        lvx             $inptail,0,$inp
        addi            $inp,$inp,15            # 15 is not typo
        le?vxor         $inpperm,$inpperm,$tmp

        ?lvsr           $outperm,0,$out         # prepare for unaligned store
        vspltisb        $outmask,-1
        lvx             $outhead,0,$out
        ?vperm          $outmask,$rndkey0,$outmask,$outperm
        le?vxor         $outperm,$outperm,$tmp

        srwi            $rounds,$rounds,1
        li              $idx,16
        subi            $rounds,$rounds,1
        beq             Lcbc_dec
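# Round accounting (editorial): with the round count halved and then
# decremented, the loops below run rounds/2-1 iterations of two cipher
# instructions each; together with the initial round-key XOR and the
# two-instruction tail this covers all 10/12/14 rounds.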

Lcbc_enc:
        vmr             $inout,$inptail
        lvx             $inptail,0,$inp
        addi            $inp,$inp,16
        mtctr           $rounds
        subi            $len,$len,16            # len-=16

        lvx             $rndkey0,0,$key
         vperm          $inout,$inout,$inptail,$inpperm
        lvx             $rndkey1,$idx,$key
        addi            $idx,$idx,16
        ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
        vxor            $inout,$inout,$rndkey0
        lvx             $rndkey0,$idx,$key
        addi            $idx,$idx,16
        vxor            $inout,$inout,$ivec

Loop_cbc_enc:
        ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
        vcipher         $inout,$inout,$rndkey1
        lvx             $rndkey1,$idx,$key
        addi            $idx,$idx,16
        ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
        vcipher         $inout,$inout,$rndkey0
        lvx             $rndkey0,$idx,$key
        addi            $idx,$idx,16
        bdnz            Loop_cbc_enc

        ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
        vcipher         $inout,$inout,$rndkey1
        lvx             $rndkey1,$idx,$key
        li              $idx,16
        ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
        vcipherlast     $ivec,$inout,$rndkey0
        ${UCMP}i        $len,16

        vperm           $tmp,$ivec,$ivec,$outperm
        vsel            $inout,$outhead,$tmp,$outmask
        vmr             $outhead,$tmp
        stvx            $inout,0,$out
        addi            $out,$out,16
        bge             Lcbc_enc

        b               Lcbc_done

.align  4
Lcbc_dec:
        ${UCMP}i        $len,128
        bge             _aesp8_cbc_decrypt8x
        vmr             $tmp,$inptail
        lvx             $inptail,0,$inp
        addi            $inp,$inp,16
        mtctr           $rounds
        subi            $len,$len,16            # len-=16

        lvx             $rndkey0,0,$key
         vperm          $tmp,$tmp,$inptail,$inpperm
        lvx             $rndkey1,$idx,$key
        addi            $idx,$idx,16
        ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
        vxor            $inout,$tmp,$rndkey0
        lvx             $rndkey0,$idx,$key
        addi            $idx,$idx,16

Loop_cbc_dec:
        ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
        vncipher        $inout,$inout,$rndkey1
        lvx             $rndkey1,$idx,$key
        addi            $idx,$idx,16
        ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
        vncipher        $inout,$inout,$rndkey0
        lvx             $rndkey0,$idx,$key
        addi            $idx,$idx,16
        bdnz            Loop_cbc_dec

        ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
        vncipher        $inout,$inout,$rndkey1
        lvx             $rndkey1,$idx,$key
        li              $idx,16
        ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
        vncipherlast    $inout,$inout,$rndkey0
        ${UCMP}i        $len,16

        vxor            $inout,$inout,$ivec
        vmr             $ivec,$tmp
        vperm           $tmp,$inout,$inout,$outperm
        vsel            $inout,$outhead,$tmp,$outmask
        vmr             $outhead,$tmp
        stvx            $inout,0,$out
        addi            $out,$out,16
        bge             Lcbc_dec

Lcbc_done:
        addi            $out,$out,-1
        lvx             $inout,0,$out           # redundant in aligned case
        vsel            $inout,$outhead,$inout,$outmask
        stvx            $inout,0,$out

        neg             $enc,$ivp               # write [unaligned] iv
        li              $idx,15                 # 15 is not typo
        vxor            $rndkey0,$rndkey0,$rndkey0
        vspltisb        $outmask,-1
        le?vspltisb     $tmp,0x0f
        ?lvsl           $outperm,0,$enc
        ?vperm          $outmask,$rndkey0,$outmask,$outperm
        le?vxor         $outperm,$outperm,$tmp
        lvx             $outhead,0,$ivp
        vperm           $ivec,$ivec,$ivec,$outperm
        vsel            $inout,$outhead,$ivec,$outmask
        lvx             $inptail,$idx,$ivp
        stvx            $inout,0,$ivp
        vsel            $inout,$ivec,$inptail,$outmask
        stvx            $inout,$idx,$ivp

        mtspr           256,$vrsave
        blr
        .long           0
        .byte           0,12,0x14,0,0,0,6,0
        .long           0
___
#########################################################################
{{      # Optimized CBC decrypt procedure                               #
my $key_="r11";
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
my $rndkey0="v23";      # v24-v25 rotating buffer for early round keys
                        # v26-v31 last 6 round keys
my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
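# Register pressure note (editorial): eight input blocks plus eight
# result blocks leave too few vector registers to keep a whole
# 10/12/14-entry key schedule resident. The Load_cbc_dec_key loop below
# therefore pre-permutes the early round keys and parks them on the
# stack at $sp+$FRAME, while v26-v31 hold the last six; the main loop
# streams the parked keys back two at a time through the v24/v25
# rotating pair.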

$code.=<<___;
.align  5
_aesp8_cbc_decrypt8x:
        $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
        li              r10,`$FRAME+8*16+15`
        li              r11,`$FRAME+8*16+31`
        stvx            v20,r10,$sp             # ABI says so
        addi            r10,r10,32
        stvx            v21,r11,$sp
        addi            r11,r11,32
        stvx            v22,r10,$sp
        addi            r10,r10,32
        stvx            v23,r11,$sp
        addi            r11,r11,32
        stvx            v24,r10,$sp
        addi            r10,r10,32
        stvx            v25,r11,$sp
        addi            r11,r11,32
        stvx            v26,r10,$sp
        addi            r10,r10,32
        stvx            v27,r11,$sp
        addi            r11,r11,32
        stvx            v28,r10,$sp
        addi            r10,r10,32
        stvx            v29,r11,$sp
        addi            r11,r11,32
        stvx            v30,r10,$sp
        stvx            v31,r11,$sp
        li              r0,-1
        stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
        li              $x10,0x10
        $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
        li              $x20,0x20
        $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
        li              $x30,0x30
        $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
        li              $x40,0x40
        $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
        li              $x50,0x50
        $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
        li              $x60,0x60
        $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
        li              $x70,0x70
        mtspr           256,r0

        subi            $rounds,$rounds,3       # -4 in total
        subi            $len,$len,128           # bias

        lvx             $rndkey0,$x00,$key      # load key schedule
        lvx             v30,$x10,$key
        addi            $key,$key,0x20
        lvx             v31,$x00,$key
        ?vperm          $rndkey0,$rndkey0,v30,$keyperm
        addi            $key_,$sp,$FRAME+15
        mtctr           $rounds

Load_cbc_dec_key:
        ?vperm          v24,v30,v31,$keyperm
        lvx             v30,$x10,$key
        addi            $key,$key,0x20
        stvx            v24,$x00,$key_          # off-load round[1]
        ?vperm          v25,v31,v30,$keyperm
        lvx             v31,$x00,$key
        stvx            v25,$x10,$key_          # off-load round[2]
        addi            $key_,$key_,0x20
        bdnz            Load_cbc_dec_key

        lvx             v26,$x10,$key
        ?vperm          v24,v30,v31,$keyperm
        lvx             v27,$x20,$key
        stvx            v24,$x00,$key_          # off-load round[3]
        ?vperm          v25,v31,v26,$keyperm
        lvx             v28,$x30,$key
        stvx            v25,$x10,$key_          # off-load round[4]
        addi            $key_,$sp,$FRAME+15     # rewind $key_
        ?vperm          v26,v26,v27,$keyperm
        lvx             v29,$x40,$key
        ?vperm          v27,v27,v28,$keyperm
        lvx             v30,$x50,$key
        ?vperm          v28,v28,v29,$keyperm
        lvx             v31,$x60,$key
        ?vperm          v29,v29,v30,$keyperm
        lvx             $out0,$x70,$key         # borrow $out0
        ?vperm          v30,v30,v31,$keyperm
        lvx             v24,$x00,$key_          # pre-load round[1]
        ?vperm          v31,v31,$out0,$keyperm
        lvx             v25,$x10,$key_          # pre-load round[2]

        #lvx            $inptail,0,$inp         # "caller" already did this
        #addi           $inp,$inp,15            # 15 is not typo
        subi            $inp,$inp,15            # undo "caller"

         le?li          $idx,8
        lvx_u           $in0,$x00,$inp          # load first 8 "words"
         le?lvsl        $inpperm,0,$idx
         le?vspltisb    $tmp,0x0f
        lvx_u           $in1,$x10,$inp
         le?vxor        $inpperm,$inpperm,$tmp  # transform for lvx_u/stvx_u
        lvx_u           $in2,$x20,$inp
         le?vperm       $in0,$in0,$in0,$inpperm
        lvx_u           $in3,$x30,$inp
         le?vperm       $in1,$in1,$in1,$inpperm
        lvx_u           $in4,$x40,$inp
         le?vperm       $in2,$in2,$in2,$inpperm
        vxor            $out0,$in0,$rndkey0
        lvx_u           $in5,$x50,$inp
         le?vperm       $in3,$in3,$in3,$inpperm
        vxor            $out1,$in1,$rndkey0
        lvx_u           $in6,$x60,$inp
         le?vperm       $in4,$in4,$in4,$inpperm
        vxor            $out2,$in2,$rndkey0
        lvx_u           $in7,$x70,$inp
        addi            $inp,$inp,0x80
         le?vperm       $in5,$in5,$in5,$inpperm
        vxor            $out3,$in3,$rndkey0
         le?vperm       $in6,$in6,$in6,$inpperm
        vxor            $out4,$in4,$rndkey0
         le?vperm       $in7,$in7,$in7,$inpperm
        vxor            $out5,$in5,$rndkey0
        vxor            $out6,$in6,$rndkey0
        vxor            $out7,$in7,$rndkey0

        mtctr           $rounds
        b               Loop_cbc_dec8x
.align  5
Loop_cbc_dec8x:
        vncipher        $out0,$out0,v24
        vncipher        $out1,$out1,v24
        vncipher        $out2,$out2,v24
        vncipher        $out3,$out3,v24
        vncipher        $out4,$out4,v24
        vncipher        $out5,$out5,v24
        vncipher        $out6,$out6,v24
        vncipher        $out7,$out7,v24
        lvx             v24,$x20,$key_          # round[3]
        addi            $key_,$key_,0x20

        vncipher        $out0,$out0,v25
        vncipher        $out1,$out1,v25
        vncipher        $out2,$out2,v25
        vncipher        $out3,$out3,v25
        vncipher        $out4,$out4,v25
        vncipher        $out5,$out5,v25
        vncipher        $out6,$out6,v25
        vncipher        $out7,$out7,v25
        lvx             v25,$x10,$key_          # round[4]
        bdnz            Loop_cbc_dec8x

        subic           $len,$len,128           # $len-=128
        vncipher        $out0,$out0,v24
        vncipher        $out1,$out1,v24
        vncipher        $out2,$out2,v24
        vncipher        $out3,$out3,v24
        vncipher        $out4,$out4,v24
        vncipher        $out5,$out5,v24
        vncipher        $out6,$out6,v24
        vncipher        $out7,$out7,v24

        subfe.          r0,r0,r0                # borrow?-1:0
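# (Editorial) PPC subtraction sets CA=1 when no borrow occurs, so the
# subfe. above yields 0 after a clean subtract and -1 after a borrow;
# ANDed with the now-negative length below, it becomes the displacement
# that pulls the input pointer back for the final partial batch.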
        vncipher        $out0,$out0,v25
        vncipher        $out1,$out1,v25
        vncipher        $out2,$out2,v25
        vncipher        $out3,$out3,v25
        vncipher        $out4,$out4,v25
        vncipher        $out5,$out5,v25
        vncipher        $out6,$out6,v25
        vncipher        $out7,$out7,v25

        and             r0,r0,$len
        vncipher        $out0,$out0,v26
        vncipher        $out1,$out1,v26
        vncipher        $out2,$out2,v26
        vncipher        $out3,$out3,v26
        vncipher        $out4,$out4,v26
        vncipher        $out5,$out5,v26
        vncipher        $out6,$out6,v26
        vncipher        $out7,$out7,v26

        add             $inp,$inp,r0            # $inp is adjusted so that
                                                # at exit from the loop
                                                # inX-in7 are loaded with
                                                # the last "words"
        vncipher        $out0,$out0,v27
        vncipher        $out1,$out1,v27
        vncipher        $out2,$out2,v27
        vncipher        $out3,$out3,v27
        vncipher        $out4,$out4,v27
        vncipher        $out5,$out5,v27
        vncipher        $out6,$out6,v27
        vncipher        $out7,$out7,v27

        addi            $key_,$sp,$FRAME+15     # rewind $key_
        vncipher        $out0,$out0,v28
        vncipher        $out1,$out1,v28
        vncipher        $out2,$out2,v28
        vncipher        $out3,$out3,v28
        vncipher        $out4,$out4,v28
        vncipher        $out5,$out5,v28
        vncipher        $out6,$out6,v28
        vncipher        $out7,$out7,v28
        lvx             v24,$x00,$key_          # re-pre-load round[1]

        vncipher        $out0,$out0,v29
        vncipher        $out1,$out1,v29
        vncipher        $out2,$out2,v29
        vncipher        $out3,$out3,v29
        vncipher        $out4,$out4,v29
        vncipher        $out5,$out5,v29
        vncipher        $out6,$out6,v29
        vncipher        $out7,$out7,v29
        lvx             v25,$x10,$key_          # re-pre-load round[2]

        vncipher        $out0,$out0,v30
         vxor           $ivec,$ivec,v31         # xor with last round key
        vncipher        $out1,$out1,v30
         vxor           $in0,$in0,v31
        vncipher        $out2,$out2,v30
         vxor           $in1,$in1,v31
        vncipher        $out3,$out3,v30
         vxor           $in2,$in2,v31
        vncipher        $out4,$out4,v30
         vxor           $in3,$in3,v31
        vncipher        $out5,$out5,v30
         vxor           $in4,$in4,v31
        vncipher        $out6,$out6,v30
         vxor           $in5,$in5,v31
        vncipher        $out7,$out7,v30
         vxor           $in6,$in6,v31

        vncipherlast    $out0,$out0,$ivec
        vncipherlast    $out1,$out1,$in0
         lvx_u          $in0,$x00,$inp          # load next input block
        vncipherlast    $out2,$out2,$in1
         lvx_u          $in1,$x10,$inp
        vncipherlast    $out3,$out3,$in2
         le?vperm       $in0,$in0,$in0,$inpperm
         lvx_u          $in2,$x20,$inp
        vncipherlast    $out4,$out4,$in3
         le?vperm       $in1,$in1,$in1,$inpperm
         lvx_u          $in3,$x30,$inp
        vncipherlast    $out5,$out5,$in4
         le?vperm       $in2,$in2,$in2,$inpperm
         lvx_u          $in4,$x40,$inp
        vncipherlast    $out6,$out6,$in5
         le?vperm       $in3,$in3,$in3,$inpperm
         lvx_u          $in5,$x50,$inp
        vncipherlast    $out7,$out7,$in6
         le?vperm       $in4,$in4,$in4,$inpperm
         lvx_u          $in6,$x60,$inp
        vmr             $ivec,$in7
         le?vperm       $in5,$in5,$in5,$inpperm
         lvx_u          $in7,$x70,$inp
         addi           $inp,$inp,0x80

        le?vperm        $out0,$out0,$out0,$inpperm
        le?vperm        $out1,$out1,$out1,$inpperm
        stvx_u          $out0,$x00,$out
         le?vperm       $in6,$in6,$in6,$inpperm
         vxor           $out0,$in0,$rndkey0
        le?vperm        $out2,$out2,$out2,$inpperm
        stvx_u          $out1,$x10,$out
         le?vperm       $in7,$in7,$in7,$inpperm
         vxor           $out1,$in1,$rndkey0
        le?vperm        $out3,$out3,$out3,$inpperm
        stvx_u          $out2,$x20,$out
         vxor           $out2,$in2,$rndkey0
        le?vperm        $out4,$out4,$out4,$inpperm
        stvx_u          $out3,$x30,$out
         vxor           $out3,$in3,$rndkey0
        le?vperm        $out5,$out5,$out5,$inpperm
        stvx_u          $out4,$x40,$out
         vxor           $out4,$in4,$rndkey0
        le?vperm        $out6,$out6,$out6,$inpperm
        stvx_u          $out5,$x50,$out
         vxor           $out5,$in5,$rndkey0
        le?vperm        $out7,$out7,$out7,$inpperm
        stvx_u          $out6,$x60,$out
         vxor           $out6,$in6,$rndkey0
        stvx_u          $out7,$x70,$out
        addi            $out,$out,0x80
         vxor           $out7,$in7,$rndkey0

        mtctr           $rounds
        beq             Loop_cbc_dec8x          # did $len-=128 borrow?

        addic.          $len,$len,128
        beq             Lcbc_dec8x_done
        nop
        nop

Loop_cbc_dec8x_tail:                            # up to 7 "words" tail...
        vncipher        $out1,$out1,v24
        vncipher        $out2,$out2,v24
        vncipher        $out3,$out3,v24
        vncipher        $out4,$out4,v24
        vncipher        $out5,$out5,v24
        vncipher        $out6,$out6,v24
        vncipher        $out7,$out7,v24
        lvx             v24,$x20,$key_          # round[3]
        addi            $key_,$key_,0x20

        vncipher        $out1,$out1,v25
        vncipher        $out2,$out2,v25
        vncipher        $out3,$out3,v25
        vncipher        $out4,$out4,v25
        vncipher        $out5,$out5,v25
        vncipher        $out6,$out6,v25
        vncipher        $out7,$out7,v25
        lvx             v25,$x10,$key_          # round[4]
        bdnz            Loop_cbc_dec8x_tail

        vncipher        $out1,$out1,v24
        vncipher        $out2,$out2,v24
        vncipher        $out3,$out3,v24
        vncipher        $out4,$out4,v24
        vncipher        $out5,$out5,v24
        vncipher        $out6,$out6,v24
        vncipher        $out7,$out7,v24

        vncipher        $out1,$out1,v25
        vncipher        $out2,$out2,v25
        vncipher        $out3,$out3,v25
        vncipher        $out4,$out4,v25
        vncipher        $out5,$out5,v25
        vncipher        $out6,$out6,v25
        vncipher        $out7,$out7,v25

        vncipher        $out1,$out1,v26
        vncipher        $out2,$out2,v26
        vncipher        $out3,$out3,v26
        vncipher        $out4,$out4,v26
        vncipher        $out5,$out5,v26
        vncipher        $out6,$out6,v26
        vncipher        $out7,$out7,v26

        vncipher        $out1,$out1,v27
        vncipher        $out2,$out2,v27
        vncipher        $out3,$out3,v27
        vncipher        $out4,$out4,v27
        vncipher        $out5,$out5,v27
        vncipher        $out6,$out6,v27
        vncipher        $out7,$out7,v27

        vncipher        $out1,$out1,v28
        vncipher        $out2,$out2,v28
        vncipher        $out3,$out3,v28
        vncipher        $out4,$out4,v28
        vncipher        $out5,$out5,v28
        vncipher        $out6,$out6,v28
        vncipher        $out7,$out7,v28

        vncipher        $out1,$out1,v29
        vncipher        $out2,$out2,v29
        vncipher        $out3,$out3,v29
        vncipher        $out4,$out4,v29
        vncipher        $out5,$out5,v29
        vncipher        $out6,$out6,v29
        vncipher        $out7,$out7,v29

        vncipher        $out1,$out1,v30
         vxor           $ivec,$ivec,v31         # last round key
        vncipher        $out2,$out2,v30
         vxor           $in1,$in1,v31
        vncipher        $out3,$out3,v30
         vxor           $in2,$in2,v31
        vncipher        $out4,$out4,v30
         vxor           $in3,$in3,v31
        vncipher        $out5,$out5,v30
         vxor           $in4,$in4,v31
        vncipher        $out6,$out6,v30
         vxor           $in5,$in5,v31
        vncipher        $out7,$out7,v30
         vxor           $in6,$in6,v31

        cmplwi          $len,32                 # switch($len)
        blt             Lcbc_dec8x_one
        nop
        beq             Lcbc_dec8x_two
        cmplwi          $len,64
        blt             Lcbc_dec8x_three
        nop
        beq             Lcbc_dec8x_four
        cmplwi          $len,96
        blt             Lcbc_dec8x_five
        nop
        beq             Lcbc_dec8x_six

Lcbc_dec8x_seven:
        vncipherlast    $out1,$out1,$ivec
        vncipherlast    $out2,$out2,$in1
        vncipherlast    $out3,$out3,$in2
        vncipherlast    $out4,$out4,$in3
        vncipherlast    $out5,$out5,$in4
        vncipherlast    $out6,$out6,$in5
        vncipherlast    $out7,$out7,$in6
        vmr             $ivec,$in7

        le?vperm        $out1,$out1,$out1,$inpperm
        le?vperm        $out2,$out2,$out2,$inpperm
        stvx_u          $out1,$x00,$out
        le?vperm        $out3,$out3,$out3,$inpperm
        stvx_u          $out2,$x10,$out
        le?vperm        $out4,$out4,$out4,$inpperm
        stvx_u          $out3,$x20,$out
        le?vperm        $out5,$out5,$out5,$inpperm
        stvx_u          $out4,$x30,$out
        le?vperm        $out6,$out6,$out6,$inpperm
        stvx_u          $out5,$x40,$out
        le?vperm        $out7,$out7,$out7,$inpperm
        stvx_u          $out6,$x50,$out
        stvx_u          $out7,$x60,$out
        addi            $out,$out,0x70
        b               Lcbc_dec8x_done

.align  5
Lcbc_dec8x_six:
        vncipherlast    $out2,$out2,$ivec
        vncipherlast    $out3,$out3,$in2
        vncipherlast    $out4,$out4,$in3
        vncipherlast    $out5,$out5,$in4
        vncipherlast    $out6,$out6,$in5
        vncipherlast    $out7,$out7,$in6
        vmr             $ivec,$in7

        le?vperm        $out2,$out2,$out2,$inpperm
        le?vperm        $out3,$out3,$out3,$inpperm
        stvx_u          $out2,$x00,$out
        le?vperm        $out4,$out4,$out4,$inpperm
        stvx_u          $out3,$x10,$out
        le?vperm        $out5,$out5,$out5,$inpperm
        stvx_u          $out4,$x20,$out
        le?vperm        $out6,$out6,$out6,$inpperm
        stvx_u          $out5,$x30,$out
        le?vperm        $out7,$out7,$out7,$inpperm
        stvx_u          $out6,$x40,$out
        stvx_u          $out7,$x50,$out
        addi            $out,$out,0x60
        b               Lcbc_dec8x_done

.align  5
Lcbc_dec8x_five:
        vncipherlast    $out3,$out3,$ivec
        vncipherlast    $out4,$out4,$in3
        vncipherlast    $out5,$out5,$in4
        vncipherlast    $out6,$out6,$in5
        vncipherlast    $out7,$out7,$in6
        vmr             $ivec,$in7

        le?vperm        $out3,$out3,$out3,$inpperm
        le?vperm        $out4,$out4,$out4,$inpperm
        stvx_u          $out3,$x00,$out
        le?vperm        $out5,$out5,$out5,$inpperm
        stvx_u          $out4,$x10,$out
        le?vperm        $out6,$out6,$out6,$inpperm
        stvx_u          $out5,$x20,$out
        le?vperm        $out7,$out7,$out7,$inpperm
        stvx_u          $out6,$x30,$out
        stvx_u          $out7,$x40,$out
        addi            $out,$out,0x50
        b               Lcbc_dec8x_done

.align  5
Lcbc_dec8x_four:
        vncipherlast    $out4,$out4,$ivec
        vncipherlast    $out5,$out5,$in4
        vncipherlast    $out6,$out6,$in5
        vncipherlast    $out7,$out7,$in6
        vmr             $ivec,$in7

        le?vperm        $out4,$out4,$out4,$inpperm
        le?vperm        $out5,$out5,$out5,$inpperm
        stvx_u          $out4,$x00,$out
        le?vperm        $out6,$out6,$out6,$inpperm
        stvx_u          $out5,$x10,$out
        le?vperm        $out7,$out7,$out7,$inpperm
        stvx_u          $out6,$x20,$out
        stvx_u          $out7,$x30,$out
        addi            $out,$out,0x40
        b               Lcbc_dec8x_done

.align  5
Lcbc_dec8x_three:
        vncipherlast    $out5,$out5,$ivec
        vncipherlast    $out6,$out6,$in5
        vncipherlast    $out7,$out7,$in6
        vmr             $ivec,$in7

        le?vperm        $out5,$out5,$out5,$inpperm
        le?vperm        $out6,$out6,$out6,$inpperm
        stvx_u          $out5,$x00,$out
        le?vperm        $out7,$out7,$out7,$inpperm
        stvx_u          $out6,$x10,$out
        stvx_u          $out7,$x20,$out
        addi            $out,$out,0x30
        b               Lcbc_dec8x_done

.align  5
Lcbc_dec8x_two:
        vncipherlast    $out6,$out6,$ivec
        vncipherlast    $out7,$out7,$in6
        vmr             $ivec,$in7

        le?vperm        $out6,$out6,$out6,$inpperm
        le?vperm        $out7,$out7,$out7,$inpperm
        stvx_u          $out6,$x00,$out
        stvx_u          $out7,$x10,$out
        addi            $out,$out,0x20
        b               Lcbc_dec8x_done

.align  5
Lcbc_dec8x_one:
        vncipherlast    $out7,$out7,$ivec
        vmr             $ivec,$in7

        le?vperm        $out7,$out7,$out7,$inpperm
        stvx_u          $out7,0,$out
        addi            $out,$out,0x10

Lcbc_dec8x_done:
        le?vperm        $ivec,$ivec,$ivec,$inpperm
        stvx_u          $ivec,0,$ivp            # write [unaligned] iv

        li              r10,`$FRAME+15`
        li              r11,`$FRAME+31`
        stvx            $inpperm,r10,$sp        # wipe copies of round keys
        addi            r10,r10,32
        stvx            $inpperm,r11,$sp
        addi            r11,r11,32
        stvx            $inpperm,r10,$sp
        addi            r10,r10,32
        stvx            $inpperm,r11,$sp
        addi            r11,r11,32
        stvx            $inpperm,r10,$sp
        addi            r10,r10,32
        stvx            $inpperm,r11,$sp
        addi            r11,r11,32
        stvx            $inpperm,r10,$sp
        addi            r10,r10,32
        stvx            $inpperm,r11,$sp
        addi            r11,r11,32

        mtspr           256,$vrsave
        lvx             v20,r10,$sp             # ABI says so
        addi            r10,r10,32
        lvx             v21,r11,$sp
        addi            r11,r11,32
        lvx             v22,r10,$sp
        addi            r10,r10,32
        lvx             v23,r11,$sp
        addi            r11,r11,32
        lvx             v24,r10,$sp
        addi            r10,r10,32
        lvx             v25,r11,$sp
        addi            r11,r11,32
        lvx             v26,r10,$sp
        addi            r10,r10,32
        lvx             v27,r11,$sp
        addi            r11,r11,32
        lvx             v28,r10,$sp
        addi            r10,r10,32
        lvx             v29,r11,$sp
        addi            r11,r11,32
        lvx             v30,r10,$sp
        lvx             v31,r11,$sp
        $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
        $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
        $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
        $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
        $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
        $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
        addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
        blr
        .long           0
        .byte           0,12,0x14,0,0x80,6,6,0
        .long           0
.size   .${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
___
}}      }}}

#########################################################################
{{{     # CTR procedure[s]                                              #
my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout,$tmp)=             map("v$_",(0..3));
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
                                                map("v$_",(4..11));
my $dat=$tmp;
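# CTR recap (editorial): the keystream is E(counter), XORed with the
# plaintext; the low 32-bit lane of the counter is incremented per
# block via vadduwm with the $one constant built below, matching the
# 32-bit big-endian counter convention of ctr32.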
1255
1256 $code.=<<___;
1257 .globl  .${prefix}_ctr32_encrypt_blocks
1258         ${UCMP}i        $len,1
1259         bltlr-
1260
1261         lis             r0,0xfff0
1262         mfspr           $vrsave,256
1263         mtspr           256,r0
1264
1265         li              $idx,15
1266         vxor            $rndkey0,$rndkey0,$rndkey0
1267         le?vspltisb     $tmp,0x0f
1268
1269         lvx             $ivec,0,$ivp            # load [unaligned] iv
1270         lvsl            $inpperm,0,$ivp
1271         lvx             $inptail,$idx,$ivp
1272          vspltisb       $one,1
1273         le?vxor         $inpperm,$inpperm,$tmp
1274         vperm           $ivec,$ivec,$inptail,$inpperm
1275          vsldoi         $one,$rndkey0,$one,1
1276
1277         neg             r11,$inp
1278         ?lvsl           $keyperm,0,$key         # prepare for unaligned key
1279         lwz             $rounds,240($key)
1280
1281         lvsr            $inpperm,0,r11          # prepare for unaligned load
1282         lvx             $inptail,0,$inp
1283         addi            $inp,$inp,15            # 15 is not typo
1284         le?vxor         $inpperm,$inpperm,$tmp
1285
1286         srwi            $rounds,$rounds,1
1287         li              $idx,16
1288         subi            $rounds,$rounds,1
1289
1290         ${UCMP}i        $len,8
1291         bge             _aesp8_ctr32_encrypt8x
1292
1293         ?lvsr           $outperm,0,$out         # prepare for unaligned store
1294         vspltisb        $outmask,-1
1295         lvx             $outhead,0,$out
1296         ?vperm          $outmask,$rndkey0,$outmask,$outperm
1297         le?vxor         $outperm,$outperm,$tmp
1298
1299         lvx             $rndkey0,0,$key
1300         mtctr           $rounds
1301         lvx             $rndkey1,$idx,$key
1302         addi            $idx,$idx,16
1303         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
1304         vxor            $inout,$ivec,$rndkey0
1305         lvx             $rndkey0,$idx,$key
1306         addi            $idx,$idx,16
1307         b               Loop_ctr32_enc
1308
1309 .align  5
1310 Loop_ctr32_enc:
1311         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
1312         vcipher         $inout,$inout,$rndkey1
1313         lvx             $rndkey1,$idx,$key
1314         addi            $idx,$idx,16
1315         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
1316         vcipher         $inout,$inout,$rndkey0
1317         lvx             $rndkey0,$idx,$key
1318         addi            $idx,$idx,16
1319         bdnz            Loop_ctr32_enc
1320
1321         vadduwm         $ivec,$ivec,$one
1322          vmr            $dat,$inptail
1323          lvx            $inptail,0,$inp
1324          addi           $inp,$inp,16
1325          subic.         $len,$len,1             # blocks--
1326
1327         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
1328         vcipher         $inout,$inout,$rndkey1
1329         lvx             $rndkey1,$idx,$key
1330          vperm          $dat,$dat,$inptail,$inpperm
1331          li             $idx,16
1332         ?vperm          $rndkey1,$rndkey0,$rndkey1,$keyperm
1333          lvx            $rndkey0,0,$key
1334         vxor            $dat,$dat,$rndkey1      # last round key
1335         vcipherlast     $inout,$inout,$dat
1336
1337          lvx            $rndkey1,$idx,$key
1338          addi           $idx,$idx,16
1339         vperm           $inout,$inout,$inout,$outperm
1340         vsel            $dat,$outhead,$inout,$outmask
1341          mtctr          $rounds
1342          ?vperm         $rndkey0,$rndkey0,$rndkey1,$keyperm
1343         vmr             $outhead,$inout
1344          vxor           $inout,$ivec,$rndkey0
1345          lvx            $rndkey0,$idx,$key
1346          addi           $idx,$idx,16
1347         stvx            $dat,0,$out
1348         addi            $out,$out,16
1349         bne             Loop_ctr32_enc
1350
1351         addi            $out,$out,-1
1352         lvx             $inout,0,$out           # redundant in aligned case
1353         vsel            $inout,$outhead,$inout,$outmask
1354         stvx            $inout,0,$out
1355
1356         mtspr           256,$vrsave
1357         blr
1358         .long           0
1359         .byte           0,12,0x14,0,0,0,6,0
1360         .long           0
1361 ___
1362 #########################################################################
1363 {{      # Optimized CTR procedure                                       #
1364 my $key_="r11";
1365 my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
1366 my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
1367 my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
1368 my $rndkey0="v23";      # v24-v25 rotating buffer for first round keys
1369                         # v26-v31 last 6 round keys
1370 my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
1371 my ($two,$three,$four)=($outhead,$outperm,$outmask);
1372
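# Editorial note: eight blocks are kept in flight so that no vcipher result is
# consumed by the immediately following instruction; the interleave factor is
# a latency-hiding choice for POWER8's vector crypto pipeline.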
1373 $code.=<<___;
1374 .align  5
1375 _aesp8_ctr32_encrypt8x:
1376         $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
1377         li              r10,`$FRAME+8*16+15`
1378         li              r11,`$FRAME+8*16+31`
1379         stvx            v20,r10,$sp             # ABI says so
1380         addi            r10,r10,32
1381         stvx            v21,r11,$sp
1382         addi            r11,r11,32
1383         stvx            v22,r10,$sp
1384         addi            r10,r10,32
1385         stvx            v23,r11,$sp
1386         addi            r11,r11,32
1387         stvx            v24,r10,$sp
1388         addi            r10,r10,32
1389         stvx            v25,r11,$sp
1390         addi            r11,r11,32
1391         stvx            v26,r10,$sp
1392         addi            r10,r10,32
1393         stvx            v27,r11,$sp
1394         addi            r11,r11,32
1395         stvx            v28,r10,$sp
1396         addi            r10,r10,32
1397         stvx            v29,r11,$sp
1398         addi            r11,r11,32
1399         stvx            v30,r10,$sp
1400         stvx            v31,r11,$sp
1401         li              r0,-1
1402         stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
1403         li              $x10,0x10
1404         $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1405         li              $x20,0x20
1406         $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1407         li              $x30,0x30
1408         $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1409         li              $x40,0x40
1410         $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1411         li              $x50,0x50
1412         $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1413         li              $x60,0x60
1414         $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1415         li              $x70,0x70
1416         mtspr           256,r0
1417
1418         subi            $rounds,$rounds,3       # -4 in total
1419
1420         lvx             $rndkey0,$x00,$key      # load key schedule
1421         lvx             v30,$x10,$key
1422         addi            $key,$key,0x20
1423         lvx             v31,$x00,$key
1424         ?vperm          $rndkey0,$rndkey0,v30,$keyperm
1425         addi            $key_,$sp,$FRAME+15
1426         mtctr           $rounds
1427
1428 Load_ctr32_enc_key:
1429         ?vperm          v24,v30,v31,$keyperm
1430         lvx             v30,$x10,$key
1431         addi            $key,$key,0x20
1432         stvx            v24,$x00,$key_          # off-load round[1]
1433         ?vperm          v25,v31,v30,$keyperm
1434         lvx             v31,$x00,$key
1435         stvx            v25,$x10,$key_          # off-load round[2]
1436         addi            $key_,$key_,0x20
1437         bdnz            Load_ctr32_enc_key
1438
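             # The loop above off-loaded aligned (?vperm-ed) round keys to the
             # stack.  Below, the tail of the schedule is aligned in place:
             # the last six round keys stay resident in v26-v31, while v24/v25
             # rotate through the stack copies inside the main loop.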
1439         lvx             v26,$x10,$key
1440         ?vperm          v24,v30,v31,$keyperm
1441         lvx             v27,$x20,$key
1442         stvx            v24,$x00,$key_          # off-load round[3]
1443         ?vperm          v25,v31,v26,$keyperm
1444         lvx             v28,$x30,$key
1445         stvx            v25,$x10,$key_          # off-load round[4]
1446         addi            $key_,$sp,$FRAME+15     # rewind $key_
1447         ?vperm          v26,v26,v27,$keyperm
1448         lvx             v29,$x40,$key
1449         ?vperm          v27,v27,v28,$keyperm
1450         lvx             v30,$x50,$key
1451         ?vperm          v28,v28,v29,$keyperm
1452         lvx             v31,$x60,$key
1453         ?vperm          v29,v29,v30,$keyperm
1454         lvx             $out0,$x70,$key         # borrow $out0
1455         ?vperm          v30,v30,v31,$keyperm
1456         lvx             v24,$x00,$key_          # pre-load round[1]
1457         ?vperm          v31,v31,$out0,$keyperm
1458         lvx             v25,$x10,$key_          # pre-load round[2]
1459
1460         vadduqm         $two,$one,$one
1461         subi            $inp,$inp,15            # undo "caller"
1462         $SHL            $len,$len,4
1463
1464         vadduqm         $out1,$ivec,$one        # counter values ...
1465         vadduqm         $out2,$ivec,$two
1466         vxor            $out0,$ivec,$rndkey0    # ... xored with rndkey[0]
1467          le?li          $idx,8
1468         vadduqm         $out3,$out1,$two
1469         vxor            $out1,$out1,$rndkey0
1470          le?lvsl        $inpperm,0,$idx
1471         vadduqm         $out4,$out2,$two
1472         vxor            $out2,$out2,$rndkey0
1473          le?vspltisb    $tmp,0x0f
1474         vadduqm         $out5,$out3,$two
1475         vxor            $out3,$out3,$rndkey0
1476          le?vxor        $inpperm,$inpperm,$tmp  # transform for lvx_u/stvx_u
1477         vadduqm         $out6,$out4,$two
1478         vxor            $out4,$out4,$rndkey0
1479         vadduqm         $out7,$out5,$two
1480         vxor            $out5,$out5,$rndkey0
1481         vadduqm         $ivec,$out6,$two        # next counter value
1482         vxor            $out6,$out6,$rndkey0
1483         vxor            $out7,$out7,$rndkey0
1484
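             # Eight consecutive counter values have been materialized and
             # xored with round key 0, folding the initial AddRoundKey into
             # counter setup; out0-out7 now hold round-0 state for the next
             # eight blocks.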
1485         mtctr           $rounds
1486         b               Loop_ctr32_enc8x
1487 .align  5
1488 Loop_ctr32_enc8x:
1489         vcipher         $out0,$out0,v24
1490         vcipher         $out1,$out1,v24
1491         vcipher         $out2,$out2,v24
1492         vcipher         $out3,$out3,v24
1493         vcipher         $out4,$out4,v24
1494         vcipher         $out5,$out5,v24
1495         vcipher         $out6,$out6,v24
1496         vcipher         $out7,$out7,v24
1497 Loop_ctr32_enc8x_middle:
1498         lvx             v24,$x20,$key_          # round[3]
1499         addi            $key_,$key_,0x20
1500
1501         vcipher         $out0,$out0,v25
1502         vcipher         $out1,$out1,v25
1503         vcipher         $out2,$out2,v25
1504         vcipher         $out3,$out3,v25
1505         vcipher         $out4,$out4,v25
1506         vcipher         $out5,$out5,v25
1507         vcipher         $out6,$out6,v25
1508         vcipher         $out7,$out7,v25
1509         lvx             v25,$x10,$key_          # round[4]
1510         bdnz            Loop_ctr32_enc8x
1511
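             # Tail bookkeeping: subic sets the carry from len-256 and subfe
             # turns a borrow into an all-ones mask, so after the "and" below
             # r0 is len-256 when fewer than 256 bytes remain and 0 otherwise.
             # Adding r0 to inp pulls the pointer back so the prefetch of the
             # next eight blocks re-reads the final input blocks instead of
             # running past the buffer.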
1512         subic           r11,$len,256            # $len-256, borrow $key_
1513         vcipher         $out0,$out0,v24
1514         vcipher         $out1,$out1,v24
1515         vcipher         $out2,$out2,v24
1516         vcipher         $out3,$out3,v24
1517         vcipher         $out4,$out4,v24
1518         vcipher         $out5,$out5,v24
1519         vcipher         $out6,$out6,v24
1520         vcipher         $out7,$out7,v24
1521
1522         subfe           r0,r0,r0                # borrow?-1:0
1523         vcipher         $out0,$out0,v25
1524         vcipher         $out1,$out1,v25
1525         vcipher         $out2,$out2,v25
1526         vcipher         $out3,$out3,v25
1527         vcipher         $out4,$out4,v25
1528         vcipher         $out5,$out5,v25
1529         vcipher         $out6,$out6,v25
1530         vcipher         $out7,$out7,v25
1531
1532         and             r0,r0,r11
1533         addi            $key_,$sp,$FRAME+15     # rewind $key_
1534         vcipher         $out0,$out0,v26
1535         vcipher         $out1,$out1,v26
1536         vcipher         $out2,$out2,v26
1537         vcipher         $out3,$out3,v26
1538         vcipher         $out4,$out4,v26
1539         vcipher         $out5,$out5,v26
1540         vcipher         $out6,$out6,v26
1541         vcipher         $out7,$out7,v26
1542         lvx             v24,$x00,$key_          # re-pre-load round[1]
1543
1544         subic           $len,$len,129           # $len-=129
1545         vcipher         $out0,$out0,v27
1546         addi            $len,$len,1             # $len-=128 really
1547         vcipher         $out1,$out1,v27
1548         vcipher         $out2,$out2,v27
1549         vcipher         $out3,$out3,v27
1550         vcipher         $out4,$out4,v27
1551         vcipher         $out5,$out5,v27
1552         vcipher         $out6,$out6,v27
1553         vcipher         $out7,$out7,v27
1554         lvx             v25,$x10,$key_          # re-pre-load round[2]
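             # The subic/addi pair above is effectively len-=128, but borrows
             # from len-129, so the carry (tested via subfe. below) fires when
             # 128 bytes or fewer remain, i.e. when this is the final batch.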
1555
1556         vcipher         $out0,$out0,v28
1557          lvx_u          $in0,$x00,$inp          # load input
1558         vcipher         $out1,$out1,v28
1559          lvx_u          $in1,$x10,$inp
1560         vcipher         $out2,$out2,v28
1561          lvx_u          $in2,$x20,$inp
1562         vcipher         $out3,$out3,v28
1563          lvx_u          $in3,$x30,$inp
1564         vcipher         $out4,$out4,v28
1565          lvx_u          $in4,$x40,$inp
1566         vcipher         $out5,$out5,v28
1567          lvx_u          $in5,$x50,$inp
1568         vcipher         $out6,$out6,v28
1569          lvx_u          $in6,$x60,$inp
1570         vcipher         $out7,$out7,v28
1571          lvx_u          $in7,$x70,$inp
1572          addi           $inp,$inp,0x80
1573
1574         vcipher         $out0,$out0,v29
1575          le?vperm       $in0,$in0,$in0,$inpperm
1576         vcipher         $out1,$out1,v29
1577          le?vperm       $in1,$in1,$in1,$inpperm
1578         vcipher         $out2,$out2,v29
1579          le?vperm       $in2,$in2,$in2,$inpperm
1580         vcipher         $out3,$out3,v29
1581          le?vperm       $in3,$in3,$in3,$inpperm
1582         vcipher         $out4,$out4,v29
1583          le?vperm       $in4,$in4,$in4,$inpperm
1584         vcipher         $out5,$out5,v29
1585          le?vperm       $in5,$in5,$in5,$inpperm
1586         vcipher         $out6,$out6,v29
1587          le?vperm       $in6,$in6,$in6,$inpperm
1588         vcipher         $out7,$out7,v29
1589          le?vperm       $in7,$in7,$in7,$inpperm
1590
1591         add             $inp,$inp,r0            # $inp is adjusted in such
1592                                                 # a way that at loop exit
1593                                                 # inX-in7 are loaded with
1594                                                 # the last blocks
1595         subfe.          r0,r0,r0                # borrow?-1:0
1596         vcipher         $out0,$out0,v30
1597          vxor           $in0,$in0,v31           # xor with last round key
1598         vcipher         $out1,$out1,v30
1599          vxor           $in1,$in1,v31
1600         vcipher         $out2,$out2,v30
1601          vxor           $in2,$in2,v31
1602         vcipher         $out3,$out3,v30
1603          vxor           $in3,$in3,v31
1604         vcipher         $out4,$out4,v30
1605          vxor           $in4,$in4,v31
1606         vcipher         $out5,$out5,v30
1607          vxor           $in5,$in5,v31
1608         vcipher         $out6,$out6,v30
1609          vxor           $in6,$in6,v31
1610         vcipher         $out7,$out7,v30
1611          vxor           $in7,$in7,v31
1612
1613         bne             Lctr32_enc8x_break      # did $len-129 borrow?
1614
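             # out0-out7 hold the state after the next-to-last round and
             # in0-in7 hold plaintext already xored with the last round key,
             # so each vcipherlast performs the final AES round and the CTR
             # plaintext xor in one instruction.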
1615         vcipherlast     $in0,$out0,$in0
1616         vcipherlast     $in1,$out1,$in1
1617          vadduqm        $out1,$ivec,$one        # counter values ...
1618         vcipherlast     $in2,$out2,$in2
1619          vadduqm        $out2,$ivec,$two
1620          vxor           $out0,$ivec,$rndkey0    # ... xored with rndkey[0]
1621         vcipherlast     $in3,$out3,$in3
1622          vadduqm        $out3,$out1,$two
1623          vxor           $out1,$out1,$rndkey0
1624         vcipherlast     $in4,$out4,$in4
1625          vadduqm        $out4,$out2,$two
1626          vxor           $out2,$out2,$rndkey0
1627         vcipherlast     $in5,$out5,$in5
1628          vadduqm        $out5,$out3,$two
1629          vxor           $out3,$out3,$rndkey0
1630         vcipherlast     $in6,$out6,$in6
1631          vadduqm        $out6,$out4,$two
1632          vxor           $out4,$out4,$rndkey0
1633         vcipherlast     $in7,$out7,$in7
1634          vadduqm        $out7,$out5,$two
1635          vxor           $out5,$out5,$rndkey0
1636         le?vperm        $in0,$in0,$in0,$inpperm
1637          vadduqm        $ivec,$out6,$two        # next counter value
1638          vxor           $out6,$out6,$rndkey0
1639         le?vperm        $in1,$in1,$in1,$inpperm
1640          vxor           $out7,$out7,$rndkey0
1641         mtctr           $rounds
1642
1643          vcipher        $out0,$out0,v24
1644         stvx_u          $in0,$x00,$out
1645         le?vperm        $in2,$in2,$in2,$inpperm
1646          vcipher        $out1,$out1,v24
1647         stvx_u          $in1,$x10,$out
1648         le?vperm        $in3,$in3,$in3,$inpperm
1649          vcipher        $out2,$out2,v24
1650         stvx_u          $in2,$x20,$out
1651         le?vperm        $in4,$in4,$in4,$inpperm
1652          vcipher        $out3,$out3,v24
1653         stvx_u          $in3,$x30,$out
1654         le?vperm        $in5,$in5,$in5,$inpperm
1655          vcipher        $out4,$out4,v24
1656         stvx_u          $in4,$x40,$out
1657         le?vperm        $in6,$in6,$in6,$inpperm
1658          vcipher        $out5,$out5,v24
1659         stvx_u          $in5,$x50,$out
1660         le?vperm        $in7,$in7,$in7,$inpperm
1661          vcipher        $out6,$out6,v24
1662         stvx_u          $in6,$x60,$out
1663          vcipher        $out7,$out7,v24
1664         stvx_u          $in7,$x70,$out
1665         addi            $out,$out,0x80
1666
1667         b               Loop_ctr32_enc8x_middle
1668
1669 .align  5
1670 Lctr32_enc8x_break:
1671         cmpwi           $len,-0x60
1672         blt             Lctr32_enc8x_one
1673         nop
1674         beq             Lctr32_enc8x_two
1675         cmpwi           $len,-0x40
1676         blt             Lctr32_enc8x_three
1677         nop
1678         beq             Lctr32_enc8x_four
1679         cmpwi           $len,-0x20
1680         blt             Lctr32_enc8x_five
1681         nop
1682         beq             Lctr32_enc8x_six
1683         cmpwi           $len,0x00
1684         blt             Lctr32_enc8x_seven
1685
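             # len now holds remaining-128, a value in (-128,0]: -0x70 means a
             # single block is left, 0 a full batch of eight.  The compare
             # ladder above dispatches on that encoding; the surviving inputs
             # sit in the upper in registers, in7 being the very last block.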
1686 Lctr32_enc8x_eight:
1687         vcipherlast     $out0,$out0,$in0
1688         vcipherlast     $out1,$out1,$in1
1689         vcipherlast     $out2,$out2,$in2
1690         vcipherlast     $out3,$out3,$in3
1691         vcipherlast     $out4,$out4,$in4
1692         vcipherlast     $out5,$out5,$in5
1693         vcipherlast     $out6,$out6,$in6
1694         vcipherlast     $out7,$out7,$in7
1695
1696         le?vperm        $out0,$out0,$out0,$inpperm
1697         le?vperm        $out1,$out1,$out1,$inpperm
1698         stvx_u          $out0,$x00,$out
1699         le?vperm        $out2,$out2,$out2,$inpperm
1700         stvx_u          $out1,$x10,$out
1701         le?vperm        $out3,$out3,$out3,$inpperm
1702         stvx_u          $out2,$x20,$out
1703         le?vperm        $out4,$out4,$out4,$inpperm
1704         stvx_u          $out3,$x30,$out
1705         le?vperm        $out5,$out5,$out5,$inpperm
1706         stvx_u          $out4,$x40,$out
1707         le?vperm        $out6,$out6,$out6,$inpperm
1708         stvx_u          $out5,$x50,$out
1709         le?vperm        $out7,$out7,$out7,$inpperm
1710         stvx_u          $out6,$x60,$out
1711         stvx_u          $out7,$x70,$out
1712         addi            $out,$out,0x80
1713         b               Lctr32_enc8x_done
1714
1715 .align  5
1716 Lctr32_enc8x_seven:
1717         vcipherlast     $out0,$out0,$in1
1718         vcipherlast     $out1,$out1,$in2
1719         vcipherlast     $out2,$out2,$in3
1720         vcipherlast     $out3,$out3,$in4
1721         vcipherlast     $out4,$out4,$in5
1722         vcipherlast     $out5,$out5,$in6
1723         vcipherlast     $out6,$out6,$in7
1724
1725         le?vperm        $out0,$out0,$out0,$inpperm
1726         le?vperm        $out1,$out1,$out1,$inpperm
1727         stvx_u          $out0,$x00,$out
1728         le?vperm        $out2,$out2,$out2,$inpperm
1729         stvx_u          $out1,$x10,$out
1730         le?vperm        $out3,$out3,$out3,$inpperm
1731         stvx_u          $out2,$x20,$out
1732         le?vperm        $out4,$out4,$out4,$inpperm
1733         stvx_u          $out3,$x30,$out
1734         le?vperm        $out5,$out5,$out5,$inpperm
1735         stvx_u          $out4,$x40,$out
1736         le?vperm        $out6,$out6,$out6,$inpperm
1737         stvx_u          $out5,$x50,$out
1738         stvx_u          $out6,$x60,$out
1739         addi            $out,$out,0x70
1740         b               Lctr32_enc8x_done
1741
1742 .align  5
1743 Lctr32_enc8x_six:
1744         vcipherlast     $out0,$out0,$in2
1745         vcipherlast     $out1,$out1,$in3
1746         vcipherlast     $out2,$out2,$in4
1747         vcipherlast     $out3,$out3,$in5
1748         vcipherlast     $out4,$out4,$in6
1749         vcipherlast     $out5,$out5,$in7
1750
1751         le?vperm        $out0,$out0,$out0,$inpperm
1752         le?vperm        $out1,$out1,$out1,$inpperm
1753         stvx_u          $out0,$x00,$out
1754         le?vperm        $out2,$out2,$out2,$inpperm
1755         stvx_u          $out1,$x10,$out
1756         le?vperm        $out3,$out3,$out3,$inpperm
1757         stvx_u          $out2,$x20,$out
1758         le?vperm        $out4,$out4,$out4,$inpperm
1759         stvx_u          $out3,$x30,$out
1760         le?vperm        $out5,$out5,$out5,$inpperm
1761         stvx_u          $out4,$x40,$out
1762         stvx_u          $out5,$x50,$out
1763         addi            $out,$out,0x60
1764         b               Lctr32_enc8x_done
1765
1766 .align  5
1767 Lctr32_enc8x_five:
1768         vcipherlast     $out0,$out0,$in3
1769         vcipherlast     $out1,$out1,$in4
1770         vcipherlast     $out2,$out2,$in5
1771         vcipherlast     $out3,$out3,$in6
1772         vcipherlast     $out4,$out4,$in7
1773
1774         le?vperm        $out0,$out0,$out0,$inpperm
1775         le?vperm        $out1,$out1,$out1,$inpperm
1776         stvx_u          $out0,$x00,$out
1777         le?vperm        $out2,$out2,$out2,$inpperm
1778         stvx_u          $out1,$x10,$out
1779         le?vperm        $out3,$out3,$out3,$inpperm
1780         stvx_u          $out2,$x20,$out
1781         le?vperm        $out4,$out4,$out4,$inpperm
1782         stvx_u          $out3,$x30,$out
1783         stvx_u          $out4,$x40,$out
1784         addi            $out,$out,0x50
1785         b               Lctr32_enc8x_done
1786
1787 .align  5
1788 Lctr32_enc8x_four:
1789         vcipherlast     $out0,$out0,$in4
1790         vcipherlast     $out1,$out1,$in5
1791         vcipherlast     $out2,$out2,$in6
1792         vcipherlast     $out3,$out3,$in7
1793
1794         le?vperm        $out0,$out0,$out0,$inpperm
1795         le?vperm        $out1,$out1,$out1,$inpperm
1796         stvx_u          $out0,$x00,$out
1797         le?vperm        $out2,$out2,$out2,$inpperm
1798         stvx_u          $out1,$x10,$out
1799         le?vperm        $out3,$out3,$out3,$inpperm
1800         stvx_u          $out2,$x20,$out
1801         stvx_u          $out3,$x30,$out
1802         addi            $out,$out,0x40
1803         b               Lctr32_enc8x_done
1804
1805 .align  5
1806 Lctr32_enc8x_three:
1807         vcipherlast     $out0,$out0,$in5
1808         vcipherlast     $out1,$out1,$in6
1809         vcipherlast     $out2,$out2,$in7
1810
1811         le?vperm        $out0,$out0,$out0,$inpperm
1812         le?vperm        $out1,$out1,$out1,$inpperm
1813         stvx_u          $out0,$x00,$out
1814         le?vperm        $out2,$out2,$out2,$inpperm
1815         stvx_u          $out1,$x10,$out
1816         stvx_u          $out2,$x20,$out
1817         addi            $out,$out,0x30
1818         b               Lctr32_enc8x_done
1819
1820 .align  5
1821 Lctr32_enc8x_two:
1822         vcipherlast     $out0,$out0,$in6
1823         vcipherlast     $out1,$out1,$in7
1824
1825         le?vperm        $out0,$out0,$out0,$inpperm
1826         le?vperm        $out1,$out1,$out1,$inpperm
1827         stvx_u          $out0,$x00,$out
1828         stvx_u          $out1,$x10,$out
1829         addi            $out,$out,0x20
1830         b               Lctr32_enc8x_done
1831
1832 .align  5
1833 Lctr32_enc8x_one:
1834         vcipherlast     $out0,$out0,$in7
1835
1836         le?vperm        $out0,$out0,$out0,$inpperm
1837         stvx_u          $out0,0,$out
1838         addi            $out,$out,0x10
1839
1840 Lctr32_enc8x_done:
1841         li              r10,`$FRAME+15`
1842         li              r11,`$FRAME+31`
1843         stvx            $inpperm,r10,$sp        # wipe copies of round keys
1844         addi            r10,r10,32
1845         stvx            $inpperm,r11,$sp
1846         addi            r11,r11,32
1847         stvx            $inpperm,r10,$sp
1848         addi            r10,r10,32
1849         stvx            $inpperm,r11,$sp
1850         addi            r11,r11,32
1851         stvx            $inpperm,r10,$sp
1852         addi            r10,r10,32
1853         stvx            $inpperm,r11,$sp
1854         addi            r11,r11,32
1855         stvx            $inpperm,r10,$sp
1856         addi            r10,r10,32
1857         stvx            $inpperm,r11,$sp
1858         addi            r11,r11,32
1859
1860         mtspr           256,$vrsave
1861         lvx             v20,r10,$sp             # ABI says so
1862         addi            r10,r10,32
1863         lvx             v21,r11,$sp
1864         addi            r11,r11,32
1865         lvx             v22,r10,$sp
1866         addi            r10,r10,32
1867         lvx             v23,r11,$sp
1868         addi            r11,r11,32
1869         lvx             v24,r10,$sp
1870         addi            r10,r10,32
1871         lvx             v25,r11,$sp
1872         addi            r11,r11,32
1873         lvx             v26,r10,$sp
1874         addi            r10,r10,32
1875         lvx             v27,r11,$sp
1876         addi            r11,r11,32
1877         lvx             v28,r10,$sp
1878         addi            r10,r10,32
1879         lvx             v29,r11,$sp
1880         addi            r11,r11,32
1881         lvx             v30,r10,$sp
1882         lvx             v31,r11,$sp
1883         $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1884         $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1885         $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1886         $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1887         $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1888         $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1889         addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1890         blr
1891         .long           0
1892         .byte           0,12,0x14,0,0x80,6,6,0
1893         .long           0
1894 .size   .${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
1895 ___
1896 }}      }}}
1897
1898 #########################################################################
1899 {{{     # XTS procedures                                                #
1900 # int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len,   #
1901 #                             const AES_KEY *key1, const AES_KEY *key2, #
1902 #                             [const] unsigned char iv[16]);            #
1903 # If $key2 is NULL, then a "tweak chaining" mode is engaged, in which   #
1904 # input tweak value is assumed to be encrypted already, and last tweak  #
1905 # value, one suitable for consecutive call on same chunk of data, is    #
1906 # written back to original buffer. In addition, in "tweak chaining"     #
1907 # mode only complete input blocks are processed.                        #
1908
1909 my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) =     map("r$_",(3..10));
1910 my ($rndkey0,$rndkey1,$inout) =                         map("v$_",(0..2));
1911 my ($output,$inptail,$inpperm,$leperm,$keyperm) =       map("v$_",(3..7));
1912 my ($tweak,$seven,$eighty7,$tmp,$tweak1) =              map("v$_",(8..12));
1913 my $taillen = $key2;
1914
1915    ($inp,$idx) = ($idx,$inp);                           # reassign
1916
1917 $code.=<<___;
1918 .globl  .${prefix}_xts_encrypt
1919         mr              $inp,r3                         # reassign
1920         li              r3,-1
1921         ${UCMP}i        $len,16
1922         bltlr-
1923
1924         lis             r0,0xfff0
1925         mfspr           r12,256                         # save vrsave
1926         li              r11,0
1927         mtspr           256,r0
1928
1929         vspltisb        $seven,0x07                     # 0x070707..07
1930         le?lvsl         $leperm,r11,r11
1931         le?vspltisb     $tmp,0x0f
1932         le?vxor         $leperm,$leperm,$seven
1933
1934         li              $idx,15
1935         lvx             $tweak,0,$ivp                   # load [unaligned] iv
1936         lvsl            $inpperm,0,$ivp
1937         lvx             $inptail,$idx,$ivp
1938         le?vxor         $inpperm,$inpperm,$tmp
1939         vperm           $tweak,$tweak,$inptail,$inpperm
1940
1941         neg             r11,$inp
1942         lvsr            $inpperm,0,r11                  # prepare for unaligned load
1943         lvx             $inout,0,$inp
1944         addi            $inp,$inp,15                    # 15 is not a typo
1945         le?vxor         $inpperm,$inpperm,$tmp
1946
1947         ${UCMP}i        $key2,0                         # key2==NULL?
1948         beq             Lxts_enc_no_key2
1949
1950         ?lvsl           $keyperm,0,$key2                # prepare for unaligned key
1951         lwz             $rounds,240($key2)
1952         srwi            $rounds,$rounds,1
1953         subi            $rounds,$rounds,1
1954         li              $idx,16
1955
1956         lvx             $rndkey0,0,$key2
1957         lvx             $rndkey1,$idx,$key2
1958         addi            $idx,$idx,16
1959         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
1960         vxor            $tweak,$tweak,$rndkey0
1961         lvx             $rndkey0,$idx,$key2
1962         addi            $idx,$idx,16
1963         mtctr           $rounds
1964
1965 Ltweak_xts_enc:
1966         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
1967         vcipher         $tweak,$tweak,$rndkey1
1968         lvx             $rndkey1,$idx,$key2
1969         addi            $idx,$idx,16
1970         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
1971         vcipher         $tweak,$tweak,$rndkey0
1972         lvx             $rndkey0,$idx,$key2
1973         addi            $idx,$idx,16
1974         bdnz            Ltweak_xts_enc
1975
1976         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
1977         vcipher         $tweak,$tweak,$rndkey1
1978         lvx             $rndkey1,$idx,$key2
1979         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
1980         vcipherlast     $tweak,$tweak,$rndkey0
1981
1982         li              $ivp,0                          # don't chain the tweak
1983         b               Lxts_enc
1984
1985 Lxts_enc_no_key2:
1986         li              $idx,-16
1987         and             $len,$len,$idx                  # in "tweak chaining"
1988                                                         # mode only complete
1989                                                         # blocks are processed
1990 Lxts_enc:
1991         lvx             $inptail,0,$inp
1992         addi            $inp,$inp,16
1993
1994         ?lvsl           $keyperm,0,$key1                # prepare for unaligned key
1995         lwz             $rounds,240($key1)
1996         srwi            $rounds,$rounds,1
1997         subi            $rounds,$rounds,1
1998         li              $idx,16
1999
2000         vslb            $eighty7,$seven,$seven          # 0x808080..80
2001         vor             $eighty7,$eighty7,$seven        # 0x878787..87
2002         vspltisb        $tmp,1                          # 0x010101..01
2003         vsldoi          $eighty7,$eighty7,$tmp,15       # 0x870101..01
2004
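             # The 0x870101..01 constant drives tweak updates entirely in
             # vector registers: each "next tweak value" sequence doubles the
             # tweak in GF(2^128) modulo x^128+x^7+x^2+x+1.  As an editorial
             # sketch of the five-instruction idiom:
             #     mask = (each byte of T) >> 7    (vsrab: 0x00 or 0xff)
             #     T    = (each byte of T) << 1    (vaddubm: no inter-byte carry)
             #     mask = rotate mask by one byte  (vsldoi by 15)
             #     T   ^= mask & 0x870101..01      (carries re-injected; the
             #                                      top bit wraps around as 0x87)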
2005         ${UCMP}i        $len,96
2006         bge             _aesp8_xts_encrypt6x
2007
2008         andi.           $taillen,$len,15
2009         subic           r0,$len,32
2010         subi            $taillen,$taillen,16
2011         subfe           r0,r0,r0
2012         and             r0,r0,$taillen
2013         add             $inp,$inp,r0
2014
2015         lvx             $rndkey0,0,$key1
2016         lvx             $rndkey1,$idx,$key1
2017         addi            $idx,$idx,16
2018         vperm           $inout,$inout,$inptail,$inpperm
2019         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2020         vxor            $inout,$inout,$tweak
2021         vxor            $inout,$inout,$rndkey0
2022         lvx             $rndkey0,$idx,$key1
2023         addi            $idx,$idx,16
2024         mtctr           $rounds
2025         b               Loop_xts_enc
2026
2027 .align  5
2028 Loop_xts_enc:
2029         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
2030         vcipher         $inout,$inout,$rndkey1
2031         lvx             $rndkey1,$idx,$key1
2032         addi            $idx,$idx,16
2033         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2034         vcipher         $inout,$inout,$rndkey0
2035         lvx             $rndkey0,$idx,$key1
2036         addi            $idx,$idx,16
2037         bdnz            Loop_xts_enc
2038
2039         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
2040         vcipher         $inout,$inout,$rndkey1
2041         lvx             $rndkey1,$idx,$key1
2042         li              $idx,16
2043         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2044         vxor            $rndkey0,$rndkey0,$tweak
2045         vcipherlast     $output,$inout,$rndkey0
2046
2047         le?vperm        $tmp,$output,$output,$leperm
2048         be?nop
2049         le?stvx_u       $tmp,0,$out
2050         be?stvx_u       $output,0,$out
2051         addi            $out,$out,16
2052
2053         subic.          $len,$len,16
2054         beq             Lxts_enc_done
2055
2056         vmr             $inout,$inptail
2057         lvx             $inptail,0,$inp
2058         addi            $inp,$inp,16
2059         lvx             $rndkey0,0,$key1
2060         lvx             $rndkey1,$idx,$key1
2061         addi            $idx,$idx,16
2062
2063         subic           r0,$len,32
2064         subfe           r0,r0,r0
2065         and             r0,r0,$taillen
2066         add             $inp,$inp,r0
2067
2068         vsrab           $tmp,$tweak,$seven              # next tweak value
2069         vaddubm         $tweak,$tweak,$tweak
2070         vsldoi          $tmp,$tmp,$tmp,15
2071         vand            $tmp,$tmp,$eighty7
2072         vxor            $tweak,$tweak,$tmp
2073
2074         vperm           $inout,$inout,$inptail,$inpperm
2075         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2076         vxor            $inout,$inout,$tweak
2077         vxor            $output,$output,$rndkey0        # just in case $len<16
2078         vxor            $inout,$inout,$rndkey0
2079         lvx             $rndkey0,$idx,$key1
2080         addi            $idx,$idx,16
2081
2082         mtctr           $rounds
2083         ${UCMP}i        $len,16
2084         bge             Loop_xts_enc
2085
2086         vxor            $output,$output,$tweak
2087         lvsr            $inpperm,0,$len                 # $inpperm is no longer needed
2088         vxor            $inptail,$inptail,$inptail      # $inptail is no longer needed
2089         vspltisb        $tmp,-1
2090         vperm           $inptail,$inptail,$tmp,$inpperm
2091         vsel            $inout,$inout,$output,$inptail
2092
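             # Stealing setup: lvsr with the residual length gives a
             # permutation that turns an all-ones vector into a byte mask for
             # the tail, and the vsel above merged the remaining plaintext
             # bytes with the previous ciphertext block; the merged block is
             # re-encrypted below.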
2093         subi            r11,$out,17
2094         subi            $out,$out,16
2095         mtctr           $len
2096         li              $len,16
2097 Loop_xts_enc_steal:
2098         lbzu            r0,1(r11)
2099         stb             r0,16(r11)
2100         bdnz            Loop_xts_enc_steal
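             # Ciphertext stealing: the byte loop above copied the head of the
             # last full ciphertext block into the final, partial output
             # position; the merged block assembled earlier is now encrypted
             # "one more time" and stored over the last full block.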
2101
2102         mtctr           $rounds
2103         b               Loop_xts_enc                    # one more time...
2104
2105 Lxts_enc_done:
2106         ${UCMP}i        $ivp,0
2107         beq             Lxts_enc_ret
2108
2109         vsrab           $tmp,$tweak,$seven              # next tweak value
2110         vaddubm         $tweak,$tweak,$tweak
2111         vsldoi          $tmp,$tmp,$tmp,15
2112         vand            $tmp,$tmp,$eighty7
2113         vxor            $tweak,$tweak,$tmp
2114
2115         le?vperm        $tweak,$tweak,$tweak,$leperm
2116         stvx_u          $tweak,0,$ivp
2117
2118 Lxts_enc_ret:
2119         mtspr           256,r12                         # restore vrsave
2120         li              r3,0
2121         blr
2122         .long           0
2123         .byte           0,12,0x04,0,0x80,6,6,0
2124         .long           0
2125 .size   .${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
2126
2127 .globl  .${prefix}_xts_decrypt
2128         mr              $inp,r3                         # reassign
2129         li              r3,-1
2130         ${UCMP}i        $len,16
2131         bltlr-
2132
2133         lis             r0,0xfff8
2134         mfspr           r12,256                         # save vrsave
2135         li              r11,0
2136         mtspr           256,r0
2137
2138         andi.           r0,$len,15
2139         neg             r0,r0
2140         andi.           r0,r0,16
2141         sub             $len,$len,r0
2142
2143         vspltisb        $seven,0x07                     # 0x070707..07
2144         le?lvsl         $leperm,r11,r11
2145         le?vspltisb     $tmp,0x0f
2146         le?vxor         $leperm,$leperm,$seven
2147
2148         li              $idx,15
2149         lvx             $tweak,0,$ivp                   # load [unaligned] iv
2150         lvsl            $inpperm,0,$ivp
2151         lvx             $inptail,$idx,$ivp
2152         le?vxor         $inpperm,$inpperm,$tmp
2153         vperm           $tweak,$tweak,$inptail,$inpperm
2154
2155         neg             r11,$inp
2156         lvsr            $inpperm,0,r11                  # prepare for unaligned load
2157         lvx             $inout,0,$inp
2158         addi            $inp,$inp,15                    # 15 is not a typo
2159         le?vxor         $inpperm,$inpperm,$tmp
2160
2161         ${UCMP}i        $key2,0                         # key2==NULL?
2162         beq             Lxts_dec_no_key2
2163
2164         ?lvsl           $keyperm,0,$key2                # prepare for unaligned key
2165         lwz             $rounds,240($key2)
2166         srwi            $rounds,$rounds,1
2167         subi            $rounds,$rounds,1
2168         li              $idx,16
2169
2170         lvx             $rndkey0,0,$key2
2171         lvx             $rndkey1,$idx,$key2
2172         addi            $idx,$idx,16
2173         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2174         vxor            $tweak,$tweak,$rndkey0
2175         lvx             $rndkey0,$idx,$key2
2176         addi            $idx,$idx,16
2177         mtctr           $rounds
2178
2179 Ltweak_xts_dec:
2180         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
2181         vcipher         $tweak,$tweak,$rndkey1
2182         lvx             $rndkey1,$idx,$key2
2183         addi            $idx,$idx,16
2184         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2185         vcipher         $tweak,$tweak,$rndkey0
2186         lvx             $rndkey0,$idx,$key2
2187         addi            $idx,$idx,16
2188         bdnz            Ltweak_xts_dec
2189
2190         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
2191         vcipher         $tweak,$tweak,$rndkey1
2192         lvx             $rndkey1,$idx,$key2
2193         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2194         vcipherlast     $tweak,$tweak,$rndkey0
2195
2196         li              $ivp,0                          # don't chain the tweak
2197         b               Lxts_dec
2198
2199 Lxts_dec_no_key2:
2200         neg             $idx,$len
2201         andi.           $idx,$idx,15
2202         add             $len,$len,$idx                  # in "tweak chaining"
2203                                                         # mode only complete
2204                                                         # blocks are processed
2205 Lxts_dec:
2206         lvx             $inptail,0,$inp
2207         addi            $inp,$inp,16
2208
2209         ?lvsl           $keyperm,0,$key1                # prepare for unaligned key
2210         lwz             $rounds,240($key1)
2211         srwi            $rounds,$rounds,1
2212         subi            $rounds,$rounds,1
2213         li              $idx,16
2214
2215         vslb            $eighty7,$seven,$seven          # 0x808080..80
2216         vor             $eighty7,$eighty7,$seven        # 0x878787..87
2217         vspltisb        $tmp,1                          # 0x010101..01
2218         vsldoi          $eighty7,$eighty7,$tmp,15       # 0x870101..01
2219
2220         ${UCMP}i        $len,96
2221         bge             _aesp8_xts_decrypt6x
2222
2223         lvx             $rndkey0,0,$key1
2224         lvx             $rndkey1,$idx,$key1
2225         addi            $idx,$idx,16
2226         vperm           $inout,$inout,$inptail,$inpperm
2227         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2228         vxor            $inout,$inout,$tweak
2229         vxor            $inout,$inout,$rndkey0
2230         lvx             $rndkey0,$idx,$key1
2231         addi            $idx,$idx,16
2232         mtctr           $rounds
2233
2234         ${UCMP}i        $len,16
2235         blt             Ltail_xts_dec
2236         be?b            Loop_xts_dec
2237
2238 .align  5
2239 Loop_xts_dec:
2240         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
2241         vncipher        $inout,$inout,$rndkey1
2242         lvx             $rndkey1,$idx,$key1
2243         addi            $idx,$idx,16
2244         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2245         vncipher        $inout,$inout,$rndkey0
2246         lvx             $rndkey0,$idx,$key1
2247         addi            $idx,$idx,16
2248         bdnz            Loop_xts_dec
2249
2250         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
2251         vncipher        $inout,$inout,$rndkey1
2252         lvx             $rndkey1,$idx,$key1
2253         li              $idx,16
2254         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2255         vxor            $rndkey0,$rndkey0,$tweak
2256         vncipherlast    $output,$inout,$rndkey0
2257
2258         le?vperm        $tmp,$output,$output,$leperm
2259         be?nop
2260         le?stvx_u       $tmp,0,$out
2261         be?stvx_u       $output,0,$out
2262         addi            $out,$out,16
2263
2264         subic.          $len,$len,16
2265         beq             Lxts_dec_done
2266
2267         vmr             $inout,$inptail
2268         lvx             $inptail,0,$inp
2269         addi            $inp,$inp,16
2270         lvx             $rndkey0,0,$key1
2271         lvx             $rndkey1,$idx,$key1
2272         addi            $idx,$idx,16
2273
2274         vsrab           $tmp,$tweak,$seven              # next tweak value
2275         vaddubm         $tweak,$tweak,$tweak
2276         vsldoi          $tmp,$tmp,$tmp,15
2277         vand            $tmp,$tmp,$eighty7
2278         vxor            $tweak,$tweak,$tmp
2279
2280         vperm           $inout,$inout,$inptail,$inpperm
2281         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2282         vxor            $inout,$inout,$tweak
2283         vxor            $inout,$inout,$rndkey0
2284         lvx             $rndkey0,$idx,$key1
2285         addi            $idx,$idx,16
2286
2287         mtctr           $rounds
2288         ${UCMP}i        $len,16
2289         bge             Loop_xts_dec
2290
2291 Ltail_xts_dec:
2292         vsrab           $tmp,$tweak,$seven              # next tweak value
2293         vaddubm         $tweak1,$tweak,$tweak
2294         vsldoi          $tmp,$tmp,$tmp,15
2295         vand            $tmp,$tmp,$eighty7
2296         vxor            $tweak1,$tweak1,$tmp
2297
2298         subi            $inp,$inp,16
2299         add             $inp,$inp,$len
2300
2301         vxor            $inout,$inout,$tweak            # undo current tweak
2302         vxor            $inout,$inout,$tweak1           # apply next tweak
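             # XTS stealing decrypts out of order: the last complete
             # ciphertext block must be decrypted with the *next* tweak
             # (tweak1, computed above), while the stolen partial block is
             # handled afterwards with the current tweak; the two xors above
             # swap the tweaks accordingly.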
2303
2304 Loop_xts_dec_short:
2305         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
2306         vncipher        $inout,$inout,$rndkey1
2307         lvx             $rndkey1,$idx,$key1
2308         addi            $idx,$idx,16
2309         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2310         vncipher        $inout,$inout,$rndkey0
2311         lvx             $rndkey0,$idx,$key1
2312         addi            $idx,$idx,16
2313         bdnz            Loop_xts_dec_short
2314
2315         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
2316         vncipher        $inout,$inout,$rndkey1
2317         lvx             $rndkey1,$idx,$key1
2318         li              $idx,16
2319         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2320         vxor            $rndkey0,$rndkey0,$tweak1
2321         vncipherlast    $output,$inout,$rndkey0
2322
2323         le?vperm        $tmp,$output,$output,$leperm
2324         be?nop
2325         le?stvx_u       $tmp,0,$out
2326         be?stvx_u       $output,0,$out
2327
2328         vmr             $inout,$inptail
2329         lvx             $inptail,0,$inp
2330         #addi           $inp,$inp,16
2331         lvx             $rndkey0,0,$key1
2332         lvx             $rndkey1,$idx,$key1
2333         addi            $idx,$idx,16
2334         vperm           $inout,$inout,$inptail,$inpperm
2335         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2336
2337         lvsr            $inpperm,0,$len                 # $inpperm is no longer needed
2338         vxor            $inptail,$inptail,$inptail      # $inptail is no longer needed
2339         vspltisb        $tmp,-1
2340         vperm           $inptail,$inptail,$tmp,$inpperm
2341         vsel            $inout,$inout,$output,$inptail
2342
2343         vxor            $rndkey0,$rndkey0,$tweak
2344         vxor            $inout,$inout,$rndkey0
2345         lvx             $rndkey0,$idx,$key1
2346         addi            $idx,$idx,16
2347
2348         subi            r11,$out,1
2349         mtctr           $len
2350         li              $len,16
2351 Loop_xts_dec_steal:
2352         lbzu            r0,1(r11)
2353         stb             r0,16(r11)
2354         bdnz            Loop_xts_dec_steal
2355
2356         mtctr           $rounds
2357         b               Loop_xts_dec                    # one more time...
2358
2359 Lxts_dec_done:
2360         ${UCMP}i        $ivp,0
2361         beq             Lxts_dec_ret
2362
2363         vsrab           $tmp,$tweak,$seven              # next tweak value
2364         vaddubm         $tweak,$tweak,$tweak
2365         vsldoi          $tmp,$tmp,$tmp,15
2366         vand            $tmp,$tmp,$eighty7
2367         vxor            $tweak,$tweak,$tmp
2368
2369         le?vperm        $tweak,$tweak,$tweak,$leperm
2370         stvx_u          $tweak,0,$ivp
2371
2372 Lxts_dec_ret:
2373         mtspr           256,r12                         # restore vrsave
2374         li              r3,0
2375         blr
2376         .long           0
2377         .byte           0,12,0x04,0,0x80,6,6,0
2378         .long           0
2379 .size   .${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
2380 ___
2381 #########################################################################
2382 {{      # Optimized XTS procedures                                      #
2383 my $key_=$key2;
2384 my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31));
2385     $x00=0 if ($flavour =~ /osx/);
2386 my ($in0,  $in1,  $in2,  $in3,  $in4,  $in5 )=map("v$_",(0..5));
2387 my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
2388 my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
2389 my $rndkey0="v23";      # v24-v25 rotating buffer for first round keys
2390                         # v26-v31 last 6 round keys
2391 my ($keyperm)=($out0);  # aliases with "caller", redundant assignment
2392 my $taillen=$x70;
2393
2394 $code.=<<___;
2395 .align  5
2396 _aesp8_xts_encrypt6x:
2397         $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
2398         mflr            r11
2399         li              r7,`$FRAME+8*16+15`
2400         li              r3,`$FRAME+8*16+31`
2401         $PUSH           r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
2402         stvx            v20,r7,$sp              # ABI says so
2403         addi            r7,r7,32
2404         stvx            v21,r3,$sp
2405         addi            r3,r3,32
2406         stvx            v22,r7,$sp
2407         addi            r7,r7,32
2408         stvx            v23,r3,$sp
2409         addi            r3,r3,32
2410         stvx            v24,r7,$sp
2411         addi            r7,r7,32
2412         stvx            v25,r3,$sp
2413         addi            r3,r3,32
2414         stvx            v26,r7,$sp
2415         addi            r7,r7,32
2416         stvx            v27,r3,$sp
2417         addi            r3,r3,32
2418         stvx            v28,r7,$sp
2419         addi            r7,r7,32
2420         stvx            v29,r3,$sp
2421         addi            r3,r3,32
2422         stvx            v30,r7,$sp
2423         stvx            v31,r3,$sp
2424         li              r0,-1
2425         stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
2426         li              $x10,0x10
2427         $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
2428         li              $x20,0x20
2429         $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
2430         li              $x30,0x30
2431         $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
2432         li              $x40,0x40
2433         $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
2434         li              $x50,0x50
2435         $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
2436         li              $x60,0x60
2437         $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
2438         li              $x70,0x70
2439         mtspr           256,r0
2440
2441         subi            $rounds,$rounds,3       # -4 in total
2442
2443         lvx             $rndkey0,$x00,$key1     # load key schedule
2444         lvx             v30,$x10,$key1
2445         addi            $key1,$key1,0x20
2446         lvx             v31,$x00,$key1
2447         ?vperm          $rndkey0,$rndkey0,v30,$keyperm
2448         addi            $key_,$sp,$FRAME+15
2449         mtctr           $rounds
2450
2451 Load_xts_enc_key:
2452         ?vperm          v24,v30,v31,$keyperm
2453         lvx             v30,$x10,$key1
2454         addi            $key1,$key1,0x20
2455         stvx            v24,$x00,$key_          # off-load round[1]
2456         ?vperm          v25,v31,v30,$keyperm
2457         lvx             v31,$x00,$key1
2458         stvx            v25,$x10,$key_          # off-load round[2]
2459         addi            $key_,$key_,0x20
2460         bdnz            Load_xts_enc_key
2461
2462         lvx             v26,$x10,$key1
2463         ?vperm          v24,v30,v31,$keyperm
2464         lvx             v27,$x20,$key1
2465         stvx            v24,$x00,$key_          # off-load round[3]
2466         ?vperm          v25,v31,v26,$keyperm
2467         lvx             v28,$x30,$key1
2468         stvx            v25,$x10,$key_          # off-load round[4]
2469         addi            $key_,$sp,$FRAME+15     # rewind $key_
2470         ?vperm          v26,v26,v27,$keyperm
2471         lvx             v29,$x40,$key1
2472         ?vperm          v27,v27,v28,$keyperm
2473         lvx             v30,$x50,$key1
2474         ?vperm          v28,v28,v29,$keyperm
2475         lvx             v31,$x60,$key1
2476         ?vperm          v29,v29,v30,$keyperm
2477         lvx             $twk5,$x70,$key1        # borrow $twk5
2478         ?vperm          v30,v30,v31,$keyperm
2479         lvx             v24,$x00,$key_          # pre-load round[1]
2480         ?vperm          v31,v31,$twk5,$keyperm
2481         lvx             v25,$x10,$key_          # pre-load round[2]
2482
2483          vperm          $in0,$inout,$inptail,$inpperm
2484          subi           $inp,$inp,31            # undo "caller"
2485         vxor            $twk0,$tweak,$rndkey0
2486         vsrab           $tmp,$tweak,$seven      # next tweak value
2487         vaddubm         $tweak,$tweak,$tweak
2488         vsldoi          $tmp,$tmp,$tmp,15
2489         vand            $tmp,$tmp,$eighty7
2490          vxor           $out0,$in0,$twk0
2491         vxor            $tweak,$tweak,$tmp
2492
2493          lvx_u          $in1,$x10,$inp
2494         vxor            $twk1,$tweak,$rndkey0
2495         vsrab           $tmp,$tweak,$seven      # next tweak value
2496         vaddubm         $tweak,$tweak,$tweak
2497         vsldoi          $tmp,$tmp,$tmp,15
2498          le?vperm       $in1,$in1,$in1,$leperm
2499         vand            $tmp,$tmp,$eighty7
2500          vxor           $out1,$in1,$twk1
2501         vxor            $tweak,$tweak,$tmp
2502
2503          lvx_u          $in2,$x20,$inp
2504          andi.          $taillen,$len,15
2505         vxor            $twk2,$tweak,$rndkey0
2506         vsrab           $tmp,$tweak,$seven      # next tweak value
2507         vaddubm         $tweak,$tweak,$tweak
2508         vsldoi          $tmp,$tmp,$tmp,15
2509          le?vperm       $in2,$in2,$in2,$leperm
2510         vand            $tmp,$tmp,$eighty7
2511          vxor           $out2,$in2,$twk2
2512         vxor            $tweak,$tweak,$tmp
2513
2514          lvx_u          $in3,$x30,$inp
2515          sub            $len,$len,$taillen
2516         vxor            $twk3,$tweak,$rndkey0
2517         vsrab           $tmp,$tweak,$seven      # next tweak value
2518         vaddubm         $tweak,$tweak,$tweak
2519         vsldoi          $tmp,$tmp,$tmp,15
2520          le?vperm       $in3,$in3,$in3,$leperm
2521         vand            $tmp,$tmp,$eighty7
2522          vxor           $out3,$in3,$twk3
2523         vxor            $tweak,$tweak,$tmp
2524
2525          lvx_u          $in4,$x40,$inp
2526          subi           $len,$len,0x60
2527         vxor            $twk4,$tweak,$rndkey0
2528         vsrab           $tmp,$tweak,$seven      # next tweak value
2529         vaddubm         $tweak,$tweak,$tweak
2530         vsldoi          $tmp,$tmp,$tmp,15
2531          le?vperm       $in4,$in4,$in4,$leperm
2532         vand            $tmp,$tmp,$eighty7
2533          vxor           $out4,$in4,$twk4
2534         vxor            $tweak,$tweak,$tmp
2535
2536          lvx_u          $in5,$x50,$inp
2537          addi           $inp,$inp,0x60
2538         vxor            $twk5,$tweak,$rndkey0
2539         vsrab           $tmp,$tweak,$seven      # next tweak value
2540         vaddubm         $tweak,$tweak,$tweak
2541         vsldoi          $tmp,$tmp,$tmp,15
2542          le?vperm       $in5,$in5,$in5,$leperm
2543         vand            $tmp,$tmp,$eighty7
2544          vxor           $out5,$in5,$twk5
2545         vxor            $tweak,$tweak,$tmp
2546
2547         vxor            v31,v31,$rndkey0
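             # v31 is folded with round key 0: the twk values were formed as
             # tweak^rndkey0 for the input whitening, so "twk ^ v31" in the
             # loop cancels rndkey0 and yields tweak^lastkey, letting
             # vcipherlast fuse the final round with the output whitening.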
2548         mtctr           $rounds
2549         b               Loop_xts_enc6x
2550
2551 .align  5
2552 Loop_xts_enc6x:
2553         vcipher         $out0,$out0,v24
2554         vcipher         $out1,$out1,v24
2555         vcipher         $out2,$out2,v24
2556         vcipher         $out3,$out3,v24
2557         vcipher         $out4,$out4,v24
2558         vcipher         $out5,$out5,v24
2559         lvx             v24,$x20,$key_          # round[3]
2560         addi            $key_,$key_,0x20
2561
2562         vcipher         $out0,$out0,v25
2563         vcipher         $out1,$out1,v25
2564         vcipher         $out2,$out2,v25
2565         vcipher         $out3,$out3,v25
2566         vcipher         $out4,$out4,v25
2567         vcipher         $out5,$out5,v25
2568         lvx             v25,$x10,$key_          # round[4]
2569         bdnz            Loop_xts_enc6x
2570
2571         subic           $len,$len,96            # $len-=96
2572          vxor           $in0,$twk0,v31          # xor with last round key
2573         vcipher         $out0,$out0,v24
2574         vcipher         $out1,$out1,v24
2575          vsrab          $tmp,$tweak,$seven      # next tweak value
2576          vxor           $twk0,$tweak,$rndkey0
2577          vaddubm        $tweak,$tweak,$tweak
2578         vcipher         $out2,$out2,v24
2579         vcipher         $out3,$out3,v24
2580          vsldoi         $tmp,$tmp,$tmp,15
2581         vcipher         $out4,$out4,v24
2582         vcipher         $out5,$out5,v24
2583
2584         subfe.          r0,r0,r0                # borrow?-1:0
2585          vand           $tmp,$tmp,$eighty7
2586         vcipher         $out0,$out0,v25
2587         vcipher         $out1,$out1,v25
2588          vxor           $tweak,$tweak,$tmp
2589         vcipher         $out2,$out2,v25
2590         vcipher         $out3,$out3,v25
2591          vxor           $in1,$twk1,v31
2592          vsrab          $tmp,$tweak,$seven      # next tweak value
2593          vxor           $twk1,$tweak,$rndkey0
2594         vcipher         $out4,$out4,v25
2595         vcipher         $out5,$out5,v25
2596
2597         and             r0,r0,$len
2598          vaddubm        $tweak,$tweak,$tweak
2599          vsldoi         $tmp,$tmp,$tmp,15
2600         vcipher         $out0,$out0,v26
2601         vcipher         $out1,$out1,v26
2602          vand           $tmp,$tmp,$eighty7
2603         vcipher         $out2,$out2,v26
2604         vcipher         $out3,$out3,v26
2605          vxor           $tweak,$tweak,$tmp
2606         vcipher         $out4,$out4,v26
2607         vcipher         $out5,$out5,v26
2608
2609         add             $inp,$inp,r0            # $inp is adjusted in such
2610                                                 # a way that at loop exit
2611                                                 # inX-in5 are loaded with
2612                                                 # the last blocks
2613          vxor           $in2,$twk2,v31
2614          vsrab          $tmp,$tweak,$seven      # next tweak value
2615          vxor           $twk2,$tweak,$rndkey0
2616          vaddubm        $tweak,$tweak,$tweak
2617         vcipher         $out0,$out0,v27
2618         vcipher         $out1,$out1,v27
2619          vsldoi         $tmp,$tmp,$tmp,15
2620         vcipher         $out2,$out2,v27
2621         vcipher         $out3,$out3,v27
2622          vand           $tmp,$tmp,$eighty7
2623         vcipher         $out4,$out4,v27
2624         vcipher         $out5,$out5,v27
2625
2626         addi            $key_,$sp,$FRAME+15     # rewind $key_
2627          vxor           $tweak,$tweak,$tmp
2628         vcipher         $out0,$out0,v28
2629         vcipher         $out1,$out1,v28
2630          vxor           $in3,$twk3,v31
2631          vsrab          $tmp,$tweak,$seven      # next tweak value
2632          vxor           $twk3,$tweak,$rndkey0
2633         vcipher         $out2,$out2,v28
2634         vcipher         $out3,$out3,v28
2635          vaddubm        $tweak,$tweak,$tweak
2636          vsldoi         $tmp,$tmp,$tmp,15
2637         vcipher         $out4,$out4,v28
2638         vcipher         $out5,$out5,v28
2639         lvx             v24,$x00,$key_          # re-pre-load round[1]
2640          vand           $tmp,$tmp,$eighty7
2641
2642         vcipher         $out0,$out0,v29
2643         vcipher         $out1,$out1,v29
2644          vxor           $tweak,$tweak,$tmp
2645         vcipher         $out2,$out2,v29
2646         vcipher         $out3,$out3,v29
2647          vxor           $in4,$twk4,v31
2648          vsrab          $tmp,$tweak,$seven      # next tweak value
2649          vxor           $twk4,$tweak,$rndkey0
2650         vcipher         $out4,$out4,v29
2651         vcipher         $out5,$out5,v29
2652         lvx             v25,$x10,$key_          # re-pre-load round[2]
2653          vaddubm        $tweak,$tweak,$tweak
2654          vsldoi         $tmp,$tmp,$tmp,15
2655
2656         vcipher         $out0,$out0,v30
2657         vcipher         $out1,$out1,v30
2658          vand           $tmp,$tmp,$eighty7
2659         vcipher         $out2,$out2,v30
2660         vcipher         $out3,$out3,v30
2661          vxor           $tweak,$tweak,$tmp
2662         vcipher         $out4,$out4,v30
2663         vcipher         $out5,$out5,v30
2664          vxor           $in5,$twk5,v31
2665          vsrab          $tmp,$tweak,$seven      # next tweak value
2666          vxor           $twk5,$tweak,$rndkey0
2667
2668         vcipherlast     $out0,$out0,$in0
2669          lvx_u          $in0,$x00,$inp          # load next input block
2670          vaddubm        $tweak,$tweak,$tweak
2671          vsldoi         $tmp,$tmp,$tmp,15
2672         vcipherlast     $out1,$out1,$in1
2673          lvx_u          $in1,$x10,$inp
2674         vcipherlast     $out2,$out2,$in2
2675          le?vperm       $in0,$in0,$in0,$leperm
2676          lvx_u          $in2,$x20,$inp
2677          vand           $tmp,$tmp,$eighty7
2678         vcipherlast     $out3,$out3,$in3
2679          le?vperm       $in1,$in1,$in1,$leperm
2680          lvx_u          $in3,$x30,$inp
2681         vcipherlast     $out4,$out4,$in4
2682          le?vperm       $in2,$in2,$in2,$leperm
2683          lvx_u          $in4,$x40,$inp
2684          vxor           $tweak,$tweak,$tmp
2685         vcipherlast     $tmp,$out5,$in5         # last block might be needed
2686                                                 # in stealing mode
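        # The sixth ciphertext block is both stored below and kept
        # unpermuted in $tmp: if the buffer ends in a partial block,
        # Lxts_enc6x_steal borrows its bytes as the stealing donor.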
2687          le?vperm       $in3,$in3,$in3,$leperm
2688          lvx_u          $in5,$x50,$inp
2689          addi           $inp,$inp,0x60
2690          le?vperm       $in4,$in4,$in4,$leperm
2691          le?vperm       $in5,$in5,$in5,$leperm
2692
2693         le?vperm        $out0,$out0,$out0,$leperm
2694         le?vperm        $out1,$out1,$out1,$leperm
2695         stvx_u          $out0,$x00,$out         # store output
2696          vxor           $out0,$in0,$twk0
2697         le?vperm        $out2,$out2,$out2,$leperm
2698         stvx_u          $out1,$x10,$out
2699          vxor           $out1,$in1,$twk1
2700         le?vperm        $out3,$out3,$out3,$leperm
2701         stvx_u          $out2,$x20,$out
2702          vxor           $out2,$in2,$twk2
2703         le?vperm        $out4,$out4,$out4,$leperm
2704         stvx_u          $out3,$x30,$out
2705          vxor           $out3,$in3,$twk3
2706         le?vperm        $out5,$tmp,$tmp,$leperm
2707         stvx_u          $out4,$x40,$out
2708          vxor           $out4,$in4,$twk4
2709         le?stvx_u       $out5,$x50,$out
2710         be?stvx_u       $tmp,$x50,$out
2711          vxor           $out5,$in5,$twk5
2712         addi            $out,$out,0x60
2713
2714         mtctr           $rounds
2715         beq             Loop_xts_enc6x          # did $len-=96 borrow?
2716
2717         addic.          $len,$len,0x60
2718         beq             Lxts_enc6x_zero
2719         cmpwi           $len,0x20
2720         blt             Lxts_enc6x_one
2721         nop
2722         beq             Lxts_enc6x_two
2723         cmpwi           $len,0x40
2724         blt             Lxts_enc6x_three
2725         nop
2726         beq             Lxts_enc6x_four
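        # addic. restored $len to the 0x00-0x50 bytes left over from the
        # 6x loop and the compare/branch ladder above dispatches to one
        # handler per residual block count (the nops are just padding).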
2727
2728 Lxts_enc6x_five:
2729         vxor            $out0,$in1,$twk0
2730         vxor            $out1,$in2,$twk1
2731         vxor            $out2,$in3,$twk2
2732         vxor            $out3,$in4,$twk3
2733         vxor            $out4,$in5,$twk4
2734
2735         bl              _aesp8_xts_enc5x
2736
2737         le?vperm        $out0,$out0,$out0,$leperm
2738         vmr             $twk0,$twk5             # unused tweak
2739         le?vperm        $out1,$out1,$out1,$leperm
2740         stvx_u          $out0,$x00,$out         # store output
2741         le?vperm        $out2,$out2,$out2,$leperm
2742         stvx_u          $out1,$x10,$out
2743         le?vperm        $out3,$out3,$out3,$leperm
2744         stvx_u          $out2,$x20,$out
2745         vxor            $tmp,$out4,$twk5        # last block prep for stealing
2746         le?vperm        $out4,$out4,$out4,$leperm
2747         stvx_u          $out3,$x30,$out
2748         stvx_u          $out4,$x40,$out
2749         addi            $out,$out,0x50
2750         bne             Lxts_enc6x_steal
2751         b               Lxts_enc6x_done
2752
2753 .align  4
2754 Lxts_enc6x_four:
2755         vxor            $out0,$in2,$twk0
2756         vxor            $out1,$in3,$twk1
2757         vxor            $out2,$in4,$twk2
2758         vxor            $out3,$in5,$twk3
2759         vxor            $out4,$out4,$out4
2760
2761         bl              _aesp8_xts_enc5x
2762
2763         le?vperm        $out0,$out0,$out0,$leperm
2764         vmr             $twk0,$twk4             # unused tweak
2765         le?vperm        $out1,$out1,$out1,$leperm
2766         stvx_u          $out0,$x00,$out         # store output
2767         le?vperm        $out2,$out2,$out2,$leperm
2768         stvx_u          $out1,$x10,$out
2769         vxor            $tmp,$out3,$twk4        # last block prep for stealing
2770         le?vperm        $out3,$out3,$out3,$leperm
2771         stvx_u          $out2,$x20,$out
2772         stvx_u          $out3,$x30,$out
2773         addi            $out,$out,0x40
2774         bne             Lxts_enc6x_steal
2775         b               Lxts_enc6x_done
2776
2777 .align  4
2778 Lxts_enc6x_three:
2779         vxor            $out0,$in3,$twk0
2780         vxor            $out1,$in4,$twk1
2781         vxor            $out2,$in5,$twk2
2782         vxor            $out3,$out3,$out3
2783         vxor            $out4,$out4,$out4
2784
2785         bl              _aesp8_xts_enc5x
2786
2787         le?vperm        $out0,$out0,$out0,$leperm
2788         vmr             $twk0,$twk3             # unused tweak
2789         le?vperm        $out1,$out1,$out1,$leperm
2790         stvx_u          $out0,$x00,$out         # store output
2791         vxor            $tmp,$out2,$twk3        # last block prep for stealing
2792         le?vperm        $out2,$out2,$out2,$leperm
2793         stvx_u          $out1,$x10,$out
2794         stvx_u          $out2,$x20,$out
2795         addi            $out,$out,0x30
2796         bne             Lxts_enc6x_steal
2797         b               Lxts_enc6x_done
2798
2799 .align  4
2800 Lxts_enc6x_two:
2801         vxor            $out0,$in4,$twk0
2802         vxor            $out1,$in5,$twk1
2803         vxor            $out2,$out2,$out2
2804         vxor            $out3,$out3,$out3
2805         vxor            $out4,$out4,$out4
2806
2807         bl              _aesp8_xts_enc5x
2808
2809         le?vperm        $out0,$out0,$out0,$leperm
2810         vmr             $twk0,$twk2             # unused tweak
2811         vxor            $tmp,$out1,$twk2        # last block prep for stealing
2812         le?vperm        $out1,$out1,$out1,$leperm
2813         stvx_u          $out0,$x00,$out         # store output
2814         stvx_u          $out1,$x10,$out
2815         addi            $out,$out,0x20
2816         bne             Lxts_enc6x_steal
2817         b               Lxts_enc6x_done
2818
2819 .align  4
2820 Lxts_enc6x_one:
2821         vxor            $out0,$in5,$twk0
2822         nop
2823 Loop_xts_enc1x:
2824         vcipher         $out0,$out0,v24
2825         lvx             v24,$x20,$key_          # round[3]
2826         addi            $key_,$key_,0x20
2827
2828         vcipher         $out0,$out0,v25
2829         lvx             v25,$x10,$key_          # round[4]
2830         bdnz            Loop_xts_enc1x
2831
2832         add             $inp,$inp,$taillen
2833         cmpwi           $taillen,0
2834         vcipher         $out0,$out0,v24
2835
2836         subi            $inp,$inp,16
2837         vcipher         $out0,$out0,v25
2838
2839         lvsr            $inpperm,0,$taillen
2840         vcipher         $out0,$out0,v26
2841
2842         lvx_u           $in0,0,$inp
2843         vcipher         $out0,$out0,v27
2844
2845         addi            $key_,$sp,$FRAME+15     # rewind $key_
2846         vcipher         $out0,$out0,v28
2847         lvx             v24,$x00,$key_          # re-pre-load round[1]
2848
2849         vcipher         $out0,$out0,v29
2850         lvx             v25,$x10,$key_          # re-pre-load round[2]
2851          vxor           $twk0,$twk0,v31
2852
2853         le?vperm        $in0,$in0,$in0,$leperm
2854         vcipher         $out0,$out0,v30
2855
2856         vperm           $in0,$in0,$in0,$inpperm
2857         vcipherlast     $out0,$out0,$twk0
2858
2859         vmr             $twk0,$twk1             # unused tweak
2860         vxor            $tmp,$out0,$twk1        # last block prep for stealing
2861         le?vperm        $out0,$out0,$out0,$leperm
2862         stvx_u          $out0,$x00,$out         # store output
2863         addi            $out,$out,0x10
2864         bne             Lxts_enc6x_steal
2865         b               Lxts_enc6x_done
2866
2867 .align  4
2868 Lxts_enc6x_zero:
2869         cmpwi           $taillen,0
2870         beq             Lxts_enc6x_done
2871
2872         add             $inp,$inp,$taillen
2873         subi            $inp,$inp,16
2874         lvx_u           $in0,0,$inp
2875         lvsr            $inpperm,0,$taillen     # $in5 is no more
2876         le?vperm        $in0,$in0,$in0,$leperm
2877         vperm           $in0,$in0,$in0,$inpperm
2878         vxor            $tmp,$tmp,$twk0
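        # Ciphertext stealing: $in0 holds the tail bytes rotated into
        # place, $tmp the last full ciphertext block, and both end up
        # masked with the final tweak.  The vspltisb/vperm/vsel triple
        # below builds a 16-byte block whose first $taillen bytes come
        # from the tail and whose remainder is stolen from $tmp; one
        # more trip through Loop_xts_enc1x then encrypts it in place
        # of the donor block.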
2879 Lxts_enc6x_steal:
2880         vxor            $in0,$in0,$twk0
2881         vxor            $out0,$out0,$out0
2882         vspltisb        $out1,-1
2883         vperm           $out0,$out0,$out1,$inpperm
2884         vsel            $out0,$in0,$tmp,$out0   # $tmp is last block, remember?
2885
2886         subi            r30,$out,17
2887         subi            $out,$out,16
2888         mtctr           $taillen
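        # Copy $taillen bytes of the donor ciphertext block, just stored
        # at the rewound $out, 16 bytes forward to form the shortened
        # final block; the merged block then overwrites the donor via
        # Loop_xts_enc1x.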
2889 Loop_xts_enc6x_steal:
2890         lbzu            r0,1(r30)
2891         stb             r0,16(r30)
2892         bdnz            Loop_xts_enc6x_steal
2893
2894         li              $taillen,0
2895         mtctr           $rounds
2896         b               Loop_xts_enc1x          # one more time...
2897
2898 .align  4
2899 Lxts_enc6x_done:
2900         ${UCMP}i        $ivp,0
2901         beq             Lxts_enc6x_ret
2902
2903         vxor            $tweak,$twk0,$rndkey0
2904         le?vperm        $tweak,$tweak,$tweak,$leperm
2905         stvx_u          $tweak,0,$ivp
2906
2907 Lxts_enc6x_ret:
2908         mtlr            r11
2909         li              r10,`$FRAME+15`
2910         li              r11,`$FRAME+31`
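        # Scrub the stack copies of the round keys before returning;
        # $seven merely supplies a convenient non-secret value to
        # overwrite them with.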
2911         stvx            $seven,r10,$sp          # wipe copies of round keys
2912         addi            r10,r10,32
2913         stvx            $seven,r11,$sp
2914         addi            r11,r11,32
2915         stvx            $seven,r10,$sp
2916         addi            r10,r10,32
2917         stvx            $seven,r11,$sp
2918         addi            r11,r11,32
2919         stvx            $seven,r10,$sp
2920         addi            r10,r10,32
2921         stvx            $seven,r11,$sp
2922         addi            r11,r11,32
2923         stvx            $seven,r10,$sp
2924         addi            r10,r10,32
2925         stvx            $seven,r11,$sp
2926         addi            r11,r11,32
2927
2928         mtspr           256,$vrsave
2929         lvx             v20,r10,$sp             # ABI says so
2930         addi            r10,r10,32
2931         lvx             v21,r11,$sp
2932         addi            r11,r11,32
2933         lvx             v22,r10,$sp
2934         addi            r10,r10,32
2935         lvx             v23,r11,$sp
2936         addi            r11,r11,32
2937         lvx             v24,r10,$sp
2938         addi            r10,r10,32
2939         lvx             v25,r11,$sp
2940         addi            r11,r11,32
2941         lvx             v26,r10,$sp
2942         addi            r10,r10,32
2943         lvx             v27,r11,$sp
2944         addi            r11,r11,32
2945         lvx             v28,r10,$sp
2946         addi            r10,r10,32
2947         lvx             v29,r11,$sp
2948         addi            r11,r11,32
2949         lvx             v30,r10,$sp
2950         lvx             v31,r11,$sp
2951         $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
2952         $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
2953         $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
2954         $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
2955         $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
2956         $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
2957         addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
2958         blr
2959         .long           0
2960         .byte           0,12,0x04,1,0x80,6,6,0
2961         .long           0
2962
2963 .align  5
2964 _aesp8_xts_enc5x:
2965         vcipher         $out0,$out0,v24
2966         vcipher         $out1,$out1,v24
2967         vcipher         $out2,$out2,v24
2968         vcipher         $out3,$out3,v24
2969         vcipher         $out4,$out4,v24
2970         lvx             v24,$x20,$key_          # round[3]
2971         addi            $key_,$key_,0x20
2972
2973         vcipher         $out0,$out0,v25
2974         vcipher         $out1,$out1,v25
2975         vcipher         $out2,$out2,v25
2976         vcipher         $out3,$out3,v25
2977         vcipher         $out4,$out4,v25
2978         lvx             v25,$x10,$key_          # round[4]
2979         bdnz            _aesp8_xts_enc5x
2980
2981         add             $inp,$inp,$taillen
2982         cmpwi           $taillen,0
2983         vcipher         $out0,$out0,v24
2984         vcipher         $out1,$out1,v24
2985         vcipher         $out2,$out2,v24
2986         vcipher         $out3,$out3,v24
2987         vcipher         $out4,$out4,v24
2988
2989         subi            $inp,$inp,16
2990         vcipher         $out0,$out0,v25
2991         vcipher         $out1,$out1,v25
2992         vcipher         $out2,$out2,v25
2993         vcipher         $out3,$out3,v25
2994         vcipher         $out4,$out4,v25
2995          vxor           $twk0,$twk0,v31
2996
2997         vcipher         $out0,$out0,v26
2998         lvsr            $inpperm,r0,$taillen    # $in5 is no more
2999         vcipher         $out1,$out1,v26
3000         vcipher         $out2,$out2,v26
3001         vcipher         $out3,$out3,v26
3002         vcipher         $out4,$out4,v26
3003          vxor           $in1,$twk1,v31
3004
3005         vcipher         $out0,$out0,v27
3006         lvx_u           $in0,0,$inp
3007         vcipher         $out1,$out1,v27
3008         vcipher         $out2,$out2,v27
3009         vcipher         $out3,$out3,v27
3010         vcipher         $out4,$out4,v27
3011          vxor           $in2,$twk2,v31
3012
3013         addi            $key_,$sp,$FRAME+15     # rewind $key_
3014         vcipher         $out0,$out0,v28
3015         vcipher         $out1,$out1,v28
3016         vcipher         $out2,$out2,v28
3017         vcipher         $out3,$out3,v28
3018         vcipher         $out4,$out4,v28
3019         lvx             v24,$x00,$key_          # re-pre-load round[1]
3020          vxor           $in3,$twk3,v31
3021
3022         vcipher         $out0,$out0,v29
3023         le?vperm        $in0,$in0,$in0,$leperm
3024         vcipher         $out1,$out1,v29
3025         vcipher         $out2,$out2,v29
3026         vcipher         $out3,$out3,v29
3027         vcipher         $out4,$out4,v29
3028         lvx             v25,$x10,$key_          # re-pre-load round[2]
3029          vxor           $in4,$twk4,v31
3030
3031         vcipher         $out0,$out0,v30
3032         vperm           $in0,$in0,$in0,$inpperm
3033         vcipher         $out1,$out1,v30
3034         vcipher         $out2,$out2,v30
3035         vcipher         $out3,$out3,v30
3036         vcipher         $out4,$out4,v30
3037
3038         vcipherlast     $out0,$out0,$twk0
3039         vcipherlast     $out1,$out1,$in1
3040         vcipherlast     $out2,$out2,$in2
3041         vcipherlast     $out3,$out3,$in3
3042         vcipherlast     $out4,$out4,$in4
3043         blr
3044         .long           0
3045         .byte           0,12,0x14,0,0,0,0,0
3046
3047 .align  5
3048 _aesp8_xts_decrypt6x:
3049         $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
3050         mflr            r11
3051         li              r7,`$FRAME+8*16+15`
3052         li              r3,`$FRAME+8*16+31`
3053         $PUSH           r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
3054         stvx            v20,r7,$sp              # ABI says so
3055         addi            r7,r7,32
3056         stvx            v21,r3,$sp
3057         addi            r3,r3,32
3058         stvx            v22,r7,$sp
3059         addi            r7,r7,32
3060         stvx            v23,r3,$sp
3061         addi            r3,r3,32
3062         stvx            v24,r7,$sp
3063         addi            r7,r7,32
3064         stvx            v25,r3,$sp
3065         addi            r3,r3,32
3066         stvx            v26,r7,$sp
3067         addi            r7,r7,32
3068         stvx            v27,r3,$sp
3069         addi            r3,r3,32
3070         stvx            v28,r7,$sp
3071         addi            r7,r7,32
3072         stvx            v29,r3,$sp
3073         addi            r3,r3,32
3074         stvx            v30,r7,$sp
3075         stvx            v31,r3,$sp
3076         li              r0,-1
3077         stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
3078         li              $x10,0x10
3079         $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3080         li              $x20,0x20
3081         $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3082         li              $x30,0x30
3083         $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3084         li              $x40,0x40
3085         $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3086         li              $x50,0x50
3087         $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3088         li              $x60,0x60
3089         $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3090         li              $x70,0x70
3091         mtspr           256,r0
3092
3093         subi            $rounds,$rounds,3       # -4 in total
3094
3095         lvx             $rndkey0,$x00,$key1     # load key schedule
3096         lvx             v30,$x10,$key1
3097         addi            $key1,$key1,0x20
3098         lvx             v31,$x00,$key1
3099         ?vperm          $rndkey0,$rndkey0,v30,$keyperm
3100         addi            $key_,$sp,$FRAME+15
3101         mtctr           $rounds
3102
3103 Load_xts_dec_key:
3104         ?vperm          v24,v30,v31,$keyperm
3105         lvx             v30,$x10,$key1
3106         addi            $key1,$key1,0x20
3107         stvx            v24,$x00,$key_          # off-load round[1]
3108         ?vperm          v25,v31,v30,$keyperm
3109         lvx             v31,$x00,$key1
3110         stvx            v25,$x10,$key_          # off-load round[2]
3111         addi            $key_,$key_,0x20
3112         bdnz            Load_xts_dec_key
3113
3114         lvx             v26,$x10,$key1
3115         ?vperm          v24,v30,v31,$keyperm
3116         lvx             v27,$x20,$key1
3117         stvx            v24,$x00,$key_          # off-load round[3]
3118         ?vperm          v25,v31,v26,$keyperm
3119         lvx             v28,$x30,$key1
3120         stvx            v25,$x10,$key_          # off-load round[4]
3121         addi            $key_,$sp,$FRAME+15     # rewind $key_
3122         ?vperm          v26,v26,v27,$keyperm
3123         lvx             v29,$x40,$key1
3124         ?vperm          v27,v27,v28,$keyperm
3125         lvx             v30,$x50,$key1
3126         ?vperm          v28,v28,v29,$keyperm
3127         lvx             v31,$x60,$key1
3128         ?vperm          v29,v29,v30,$keyperm
3129         lvx             $twk5,$x70,$key1        # borrow $twk5
3130         ?vperm          v30,v30,v31,$keyperm
3131         lvx             v24,$x00,$key_          # pre-load round[1]
3132         ?vperm          v31,v31,$twk5,$keyperm
3133         lvx             v25,$x10,$key_          # pre-load round[2]
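        # As in the encrypt path the key schedule has been permuted into
        # effective byte order, stashed on the stack at $key_, and
        # rounds 1-2 pre-loaded into v24-v25; the decrypt loop below
        # burns two rounds per iteration, reloading them from the stack
        # copy rather than re-permuting on every pass.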
3134
3135          vperm          $in0,$inout,$inptail,$inpperm
3136          subi           $inp,$inp,31            # undo "caller"
3137         vxor            $twk0,$tweak,$rndkey0
3138         vsrab           $tmp,$tweak,$seven      # next tweak value
3139         vaddubm         $tweak,$tweak,$tweak
3140         vsldoi          $tmp,$tmp,$tmp,15
3141         vand            $tmp,$tmp,$eighty7
3142          vxor           $out0,$in0,$twk0
3143         vxor            $tweak,$tweak,$tmp
3144
3145          lvx_u          $in1,$x10,$inp
3146         vxor            $twk1,$tweak,$rndkey0
3147         vsrab           $tmp,$tweak,$seven      # next tweak value
3148         vaddubm         $tweak,$tweak,$tweak
3149         vsldoi          $tmp,$tmp,$tmp,15
3150          le?vperm       $in1,$in1,$in1,$leperm
3151         vand            $tmp,$tmp,$eighty7
3152          vxor           $out1,$in1,$twk1
3153         vxor            $tweak,$tweak,$tmp
3154
3155          lvx_u          $in2,$x20,$inp
3156          andi.          $taillen,$len,15
3157         vxor            $twk2,$tweak,$rndkey0
3158         vsrab           $tmp,$tweak,$seven      # next tweak value
3159         vaddubm         $tweak,$tweak,$tweak
3160         vsldoi          $tmp,$tmp,$tmp,15
3161          le?vperm       $in2,$in2,$in2,$leperm
3162         vand            $tmp,$tmp,$eighty7
3163          vxor           $out2,$in2,$twk2
3164         vxor            $tweak,$tweak,$tmp
3165
3166          lvx_u          $in3,$x30,$inp
3167          sub            $len,$len,$taillen
3168         vxor            $twk3,$tweak,$rndkey0
3169         vsrab           $tmp,$tweak,$seven      # next tweak value
3170         vaddubm         $tweak,$tweak,$tweak
3171         vsldoi          $tmp,$tmp,$tmp,15
3172          le?vperm       $in3,$in3,$in3,$leperm
3173         vand            $tmp,$tmp,$eighty7
3174          vxor           $out3,$in3,$twk3
3175         vxor            $tweak,$tweak,$tmp
3176
3177          lvx_u          $in4,$x40,$inp
3178          subi           $len,$len,0x60
3179         vxor            $twk4,$tweak,$rndkey0
3180         vsrab           $tmp,$tweak,$seven      # next tweak value
3181         vaddubm         $tweak,$tweak,$tweak
3182         vsldoi          $tmp,$tmp,$tmp,15
3183          le?vperm       $in4,$in4,$in4,$leperm
3184         vand            $tmp,$tmp,$eighty7
3185          vxor           $out4,$in4,$twk4
3186         vxor            $tweak,$tweak,$tmp
3187
3188          lvx_u          $in5,$x50,$inp
3189          addi           $inp,$inp,0x60
3190         vxor            $twk5,$tweak,$rndkey0
3191         vsrab           $tmp,$tweak,$seven      # next tweak value
3192         vaddubm         $tweak,$tweak,$tweak
3193         vsldoi          $tmp,$tmp,$tmp,15
3194          le?vperm       $in5,$in5,$in5,$leperm
3195         vand            $tmp,$tmp,$eighty7
3196          vxor           $out5,$in5,$twk5
3197         vxor            $tweak,$tweak,$tmp
3198
3199         vxor            v31,v31,$rndkey0
3200         mtctr           $rounds
3201         b               Loop_xts_dec6x
3202
3203 .align  5
3204 Loop_xts_dec6x:
3205         vncipher        $out0,$out0,v24
3206         vncipher        $out1,$out1,v24
3207         vncipher        $out2,$out2,v24
3208         vncipher        $out3,$out3,v24
3209         vncipher        $out4,$out4,v24
3210         vncipher        $out5,$out5,v24
3211         lvx             v24,$x20,$key_          # round[3]
3212         addi            $key_,$key_,0x20
3213
3214         vncipher        $out0,$out0,v25
3215         vncipher        $out1,$out1,v25
3216         vncipher        $out2,$out2,v25
3217         vncipher        $out3,$out3,v25
3218         vncipher        $out4,$out4,v25
3219         vncipher        $out5,$out5,v25
3220         lvx             v25,$x10,$key_          # round[4]
3221         bdnz            Loop_xts_dec6x
3222
3223         subic           $len,$len,96            # $len-=96
3224          vxor           $in0,$twk0,v31          # xor with last round key
3225         vncipher        $out0,$out0,v24
3226         vncipher        $out1,$out1,v24
3227          vsrab          $tmp,$tweak,$seven      # next tweak value
3228          vxor           $twk0,$tweak,$rndkey0
3229          vaddubm        $tweak,$tweak,$tweak
3230         vncipher        $out2,$out2,v24
3231         vncipher        $out3,$out3,v24
3232          vsldoi         $tmp,$tmp,$tmp,15
3233         vncipher        $out4,$out4,v24
3234         vncipher        $out5,$out5,v24
3235
3236         subfe.          r0,r0,r0                # borrow?-1:0
3237          vand           $tmp,$tmp,$eighty7
3238         vncipher        $out0,$out0,v25
3239         vncipher        $out1,$out1,v25
3240          vxor           $tweak,$tweak,$tmp
3241         vncipher        $out2,$out2,v25
3242         vncipher        $out3,$out3,v25
3243          vxor           $in1,$twk1,v31
3244          vsrab          $tmp,$tweak,$seven      # next tweak value
3245          vxor           $twk1,$tweak,$rndkey0
3246         vncipher        $out4,$out4,v25
3247         vncipher        $out5,$out5,v25
3248
3249         and             r0,r0,$len
3250          vaddubm        $tweak,$tweak,$tweak
3251          vsldoi         $tmp,$tmp,$tmp,15
3252         vncipher        $out0,$out0,v26
3253         vncipher        $out1,$out1,v26
3254          vand           $tmp,$tmp,$eighty7
3255         vncipher        $out2,$out2,v26
3256         vncipher        $out3,$out3,v26
3257          vxor           $tweak,$tweak,$tmp
3258         vncipher        $out4,$out4,v26
3259         vncipher        $out5,$out5,v26
3260
3261         add             $inp,$inp,r0            # $inp is adjusted in such
3262                                                 # a way that at exit from
3263                                                 # the loop inX-in5 are
3264                                                 # loaded with the last "words"
3265          vxor           $in2,$twk2,v31
3266          vsrab          $tmp,$tweak,$seven      # next tweak value
3267          vxor           $twk2,$tweak,$rndkey0
3268          vaddubm        $tweak,$tweak,$tweak
3269         vncipher        $out0,$out0,v27
3270         vncipher        $out1,$out1,v27
3271          vsldoi         $tmp,$tmp,$tmp,15
3272         vncipher        $out2,$out2,v27
3273         vncipher        $out3,$out3,v27
3274          vand           $tmp,$tmp,$eighty7
3275         vncipher        $out4,$out4,v27
3276         vncipher        $out5,$out5,v27
3277
3278         addi            $key_,$sp,$FRAME+15     # rewind $key_
3279          vxor           $tweak,$tweak,$tmp
3280         vncipher        $out0,$out0,v28
3281         vncipher        $out1,$out1,v28
3282          vxor           $in3,$twk3,v31
3283          vsrab          $tmp,$tweak,$seven      # next tweak value
3284          vxor           $twk3,$tweak,$rndkey0
3285         vncipher        $out2,$out2,v28
3286         vncipher        $out3,$out3,v28
3287          vaddubm        $tweak,$tweak,$tweak
3288          vsldoi         $tmp,$tmp,$tmp,15
3289         vncipher        $out4,$out4,v28
3290         vncipher        $out5,$out5,v28
3291         lvx             v24,$x00,$key_          # re-pre-load round[1]
3292          vand           $tmp,$tmp,$eighty7
3293
3294         vncipher        $out0,$out0,v29
3295         vncipher        $out1,$out1,v29
3296          vxor           $tweak,$tweak,$tmp
3297         vncipher        $out2,$out2,v29
3298         vncipher        $out3,$out3,v29
3299          vxor           $in4,$twk4,v31
3300          vsrab          $tmp,$tweak,$seven      # next tweak value
3301          vxor           $twk4,$tweak,$rndkey0
3302         vncipher        $out4,$out4,v29
3303         vncipher        $out5,$out5,v29
3304         lvx             v25,$x10,$key_          # re-pre-load round[2]
3305          vaddubm        $tweak,$tweak,$tweak
3306          vsldoi         $tmp,$tmp,$tmp,15
3307
3308         vncipher        $out0,$out0,v30
3309         vncipher        $out1,$out1,v30
3310          vand           $tmp,$tmp,$eighty7
3311         vncipher        $out2,$out2,v30
3312         vncipher        $out3,$out3,v30
3313          vxor           $tweak,$tweak,$tmp
3314         vncipher        $out4,$out4,v30
3315         vncipher        $out5,$out5,v30
3316          vxor           $in5,$twk5,v31
3317          vsrab          $tmp,$tweak,$seven      # next tweak value
3318          vxor           $twk5,$tweak,$rndkey0
3319
3320         vncipherlast    $out0,$out0,$in0
3321          lvx_u          $in0,$x00,$inp          # load next input block
3322          vaddubm        $tweak,$tweak,$tweak
3323          vsldoi         $tmp,$tmp,$tmp,15
3324         vncipherlast    $out1,$out1,$in1
3325          lvx_u          $in1,$x10,$inp
3326         vncipherlast    $out2,$out2,$in2
3327          le?vperm       $in0,$in0,$in0,$leperm
3328          lvx_u          $in2,$x20,$inp
3329          vand           $tmp,$tmp,$eighty7
3330         vncipherlast    $out3,$out3,$in3
3331          le?vperm       $in1,$in1,$in1,$leperm
3332          lvx_u          $in3,$x30,$inp
3333         vncipherlast    $out4,$out4,$in4
3334          le?vperm       $in2,$in2,$in2,$leperm
3335          lvx_u          $in4,$x40,$inp
3336          vxor           $tweak,$tweak,$tmp
3337         vncipherlast    $out5,$out5,$in5
3338          le?vperm       $in3,$in3,$in3,$leperm
3339          lvx_u          $in5,$x50,$inp
3340          addi           $inp,$inp,0x60
3341          le?vperm       $in4,$in4,$in4,$leperm
3342          le?vperm       $in5,$in5,$in5,$leperm
3343
3344         le?vperm        $out0,$out0,$out0,$leperm
3345         le?vperm        $out1,$out1,$out1,$leperm
3346         stvx_u          $out0,$x00,$out         # store output
3347          vxor           $out0,$in0,$twk0
3348         le?vperm        $out2,$out2,$out2,$leperm
3349         stvx_u          $out1,$x10,$out
3350          vxor           $out1,$in1,$twk1
3351         le?vperm        $out3,$out3,$out3,$leperm
3352         stvx_u          $out2,$x20,$out
3353          vxor           $out2,$in2,$twk2
3354         le?vperm        $out4,$out4,$out4,$leperm
3355         stvx_u          $out3,$x30,$out
3356          vxor           $out3,$in3,$twk3
3357         le?vperm        $out5,$out5,$out5,$leperm
3358         stvx_u          $out4,$x40,$out
3359          vxor           $out4,$in4,$twk4
3360         stvx_u          $out5,$x50,$out
3361          vxor           $out5,$in5,$twk5
3362         addi            $out,$out,0x60
3363
3364         mtctr           $rounds
3365         beq             Loop_xts_dec6x          # did $len-=96 borrow?
3366
3367         addic.          $len,$len,0x60
3368         beq             Lxts_dec6x_zero
3369         cmpwi           $len,0x20
3370         blt             Lxts_dec6x_one
3371         nop
3372         beq             Lxts_dec6x_two
3373         cmpwi           $len,0x40
3374         blt             Lxts_dec6x_three
3375         nop
3376         beq             Lxts_dec6x_four
3377
3378 Lxts_dec6x_five:
3379         vxor            $out0,$in1,$twk0
3380         vxor            $out1,$in2,$twk1
3381         vxor            $out2,$in3,$twk2
3382         vxor            $out3,$in4,$twk3
3383         vxor            $out4,$in5,$twk4
3384
3385         bl              _aesp8_xts_dec5x
3386
3387         le?vperm        $out0,$out0,$out0,$leperm
3388         vmr             $twk0,$twk5             # unused tweak
3389         vxor            $twk1,$tweak,$rndkey0
3390         le?vperm        $out1,$out1,$out1,$leperm
3391         stvx_u          $out0,$x00,$out         # store output
3392         vxor            $out0,$in0,$twk1
3393         le?vperm        $out2,$out2,$out2,$leperm
3394         stvx_u          $out1,$x10,$out
3395         le?vperm        $out3,$out3,$out3,$leperm
3396         stvx_u          $out2,$x20,$out
3397         le?vperm        $out4,$out4,$out4,$leperm
3398         stvx_u          $out3,$x30,$out
3399         stvx_u          $out4,$x40,$out
3400         addi            $out,$out,0x50
3401         bne             Lxts_dec6x_steal
3402         b               Lxts_dec6x_done
3403
3404 .align  4
3405 Lxts_dec6x_four:
3406         vxor            $out0,$in2,$twk0
3407         vxor            $out1,$in3,$twk1
3408         vxor            $out2,$in4,$twk2
3409         vxor            $out3,$in5,$twk3
3410         vxor            $out4,$out4,$out4
3411
3412         bl              _aesp8_xts_dec5x
3413
3414         le?vperm        $out0,$out0,$out0,$leperm
3415         vmr             $twk0,$twk4             # unused tweak
3416         vmr             $twk1,$twk5
3417         le?vperm        $out1,$out1,$out1,$leperm
3418         stvx_u          $out0,$x00,$out         # store output
3419         vxor            $out0,$in0,$twk5
3420         le?vperm        $out2,$out2,$out2,$leperm
3421         stvx_u          $out1,$x10,$out
3422         le?vperm        $out3,$out3,$out3,$leperm
3423         stvx_u          $out2,$x20,$out
3424         stvx_u          $out3,$x30,$out
3425         addi            $out,$out,0x40
3426         bne             Lxts_dec6x_steal
3427         b               Lxts_dec6x_done
3428
3429 .align  4
3430 Lxts_dec6x_three:
3431         vxor            $out0,$in3,$twk0
3432         vxor            $out1,$in4,$twk1
3433         vxor            $out2,$in5,$twk2
3434         vxor            $out3,$out3,$out3
3435         vxor            $out4,$out4,$out4
3436
3437         bl              _aesp8_xts_dec5x
3438
3439         le?vperm        $out0,$out0,$out0,$leperm
3440         vmr             $twk0,$twk3             # unused tweak
3441         vmr             $twk1,$twk4
3442         le?vperm        $out1,$out1,$out1,$leperm
3443         stvx_u          $out0,$x00,$out         # store output
3444         vxor            $out0,$in0,$twk4
3445         le?vperm        $out2,$out2,$out2,$leperm
3446         stvx_u          $out1,$x10,$out
3447         stvx_u          $out2,$x20,$out
3448         addi            $out,$out,0x30
3449         bne             Lxts_dec6x_steal
3450         b               Lxts_dec6x_done
3451
3452 .align  4
3453 Lxts_dec6x_two:
3454         vxor            $out0,$in4,$twk0
3455         vxor            $out1,$in5,$twk1
3456         vxor            $out2,$out2,$out2
3457         vxor            $out3,$out3,$out3
3458         vxor            $out4,$out4,$out4
3459
3460         bl              _aesp8_xts_dec5x
3461
3462         le?vperm        $out0,$out0,$out0,$leperm
3463         vmr             $twk0,$twk2             # unused tweak
3464         vmr             $twk1,$twk3
3465         le?vperm        $out1,$out1,$out1,$leperm
3466         stvx_u          $out0,$x00,$out         # store output
3467         vxor            $out0,$in0,$twk3
3468         stvx_u          $out1,$x10,$out
3469         addi            $out,$out,0x20
3470         bne             Lxts_dec6x_steal
3471         b               Lxts_dec6x_done
3472
3473 .align  4
3474 Lxts_dec6x_one:
3475         vxor            $out0,$in5,$twk0
3476         nop
3477 Loop_xts_dec1x:
3478         vncipher        $out0,$out0,v24
3479         lvx             v24,$x20,$key_          # round[3]
3480         addi            $key_,$key_,0x20
3481
3482         vncipher        $out0,$out0,v25
3483         lvx             v25,$x10,$key_          # round[4]
3484         bdnz            Loop_xts_dec1x
3485
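        # r0 = ($taillen == 0) ? 16 : 0: for $taillen in 1..15 the
        # subi/andi. pair clears bit 4, while 0-1 wraps to -1 and keeps
        # it, so the load below picks up the correct 16 bytes whether
        # or not a partial tail follows.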
3486         subi            r0,$taillen,1
3487         vncipher        $out0,$out0,v24
3488
3489         andi.           r0,r0,16
3490         cmpwi           $taillen,0
3491         vncipher        $out0,$out0,v25
3492
3493         sub             $inp,$inp,r0
3494         vncipher        $out0,$out0,v26
3495
3496         lvx_u           $in0,0,$inp
3497         vncipher        $out0,$out0,v27
3498
3499         addi            $key_,$sp,$FRAME+15     # rewind $key_
3500         vncipher        $out0,$out0,v28
3501         lvx             v24,$x00,$key_          # re-pre-load round[1]
3502
3503         vncipher        $out0,$out0,v29
3504         lvx             v25,$x10,$key_          # re-pre-load round[2]
3505          vxor           $twk0,$twk0,v31
3506
3507         le?vperm        $in0,$in0,$in0,$leperm
3508         vncipher        $out0,$out0,v30
3509
3510         mtctr           $rounds
3511         vncipherlast    $out0,$out0,$twk0
3512
3513         vmr             $twk0,$twk1             # unused tweak
3514         vmr             $twk1,$twk2
3515         le?vperm        $out0,$out0,$out0,$leperm
3516         stvx_u          $out0,$x00,$out         # store output
3517         addi            $out,$out,0x10
3518         vxor            $out0,$in0,$twk2
3519         bne             Lxts_dec6x_steal
3520         b               Lxts_dec6x_done
3521
3522 .align  4
3523 Lxts_dec6x_zero:
3524         cmpwi           $taillen,0
3525         beq             Lxts_dec6x_done
3526
3527         lvx_u           $in0,0,$inp
3528         le?vperm        $in0,$in0,$in0,$leperm
3529         vxor            $out0,$in0,$twk1
3530 Lxts_dec6x_steal:
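        # Decrypt-side stealing: the last full ciphertext block is first
        # decrypted under the *next* tweak (held in $twk1), its leading
        # $taillen bytes become the short final plaintext block (copied
        # out by Loop_xts_dec6x_steal below), and the block rebuilt from
        # the ciphertext tail plus the remaining decrypted bytes takes
        # one more pass through Loop_xts_dec1x under $twk0.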
3531         vncipher        $out0,$out0,v24
3532         lvx             v24,$x20,$key_          # round[3]
3533         addi            $key_,$key_,0x20
3534
3535         vncipher        $out0,$out0,v25
3536         lvx             v25,$x10,$key_          # round[4]
3537         bdnz            Lxts_dec6x_steal
3538
3539         add             $inp,$inp,$taillen
3540         vncipher        $out0,$out0,v24
3541
3542         cmpwi           $taillen,0
3543         vncipher        $out0,$out0,v25
3544
3545         lvx_u           $in0,0,$inp
3546         vncipher        $out0,$out0,v26
3547
3548         lvsr            $inpperm,0,$taillen     # $in5 is no more
3549         vncipher        $out0,$out0,v27
3550
3551         addi            $key_,$sp,$FRAME+15     # rewind $key_
3552         vncipher        $out0,$out0,v28
3553         lvx             v24,$x00,$key_          # re-pre-load round[1]
3554
3555         vncipher        $out0,$out0,v29
3556         lvx             v25,$x10,$key_          # re-pre-load round[2]
3557          vxor           $twk1,$twk1,v31
3558
3559         le?vperm        $in0,$in0,$in0,$leperm
3560         vncipher        $out0,$out0,v30
3561
3562         vperm           $in0,$in0,$in0,$inpperm
3563         vncipherlast    $tmp,$out0,$twk1
3564
3565         le?vperm        $out0,$tmp,$tmp,$leperm
3566         le?stvx_u       $out0,0,$out
3567         be?stvx_u       $tmp,0,$out
3568
3569         vxor            $out0,$out0,$out0
3570         vspltisb        $out1,-1
3571         vperm           $out0,$out0,$out1,$inpperm
3572         vsel            $out0,$in0,$tmp,$out0
3573         vxor            $out0,$out0,$twk0
3574
3575         subi            r30,$out,1
3576         mtctr           $taillen
3577 Loop_xts_dec6x_steal:
3578         lbzu            r0,1(r30)
3579         stb             r0,16(r30)
3580         bdnz            Loop_xts_dec6x_steal
3581
3582         li              $taillen,0
3583         mtctr           $rounds
3584         b               Loop_xts_dec1x          # one more time...
3585
3586 .align  4
3587 Lxts_dec6x_done:
3588         ${UCMP}i        $ivp,0
3589         beq             Lxts_dec6x_ret
3590
3591         vxor            $tweak,$twk0,$rndkey0
3592         le?vperm        $tweak,$tweak,$tweak,$leperm
3593         stvx_u          $tweak,0,$ivp
3594
3595 Lxts_dec6x_ret:
3596         mtlr            r11
3597         li              r10,`$FRAME+15`
3598         li              r11,`$FRAME+31`
3599         stvx            $seven,r10,$sp          # wipe copies of round keys
3600         addi            r10,r10,32
3601         stvx            $seven,r11,$sp
3602         addi            r11,r11,32
3603         stvx            $seven,r10,$sp
3604         addi            r10,r10,32
3605         stvx            $seven,r11,$sp
3606         addi            r11,r11,32
3607         stvx            $seven,r10,$sp
3608         addi            r10,r10,32
3609         stvx            $seven,r11,$sp
3610         addi            r11,r11,32
3611         stvx            $seven,r10,$sp
3612         addi            r10,r10,32
3613         stvx            $seven,r11,$sp
3614         addi            r11,r11,32
3615
3616         mtspr           256,$vrsave
3617         lvx             v20,r10,$sp             # ABI says so
3618         addi            r10,r10,32
3619         lvx             v21,r11,$sp
3620         addi            r11,r11,32
3621         lvx             v22,r10,$sp
3622         addi            r10,r10,32
3623         lvx             v23,r11,$sp
3624         addi            r11,r11,32
3625         lvx             v24,r10,$sp
3626         addi            r10,r10,32
3627         lvx             v25,r11,$sp
3628         addi            r11,r11,32
3629         lvx             v26,r10,$sp
3630         addi            r10,r10,32
3631         lvx             v27,r11,$sp
3632         addi            r11,r11,32
3633         lvx             v28,r10,$sp
3634         addi            r10,r10,32
3635         lvx             v29,r11,$sp
3636         addi            r11,r11,32
3637         lvx             v30,r10,$sp
3638         lvx             v31,r11,$sp
3639         $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3640         $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3641         $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3642         $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3643         $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3644         $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3645         addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
3646         blr
3647         .long           0
3648         .byte           0,12,0x04,1,0x80,6,6,0
3649         .long           0
3650
3651 .align  5
3652 _aesp8_xts_dec5x:
3653         vncipher        $out0,$out0,v24
3654         vncipher        $out1,$out1,v24
3655         vncipher        $out2,$out2,v24
3656         vncipher        $out3,$out3,v24
3657         vncipher        $out4,$out4,v24
3658         lvx             v24,$x20,$key_          # round[3]
3659         addi            $key_,$key_,0x20
3660
3661         vncipher        $out0,$out0,v25
3662         vncipher        $out1,$out1,v25
3663         vncipher        $out2,$out2,v25
3664         vncipher        $out3,$out3,v25
3665         vncipher        $out4,$out4,v25
3666         lvx             v25,$x10,$key_          # round[4]
3667         bdnz            _aesp8_xts_dec5x
3668
3669         subi            r0,$taillen,1
3670         vncipher        $out0,$out0,v24
3671         vncipher        $out1,$out1,v24
3672         vncipher        $out2,$out2,v24
3673         vncipher        $out3,$out3,v24
3674         vncipher        $out4,$out4,v24
3675
3676         andi.           r0,r0,16
3677         cmpwi           $taillen,0
3678         vncipher        $out0,$out0,v25
3679         vncipher        $out1,$out1,v25
3680         vncipher        $out2,$out2,v25
3681         vncipher        $out3,$out3,v25
3682         vncipher        $out4,$out4,v25
3683          vxor           $twk0,$twk0,v31
3684
3685         sub             $inp,$inp,r0
3686         vncipher        $out0,$out0,v26
3687         vncipher        $out1,$out1,v26
3688         vncipher        $out2,$out2,v26
3689         vncipher        $out3,$out3,v26
3690         vncipher        $out4,$out4,v26
3691          vxor           $in1,$twk1,v31
3692
3693         vncipher        $out0,$out0,v27
3694         lvx_u           $in0,0,$inp
3695         vncipher        $out1,$out1,v27
3696         vncipher        $out2,$out2,v27
3697         vncipher        $out3,$out3,v27
3698         vncipher        $out4,$out4,v27
3699          vxor           $in2,$twk2,v31
3700
3701         addi            $key_,$sp,$FRAME+15     # rewind $key_
3702         vncipher        $out0,$out0,v28
3703         vncipher        $out1,$out1,v28
3704         vncipher        $out2,$out2,v28
3705         vncipher        $out3,$out3,v28
3706         vncipher        $out4,$out4,v28
3707         lvx             v24,$x00,$key_          # re-pre-load round[1]
3708          vxor           $in3,$twk3,v31
3709
3710         vncipher        $out0,$out0,v29
3711         le?vperm        $in0,$in0,$in0,$leperm
3712         vncipher        $out1,$out1,v29
3713         vncipher        $out2,$out2,v29
3714         vncipher        $out3,$out3,v29
3715         vncipher        $out4,$out4,v29
3716         lvx             v25,$x10,$key_          # re-pre-load round[2]
3717          vxor           $in4,$twk4,v31
3718
3719         vncipher        $out0,$out0,v30
3720         vncipher        $out1,$out1,v30
3721         vncipher        $out2,$out2,v30
3722         vncipher        $out3,$out3,v30
3723         vncipher        $out4,$out4,v30
3724
3725         vncipherlast    $out0,$out0,$twk0
3726         vncipherlast    $out1,$out1,$in1
3727         vncipherlast    $out2,$out2,$in2
3728         vncipherlast    $out3,$out3,$in3
3729         vncipherlast    $out4,$out4,$in4
3730         mtctr           $rounds
3731         blr
3732         .long           0
3733         .byte           0,12,0x14,0,0,0,0,0
3734 ___
3735 }}      }}}
3736
3737 my $consts=1;
3738 foreach(split("\n",$code)) {
3739         s/\`([^\`]*)\`/eval($1)/geo;
3740
3741         # constants table endian-specific conversion
3742         if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
3743             my $conv=$3;
3744             my @bytes=();
3745
3746             # convert to endian-agnostic format
3747             if ($1 eq "long") {
3748               foreach (split(/,\s*/,$2)) {
3749                 my $l = /^0/?oct:int;
3750                 push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
3751               }
3752             } else {
3753                 @bytes = map(/^0/?oct:int,split(/,\s*/,$2));
3754             }
3755
3756             # little-endian conversion
3757             if ($flavour =~ /le$/o) {
3758                 SWITCH: for($conv)  {
3759                     /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
3760                     /\?rev/ && do   { @bytes=reverse(@bytes);    last; };
3761                 }
3762             }
3763
3764             # emit
3765             print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
3766             next;
3767         }
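            # For example, an entry such as
            #   .long 0x01020304,0x05060708    ?rev
            # is flattened to its big-endian byte sequence and, for a
            # little-endian flavour, re-emitted reversed as
            #   .byte 0x08,0x07,0x06,0x05,0x04,0x03,0x02,0x01
            # (illustrative values, not from the actual table).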
3768         $consts=0 if (m/Lconsts:/o);    # end of table
3769
3770         # instructions prefixed with '?' are endian-specific and need
3771         # to be adjusted accordingly...
3772         if ($flavour =~ /le$/o) {       # little-endian
3773             s/le\?//o           or
3774             s/be\?/#be#/o       or
3775             s/\?lvsr/lvsl/o     or
3776             s/\?lvsl/lvsr/o     or
3777             s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
3778             s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
3779             s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
3780         } else {                        # big-endian
3781             s/le\?/#le#/o       or
3782             s/be\?//o           or
3783             s/\?([a-z]+)/$1/o;
3784         }
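        # E.g. on little-endian flavours:
        #       le?vperm  ...           -> vperm ...          (kept)
        #       be?stvx_u ...           -> #be#stvx_u ...     (disabled)
        #       ?lvsr                   -> lvsl               (and vice versa)
        #       ?vperm  vD,vA,vB,vC     -> vperm  vD,vB,vA,vC
        #       ?vsldoi vD,vA,vB,N      -> vsldoi vD,vB,vA,16-N
        # whereas big-endian flavours keep be?-prefixed lines, disable
        # le?-prefixed ones and simply strip the '?' marker.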
3785
3786         print $_,"\n";
3787 }
3788
3789 close STDOUT or die "error closing STDOUT: $!";