lib/crypto: powerpc/aes: Migrate POWER8 optimized code into library
author Eric Biggers <ebiggers@kernel.org>
Mon, 12 Jan 2026 19:20:11 +0000 (11:20 -0800)
committer Eric Biggers <ebiggers@kernel.org>
Mon, 12 Jan 2026 19:39:58 +0000 (11:39 -0800)
Move the POWER8 AES assembly code into lib/crypto/, wire the key
expansion and single-block en/decryption functions up to the AES library
API, and remove the superseded "p8_aes" crypto_cipher algorithm.

The result is that both the AES library and crypto_cipher APIs are now
optimized for POWER8, whereas previously only crypto_cipher was.  (Even
then, the optimizations weren't enabled by default; this commit fixes
that too.)
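
A rough sketch of the key-expansion side of that wiring, modeled on the
deleted p8_aes_setkey() below.  The struct and function names here are
illustrative assumptions, not the patch's own code:

    #include <asm/switch_to.h>        /* enable_kernel_vsx() */
    #include <crypto/aes.h>

    /* Illustrative context layout (assumed): both expanded keys kept in
     * the asm code's custom p8_aes_key format. */
    struct p8_aes_keys {
            struct p8_aes_key enc_key;
            struct p8_aes_key dec_key;
    };

    static int p8_aes_preparekey(struct p8_aes_keys *ctx, const u8 *key,
                                 unsigned int keylen)
    {
            int err;

            /* The POWER8 asm key expanders need VSX, so run them inside
             * a kernel-VSX section, exactly as the old glue code did. */
            preempt_disable();
            pagefault_disable();
            enable_kernel_vsx();
            err = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key) |
                  aes_p8_set_decrypt_key(key, keylen * 8, &ctx->dec_key);
            disable_kernel_vsx();
            pagefault_enable();
            preempt_enable();
            return err ? -EINVAL : 0;
    }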

Note that many of the functions in the POWER8 assembly code are still
used by the AES mode implementations in arch/powerpc/crypto/.  For now,
just export these functions.  These exports will go away once the AES
modes are migrated to the library as well.  (Trying to split up the
assembly file seemed like much more trouble than it would be worth.)
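
As a hypothetical illustration only (the exact symbol list is whatever
the mode glue in arch/powerpc/crypto/ still references; these four names
are taken from the prototypes removed from aesp8-ppc.h below), the
temporary exports amount to something like:

    #include <linux/export.h>

    /* Still called from the AES mode code in arch/powerpc/crypto/;
     * exported temporarily instead of splitting the asm file.  These
     * exports go away once the modes move into lib/crypto/ too. */
    EXPORT_SYMBOL_GPL(aes_p8_cbc_encrypt);
    EXPORT_SYMBOL_GPL(aes_p8_ctr32_encrypt_blocks);
    EXPORT_SYMBOL_GPL(aes_p8_xts_encrypt);
    EXPORT_SYMBOL_GPL(aes_p8_xts_decrypt);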

Another challenge is that the POWER8 assembly code uses a custom format
for the expanded AES key.  Since that code is imported from OpenSSL and
targets POWER8 (rather than POWER9, which has better data movement and
byteswap instructions), that format is not easily changed, so for now
I've just kept it.  To maintain full
correctness, this requires executing some slow fallback code in the case
where the usability of VSX changes between key expansion and use.  This
should be tolerable, as this case shouldn't happen in practice.
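
One plausible shape for that fallback, sketched under two assumptions:
the context also keeps the raw key bytes, and crypto_simd_usable() is
the usability test, as in the deleted glue code below.  This reuses the
hypothetical p8_aes_keys layout from the sketch above:

    #include <crypto/internal/simd.h> /* crypto_simd_usable() */
    #include <linux/string.h>         /* memzero_explicit() */

    static void p8_aes_encrypt_one(const struct p8_aes_keys *ctx,
                                   const u8 *raw_key, unsigned int keylen,
                                   u8 *dst, const u8 *src)
    {
            if (likely(crypto_simd_usable())) {
                    preempt_disable();
                    pagefault_disable();
                    enable_kernel_vsx();
                    aes_p8_encrypt(src, dst, &ctx->enc_key);
                    disable_kernel_vsx();
                    pagefault_enable();
                    preempt_enable();
            } else {
                    /* VSX became unusable after key expansion: re-expand
                     * the raw key with the generic code and encrypt one
                     * block.  Slow, but this case shouldn't happen in
                     * practice. */
                    struct crypto_aes_ctx gen;

                    aes_expandkey(&gen, raw_key, keylen); /* keylen was
                                               validated at setkey time */
                    aes_encrypt(&gen, dst, src);
                    memzero_explicit(&gen, sizeof(gen));
            }
    }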

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20260112192035.10427-14-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
arch/powerpc/crypto/Makefile
arch/powerpc/crypto/aes.c [deleted file]
arch/powerpc/crypto/aesp8-ppc.h
arch/powerpc/crypto/aesp8-ppc.pl [deleted file]
arch/powerpc/crypto/vmx.c
include/crypto/aes.h
lib/crypto/Kconfig
lib/crypto/Makefile
lib/crypto/powerpc/.gitignore [new file with mode: 0644]
lib/crypto/powerpc/aes.h
lib/crypto/powerpc/aesp8-ppc.pl [new file with mode: 0644]

index e22310d..3ac0886 100644
@@ -11,7 +11,7 @@ obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o
 
 aes-ppc-spe-y := aes-spe-glue.o
 aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-ppc.o
-vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o
+vmx-crypto-objs := vmx.o ghashp8-ppc.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o
 
 ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
 override flavour := linux-ppc64le
@@ -26,15 +26,14 @@ endif
 quiet_cmd_perl = PERL    $@
       cmd_perl = $(PERL) $< $(flavour) > $@
 
-targets += aesp10-ppc.S ghashp10-ppc.S aesp8-ppc.S ghashp8-ppc.S
+targets += aesp10-ppc.S ghashp10-ppc.S ghashp8-ppc.S
 
 $(obj)/aesp10-ppc.S $(obj)/ghashp10-ppc.S: $(obj)/%.S: $(src)/%.pl FORCE
        $(call if_changed,perl)
 
-$(obj)/aesp8-ppc.S $(obj)/ghashp8-ppc.S: $(obj)/%.S: $(src)/%.pl FORCE
+$(obj)/ghashp8-ppc.S: $(obj)/%.S: $(src)/%.pl FORCE
        $(call if_changed,perl)
 
 OBJECT_FILES_NON_STANDARD_aesp10-ppc.o := y
 OBJECT_FILES_NON_STANDARD_ghashp10-ppc.o := y
-OBJECT_FILES_NON_STANDARD_aesp8-ppc.o := y
 OBJECT_FILES_NON_STANDARD_ghashp8-ppc.o := y
diff --git a/arch/powerpc/crypto/aes.c b/arch/powerpc/crypto/aes.c
deleted file mode 100644
index b7192ee..0000000
+++ /dev/null
@@ -1,134 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * AES routines supporting VMX instructions on the Power 8
- *
- * Copyright (C) 2015 International Business Machines Inc.
- *
- * Author: Marcelo Henrique Cerri <mhcerri@br.ibm.com>
- */
-
-#include <asm/simd.h>
-#include <asm/switch_to.h>
-#include <crypto/aes.h>
-#include <crypto/internal/cipher.h>
-#include <crypto/internal/simd.h>
-#include <linux/err.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/uaccess.h>
-
-#include "aesp8-ppc.h"
-
-struct p8_aes_ctx {
-       struct crypto_cipher *fallback;
-       struct p8_aes_key enc_key;
-       struct p8_aes_key dec_key;
-};
-
-static int p8_aes_init(struct crypto_tfm *tfm)
-{
-       const char *alg = crypto_tfm_alg_name(tfm);
-       struct crypto_cipher *fallback;
-       struct p8_aes_ctx *ctx = crypto_tfm_ctx(tfm);
-
-       fallback = crypto_alloc_cipher(alg, 0, CRYPTO_ALG_NEED_FALLBACK);
-       if (IS_ERR(fallback)) {
-               printk(KERN_ERR
-                      "Failed to allocate transformation for '%s': %ld\n",
-                      alg, PTR_ERR(fallback));
-               return PTR_ERR(fallback);
-       }
-
-       crypto_cipher_set_flags(fallback,
-                               crypto_cipher_get_flags((struct
-                                                        crypto_cipher *)
-                                                       tfm));
-       ctx->fallback = fallback;
-
-       return 0;
-}
-
-static void p8_aes_exit(struct crypto_tfm *tfm)
-{
-       struct p8_aes_ctx *ctx = crypto_tfm_ctx(tfm);
-
-       if (ctx->fallback) {
-               crypto_free_cipher(ctx->fallback);
-               ctx->fallback = NULL;
-       }
-}
-
-static int p8_aes_setkey(struct crypto_tfm *tfm, const u8 *key,
-                        unsigned int keylen)
-{
-       int ret;
-       struct p8_aes_ctx *ctx = crypto_tfm_ctx(tfm);
-
-       preempt_disable();
-       pagefault_disable();
-       enable_kernel_vsx();
-       ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
-       ret |= aes_p8_set_decrypt_key(key, keylen * 8, &ctx->dec_key);
-       disable_kernel_vsx();
-       pagefault_enable();
-       preempt_enable();
-
-       ret |= crypto_cipher_setkey(ctx->fallback, key, keylen);
-
-       return ret ? -EINVAL : 0;
-}
-
-static void p8_aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
-       struct p8_aes_ctx *ctx = crypto_tfm_ctx(tfm);
-
-       if (!crypto_simd_usable()) {
-               crypto_cipher_encrypt_one(ctx->fallback, dst, src);
-       } else {
-               preempt_disable();
-               pagefault_disable();
-               enable_kernel_vsx();
-               aes_p8_encrypt(src, dst, &ctx->enc_key);
-               disable_kernel_vsx();
-               pagefault_enable();
-               preempt_enable();
-       }
-}
-
-static void p8_aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
-       struct p8_aes_ctx *ctx = crypto_tfm_ctx(tfm);
-
-       if (!crypto_simd_usable()) {
-               crypto_cipher_decrypt_one(ctx->fallback, dst, src);
-       } else {
-               preempt_disable();
-               pagefault_disable();
-               enable_kernel_vsx();
-               aes_p8_decrypt(src, dst, &ctx->dec_key);
-               disable_kernel_vsx();
-               pagefault_enable();
-               preempt_enable();
-       }
-}
-
-struct crypto_alg p8_aes_alg = {
-       .cra_name = "aes",
-       .cra_driver_name = "p8_aes",
-       .cra_module = THIS_MODULE,
-       .cra_priority = 1000,
-       .cra_type = NULL,
-       .cra_flags = CRYPTO_ALG_TYPE_CIPHER | CRYPTO_ALG_NEED_FALLBACK,
-       .cra_alignmask = 0,
-       .cra_blocksize = AES_BLOCK_SIZE,
-       .cra_ctxsize = sizeof(struct p8_aes_ctx),
-       .cra_init = p8_aes_init,
-       .cra_exit = p8_aes_exit,
-       .cra_cipher = {
-                      .cia_min_keysize = AES_MIN_KEY_SIZE,
-                      .cia_max_keysize = AES_MAX_KEY_SIZE,
-                      .cia_setkey = p8_aes_setkey,
-                      .cia_encrypt = p8_aes_encrypt,
-                      .cia_decrypt = p8_aes_decrypt,
-       },
-};
index 0bea010..6862c60 100644
@@ -2,30 +2,7 @@
 #include <linux/types.h>
 #include <crypto/aes.h>
 
-struct p8_aes_key {
-       u8 key[AES_MAX_KEYLENGTH];
-       int rounds;
-};
-
 extern struct shash_alg p8_ghash_alg;
-extern struct crypto_alg p8_aes_alg;
 extern struct skcipher_alg p8_aes_cbc_alg;
 extern struct skcipher_alg p8_aes_ctr_alg;
 extern struct skcipher_alg p8_aes_xts_alg;
-
-int aes_p8_set_encrypt_key(const u8 *userKey, const int bits,
-                          struct p8_aes_key *key);
-int aes_p8_set_decrypt_key(const u8 *userKey, const int bits,
-                          struct p8_aes_key *key);
-void aes_p8_encrypt(const u8 *in, u8 *out, const struct p8_aes_key *key);
-void aes_p8_decrypt(const u8 *in, u8 *out, const struct p8_aes_key *key);
-void aes_p8_cbc_encrypt(const u8 *in, u8 *out, size_t len,
-                       const struct p8_aes_key *key, u8 *iv, const int enc);
-void aes_p8_ctr32_encrypt_blocks(const u8 *in, u8 *out, size_t len,
-                                const struct p8_aes_key *key, const u8 *iv);
-void aes_p8_xts_encrypt(const u8 *in, u8 *out, size_t len,
-                       const struct p8_aes_key *key1,
-                       const struct p8_aes_key *key2, u8 *iv);
-void aes_p8_xts_decrypt(const u8 *in, u8 *out, size_t len,
-                       const struct p8_aes_key *key1,
-                       const struct p8_aes_key *key2, u8 *iv);
diff --git a/arch/powerpc/crypto/aesp8-ppc.pl b/arch/powerpc/crypto/aesp8-ppc.pl
deleted file mode 100644
index f729589..0000000
+++ /dev/null
@@ -1,3889 +0,0 @@
-#! /usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0
-
-# This code is taken from CRYPTOGAMs[1] and is included here using the option
-# in the license to distribute the code under the GPL. Therefore this program
-# is free software; you can redistribute it and/or modify it under the terms of
-# the GNU General Public License version 2 as published by the Free Software
-# Foundation.
-#
-# [1] https://www.openssl.org/~appro/cryptogams/
-
-# Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#
-#       * Redistributions of source code must retain copyright notices,
-#         this list of conditions and the following disclaimer.
-#
-#       * Redistributions in binary form must reproduce the above
-#         copyright notice, this list of conditions and the following
-#         disclaimer in the documentation and/or other materials
-#         provided with the distribution.
-#
-#       * Neither the name of the CRYPTOGAMS nor the names of its
-#         copyright holder and contributors may be used to endorse or
-#         promote products derived from this software without specific
-#         prior written permission.
-#
-# ALTERNATIVELY, provided that this notice is retained in full, this
-# product may be distributed under the terms of the GNU General Public
-# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
-# those given above.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see https://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# This module implements support for AES instructions as per PowerISA
-# specification version 2.07, first implemented by POWER8 processor.
-# The module is endian-agnostic in the sense that it supports both big-
-# and little-endian cases. Data alignment in parallelizable modes is
-# handled with VSX loads and stores, which implies MSR.VSX flag being
-# set. It should also be noted that ISA specification doesn't prohibit
-# alignment exceptions for these instructions on page boundaries.
-# Initially alignment was handled in pure AltiVec/VMX way [when data
-# is aligned programmatically, which in turn guarantees exception-
-# free execution], but it turned to hamper performance when vcipher
-# instructions are interleaved. It's reckoned that eventual
-# misalignment penalties at page boundaries are in average lower
-# than additional overhead in pure AltiVec approach.
-#
-# May 2016
-#
-# Add XTS subroutine; 9x improvement on little-endian and 12x on
-# big-endian systems was measured.
-#
-######################################################################
-# Current large-block performance in cycles per byte processed with
-# 128-bit key (less is better).
-#
-#              CBC en-/decrypt CTR     XTS
-# POWER8[le]   3.96/0.72       0.74    1.1
-# POWER8[be]   3.75/0.65       0.66    1.0
-
-$flavour = shift;
-
-if ($flavour =~ /64/) {
-       $SIZE_T =8;
-       $LRSAVE =2*$SIZE_T;
-       $STU    ="stdu";
-       $POP    ="ld";
-       $PUSH   ="std";
-       $UCMP   ="cmpld";
-       $SHL    ="sldi";
-} elsif ($flavour =~ /32/) {
-       $SIZE_T =4;
-       $LRSAVE =$SIZE_T;
-       $STU    ="stwu";
-       $POP    ="lwz";
-       $PUSH   ="stw";
-       $UCMP   ="cmplw";
-       $SHL    ="slwi";
-} else { die "nonsense $flavour"; }
-
-$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
-( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
-die "can't locate ppc-xlate.pl";
-
-open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
-
-$FRAME=8*$SIZE_T;
-$prefix="aes_p8";
-
-$sp="r1";
-$vrsave="r12";
-
-#########################################################################
-{{{    # Key setup procedures                                          #
-my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
-my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
-my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
-
-$code.=<<___;
-.machine       "any"
-
-.text
-
-.align 7
-rcon:
-.long  0x01000000, 0x01000000, 0x01000000, 0x01000000  ?rev
-.long  0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000  ?rev
-.long  0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c  ?rev
-.long  0,0,0,0                                         ?asis
-.long  0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe
-Lconsts:
-       mflr    r0
-       bcl     20,31,\$+4
-       mflr    $ptr     #vvvvv "distance between . and rcon
-       addi    $ptr,$ptr,-0x58
-       mtlr    r0
-       blr
-       .long   0
-       .byte   0,12,0x14,0,0,0,0,0
-.asciz "AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
-
-.globl .${prefix}_set_encrypt_key
-Lset_encrypt_key:
-       mflr            r11
-       $PUSH           r11,$LRSAVE($sp)
-
-       li              $ptr,-1
-       ${UCMP}i        $inp,0
-       beq-            Lenc_key_abort          # if ($inp==0) return -1;
-       ${UCMP}i        $out,0
-       beq-            Lenc_key_abort          # if ($out==0) return -1;
-       li              $ptr,-2
-       cmpwi           $bits,128
-       blt-            Lenc_key_abort
-       cmpwi           $bits,256
-       bgt-            Lenc_key_abort
-       andi.           r0,$bits,0x3f
-       bne-            Lenc_key_abort
-
-       lis             r0,0xfff0
-       mfspr           $vrsave,256
-       mtspr           256,r0
-
-       bl              Lconsts
-       mtlr            r11
-
-       neg             r9,$inp
-       lvx             $in0,0,$inp
-       addi            $inp,$inp,15            # 15 is not typo
-       lvsr            $key,0,r9               # borrow $key
-       li              r8,0x20
-       cmpwi           $bits,192
-       lvx             $in1,0,$inp
-       le?vspltisb     $mask,0x0f              # borrow $mask
-       lvx             $rcon,0,$ptr
-       le?vxor         $key,$key,$mask         # adjust for byte swap
-       lvx             $mask,r8,$ptr
-       addi            $ptr,$ptr,0x10
-       vperm           $in0,$in0,$in1,$key     # align [and byte swap in LE]
-       li              $cnt,8
-       vxor            $zero,$zero,$zero
-       mtctr           $cnt
-
-       ?lvsr           $outperm,0,$out
-       vspltisb        $outmask,-1
-       lvx             $outhead,0,$out
-       ?vperm          $outmask,$zero,$outmask,$outperm
-
-       blt             Loop128
-       addi            $inp,$inp,8
-       beq             L192
-       addi            $inp,$inp,8
-       b               L256
-
-.align 4
-Loop128:
-       vperm           $key,$in0,$in0,$mask    # rotate-n-splat
-       vsldoi          $tmp,$zero,$in0,12      # >>32
-        vperm          $outtail,$in0,$in0,$outperm     # rotate
-        vsel           $stage,$outhead,$outtail,$outmask
-        vmr            $outhead,$outtail
-       vcipherlast     $key,$key,$rcon
-        stvx           $stage,0,$out
-        addi           $out,$out,16
-
-       vxor            $in0,$in0,$tmp
-       vsldoi          $tmp,$zero,$tmp,12      # >>32
-       vxor            $in0,$in0,$tmp
-       vsldoi          $tmp,$zero,$tmp,12      # >>32
-       vxor            $in0,$in0,$tmp
-        vadduwm        $rcon,$rcon,$rcon
-       vxor            $in0,$in0,$key
-       bdnz            Loop128
-
-       lvx             $rcon,0,$ptr            # last two round keys
-
-       vperm           $key,$in0,$in0,$mask    # rotate-n-splat
-       vsldoi          $tmp,$zero,$in0,12      # >>32
-        vperm          $outtail,$in0,$in0,$outperm     # rotate
-        vsel           $stage,$outhead,$outtail,$outmask
-        vmr            $outhead,$outtail
-       vcipherlast     $key,$key,$rcon
-        stvx           $stage,0,$out
-        addi           $out,$out,16
-
-       vxor            $in0,$in0,$tmp
-       vsldoi          $tmp,$zero,$tmp,12      # >>32
-       vxor            $in0,$in0,$tmp
-       vsldoi          $tmp,$zero,$tmp,12      # >>32
-       vxor            $in0,$in0,$tmp
-        vadduwm        $rcon,$rcon,$rcon
-       vxor            $in0,$in0,$key
-
-       vperm           $key,$in0,$in0,$mask    # rotate-n-splat
-       vsldoi          $tmp,$zero,$in0,12      # >>32
-        vperm          $outtail,$in0,$in0,$outperm     # rotate
-        vsel           $stage,$outhead,$outtail,$outmask
-        vmr            $outhead,$outtail
-       vcipherlast     $key,$key,$rcon
-        stvx           $stage,0,$out
-        addi           $out,$out,16
-
-       vxor            $in0,$in0,$tmp
-       vsldoi          $tmp,$zero,$tmp,12      # >>32
-       vxor            $in0,$in0,$tmp
-       vsldoi          $tmp,$zero,$tmp,12      # >>32
-       vxor            $in0,$in0,$tmp
-       vxor            $in0,$in0,$key
-        vperm          $outtail,$in0,$in0,$outperm     # rotate
-        vsel           $stage,$outhead,$outtail,$outmask
-        vmr            $outhead,$outtail
-        stvx           $stage,0,$out
-
-       addi            $inp,$out,15            # 15 is not typo
-       addi            $out,$out,0x50
-
-       li              $rounds,10
-       b               Ldone
-
-.align 4
-L192:
-       lvx             $tmp,0,$inp
-       li              $cnt,4
-        vperm          $outtail,$in0,$in0,$outperm     # rotate
-        vsel           $stage,$outhead,$outtail,$outmask
-        vmr            $outhead,$outtail
-        stvx           $stage,0,$out
-        addi           $out,$out,16
-       vperm           $in1,$in1,$tmp,$key     # align [and byte swap in LE]
-       vspltisb        $key,8                  # borrow $key
-       mtctr           $cnt
-       vsububm         $mask,$mask,$key        # adjust the mask
-
-Loop192:
-       vperm           $key,$in1,$in1,$mask    # rotate-n-splat
-       vsldoi          $tmp,$zero,$in0,12      # >>32
-       vcipherlast     $key,$key,$rcon
-
-       vxor            $in0,$in0,$tmp
-       vsldoi          $tmp,$zero,$tmp,12      # >>32
-       vxor            $in0,$in0,$tmp
-       vsldoi          $tmp,$zero,$tmp,12      # >>32
-       vxor            $in0,$in0,$tmp
-
-        vsldoi         $stage,$zero,$in1,8
-       vspltw          $tmp,$in0,3
-       vxor            $tmp,$tmp,$in1
-       vsldoi          $in1,$zero,$in1,12      # >>32
-        vadduwm        $rcon,$rcon,$rcon
-       vxor            $in1,$in1,$tmp
-       vxor            $in0,$in0,$key
-       vxor            $in1,$in1,$key
-        vsldoi         $stage,$stage,$in0,8
-
-       vperm           $key,$in1,$in1,$mask    # rotate-n-splat
-       vsldoi          $tmp,$zero,$in0,12      # >>32
-        vperm          $outtail,$stage,$stage,$outperm # rotate
-        vsel           $stage,$outhead,$outtail,$outmask
-        vmr            $outhead,$outtail
-       vcipherlast     $key,$key,$rcon
-        stvx           $stage,0,$out
-        addi           $out,$out,16
-
-        vsldoi         $stage,$in0,$in1,8
-       vxor            $in0,$in0,$tmp
-       vsldoi          $tmp,$zero,$tmp,12      # >>32
-        vperm          $outtail,$stage,$stage,$outperm # rotate
-        vsel           $stage,$outhead,$outtail,$outmask
-        vmr            $outhead,$outtail
-       vxor            $in0,$in0,$tmp
-       vsldoi          $tmp,$zero,$tmp,12      # >>32
-       vxor            $in0,$in0,$tmp
-        stvx           $stage,0,$out
-        addi           $out,$out,16
-
-       vspltw          $tmp,$in0,3
-       vxor            $tmp,$tmp,$in1
-       vsldoi          $in1,$zero,$in1,12      # >>32
-        vadduwm        $rcon,$rcon,$rcon
-       vxor            $in1,$in1,$tmp
-       vxor            $in0,$in0,$key
-       vxor            $in1,$in1,$key
-        vperm          $outtail,$in0,$in0,$outperm     # rotate
-        vsel           $stage,$outhead,$outtail,$outmask
-        vmr            $outhead,$outtail
-        stvx           $stage,0,$out
-        addi           $inp,$out,15            # 15 is not typo
-        addi           $out,$out,16
-       bdnz            Loop192
-
-       li              $rounds,12
-       addi            $out,$out,0x20
-       b               Ldone
-
-.align 4
-L256:
-       lvx             $tmp,0,$inp
-       li              $cnt,7
-       li              $rounds,14
-        vperm          $outtail,$in0,$in0,$outperm     # rotate
-        vsel           $stage,$outhead,$outtail,$outmask
-        vmr            $outhead,$outtail
-        stvx           $stage,0,$out
-        addi           $out,$out,16
-       vperm           $in1,$in1,$tmp,$key     # align [and byte swap in LE]
-       mtctr           $cnt
-
-Loop256:
-       vperm           $key,$in1,$in1,$mask    # rotate-n-splat
-       vsldoi          $tmp,$zero,$in0,12      # >>32
-        vperm          $outtail,$in1,$in1,$outperm     # rotate
-        vsel           $stage,$outhead,$outtail,$outmask
-        vmr            $outhead,$outtail
-       vcipherlast     $key,$key,$rcon
-        stvx           $stage,0,$out
-        addi           $out,$out,16
-
-       vxor            $in0,$in0,$tmp
-       vsldoi          $tmp,$zero,$tmp,12      # >>32
-       vxor            $in0,$in0,$tmp
-       vsldoi          $tmp,$zero,$tmp,12      # >>32
-       vxor            $in0,$in0,$tmp
-        vadduwm        $rcon,$rcon,$rcon
-       vxor            $in0,$in0,$key
-        vperm          $outtail,$in0,$in0,$outperm     # rotate
-        vsel           $stage,$outhead,$outtail,$outmask
-        vmr            $outhead,$outtail
-        stvx           $stage,0,$out
-        addi           $inp,$out,15            # 15 is not typo
-        addi           $out,$out,16
-       bdz             Ldone
-
-       vspltw          $key,$in0,3             # just splat
-       vsldoi          $tmp,$zero,$in1,12      # >>32
-       vsbox           $key,$key
-
-       vxor            $in1,$in1,$tmp
-       vsldoi          $tmp,$zero,$tmp,12      # >>32
-       vxor            $in1,$in1,$tmp
-       vsldoi          $tmp,$zero,$tmp,12      # >>32
-       vxor            $in1,$in1,$tmp
-
-       vxor            $in1,$in1,$key
-       b               Loop256
-
-.align 4
-Ldone:
-       lvx             $in1,0,$inp             # redundant in aligned case
-       vsel            $in1,$outhead,$in1,$outmask
-       stvx            $in1,0,$inp
-       li              $ptr,0
-       mtspr           256,$vrsave
-       stw             $rounds,0($out)
-
-Lenc_key_abort:
-       mr              r3,$ptr
-       blr
-       .long           0
-       .byte           0,12,0x14,1,0,0,3,0
-       .long           0
-.size  .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
-
-.globl .${prefix}_set_decrypt_key
-       $STU            $sp,-$FRAME($sp)
-       mflr            r10
-       $PUSH           r10,$FRAME+$LRSAVE($sp)
-       bl              Lset_encrypt_key
-       mtlr            r10
-
-       cmpwi           r3,0
-       bne-            Ldec_key_abort
-
-       slwi            $cnt,$rounds,4
-       subi            $inp,$out,240           # first round key
-       srwi            $rounds,$rounds,1
-       add             $out,$inp,$cnt          # last round key
-       mtctr           $rounds
-
-Ldeckey:
-       lwz             r0, 0($inp)
-       lwz             r6, 4($inp)
-       lwz             r7, 8($inp)
-       lwz             r8, 12($inp)
-       addi            $inp,$inp,16
-       lwz             r9, 0($out)
-       lwz             r10,4($out)
-       lwz             r11,8($out)
-       lwz             r12,12($out)
-       stw             r0, 0($out)
-       stw             r6, 4($out)
-       stw             r7, 8($out)
-       stw             r8, 12($out)
-       subi            $out,$out,16
-       stw             r9, -16($inp)
-       stw             r10,-12($inp)
-       stw             r11,-8($inp)
-       stw             r12,-4($inp)
-       bdnz            Ldeckey
-
-       xor             r3,r3,r3                # return value
-Ldec_key_abort:
-       addi            $sp,$sp,$FRAME
-       blr
-       .long           0
-       .byte           0,12,4,1,0x80,0,3,0
-       .long           0
-.size  .${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
-___
-}}}
-#########################################################################
-{{{    # Single block en- and decrypt procedures                       #
-sub gen_block () {
-my $dir = shift;
-my $n   = $dir eq "de" ? "n" : "";
-my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));
-
-$code.=<<___;
-.globl .${prefix}_${dir}crypt
-       lwz             $rounds,240($key)
-       lis             r0,0xfc00
-       mfspr           $vrsave,256
-       li              $idx,15                 # 15 is not typo
-       mtspr           256,r0
-
-       lvx             v0,0,$inp
-       neg             r11,$out
-       lvx             v1,$idx,$inp
-       lvsl            v2,0,$inp               # inpperm
-       le?vspltisb     v4,0x0f
-       ?lvsl           v3,0,r11                # outperm
-       le?vxor         v2,v2,v4
-       li              $idx,16
-       vperm           v0,v0,v1,v2             # align [and byte swap in LE]
-       lvx             v1,0,$key
-       ?lvsl           v5,0,$key               # keyperm
-       srwi            $rounds,$rounds,1
-       lvx             v2,$idx,$key
-       addi            $idx,$idx,16
-       subi            $rounds,$rounds,1
-       ?vperm          v1,v1,v2,v5             # align round key
-
-       vxor            v0,v0,v1
-       lvx             v1,$idx,$key
-       addi            $idx,$idx,16
-       mtctr           $rounds
-
-Loop_${dir}c:
-       ?vperm          v2,v2,v1,v5
-       v${n}cipher     v0,v0,v2
-       lvx             v2,$idx,$key
-       addi            $idx,$idx,16
-       ?vperm          v1,v1,v2,v5
-       v${n}cipher     v0,v0,v1
-       lvx             v1,$idx,$key
-       addi            $idx,$idx,16
-       bdnz            Loop_${dir}c
-
-       ?vperm          v2,v2,v1,v5
-       v${n}cipher     v0,v0,v2
-       lvx             v2,$idx,$key
-       ?vperm          v1,v1,v2,v5
-       v${n}cipherlast v0,v0,v1
-
-       vspltisb        v2,-1
-       vxor            v1,v1,v1
-       li              $idx,15                 # 15 is not typo
-       ?vperm          v2,v1,v2,v3             # outmask
-       le?vxor         v3,v3,v4
-       lvx             v1,0,$out               # outhead
-       vperm           v0,v0,v0,v3             # rotate [and byte swap in LE]
-       vsel            v1,v1,v0,v2
-       lvx             v4,$idx,$out
-       stvx            v1,0,$out
-       vsel            v0,v0,v4,v2
-       stvx            v0,$idx,$out
-
-       mtspr           256,$vrsave
-       blr
-       .long           0
-       .byte           0,12,0x14,0,0,0,3,0
-       .long           0
-.size  .${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
-___
-}
-&gen_block("en");
-&gen_block("de");
-}}}
-#########################################################################
-{{{    # CBC en- and decrypt procedures                                #
-my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
-my ($rndkey0,$rndkey1,$inout,$tmp)=            map("v$_",(0..3));
-my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
-                                               map("v$_",(4..10));
-$code.=<<___;
-.globl .${prefix}_cbc_encrypt
-       ${UCMP}i        $len,16
-       bltlr-
-
-       cmpwi           $enc,0                  # test direction
-       lis             r0,0xffe0
-       mfspr           $vrsave,256
-       mtspr           256,r0
-
-       li              $idx,15
-       vxor            $rndkey0,$rndkey0,$rndkey0
-       le?vspltisb     $tmp,0x0f
-
-       lvx             $ivec,0,$ivp            # load [unaligned] iv
-       lvsl            $inpperm,0,$ivp
-       lvx             $inptail,$idx,$ivp
-       le?vxor         $inpperm,$inpperm,$tmp
-       vperm           $ivec,$ivec,$inptail,$inpperm
-
-       neg             r11,$inp
-       ?lvsl           $keyperm,0,$key         # prepare for unaligned key
-       lwz             $rounds,240($key)
-
-       lvsr            $inpperm,0,r11          # prepare for unaligned load
-       lvx             $inptail,0,$inp
-       addi            $inp,$inp,15            # 15 is not typo
-       le?vxor         $inpperm,$inpperm,$tmp
-
-       ?lvsr           $outperm,0,$out         # prepare for unaligned store
-       vspltisb        $outmask,-1
-       lvx             $outhead,0,$out
-       ?vperm          $outmask,$rndkey0,$outmask,$outperm
-       le?vxor         $outperm,$outperm,$tmp
-
-       srwi            $rounds,$rounds,1
-       li              $idx,16
-       subi            $rounds,$rounds,1
-       beq             Lcbc_dec
-
-Lcbc_enc:
-       vmr             $inout,$inptail
-       lvx             $inptail,0,$inp
-       addi            $inp,$inp,16
-       mtctr           $rounds
-       subi            $len,$len,16            # len-=16
-
-       lvx             $rndkey0,0,$key
-        vperm          $inout,$inout,$inptail,$inpperm
-       lvx             $rndkey1,$idx,$key
-       addi            $idx,$idx,16
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vxor            $inout,$inout,$rndkey0
-       lvx             $rndkey0,$idx,$key
-       addi            $idx,$idx,16
-       vxor            $inout,$inout,$ivec
-
-Loop_cbc_enc:
-       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
-       vcipher         $inout,$inout,$rndkey1
-       lvx             $rndkey1,$idx,$key
-       addi            $idx,$idx,16
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vcipher         $inout,$inout,$rndkey0
-       lvx             $rndkey0,$idx,$key
-       addi            $idx,$idx,16
-       bdnz            Loop_cbc_enc
-
-       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
-       vcipher         $inout,$inout,$rndkey1
-       lvx             $rndkey1,$idx,$key
-       li              $idx,16
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vcipherlast     $ivec,$inout,$rndkey0
-       ${UCMP}i        $len,16
-
-       vperm           $tmp,$ivec,$ivec,$outperm
-       vsel            $inout,$outhead,$tmp,$outmask
-       vmr             $outhead,$tmp
-       stvx            $inout,0,$out
-       addi            $out,$out,16
-       bge             Lcbc_enc
-
-       b               Lcbc_done
-
-.align 4
-Lcbc_dec:
-       ${UCMP}i        $len,128
-       bge             _aesp8_cbc_decrypt8x
-       vmr             $tmp,$inptail
-       lvx             $inptail,0,$inp
-       addi            $inp,$inp,16
-       mtctr           $rounds
-       subi            $len,$len,16            # len-=16
-
-       lvx             $rndkey0,0,$key
-        vperm          $tmp,$tmp,$inptail,$inpperm
-       lvx             $rndkey1,$idx,$key
-       addi            $idx,$idx,16
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vxor            $inout,$tmp,$rndkey0
-       lvx             $rndkey0,$idx,$key
-       addi            $idx,$idx,16
-
-Loop_cbc_dec:
-       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
-       vncipher        $inout,$inout,$rndkey1
-       lvx             $rndkey1,$idx,$key
-       addi            $idx,$idx,16
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vncipher        $inout,$inout,$rndkey0
-       lvx             $rndkey0,$idx,$key
-       addi            $idx,$idx,16
-       bdnz            Loop_cbc_dec
-
-       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
-       vncipher        $inout,$inout,$rndkey1
-       lvx             $rndkey1,$idx,$key
-       li              $idx,16
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vncipherlast    $inout,$inout,$rndkey0
-       ${UCMP}i        $len,16
-
-       vxor            $inout,$inout,$ivec
-       vmr             $ivec,$tmp
-       vperm           $tmp,$inout,$inout,$outperm
-       vsel            $inout,$outhead,$tmp,$outmask
-       vmr             $outhead,$tmp
-       stvx            $inout,0,$out
-       addi            $out,$out,16
-       bge             Lcbc_dec
-
-Lcbc_done:
-       addi            $out,$out,-1
-       lvx             $inout,0,$out           # redundant in aligned case
-       vsel            $inout,$outhead,$inout,$outmask
-       stvx            $inout,0,$out
-
-       neg             $enc,$ivp               # write [unaligned] iv
-       li              $idx,15                 # 15 is not typo
-       vxor            $rndkey0,$rndkey0,$rndkey0
-       vspltisb        $outmask,-1
-       le?vspltisb     $tmp,0x0f
-       ?lvsl           $outperm,0,$enc
-       ?vperm          $outmask,$rndkey0,$outmask,$outperm
-       le?vxor         $outperm,$outperm,$tmp
-       lvx             $outhead,0,$ivp
-       vperm           $ivec,$ivec,$ivec,$outperm
-       vsel            $inout,$outhead,$ivec,$outmask
-       lvx             $inptail,$idx,$ivp
-       stvx            $inout,0,$ivp
-       vsel            $inout,$ivec,$inptail,$outmask
-       stvx            $inout,$idx,$ivp
-
-       mtspr           256,$vrsave
-       blr
-       .long           0
-       .byte           0,12,0x14,0,0,0,6,0
-       .long           0
-___
-#########################################################################
-{{     # Optimized CBC decrypt procedure                               #
-my $key_="r11";
-my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
-my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
-my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
-my $rndkey0="v23";     # v24-v25 rotating buffer for first round keys
-                       # v26-v31 last 6 round keys
-my ($tmp,$keyperm)=($in3,$in4);        # aliases with "caller", redundant assignment
-
-$code.=<<___;
-.align 5
-_aesp8_cbc_decrypt8x:
-       $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
-       li              r10,`$FRAME+8*16+15`
-       li              r11,`$FRAME+8*16+31`
-       stvx            v20,r10,$sp             # ABI says so
-       addi            r10,r10,32
-       stvx            v21,r11,$sp
-       addi            r11,r11,32
-       stvx            v22,r10,$sp
-       addi            r10,r10,32
-       stvx            v23,r11,$sp
-       addi            r11,r11,32
-       stvx            v24,r10,$sp
-       addi            r10,r10,32
-       stvx            v25,r11,$sp
-       addi            r11,r11,32
-       stvx            v26,r10,$sp
-       addi            r10,r10,32
-       stvx            v27,r11,$sp
-       addi            r11,r11,32
-       stvx            v28,r10,$sp
-       addi            r10,r10,32
-       stvx            v29,r11,$sp
-       addi            r11,r11,32
-       stvx            v30,r10,$sp
-       stvx            v31,r11,$sp
-       li              r0,-1
-       stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
-       li              $x10,0x10
-       $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
-       li              $x20,0x20
-       $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
-       li              $x30,0x30
-       $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
-       li              $x40,0x40
-       $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
-       li              $x50,0x50
-       $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
-       li              $x60,0x60
-       $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
-       li              $x70,0x70
-       mtspr           256,r0
-
-       subi            $rounds,$rounds,3       # -4 in total
-       subi            $len,$len,128           # bias
-
-       lvx             $rndkey0,$x00,$key      # load key schedule
-       lvx             v30,$x10,$key
-       addi            $key,$key,0x20
-       lvx             v31,$x00,$key
-       ?vperm          $rndkey0,$rndkey0,v30,$keyperm
-       addi            $key_,$sp,$FRAME+15
-       mtctr           $rounds
-
-Load_cbc_dec_key:
-       ?vperm          v24,v30,v31,$keyperm
-       lvx             v30,$x10,$key
-       addi            $key,$key,0x20
-       stvx            v24,$x00,$key_          # off-load round[1]
-       ?vperm          v25,v31,v30,$keyperm
-       lvx             v31,$x00,$key
-       stvx            v25,$x10,$key_          # off-load round[2]
-       addi            $key_,$key_,0x20
-       bdnz            Load_cbc_dec_key
-
-       lvx             v26,$x10,$key
-       ?vperm          v24,v30,v31,$keyperm
-       lvx             v27,$x20,$key
-       stvx            v24,$x00,$key_          # off-load round[3]
-       ?vperm          v25,v31,v26,$keyperm
-       lvx             v28,$x30,$key
-       stvx            v25,$x10,$key_          # off-load round[4]
-       addi            $key_,$sp,$FRAME+15     # rewind $key_
-       ?vperm          v26,v26,v27,$keyperm
-       lvx             v29,$x40,$key
-       ?vperm          v27,v27,v28,$keyperm
-       lvx             v30,$x50,$key
-       ?vperm          v28,v28,v29,$keyperm
-       lvx             v31,$x60,$key
-       ?vperm          v29,v29,v30,$keyperm
-       lvx             $out0,$x70,$key         # borrow $out0
-       ?vperm          v30,v30,v31,$keyperm
-       lvx             v24,$x00,$key_          # pre-load round[1]
-       ?vperm          v31,v31,$out0,$keyperm
-       lvx             v25,$x10,$key_          # pre-load round[2]
-
-       #lvx            $inptail,0,$inp         # "caller" already did this
-       #addi           $inp,$inp,15            # 15 is not typo
-       subi            $inp,$inp,15            # undo "caller"
-
-        le?li          $idx,8
-       lvx_u           $in0,$x00,$inp          # load first 8 "words"
-        le?lvsl        $inpperm,0,$idx
-        le?vspltisb    $tmp,0x0f
-       lvx_u           $in1,$x10,$inp
-        le?vxor        $inpperm,$inpperm,$tmp  # transform for lvx_u/stvx_u
-       lvx_u           $in2,$x20,$inp
-        le?vperm       $in0,$in0,$in0,$inpperm
-       lvx_u           $in3,$x30,$inp
-        le?vperm       $in1,$in1,$in1,$inpperm
-       lvx_u           $in4,$x40,$inp
-        le?vperm       $in2,$in2,$in2,$inpperm
-       vxor            $out0,$in0,$rndkey0
-       lvx_u           $in5,$x50,$inp
-        le?vperm       $in3,$in3,$in3,$inpperm
-       vxor            $out1,$in1,$rndkey0
-       lvx_u           $in6,$x60,$inp
-        le?vperm       $in4,$in4,$in4,$inpperm
-       vxor            $out2,$in2,$rndkey0
-       lvx_u           $in7,$x70,$inp
-       addi            $inp,$inp,0x80
-        le?vperm       $in5,$in5,$in5,$inpperm
-       vxor            $out3,$in3,$rndkey0
-        le?vperm       $in6,$in6,$in6,$inpperm
-       vxor            $out4,$in4,$rndkey0
-        le?vperm       $in7,$in7,$in7,$inpperm
-       vxor            $out5,$in5,$rndkey0
-       vxor            $out6,$in6,$rndkey0
-       vxor            $out7,$in7,$rndkey0
-
-       mtctr           $rounds
-       b               Loop_cbc_dec8x
-.align 5
-Loop_cbc_dec8x:
-       vncipher        $out0,$out0,v24
-       vncipher        $out1,$out1,v24
-       vncipher        $out2,$out2,v24
-       vncipher        $out3,$out3,v24
-       vncipher        $out4,$out4,v24
-       vncipher        $out5,$out5,v24
-       vncipher        $out6,$out6,v24
-       vncipher        $out7,$out7,v24
-       lvx             v24,$x20,$key_          # round[3]
-       addi            $key_,$key_,0x20
-
-       vncipher        $out0,$out0,v25
-       vncipher        $out1,$out1,v25
-       vncipher        $out2,$out2,v25
-       vncipher        $out3,$out3,v25
-       vncipher        $out4,$out4,v25
-       vncipher        $out5,$out5,v25
-       vncipher        $out6,$out6,v25
-       vncipher        $out7,$out7,v25
-       lvx             v25,$x10,$key_          # round[4]
-       bdnz            Loop_cbc_dec8x
-
-       subic           $len,$len,128           # $len-=128
-       vncipher        $out0,$out0,v24
-       vncipher        $out1,$out1,v24
-       vncipher        $out2,$out2,v24
-       vncipher        $out3,$out3,v24
-       vncipher        $out4,$out4,v24
-       vncipher        $out5,$out5,v24
-       vncipher        $out6,$out6,v24
-       vncipher        $out7,$out7,v24
-
-       subfe.          r0,r0,r0                # borrow?-1:0
-       vncipher        $out0,$out0,v25
-       vncipher        $out1,$out1,v25
-       vncipher        $out2,$out2,v25
-       vncipher        $out3,$out3,v25
-       vncipher        $out4,$out4,v25
-       vncipher        $out5,$out5,v25
-       vncipher        $out6,$out6,v25
-       vncipher        $out7,$out7,v25
-
-       and             r0,r0,$len
-       vncipher        $out0,$out0,v26
-       vncipher        $out1,$out1,v26
-       vncipher        $out2,$out2,v26
-       vncipher        $out3,$out3,v26
-       vncipher        $out4,$out4,v26
-       vncipher        $out5,$out5,v26
-       vncipher        $out6,$out6,v26
-       vncipher        $out7,$out7,v26
-
-       add             $inp,$inp,r0            # $inp is adjusted in such
-                                               # way that at exit from the
-                                               # loop inX-in7 are loaded
-                                               # with last "words"
-       vncipher        $out0,$out0,v27
-       vncipher        $out1,$out1,v27
-       vncipher        $out2,$out2,v27
-       vncipher        $out3,$out3,v27
-       vncipher        $out4,$out4,v27
-       vncipher        $out5,$out5,v27
-       vncipher        $out6,$out6,v27
-       vncipher        $out7,$out7,v27
-
-       addi            $key_,$sp,$FRAME+15     # rewind $key_
-       vncipher        $out0,$out0,v28
-       vncipher        $out1,$out1,v28
-       vncipher        $out2,$out2,v28
-       vncipher        $out3,$out3,v28
-       vncipher        $out4,$out4,v28
-       vncipher        $out5,$out5,v28
-       vncipher        $out6,$out6,v28
-       vncipher        $out7,$out7,v28
-       lvx             v24,$x00,$key_          # re-pre-load round[1]
-
-       vncipher        $out0,$out0,v29
-       vncipher        $out1,$out1,v29
-       vncipher        $out2,$out2,v29
-       vncipher        $out3,$out3,v29
-       vncipher        $out4,$out4,v29
-       vncipher        $out5,$out5,v29
-       vncipher        $out6,$out6,v29
-       vncipher        $out7,$out7,v29
-       lvx             v25,$x10,$key_          # re-pre-load round[2]
-
-       vncipher        $out0,$out0,v30
-        vxor           $ivec,$ivec,v31         # xor with last round key
-       vncipher        $out1,$out1,v30
-        vxor           $in0,$in0,v31
-       vncipher        $out2,$out2,v30
-        vxor           $in1,$in1,v31
-       vncipher        $out3,$out3,v30
-        vxor           $in2,$in2,v31
-       vncipher        $out4,$out4,v30
-        vxor           $in3,$in3,v31
-       vncipher        $out5,$out5,v30
-        vxor           $in4,$in4,v31
-       vncipher        $out6,$out6,v30
-        vxor           $in5,$in5,v31
-       vncipher        $out7,$out7,v30
-        vxor           $in6,$in6,v31
-
-       vncipherlast    $out0,$out0,$ivec
-       vncipherlast    $out1,$out1,$in0
-        lvx_u          $in0,$x00,$inp          # load next input block
-       vncipherlast    $out2,$out2,$in1
-        lvx_u          $in1,$x10,$inp
-       vncipherlast    $out3,$out3,$in2
-        le?vperm       $in0,$in0,$in0,$inpperm
-        lvx_u          $in2,$x20,$inp
-       vncipherlast    $out4,$out4,$in3
-        le?vperm       $in1,$in1,$in1,$inpperm
-        lvx_u          $in3,$x30,$inp
-       vncipherlast    $out5,$out5,$in4
-        le?vperm       $in2,$in2,$in2,$inpperm
-        lvx_u          $in4,$x40,$inp
-       vncipherlast    $out6,$out6,$in5
-        le?vperm       $in3,$in3,$in3,$inpperm
-        lvx_u          $in5,$x50,$inp
-       vncipherlast    $out7,$out7,$in6
-        le?vperm       $in4,$in4,$in4,$inpperm
-        lvx_u          $in6,$x60,$inp
-       vmr             $ivec,$in7
-        le?vperm       $in5,$in5,$in5,$inpperm
-        lvx_u          $in7,$x70,$inp
-        addi           $inp,$inp,0x80
-
-       le?vperm        $out0,$out0,$out0,$inpperm
-       le?vperm        $out1,$out1,$out1,$inpperm
-       stvx_u          $out0,$x00,$out
-        le?vperm       $in6,$in6,$in6,$inpperm
-        vxor           $out0,$in0,$rndkey0
-       le?vperm        $out2,$out2,$out2,$inpperm
-       stvx_u          $out1,$x10,$out
-        le?vperm       $in7,$in7,$in7,$inpperm
-        vxor           $out1,$in1,$rndkey0
-       le?vperm        $out3,$out3,$out3,$inpperm
-       stvx_u          $out2,$x20,$out
-        vxor           $out2,$in2,$rndkey0
-       le?vperm        $out4,$out4,$out4,$inpperm
-       stvx_u          $out3,$x30,$out
-        vxor           $out3,$in3,$rndkey0
-       le?vperm        $out5,$out5,$out5,$inpperm
-       stvx_u          $out4,$x40,$out
-        vxor           $out4,$in4,$rndkey0
-       le?vperm        $out6,$out6,$out6,$inpperm
-       stvx_u          $out5,$x50,$out
-        vxor           $out5,$in5,$rndkey0
-       le?vperm        $out7,$out7,$out7,$inpperm
-       stvx_u          $out6,$x60,$out
-        vxor           $out6,$in6,$rndkey0
-       stvx_u          $out7,$x70,$out
-       addi            $out,$out,0x80
-        vxor           $out7,$in7,$rndkey0
-
-       mtctr           $rounds
-       beq             Loop_cbc_dec8x          # did $len-=128 borrow?
-
-       addic.          $len,$len,128
-       beq             Lcbc_dec8x_done
-       nop
-       nop
-
-Loop_cbc_dec8x_tail:                           # up to 7 "words" tail...
-       vncipher        $out1,$out1,v24
-       vncipher        $out2,$out2,v24
-       vncipher        $out3,$out3,v24
-       vncipher        $out4,$out4,v24
-       vncipher        $out5,$out5,v24
-       vncipher        $out6,$out6,v24
-       vncipher        $out7,$out7,v24
-       lvx             v24,$x20,$key_          # round[3]
-       addi            $key_,$key_,0x20
-
-       vncipher        $out1,$out1,v25
-       vncipher        $out2,$out2,v25
-       vncipher        $out3,$out3,v25
-       vncipher        $out4,$out4,v25
-       vncipher        $out5,$out5,v25
-       vncipher        $out6,$out6,v25
-       vncipher        $out7,$out7,v25
-       lvx             v25,$x10,$key_          # round[4]
-       bdnz            Loop_cbc_dec8x_tail
-
-       vncipher        $out1,$out1,v24
-       vncipher        $out2,$out2,v24
-       vncipher        $out3,$out3,v24
-       vncipher        $out4,$out4,v24
-       vncipher        $out5,$out5,v24
-       vncipher        $out6,$out6,v24
-       vncipher        $out7,$out7,v24
-
-       vncipher        $out1,$out1,v25
-       vncipher        $out2,$out2,v25
-       vncipher        $out3,$out3,v25
-       vncipher        $out4,$out4,v25
-       vncipher        $out5,$out5,v25
-       vncipher        $out6,$out6,v25
-       vncipher        $out7,$out7,v25
-
-       vncipher        $out1,$out1,v26
-       vncipher        $out2,$out2,v26
-       vncipher        $out3,$out3,v26
-       vncipher        $out4,$out4,v26
-       vncipher        $out5,$out5,v26
-       vncipher        $out6,$out6,v26
-       vncipher        $out7,$out7,v26
-
-       vncipher        $out1,$out1,v27
-       vncipher        $out2,$out2,v27
-       vncipher        $out3,$out3,v27
-       vncipher        $out4,$out4,v27
-       vncipher        $out5,$out5,v27
-       vncipher        $out6,$out6,v27
-       vncipher        $out7,$out7,v27
-
-       vncipher        $out1,$out1,v28
-       vncipher        $out2,$out2,v28
-       vncipher        $out3,$out3,v28
-       vncipher        $out4,$out4,v28
-       vncipher        $out5,$out5,v28
-       vncipher        $out6,$out6,v28
-       vncipher        $out7,$out7,v28
-
-       vncipher        $out1,$out1,v29
-       vncipher        $out2,$out2,v29
-       vncipher        $out3,$out3,v29
-       vncipher        $out4,$out4,v29
-       vncipher        $out5,$out5,v29
-       vncipher        $out6,$out6,v29
-       vncipher        $out7,$out7,v29
-
-       vncipher        $out1,$out1,v30
-        vxor           $ivec,$ivec,v31         # last round key
-       vncipher        $out2,$out2,v30
-        vxor           $in1,$in1,v31
-       vncipher        $out3,$out3,v30
-        vxor           $in2,$in2,v31
-       vncipher        $out4,$out4,v30
-        vxor           $in3,$in3,v31
-       vncipher        $out5,$out5,v30
-        vxor           $in4,$in4,v31
-       vncipher        $out6,$out6,v30
-        vxor           $in5,$in5,v31
-       vncipher        $out7,$out7,v30
-        vxor           $in6,$in6,v31
-
-       cmplwi          $len,32                 # switch($len)
-       blt             Lcbc_dec8x_one
-       nop
-       beq             Lcbc_dec8x_two
-       cmplwi          $len,64
-       blt             Lcbc_dec8x_three
-       nop
-       beq             Lcbc_dec8x_four
-       cmplwi          $len,96
-       blt             Lcbc_dec8x_five
-       nop
-       beq             Lcbc_dec8x_six
-
-Lcbc_dec8x_seven:
-       vncipherlast    $out1,$out1,$ivec
-       vncipherlast    $out2,$out2,$in1
-       vncipherlast    $out3,$out3,$in2
-       vncipherlast    $out4,$out4,$in3
-       vncipherlast    $out5,$out5,$in4
-       vncipherlast    $out6,$out6,$in5
-       vncipherlast    $out7,$out7,$in6
-       vmr             $ivec,$in7
-
-       le?vperm        $out1,$out1,$out1,$inpperm
-       le?vperm        $out2,$out2,$out2,$inpperm
-       stvx_u          $out1,$x00,$out
-       le?vperm        $out3,$out3,$out3,$inpperm
-       stvx_u          $out2,$x10,$out
-       le?vperm        $out4,$out4,$out4,$inpperm
-       stvx_u          $out3,$x20,$out
-       le?vperm        $out5,$out5,$out5,$inpperm
-       stvx_u          $out4,$x30,$out
-       le?vperm        $out6,$out6,$out6,$inpperm
-       stvx_u          $out5,$x40,$out
-       le?vperm        $out7,$out7,$out7,$inpperm
-       stvx_u          $out6,$x50,$out
-       stvx_u          $out7,$x60,$out
-       addi            $out,$out,0x70
-       b               Lcbc_dec8x_done
-
-.align 5
-Lcbc_dec8x_six:
-       vncipherlast    $out2,$out2,$ivec
-       vncipherlast    $out3,$out3,$in2
-       vncipherlast    $out4,$out4,$in3
-       vncipherlast    $out5,$out5,$in4
-       vncipherlast    $out6,$out6,$in5
-       vncipherlast    $out7,$out7,$in6
-       vmr             $ivec,$in7
-
-       le?vperm        $out2,$out2,$out2,$inpperm
-       le?vperm        $out3,$out3,$out3,$inpperm
-       stvx_u          $out2,$x00,$out
-       le?vperm        $out4,$out4,$out4,$inpperm
-       stvx_u          $out3,$x10,$out
-       le?vperm        $out5,$out5,$out5,$inpperm
-       stvx_u          $out4,$x20,$out
-       le?vperm        $out6,$out6,$out6,$inpperm
-       stvx_u          $out5,$x30,$out
-       le?vperm        $out7,$out7,$out7,$inpperm
-       stvx_u          $out6,$x40,$out
-       stvx_u          $out7,$x50,$out
-       addi            $out,$out,0x60
-       b               Lcbc_dec8x_done
-
-.align 5
-Lcbc_dec8x_five:
-       vncipherlast    $out3,$out3,$ivec
-       vncipherlast    $out4,$out4,$in3
-       vncipherlast    $out5,$out5,$in4
-       vncipherlast    $out6,$out6,$in5
-       vncipherlast    $out7,$out7,$in6
-       vmr             $ivec,$in7
-
-       le?vperm        $out3,$out3,$out3,$inpperm
-       le?vperm        $out4,$out4,$out4,$inpperm
-       stvx_u          $out3,$x00,$out
-       le?vperm        $out5,$out5,$out5,$inpperm
-       stvx_u          $out4,$x10,$out
-       le?vperm        $out6,$out6,$out6,$inpperm
-       stvx_u          $out5,$x20,$out
-       le?vperm        $out7,$out7,$out7,$inpperm
-       stvx_u          $out6,$x30,$out
-       stvx_u          $out7,$x40,$out
-       addi            $out,$out,0x50
-       b               Lcbc_dec8x_done
-
-.align 5
-Lcbc_dec8x_four:
-       vncipherlast    $out4,$out4,$ivec
-       vncipherlast    $out5,$out5,$in4
-       vncipherlast    $out6,$out6,$in5
-       vncipherlast    $out7,$out7,$in6
-       vmr             $ivec,$in7
-
-       le?vperm        $out4,$out4,$out4,$inpperm
-       le?vperm        $out5,$out5,$out5,$inpperm
-       stvx_u          $out4,$x00,$out
-       le?vperm        $out6,$out6,$out6,$inpperm
-       stvx_u          $out5,$x10,$out
-       le?vperm        $out7,$out7,$out7,$inpperm
-       stvx_u          $out6,$x20,$out
-       stvx_u          $out7,$x30,$out
-       addi            $out,$out,0x40
-       b               Lcbc_dec8x_done
-
-.align 5
-Lcbc_dec8x_three:
-       vncipherlast    $out5,$out5,$ivec
-       vncipherlast    $out6,$out6,$in5
-       vncipherlast    $out7,$out7,$in6
-       vmr             $ivec,$in7
-
-       le?vperm        $out5,$out5,$out5,$inpperm
-       le?vperm        $out6,$out6,$out6,$inpperm
-       stvx_u          $out5,$x00,$out
-       le?vperm        $out7,$out7,$out7,$inpperm
-       stvx_u          $out6,$x10,$out
-       stvx_u          $out7,$x20,$out
-       addi            $out,$out,0x30
-       b               Lcbc_dec8x_done
-
-.align 5
-Lcbc_dec8x_two:
-       vncipherlast    $out6,$out6,$ivec
-       vncipherlast    $out7,$out7,$in6
-       vmr             $ivec,$in7
-
-       le?vperm        $out6,$out6,$out6,$inpperm
-       le?vperm        $out7,$out7,$out7,$inpperm
-       stvx_u          $out6,$x00,$out
-       stvx_u          $out7,$x10,$out
-       addi            $out,$out,0x20
-       b               Lcbc_dec8x_done
-
-.align 5
-Lcbc_dec8x_one:
-       vncipherlast    $out7,$out7,$ivec
-       vmr             $ivec,$in7
-
-       le?vperm        $out7,$out7,$out7,$inpperm
-       stvx_u          $out7,0,$out
-       addi            $out,$out,0x10
-
-Lcbc_dec8x_done:
-       le?vperm        $ivec,$ivec,$ivec,$inpperm
-       stvx_u          $ivec,0,$ivp            # write [unaligned] iv
-
-       li              r10,`$FRAME+15`
-       li              r11,`$FRAME+31`
-       stvx            $inpperm,r10,$sp        # wipe copies of round keys
-       addi            r10,r10,32
-       stvx            $inpperm,r11,$sp
-       addi            r11,r11,32
-       stvx            $inpperm,r10,$sp
-       addi            r10,r10,32
-       stvx            $inpperm,r11,$sp
-       addi            r11,r11,32
-       stvx            $inpperm,r10,$sp
-       addi            r10,r10,32
-       stvx            $inpperm,r11,$sp
-       addi            r11,r11,32
-       stvx            $inpperm,r10,$sp
-       addi            r10,r10,32
-       stvx            $inpperm,r11,$sp
-       addi            r11,r11,32
-
-       mtspr           256,$vrsave
-       lvx             v20,r10,$sp             # ABI says so
-       addi            r10,r10,32
-       lvx             v21,r11,$sp
-       addi            r11,r11,32
-       lvx             v22,r10,$sp
-       addi            r10,r10,32
-       lvx             v23,r11,$sp
-       addi            r11,r11,32
-       lvx             v24,r10,$sp
-       addi            r10,r10,32
-       lvx             v25,r11,$sp
-       addi            r11,r11,32
-       lvx             v26,r10,$sp
-       addi            r10,r10,32
-       lvx             v27,r11,$sp
-       addi            r11,r11,32
-       lvx             v28,r10,$sp
-       addi            r10,r10,32
-       lvx             v29,r11,$sp
-       addi            r11,r11,32
-       lvx             v30,r10,$sp
-       lvx             v31,r11,$sp
-       $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
-       $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
-       $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
-       $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
-       $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
-       $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
-       addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
-       blr
-       .long           0
-       .byte           0,12,0x14,0,0x80,6,6,0
-       .long           0
-.size  .${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
-___
-}}     }}}
-
-#########################################################################
-{{{    # CTR procedure[s]                                              #
-
-####################### WARNING: Here be dragons! #######################
-#
-# This code is written as 'ctr32', based on a 32-bit counter used
-# upstream. The kernel does *not* use a 32-bit counter. The kernel uses
-# a 128-bit counter.
-#
-# This leads to subtle changes from the upstream code: the counter
-# is incremented with vadduqm rather than vadduwm. This occurs in
-# both the bulk (8 blocks at a time) path and in the individual-block
-# path. Be aware of this when doing updates.
-#
-# See:
-# 1d4aa0b4c181 ("crypto: vmx - Fixing AES-CTR counter bug")
-# 009b30ac7444 ("crypto: vmx - CTR: always increment IV as quadword")
-# https://github.com/openssl/openssl/pull/8942
-#
-#########################################################################
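-#
-# As a rough sketch of the semantic difference (C-level pseudocode;
-# get_be32/put_be32 are hypothetical helpers, and iv[] is the 16-byte
-# big-endian counter block):
-#
-#     /* upstream 'ctr32': only the low 32-bit word is incremented */
-#     put_be32(iv + 12, get_be32(iv + 12) + 1);
-#
-#     /* kernel semantics: the carry ripples across all 128 bits,
-#        which is what a single vadduqm provides */
-#     for (int i = 15; i >= 0 && ++iv[i] == 0; i--)
-#             ;
-#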
-my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
-my ($rndkey0,$rndkey1,$inout,$tmp)=            map("v$_",(0..3));
-my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
-                                               map("v$_",(4..11));
-my $dat=$tmp;
-
-$code.=<<___;
-.globl .${prefix}_ctr32_encrypt_blocks
-       ${UCMP}i        $len,1
-       bltlr-
-
-       lis             r0,0xfff0
-       mfspr           $vrsave,256
-       mtspr           256,r0
-
-       li              $idx,15
-       vxor            $rndkey0,$rndkey0,$rndkey0
-       le?vspltisb     $tmp,0x0f
-
-       lvx             $ivec,0,$ivp            # load [unaligned] iv
-       lvsl            $inpperm,0,$ivp
-       lvx             $inptail,$idx,$ivp
-        vspltisb       $one,1
-       le?vxor         $inpperm,$inpperm,$tmp
-       vperm           $ivec,$ivec,$inptail,$inpperm
-        vsldoi         $one,$rndkey0,$one,1
-
-       neg             r11,$inp
-       ?lvsl           $keyperm,0,$key         # prepare for unaligned key
-       lwz             $rounds,240($key)
-
-       lvsr            $inpperm,0,r11          # prepare for unaligned load
-       lvx             $inptail,0,$inp
-       addi            $inp,$inp,15            # 15 is not a typo
-       le?vxor         $inpperm,$inpperm,$tmp
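-       # (Aside: lvx ignores the low four address bits, so the lvx/vperm
-       # pair implements the unaligned load; advancing $inp by 15 rather
-       # than 16 keeps the per-block lookahead lvx in the loop below from
-       # reading one quadword past the end of an aligned input buffer.)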
-
-       srwi            $rounds,$rounds,1
-       li              $idx,16
-       subi            $rounds,$rounds,1
-
-       ${UCMP}i        $len,8
-       bge             _aesp8_ctr32_encrypt8x
-
-       ?lvsr           $outperm,0,$out         # prepare for unaligned store
-       vspltisb        $outmask,-1
-       lvx             $outhead,0,$out
-       ?vperm          $outmask,$rndkey0,$outmask,$outperm
-       le?vxor         $outperm,$outperm,$tmp
-
-       lvx             $rndkey0,0,$key
-       mtctr           $rounds
-       lvx             $rndkey1,$idx,$key
-       addi            $idx,$idx,16
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vxor            $inout,$ivec,$rndkey0
-       lvx             $rndkey0,$idx,$key
-       addi            $idx,$idx,16
-       b               Loop_ctr32_enc
-
-.align 5
-Loop_ctr32_enc:
-       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
-       vcipher         $inout,$inout,$rndkey1
-       lvx             $rndkey1,$idx,$key
-       addi            $idx,$idx,16
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vcipher         $inout,$inout,$rndkey0
-       lvx             $rndkey0,$idx,$key
-       addi            $idx,$idx,16
-       bdnz            Loop_ctr32_enc
-
-       vadduqm         $ivec,$ivec,$one        # Kernel change for 128-bit
-        vmr            $dat,$inptail
-        lvx            $inptail,0,$inp
-        addi           $inp,$inp,16
-        subic.         $len,$len,1             # blocks--
-
-       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
-       vcipher         $inout,$inout,$rndkey1
-       lvx             $rndkey1,$idx,$key
-        vperm          $dat,$dat,$inptail,$inpperm
-        li             $idx,16
-       ?vperm          $rndkey1,$rndkey0,$rndkey1,$keyperm
-        lvx            $rndkey0,0,$key
-       vxor            $dat,$dat,$rndkey1      # last round key
-       vcipherlast     $inout,$inout,$dat
-
-        lvx            $rndkey1,$idx,$key
-        addi           $idx,$idx,16
-       vperm           $inout,$inout,$inout,$outperm
-       vsel            $dat,$outhead,$inout,$outmask
-        mtctr          $rounds
-        ?vperm         $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vmr             $outhead,$inout
-        vxor           $inout,$ivec,$rndkey0
-        lvx            $rndkey0,$idx,$key
-        addi           $idx,$idx,16
-       stvx            $dat,0,$out
-       addi            $out,$out,16
-       bne             Loop_ctr32_enc
-
-       addi            $out,$out,-1
-       lvx             $inout,0,$out           # redundant in aligned case
-       vsel            $inout,$outhead,$inout,$outmask
-       stvx            $inout,0,$out
-
-       mtspr           256,$vrsave
-       blr
-       .long           0
-       .byte           0,12,0x14,0,0,0,6,0
-       .long           0
-___
-#########################################################################
-{{     # Optimized CTR procedure                                       #
-my $key_="r11";
-my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
-my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
-my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
-my $rndkey0="v23";     # v24-v25 rotating buffer for first round keys
-                       # v26-v31 last 6 round keys
-my ($tmp,$keyperm)=($in3,$in4);        # aliases with "caller", redundant assignment
-my ($two,$three,$four)=($outhead,$outperm,$outmask);
-
-$code.=<<___;
-.align 5
-_aesp8_ctr32_encrypt8x:
-       $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
-       li              r10,`$FRAME+8*16+15`
-       li              r11,`$FRAME+8*16+31`
-       stvx            v20,r10,$sp             # ABI says so
-       addi            r10,r10,32
-       stvx            v21,r11,$sp
-       addi            r11,r11,32
-       stvx            v22,r10,$sp
-       addi            r10,r10,32
-       stvx            v23,r11,$sp
-       addi            r11,r11,32
-       stvx            v24,r10,$sp
-       addi            r10,r10,32
-       stvx            v25,r11,$sp
-       addi            r11,r11,32
-       stvx            v26,r10,$sp
-       addi            r10,r10,32
-       stvx            v27,r11,$sp
-       addi            r11,r11,32
-       stvx            v28,r10,$sp
-       addi            r10,r10,32
-       stvx            v29,r11,$sp
-       addi            r11,r11,32
-       stvx            v30,r10,$sp
-       stvx            v31,r11,$sp
-       li              r0,-1
-       stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
-       li              $x10,0x10
-       $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
-       li              $x20,0x20
-       $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
-       li              $x30,0x30
-       $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
-       li              $x40,0x40
-       $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
-       li              $x50,0x50
-       $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
-       li              $x60,0x60
-       $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
-       li              $x70,0x70
-       mtspr           256,r0
-
-       subi            $rounds,$rounds,3       # -4 in total
-
-       lvx             $rndkey0,$x00,$key      # load key schedule
-       lvx             v30,$x10,$key
-       addi            $key,$key,0x20
-       lvx             v31,$x00,$key
-       ?vperm          $rndkey0,$rndkey0,v30,$keyperm
-       addi            $key_,$sp,$FRAME+15
-       mtctr           $rounds
-
-Load_ctr32_enc_key:
-       ?vperm          v24,v30,v31,$keyperm
-       lvx             v30,$x10,$key
-       addi            $key,$key,0x20
-       stvx            v24,$x00,$key_          # off-load round[1]
-       ?vperm          v25,v31,v30,$keyperm
-       lvx             v31,$x00,$key
-       stvx            v25,$x10,$key_          # off-load round[2]
-       addi            $key_,$key_,0x20
-       bdnz            Load_ctr32_enc_key
-
-       lvx             v26,$x10,$key
-       ?vperm          v24,v30,v31,$keyperm
-       lvx             v27,$x20,$key
-       stvx            v24,$x00,$key_          # off-load round[3]
-       ?vperm          v25,v31,v26,$keyperm
-       lvx             v28,$x30,$key
-       stvx            v25,$x10,$key_          # off-load round[4]
-       addi            $key_,$sp,$FRAME+15     # rewind $key_
-       ?vperm          v26,v26,v27,$keyperm
-       lvx             v29,$x40,$key
-       ?vperm          v27,v27,v28,$keyperm
-       lvx             v30,$x50,$key
-       ?vperm          v28,v28,v29,$keyperm
-       lvx             v31,$x60,$key
-       ?vperm          v29,v29,v30,$keyperm
-       lvx             $out0,$x70,$key         # borrow $out0
-       ?vperm          v30,v30,v31,$keyperm
-       lvx             v24,$x00,$key_          # pre-load round[1]
-       ?vperm          v31,v31,$out0,$keyperm
-       lvx             v25,$x10,$key_          # pre-load round[2]
-
-       vadduqm         $two,$one,$one
-       subi            $inp,$inp,15            # undo "caller"
-       $SHL            $len,$len,4
-
-       vadduqm         $out1,$ivec,$one        # counter values ...
-       vadduqm         $out2,$ivec,$two        # (do all ctr adds as 128-bit)
-       vxor            $out0,$ivec,$rndkey0    # ... xored with rndkey[0]
-        le?li          $idx,8
-       vadduqm         $out3,$out1,$two
-       vxor            $out1,$out1,$rndkey0
-        le?lvsl        $inpperm,0,$idx
-       vadduqm         $out4,$out2,$two
-       vxor            $out2,$out2,$rndkey0
-        le?vspltisb    $tmp,0x0f
-       vadduqm         $out5,$out3,$two
-       vxor            $out3,$out3,$rndkey0
-        le?vxor        $inpperm,$inpperm,$tmp  # transform for lvx_u/stvx_u
-       vadduqm         $out6,$out4,$two
-       vxor            $out4,$out4,$rndkey0
-       vadduqm         $out7,$out5,$two
-       vxor            $out5,$out5,$rndkey0
-       vadduqm         $ivec,$out6,$two        # next counter value
-       vxor            $out6,$out6,$rndkey0
-       vxor            $out7,$out7,$rndkey0
-
-       mtctr           $rounds
-       b               Loop_ctr32_enc8x
-.align 5
-Loop_ctr32_enc8x:
-       vcipher         $out0,$out0,v24
-       vcipher         $out1,$out1,v24
-       vcipher         $out2,$out2,v24
-       vcipher         $out3,$out3,v24
-       vcipher         $out4,$out4,v24
-       vcipher         $out5,$out5,v24
-       vcipher         $out6,$out6,v24
-       vcipher         $out7,$out7,v24
-Loop_ctr32_enc8x_middle:
-       lvx             v24,$x20,$key_          # round[3]
-       addi            $key_,$key_,0x20
-
-       vcipher         $out0,$out0,v25
-       vcipher         $out1,$out1,v25
-       vcipher         $out2,$out2,v25
-       vcipher         $out3,$out3,v25
-       vcipher         $out4,$out4,v25
-       vcipher         $out5,$out5,v25
-       vcipher         $out6,$out6,v25
-       vcipher         $out7,$out7,v25
-       lvx             v25,$x10,$key_          # round[4]
-       bdnz            Loop_ctr32_enc8x
-
-       subic           r11,$len,256            # $len-256, borrow $key_
-       vcipher         $out0,$out0,v24
-       vcipher         $out1,$out1,v24
-       vcipher         $out2,$out2,v24
-       vcipher         $out3,$out3,v24
-       vcipher         $out4,$out4,v24
-       vcipher         $out5,$out5,v24
-       vcipher         $out6,$out6,v24
-       vcipher         $out7,$out7,v24
-
-       subfe           r0,r0,r0                # borrow?-1:0
-       vcipher         $out0,$out0,v25
-       vcipher         $out1,$out1,v25
-       vcipher         $out2,$out2,v25
-       vcipher         $out3,$out3,v25
-       vcipher         $out4,$out4,v25
-       vcipher         $out5,$out5,v25
-       vcipher         $out6,$out6,v25
-       vcipher         $out7,$out7,v25
-
-       and             r0,r0,r11
-       addi            $key_,$sp,$FRAME+15     # rewind $key_
-       vcipher         $out0,$out0,v26
-       vcipher         $out1,$out1,v26
-       vcipher         $out2,$out2,v26
-       vcipher         $out3,$out3,v26
-       vcipher         $out4,$out4,v26
-       vcipher         $out5,$out5,v26
-       vcipher         $out6,$out6,v26
-       vcipher         $out7,$out7,v26
-       lvx             v24,$x00,$key_          # re-pre-load round[1]
-
-       subic           $len,$len,129           # $len-=129
-       vcipher         $out0,$out0,v27
-       addi            $len,$len,1             # $len-=128 really
-       vcipher         $out1,$out1,v27
-       vcipher         $out2,$out2,v27
-       vcipher         $out3,$out3,v27
-       vcipher         $out4,$out4,v27
-       vcipher         $out5,$out5,v27
-       vcipher         $out6,$out6,v27
-       vcipher         $out7,$out7,v27
-       lvx             v25,$x10,$key_          # re-pre-load round[2]
-
-       vcipher         $out0,$out0,v28
-        lvx_u          $in0,$x00,$inp          # load input
-       vcipher         $out1,$out1,v28
-        lvx_u          $in1,$x10,$inp
-       vcipher         $out2,$out2,v28
-        lvx_u          $in2,$x20,$inp
-       vcipher         $out3,$out3,v28
-        lvx_u          $in3,$x30,$inp
-       vcipher         $out4,$out4,v28
-        lvx_u          $in4,$x40,$inp
-       vcipher         $out5,$out5,v28
-        lvx_u          $in5,$x50,$inp
-       vcipher         $out6,$out6,v28
-        lvx_u          $in6,$x60,$inp
-       vcipher         $out7,$out7,v28
-        lvx_u          $in7,$x70,$inp
-        addi           $inp,$inp,0x80
-
-       vcipher         $out0,$out0,v29
-        le?vperm       $in0,$in0,$in0,$inpperm
-       vcipher         $out1,$out1,v29
-        le?vperm       $in1,$in1,$in1,$inpperm
-       vcipher         $out2,$out2,v29
-        le?vperm       $in2,$in2,$in2,$inpperm
-       vcipher         $out3,$out3,v29
-        le?vperm       $in3,$in3,$in3,$inpperm
-       vcipher         $out4,$out4,v29
-        le?vperm       $in4,$in4,$in4,$inpperm
-       vcipher         $out5,$out5,v29
-        le?vperm       $in5,$in5,$in5,$inpperm
-       vcipher         $out6,$out6,v29
-        le?vperm       $in6,$in6,$in6,$inpperm
-       vcipher         $out7,$out7,v29
-        le?vperm       $in7,$in7,$in7,$inpperm
-
-       add             $inp,$inp,r0            # $inp is adjusted in such a
-                                               # way that at exit from the
-                                               # loop inX-in7 are loaded
-                                               # with last "words"
-       subfe.          r0,r0,r0                # borrow?-1:0
-       vcipher         $out0,$out0,v30
-        vxor           $in0,$in0,v31           # xor with last round key
-       vcipher         $out1,$out1,v30
-        vxor           $in1,$in1,v31
-       vcipher         $out2,$out2,v30
-        vxor           $in2,$in2,v31
-       vcipher         $out3,$out3,v30
-        vxor           $in3,$in3,v31
-       vcipher         $out4,$out4,v30
-        vxor           $in4,$in4,v31
-       vcipher         $out5,$out5,v30
-        vxor           $in5,$in5,v31
-       vcipher         $out6,$out6,v30
-        vxor           $in6,$in6,v31
-       vcipher         $out7,$out7,v30
-        vxor           $in7,$in7,v31
-
-       bne             Lctr32_enc8x_break      # did $len-129 borrow?
-
-       vcipherlast     $in0,$out0,$in0
-       vcipherlast     $in1,$out1,$in1
-        vadduqm        $out1,$ivec,$one        # counter values ...
-       vcipherlast     $in2,$out2,$in2
-        vadduqm        $out2,$ivec,$two
-        vxor           $out0,$ivec,$rndkey0    # ... xored with rndkey[0]
-       vcipherlast     $in3,$out3,$in3
-        vadduqm        $out3,$out1,$two
-        vxor           $out1,$out1,$rndkey0
-       vcipherlast     $in4,$out4,$in4
-        vadduqm        $out4,$out2,$two
-        vxor           $out2,$out2,$rndkey0
-       vcipherlast     $in5,$out5,$in5
-        vadduqm        $out5,$out3,$two
-        vxor           $out3,$out3,$rndkey0
-       vcipherlast     $in6,$out6,$in6
-        vadduqm        $out6,$out4,$two
-        vxor           $out4,$out4,$rndkey0
-       vcipherlast     $in7,$out7,$in7
-        vadduqm        $out7,$out5,$two
-        vxor           $out5,$out5,$rndkey0
-       le?vperm        $in0,$in0,$in0,$inpperm
-        vadduqm        $ivec,$out6,$two        # next counter value
-        vxor           $out6,$out6,$rndkey0
-       le?vperm        $in1,$in1,$in1,$inpperm
-        vxor           $out7,$out7,$rndkey0
-       mtctr           $rounds
-
-        vcipher        $out0,$out0,v24
-       stvx_u          $in0,$x00,$out
-       le?vperm        $in2,$in2,$in2,$inpperm
-        vcipher        $out1,$out1,v24
-       stvx_u          $in1,$x10,$out
-       le?vperm        $in3,$in3,$in3,$inpperm
-        vcipher        $out2,$out2,v24
-       stvx_u          $in2,$x20,$out
-       le?vperm        $in4,$in4,$in4,$inpperm
-        vcipher        $out3,$out3,v24
-       stvx_u          $in3,$x30,$out
-       le?vperm        $in5,$in5,$in5,$inpperm
-        vcipher        $out4,$out4,v24
-       stvx_u          $in4,$x40,$out
-       le?vperm        $in6,$in6,$in6,$inpperm
-        vcipher        $out5,$out5,v24
-       stvx_u          $in5,$x50,$out
-       le?vperm        $in7,$in7,$in7,$inpperm
-        vcipher        $out6,$out6,v24
-       stvx_u          $in6,$x60,$out
-        vcipher        $out7,$out7,v24
-       stvx_u          $in7,$x70,$out
-       addi            $out,$out,0x80
-
-       b               Loop_ctr32_enc8x_middle
-
-.align 5
-Lctr32_enc8x_break:
-       cmpwi           $len,-0x60
-       blt             Lctr32_enc8x_one
-       nop
-       beq             Lctr32_enc8x_two
-       cmpwi           $len,-0x40
-       blt             Lctr32_enc8x_three
-       nop
-       beq             Lctr32_enc8x_four
-       cmpwi           $len,-0x20
-       blt             Lctr32_enc8x_five
-       nop
-       beq             Lctr32_enc8x_six
-       cmpwi           $len,0x00
-       blt             Lctr32_enc8x_seven
-
-Lctr32_enc8x_eight:
-       vcipherlast     $out0,$out0,$in0
-       vcipherlast     $out1,$out1,$in1
-       vcipherlast     $out2,$out2,$in2
-       vcipherlast     $out3,$out3,$in3
-       vcipherlast     $out4,$out4,$in4
-       vcipherlast     $out5,$out5,$in5
-       vcipherlast     $out6,$out6,$in6
-       vcipherlast     $out7,$out7,$in7
-
-       le?vperm        $out0,$out0,$out0,$inpperm
-       le?vperm        $out1,$out1,$out1,$inpperm
-       stvx_u          $out0,$x00,$out
-       le?vperm        $out2,$out2,$out2,$inpperm
-       stvx_u          $out1,$x10,$out
-       le?vperm        $out3,$out3,$out3,$inpperm
-       stvx_u          $out2,$x20,$out
-       le?vperm        $out4,$out4,$out4,$inpperm
-       stvx_u          $out3,$x30,$out
-       le?vperm        $out5,$out5,$out5,$inpperm
-       stvx_u          $out4,$x40,$out
-       le?vperm        $out6,$out6,$out6,$inpperm
-       stvx_u          $out5,$x50,$out
-       le?vperm        $out7,$out7,$out7,$inpperm
-       stvx_u          $out6,$x60,$out
-       stvx_u          $out7,$x70,$out
-       addi            $out,$out,0x80
-       b               Lctr32_enc8x_done
-
-.align 5
-Lctr32_enc8x_seven:
-       vcipherlast     $out0,$out0,$in1
-       vcipherlast     $out1,$out1,$in2
-       vcipherlast     $out2,$out2,$in3
-       vcipherlast     $out3,$out3,$in4
-       vcipherlast     $out4,$out4,$in5
-       vcipherlast     $out5,$out5,$in6
-       vcipherlast     $out6,$out6,$in7
-
-       le?vperm        $out0,$out0,$out0,$inpperm
-       le?vperm        $out1,$out1,$out1,$inpperm
-       stvx_u          $out0,$x00,$out
-       le?vperm        $out2,$out2,$out2,$inpperm
-       stvx_u          $out1,$x10,$out
-       le?vperm        $out3,$out3,$out3,$inpperm
-       stvx_u          $out2,$x20,$out
-       le?vperm        $out4,$out4,$out4,$inpperm
-       stvx_u          $out3,$x30,$out
-       le?vperm        $out5,$out5,$out5,$inpperm
-       stvx_u          $out4,$x40,$out
-       le?vperm        $out6,$out6,$out6,$inpperm
-       stvx_u          $out5,$x50,$out
-       stvx_u          $out6,$x60,$out
-       addi            $out,$out,0x70
-       b               Lctr32_enc8x_done
-
-.align 5
-Lctr32_enc8x_six:
-       vcipherlast     $out0,$out0,$in2
-       vcipherlast     $out1,$out1,$in3
-       vcipherlast     $out2,$out2,$in4
-       vcipherlast     $out3,$out3,$in5
-       vcipherlast     $out4,$out4,$in6
-       vcipherlast     $out5,$out5,$in7
-
-       le?vperm        $out0,$out0,$out0,$inpperm
-       le?vperm        $out1,$out1,$out1,$inpperm
-       stvx_u          $out0,$x00,$out
-       le?vperm        $out2,$out2,$out2,$inpperm
-       stvx_u          $out1,$x10,$out
-       le?vperm        $out3,$out3,$out3,$inpperm
-       stvx_u          $out2,$x20,$out
-       le?vperm        $out4,$out4,$out4,$inpperm
-       stvx_u          $out3,$x30,$out
-       le?vperm        $out5,$out5,$out5,$inpperm
-       stvx_u          $out4,$x40,$out
-       stvx_u          $out5,$x50,$out
-       addi            $out,$out,0x60
-       b               Lctr32_enc8x_done
-
-.align 5
-Lctr32_enc8x_five:
-       vcipherlast     $out0,$out0,$in3
-       vcipherlast     $out1,$out1,$in4
-       vcipherlast     $out2,$out2,$in5
-       vcipherlast     $out3,$out3,$in6
-       vcipherlast     $out4,$out4,$in7
-
-       le?vperm        $out0,$out0,$out0,$inpperm
-       le?vperm        $out1,$out1,$out1,$inpperm
-       stvx_u          $out0,$x00,$out
-       le?vperm        $out2,$out2,$out2,$inpperm
-       stvx_u          $out1,$x10,$out
-       le?vperm        $out3,$out3,$out3,$inpperm
-       stvx_u          $out2,$x20,$out
-       le?vperm        $out4,$out4,$out4,$inpperm
-       stvx_u          $out3,$x30,$out
-       stvx_u          $out4,$x40,$out
-       addi            $out,$out,0x50
-       b               Lctr32_enc8x_done
-
-.align 5
-Lctr32_enc8x_four:
-       vcipherlast     $out0,$out0,$in4
-       vcipherlast     $out1,$out1,$in5
-       vcipherlast     $out2,$out2,$in6
-       vcipherlast     $out3,$out3,$in7
-
-       le?vperm        $out0,$out0,$out0,$inpperm
-       le?vperm        $out1,$out1,$out1,$inpperm
-       stvx_u          $out0,$x00,$out
-       le?vperm        $out2,$out2,$out2,$inpperm
-       stvx_u          $out1,$x10,$out
-       le?vperm        $out3,$out3,$out3,$inpperm
-       stvx_u          $out2,$x20,$out
-       stvx_u          $out3,$x30,$out
-       addi            $out,$out,0x40
-       b               Lctr32_enc8x_done
-
-.align 5
-Lctr32_enc8x_three:
-       vcipherlast     $out0,$out0,$in5
-       vcipherlast     $out1,$out1,$in6
-       vcipherlast     $out2,$out2,$in7
-
-       le?vperm        $out0,$out0,$out0,$inpperm
-       le?vperm        $out1,$out1,$out1,$inpperm
-       stvx_u          $out0,$x00,$out
-       le?vperm        $out2,$out2,$out2,$inpperm
-       stvx_u          $out1,$x10,$out
-       stvx_u          $out2,$x20,$out
-       addi            $out,$out,0x30
-       b               Lctr32_enc8x_done
-
-.align 5
-Lctr32_enc8x_two:
-       vcipherlast     $out0,$out0,$in6
-       vcipherlast     $out1,$out1,$in7
-
-       le?vperm        $out0,$out0,$out0,$inpperm
-       le?vperm        $out1,$out1,$out1,$inpperm
-       stvx_u          $out0,$x00,$out
-       stvx_u          $out1,$x10,$out
-       addi            $out,$out,0x20
-       b               Lctr32_enc8x_done
-
-.align 5
-Lctr32_enc8x_one:
-       vcipherlast     $out0,$out0,$in7
-
-       le?vperm        $out0,$out0,$out0,$inpperm
-       stvx_u          $out0,0,$out
-       addi            $out,$out,0x10
-
-Lctr32_enc8x_done:
-       li              r10,`$FRAME+15`
-       li              r11,`$FRAME+31`
-       stvx            $inpperm,r10,$sp        # wipe copies of round keys
-       addi            r10,r10,32
-       stvx            $inpperm,r11,$sp
-       addi            r11,r11,32
-       stvx            $inpperm,r10,$sp
-       addi            r10,r10,32
-       stvx            $inpperm,r11,$sp
-       addi            r11,r11,32
-       stvx            $inpperm,r10,$sp
-       addi            r10,r10,32
-       stvx            $inpperm,r11,$sp
-       addi            r11,r11,32
-       stvx            $inpperm,r10,$sp
-       addi            r10,r10,32
-       stvx            $inpperm,r11,$sp
-       addi            r11,r11,32
-
-       mtspr           256,$vrsave
-       lvx             v20,r10,$sp             # ABI says so
-       addi            r10,r10,32
-       lvx             v21,r11,$sp
-       addi            r11,r11,32
-       lvx             v22,r10,$sp
-       addi            r10,r10,32
-       lvx             v23,r11,$sp
-       addi            r11,r11,32
-       lvx             v24,r10,$sp
-       addi            r10,r10,32
-       lvx             v25,r11,$sp
-       addi            r11,r11,32
-       lvx             v26,r10,$sp
-       addi            r10,r10,32
-       lvx             v27,r11,$sp
-       addi            r11,r11,32
-       lvx             v28,r10,$sp
-       addi            r10,r10,32
-       lvx             v29,r11,$sp
-       addi            r11,r11,32
-       lvx             v30,r10,$sp
-       lvx             v31,r11,$sp
-       $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
-       $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
-       $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
-       $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
-       $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
-       $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
-       addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
-       blr
-       .long           0
-       .byte           0,12,0x14,0,0x80,6,6,0
-       .long           0
-.size  .${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
-___
-}}     }}}
-
-#########################################################################
-{{{    # XTS procedures                                                #
-# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len,  #
-#                             const AES_KEY *key1, const AES_KEY *key2,        #
-#                             [const] unsigned char iv[16]);           #
-# If $key2 is NULL, then a "tweak chaining" mode is engaged, in which  #
-# the input tweak value is assumed to be encrypted already, and the    #
-# last tweak value, suitable for a consecutive call on the same chunk  #
-# of data, is written back to the original buffer. In addition, in     #
-# "tweak chaining" mode only complete input blocks are processed.      #
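-#                                                                      #
-# A rough usage sketch (C-level pseudocode matching the prototype      #
-# above; AES_KEY setup via the matching set_encrypt_key routine is     #
-# assumed):                                                            #
-#                                                                      #
-#   /* one-shot: key2 first encrypts the raw tweak passed in iv[] */   #
-#   aes_p8_xts_encrypt(in, out, len, &key1, &key2, iv);                #
-#                                                                      #
-#   /* tweak chaining: iv[] already holds an encrypted tweak and is    #
-#      updated in place for a consecutive call on the same stream */   #
-#   aes_p8_xts_encrypt(in2, out2, len2, &key1, NULL, iv);              #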
-
-my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) =    map("r$_",(3..10));
-my ($rndkey0,$rndkey1,$inout) =                                map("v$_",(0..2));
-my ($output,$inptail,$inpperm,$leperm,$keyperm) =      map("v$_",(3..7));
-my ($tweak,$seven,$eighty7,$tmp,$tweak1) =             map("v$_",(8..12));
-my $taillen = $key2;
-
-   ($inp,$idx) = ($idx,$inp);                          # reassign
-
-$code.=<<___;
-.globl .${prefix}_xts_encrypt
-       mr              $inp,r3                         # reassign
-       li              r3,-1
-       ${UCMP}i        $len,16
-       bltlr-
-
-       lis             r0,0xfff0
-       mfspr           r12,256                         # save vrsave
-       li              r11,0
-       mtspr           256,r0
-
-       vspltisb        $seven,0x07                     # 0x070707..07
-       le?lvsl         $leperm,r11,r11
-       le?vspltisb     $tmp,0x0f
-       le?vxor         $leperm,$leperm,$seven
-
-       li              $idx,15
-       lvx             $tweak,0,$ivp                   # load [unaligned] iv
-       lvsl            $inpperm,0,$ivp
-       lvx             $inptail,$idx,$ivp
-       le?vxor         $inpperm,$inpperm,$tmp
-       vperm           $tweak,$tweak,$inptail,$inpperm
-
-       neg             r11,$inp
-       lvsr            $inpperm,0,r11                  # prepare for unaligned load
-       lvx             $inout,0,$inp
-       addi            $inp,$inp,15                    # 15 is not a typo
-       le?vxor         $inpperm,$inpperm,$tmp
-
-       ${UCMP}i        $key2,0                         # key2==NULL?
-       beq             Lxts_enc_no_key2
-
-       ?lvsl           $keyperm,0,$key2                # prepare for unaligned key
-       lwz             $rounds,240($key2)
-       srwi            $rounds,$rounds,1
-       subi            $rounds,$rounds,1
-       li              $idx,16
-
-       lvx             $rndkey0,0,$key2
-       lvx             $rndkey1,$idx,$key2
-       addi            $idx,$idx,16
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vxor            $tweak,$tweak,$rndkey0
-       lvx             $rndkey0,$idx,$key2
-       addi            $idx,$idx,16
-       mtctr           $rounds
-
-Ltweak_xts_enc:
-       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
-       vcipher         $tweak,$tweak,$rndkey1
-       lvx             $rndkey1,$idx,$key2
-       addi            $idx,$idx,16
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vcipher         $tweak,$tweak,$rndkey0
-       lvx             $rndkey0,$idx,$key2
-       addi            $idx,$idx,16
-       bdnz            Ltweak_xts_enc
-
-       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
-       vcipher         $tweak,$tweak,$rndkey1
-       lvx             $rndkey1,$idx,$key2
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vcipherlast     $tweak,$tweak,$rndkey0
-
-       li              $ivp,0                          # don't chain the tweak
-       b               Lxts_enc
-
-Lxts_enc_no_key2:
-       li              $idx,-16
-       and             $len,$len,$idx                  # in "tweak chaining"
-                                                       # mode only complete
-                                                       # blocks are processed
-Lxts_enc:
-       lvx             $inptail,0,$inp
-       addi            $inp,$inp,16
-
-       ?lvsl           $keyperm,0,$key1                # prepare for unaligned key
-       lwz             $rounds,240($key1)
-       srwi            $rounds,$rounds,1
-       subi            $rounds,$rounds,1
-       li              $idx,16
-
-       vslb            $eighty7,$seven,$seven          # 0x808080..80
-       vor             $eighty7,$eighty7,$seven        # 0x878787..87
-       vspltisb        $tmp,1                          # 0x010101..01
-       vsldoi          $eighty7,$eighty7,$tmp,15       # 0x870101..01
-
-       ${UCMP}i        $len,96
-       bge             _aesp8_xts_encrypt6x
-
-       andi.           $taillen,$len,15
-       subic           r0,$len,32
-       subi            $taillen,$taillen,16
-       subfe           r0,r0,r0
-       and             r0,r0,$taillen
-       add             $inp,$inp,r0
-
-       lvx             $rndkey0,0,$key1
-       lvx             $rndkey1,$idx,$key1
-       addi            $idx,$idx,16
-       vperm           $inout,$inout,$inptail,$inpperm
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vxor            $inout,$inout,$tweak
-       vxor            $inout,$inout,$rndkey0
-       lvx             $rndkey0,$idx,$key1
-       addi            $idx,$idx,16
-       mtctr           $rounds
-       b               Loop_xts_enc
-
-.align 5
-Loop_xts_enc:
-       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
-       vcipher         $inout,$inout,$rndkey1
-       lvx             $rndkey1,$idx,$key1
-       addi            $idx,$idx,16
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vcipher         $inout,$inout,$rndkey0
-       lvx             $rndkey0,$idx,$key1
-       addi            $idx,$idx,16
-       bdnz            Loop_xts_enc
-
-       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
-       vcipher         $inout,$inout,$rndkey1
-       lvx             $rndkey1,$idx,$key1
-       li              $idx,16
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vxor            $rndkey0,$rndkey0,$tweak
-       vcipherlast     $output,$inout,$rndkey0
-
-       le?vperm        $tmp,$output,$output,$leperm
-       be?nop
-       le?stvx_u       $tmp,0,$out
-       be?stvx_u       $output,0,$out
-       addi            $out,$out,16
-
-       subic.          $len,$len,16
-       beq             Lxts_enc_done
-
-       vmr             $inout,$inptail
-       lvx             $inptail,0,$inp
-       addi            $inp,$inp,16
-       lvx             $rndkey0,0,$key1
-       lvx             $rndkey1,$idx,$key1
-       addi            $idx,$idx,16
-
-       subic           r0,$len,32
-       subfe           r0,r0,r0
-       and             r0,r0,$taillen
-       add             $inp,$inp,r0
-
-       vsrab           $tmp,$tweak,$seven              # next tweak value
-       vaddubm         $tweak,$tweak,$tweak
-       vsldoi          $tmp,$tmp,$tmp,15
-       vand            $tmp,$tmp,$eighty7
-       vxor            $tweak,$tweak,$tmp
-
-       vperm           $inout,$inout,$inptail,$inpperm
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vxor            $inout,$inout,$tweak
-       vxor            $output,$output,$rndkey0        # just in case $len<16
-       vxor            $inout,$inout,$rndkey0
-       lvx             $rndkey0,$idx,$key1
-       addi            $idx,$idx,16
-
-       mtctr           $rounds
-       ${UCMP}i        $len,16
-       bge             Loop_xts_enc
-
-       vxor            $output,$output,$tweak
-       lvsr            $inpperm,0,$len                 # $inpperm is no longer needed
-       vxor            $inptail,$inptail,$inptail      # $inptail is no longer needed
-       vspltisb        $tmp,-1
-       vperm           $inptail,$inptail,$tmp,$inpperm
-       vsel            $inout,$inout,$output,$inptail
-
-       subi            r11,$out,17
-       subi            $out,$out,16
-       mtctr           $len
-       li              $len,16
-Loop_xts_enc_steal:
-       lbzu            r0,1(r11)
-       stb             r0,16(r11)
-       bdnz            Loop_xts_enc_steal
-
-       mtctr           $rounds
-       b               Loop_xts_enc                    # one more time...
-
-Lxts_enc_done:
-       ${UCMP}i        $ivp,0
-       beq             Lxts_enc_ret
-
-       vsrab           $tmp,$tweak,$seven              # next tweak value
-       vaddubm         $tweak,$tweak,$tweak
-       vsldoi          $tmp,$tmp,$tmp,15
-       vand            $tmp,$tmp,$eighty7
-       vxor            $tweak,$tweak,$tmp
-
-       le?vperm        $tweak,$tweak,$tweak,$leperm
-       stvx_u          $tweak,0,$ivp
-
-Lxts_enc_ret:
-       mtspr           256,r12                         # restore vrsave
-       li              r3,0
-       blr
-       .long           0
-       .byte           0,12,0x04,0,0x80,6,6,0
-       .long           0
-.size  .${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
-
-.globl .${prefix}_xts_decrypt
-       mr              $inp,r3                         # reassign
-       li              r3,-1
-       ${UCMP}i        $len,16
-       bltlr-
-
-       lis             r0,0xfff8
-       mfspr           r12,256                         # save vrsave
-       li              r11,0
-       mtspr           256,r0
-
-       andi.           r0,$len,15
-       neg             r0,r0
-       andi.           r0,r0,16
-       sub             $len,$len,r0
-
-       vspltisb        $seven,0x07                     # 0x070707..07
-       le?lvsl         $leperm,r11,r11
-       le?vspltisb     $tmp,0x0f
-       le?vxor         $leperm,$leperm,$seven
-
-       li              $idx,15
-       lvx             $tweak,0,$ivp                   # load [unaligned] iv
-       lvsl            $inpperm,0,$ivp
-       lvx             $inptail,$idx,$ivp
-       le?vxor         $inpperm,$inpperm,$tmp
-       vperm           $tweak,$tweak,$inptail,$inpperm
-
-       neg             r11,$inp
-       lvsr            $inpperm,0,r11                  # prepare for unaligned load
-       lvx             $inout,0,$inp
-       addi            $inp,$inp,15                    # 15 is not a typo
-       le?vxor         $inpperm,$inpperm,$tmp
-
-       ${UCMP}i        $key2,0                         # key2==NULL?
-       beq             Lxts_dec_no_key2
-
-       ?lvsl           $keyperm,0,$key2                # prepare for unaligned key
-       lwz             $rounds,240($key2)
-       srwi            $rounds,$rounds,1
-       subi            $rounds,$rounds,1
-       li              $idx,16
-
-       lvx             $rndkey0,0,$key2
-       lvx             $rndkey1,$idx,$key2
-       addi            $idx,$idx,16
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vxor            $tweak,$tweak,$rndkey0
-       lvx             $rndkey0,$idx,$key2
-       addi            $idx,$idx,16
-       mtctr           $rounds
-
-Ltweak_xts_dec:
-       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
-       vcipher         $tweak,$tweak,$rndkey1
-       lvx             $rndkey1,$idx,$key2
-       addi            $idx,$idx,16
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vcipher         $tweak,$tweak,$rndkey0
-       lvx             $rndkey0,$idx,$key2
-       addi            $idx,$idx,16
-       bdnz            Ltweak_xts_dec
-
-       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
-       vcipher         $tweak,$tweak,$rndkey1
-       lvx             $rndkey1,$idx,$key2
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vcipherlast     $tweak,$tweak,$rndkey0
-
-       li              $ivp,0                          # don't chain the tweak
-       b               Lxts_dec
-
-Lxts_dec_no_key2:
-       neg             $idx,$len
-       andi.           $idx,$idx,15
-       add             $len,$len,$idx                  # in "tweak chaining"
-                                                       # mode only complete
-                                                       # blocks are processed
-Lxts_dec:
-       lvx             $inptail,0,$inp
-       addi            $inp,$inp,16
-
-       ?lvsl           $keyperm,0,$key1                # prepare for unaligned key
-       lwz             $rounds,240($key1)
-       srwi            $rounds,$rounds,1
-       subi            $rounds,$rounds,1
-       li              $idx,16
-
-       vslb            $eighty7,$seven,$seven          # 0x808080..80
-       vor             $eighty7,$eighty7,$seven        # 0x878787..87
-       vspltisb        $tmp,1                          # 0x010101..01
-       vsldoi          $eighty7,$eighty7,$tmp,15       # 0x870101..01
-
-       ${UCMP}i        $len,96
-       bge             _aesp8_xts_decrypt6x
-
-       lvx             $rndkey0,0,$key1
-       lvx             $rndkey1,$idx,$key1
-       addi            $idx,$idx,16
-       vperm           $inout,$inout,$inptail,$inpperm
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vxor            $inout,$inout,$tweak
-       vxor            $inout,$inout,$rndkey0
-       lvx             $rndkey0,$idx,$key1
-       addi            $idx,$idx,16
-       mtctr           $rounds
-
-       ${UCMP}i        $len,16
-       blt             Ltail_xts_dec
-       be?b            Loop_xts_dec
-
-.align 5
-Loop_xts_dec:
-       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
-       vncipher        $inout,$inout,$rndkey1
-       lvx             $rndkey1,$idx,$key1
-       addi            $idx,$idx,16
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vncipher        $inout,$inout,$rndkey0
-       lvx             $rndkey0,$idx,$key1
-       addi            $idx,$idx,16
-       bdnz            Loop_xts_dec
-
-       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
-       vncipher        $inout,$inout,$rndkey1
-       lvx             $rndkey1,$idx,$key1
-       li              $idx,16
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vxor            $rndkey0,$rndkey0,$tweak
-       vncipherlast    $output,$inout,$rndkey0
-
-       le?vperm        $tmp,$output,$output,$leperm
-       be?nop
-       le?stvx_u       $tmp,0,$out
-       be?stvx_u       $output,0,$out
-       addi            $out,$out,16
-
-       subic.          $len,$len,16
-       beq             Lxts_dec_done
-
-       vmr             $inout,$inptail
-       lvx             $inptail,0,$inp
-       addi            $inp,$inp,16
-       lvx             $rndkey0,0,$key1
-       lvx             $rndkey1,$idx,$key1
-       addi            $idx,$idx,16
-
-       vsrab           $tmp,$tweak,$seven              # next tweak value
-       vaddubm         $tweak,$tweak,$tweak
-       vsldoi          $tmp,$tmp,$tmp,15
-       vand            $tmp,$tmp,$eighty7
-       vxor            $tweak,$tweak,$tmp
-
-       vperm           $inout,$inout,$inptail,$inpperm
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vxor            $inout,$inout,$tweak
-       vxor            $inout,$inout,$rndkey0
-       lvx             $rndkey0,$idx,$key1
-       addi            $idx,$idx,16
-
-       mtctr           $rounds
-       ${UCMP}i        $len,16
-       bge             Loop_xts_dec
-
-Ltail_xts_dec:
-       vsrab           $tmp,$tweak,$seven              # next tweak value
-       vaddubm         $tweak1,$tweak,$tweak
-       vsldoi          $tmp,$tmp,$tmp,15
-       vand            $tmp,$tmp,$eighty7
-       vxor            $tweak1,$tweak1,$tmp
-
-       subi            $inp,$inp,16
-       add             $inp,$inp,$len
-
-       vxor            $inout,$inout,$tweak            # :-(
-       vxor            $inout,$inout,$tweak1           # :-)
-
-Loop_xts_dec_short:
-       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
-       vncipher        $inout,$inout,$rndkey1
-       lvx             $rndkey1,$idx,$key1
-       addi            $idx,$idx,16
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vncipher        $inout,$inout,$rndkey0
-       lvx             $rndkey0,$idx,$key1
-       addi            $idx,$idx,16
-       bdnz            Loop_xts_dec_short
-
-       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
-       vncipher        $inout,$inout,$rndkey1
-       lvx             $rndkey1,$idx,$key1
-       li              $idx,16
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-       vxor            $rndkey0,$rndkey0,$tweak1
-       vncipherlast    $output,$inout,$rndkey0
-
-       le?vperm        $tmp,$output,$output,$leperm
-       be?nop
-       le?stvx_u       $tmp,0,$out
-       be?stvx_u       $output,0,$out
-
-       vmr             $inout,$inptail
-       lvx             $inptail,0,$inp
-       #addi           $inp,$inp,16
-       lvx             $rndkey0,0,$key1
-       lvx             $rndkey1,$idx,$key1
-       addi            $idx,$idx,16
-       vperm           $inout,$inout,$inptail,$inpperm
-       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
-
-       lvsr            $inpperm,0,$len                 # $inpperm is no longer needed
-       vxor            $inptail,$inptail,$inptail      # $inptail is no longer needed
-       vspltisb        $tmp,-1
-       vperm           $inptail,$inptail,$tmp,$inpperm
-       vsel            $inout,$inout,$output,$inptail
-
-       vxor            $rndkey0,$rndkey0,$tweak
-       vxor            $inout,$inout,$rndkey0
-       lvx             $rndkey0,$idx,$key1
-       addi            $idx,$idx,16
-
-       subi            r11,$out,1
-       mtctr           $len
-       li              $len,16
-Loop_xts_dec_steal:
-       lbzu            r0,1(r11)
-       stb             r0,16(r11)
-       bdnz            Loop_xts_dec_steal
-
-       mtctr           $rounds
-       b               Loop_xts_dec                    # one more time...
-
-Lxts_dec_done:
-       ${UCMP}i        $ivp,0
-       beq             Lxts_dec_ret
-
-       vsrab           $tmp,$tweak,$seven              # next tweak value
-       vaddubm         $tweak,$tweak,$tweak
-       vsldoi          $tmp,$tmp,$tmp,15
-       vand            $tmp,$tmp,$eighty7
-       vxor            $tweak,$tweak,$tmp
-
-       le?vperm        $tweak,$tweak,$tweak,$leperm
-       stvx_u          $tweak,0,$ivp
-
-Lxts_dec_ret:
-       mtspr           256,r12                         # restore vrsave
-       li              r3,0
-       blr
-       .long           0
-       .byte           0,12,0x04,0,0x80,6,6,0
-       .long           0
-.size  .${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
-___
-#########################################################################
-{{     # Optimized XTS procedures                                      #
-my $key_=$key2;
-my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31));
-    $x00=0 if ($flavour =~ /osx/);
-my ($in0,  $in1,  $in2,  $in3,  $in4,  $in5 )=map("v$_",(0..5));
-my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
-my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
-my $rndkey0="v23";     # v24-v25 rotating buffer for first round keys
-                       # v26-v31 last 6 round keys
-my ($keyperm)=($out0); # aliases with "caller", redundant assignment
-my $taillen=$x70;
-
-$code.=<<___;
-.align 5
-_aesp8_xts_encrypt6x:
-       $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
-       mflr            r11
-       li              r7,`$FRAME+8*16+15`
-       li              r3,`$FRAME+8*16+31`
-       $PUSH           r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
-       stvx            v20,r7,$sp              # ABI says so
-       addi            r7,r7,32
-       stvx            v21,r3,$sp
-       addi            r3,r3,32
-       stvx            v22,r7,$sp
-       addi            r7,r7,32
-       stvx            v23,r3,$sp
-       addi            r3,r3,32
-       stvx            v24,r7,$sp
-       addi            r7,r7,32
-       stvx            v25,r3,$sp
-       addi            r3,r3,32
-       stvx            v26,r7,$sp
-       addi            r7,r7,32
-       stvx            v27,r3,$sp
-       addi            r3,r3,32
-       stvx            v28,r7,$sp
-       addi            r7,r7,32
-       stvx            v29,r3,$sp
-       addi            r3,r3,32
-       stvx            v30,r7,$sp
-       stvx            v31,r3,$sp
-       li              r0,-1
-       stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
-       li              $x10,0x10
-       $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
-       li              $x20,0x20
-       $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
-       li              $x30,0x30
-       $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
-       li              $x40,0x40
-       $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
-       li              $x50,0x50
-       $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
-       li              $x60,0x60
-       $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
-       li              $x70,0x70
-       mtspr           256,r0
-
-       xxlor           2, 32+$eighty7, 32+$eighty7
-       vsldoi          $eighty7,$tmp,$eighty7,1        # 0x010101..87
-       xxlor           1, 32+$eighty7, 32+$eighty7
-
-       # Load XOR Lconsts.
-       mr              $x70, r6
-       bl              Lconsts
-       lxvw4x          0, $x40, r6             # load XOR contents
-       mr              r6, $x70
-       li              $x70,0x70
-
-       subi            $rounds,$rounds,3       # -4 in total
-
-       lvx             $rndkey0,$x00,$key1     # load key schedule
-       lvx             v30,$x10,$key1
-       addi            $key1,$key1,0x20
-       lvx             v31,$x00,$key1
-       ?vperm          $rndkey0,$rndkey0,v30,$keyperm
-       addi            $key_,$sp,$FRAME+15
-       mtctr           $rounds
-
-Load_xts_enc_key:
-       ?vperm          v24,v30,v31,$keyperm
-       lvx             v30,$x10,$key1
-       addi            $key1,$key1,0x20
-       stvx            v24,$x00,$key_          # off-load round[1]
-       ?vperm          v25,v31,v30,$keyperm
-       lvx             v31,$x00,$key1
-       stvx            v25,$x10,$key_          # off-load round[2]
-       addi            $key_,$key_,0x20
-       bdnz            Load_xts_enc_key
-
-       lvx             v26,$x10,$key1
-       ?vperm          v24,v30,v31,$keyperm
-       lvx             v27,$x20,$key1
-       stvx            v24,$x00,$key_          # off-load round[3]
-       ?vperm          v25,v31,v26,$keyperm
-       lvx             v28,$x30,$key1
-       stvx            v25,$x10,$key_          # off-load round[4]
-       addi            $key_,$sp,$FRAME+15     # rewind $key_
-       ?vperm          v26,v26,v27,$keyperm
-       lvx             v29,$x40,$key1
-       ?vperm          v27,v27,v28,$keyperm
-       lvx             v30,$x50,$key1
-       ?vperm          v28,v28,v29,$keyperm
-       lvx             v31,$x60,$key1
-       ?vperm          v29,v29,v30,$keyperm
-       lvx             $twk5,$x70,$key1        # borrow $twk5
-       ?vperm          v30,v30,v31,$keyperm
-       lvx             v24,$x00,$key_          # pre-load round[1]
-       ?vperm          v31,v31,$twk5,$keyperm
-       lvx             v25,$x10,$key_          # pre-load round[2]
-
-       # Switch to the following code sequence, which uses 0x010101..87 to generate the tweak.
-       #     eighty7 = 0x010101..87
-       # vsrab         tmp, tweak, seven       # next tweak value, right shift 7 bits
-       # vand          tmp, tmp, eighty7       # last byte with carry
-       # vaddubm       tweak, tweak, tweak     # left shift 1 bit (x2)
-       # xxlor         vsx, 0, 0
-       # vpermxor      tweak, tweak, tmp, vsx
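-       #
-       # Equivalently, each update multiplies the tweak by x in
-       # GF(2^128) with the XTS polynomial x^128+x^7+x^2+x+1; as
-       # pseudocode on the 128-bit value t:
-       #
-       #     carry = msb(t);   /* bit shifted out at the top          */
-       #     t = t << 1;       /* doubling (vaddubm + carry fix-up)   */
-       #     if (carry)
-       #             t ^= 0x87;        /* fold x^128 back in          */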
-
-        vperm          $in0,$inout,$inptail,$inpperm
-        subi           $inp,$inp,31            # undo "caller"
-       vxor            $twk0,$tweak,$rndkey0
-       vsrab           $tmp,$tweak,$seven      # next tweak value
-       vaddubm         $tweak,$tweak,$tweak
-       vand            $tmp,$tmp,$eighty7
-        vxor           $out0,$in0,$twk0
-       xxlor           32+$in1, 0, 0
-       vpermxor        $tweak, $tweak, $tmp, $in1
-
-        lvx_u          $in1,$x10,$inp
-       vxor            $twk1,$tweak,$rndkey0
-       vsrab           $tmp,$tweak,$seven      # next tweak value
-       vaddubm         $tweak,$tweak,$tweak
-        le?vperm       $in1,$in1,$in1,$leperm
-       vand            $tmp,$tmp,$eighty7
-        vxor           $out1,$in1,$twk1
-       xxlor           32+$in2, 0, 0
-       vpermxor        $tweak, $tweak, $tmp, $in2
-
-        lvx_u          $in2,$x20,$inp
-        andi.          $taillen,$len,15
-       vxor            $twk2,$tweak,$rndkey0
-       vsrab           $tmp,$tweak,$seven      # next tweak value
-       vaddubm         $tweak,$tweak,$tweak
-        le?vperm       $in2,$in2,$in2,$leperm
-       vand            $tmp,$tmp,$eighty7
-        vxor           $out2,$in2,$twk2
-       xxlor           32+$in3, 0, 0
-       vpermxor        $tweak, $tweak, $tmp, $in3
-
-        lvx_u          $in3,$x30,$inp
-        sub            $len,$len,$taillen
-       vxor            $twk3,$tweak,$rndkey0
-       vsrab           $tmp,$tweak,$seven      # next tweak value
-       vaddubm         $tweak,$tweak,$tweak
-        le?vperm       $in3,$in3,$in3,$leperm
-       vand            $tmp,$tmp,$eighty7
-        vxor           $out3,$in3,$twk3
-       xxlor           32+$in4, 0, 0
-       vpermxor        $tweak, $tweak, $tmp, $in4
-
-        lvx_u          $in4,$x40,$inp
-        subi           $len,$len,0x60
-       vxor            $twk4,$tweak,$rndkey0
-       vsrab           $tmp,$tweak,$seven      # next tweak value
-       vaddubm         $tweak,$tweak,$tweak
-        le?vperm       $in4,$in4,$in4,$leperm
-       vand            $tmp,$tmp,$eighty7
-        vxor           $out4,$in4,$twk4
-       xxlor           32+$in5, 0, 0
-       vpermxor        $tweak, $tweak, $tmp, $in5
-
-        lvx_u          $in5,$x50,$inp
-        addi           $inp,$inp,0x60
-       vxor            $twk5,$tweak,$rndkey0
-       vsrab           $tmp,$tweak,$seven      # next tweak value
-       vaddubm         $tweak,$tweak,$tweak
-        le?vperm       $in5,$in5,$in5,$leperm
-       vand            $tmp,$tmp,$eighty7
-        vxor           $out5,$in5,$twk5
-       xxlor           32+$in0, 0, 0
-       vpermxor        $tweak, $tweak, $tmp, $in0
-
-       vxor            v31,v31,$rndkey0
-       mtctr           $rounds
-       b               Loop_xts_enc6x
-
-.align 5
-Loop_xts_enc6x:
-       vcipher         $out0,$out0,v24
-       vcipher         $out1,$out1,v24
-       vcipher         $out2,$out2,v24
-       vcipher         $out3,$out3,v24
-       vcipher         $out4,$out4,v24
-       vcipher         $out5,$out5,v24
-       lvx             v24,$x20,$key_          # round[3]
-       addi            $key_,$key_,0x20
-
-       vcipher         $out0,$out0,v25
-       vcipher         $out1,$out1,v25
-       vcipher         $out2,$out2,v25
-       vcipher         $out3,$out3,v25
-       vcipher         $out4,$out4,v25
-       vcipher         $out5,$out5,v25
-       lvx             v25,$x10,$key_          # round[4]
-       bdnz            Loop_xts_enc6x
-
-       xxlor           32+$eighty7, 1, 1       # 0x010101..87
-
-       subic           $len,$len,96            # $len-=96
-        vxor           $in0,$twk0,v31          # xor with last round key
-       vcipher         $out0,$out0,v24
-       vcipher         $out1,$out1,v24
-        vsrab          $tmp,$tweak,$seven      # next tweak value
-        vxor           $twk0,$tweak,$rndkey0
-        vaddubm        $tweak,$tweak,$tweak
-       vcipher         $out2,$out2,v24
-       vcipher         $out3,$out3,v24
-       vcipher         $out4,$out4,v24
-       vcipher         $out5,$out5,v24
-
-       subfe.          r0,r0,r0                # borrow?-1:0
-        vand           $tmp,$tmp,$eighty7
-       vcipher         $out0,$out0,v25
-       vcipher         $out1,$out1,v25
-        xxlor          32+$in1, 0, 0
-        vpermxor       $tweak, $tweak, $tmp, $in1
-       vcipher         $out2,$out2,v25
-       vcipher         $out3,$out3,v25
-        vxor           $in1,$twk1,v31
-        vsrab          $tmp,$tweak,$seven      # next tweak value
-        vxor           $twk1,$tweak,$rndkey0
-       vcipher         $out4,$out4,v25
-       vcipher         $out5,$out5,v25
-
-       and             r0,r0,$len
-        vaddubm        $tweak,$tweak,$tweak
-       vcipher         $out0,$out0,v26
-       vcipher         $out1,$out1,v26
-        vand           $tmp,$tmp,$eighty7
-       vcipher         $out2,$out2,v26
-       vcipher         $out3,$out3,v26
-        xxlor          32+$in2, 0, 0
-        vpermxor       $tweak, $tweak, $tmp, $in2
-       vcipher         $out4,$out4,v26
-       vcipher         $out5,$out5,v26
-
-       add             $inp,$inp,r0            # $inp is adjusted in such
-                                               # way that at exit from the
-                                               # loop inX-in5 are loaded
-                                               # with last "words"
-        vxor           $in2,$twk2,v31
-        vsrab          $tmp,$tweak,$seven      # next tweak value
-        vxor           $twk2,$tweak,$rndkey0
-        vaddubm        $tweak,$tweak,$tweak
-       vcipher         $out0,$out0,v27
-       vcipher         $out1,$out1,v27
-       vcipher         $out2,$out2,v27
-       vcipher         $out3,$out3,v27
-        vand           $tmp,$tmp,$eighty7
-       vcipher         $out4,$out4,v27
-       vcipher         $out5,$out5,v27
-
-       addi            $key_,$sp,$FRAME+15     # rewind $key_
-        xxlor          32+$in3, 0, 0
-        vpermxor       $tweak, $tweak, $tmp, $in3
-       vcipher         $out0,$out0,v28
-       vcipher         $out1,$out1,v28
-        vxor           $in3,$twk3,v31
-        vsrab          $tmp,$tweak,$seven      # next tweak value
-        vxor           $twk3,$tweak,$rndkey0
-       vcipher         $out2,$out2,v28
-       vcipher         $out3,$out3,v28
-        vaddubm        $tweak,$tweak,$tweak
-       vcipher         $out4,$out4,v28
-       vcipher         $out5,$out5,v28
-       lvx             v24,$x00,$key_          # re-pre-load round[1]
-        vand           $tmp,$tmp,$eighty7
-
-       vcipher         $out0,$out0,v29
-       vcipher         $out1,$out1,v29
-        xxlor          32+$in4, 0, 0
-        vpermxor       $tweak, $tweak, $tmp, $in4
-       vcipher         $out2,$out2,v29
-       vcipher         $out3,$out3,v29
-        vxor           $in4,$twk4,v31
-        vsrab          $tmp,$tweak,$seven      # next tweak value
-        vxor           $twk4,$tweak,$rndkey0
-       vcipher         $out4,$out4,v29
-       vcipher         $out5,$out5,v29
-       lvx             v25,$x10,$key_          # re-pre-load round[2]
-        vaddubm        $tweak,$tweak,$tweak
-
-       vcipher         $out0,$out0,v30
-       vcipher         $out1,$out1,v30
-        vand           $tmp,$tmp,$eighty7
-       vcipher         $out2,$out2,v30
-       vcipher         $out3,$out3,v30
-        xxlor          32+$in5, 0, 0
-        vpermxor       $tweak, $tweak, $tmp, $in5
-       vcipher         $out4,$out4,v30
-       vcipher         $out5,$out5,v30
-        vxor           $in5,$twk5,v31
-        vsrab          $tmp,$tweak,$seven      # next tweak value
-        vxor           $twk5,$tweak,$rndkey0
-
-       vcipherlast     $out0,$out0,$in0
-        lvx_u          $in0,$x00,$inp          # load next input block
-        vaddubm        $tweak,$tweak,$tweak
-       vcipherlast     $out1,$out1,$in1
-        lvx_u          $in1,$x10,$inp
-       vcipherlast     $out2,$out2,$in2
-        le?vperm       $in0,$in0,$in0,$leperm
-        lvx_u          $in2,$x20,$inp
-        vand           $tmp,$tmp,$eighty7
-       vcipherlast     $out3,$out3,$in3
-        le?vperm       $in1,$in1,$in1,$leperm
-        lvx_u          $in3,$x30,$inp
-       vcipherlast     $out4,$out4,$in4
-        le?vperm       $in2,$in2,$in2,$leperm
-        lvx_u          $in4,$x40,$inp
-        xxlor          10, 32+$in0, 32+$in0
-        xxlor          32+$in0, 0, 0
-        vpermxor       $tweak, $tweak, $tmp, $in0
-        xxlor          32+$in0, 10, 10
-       vcipherlast     $tmp,$out5,$in5         # last block might be needed
-                                               # in stealing mode
-        le?vperm       $in3,$in3,$in3,$leperm
-        lvx_u          $in5,$x50,$inp
-        addi           $inp,$inp,0x60
-        le?vperm       $in4,$in4,$in4,$leperm
-        le?vperm       $in5,$in5,$in5,$leperm
-
-       le?vperm        $out0,$out0,$out0,$leperm
-       le?vperm        $out1,$out1,$out1,$leperm
-       stvx_u          $out0,$x00,$out         # store output
-        vxor           $out0,$in0,$twk0
-       le?vperm        $out2,$out2,$out2,$leperm
-       stvx_u          $out1,$x10,$out
-        vxor           $out1,$in1,$twk1
-       le?vperm        $out3,$out3,$out3,$leperm
-       stvx_u          $out2,$x20,$out
-        vxor           $out2,$in2,$twk2
-       le?vperm        $out4,$out4,$out4,$leperm
-       stvx_u          $out3,$x30,$out
-        vxor           $out3,$in3,$twk3
-       le?vperm        $out5,$tmp,$tmp,$leperm
-       stvx_u          $out4,$x40,$out
-        vxor           $out4,$in4,$twk4
-       le?stvx_u       $out5,$x50,$out
-       be?stvx_u       $tmp, $x50,$out
-        vxor           $out5,$in5,$twk5
-       addi            $out,$out,0x60
-
-       mtctr           $rounds
-       beq             Loop_xts_enc6x          # did $len-=96 borrow?
-
-       xxlor           32+$eighty7, 2, 2       # 0x010101..87
-
-       addic.          $len,$len,0x60
-       beq             Lxts_enc6x_zero
-       cmpwi           $len,0x20
-       blt             Lxts_enc6x_one
-       nop
-       beq             Lxts_enc6x_two
-       cmpwi           $len,0x40
-       blt             Lxts_enc6x_three
-       nop
-       beq             Lxts_enc6x_four
-
-Lxts_enc6x_five:
-       vxor            $out0,$in1,$twk0
-       vxor            $out1,$in2,$twk1
-       vxor            $out2,$in3,$twk2
-       vxor            $out3,$in4,$twk3
-       vxor            $out4,$in5,$twk4
-
-       bl              _aesp8_xts_enc5x
-
-       le?vperm        $out0,$out0,$out0,$leperm
-       vmr             $twk0,$twk5             # unused tweak
-       le?vperm        $out1,$out1,$out1,$leperm
-       stvx_u          $out0,$x00,$out         # store output
-       le?vperm        $out2,$out2,$out2,$leperm
-       stvx_u          $out1,$x10,$out
-       le?vperm        $out3,$out3,$out3,$leperm
-       stvx_u          $out2,$x20,$out
-       vxor            $tmp,$out4,$twk5        # last block prep for stealing
-       le?vperm        $out4,$out4,$out4,$leperm
-       stvx_u          $out3,$x30,$out
-       stvx_u          $out4,$x40,$out
-       addi            $out,$out,0x50
-       bne             Lxts_enc6x_steal
-       b               Lxts_enc6x_done
-
-.align 4
-Lxts_enc6x_four:
-       vxor            $out0,$in2,$twk0
-       vxor            $out1,$in3,$twk1
-       vxor            $out2,$in4,$twk2
-       vxor            $out3,$in5,$twk3
-       vxor            $out4,$out4,$out4
-
-       bl              _aesp8_xts_enc5x
-
-       le?vperm        $out0,$out0,$out0,$leperm
-       vmr             $twk0,$twk4             # unused tweak
-       le?vperm        $out1,$out1,$out1,$leperm
-       stvx_u          $out0,$x00,$out         # store output
-       le?vperm        $out2,$out2,$out2,$leperm
-       stvx_u          $out1,$x10,$out
-       vxor            $tmp,$out3,$twk4        # last block prep for stealing
-       le?vperm        $out3,$out3,$out3,$leperm
-       stvx_u          $out2,$x20,$out
-       stvx_u          $out3,$x30,$out
-       addi            $out,$out,0x40
-       bne             Lxts_enc6x_steal
-       b               Lxts_enc6x_done
-
-.align 4
-Lxts_enc6x_three:
-       vxor            $out0,$in3,$twk0
-       vxor            $out1,$in4,$twk1
-       vxor            $out2,$in5,$twk2
-       vxor            $out3,$out3,$out3
-       vxor            $out4,$out4,$out4
-
-       bl              _aesp8_xts_enc5x
-
-       le?vperm        $out0,$out0,$out0,$leperm
-       vmr             $twk0,$twk3             # unused tweak
-       le?vperm        $out1,$out1,$out1,$leperm
-       stvx_u          $out0,$x00,$out         # store output
-       vxor            $tmp,$out2,$twk3        # last block prep for stealing
-       le?vperm        $out2,$out2,$out2,$leperm
-       stvx_u          $out1,$x10,$out
-       stvx_u          $out2,$x20,$out
-       addi            $out,$out,0x30
-       bne             Lxts_enc6x_steal
-       b               Lxts_enc6x_done
-
-.align 4
-Lxts_enc6x_two:
-       vxor            $out0,$in4,$twk0
-       vxor            $out1,$in5,$twk1
-       vxor            $out2,$out2,$out2
-       vxor            $out3,$out3,$out3
-       vxor            $out4,$out4,$out4
-
-       bl              _aesp8_xts_enc5x
-
-       le?vperm        $out0,$out0,$out0,$leperm
-       vmr             $twk0,$twk2             # unused tweak
-       vxor            $tmp,$out1,$twk2        # last block prep for stealing
-       le?vperm        $out1,$out1,$out1,$leperm
-       stvx_u          $out0,$x00,$out         # store output
-       stvx_u          $out1,$x10,$out
-       addi            $out,$out,0x20
-       bne             Lxts_enc6x_steal
-       b               Lxts_enc6x_done
-
-.align 4
-Lxts_enc6x_one:
-       vxor            $out0,$in5,$twk0
-       nop
-Loop_xts_enc1x:
-       vcipher         $out0,$out0,v24
-       lvx             v24,$x20,$key_          # round[3]
-       addi            $key_,$key_,0x20
-
-       vcipher         $out0,$out0,v25
-       lvx             v25,$x10,$key_          # round[4]
-       bdnz            Loop_xts_enc1x
-
-       add             $inp,$inp,$taillen
-       cmpwi           $taillen,0
-       vcipher         $out0,$out0,v24
-
-       subi            $inp,$inp,16
-       vcipher         $out0,$out0,v25
-
-       lvsr            $inpperm,0,$taillen
-       vcipher         $out0,$out0,v26
-
-       lvx_u           $in0,0,$inp
-       vcipher         $out0,$out0,v27
-
-       addi            $key_,$sp,$FRAME+15     # rewind $key_
-       vcipher         $out0,$out0,v28
-       lvx             v24,$x00,$key_          # re-pre-load round[1]
-
-       vcipher         $out0,$out0,v29
-       lvx             v25,$x10,$key_          # re-pre-load round[2]
-        vxor           $twk0,$twk0,v31
-
-       le?vperm        $in0,$in0,$in0,$leperm
-       vcipher         $out0,$out0,v30
-
-       vperm           $in0,$in0,$in0,$inpperm
-       vcipherlast     $out0,$out0,$twk0
-
-       vmr             $twk0,$twk1             # unused tweak
-       vxor            $tmp,$out0,$twk1        # last block prep for stealing
-       le?vperm        $out0,$out0,$out0,$leperm
-       stvx_u          $out0,$x00,$out         # store output
-       addi            $out,$out,0x10
-       bne             Lxts_enc6x_steal
-       b               Lxts_enc6x_done
-
-.align 4
-Lxts_enc6x_zero:
-       cmpwi           $taillen,0
-       beq             Lxts_enc6x_done
-
-       add             $inp,$inp,$taillen
-       subi            $inp,$inp,16
-       lvx_u           $in0,0,$inp
-       lvsr            $inpperm,0,$taillen     # $in5 is no more
-       le?vperm        $in0,$in0,$in0,$leperm
-       vperm           $in0,$in0,$in0,$inpperm
-       vxor            $tmp,$tmp,$twk0
-Lxts_enc6x_steal:
-       vxor            $in0,$in0,$twk0
-       vxor            $out0,$out0,$out0
-       vspltisb        $out1,-1
-       vperm           $out0,$out0,$out1,$inpperm
-       vsel            $out0,$in0,$tmp,$out0   # $tmp is last block, remember?
-
-       subi            r30,$out,17
-       subi            $out,$out,16
-       mtctr           $taillen
-Loop_xts_enc6x_steal:
-       lbzu            r0,1(r30)
-       stb             r0,16(r30)
-       bdnz            Loop_xts_enc6x_steal
-
-       li              $taillen,0
-       mtctr           $rounds
-       b               Loop_xts_enc1x          # one more time...
-
-.align 4
-Lxts_enc6x_done:
-       ${UCMP}i        $ivp,0
-       beq             Lxts_enc6x_ret
-
-       vxor            $tweak,$twk0,$rndkey0
-       le?vperm        $tweak,$tweak,$tweak,$leperm
-       stvx_u          $tweak,0,$ivp
-
-Lxts_enc6x_ret:
-       mtlr            r11
-       li              r10,`$FRAME+15`
-       li              r11,`$FRAME+31`
-       stvx            $seven,r10,$sp          # wipe copies of round keys
-       addi            r10,r10,32
-       stvx            $seven,r11,$sp
-       addi            r11,r11,32
-       stvx            $seven,r10,$sp
-       addi            r10,r10,32
-       stvx            $seven,r11,$sp
-       addi            r11,r11,32
-       stvx            $seven,r10,$sp
-       addi            r10,r10,32
-       stvx            $seven,r11,$sp
-       addi            r11,r11,32
-       stvx            $seven,r10,$sp
-       addi            r10,r10,32
-       stvx            $seven,r11,$sp
-       addi            r11,r11,32
-
-       mtspr           256,$vrsave
-       lvx             v20,r10,$sp             # ABI says so
-       addi            r10,r10,32
-       lvx             v21,r11,$sp
-       addi            r11,r11,32
-       lvx             v22,r10,$sp
-       addi            r10,r10,32
-       lvx             v23,r11,$sp
-       addi            r11,r11,32
-       lvx             v24,r10,$sp
-       addi            r10,r10,32
-       lvx             v25,r11,$sp
-       addi            r11,r11,32
-       lvx             v26,r10,$sp
-       addi            r10,r10,32
-       lvx             v27,r11,$sp
-       addi            r11,r11,32
-       lvx             v28,r10,$sp
-       addi            r10,r10,32
-       lvx             v29,r11,$sp
-       addi            r11,r11,32
-       lvx             v30,r10,$sp
-       lvx             v31,r11,$sp
-       $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
-       $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
-       $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
-       $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
-       $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
-       $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
-       addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
-       blr
-       .long           0
-       .byte           0,12,0x04,1,0x80,6,6,0
-       .long           0
-
-.align 5
-_aesp8_xts_enc5x:
-       vcipher         $out0,$out0,v24
-       vcipher         $out1,$out1,v24
-       vcipher         $out2,$out2,v24
-       vcipher         $out3,$out3,v24
-       vcipher         $out4,$out4,v24
-       lvx             v24,$x20,$key_          # round[3]
-       addi            $key_,$key_,0x20
-
-       vcipher         $out0,$out0,v25
-       vcipher         $out1,$out1,v25
-       vcipher         $out2,$out2,v25
-       vcipher         $out3,$out3,v25
-       vcipher         $out4,$out4,v25
-       lvx             v25,$x10,$key_          # round[4]
-       bdnz            _aesp8_xts_enc5x
-
-       add             $inp,$inp,$taillen
-       cmpwi           $taillen,0
-       vcipher         $out0,$out0,v24
-       vcipher         $out1,$out1,v24
-       vcipher         $out2,$out2,v24
-       vcipher         $out3,$out3,v24
-       vcipher         $out4,$out4,v24
-
-       subi            $inp,$inp,16
-       vcipher         $out0,$out0,v25
-       vcipher         $out1,$out1,v25
-       vcipher         $out2,$out2,v25
-       vcipher         $out3,$out3,v25
-       vcipher         $out4,$out4,v25
-        vxor           $twk0,$twk0,v31
-
-       vcipher         $out0,$out0,v26
-       lvsr            $inpperm,r0,$taillen    # $in5 is no more
-       vcipher         $out1,$out1,v26
-       vcipher         $out2,$out2,v26
-       vcipher         $out3,$out3,v26
-       vcipher         $out4,$out4,v26
-        vxor           $in1,$twk1,v31
-
-       vcipher         $out0,$out0,v27
-       lvx_u           $in0,0,$inp
-       vcipher         $out1,$out1,v27
-       vcipher         $out2,$out2,v27
-       vcipher         $out3,$out3,v27
-       vcipher         $out4,$out4,v27
-        vxor           $in2,$twk2,v31
-
-       addi            $key_,$sp,$FRAME+15     # rewind $key_
-       vcipher         $out0,$out0,v28
-       vcipher         $out1,$out1,v28
-       vcipher         $out2,$out2,v28
-       vcipher         $out3,$out3,v28
-       vcipher         $out4,$out4,v28
-       lvx             v24,$x00,$key_          # re-pre-load round[1]
-        vxor           $in3,$twk3,v31
-
-       vcipher         $out0,$out0,v29
-       le?vperm        $in0,$in0,$in0,$leperm
-       vcipher         $out1,$out1,v29
-       vcipher         $out2,$out2,v29
-       vcipher         $out3,$out3,v29
-       vcipher         $out4,$out4,v29
-       lvx             v25,$x10,$key_          # re-pre-load round[2]
-        vxor           $in4,$twk4,v31
-
-       vcipher         $out0,$out0,v30
-       vperm           $in0,$in0,$in0,$inpperm
-       vcipher         $out1,$out1,v30
-       vcipher         $out2,$out2,v30
-       vcipher         $out3,$out3,v30
-       vcipher         $out4,$out4,v30
-
-       vcipherlast     $out0,$out0,$twk0
-       vcipherlast     $out1,$out1,$in1
-       vcipherlast     $out2,$out2,$in2
-       vcipherlast     $out3,$out3,$in3
-       vcipherlast     $out4,$out4,$in4
-       blr
-        .long          0
-        .byte          0,12,0x14,0,0,0,0,0
-
-.align 5
-_aesp8_xts_decrypt6x:
-       $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
-       mflr            r11
-       li              r7,`$FRAME+8*16+15`
-       li              r3,`$FRAME+8*16+31`
-       $PUSH           r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
-       stvx            v20,r7,$sp              # ABI says so
-       addi            r7,r7,32
-       stvx            v21,r3,$sp
-       addi            r3,r3,32
-       stvx            v22,r7,$sp
-       addi            r7,r7,32
-       stvx            v23,r3,$sp
-       addi            r3,r3,32
-       stvx            v24,r7,$sp
-       addi            r7,r7,32
-       stvx            v25,r3,$sp
-       addi            r3,r3,32
-       stvx            v26,r7,$sp
-       addi            r7,r7,32
-       stvx            v27,r3,$sp
-       addi            r3,r3,32
-       stvx            v28,r7,$sp
-       addi            r7,r7,32
-       stvx            v29,r3,$sp
-       addi            r3,r3,32
-       stvx            v30,r7,$sp
-       stvx            v31,r3,$sp
-       li              r0,-1
-       stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
-       li              $x10,0x10
-       $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
-       li              $x20,0x20
-       $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
-       li              $x30,0x30
-       $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
-       li              $x40,0x40
-       $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
-       li              $x50,0x50
-       $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
-       li              $x60,0x60
-       $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
-       li              $x70,0x70
-       mtspr           256,r0
-
-       xxlor           2, 32+$eighty7, 32+$eighty7
-       vsldoi          $eighty7,$tmp,$eighty7,1        # 0x010101..87
-       xxlor           1, 32+$eighty7, 32+$eighty7
-
-       # Load XOR Lconsts.
-       mr              $x70, r6
-       bl              Lconsts
-       lxvw4x          0, $x40, r6             # load XOR contents
-       mr              r6, $x70
-       li              $x70,0x70
-
-       subi            $rounds,$rounds,3       # -4 in total
-
-       lvx             $rndkey0,$x00,$key1     # load key schedule
-       lvx             v30,$x10,$key1
-       addi            $key1,$key1,0x20
-       lvx             v31,$x00,$key1
-       ?vperm          $rndkey0,$rndkey0,v30,$keyperm
-       addi            $key_,$sp,$FRAME+15
-       mtctr           $rounds
-
-Load_xts_dec_key:
-       ?vperm          v24,v30,v31,$keyperm
-       lvx             v30,$x10,$key1
-       addi            $key1,$key1,0x20
-       stvx            v24,$x00,$key_          # off-load round[1]
-       ?vperm          v25,v31,v30,$keyperm
-       lvx             v31,$x00,$key1
-       stvx            v25,$x10,$key_          # off-load round[2]
-       addi            $key_,$key_,0x20
-       bdnz            Load_xts_dec_key
-
-       lvx             v26,$x10,$key1
-       ?vperm          v24,v30,v31,$keyperm
-       lvx             v27,$x20,$key1
-       stvx            v24,$x00,$key_          # off-load round[3]
-       ?vperm          v25,v31,v26,$keyperm
-       lvx             v28,$x30,$key1
-       stvx            v25,$x10,$key_          # off-load round[4]
-       addi            $key_,$sp,$FRAME+15     # rewind $key_
-       ?vperm          v26,v26,v27,$keyperm
-       lvx             v29,$x40,$key1
-       ?vperm          v27,v27,v28,$keyperm
-       lvx             v30,$x50,$key1
-       ?vperm          v28,v28,v29,$keyperm
-       lvx             v31,$x60,$key1
-       ?vperm          v29,v29,v30,$keyperm
-       lvx             $twk5,$x70,$key1        # borrow $twk5
-       ?vperm          v30,v30,v31,$keyperm
-       lvx             v24,$x00,$key_          # pre-load round[1]
-       ?vperm          v31,v31,$twk5,$keyperm
-       lvx             v25,$x10,$key_          # pre-load round[2]
-
-        vperm          $in0,$inout,$inptail,$inpperm
-        subi           $inp,$inp,31            # undo "caller"
-       vxor            $twk0,$tweak,$rndkey0
-       vsrab           $tmp,$tweak,$seven      # next tweak value
-       vaddubm         $tweak,$tweak,$tweak
-       vand            $tmp,$tmp,$eighty7
-        vxor           $out0,$in0,$twk0
-       xxlor           32+$in1, 0, 0
-       vpermxor        $tweak, $tweak, $tmp, $in1
-
-        lvx_u          $in1,$x10,$inp
-       vxor            $twk1,$tweak,$rndkey0
-       vsrab           $tmp,$tweak,$seven      # next tweak value
-       vaddubm         $tweak,$tweak,$tweak
-        le?vperm       $in1,$in1,$in1,$leperm
-       vand            $tmp,$tmp,$eighty7
-        vxor           $out1,$in1,$twk1
-       xxlor           32+$in2, 0, 0
-       vpermxor        $tweak, $tweak, $tmp, $in2
-
-        lvx_u          $in2,$x20,$inp
-        andi.          $taillen,$len,15
-       vxor            $twk2,$tweak,$rndkey0
-       vsrab           $tmp,$tweak,$seven      # next tweak value
-       vaddubm         $tweak,$tweak,$tweak
-        le?vperm       $in2,$in2,$in2,$leperm
-       vand            $tmp,$tmp,$eighty7
-        vxor           $out2,$in2,$twk2
-       xxlor           32+$in3, 0, 0
-       vpermxor        $tweak, $tweak, $tmp, $in3
-
-        lvx_u          $in3,$x30,$inp
-        sub            $len,$len,$taillen
-       vxor            $twk3,$tweak,$rndkey0
-       vsrab           $tmp,$tweak,$seven      # next tweak value
-       vaddubm         $tweak,$tweak,$tweak
-        le?vperm       $in3,$in3,$in3,$leperm
-       vand            $tmp,$tmp,$eighty7
-        vxor           $out3,$in3,$twk3
-       xxlor           32+$in4, 0, 0
-       vpermxor        $tweak, $tweak, $tmp, $in4
-
-        lvx_u          $in4,$x40,$inp
-        subi           $len,$len,0x60
-       vxor            $twk4,$tweak,$rndkey0
-       vsrab           $tmp,$tweak,$seven      # next tweak value
-       vaddubm         $tweak,$tweak,$tweak
-        le?vperm       $in4,$in4,$in4,$leperm
-       vand            $tmp,$tmp,$eighty7
-        vxor           $out4,$in4,$twk4
-       xxlor           32+$in5, 0, 0
-       vpermxor        $tweak, $tweak, $tmp, $in5
-
-        lvx_u          $in5,$x50,$inp
-        addi           $inp,$inp,0x60
-       vxor            $twk5,$tweak,$rndkey0
-       vsrab           $tmp,$tweak,$seven      # next tweak value
-       vaddubm         $tweak,$tweak,$tweak
-        le?vperm       $in5,$in5,$in5,$leperm
-       vand            $tmp,$tmp,$eighty7
-        vxor           $out5,$in5,$twk5
-       xxlor           32+$in0, 0, 0
-       vpermxor        $tweak, $tweak, $tmp, $in0
-
-       vxor            v31,v31,$rndkey0
-       mtctr           $rounds
-       b               Loop_xts_dec6x
-
-.align 5
-Loop_xts_dec6x:
-       vncipher        $out0,$out0,v24
-       vncipher        $out1,$out1,v24
-       vncipher        $out2,$out2,v24
-       vncipher        $out3,$out3,v24
-       vncipher        $out4,$out4,v24
-       vncipher        $out5,$out5,v24
-       lvx             v24,$x20,$key_          # round[3]
-       addi            $key_,$key_,0x20
-
-       vncipher        $out0,$out0,v25
-       vncipher        $out1,$out1,v25
-       vncipher        $out2,$out2,v25
-       vncipher        $out3,$out3,v25
-       vncipher        $out4,$out4,v25
-       vncipher        $out5,$out5,v25
-       lvx             v25,$x10,$key_          # round[4]
-       bdnz            Loop_xts_dec6x
-
-       xxlor           32+$eighty7, 1, 1       # 0x010101..87
-
-       subic           $len,$len,96            # $len-=96
-        vxor           $in0,$twk0,v31          # xor with last round key
-       vncipher        $out0,$out0,v24
-       vncipher        $out1,$out1,v24
-        vsrab          $tmp,$tweak,$seven      # next tweak value
-        vxor           $twk0,$tweak,$rndkey0
-        vaddubm        $tweak,$tweak,$tweak
-       vncipher        $out2,$out2,v24
-       vncipher        $out3,$out3,v24
-       vncipher        $out4,$out4,v24
-       vncipher        $out5,$out5,v24
-
-       subfe.          r0,r0,r0                # borrow?-1:0
-        vand           $tmp,$tmp,$eighty7
-       vncipher        $out0,$out0,v25
-       vncipher        $out1,$out1,v25
-        xxlor          32+$in1, 0, 0
-        vpermxor       $tweak, $tweak, $tmp, $in1
-       vncipher        $out2,$out2,v25
-       vncipher        $out3,$out3,v25
-        vxor           $in1,$twk1,v31
-        vsrab          $tmp,$tweak,$seven      # next tweak value
-        vxor           $twk1,$tweak,$rndkey0
-       vncipher        $out4,$out4,v25
-       vncipher        $out5,$out5,v25
-
-       and             r0,r0,$len
-        vaddubm        $tweak,$tweak,$tweak
-       vncipher        $out0,$out0,v26
-       vncipher        $out1,$out1,v26
-        vand           $tmp,$tmp,$eighty7
-       vncipher        $out2,$out2,v26
-       vncipher        $out3,$out3,v26
-        xxlor          32+$in2, 0, 0
-        vpermxor       $tweak, $tweak, $tmp, $in2
-       vncipher        $out4,$out4,v26
-       vncipher        $out5,$out5,v26
-
-       add             $inp,$inp,r0            # $inp is adjusted in such
-                                               # way that at exit from the
-                                               # loop inX-in5 are loaded
-                                               # with last "words"
-        vxor           $in2,$twk2,v31
-        vsrab          $tmp,$tweak,$seven      # next tweak value
-        vxor           $twk2,$tweak,$rndkey0
-        vaddubm        $tweak,$tweak,$tweak
-       vncipher        $out0,$out0,v27
-       vncipher        $out1,$out1,v27
-       vncipher        $out2,$out2,v27
-       vncipher        $out3,$out3,v27
-        vand           $tmp,$tmp,$eighty7
-       vncipher        $out4,$out4,v27
-       vncipher        $out5,$out5,v27
-
-       addi            $key_,$sp,$FRAME+15     # rewind $key_
-        xxlor          32+$in3, 0, 0
-        vpermxor       $tweak, $tweak, $tmp, $in3
-       vncipher        $out0,$out0,v28
-       vncipher        $out1,$out1,v28
-        vxor           $in3,$twk3,v31
-        vsrab          $tmp,$tweak,$seven      # next tweak value
-        vxor           $twk3,$tweak,$rndkey0
-       vncipher        $out2,$out2,v28
-       vncipher        $out3,$out3,v28
-        vaddubm        $tweak,$tweak,$tweak
-       vncipher        $out4,$out4,v28
-       vncipher        $out5,$out5,v28
-       lvx             v24,$x00,$key_          # re-pre-load round[1]
-        vand           $tmp,$tmp,$eighty7
-
-       vncipher        $out0,$out0,v29
-       vncipher        $out1,$out1,v29
-        xxlor          32+$in4, 0, 0
-        vpermxor       $tweak, $tweak, $tmp, $in4
-       vncipher        $out2,$out2,v29
-       vncipher        $out3,$out3,v29
-        vxor           $in4,$twk4,v31
-        vsrab          $tmp,$tweak,$seven      # next tweak value
-        vxor           $twk4,$tweak,$rndkey0
-       vncipher        $out4,$out4,v29
-       vncipher        $out5,$out5,v29
-       lvx             v25,$x10,$key_          # re-pre-load round[2]
-        vaddubm        $tweak,$tweak,$tweak
-
-       vncipher        $out0,$out0,v30
-       vncipher        $out1,$out1,v30
-        vand           $tmp,$tmp,$eighty7
-       vncipher        $out2,$out2,v30
-       vncipher        $out3,$out3,v30
-        xxlor          32+$in5, 0, 0
-        vpermxor       $tweak, $tweak, $tmp, $in5
-       vncipher        $out4,$out4,v30
-       vncipher        $out5,$out5,v30
-        vxor           $in5,$twk5,v31
-        vsrab          $tmp,$tweak,$seven      # next tweak value
-        vxor           $twk5,$tweak,$rndkey0
-
-       vncipherlast    $out0,$out0,$in0
-        lvx_u          $in0,$x00,$inp          # load next input block
-        vaddubm        $tweak,$tweak,$tweak
-       vncipherlast    $out1,$out1,$in1
-        lvx_u          $in1,$x10,$inp
-       vncipherlast    $out2,$out2,$in2
-        le?vperm       $in0,$in0,$in0,$leperm
-        lvx_u          $in2,$x20,$inp
-        vand           $tmp,$tmp,$eighty7
-       vncipherlast    $out3,$out3,$in3
-        le?vperm       $in1,$in1,$in1,$leperm
-        lvx_u          $in3,$x30,$inp
-       vncipherlast    $out4,$out4,$in4
-        le?vperm       $in2,$in2,$in2,$leperm
-        lvx_u          $in4,$x40,$inp
-        xxlor          10, 32+$in0, 32+$in0
-        xxlor          32+$in0, 0, 0
-        vpermxor       $tweak, $tweak, $tmp, $in0
-        xxlor          32+$in0, 10, 10
-       vncipherlast    $out5,$out5,$in5
-        le?vperm       $in3,$in3,$in3,$leperm
-        lvx_u          $in5,$x50,$inp
-        addi           $inp,$inp,0x60
-        le?vperm       $in4,$in4,$in4,$leperm
-        le?vperm       $in5,$in5,$in5,$leperm
-
-       le?vperm        $out0,$out0,$out0,$leperm
-       le?vperm        $out1,$out1,$out1,$leperm
-       stvx_u          $out0,$x00,$out         # store output
-        vxor           $out0,$in0,$twk0
-       le?vperm        $out2,$out2,$out2,$leperm
-       stvx_u          $out1,$x10,$out
-        vxor           $out1,$in1,$twk1
-       le?vperm        $out3,$out3,$out3,$leperm
-       stvx_u          $out2,$x20,$out
-        vxor           $out2,$in2,$twk2
-       le?vperm        $out4,$out4,$out4,$leperm
-       stvx_u          $out3,$x30,$out
-        vxor           $out3,$in3,$twk3
-       le?vperm        $out5,$out5,$out5,$leperm
-       stvx_u          $out4,$x40,$out
-        vxor           $out4,$in4,$twk4
-       stvx_u          $out5,$x50,$out
-        vxor           $out5,$in5,$twk5
-       addi            $out,$out,0x60
-
-       mtctr           $rounds
-       beq             Loop_xts_dec6x          # did $len-=96 borrow?
-
-       xxlor           32+$eighty7, 2, 2       # 0x010101..87
-
-       addic.          $len,$len,0x60
-       beq             Lxts_dec6x_zero
-       cmpwi           $len,0x20
-       blt             Lxts_dec6x_one
-       nop
-       beq             Lxts_dec6x_two
-       cmpwi           $len,0x40
-       blt             Lxts_dec6x_three
-       nop
-       beq             Lxts_dec6x_four
-
-Lxts_dec6x_five:
-       vxor            $out0,$in1,$twk0
-       vxor            $out1,$in2,$twk1
-       vxor            $out2,$in3,$twk2
-       vxor            $out3,$in4,$twk3
-       vxor            $out4,$in5,$twk4
-
-       bl              _aesp8_xts_dec5x
-
-       le?vperm        $out0,$out0,$out0,$leperm
-       vmr             $twk0,$twk5             # unused tweak
-       vxor            $twk1,$tweak,$rndkey0
-       le?vperm        $out1,$out1,$out1,$leperm
-       stvx_u          $out0,$x00,$out         # store output
-       vxor            $out0,$in0,$twk1
-       le?vperm        $out2,$out2,$out2,$leperm
-       stvx_u          $out1,$x10,$out
-       le?vperm        $out3,$out3,$out3,$leperm
-       stvx_u          $out2,$x20,$out
-       le?vperm        $out4,$out4,$out4,$leperm
-       stvx_u          $out3,$x30,$out
-       stvx_u          $out4,$x40,$out
-       addi            $out,$out,0x50
-       bne             Lxts_dec6x_steal
-       b               Lxts_dec6x_done
-
-.align 4
-Lxts_dec6x_four:
-       vxor            $out0,$in2,$twk0
-       vxor            $out1,$in3,$twk1
-       vxor            $out2,$in4,$twk2
-       vxor            $out3,$in5,$twk3
-       vxor            $out4,$out4,$out4
-
-       bl              _aesp8_xts_dec5x
-
-       le?vperm        $out0,$out0,$out0,$leperm
-       vmr             $twk0,$twk4             # unused tweak
-       vmr             $twk1,$twk5
-       le?vperm        $out1,$out1,$out1,$leperm
-       stvx_u          $out0,$x00,$out         # store output
-       vxor            $out0,$in0,$twk5
-       le?vperm        $out2,$out2,$out2,$leperm
-       stvx_u          $out1,$x10,$out
-       le?vperm        $out3,$out3,$out3,$leperm
-       stvx_u          $out2,$x20,$out
-       stvx_u          $out3,$x30,$out
-       addi            $out,$out,0x40
-       bne             Lxts_dec6x_steal
-       b               Lxts_dec6x_done
-
-.align 4
-Lxts_dec6x_three:
-       vxor            $out0,$in3,$twk0
-       vxor            $out1,$in4,$twk1
-       vxor            $out2,$in5,$twk2
-       vxor            $out3,$out3,$out3
-       vxor            $out4,$out4,$out4
-
-       bl              _aesp8_xts_dec5x
-
-       le?vperm        $out0,$out0,$out0,$leperm
-       vmr             $twk0,$twk3             # unused tweak
-       vmr             $twk1,$twk4
-       le?vperm        $out1,$out1,$out1,$leperm
-       stvx_u          $out0,$x00,$out         # store output
-       vxor            $out0,$in0,$twk4
-       le?vperm        $out2,$out2,$out2,$leperm
-       stvx_u          $out1,$x10,$out
-       stvx_u          $out2,$x20,$out
-       addi            $out,$out,0x30
-       bne             Lxts_dec6x_steal
-       b               Lxts_dec6x_done
-
-.align 4
-Lxts_dec6x_two:
-       vxor            $out0,$in4,$twk0
-       vxor            $out1,$in5,$twk1
-       vxor            $out2,$out2,$out2
-       vxor            $out3,$out3,$out3
-       vxor            $out4,$out4,$out4
-
-       bl              _aesp8_xts_dec5x
-
-       le?vperm        $out0,$out0,$out0,$leperm
-       vmr             $twk0,$twk2             # unused tweak
-       vmr             $twk1,$twk3
-       le?vperm        $out1,$out1,$out1,$leperm
-       stvx_u          $out0,$x00,$out         # store output
-       vxor            $out0,$in0,$twk3
-       stvx_u          $out1,$x10,$out
-       addi            $out,$out,0x20
-       bne             Lxts_dec6x_steal
-       b               Lxts_dec6x_done
-
-.align 4
-Lxts_dec6x_one:
-       vxor            $out0,$in5,$twk0
-       nop
-Loop_xts_dec1x:
-       vncipher        $out0,$out0,v24
-       lvx             v24,$x20,$key_          # round[3]
-       addi            $key_,$key_,0x20
-
-       vncipher        $out0,$out0,v25
-       lvx             v25,$x10,$key_          # round[4]
-       bdnz            Loop_xts_dec1x
-
-       subi            r0,$taillen,1
-       vncipher        $out0,$out0,v24
-
-       andi.           r0,r0,16
-       cmpwi           $taillen,0
-       vncipher        $out0,$out0,v25
-
-       sub             $inp,$inp,r0
-       vncipher        $out0,$out0,v26
-
-       lvx_u           $in0,0,$inp
-       vncipher        $out0,$out0,v27
-
-       addi            $key_,$sp,$FRAME+15     # rewind $key_
-       vncipher        $out0,$out0,v28
-       lvx             v24,$x00,$key_          # re-pre-load round[1]
-
-       vncipher        $out0,$out0,v29
-       lvx             v25,$x10,$key_          # re-pre-load round[2]
-        vxor           $twk0,$twk0,v31
-
-       le?vperm        $in0,$in0,$in0,$leperm
-       vncipher        $out0,$out0,v30
-
-       mtctr           $rounds
-       vncipherlast    $out0,$out0,$twk0
-
-       vmr             $twk0,$twk1             # unused tweak
-       vmr             $twk1,$twk2
-       le?vperm        $out0,$out0,$out0,$leperm
-       stvx_u          $out0,$x00,$out         # store output
-       addi            $out,$out,0x10
-       vxor            $out0,$in0,$twk2
-       bne             Lxts_dec6x_steal
-       b               Lxts_dec6x_done
-
-.align 4
-Lxts_dec6x_zero:
-       cmpwi           $taillen,0
-       beq             Lxts_dec6x_done
-
-       lvx_u           $in0,0,$inp
-       le?vperm        $in0,$in0,$in0,$leperm
-       vxor            $out0,$in0,$twk1
-Lxts_dec6x_steal:
-       vncipher        $out0,$out0,v24
-       lvx             v24,$x20,$key_          # round[3]
-       addi            $key_,$key_,0x20
-
-       vncipher        $out0,$out0,v25
-       lvx             v25,$x10,$key_          # round[4]
-       bdnz            Lxts_dec6x_steal
-
-       add             $inp,$inp,$taillen
-       vncipher        $out0,$out0,v24
-
-       cmpwi           $taillen,0
-       vncipher        $out0,$out0,v25
-
-       lvx_u           $in0,0,$inp
-       vncipher        $out0,$out0,v26
-
-       lvsr            $inpperm,0,$taillen     # $in5 is no more
-       vncipher        $out0,$out0,v27
-
-       addi            $key_,$sp,$FRAME+15     # rewind $key_
-       vncipher        $out0,$out0,v28
-       lvx             v24,$x00,$key_          # re-pre-load round[1]
-
-       vncipher        $out0,$out0,v29
-       lvx             v25,$x10,$key_          # re-pre-load round[2]
-        vxor           $twk1,$twk1,v31
-
-       le?vperm        $in0,$in0,$in0,$leperm
-       vncipher        $out0,$out0,v30
-
-       vperm           $in0,$in0,$in0,$inpperm
-       vncipherlast    $tmp,$out0,$twk1
-
-       le?vperm        $out0,$tmp,$tmp,$leperm
-       le?stvx_u       $out0,0,$out
-       be?stvx_u       $tmp,0,$out
-
-       vxor            $out0,$out0,$out0
-       vspltisb        $out1,-1
-       vperm           $out0,$out0,$out1,$inpperm
-       vsel            $out0,$in0,$tmp,$out0
-       vxor            $out0,$out0,$twk0
-
-       subi            r30,$out,1
-       mtctr           $taillen
-Loop_xts_dec6x_steal:
-       lbzu            r0,1(r30)
-       stb             r0,16(r30)
-       bdnz            Loop_xts_dec6x_steal
-
-       li              $taillen,0
-       mtctr           $rounds
-       b               Loop_xts_dec1x          # one more time...
-
-.align 4
-Lxts_dec6x_done:
-       ${UCMP}i        $ivp,0
-       beq             Lxts_dec6x_ret
-
-       vxor            $tweak,$twk0,$rndkey0
-       le?vperm        $tweak,$tweak,$tweak,$leperm
-       stvx_u          $tweak,0,$ivp
-
-Lxts_dec6x_ret:
-       mtlr            r11
-       li              r10,`$FRAME+15`
-       li              r11,`$FRAME+31`
-       stvx            $seven,r10,$sp          # wipe copies of round keys
-       addi            r10,r10,32
-       stvx            $seven,r11,$sp
-       addi            r11,r11,32
-       stvx            $seven,r10,$sp
-       addi            r10,r10,32
-       stvx            $seven,r11,$sp
-       addi            r11,r11,32
-       stvx            $seven,r10,$sp
-       addi            r10,r10,32
-       stvx            $seven,r11,$sp
-       addi            r11,r11,32
-       stvx            $seven,r10,$sp
-       addi            r10,r10,32
-       stvx            $seven,r11,$sp
-       addi            r11,r11,32
-
-       mtspr           256,$vrsave
-       lvx             v20,r10,$sp             # ABI says so
-       addi            r10,r10,32
-       lvx             v21,r11,$sp
-       addi            r11,r11,32
-       lvx             v22,r10,$sp
-       addi            r10,r10,32
-       lvx             v23,r11,$sp
-       addi            r11,r11,32
-       lvx             v24,r10,$sp
-       addi            r10,r10,32
-       lvx             v25,r11,$sp
-       addi            r11,r11,32
-       lvx             v26,r10,$sp
-       addi            r10,r10,32
-       lvx             v27,r11,$sp
-       addi            r11,r11,32
-       lvx             v28,r10,$sp
-       addi            r10,r10,32
-       lvx             v29,r11,$sp
-       addi            r11,r11,32
-       lvx             v30,r10,$sp
-       lvx             v31,r11,$sp
-       $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
-       $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
-       $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
-       $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
-       $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
-       $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
-       addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
-       blr
-       .long           0
-       .byte           0,12,0x04,1,0x80,6,6,0
-       .long           0
-
-.align 5
-_aesp8_xts_dec5x:
-       vncipher        $out0,$out0,v24
-       vncipher        $out1,$out1,v24
-       vncipher        $out2,$out2,v24
-       vncipher        $out3,$out3,v24
-       vncipher        $out4,$out4,v24
-       lvx             v24,$x20,$key_          # round[3]
-       addi            $key_,$key_,0x20
-
-       vncipher        $out0,$out0,v25
-       vncipher        $out1,$out1,v25
-       vncipher        $out2,$out2,v25
-       vncipher        $out3,$out3,v25
-       vncipher        $out4,$out4,v25
-       lvx             v25,$x10,$key_          # round[4]
-       bdnz            _aesp8_xts_dec5x
-
-       subi            r0,$taillen,1
-       vncipher        $out0,$out0,v24
-       vncipher        $out1,$out1,v24
-       vncipher        $out2,$out2,v24
-       vncipher        $out3,$out3,v24
-       vncipher        $out4,$out4,v24
-
-       andi.           r0,r0,16
-       cmpwi           $taillen,0
-       vncipher        $out0,$out0,v25
-       vncipher        $out1,$out1,v25
-       vncipher        $out2,$out2,v25
-       vncipher        $out3,$out3,v25
-       vncipher        $out4,$out4,v25
-        vxor           $twk0,$twk0,v31
-
-       sub             $inp,$inp,r0
-       vncipher        $out0,$out0,v26
-       vncipher        $out1,$out1,v26
-       vncipher        $out2,$out2,v26
-       vncipher        $out3,$out3,v26
-       vncipher        $out4,$out4,v26
-        vxor           $in1,$twk1,v31
-
-       vncipher        $out0,$out0,v27
-       lvx_u           $in0,0,$inp
-       vncipher        $out1,$out1,v27
-       vncipher        $out2,$out2,v27
-       vncipher        $out3,$out3,v27
-       vncipher        $out4,$out4,v27
-        vxor           $in2,$twk2,v31
-
-       addi            $key_,$sp,$FRAME+15     # rewind $key_
-       vncipher        $out0,$out0,v28
-       vncipher        $out1,$out1,v28
-       vncipher        $out2,$out2,v28
-       vncipher        $out3,$out3,v28
-       vncipher        $out4,$out4,v28
-       lvx             v24,$x00,$key_          # re-pre-load round[1]
-        vxor           $in3,$twk3,v31
-
-       vncipher        $out0,$out0,v29
-       le?vperm        $in0,$in0,$in0,$leperm
-       vncipher        $out1,$out1,v29
-       vncipher        $out2,$out2,v29
-       vncipher        $out3,$out3,v29
-       vncipher        $out4,$out4,v29
-       lvx             v25,$x10,$key_          # re-pre-load round[2]
-        vxor           $in4,$twk4,v31
-
-       vncipher        $out0,$out0,v30
-       vncipher        $out1,$out1,v30
-       vncipher        $out2,$out2,v30
-       vncipher        $out3,$out3,v30
-       vncipher        $out4,$out4,v30
-
-       vncipherlast    $out0,$out0,$twk0
-       vncipherlast    $out1,$out1,$in1
-       vncipherlast    $out2,$out2,$in2
-       vncipherlast    $out3,$out3,$in3
-       vncipherlast    $out4,$out4,$in4
-       mtctr           $rounds
-       blr
-        .long          0
-        .byte          0,12,0x14,0,0,0,0,0
-___
-}}     }}}
-
-my $consts=1;
-foreach(split("\n",$code)) {
-        s/\`([^\`]*)\`/eval($1)/geo;
-
-       # constants table endian-specific conversion
-       if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
-           my $conv=$3;
-           my @bytes=();
-
-           # convert to endian-agnostic format
-           if ($1 eq "long") {
-             foreach (split(/,\s*/,$2)) {
-               my $l = /^0/?oct:int;
-               push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
-             }
-           } else {
-               @bytes = map(/^0/?oct:int,split(/,\s*/,$2));
-           }
-
-           # little-endian conversion
-           if ($flavour =~ /le$/o) {
-               SWITCH: for($conv)  {
-                   /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
-                   /\?rev/ && do   { @bytes=reverse(@bytes);    last; };
-               }
-           }
-
-           #emit
-           print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
-           next;
-       }
-       $consts=0 if (m/Lconsts:/o);    # end of table
-
-       # instructions prefixed with '?' are endian-specific and need
-       # to be adjusted accordingly...
-       if ($flavour =~ /le$/o) {       # little-endian
-           s/le\?//o           or
-           s/be\?/#be#/o       or
-           s/\?lvsr/lvsl/o     or
-           s/\?lvsl/lvsr/o     or
-           s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
-           s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
-           s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
-       } else {                        # big-endian
-           s/le\?/#le#/o       or
-           s/be\?//o           or
-           s/\?([a-z]+)/$1/o;
-       }
-
-        print $_,"\n";
-}
-
-close STDOUT;
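A note on the tweak arithmetic in the XTS code above: the recurring
vsrab/vaddubm/vand/vpermxor groups annotated "next tweak value" vectorize the
XTS tweak update, i.e. multiplication of the 128-bit tweak by x in GF(2^128),
reduced modulo x^128 + x^7 + x^2 + x + 1.  A scalar sketch of the same
computation (purely illustrative; this helper is not part of the patch):

	static void gf128_mul_x(unsigned char t[16])
	{
		/* XTS stores the tweak little-endian: byte 15 holds the top bits. */
		unsigned char carry = t[15] >> 7;
		int i;

		/* Shift the whole 128-bit value left by one bit... */
		for (i = 15; i > 0; i--)
			t[i] = (t[i] << 1) | (t[i - 1] >> 7);
		/* ...and reduce: fold the shifted-out bit back in as 0x87. */
		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
	}

The vector form performs all 16 byte-shifts at once: vaddubm doubles every
byte, vsrab by 7 turns each byte's top bit into a 0x00/0xff mask, vand keeps
the 0x0101..87 constant where those masks are set, and vpermxor folds the
masked constant back in while routing each byte's carry to its neighbour.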
index 0b725e8..7d2beb7 100644 (file)
@@ -27,13 +27,9 @@ static int __init p8_init(void)
        if (ret)
                goto err;
 
-       ret = crypto_register_alg(&p8_aes_alg);
-       if (ret)
-               goto err_unregister_ghash;
-
        ret = crypto_register_skcipher(&p8_aes_cbc_alg);
        if (ret)
-               goto err_unregister_aes;
+               goto err_unregister_ghash;
 
        ret = crypto_register_skcipher(&p8_aes_ctr_alg);
        if (ret)
@@ -49,8 +45,6 @@ err_unregister_aes_ctr:
        crypto_unregister_skcipher(&p8_aes_ctr_alg);
 err_unregister_aes_cbc:
        crypto_unregister_skcipher(&p8_aes_cbc_alg);
-err_unregister_aes:
-       crypto_unregister_alg(&p8_aes_alg);
 err_unregister_ghash:
        crypto_unregister_shash(&p8_ghash_alg);
 err:
@@ -62,7 +56,6 @@ static void __exit p8_exit(void)
        crypto_unregister_skcipher(&p8_aes_xts_alg);
        crypto_unregister_skcipher(&p8_aes_ctr_alg);
        crypto_unregister_skcipher(&p8_aes_cbc_alg);
-       crypto_unregister_alg(&p8_aes_alg);
        crypto_unregister_shash(&p8_ghash_alg);
 }
 
@@ -74,4 +67,3 @@ MODULE_DESCRIPTION("IBM VMX cryptographic acceleration instructions "
                   "support on Power 8");
 MODULE_LICENSE("GPL");
 MODULE_VERSION("1.0.0");
-MODULE_IMPORT_NS("CRYPTO_INTERNAL");
index c893c92..bff71cf 100644 (file)
 #define AES_MAX_KEYLENGTH      (15 * 16)
 #define AES_MAX_KEYLENGTH_U32  (AES_MAX_KEYLENGTH / sizeof(u32))
 
+/*
+ * The POWER8 VSX optimized AES assembly code is borrowed from OpenSSL and
+ * inherits OpenSSL's AES_KEY format, which stores the number of rounds after
+ * the round keys.  That assembly code is difficult to change.  So for
+ * compatibility purposes we reserve space for the extra nrounds field on PPC64.
+ *
+ * Note: when prepared for decryption, the round keys are just the reversed
+ * standard round keys, not the round keys for the Equivalent Inverse Cipher.
+ */
+struct p8_aes_key {
+       u32 rndkeys[AES_MAX_KEYLENGTH_U32];
+       int nrounds;
+};
+
 union aes_enckey_arch {
        u32 rndkeys[AES_MAX_KEYLENGTH_U32];
 #ifdef CONFIG_CRYPTO_LIB_AES_ARCH
 #if defined(CONFIG_PPC) && defined(CONFIG_SPE)
        /* Used unconditionally (when SPE AES code is enabled in kconfig) */
        u32 spe_enc_key[AES_MAX_KEYLENGTH_U32] __aligned(8);
+#elif defined(CONFIG_PPC)
+       /*
+        * Kernels that include the POWER8 VSX optimized AES code use this field
+        * when that code is usable at key preparation time.  Otherwise they
+        * fall back to rndkeys.  In the latter case, p8.nrounds (which doesn't
+        * overlap rndkeys) is set to 0 to differentiate the two formats.
+        */
+       struct p8_aes_key p8;
 #endif
 #endif /* CONFIG_CRYPTO_LIB_AES_ARCH */
 };
@@ -34,6 +56,9 @@ union aes_invkey_arch {
 #if defined(CONFIG_PPC) && defined(CONFIG_SPE)
        /* Used unconditionally (when SPE AES code is enabled in kconfig) */
        u32 spe_dec_key[AES_MAX_KEYLENGTH_U32] __aligned(8);
+#elif defined(CONFIG_PPC)
+       /* Used conditionally, analogous to aes_enckey_arch::p8 */
+       struct p8_aes_key p8;
 #endif
 #endif /* CONFIG_CRYPTO_LIB_AES_ARCH */
 };
@@ -155,6 +180,22 @@ void ppc_encrypt_xts(u8 *out, const u8 *in, u32 *key_enc, u32 rounds, u32 bytes,
                     u8 *iv, u32 *key_twk);
 void ppc_decrypt_xts(u8 *out, const u8 *in, u32 *key_dec, u32 rounds, u32 bytes,
                     u8 *iv, u32 *key_twk);
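+/*
+ * Primitives implemented by the POWER8 VSX assembly in
+ * lib/crypto/powerpc/aesp8-ppc.pl; see the p8_aes_key comment above for the
+ * key format they use.
+ */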
+int aes_p8_set_encrypt_key(const u8 *userKey, const int bits,
+                          struct p8_aes_key *key);
+int aes_p8_set_decrypt_key(const u8 *userKey, const int bits,
+                          struct p8_aes_key *key);
+void aes_p8_encrypt(const u8 *in, u8 *out, const struct p8_aes_key *key);
+void aes_p8_decrypt(const u8 *in, u8 *out, const struct p8_aes_key *key);
+void aes_p8_cbc_encrypt(const u8 *in, u8 *out, size_t len,
+                       const struct p8_aes_key *key, u8 *iv, const int enc);
+void aes_p8_ctr32_encrypt_blocks(const u8 *in, u8 *out, size_t len,
+                                const struct p8_aes_key *key, const u8 *iv);
+void aes_p8_xts_encrypt(const u8 *in, u8 *out, size_t len,
+                       const struct p8_aes_key *key1,
+                       const struct p8_aes_key *key2, u8 *iv);
+void aes_p8_xts_decrypt(const u8 *in, u8 *out, size_t len,
+                       const struct p8_aes_key *key1,
+                       const struct p8_aes_key *key2, u8 *iv);
 #endif
 
 /**
index cfc6171..a0f1c10 100644 (file)
@@ -16,7 +16,7 @@ config CRYPTO_LIB_AES_ARCH
        depends on CRYPTO_LIB_AES && !UML && !KMSAN
        default y if ARM
        default y if ARM64
-       default y if PPC && SPE
+       default y if PPC && (SPE || (PPC64 && VSX))
 
 config CRYPTO_LIB_AESCFB
        tristate
index d68fde0..1614061 100644 (file)
@@ -35,7 +35,19 @@ libaes-y += powerpc/aes-spe-core.o \
            powerpc/aes-spe-keys.o \
            powerpc/aes-spe-modes.o \
            powerpc/aes-tab-4k.o
-endif
+else
+libaes-y += powerpc/aesp8-ppc.o
+aes-perlasm-flavour-y := linux-ppc64
+aes-perlasm-flavour-$(CONFIG_PPC64_ELF_ABI_V2) := linux-ppc64-elfv2
+aes-perlasm-flavour-$(CONFIG_CPU_LITTLE_ENDIAN) := linux-ppc64le
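+# The last matching assignment wins: a little-endian kernel (which always uses
+# the ELFv2 ABI) overrides both big-endian flavours.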
+quiet_cmd_perlasm_aes = PERLASM $@
+      cmd_perlasm_aes = $(PERL) $< $(aes-perlasm-flavour-y) $@
+# Use if_changed instead of cmd, in case the flavour changed.
+$(obj)/powerpc/aesp8-ppc.S: $(src)/powerpc/aesp8-ppc.pl FORCE
+       $(call if_changed,perlasm_aes)
+targets += powerpc/aesp8-ppc.S
+OBJECT_FILES_NON_STANDARD_powerpc/aesp8-ppc.o := y
+endif # !CONFIG_SPE
 endif # CONFIG_PPC
 
 endif # CONFIG_CRYPTO_LIB_AES_ARCH
diff --git a/lib/crypto/powerpc/.gitignore b/lib/crypto/powerpc/.gitignore
new file mode 100644 (file)
index 0000000..598ca7a
--- /dev/null
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+aesp8-ppc.S
index cf22020..42e0a99 100644 (file)
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ * Copyright (C) 2015 International Business Machines Inc.
  * Copyright 2026 Google LLC
  */
 #include <asm/simd.h>
@@ -10,6 +11,8 @@
 #include <linux/preempt.h>
 #include <linux/uaccess.h>
 
+#ifdef CONFIG_SPE
+
 EXPORT_SYMBOL_GPL(ppc_expand_key_128);
 EXPORT_SYMBOL_GPL(ppc_expand_key_192);
 EXPORT_SYMBOL_GPL(ppc_expand_key_256);
@@ -72,3 +75,164 @@ static void aes_decrypt_arch(const struct aes_key *key,
        ppc_decrypt_aes(out, in, key->inv_k.spe_dec_key, key->nrounds / 2 - 1);
        spe_end();
 }
+
+#else /* CONFIG_SPE */
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto);
+
+EXPORT_SYMBOL_GPL(aes_p8_set_encrypt_key);
+EXPORT_SYMBOL_GPL(aes_p8_set_decrypt_key);
+EXPORT_SYMBOL_GPL(aes_p8_encrypt);
+EXPORT_SYMBOL_GPL(aes_p8_decrypt);
+EXPORT_SYMBOL_GPL(aes_p8_cbc_encrypt);
+EXPORT_SYMBOL_GPL(aes_p8_ctr32_encrypt_blocks);
+EXPORT_SYMBOL_GPL(aes_p8_xts_encrypt);
+EXPORT_SYMBOL_GPL(aes_p8_xts_decrypt);
+
+static inline bool is_vsx_format(const struct p8_aes_key *key)
+{
+       return key->nrounds != 0;
+}
+
+/*
+ * Convert a round key from VSX to generic format by reflecting the 16 bytes,
+ * and (if apply_inv_mix=true) applying InvMixColumn to each column.
+ *
+ * It would be nice if the VSX and generic key formats were compatible.  That
+ * is difficult to achieve, though, since the assembly code was borrowed from
+ * OpenSSL and targets POWER8 rather than POWER9.
+ *
+ * Fortunately, this conversion should only be needed in extremely rare cases,
+ * possibly not at all in practice.  It's just included for full correctness.
+ */
+static void rndkey_from_vsx(u32 out[4], const u32 in[4], bool apply_inv_mix)
+{
+       u32 k0 = swab32(in[0]);
+       u32 k1 = swab32(in[1]);
+       u32 k2 = swab32(in[2]);
+       u32 k3 = swab32(in[3]);
+
+       if (apply_inv_mix) {
+               k0 = inv_mix_columns(k0);
+               k1 = inv_mix_columns(k1);
+               k2 = inv_mix_columns(k2);
+               k3 = inv_mix_columns(k3);
+       }
+       out[0] = k3;
+       out[1] = k2;
+       out[2] = k1;
+       out[3] = k0;
+}
+
+static void aes_preparekey_arch(union aes_enckey_arch *k,
+                               union aes_invkey_arch *inv_k,
+                               const u8 *in_key, int key_len, int nrounds)
+{
+       const int keybits = 8 * key_len;
+       int ret;
+
+       if (static_branch_likely(&have_vec_crypto) && likely(may_use_simd())) {
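+               /*
+                * Kernel-mode VSX use must be bracketed by
+                * enable_kernel_vsx()/disable_kernel_vsx() with preemption
+                * and page faults disabled.
+                */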
+               preempt_disable();
+               pagefault_disable();
+               enable_kernel_vsx();
+               ret = aes_p8_set_encrypt_key(in_key, keybits, &k->p8);
+               /*
+                * aes_p8_set_encrypt_key() should never fail here, since the
+                * key length was already validated.
+                */
+               WARN_ON_ONCE(ret);
+               if (inv_k) {
+                       ret = aes_p8_set_decrypt_key(in_key, keybits,
+                                                    &inv_k->p8);
+                       /* ... and likewise for aes_p8_set_decrypt_key(). */
+                       WARN_ON_ONCE(ret);
+               }
+               disable_kernel_vsx();
+               pagefault_enable();
+               preempt_enable();
+       } else {
+               aes_expandkey_generic(k->rndkeys,
+                                     inv_k ? inv_k->inv_rndkeys : NULL,
+                                     in_key, key_len);
+               /* Mark the key as using the generic format. */
+               k->p8.nrounds = 0;
+               if (inv_k)
+                       inv_k->p8.nrounds = 0;
+       }
+}
+
+static void aes_encrypt_arch(const struct aes_enckey *key,
+                            u8 out[AES_BLOCK_SIZE],
+                            const u8 in[AES_BLOCK_SIZE])
+{
+       if (static_branch_likely(&have_vec_crypto) &&
+           likely(is_vsx_format(&key->k.p8) && may_use_simd())) {
+               preempt_disable();
+               pagefault_disable();
+               enable_kernel_vsx();
+               aes_p8_encrypt(in, out, &key->k.p8);
+               disable_kernel_vsx();
+               pagefault_enable();
+               preempt_enable();
+       } else if (unlikely(is_vsx_format(&key->k.p8))) {
+               /*
+                * This handles (the hopefully extremely rare) case where a key
+                * was prepared using the VSX optimized format, then encryption
+                * is done in a context that cannot use VSX instructions.
+                */
+               u32 rndkeys[AES_MAX_KEYLENGTH_U32];
+
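+               /* Encryption keys need only the byte order fixed up. */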
+               for (int i = 0; i < 4 * (key->nrounds + 1); i += 4)
+                       rndkey_from_vsx(&rndkeys[i],
+                                       &key->k.p8.rndkeys[i], false);
+               aes_encrypt_generic(rndkeys, key->nrounds, out, in);
+       } else {
+               aes_encrypt_generic(key->k.rndkeys, key->nrounds, out, in);
+       }
+}
+
+static void aes_decrypt_arch(const struct aes_key *key, u8 out[AES_BLOCK_SIZE],
+                            const u8 in[AES_BLOCK_SIZE])
+{
+       if (static_branch_likely(&have_vec_crypto) &&
+           likely(is_vsx_format(&key->inv_k.p8) && may_use_simd())) {
+               preempt_disable();
+               pagefault_disable();
+               enable_kernel_vsx();
+               aes_p8_decrypt(in, out, &key->inv_k.p8);
+               disable_kernel_vsx();
+               pagefault_enable();
+               preempt_enable();
+       } else if (unlikely(is_vsx_format(&key->inv_k.p8))) {
+               /*
+                * This handles the (hopefully extremely rare) case where a key
+                * was prepared using the VSX optimized format, but decryption
+                * is then done in a context that cannot use VSX instructions.
+                */
+               u32 inv_rndkeys[AES_MAX_KEYLENGTH_U32];
+               int i;
+
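+               /*
+                * The VSX decryption schedule is the encryption schedule in
+                * reverse order, whereas the generic equivalent-inverse
+                * cipher wants InvMixColumns applied to every round key
+                * except the first and last; hence 'true' only for the
+                * middle keys.
+                */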
+               rndkey_from_vsx(&inv_rndkeys[0],
+                               &key->inv_k.p8.rndkeys[0], false);
+               for (i = 4; i < 4 * key->nrounds; i += 4) {
+                       rndkey_from_vsx(&inv_rndkeys[i],
+                                       &key->inv_k.p8.rndkeys[i], true);
+               }
+               rndkey_from_vsx(&inv_rndkeys[i],
+                               &key->inv_k.p8.rndkeys[i], false);
+               aes_decrypt_generic(inv_rndkeys, key->nrounds, out, in);
+       } else {
+               aes_decrypt_generic(key->inv_k.inv_rndkeys, key->nrounds,
+                                   out, in);
+       }
+}
+
+#define aes_mod_init_arch aes_mod_init_arch
+static void aes_mod_init_arch(void)
+{
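+       /*
+        * The v(n)cipher instructions are part of PowerISA 2.07, first
+        * implemented by POWER8; the vector-crypto facility is reported
+        * separately from the base ISA level, so both bits are checked.
+        */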
+       if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+           (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_VEC_CRYPTO))
+               static_branch_enable(&have_vec_crypto);
+}
+
+#endif /* !CONFIG_SPE */
diff --git a/lib/crypto/powerpc/aesp8-ppc.pl b/lib/crypto/powerpc/aesp8-ppc.pl
new file mode 100644 (file)
index 0000000..253a067
--- /dev/null
@@ -0,0 +1,3890 @@
+#! /usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from CRYPTOGAMs[1] and is included here using the option
+# in the license to distribute the code under the GPL. Therefore this program
+# is free software; you can redistribute it and/or modify it under the terms of
+# the GNU General Public License version 2 as published by the Free Software
+# Foundation.
+#
+# [1] https://www.openssl.org/~appro/cryptogams/
+
+# Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+#       * Redistributions of source code must retain copyright notices,
+#         this list of conditions and the following disclaimer.
+#
+#       * Redistributions in binary form must reproduce the above
+#         copyright notice, this list of conditions and the following
+#         disclaimer in the documentation and/or other materials
+#         provided with the distribution.
+#
+#       * Neither the name of the CRYPTOGAMS nor the names of its
+#         copyright holder and contributors may be used to endorse or
+#         promote products derived from this software without specific
+#         prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for AES instructions as per PowerISA
+# specification version 2.07, first implemented by the POWER8 processor.
+# The module is endian-agnostic in the sense that it supports both big-
+# and little-endian cases. Data alignment in parallelizable modes is
+# handled with VSX loads and stores, which implies that the MSR.VSX flag
+# must be set. It should also be noted that the ISA specification doesn't
+# prohibit alignment exceptions for these instructions on page boundaries.
+# Initially alignment was handled in the pure AltiVec/VMX way [with data
+# aligned programmatically, which in turn guarantees exception-free
+# execution], but that turned out to hamper performance when vcipher
+# instructions are interleaved. It's reckoned that the occasional
+# misalignment penalties at page boundaries are on average lower than
+# the additional overhead of the pure AltiVec approach.
+#
+# May 2016
+#
+# Added XTS subroutine; improvements of 9x on little-endian and 12x on
+# big-endian systems were measured.
+#
+######################################################################
+# Current large-block performance in cycles per byte processed with
+# 128-bit key (less is better).
+#
+#              CBC en-/decrypt CTR     XTS
+# POWER8[le]   3.96/0.72       0.74    1.1
+# POWER8[be]   3.75/0.65       0.66    1.0
+
+$flavour = shift;
+
+if ($flavour =~ /64/) {
+       $SIZE_T =8;
+       $LRSAVE =2*$SIZE_T;
+       $STU    ="stdu";
+       $POP    ="ld";
+       $PUSH   ="std";
+       $UCMP   ="cmpld";
+       $SHL    ="sldi";
+} elsif ($flavour =~ /32/) {
+       $SIZE_T =4;
+       $LRSAVE =$SIZE_T;
+       $STU    ="stwu";
+       $POP    ="lwz";
+       $PUSH   ="stw";
+       $UCMP   ="cmplw";
+       $SHL    ="slwi";
+} else { die "nonsense $flavour"; }
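+
+# As a usage sketch, a stand-alone run might look like
+# "perl aesp8-ppc.pl linux-ppc64le > aesp8-ppc.S": the flavour argument
+# selects the 32-/64-bit, big-/little-endian variant, and the output is
+# piped through ppc-xlate.pl (see the search just below) before reaching
+# stdout.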
+
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+( $xlate="${dir}../../../arch/powerpc/crypto/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
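+
+# A note on notation used throughout: mnemonics prefixed with "le?" are
+# emitted only for little-endian flavours, while a leading "?" marks
+# endian-sensitive instructions that ppc-xlate.pl is expected to adjust
+# for the target byte order.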
+
+$FRAME=8*$SIZE_T;
+$prefix="aes_p8";
+
+$sp="r1";
+$vrsave="r12";
+
+#########################################################################
+{{{    # Key setup procedures                                          #
+my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
+my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
+my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
+
+$code.=<<___;
+.machine       "any"
+
+.text
+
+.align 7
+rcon:
+.long  0x01000000, 0x01000000, 0x01000000, 0x01000000  ?rev
+.long  0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000  ?rev
+.long  0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c  ?rev
+.long  0,0,0,0                                         ?asis
+.long  0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe
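+# Lconsts materializes the run-time address of the rcon table: bcl sets
+# the link register to the address of the instruction following it, and
+# subtracting 0x58 (five 16-byte table rows plus the two instructions
+# before the second mflr) yields a position-independent pointer to rcon.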
+Lconsts:
+       mflr    r0
+       bcl     20,31,\$+4
+       mflr    $ptr                    # distance between . and rcon
+       addi    $ptr,$ptr,-0x58
+       mtlr    r0
+       blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
+.asciz "AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+
+.globl .${prefix}_set_encrypt_key
+Lset_encrypt_key:
+       mflr            r11
+       $PUSH           r11,$LRSAVE($sp)
+
+       li              $ptr,-1
+       ${UCMP}i        $inp,0
+       beq-            Lenc_key_abort          # if ($inp==0) return -1;
+       ${UCMP}i        $out,0
+       beq-            Lenc_key_abort          # if ($out==0) return -1;
+       li              $ptr,-2
+       cmpwi           $bits,128
+       blt-            Lenc_key_abort
+       cmpwi           $bits,256
+       bgt-            Lenc_key_abort
+       andi.           r0,$bits,0x3f
+       bne-            Lenc_key_abort
+
+       lis             r0,0xfff0
+       mfspr           $vrsave,256
+       mtspr           256,r0
+
+       bl              Lconsts
+       mtlr            r11
+
+       neg             r9,$inp
+       lvx             $in0,0,$inp
+       addi            $inp,$inp,15            # 15 is not typo
+       lvsr            $key,0,r9               # borrow $key
+       li              r8,0x20
+       cmpwi           $bits,192
+       lvx             $in1,0,$inp
+       le?vspltisb     $mask,0x0f              # borrow $mask
+       lvx             $rcon,0,$ptr
+       le?vxor         $key,$key,$mask         # adjust for byte swap
+       lvx             $mask,r8,$ptr
+       addi            $ptr,$ptr,0x10
+       vperm           $in0,$in0,$in1,$key     # align [and byte swap in LE]
+       li              $cnt,8
+       vxor            $zero,$zero,$zero
+       mtctr           $cnt
+
+       ?lvsr           $outperm,0,$out
+       vspltisb        $outmask,-1
+       lvx             $outhead,0,$out
+       ?vperm          $outmask,$zero,$outmask,$outperm
+
+       blt             Loop128
+       addi            $inp,$inp,8
+       beq             L192
+       addi            $inp,$inp,8
+       b               L256
+
+.align 4
+Loop128:
+       vperm           $key,$in0,$in0,$mask    # rotate-n-splat
+       vsldoi          $tmp,$zero,$in0,12      # >>32
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+       vcipherlast     $key,$key,$rcon
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+        vadduwm        $rcon,$rcon,$rcon
+       vxor            $in0,$in0,$key
+       bdnz            Loop128
+
+       lvx             $rcon,0,$ptr            # last two round keys
+
+       vperm           $key,$in0,$in0,$mask    # rotate-n-splat
+       vsldoi          $tmp,$zero,$in0,12      # >>32
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+       vcipherlast     $key,$key,$rcon
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+        vadduwm        $rcon,$rcon,$rcon
+       vxor            $in0,$in0,$key
+
+       vperm           $key,$in0,$in0,$mask    # rotate-n-splat
+       vsldoi          $tmp,$zero,$in0,12      # >>32
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+       vcipherlast     $key,$key,$rcon
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+       vxor            $in0,$in0,$key
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+        stvx           $stage,0,$out
+
+       addi            $inp,$out,15            # 15 is not typo
+       addi            $out,$out,0x50
+
+       li              $rounds,10
+       b               Ldone
+
+.align 4
+L192:
+       lvx             $tmp,0,$inp
+       li              $cnt,4
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+       vperm           $in1,$in1,$tmp,$key     # align [and byte swap in LE]
+       vspltisb        $key,8                  # borrow $key
+       mtctr           $cnt
+       vsububm         $mask,$mask,$key        # adjust the mask
+
+Loop192:
+       vperm           $key,$in1,$in1,$mask    # rotate-n-splat
+       vsldoi          $tmp,$zero,$in0,12      # >>32
+       vcipherlast     $key,$key,$rcon
+
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+
+        vsldoi         $stage,$zero,$in1,8
+       vspltw          $tmp,$in0,3
+       vxor            $tmp,$tmp,$in1
+       vsldoi          $in1,$zero,$in1,12      # >>32
+        vadduwm        $rcon,$rcon,$rcon
+       vxor            $in1,$in1,$tmp
+       vxor            $in0,$in0,$key
+       vxor            $in1,$in1,$key
+        vsldoi         $stage,$stage,$in0,8
+
+       vperm           $key,$in1,$in1,$mask    # rotate-n-splat
+       vsldoi          $tmp,$zero,$in0,12      # >>32
+        vperm          $outtail,$stage,$stage,$outperm # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+       vcipherlast     $key,$key,$rcon
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+
+        vsldoi         $stage,$in0,$in1,8
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+        vperm          $outtail,$stage,$stage,$outperm # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+
+       vspltw          $tmp,$in0,3
+       vxor            $tmp,$tmp,$in1
+       vsldoi          $in1,$zero,$in1,12      # >>32
+        vadduwm        $rcon,$rcon,$rcon
+       vxor            $in1,$in1,$tmp
+       vxor            $in0,$in0,$key
+       vxor            $in1,$in1,$key
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+        stvx           $stage,0,$out
+        addi           $inp,$out,15            # 15 is not typo
+        addi           $out,$out,16
+       bdnz            Loop192
+
+       li              $rounds,12
+       addi            $out,$out,0x20
+       b               Ldone
+
+.align 4
+L256:
+       lvx             $tmp,0,$inp
+       li              $cnt,7
+       li              $rounds,14
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+       vperm           $in1,$in1,$tmp,$key     # align [and byte swap in LE]
+       mtctr           $cnt
+
+Loop256:
+       vperm           $key,$in1,$in1,$mask    # rotate-n-splat
+       vsldoi          $tmp,$zero,$in0,12      # >>32
+        vperm          $outtail,$in1,$in1,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+       vcipherlast     $key,$key,$rcon
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+        vadduwm        $rcon,$rcon,$rcon
+       vxor            $in0,$in0,$key
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+        stvx           $stage,0,$out
+        addi           $inp,$out,15            # 15 is not typo
+        addi           $out,$out,16
+       bdz             Ldone
+
+       vspltw          $key,$in0,3             # just splat
+       vsldoi          $tmp,$zero,$in1,12      # >>32
+       vsbox           $key,$key
+
+       vxor            $in1,$in1,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in1,$in1,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in1,$in1,$tmp
+
+       vxor            $in1,$in1,$key
+       b               Loop256
+
+.align 4
+Ldone:
+       lvx             $in1,0,$inp             # redundant in aligned case
+       vsel            $in1,$outhead,$in1,$outmask
+       stvx            $in1,0,$inp
+       li              $ptr,0
+       mtspr           256,$vrsave
+       stw             $rounds,0($out)
+
+Lenc_key_abort:
+       mr              r3,$ptr
+       blr
+       .long           0
+       .byte           0,12,0x14,1,0,0,3,0
+       .long           0
+.size  .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
+
+.globl .${prefix}_set_decrypt_key
+       $STU            $sp,-$FRAME($sp)
+       mflr            r10
+       $PUSH           r10,$FRAME+$LRSAVE($sp)
+       bl              Lset_encrypt_key
+       mtlr            r10
+
+       cmpwi           r3,0
+       bne-            Ldec_key_abort
+
+       slwi            $cnt,$rounds,4
+       subi            $inp,$out,240           # first round key
+       srwi            $rounds,$rounds,1
+       add             $out,$inp,$cnt          # last round key
+       mtctr           $rounds
+
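+# The loop below swaps the expanded round keys end-for-end, 16 bytes at
+# a time: the decryption schedule is simply the encryption schedule in
+# reverse order, since vncipher does not require InvMixColumns-adjusted
+# round keys.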
+Ldeckey:
+       lwz             r0, 0($inp)
+       lwz             r6, 4($inp)
+       lwz             r7, 8($inp)
+       lwz             r8, 12($inp)
+       addi            $inp,$inp,16
+       lwz             r9, 0($out)
+       lwz             r10,4($out)
+       lwz             r11,8($out)
+       lwz             r12,12($out)
+       stw             r0, 0($out)
+       stw             r6, 4($out)
+       stw             r7, 8($out)
+       stw             r8, 12($out)
+       subi            $out,$out,16
+       stw             r9, -16($inp)
+       stw             r10,-12($inp)
+       stw             r11,-8($inp)
+       stw             r12,-4($inp)
+       bdnz            Ldeckey
+
+       xor             r3,r3,r3                # return value
+Ldec_key_abort:
+       addi            $sp,$sp,$FRAME
+       blr
+       .long           0
+       .byte           0,12,4,1,0x80,0,3,0
+       .long           0
+.size  .${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
+___
+}}}
+#########################################################################
+{{{    # Single block en- and decrypt procedures                       #
+sub gen_block () {
+my $dir = shift;
+my $n   = $dir eq "de" ? "n" : "";
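+# For the decrypt direction $n = "n", so the v${n}cipher[last] ops below
+# become vncipher[last], the inverse-round AES instructions.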
+my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));
+
+$code.=<<___;
+.globl .${prefix}_${dir}crypt
+       lwz             $rounds,240($key)
+       lis             r0,0xfc00
+       mfspr           $vrsave,256
+       li              $idx,15                 # 15 is not typo
+       mtspr           256,r0
+
+       lvx             v0,0,$inp
+       neg             r11,$out
+       lvx             v1,$idx,$inp
+       lvsl            v2,0,$inp               # inpperm
+       le?vspltisb     v4,0x0f
+       ?lvsl           v3,0,r11                # outperm
+       le?vxor         v2,v2,v4
+       li              $idx,16
+       vperm           v0,v0,v1,v2             # align [and byte swap in LE]
+       lvx             v1,0,$key
+       ?lvsl           v5,0,$key               # keyperm
+       srwi            $rounds,$rounds,1
+       lvx             v2,$idx,$key
+       addi            $idx,$idx,16
+       subi            $rounds,$rounds,1
+       ?vperm          v1,v1,v2,v5             # align round key
+
+       vxor            v0,v0,v1
+       lvx             v1,$idx,$key
+       addi            $idx,$idx,16
+       mtctr           $rounds
+
+Loop_${dir}c:
+       ?vperm          v2,v2,v1,v5
+       v${n}cipher     v0,v0,v2
+       lvx             v2,$idx,$key
+       addi            $idx,$idx,16
+       ?vperm          v1,v1,v2,v5
+       v${n}cipher     v0,v0,v1
+       lvx             v1,$idx,$key
+       addi            $idx,$idx,16
+       bdnz            Loop_${dir}c
+
+       ?vperm          v2,v2,v1,v5
+       v${n}cipher     v0,v0,v2
+       lvx             v2,$idx,$key
+       ?vperm          v1,v1,v2,v5
+       v${n}cipherlast v0,v0,v1
+
+       vspltisb        v2,-1
+       vxor            v1,v1,v1
+       li              $idx,15                 # 15 is not typo
+       ?vperm          v2,v1,v2,v3             # outmask
+       le?vxor         v3,v3,v4
+       lvx             v1,0,$out               # outhead
+       vperm           v0,v0,v0,v3             # rotate [and byte swap in LE]
+       vsel            v1,v1,v0,v2
+       lvx             v4,$idx,$out
+       stvx            v1,0,$out
+       vsel            v0,v0,v4,v2
+       stvx            v0,$idx,$out
+
+       mtspr           256,$vrsave
+       blr
+       .long           0
+       .byte           0,12,0x14,0,0,0,3,0
+       .long           0
+.size  .${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
+___
+}
+&gen_block("en");
+&gen_block("de");
+}}}
+#########################################################################
+{{{    # CBC en- and decrypt procedures                                #
+my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
+my ($rndkey0,$rndkey1,$inout,$tmp)=            map("v$_",(0..3));
+my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
+                                               map("v$_",(4..10));
+$code.=<<___;
+.globl .${prefix}_cbc_encrypt
+       ${UCMP}i        $len,16
+       bltlr-
+
+       cmpwi           $enc,0                  # test direction
+       lis             r0,0xffe0
+       mfspr           $vrsave,256
+       mtspr           256,r0
+
+       li              $idx,15
+       vxor            $rndkey0,$rndkey0,$rndkey0
+       le?vspltisb     $tmp,0x0f
+
+       lvx             $ivec,0,$ivp            # load [unaligned] iv
+       lvsl            $inpperm,0,$ivp
+       lvx             $inptail,$idx,$ivp
+       le?vxor         $inpperm,$inpperm,$tmp
+       vperm           $ivec,$ivec,$inptail,$inpperm
+
+       neg             r11,$inp
+       ?lvsl           $keyperm,0,$key         # prepare for unaligned key
+       lwz             $rounds,240($key)
+
+       lvsr            $inpperm,0,r11          # prepare for unaligned load
+       lvx             $inptail,0,$inp
+       addi            $inp,$inp,15            # 15 is not typo
+       le?vxor         $inpperm,$inpperm,$tmp
+
+       ?lvsr           $outperm,0,$out         # prepare for unaligned store
+       vspltisb        $outmask,-1
+       lvx             $outhead,0,$out
+       ?vperm          $outmask,$rndkey0,$outmask,$outperm
+       le?vxor         $outperm,$outperm,$tmp
+
+       srwi            $rounds,$rounds,1
+       li              $idx,16
+       subi            $rounds,$rounds,1
+       beq             Lcbc_dec
+
+Lcbc_enc:
+       vmr             $inout,$inptail
+       lvx             $inptail,0,$inp
+       addi            $inp,$inp,16
+       mtctr           $rounds
+       subi            $len,$len,16            # len-=16
+
+       lvx             $rndkey0,0,$key
+        vperm          $inout,$inout,$inptail,$inpperm
+       lvx             $rndkey1,$idx,$key
+       addi            $idx,$idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vxor            $inout,$inout,$rndkey0
+       lvx             $rndkey0,$idx,$key
+       addi            $idx,$idx,16
+       vxor            $inout,$inout,$ivec
+
+Loop_cbc_enc:
+       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
+       vcipher         $inout,$inout,$rndkey1
+       lvx             $rndkey1,$idx,$key
+       addi            $idx,$idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vcipher         $inout,$inout,$rndkey0
+       lvx             $rndkey0,$idx,$key
+       addi            $idx,$idx,16
+       bdnz            Loop_cbc_enc
+
+       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
+       vcipher         $inout,$inout,$rndkey1
+       lvx             $rndkey1,$idx,$key
+       li              $idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vcipherlast     $ivec,$inout,$rndkey0
+       ${UCMP}i        $len,16
+
+       vperm           $tmp,$ivec,$ivec,$outperm
+       vsel            $inout,$outhead,$tmp,$outmask
+       vmr             $outhead,$tmp
+       stvx            $inout,0,$out
+       addi            $out,$out,16
+       bge             Lcbc_enc
+
+       b               Lcbc_done
+
+.align 4
+Lcbc_dec:
+       ${UCMP}i        $len,128
+       bge             _aesp8_cbc_decrypt8x
+       vmr             $tmp,$inptail
+       lvx             $inptail,0,$inp
+       addi            $inp,$inp,16
+       mtctr           $rounds
+       subi            $len,$len,16            # len-=16
+
+       lvx             $rndkey0,0,$key
+        vperm          $tmp,$tmp,$inptail,$inpperm
+       lvx             $rndkey1,$idx,$key
+       addi            $idx,$idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vxor            $inout,$tmp,$rndkey0
+       lvx             $rndkey0,$idx,$key
+       addi            $idx,$idx,16
+
+Loop_cbc_dec:
+       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
+       vncipher        $inout,$inout,$rndkey1
+       lvx             $rndkey1,$idx,$key
+       addi            $idx,$idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vncipher        $inout,$inout,$rndkey0
+       lvx             $rndkey0,$idx,$key
+       addi            $idx,$idx,16
+       bdnz            Loop_cbc_dec
+
+       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
+       vncipher        $inout,$inout,$rndkey1
+       lvx             $rndkey1,$idx,$key
+       li              $idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vncipherlast    $inout,$inout,$rndkey0
+       ${UCMP}i        $len,16
+
+       vxor            $inout,$inout,$ivec
+       vmr             $ivec,$tmp
+       vperm           $tmp,$inout,$inout,$outperm
+       vsel            $inout,$outhead,$tmp,$outmask
+       vmr             $outhead,$tmp
+       stvx            $inout,0,$out
+       addi            $out,$out,16
+       bge             Lcbc_dec
+
+Lcbc_done:
+       addi            $out,$out,-1
+       lvx             $inout,0,$out           # redundant in aligned case
+       vsel            $inout,$outhead,$inout,$outmask
+       stvx            $inout,0,$out
+
+       neg             $enc,$ivp               # write [unaligned] iv
+       li              $idx,15                 # 15 is not typo
+       vxor            $rndkey0,$rndkey0,$rndkey0
+       vspltisb        $outmask,-1
+       le?vspltisb     $tmp,0x0f
+       ?lvsl           $outperm,0,$enc
+       ?vperm          $outmask,$rndkey0,$outmask,$outperm
+       le?vxor         $outperm,$outperm,$tmp
+       lvx             $outhead,0,$ivp
+       vperm           $ivec,$ivec,$ivec,$outperm
+       vsel            $inout,$outhead,$ivec,$outmask
+       lvx             $inptail,$idx,$ivp
+       stvx            $inout,0,$ivp
+       vsel            $inout,$ivec,$inptail,$outmask
+       stvx            $inout,$idx,$ivp
+
+       mtspr           256,$vrsave
+       blr
+       .long           0
+       .byte           0,12,0x14,0,0,0,6,0
+       .long           0
+___
+#########################################################################
+{{     # Optimized CBC decrypt procedure                               #
+my $key_="r11";
+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
+my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
+my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
+my $rndkey0="v23";     # v24-v25 rotating buffer for first found keys
+                       # v26-v31 last 6 round keys
+my ($tmp,$keyperm)=($in3,$in4);        # aliases with "caller", redundant assignment
+
+$code.=<<___;
+.align 5
+_aesp8_cbc_decrypt8x:
+       $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+       li              r10,`$FRAME+8*16+15`
+       li              r11,`$FRAME+8*16+31`
+       stvx            v20,r10,$sp             # ABI says so
+       addi            r10,r10,32
+       stvx            v21,r11,$sp
+       addi            r11,r11,32
+       stvx            v22,r10,$sp
+       addi            r10,r10,32
+       stvx            v23,r11,$sp
+       addi            r11,r11,32
+       stvx            v24,r10,$sp
+       addi            r10,r10,32
+       stvx            v25,r11,$sp
+       addi            r11,r11,32
+       stvx            v26,r10,$sp
+       addi            r10,r10,32
+       stvx            v27,r11,$sp
+       addi            r11,r11,32
+       stvx            v28,r10,$sp
+       addi            r10,r10,32
+       stvx            v29,r11,$sp
+       addi            r11,r11,32
+       stvx            v30,r10,$sp
+       stvx            v31,r11,$sp
+       li              r0,-1
+       stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
+       li              $x10,0x10
+       $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+       li              $x20,0x20
+       $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+       li              $x30,0x30
+       $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+       li              $x40,0x40
+       $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+       li              $x50,0x50
+       $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+       li              $x60,0x60
+       $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+       li              $x70,0x70
+       mtspr           256,r0
+
+       subi            $rounds,$rounds,3       # -4 in total
+       subi            $len,$len,128           # bias
+
+       lvx             $rndkey0,$x00,$key      # load key schedule
+       lvx             v30,$x10,$key
+       addi            $key,$key,0x20
+       lvx             v31,$x00,$key
+       ?vperm          $rndkey0,$rndkey0,v30,$keyperm
+       addi            $key_,$sp,$FRAME+15
+       mtctr           $rounds
+
+Load_cbc_dec_key:
+       ?vperm          v24,v30,v31,$keyperm
+       lvx             v30,$x10,$key
+       addi            $key,$key,0x20
+       stvx            v24,$x00,$key_          # off-load round[1]
+       ?vperm          v25,v31,v30,$keyperm
+       lvx             v31,$x00,$key
+       stvx            v25,$x10,$key_          # off-load round[2]
+       addi            $key_,$key_,0x20
+       bdnz            Load_cbc_dec_key
+
+       lvx             v26,$x10,$key
+       ?vperm          v24,v30,v31,$keyperm
+       lvx             v27,$x20,$key
+       stvx            v24,$x00,$key_          # off-load round[3]
+       ?vperm          v25,v31,v26,$keyperm
+       lvx             v28,$x30,$key
+       stvx            v25,$x10,$key_          # off-load round[4]
+       addi            $key_,$sp,$FRAME+15     # rewind $key_
+       ?vperm          v26,v26,v27,$keyperm
+       lvx             v29,$x40,$key
+       ?vperm          v27,v27,v28,$keyperm
+       lvx             v30,$x50,$key
+       ?vperm          v28,v28,v29,$keyperm
+       lvx             v31,$x60,$key
+       ?vperm          v29,v29,v30,$keyperm
+       lvx             $out0,$x70,$key         # borrow $out0
+       ?vperm          v30,v30,v31,$keyperm
+       lvx             v24,$x00,$key_          # pre-load round[1]
+       ?vperm          v31,v31,$out0,$keyperm
+       lvx             v25,$x10,$key_          # pre-load round[2]
+
+       #lvx            $inptail,0,$inp         # "caller" already did this
+       #addi           $inp,$inp,15            # 15 is not typo
+       subi            $inp,$inp,15            # undo "caller"
+
+        le?li          $idx,8
+       lvx_u           $in0,$x00,$inp          # load first 8 "words"
+        le?lvsl        $inpperm,0,$idx
+        le?vspltisb    $tmp,0x0f
+       lvx_u           $in1,$x10,$inp
+        le?vxor        $inpperm,$inpperm,$tmp  # transform for lvx_u/stvx_u
+       lvx_u           $in2,$x20,$inp
+        le?vperm       $in0,$in0,$in0,$inpperm
+       lvx_u           $in3,$x30,$inp
+        le?vperm       $in1,$in1,$in1,$inpperm
+       lvx_u           $in4,$x40,$inp
+        le?vperm       $in2,$in2,$in2,$inpperm
+       vxor            $out0,$in0,$rndkey0
+       lvx_u           $in5,$x50,$inp
+        le?vperm       $in3,$in3,$in3,$inpperm
+       vxor            $out1,$in1,$rndkey0
+       lvx_u           $in6,$x60,$inp
+        le?vperm       $in4,$in4,$in4,$inpperm
+       vxor            $out2,$in2,$rndkey0
+       lvx_u           $in7,$x70,$inp
+       addi            $inp,$inp,0x80
+        le?vperm       $in5,$in5,$in5,$inpperm
+       vxor            $out3,$in3,$rndkey0
+        le?vperm       $in6,$in6,$in6,$inpperm
+       vxor            $out4,$in4,$rndkey0
+        le?vperm       $in7,$in7,$in7,$inpperm
+       vxor            $out5,$in5,$rndkey0
+       vxor            $out6,$in6,$rndkey0
+       vxor            $out7,$in7,$rndkey0
+
+       mtctr           $rounds
+       b               Loop_cbc_dec8x
+.align 5
+Loop_cbc_dec8x:
+       vncipher        $out0,$out0,v24
+       vncipher        $out1,$out1,v24
+       vncipher        $out2,$out2,v24
+       vncipher        $out3,$out3,v24
+       vncipher        $out4,$out4,v24
+       vncipher        $out5,$out5,v24
+       vncipher        $out6,$out6,v24
+       vncipher        $out7,$out7,v24
+       lvx             v24,$x20,$key_          # round[3]
+       addi            $key_,$key_,0x20
+
+       vncipher        $out0,$out0,v25
+       vncipher        $out1,$out1,v25
+       vncipher        $out2,$out2,v25
+       vncipher        $out3,$out3,v25
+       vncipher        $out4,$out4,v25
+       vncipher        $out5,$out5,v25
+       vncipher        $out6,$out6,v25
+       vncipher        $out7,$out7,v25
+       lvx             v25,$x10,$key_          # round[4]
+       bdnz            Loop_cbc_dec8x
+
+       subic           $len,$len,128           # $len-=128
+       vncipher        $out0,$out0,v24
+       vncipher        $out1,$out1,v24
+       vncipher        $out2,$out2,v24
+       vncipher        $out3,$out3,v24
+       vncipher        $out4,$out4,v24
+       vncipher        $out5,$out5,v24
+       vncipher        $out6,$out6,v24
+       vncipher        $out7,$out7,v24
+
+       subfe.          r0,r0,r0                # borrow?-1:0
+       vncipher        $out0,$out0,v25
+       vncipher        $out1,$out1,v25
+       vncipher        $out2,$out2,v25
+       vncipher        $out3,$out3,v25
+       vncipher        $out4,$out4,v25
+       vncipher        $out5,$out5,v25
+       vncipher        $out6,$out6,v25
+       vncipher        $out7,$out7,v25
+
+       and             r0,r0,$len
+       vncipher        $out0,$out0,v26
+       vncipher        $out1,$out1,v26
+       vncipher        $out2,$out2,v26
+       vncipher        $out3,$out3,v26
+       vncipher        $out4,$out4,v26
+       vncipher        $out5,$out5,v26
+       vncipher        $out6,$out6,v26
+       vncipher        $out7,$out7,v26
+
+       add             $inp,$inp,r0            # $inp is adjusted in such
+                                               # way that at exit from the
+                                               # loop inX-in7 are loaded
+                                               # with last "words"
+       vncipher        $out0,$out0,v27
+       vncipher        $out1,$out1,v27
+       vncipher        $out2,$out2,v27
+       vncipher        $out3,$out3,v27
+       vncipher        $out4,$out4,v27
+       vncipher        $out5,$out5,v27
+       vncipher        $out6,$out6,v27
+       vncipher        $out7,$out7,v27
+
+       addi            $key_,$sp,$FRAME+15     # rewind $key_
+       vncipher        $out0,$out0,v28
+       vncipher        $out1,$out1,v28
+       vncipher        $out2,$out2,v28
+       vncipher        $out3,$out3,v28
+       vncipher        $out4,$out4,v28
+       vncipher        $out5,$out5,v28
+       vncipher        $out6,$out6,v28
+       vncipher        $out7,$out7,v28
+       lvx             v24,$x00,$key_          # re-pre-load round[1]
+
+       vncipher        $out0,$out0,v29
+       vncipher        $out1,$out1,v29
+       vncipher        $out2,$out2,v29
+       vncipher        $out3,$out3,v29
+       vncipher        $out4,$out4,v29
+       vncipher        $out5,$out5,v29
+       vncipher        $out6,$out6,v29
+       vncipher        $out7,$out7,v29
+       lvx             v25,$x10,$key_          # re-pre-load round[2]
+
+       vncipher        $out0,$out0,v30
+        vxor           $ivec,$ivec,v31         # xor with last round key
+       vncipher        $out1,$out1,v30
+        vxor           $in0,$in0,v31
+       vncipher        $out2,$out2,v30
+        vxor           $in1,$in1,v31
+       vncipher        $out3,$out3,v30
+        vxor           $in2,$in2,v31
+       vncipher        $out4,$out4,v30
+        vxor           $in3,$in3,v31
+       vncipher        $out5,$out5,v30
+        vxor           $in4,$in4,v31
+       vncipher        $out6,$out6,v30
+        vxor           $in5,$in5,v31
+       vncipher        $out7,$out7,v30
+        vxor           $in6,$in6,v31
+
+       vncipherlast    $out0,$out0,$ivec
+       vncipherlast    $out1,$out1,$in0
+        lvx_u          $in0,$x00,$inp          # load next input block
+       vncipherlast    $out2,$out2,$in1
+        lvx_u          $in1,$x10,$inp
+       vncipherlast    $out3,$out3,$in2
+        le?vperm       $in0,$in0,$in0,$inpperm
+        lvx_u          $in2,$x20,$inp
+       vncipherlast    $out4,$out4,$in3
+        le?vperm       $in1,$in1,$in1,$inpperm
+        lvx_u          $in3,$x30,$inp
+       vncipherlast    $out5,$out5,$in4
+        le?vperm       $in2,$in2,$in2,$inpperm
+        lvx_u          $in4,$x40,$inp
+       vncipherlast    $out6,$out6,$in5
+        le?vperm       $in3,$in3,$in3,$inpperm
+        lvx_u          $in5,$x50,$inp
+       vncipherlast    $out7,$out7,$in6
+        le?vperm       $in4,$in4,$in4,$inpperm
+        lvx_u          $in6,$x60,$inp
+       vmr             $ivec,$in7
+        le?vperm       $in5,$in5,$in5,$inpperm
+        lvx_u          $in7,$x70,$inp
+        addi           $inp,$inp,0x80
+
+       le?vperm        $out0,$out0,$out0,$inpperm
+       le?vperm        $out1,$out1,$out1,$inpperm
+       stvx_u          $out0,$x00,$out
+        le?vperm       $in6,$in6,$in6,$inpperm
+        vxor           $out0,$in0,$rndkey0
+       le?vperm        $out2,$out2,$out2,$inpperm
+       stvx_u          $out1,$x10,$out
+        le?vperm       $in7,$in7,$in7,$inpperm
+        vxor           $out1,$in1,$rndkey0
+       le?vperm        $out3,$out3,$out3,$inpperm
+       stvx_u          $out2,$x20,$out
+        vxor           $out2,$in2,$rndkey0
+       le?vperm        $out4,$out4,$out4,$inpperm
+       stvx_u          $out3,$x30,$out
+        vxor           $out3,$in3,$rndkey0
+       le?vperm        $out5,$out5,$out5,$inpperm
+       stvx_u          $out4,$x40,$out
+        vxor           $out4,$in4,$rndkey0
+       le?vperm        $out6,$out6,$out6,$inpperm
+       stvx_u          $out5,$x50,$out
+        vxor           $out5,$in5,$rndkey0
+       le?vperm        $out7,$out7,$out7,$inpperm
+       stvx_u          $out6,$x60,$out
+        vxor           $out6,$in6,$rndkey0
+       stvx_u          $out7,$x70,$out
+       addi            $out,$out,0x80
+        vxor           $out7,$in7,$rndkey0
+
+       mtctr           $rounds
+       beq             Loop_cbc_dec8x          # did $len-=128 borrow?
+
+       addic.          $len,$len,128
+       beq             Lcbc_dec8x_done
+       nop
+       nop
+
+Loop_cbc_dec8x_tail:                           # up to 7 "words" tail...
+       vncipher        $out1,$out1,v24
+       vncipher        $out2,$out2,v24
+       vncipher        $out3,$out3,v24
+       vncipher        $out4,$out4,v24
+       vncipher        $out5,$out5,v24
+       vncipher        $out6,$out6,v24
+       vncipher        $out7,$out7,v24
+       lvx             v24,$x20,$key_          # round[3]
+       addi            $key_,$key_,0x20
+
+       vncipher        $out1,$out1,v25
+       vncipher        $out2,$out2,v25
+       vncipher        $out3,$out3,v25
+       vncipher        $out4,$out4,v25
+       vncipher        $out5,$out5,v25
+       vncipher        $out6,$out6,v25
+       vncipher        $out7,$out7,v25
+       lvx             v25,$x10,$key_          # round[4]
+       bdnz            Loop_cbc_dec8x_tail
+
+       vncipher        $out1,$out1,v24
+       vncipher        $out2,$out2,v24
+       vncipher        $out3,$out3,v24
+       vncipher        $out4,$out4,v24
+       vncipher        $out5,$out5,v24
+       vncipher        $out6,$out6,v24
+       vncipher        $out7,$out7,v24
+
+       vncipher        $out1,$out1,v25
+       vncipher        $out2,$out2,v25
+       vncipher        $out3,$out3,v25
+       vncipher        $out4,$out4,v25
+       vncipher        $out5,$out5,v25
+       vncipher        $out6,$out6,v25
+       vncipher        $out7,$out7,v25
+
+       vncipher        $out1,$out1,v26
+       vncipher        $out2,$out2,v26
+       vncipher        $out3,$out3,v26
+       vncipher        $out4,$out4,v26
+       vncipher        $out5,$out5,v26
+       vncipher        $out6,$out6,v26
+       vncipher        $out7,$out7,v26
+
+       vncipher        $out1,$out1,v27
+       vncipher        $out2,$out2,v27
+       vncipher        $out3,$out3,v27
+       vncipher        $out4,$out4,v27
+       vncipher        $out5,$out5,v27
+       vncipher        $out6,$out6,v27
+       vncipher        $out7,$out7,v27
+
+       vncipher        $out1,$out1,v28
+       vncipher        $out2,$out2,v28
+       vncipher        $out3,$out3,v28
+       vncipher        $out4,$out4,v28
+       vncipher        $out5,$out5,v28
+       vncipher        $out6,$out6,v28
+       vncipher        $out7,$out7,v28
+
+       vncipher        $out1,$out1,v29
+       vncipher        $out2,$out2,v29
+       vncipher        $out3,$out3,v29
+       vncipher        $out4,$out4,v29
+       vncipher        $out5,$out5,v29
+       vncipher        $out6,$out6,v29
+       vncipher        $out7,$out7,v29
+
+       vncipher        $out1,$out1,v30
+        vxor           $ivec,$ivec,v31         # last round key
+       vncipher        $out2,$out2,v30
+        vxor           $in1,$in1,v31
+       vncipher        $out3,$out3,v30
+        vxor           $in2,$in2,v31
+       vncipher        $out4,$out4,v30
+        vxor           $in3,$in3,v31
+       vncipher        $out5,$out5,v30
+        vxor           $in4,$in4,v31
+       vncipher        $out6,$out6,v30
+        vxor           $in5,$in5,v31
+       vncipher        $out7,$out7,v30
+        vxor           $in6,$in6,v31
+
+       cmplwi          $len,32                 # switch($len)
+       blt             Lcbc_dec8x_one
+       nop
+       beq             Lcbc_dec8x_two
+       cmplwi          $len,64
+       blt             Lcbc_dec8x_three
+       nop
+       beq             Lcbc_dec8x_four
+       cmplwi          $len,96
+       blt             Lcbc_dec8x_five
+       nop
+       beq             Lcbc_dec8x_six
+
+Lcbc_dec8x_seven:
+       vncipherlast    $out1,$out1,$ivec
+       vncipherlast    $out2,$out2,$in1
+       vncipherlast    $out3,$out3,$in2
+       vncipherlast    $out4,$out4,$in3
+       vncipherlast    $out5,$out5,$in4
+       vncipherlast    $out6,$out6,$in5
+       vncipherlast    $out7,$out7,$in6
+       vmr             $ivec,$in7
+
+       le?vperm        $out1,$out1,$out1,$inpperm
+       le?vperm        $out2,$out2,$out2,$inpperm
+       stvx_u          $out1,$x00,$out
+       le?vperm        $out3,$out3,$out3,$inpperm
+       stvx_u          $out2,$x10,$out
+       le?vperm        $out4,$out4,$out4,$inpperm
+       stvx_u          $out3,$x20,$out
+       le?vperm        $out5,$out5,$out5,$inpperm
+       stvx_u          $out4,$x30,$out
+       le?vperm        $out6,$out6,$out6,$inpperm
+       stvx_u          $out5,$x40,$out
+       le?vperm        $out7,$out7,$out7,$inpperm
+       stvx_u          $out6,$x50,$out
+       stvx_u          $out7,$x60,$out
+       addi            $out,$out,0x70
+       b               Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_six:
+       vncipherlast    $out2,$out2,$ivec
+       vncipherlast    $out3,$out3,$in2
+       vncipherlast    $out4,$out4,$in3
+       vncipherlast    $out5,$out5,$in4
+       vncipherlast    $out6,$out6,$in5
+       vncipherlast    $out7,$out7,$in6
+       vmr             $ivec,$in7
+
+       le?vperm        $out2,$out2,$out2,$inpperm
+       le?vperm        $out3,$out3,$out3,$inpperm
+       stvx_u          $out2,$x00,$out
+       le?vperm        $out4,$out4,$out4,$inpperm
+       stvx_u          $out3,$x10,$out
+       le?vperm        $out5,$out5,$out5,$inpperm
+       stvx_u          $out4,$x20,$out
+       le?vperm        $out6,$out6,$out6,$inpperm
+       stvx_u          $out5,$x30,$out
+       le?vperm        $out7,$out7,$out7,$inpperm
+       stvx_u          $out6,$x40,$out
+       stvx_u          $out7,$x50,$out
+       addi            $out,$out,0x60
+       b               Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_five:
+       vncipherlast    $out3,$out3,$ivec
+       vncipherlast    $out4,$out4,$in3
+       vncipherlast    $out5,$out5,$in4
+       vncipherlast    $out6,$out6,$in5
+       vncipherlast    $out7,$out7,$in6
+       vmr             $ivec,$in7
+
+       le?vperm        $out3,$out3,$out3,$inpperm
+       le?vperm        $out4,$out4,$out4,$inpperm
+       stvx_u          $out3,$x00,$out
+       le?vperm        $out5,$out5,$out5,$inpperm
+       stvx_u          $out4,$x10,$out
+       le?vperm        $out6,$out6,$out6,$inpperm
+       stvx_u          $out5,$x20,$out
+       le?vperm        $out7,$out7,$out7,$inpperm
+       stvx_u          $out6,$x30,$out
+       stvx_u          $out7,$x40,$out
+       addi            $out,$out,0x50
+       b               Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_four:
+       vncipherlast    $out4,$out4,$ivec
+       vncipherlast    $out5,$out5,$in4
+       vncipherlast    $out6,$out6,$in5
+       vncipherlast    $out7,$out7,$in6
+       vmr             $ivec,$in7
+
+       le?vperm        $out4,$out4,$out4,$inpperm
+       le?vperm        $out5,$out5,$out5,$inpperm
+       stvx_u          $out4,$x00,$out
+       le?vperm        $out6,$out6,$out6,$inpperm
+       stvx_u          $out5,$x10,$out
+       le?vperm        $out7,$out7,$out7,$inpperm
+       stvx_u          $out6,$x20,$out
+       stvx_u          $out7,$x30,$out
+       addi            $out,$out,0x40
+       b               Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_three:
+       vncipherlast    $out5,$out5,$ivec
+       vncipherlast    $out6,$out6,$in5
+       vncipherlast    $out7,$out7,$in6
+       vmr             $ivec,$in7
+
+       le?vperm        $out5,$out5,$out5,$inpperm
+       le?vperm        $out6,$out6,$out6,$inpperm
+       stvx_u          $out5,$x00,$out
+       le?vperm        $out7,$out7,$out7,$inpperm
+       stvx_u          $out6,$x10,$out
+       stvx_u          $out7,$x20,$out
+       addi            $out,$out,0x30
+       b               Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_two:
+       vncipherlast    $out6,$out6,$ivec
+       vncipherlast    $out7,$out7,$in6
+       vmr             $ivec,$in7
+
+       le?vperm        $out6,$out6,$out6,$inpperm
+       le?vperm        $out7,$out7,$out7,$inpperm
+       stvx_u          $out6,$x00,$out
+       stvx_u          $out7,$x10,$out
+       addi            $out,$out,0x20
+       b               Lcbc_dec8x_done
+
+.align 5
+Lcbc_dec8x_one:
+       vncipherlast    $out7,$out7,$ivec
+       vmr             $ivec,$in7
+
+       le?vperm        $out7,$out7,$out7,$inpperm
+       stvx_u          $out7,0,$out
+       addi            $out,$out,0x10
+
+Lcbc_dec8x_done:
+       le?vperm        $ivec,$ivec,$ivec,$inpperm
+       stvx_u          $ivec,0,$ivp            # write [unaligned] iv
+
+       li              r10,`$FRAME+15`
+       li              r11,`$FRAME+31`
+       stvx            $inpperm,r10,$sp        # wipe copies of round keys
+       addi            r10,r10,32
+       stvx            $inpperm,r11,$sp
+       addi            r11,r11,32
+       stvx            $inpperm,r10,$sp
+       addi            r10,r10,32
+       stvx            $inpperm,r11,$sp
+       addi            r11,r11,32
+       stvx            $inpperm,r10,$sp
+       addi            r10,r10,32
+       stvx            $inpperm,r11,$sp
+       addi            r11,r11,32
+       stvx            $inpperm,r10,$sp
+       addi            r10,r10,32
+       stvx            $inpperm,r11,$sp
+       addi            r11,r11,32
+
+       mtspr           256,$vrsave
+       lvx             v20,r10,$sp             # ABI says so
+       addi            r10,r10,32
+       lvx             v21,r11,$sp
+       addi            r11,r11,32
+       lvx             v22,r10,$sp
+       addi            r10,r10,32
+       lvx             v23,r11,$sp
+       addi            r11,r11,32
+       lvx             v24,r10,$sp
+       addi            r10,r10,32
+       lvx             v25,r11,$sp
+       addi            r11,r11,32
+       lvx             v26,r10,$sp
+       addi            r10,r10,32
+       lvx             v27,r11,$sp
+       addi            r11,r11,32
+       lvx             v28,r10,$sp
+       addi            r10,r10,32
+       lvx             v29,r11,$sp
+       addi            r11,r11,32
+       lvx             v30,r10,$sp
+       lvx             v31,r11,$sp
+       $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+       $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+       $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+       $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+       $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+       $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+       addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+       blr
+       .long           0
+       .byte           0,12,0x14,0,0x80,6,6,0
+       .long           0
+.size  .${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
+___
+}}     }}}
+
+#########################################################################
+{{{    # CTR procedure[s]                                              #
+
+####################### WARNING: Here be dragons! #######################
+#
+# This code is written as 'ctr32', based on a 32-bit counter used
+# upstream. The kernel does *not* use a 32-bit counter. The kernel uses
+# a 128-bit counter.
+#
+# This leads to subtle changes from the upstream code: the counter
+# is incremented with vadduqm rather than vadduwm. This occurs in
+# both the bulk (8 blocks at a time) path and the individual block
+# path. Be aware of this when doing updates.
+#
+# See:
+# 1d4aa0b4c181 ("crypto: vmx - Fixing AES-CTR counter bug")
+# 009b30ac7444 ("crypto: vmx - CTR: always increment IV as quadword")
+# https://github.com/openssl/openssl/pull/8942
+#
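+# As a concrete illustration (example values), incrementing an IV whose
+# low 32-bit word is about to wrap:
+#
+#   vadduwm (32-bit word adds): ...00000001_ffffffff -> ...00000001_00000000
+#   vadduqm (one 128-bit add):  ...00000001_ffffffff -> ...00000002_00000000
+#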
+#########################################################################
+my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
+my ($rndkey0,$rndkey1,$inout,$tmp)=            map("v$_",(0..3));
+my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
+                                               map("v$_",(4..11));
+my $dat=$tmp;
+
+$code.=<<___;
+.globl .${prefix}_ctr32_encrypt_blocks
+       ${UCMP}i        $len,1
+       bltlr-
+
+       lis             r0,0xfff0
+       mfspr           $vrsave,256
+       mtspr           256,r0
+
+       li              $idx,15
+       vxor            $rndkey0,$rndkey0,$rndkey0
+       le?vspltisb     $tmp,0x0f
+
+       lvx             $ivec,0,$ivp            # load [unaligned] iv
+       lvsl            $inpperm,0,$ivp
+       lvx             $inptail,$idx,$ivp
+        vspltisb       $one,1
+       le?vxor         $inpperm,$inpperm,$tmp
+       vperm           $ivec,$ivec,$inptail,$inpperm
+        vsldoi         $one,$rndkey0,$one,1
+
+       neg             r11,$inp
+       ?lvsl           $keyperm,0,$key         # prepare for unaligned key
+       lwz             $rounds,240($key)
+
+       lvsr            $inpperm,0,r11          # prepare for unaligned load
+       lvx             $inptail,0,$inp
+       addi            $inp,$inp,15            # 15 is not typo
+       le?vxor         $inpperm,$inpperm,$tmp
+
+       srwi            $rounds,$rounds,1
+       li              $idx,16
+       subi            $rounds,$rounds,1
+
+       ${UCMP}i        $len,8
+       bge             _aesp8_ctr32_encrypt8x
+
+       ?lvsr           $outperm,0,$out         # prepare for unaligned store
+       vspltisb        $outmask,-1
+       lvx             $outhead,0,$out
+       ?vperm          $outmask,$rndkey0,$outmask,$outperm
+       le?vxor         $outperm,$outperm,$tmp
+
+       lvx             $rndkey0,0,$key
+       mtctr           $rounds
+       lvx             $rndkey1,$idx,$key
+       addi            $idx,$idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vxor            $inout,$ivec,$rndkey0
+       lvx             $rndkey0,$idx,$key
+       addi            $idx,$idx,16
+       b               Loop_ctr32_enc
+
+.align 5
+Loop_ctr32_enc:
+       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
+       vcipher         $inout,$inout,$rndkey1
+       lvx             $rndkey1,$idx,$key
+       addi            $idx,$idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vcipher         $inout,$inout,$rndkey0
+       lvx             $rndkey0,$idx,$key
+       addi            $idx,$idx,16
+       bdnz            Loop_ctr32_enc
+
+       vadduqm         $ivec,$ivec,$one        # Kernel change for 128-bit
+        vmr            $dat,$inptail
+        lvx            $inptail,0,$inp
+        addi           $inp,$inp,16
+        subic.         $len,$len,1             # blocks--
+
+       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
+       vcipher         $inout,$inout,$rndkey1
+       lvx             $rndkey1,$idx,$key
+        vperm          $dat,$dat,$inptail,$inpperm
+        li             $idx,16
+       ?vperm          $rndkey1,$rndkey0,$rndkey1,$keyperm
+        lvx            $rndkey0,0,$key
+       vxor            $dat,$dat,$rndkey1      # last round key
+       vcipherlast     $inout,$inout,$dat
+
+        lvx            $rndkey1,$idx,$key
+        addi           $idx,$idx,16
+       vperm           $inout,$inout,$inout,$outperm
+       vsel            $dat,$outhead,$inout,$outmask
+        mtctr          $rounds
+        ?vperm         $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vmr             $outhead,$inout
+        vxor           $inout,$ivec,$rndkey0
+        lvx            $rndkey0,$idx,$key
+        addi           $idx,$idx,16
+       stvx            $dat,0,$out
+       addi            $out,$out,16
+       bne             Loop_ctr32_enc
+
+       addi            $out,$out,-1
+       lvx             $inout,0,$out           # redundant in aligned case
+       vsel            $inout,$outhead,$inout,$outmask
+       stvx            $inout,0,$out
+
+       mtspr           256,$vrsave
+       blr
+       .long           0
+       .byte           0,12,0x14,0,0,0,6,0
+       .long           0
+___
+#########################################################################
+{{     # Optimized CTR procedure                                       #
+my $key_="r11";
+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
+my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
+my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
+my $rndkey0="v23";     # v24-v25 rotating buffer for first found keys
+                       # v26-v31 last 6 round keys
+my ($tmp,$keyperm)=($in3,$in4);        # aliases with "caller", redundant assignment
+my ($two,$three,$four)=($outhead,$outperm,$outmask);
+
+$code.=<<___;
+.align 5
+_aesp8_ctr32_encrypt8x:
+       $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+       li              r10,`$FRAME+8*16+15`
+       li              r11,`$FRAME+8*16+31`
+       stvx            v20,r10,$sp             # ABI says so
+       addi            r10,r10,32
+       stvx            v21,r11,$sp
+       addi            r11,r11,32
+       stvx            v22,r10,$sp
+       addi            r10,r10,32
+       stvx            v23,r11,$sp
+       addi            r11,r11,32
+       stvx            v24,r10,$sp
+       addi            r10,r10,32
+       stvx            v25,r11,$sp
+       addi            r11,r11,32
+       stvx            v26,r10,$sp
+       addi            r10,r10,32
+       stvx            v27,r11,$sp
+       addi            r11,r11,32
+       stvx            v28,r10,$sp
+       addi            r10,r10,32
+       stvx            v29,r11,$sp
+       addi            r11,r11,32
+       stvx            v30,r10,$sp
+       stvx            v31,r11,$sp
+       li              r0,-1
+       stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
+       li              $x10,0x10
+       $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+       li              $x20,0x20
+       $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+       li              $x30,0x30
+       $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+       li              $x40,0x40
+       $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+       li              $x50,0x50
+       $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+       li              $x60,0x60
+       $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+       li              $x70,0x70
+       mtspr           256,r0
+
+       subi            $rounds,$rounds,3       # -4 in total
+
+       lvx             $rndkey0,$x00,$key      # load key schedule
+       lvx             v30,$x10,$key
+       addi            $key,$key,0x20
+       lvx             v31,$x00,$key
+       ?vperm          $rndkey0,$rndkey0,v30,$keyperm
+       addi            $key_,$sp,$FRAME+15
+       mtctr           $rounds
+
+Load_ctr32_enc_key:
+       ?vperm          v24,v30,v31,$keyperm
+       lvx             v30,$x10,$key
+       addi            $key,$key,0x20
+       stvx            v24,$x00,$key_          # off-load round[1]
+       ?vperm          v25,v31,v30,$keyperm
+       lvx             v31,$x00,$key
+       stvx            v25,$x10,$key_          # off-load round[2]
+       addi            $key_,$key_,0x20
+       bdnz            Load_ctr32_enc_key
+
+       lvx             v26,$x10,$key
+       ?vperm          v24,v30,v31,$keyperm
+       lvx             v27,$x20,$key
+       stvx            v24,$x00,$key_          # off-load round[3]
+       ?vperm          v25,v31,v26,$keyperm
+       lvx             v28,$x30,$key
+       stvx            v25,$x10,$key_          # off-load round[4]
+       addi            $key_,$sp,$FRAME+15     # rewind $key_
+       ?vperm          v26,v26,v27,$keyperm
+       lvx             v29,$x40,$key
+       ?vperm          v27,v27,v28,$keyperm
+       lvx             v30,$x50,$key
+       ?vperm          v28,v28,v29,$keyperm
+       lvx             v31,$x60,$key
+       ?vperm          v29,v29,v30,$keyperm
+       lvx             $out0,$x70,$key         # borrow $out0
+       ?vperm          v30,v30,v31,$keyperm
+       lvx             v24,$x00,$key_          # pre-load round[1]
+       ?vperm          v31,v31,$out0,$keyperm
+       lvx             v25,$x10,$key_          # pre-load round[2]
+
+       vadduqm         $two,$one,$one
+       subi            $inp,$inp,15            # undo "caller"
+       $SHL            $len,$len,4
+
+       vadduqm         $out1,$ivec,$one        # counter values ...
+       vadduqm         $out2,$ivec,$two        # (do all ctr adds as 128-bit)
+       vxor            $out0,$ivec,$rndkey0    # ... xored with rndkey[0]
+        le?li          $idx,8
+       vadduqm         $out3,$out1,$two
+       vxor            $out1,$out1,$rndkey0
+        le?lvsl        $inpperm,0,$idx
+       vadduqm         $out4,$out2,$two
+       vxor            $out2,$out2,$rndkey0
+        le?vspltisb    $tmp,0x0f
+       vadduqm         $out5,$out3,$two
+       vxor            $out3,$out3,$rndkey0
+        le?vxor        $inpperm,$inpperm,$tmp  # transform for lvx_u/stvx_u
+       vadduqm         $out6,$out4,$two
+       vxor            $out4,$out4,$rndkey0
+       vadduqm         $out7,$out5,$two
+       vxor            $out5,$out5,$rndkey0
+       vadduqm         $ivec,$out6,$two        # next counter value
+       vxor            $out6,$out6,$rndkey0
+       vxor            $out7,$out7,$rndkey0
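+                                               # $out0-$out7 now hold the
+                                               # counters ivec+0..ivec+7,
+                                               # each pre-xored with round
+                                               # key 0; $ivec holds ivec+8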
+
+       mtctr           $rounds
+       b               Loop_ctr32_enc8x
+.align 5
+Loop_ctr32_enc8x:
+       vcipher         $out0,$out0,v24
+       vcipher         $out1,$out1,v24
+       vcipher         $out2,$out2,v24
+       vcipher         $out3,$out3,v24
+       vcipher         $out4,$out4,v24
+       vcipher         $out5,$out5,v24
+       vcipher         $out6,$out6,v24
+       vcipher         $out7,$out7,v24
+Loop_ctr32_enc8x_middle:
+       lvx             v24,$x20,$key_          # round[3]
+       addi            $key_,$key_,0x20
+
+       vcipher         $out0,$out0,v25
+       vcipher         $out1,$out1,v25
+       vcipher         $out2,$out2,v25
+       vcipher         $out3,$out3,v25
+       vcipher         $out4,$out4,v25
+       vcipher         $out5,$out5,v25
+       vcipher         $out6,$out6,v25
+       vcipher         $out7,$out7,v25
+       lvx             v25,$x10,$key_          # round[4]
+       bdnz            Loop_ctr32_enc8x
+
+       subic           r11,$len,256            # $len-256, borrow $key_
+       vcipher         $out0,$out0,v24
+       vcipher         $out1,$out1,v24
+       vcipher         $out2,$out2,v24
+       vcipher         $out3,$out3,v24
+       vcipher         $out4,$out4,v24
+       vcipher         $out5,$out5,v24
+       vcipher         $out6,$out6,v24
+       vcipher         $out7,$out7,v24
+
+       subfe           r0,r0,r0                # borrow?-1:0
+       vcipher         $out0,$out0,v25
+       vcipher         $out1,$out1,v25
+       vcipher         $out2,$out2,v25
+       vcipher         $out3,$out3,v25
+       vcipher         $out4,$out4,v25
+       vcipher         $out5,$out5,v25
+       vcipher         $out6,$out6,v25
+       vcipher         $out7,$out7,v25
+
+       and             r0,r0,r11
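+                                               # r0 = ($len<256) ? $len-256 : 0
+                                               # is added to $inp below so the
+                                               # next pass's eight loads end
+                                               # exactly at the input's end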
+       addi            $key_,$sp,$FRAME+15     # rewind $key_
+       vcipher         $out0,$out0,v26
+       vcipher         $out1,$out1,v26
+       vcipher         $out2,$out2,v26
+       vcipher         $out3,$out3,v26
+       vcipher         $out4,$out4,v26
+       vcipher         $out5,$out5,v26
+       vcipher         $out6,$out6,v26
+       vcipher         $out7,$out7,v26
+       lvx             v24,$x00,$key_          # re-pre-load round[1]
+
+       subic           $len,$len,129           # $len-=129
+       vcipher         $out0,$out0,v27
+       addi            $len,$len,1             # $len-=128 really
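+                                               # the subtraction is split so
+                                               # that CA is clear iff fewer
+                                               # than 129 bytes remain; the
+                                               # subfe. below tests that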
+       vcipher         $out1,$out1,v27
+       vcipher         $out2,$out2,v27
+       vcipher         $out3,$out3,v27
+       vcipher         $out4,$out4,v27
+       vcipher         $out5,$out5,v27
+       vcipher         $out6,$out6,v27
+       vcipher         $out7,$out7,v27
+       lvx             v25,$x10,$key_          # re-pre-load round[2]
+
+       vcipher         $out0,$out0,v28
+        lvx_u          $in0,$x00,$inp          # load input
+       vcipher         $out1,$out1,v28
+        lvx_u          $in1,$x10,$inp
+       vcipher         $out2,$out2,v28
+        lvx_u          $in2,$x20,$inp
+       vcipher         $out3,$out3,v28
+        lvx_u          $in3,$x30,$inp
+       vcipher         $out4,$out4,v28
+        lvx_u          $in4,$x40,$inp
+       vcipher         $out5,$out5,v28
+        lvx_u          $in5,$x50,$inp
+       vcipher         $out6,$out6,v28
+        lvx_u          $in6,$x60,$inp
+       vcipher         $out7,$out7,v28
+        lvx_u          $in7,$x70,$inp
+        addi           $inp,$inp,0x80
+
+       vcipher         $out0,$out0,v29
+        le?vperm       $in0,$in0,$in0,$inpperm
+       vcipher         $out1,$out1,v29
+        le?vperm       $in1,$in1,$in1,$inpperm
+       vcipher         $out2,$out2,v29
+        le?vperm       $in2,$in2,$in2,$inpperm
+       vcipher         $out3,$out3,v29
+        le?vperm       $in3,$in3,$in3,$inpperm
+       vcipher         $out4,$out4,v29
+        le?vperm       $in4,$in4,$in4,$inpperm
+       vcipher         $out5,$out5,v29
+        le?vperm       $in5,$in5,$in5,$inpperm
+       vcipher         $out6,$out6,v29
+        le?vperm       $in6,$in6,$in6,$inpperm
+       vcipher         $out7,$out7,v29
+        le?vperm       $in7,$in7,$in7,$inpperm
+
+       add             $inp,$inp,r0            # $inp is adjusted in such a
+                                               # way that at exit from the
+                                               # loop inX-in7 are loaded
+                                               # with the last "words"
+       subfe.          r0,r0,r0                # borrow?-1:0
+       vcipher         $out0,$out0,v30
+        vxor           $in0,$in0,v31           # xor with last round key
+       vcipher         $out1,$out1,v30
+        vxor           $in1,$in1,v31
+       vcipher         $out2,$out2,v30
+        vxor           $in2,$in2,v31
+       vcipher         $out3,$out3,v30
+        vxor           $in3,$in3,v31
+       vcipher         $out4,$out4,v30
+        vxor           $in4,$in4,v31
+       vcipher         $out5,$out5,v30
+        vxor           $in5,$in5,v31
+       vcipher         $out6,$out6,v30
+        vxor           $in6,$in6,v31
+       vcipher         $out7,$out7,v30
+        vxor           $in7,$in7,v31
+
+       bne             Lctr32_enc8x_break      # did $len-129 borrow?
+
+       vcipherlast     $in0,$out0,$in0
+       vcipherlast     $in1,$out1,$in1
+        vadduqm        $out1,$ivec,$one        # counter values ...
+       vcipherlast     $in2,$out2,$in2
+        vadduqm        $out2,$ivec,$two
+        vxor           $out0,$ivec,$rndkey0    # ... xored with rndkey[0]
+       vcipherlast     $in3,$out3,$in3
+        vadduqm        $out3,$out1,$two
+        vxor           $out1,$out1,$rndkey0
+       vcipherlast     $in4,$out4,$in4
+        vadduqm        $out4,$out2,$two
+        vxor           $out2,$out2,$rndkey0
+       vcipherlast     $in5,$out5,$in5
+        vadduqm        $out5,$out3,$two
+        vxor           $out3,$out3,$rndkey0
+       vcipherlast     $in6,$out6,$in6
+        vadduqm        $out6,$out4,$two
+        vxor           $out4,$out4,$rndkey0
+       vcipherlast     $in7,$out7,$in7
+        vadduqm        $out7,$out5,$two
+        vxor           $out5,$out5,$rndkey0
+       le?vperm        $in0,$in0,$in0,$inpperm
+        vadduqm        $ivec,$out6,$two        # next counter value
+        vxor           $out6,$out6,$rndkey0
+       le?vperm        $in1,$in1,$in1,$inpperm
+        vxor           $out7,$out7,$rndkey0
+       mtctr           $rounds
+
+        vcipher        $out0,$out0,v24
+       stvx_u          $in0,$x00,$out
+       le?vperm        $in2,$in2,$in2,$inpperm
+        vcipher        $out1,$out1,v24
+       stvx_u          $in1,$x10,$out
+       le?vperm        $in3,$in3,$in3,$inpperm
+        vcipher        $out2,$out2,v24
+       stvx_u          $in2,$x20,$out
+       le?vperm        $in4,$in4,$in4,$inpperm
+        vcipher        $out3,$out3,v24
+       stvx_u          $in3,$x30,$out
+       le?vperm        $in5,$in5,$in5,$inpperm
+        vcipher        $out4,$out4,v24
+       stvx_u          $in4,$x40,$out
+       le?vperm        $in6,$in6,$in6,$inpperm
+        vcipher        $out5,$out5,v24
+       stvx_u          $in5,$x50,$out
+       le?vperm        $in7,$in7,$in7,$inpperm
+        vcipher        $out6,$out6,v24
+       stvx_u          $in6,$x60,$out
+        vcipher        $out7,$out7,v24
+       stvx_u          $in7,$x70,$out
+       addi            $out,$out,0x80
+
+       b               Loop_ctr32_enc8x_middle
+
+.align 5
+Lctr32_enc8x_break:
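+       # here $len = 16*(blocks remaining) - 128, one of -0x70,-0x60,...,0;
+       # dispatch on how many of the eight results are real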
+       cmpwi           $len,-0x60
+       blt             Lctr32_enc8x_one
+       nop
+       beq             Lctr32_enc8x_two
+       cmpwi           $len,-0x40
+       blt             Lctr32_enc8x_three
+       nop
+       beq             Lctr32_enc8x_four
+       cmpwi           $len,-0x20
+       blt             Lctr32_enc8x_five
+       nop
+       beq             Lctr32_enc8x_six
+       cmpwi           $len,0x00
+       blt             Lctr32_enc8x_seven
+
+Lctr32_enc8x_eight:
+       vcipherlast     $out0,$out0,$in0
+       vcipherlast     $out1,$out1,$in1
+       vcipherlast     $out2,$out2,$in2
+       vcipherlast     $out3,$out3,$in3
+       vcipherlast     $out4,$out4,$in4
+       vcipherlast     $out5,$out5,$in5
+       vcipherlast     $out6,$out6,$in6
+       vcipherlast     $out7,$out7,$in7
+
+       le?vperm        $out0,$out0,$out0,$inpperm
+       le?vperm        $out1,$out1,$out1,$inpperm
+       stvx_u          $out0,$x00,$out
+       le?vperm        $out2,$out2,$out2,$inpperm
+       stvx_u          $out1,$x10,$out
+       le?vperm        $out3,$out3,$out3,$inpperm
+       stvx_u          $out2,$x20,$out
+       le?vperm        $out4,$out4,$out4,$inpperm
+       stvx_u          $out3,$x30,$out
+       le?vperm        $out5,$out5,$out5,$inpperm
+       stvx_u          $out4,$x40,$out
+       le?vperm        $out6,$out6,$out6,$inpperm
+       stvx_u          $out5,$x50,$out
+       le?vperm        $out7,$out7,$out7,$inpperm
+       stvx_u          $out6,$x60,$out
+       stvx_u          $out7,$x70,$out
+       addi            $out,$out,0x80
+       b               Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_seven:
+       vcipherlast     $out0,$out0,$in1
+       vcipherlast     $out1,$out1,$in2
+       vcipherlast     $out2,$out2,$in3
+       vcipherlast     $out3,$out3,$in4
+       vcipherlast     $out4,$out4,$in5
+       vcipherlast     $out5,$out5,$in6
+       vcipherlast     $out6,$out6,$in7
+
+       le?vperm        $out0,$out0,$out0,$inpperm
+       le?vperm        $out1,$out1,$out1,$inpperm
+       stvx_u          $out0,$x00,$out
+       le?vperm        $out2,$out2,$out2,$inpperm
+       stvx_u          $out1,$x10,$out
+       le?vperm        $out3,$out3,$out3,$inpperm
+       stvx_u          $out2,$x20,$out
+       le?vperm        $out4,$out4,$out4,$inpperm
+       stvx_u          $out3,$x30,$out
+       le?vperm        $out5,$out5,$out5,$inpperm
+       stvx_u          $out4,$x40,$out
+       le?vperm        $out6,$out6,$out6,$inpperm
+       stvx_u          $out5,$x50,$out
+       stvx_u          $out6,$x60,$out
+       addi            $out,$out,0x70
+       b               Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_six:
+       vcipherlast     $out0,$out0,$in2
+       vcipherlast     $out1,$out1,$in3
+       vcipherlast     $out2,$out2,$in4
+       vcipherlast     $out3,$out3,$in5
+       vcipherlast     $out4,$out4,$in6
+       vcipherlast     $out5,$out5,$in7
+
+       le?vperm        $out0,$out0,$out0,$inpperm
+       le?vperm        $out1,$out1,$out1,$inpperm
+       stvx_u          $out0,$x00,$out
+       le?vperm        $out2,$out2,$out2,$inpperm
+       stvx_u          $out1,$x10,$out
+       le?vperm        $out3,$out3,$out3,$inpperm
+       stvx_u          $out2,$x20,$out
+       le?vperm        $out4,$out4,$out4,$inpperm
+       stvx_u          $out3,$x30,$out
+       le?vperm        $out5,$out5,$out5,$inpperm
+       stvx_u          $out4,$x40,$out
+       stvx_u          $out5,$x50,$out
+       addi            $out,$out,0x60
+       b               Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_five:
+       vcipherlast     $out0,$out0,$in3
+       vcipherlast     $out1,$out1,$in4
+       vcipherlast     $out2,$out2,$in5
+       vcipherlast     $out3,$out3,$in6
+       vcipherlast     $out4,$out4,$in7
+
+       le?vperm        $out0,$out0,$out0,$inpperm
+       le?vperm        $out1,$out1,$out1,$inpperm
+       stvx_u          $out0,$x00,$out
+       le?vperm        $out2,$out2,$out2,$inpperm
+       stvx_u          $out1,$x10,$out
+       le?vperm        $out3,$out3,$out3,$inpperm
+       stvx_u          $out2,$x20,$out
+       le?vperm        $out4,$out4,$out4,$inpperm
+       stvx_u          $out3,$x30,$out
+       stvx_u          $out4,$x40,$out
+       addi            $out,$out,0x50
+       b               Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_four:
+       vcipherlast     $out0,$out0,$in4
+       vcipherlast     $out1,$out1,$in5
+       vcipherlast     $out2,$out2,$in6
+       vcipherlast     $out3,$out3,$in7
+
+       le?vperm        $out0,$out0,$out0,$inpperm
+       le?vperm        $out1,$out1,$out1,$inpperm
+       stvx_u          $out0,$x00,$out
+       le?vperm        $out2,$out2,$out2,$inpperm
+       stvx_u          $out1,$x10,$out
+       le?vperm        $out3,$out3,$out3,$inpperm
+       stvx_u          $out2,$x20,$out
+       stvx_u          $out3,$x30,$out
+       addi            $out,$out,0x40
+       b               Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_three:
+       vcipherlast     $out0,$out0,$in5
+       vcipherlast     $out1,$out1,$in6
+       vcipherlast     $out2,$out2,$in7
+
+       le?vperm        $out0,$out0,$out0,$inpperm
+       le?vperm        $out1,$out1,$out1,$inpperm
+       stvx_u          $out0,$x00,$out
+       le?vperm        $out2,$out2,$out2,$inpperm
+       stvx_u          $out1,$x10,$out
+       stvx_u          $out2,$x20,$out
+       addi            $out,$out,0x30
+       b               Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_two:
+       vcipherlast     $out0,$out0,$in6
+       vcipherlast     $out1,$out1,$in7
+
+       le?vperm        $out0,$out0,$out0,$inpperm
+       le?vperm        $out1,$out1,$out1,$inpperm
+       stvx_u          $out0,$x00,$out
+       stvx_u          $out1,$x10,$out
+       addi            $out,$out,0x20
+       b               Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_one:
+       vcipherlast     $out0,$out0,$in7
+
+       le?vperm        $out0,$out0,$out0,$inpperm
+       stvx_u          $out0,0,$out
+       addi            $out,$out,0x10
+
+Lctr32_enc8x_done:
+       li              r10,`$FRAME+15`
+       li              r11,`$FRAME+31`
+       stvx            $inpperm,r10,$sp        # wipe copies of round keys
+       addi            r10,r10,32
+       stvx            $inpperm,r11,$sp
+       addi            r11,r11,32
+       stvx            $inpperm,r10,$sp
+       addi            r10,r10,32
+       stvx            $inpperm,r11,$sp
+       addi            r11,r11,32
+       stvx            $inpperm,r10,$sp
+       addi            r10,r10,32
+       stvx            $inpperm,r11,$sp
+       addi            r11,r11,32
+       stvx            $inpperm,r10,$sp
+       addi            r10,r10,32
+       stvx            $inpperm,r11,$sp
+       addi            r11,r11,32
+
+       mtspr           256,$vrsave
+       lvx             v20,r10,$sp             # ABI says so
+       addi            r10,r10,32
+       lvx             v21,r11,$sp
+       addi            r11,r11,32
+       lvx             v22,r10,$sp
+       addi            r10,r10,32
+       lvx             v23,r11,$sp
+       addi            r11,r11,32
+       lvx             v24,r10,$sp
+       addi            r10,r10,32
+       lvx             v25,r11,$sp
+       addi            r11,r11,32
+       lvx             v26,r10,$sp
+       addi            r10,r10,32
+       lvx             v27,r11,$sp
+       addi            r11,r11,32
+       lvx             v28,r10,$sp
+       addi            r10,r10,32
+       lvx             v29,r11,$sp
+       addi            r11,r11,32
+       lvx             v30,r10,$sp
+       lvx             v31,r11,$sp
+       $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+       $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+       $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+       $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+       $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+       $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+       addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+       blr
+       .long           0
+       .byte           0,12,0x14,0,0x80,6,6,0
+       .long           0
+.size  .${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
+___
+}}     }}}
+
+#########################################################################
+{{{    # XTS procedures                                                #
+# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len,  #
+#                             const AES_KEY *key1, const AES_KEY *key2,        #
+#                             [const] unsigned char iv[16]);           #
+# If $key2 is NULL, then a "tweak chaining" mode is engaged, in which  #
+# the input tweak value is assumed to be encrypted already, and the    #
+# last tweak value, suitable for a consecutive call on the same chunk  #
+# of data, is written back to the original buffer. In addition, in     #
+# "tweak chaining" mode only complete input blocks are processed.      #
+
+my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) =    map("r$_",(3..10));
+my ($rndkey0,$rndkey1,$inout) =                                map("v$_",(0..2));
+my ($output,$inptail,$inpperm,$leperm,$keyperm) =      map("v$_",(3..7));
+my ($tweak,$seven,$eighty7,$tmp,$tweak1) =             map("v$_",(8..12));
+my $taillen = $key2;
+
+   ($inp,$idx) = ($idx,$inp);                          # reassign
+
+$code.=<<___;
+.globl .${prefix}_xts_encrypt
+       mr              $inp,r3                         # reassign
+       li              r3,-1
+       ${UCMP}i        $len,16
+       bltlr-
+
+       lis             r0,0xfff0
+       mfspr           r12,256                         # save vrsave
+       li              r11,0
+       mtspr           256,r0
+
+       vspltisb        $seven,0x07                     # 0x070707..07
+       le?lvsl         $leperm,r11,r11
+       le?vspltisb     $tmp,0x0f
+       le?vxor         $leperm,$leperm,$seven
+
+       li              $idx,15
+       lvx             $tweak,0,$ivp                   # load [unaligned] iv
+       lvsl            $inpperm,0,$ivp
+       lvx             $inptail,$idx,$ivp
+       le?vxor         $inpperm,$inpperm,$tmp
+       vperm           $tweak,$tweak,$inptail,$inpperm
+
+       neg             r11,$inp
+       lvsr            $inpperm,0,r11                  # prepare for unaligned load
+       lvx             $inout,0,$inp
+       addi            $inp,$inp,15                    # 15 is not a typo
+       le?vxor         $inpperm,$inpperm,$tmp
+
+       ${UCMP}i        $key2,0                         # key2==NULL?
+       beq             Lxts_enc_no_key2
+
+       ?lvsl           $keyperm,0,$key2                # prepare for unaligned key
+       lwz             $rounds,240($key2)
+       srwi            $rounds,$rounds,1
+       subi            $rounds,$rounds,1
+       li              $idx,16
+
+       lvx             $rndkey0,0,$key2
+       lvx             $rndkey1,$idx,$key2
+       addi            $idx,$idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vxor            $tweak,$tweak,$rndkey0
+       lvx             $rndkey0,$idx,$key2
+       addi            $idx,$idx,16
+       mtctr           $rounds
+
+Ltweak_xts_enc:
+       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
+       vcipher         $tweak,$tweak,$rndkey1
+       lvx             $rndkey1,$idx,$key2
+       addi            $idx,$idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vcipher         $tweak,$tweak,$rndkey0
+       lvx             $rndkey0,$idx,$key2
+       addi            $idx,$idx,16
+       bdnz            Ltweak_xts_enc
+
+       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
+       vcipher         $tweak,$tweak,$rndkey1
+       lvx             $rndkey1,$idx,$key2
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vcipherlast     $tweak,$tweak,$rndkey0
+
+       li              $ivp,0                          # don't chain the tweak
+       b               Lxts_enc
+
+Lxts_enc_no_key2:
+       li              $idx,-16
+       and             $len,$len,$idx                  # in "tweak chaining"
+                                                       # mode only complete
+                                                       # blocks are processed
+Lxts_enc:
+       lvx             $inptail,0,$inp
+       addi            $inp,$inp,16
+
+       ?lvsl           $keyperm,0,$key1                # prepare for unaligned key
+       lwz             $rounds,240($key1)
+       srwi            $rounds,$rounds,1
+       subi            $rounds,$rounds,1
+       li              $idx,16
+
+       vslb            $eighty7,$seven,$seven          # 0x808080..80
+       vor             $eighty7,$eighty7,$seven        # 0x878787..87
+       vspltisb        $tmp,1                          # 0x010101..01
+       vsldoi          $eighty7,$eighty7,$tmp,15       # 0x870101..01
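+                                                       # $eighty7 implements
+                                                       # doubling in GF(2^128)
+                                                       # mod x^128+x^7+x^2+x+1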
+
+       ${UCMP}i        $len,96
+       bge             _aesp8_xts_encrypt6x
+
+       andi.           $taillen,$len,15
+       subic           r0,$len,32
+       subi            $taillen,$taillen,16
+       subfe           r0,r0,r0
+       and             r0,r0,$taillen
+       add             $inp,$inp,r0
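+                                                       # if $len<32, pull $inp
+                                                       # back so the next block
+                                                       # load cannot run past
+                                                       # the input; the tail is
+                                                       # handled by ciphertext
+                                                       # stealing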
+
+       lvx             $rndkey0,0,$key1
+       lvx             $rndkey1,$idx,$key1
+       addi            $idx,$idx,16
+       vperm           $inout,$inout,$inptail,$inpperm
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vxor            $inout,$inout,$tweak
+       vxor            $inout,$inout,$rndkey0
+       lvx             $rndkey0,$idx,$key1
+       addi            $idx,$idx,16
+       mtctr           $rounds
+       b               Loop_xts_enc
+
+.align 5
+Loop_xts_enc:
+       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
+       vcipher         $inout,$inout,$rndkey1
+       lvx             $rndkey1,$idx,$key1
+       addi            $idx,$idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vcipher         $inout,$inout,$rndkey0
+       lvx             $rndkey0,$idx,$key1
+       addi            $idx,$idx,16
+       bdnz            Loop_xts_enc
+
+       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
+       vcipher         $inout,$inout,$rndkey1
+       lvx             $rndkey1,$idx,$key1
+       li              $idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vxor            $rndkey0,$rndkey0,$tweak
+       vcipherlast     $output,$inout,$rndkey0
+
+       le?vperm        $tmp,$output,$output,$leperm
+       be?nop
+       le?stvx_u       $tmp,0,$out
+       be?stvx_u       $output,0,$out
+       addi            $out,$out,16
+
+       subic.          $len,$len,16
+       beq             Lxts_enc_done
+
+       vmr             $inout,$inptail
+       lvx             $inptail,0,$inp
+       addi            $inp,$inp,16
+       lvx             $rndkey0,0,$key1
+       lvx             $rndkey1,$idx,$key1
+       addi            $idx,$idx,16
+
+       subic           r0,$len,32
+       subfe           r0,r0,r0
+       and             r0,r0,$taillen
+       add             $inp,$inp,r0
+
+       vsrab           $tmp,$tweak,$seven              # next tweak value
+       vaddubm         $tweak,$tweak,$tweak
+       vsldoi          $tmp,$tmp,$tmp,15
+       vand            $tmp,$tmp,$eighty7
+       vxor            $tweak,$tweak,$tmp
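+                                                       # tweak *= x in GF(2^128):
+                                                       # vaddubm doubles every
+                                                       # byte; the mask routes
+                                                       # each lost msb into the
+                                                       # next byte (0x01) or
+                                                       # folds it back as 0x87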
+
+       vperm           $inout,$inout,$inptail,$inpperm
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vxor            $inout,$inout,$tweak
+       vxor            $output,$output,$rndkey0        # just in case $len<16
+       vxor            $inout,$inout,$rndkey0
+       lvx             $rndkey0,$idx,$key1
+       addi            $idx,$idx,16
+
+       mtctr           $rounds
+       ${UCMP}i        $len,16
+       bge             Loop_xts_enc
+
+       vxor            $output,$output,$tweak
+       lvsr            $inpperm,0,$len                 # $inpperm is no longer needed
+       vxor            $inptail,$inptail,$inptail      # $inptail is no longer needed
+       vspltisb        $tmp,-1
+       vperm           $inptail,$inptail,$tmp,$inpperm
+       vsel            $inout,$inout,$output,$inptail
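+                                                       # first $len bytes come
+                                                       # from the new partial
+                                                       # plaintext, the rest
+                                                       # from the previous
+                                                       # ciphertext block (both
+                                                       # whitened identically)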
+
+       subi            r11,$out,17
+       subi            $out,$out,16
+       mtctr           $len
+       li              $len,16
+Loop_xts_enc_steal:
+       lbzu            r0,1(r11)
+       stb             r0,16(r11)
+       bdnz            Loop_xts_enc_steal
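+                                                       # the first $len bytes
+                                                       # of the last ciphertext
+                                                       # block now form the
+                                                       # short final block;
+                                                       # re-encrypt the merged
+                                                       # block over the last
+                                                       # full-block slot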
+
+       mtctr           $rounds
+       b               Loop_xts_enc                    # one more time...
+
+Lxts_enc_done:
+       ${UCMP}i        $ivp,0
+       beq             Lxts_enc_ret
+
+       vsrab           $tmp,$tweak,$seven              # next tweak value
+       vaddubm         $tweak,$tweak,$tweak
+       vsldoi          $tmp,$tmp,$tmp,15
+       vand            $tmp,$tmp,$eighty7
+       vxor            $tweak,$tweak,$tmp
+
+       le?vperm        $tweak,$tweak,$tweak,$leperm
+       stvx_u          $tweak,0,$ivp
+
+Lxts_enc_ret:
+       mtspr           256,r12                         # restore vrsave
+       li              r3,0
+       blr
+       .long           0
+       .byte           0,12,0x04,0,0x80,6,6,0
+       .long           0
+.size  .${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
+
+.globl .${prefix}_xts_decrypt
+       mr              $inp,r3                         # reassign
+       li              r3,-1
+       ${UCMP}i        $len,16
+       bltlr-
+
+       lis             r0,0xfff8
+       mfspr           r12,256                         # save vrsave
+       li              r11,0
+       mtspr           256,r0
+
+       andi.           r0,$len,15
+       neg             r0,r0
+       andi.           r0,r0,16
+       sub             $len,$len,r0
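+                                                       # if a partial tail
+                                                       # exists, hold back one
+                                                       # full block for
+                                                       # ciphertext stealing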
+
+       vspltisb        $seven,0x07                     # 0x070707..07
+       le?lvsl         $leperm,r11,r11
+       le?vspltisb     $tmp,0x0f
+       le?vxor         $leperm,$leperm,$seven
+
+       li              $idx,15
+       lvx             $tweak,0,$ivp                   # load [unaligned] iv
+       lvsl            $inpperm,0,$ivp
+       lvx             $inptail,$idx,$ivp
+       le?vxor         $inpperm,$inpperm,$tmp
+       vperm           $tweak,$tweak,$inptail,$inpperm
+
+       neg             r11,$inp
+       lvsr            $inpperm,0,r11                  # prepare for unaligned load
+       lvx             $inout,0,$inp
+       addi            $inp,$inp,15                    # 15 is not a typo
+       le?vxor         $inpperm,$inpperm,$tmp
+
+       ${UCMP}i        $key2,0                         # key2==NULL?
+       beq             Lxts_dec_no_key2
+
+       ?lvsl           $keyperm,0,$key2                # prepare for unaligned key
+       lwz             $rounds,240($key2)
+       srwi            $rounds,$rounds,1
+       subi            $rounds,$rounds,1
+       li              $idx,16
+
+       lvx             $rndkey0,0,$key2
+       lvx             $rndkey1,$idx,$key2
+       addi            $idx,$idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vxor            $tweak,$tweak,$rndkey0
+       lvx             $rndkey0,$idx,$key2
+       addi            $idx,$idx,16
+       mtctr           $rounds
+
+Ltweak_xts_dec:
+       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
+       vcipher         $tweak,$tweak,$rndkey1
+       lvx             $rndkey1,$idx,$key2
+       addi            $idx,$idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vcipher         $tweak,$tweak,$rndkey0
+       lvx             $rndkey0,$idx,$key2
+       addi            $idx,$idx,16
+       bdnz            Ltweak_xts_dec
+
+       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
+       vcipher         $tweak,$tweak,$rndkey1
+       lvx             $rndkey1,$idx,$key2
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vcipherlast     $tweak,$tweak,$rndkey0
+
+       li              $ivp,0                          # don't chain the tweak
+       b               Lxts_dec
+
+Lxts_dec_no_key2:
+       neg             $idx,$len
+       andi.           $idx,$idx,15
+       add             $len,$len,$idx                  # in "tweak chaining"
+                                                       # mode only complete
+                                                       # blocks are processed
+Lxts_dec:
+       lvx             $inptail,0,$inp
+       addi            $inp,$inp,16
+
+       ?lvsl           $keyperm,0,$key1                # prepare for unaligned key
+       lwz             $rounds,240($key1)
+       srwi            $rounds,$rounds,1
+       subi            $rounds,$rounds,1
+       li              $idx,16
+
+       vslb            $eighty7,$seven,$seven          # 0x808080..80
+       vor             $eighty7,$eighty7,$seven        # 0x878787..87
+       vspltisb        $tmp,1                          # 0x010101..01
+       vsldoi          $eighty7,$eighty7,$tmp,15       # 0x870101..01
+
+       ${UCMP}i        $len,96
+       bge             _aesp8_xts_decrypt6x
+
+       lvx             $rndkey0,0,$key1
+       lvx             $rndkey1,$idx,$key1
+       addi            $idx,$idx,16
+       vperm           $inout,$inout,$inptail,$inpperm
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vxor            $inout,$inout,$tweak
+       vxor            $inout,$inout,$rndkey0
+       lvx             $rndkey0,$idx,$key1
+       addi            $idx,$idx,16
+       mtctr           $rounds
+
+       ${UCMP}i        $len,16
+       blt             Ltail_xts_dec
+       be?b            Loop_xts_dec
+
+.align 5
+Loop_xts_dec:
+       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
+       vncipher        $inout,$inout,$rndkey1
+       lvx             $rndkey1,$idx,$key1
+       addi            $idx,$idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vncipher        $inout,$inout,$rndkey0
+       lvx             $rndkey0,$idx,$key1
+       addi            $idx,$idx,16
+       bdnz            Loop_xts_dec
+
+       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
+       vncipher        $inout,$inout,$rndkey1
+       lvx             $rndkey1,$idx,$key1
+       li              $idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vxor            $rndkey0,$rndkey0,$tweak
+       vncipherlast    $output,$inout,$rndkey0
+
+       le?vperm        $tmp,$output,$output,$leperm
+       be?nop
+       le?stvx_u       $tmp,0,$out
+       be?stvx_u       $output,0,$out
+       addi            $out,$out,16
+
+       subic.          $len,$len,16
+       beq             Lxts_dec_done
+
+       vmr             $inout,$inptail
+       lvx             $inptail,0,$inp
+       addi            $inp,$inp,16
+       lvx             $rndkey0,0,$key1
+       lvx             $rndkey1,$idx,$key1
+       addi            $idx,$idx,16
+
+       vsrab           $tmp,$tweak,$seven              # next tweak value
+       vaddubm         $tweak,$tweak,$tweak
+       vsldoi          $tmp,$tmp,$tmp,15
+       vand            $tmp,$tmp,$eighty7
+       vxor            $tweak,$tweak,$tmp
+
+       vperm           $inout,$inout,$inptail,$inpperm
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vxor            $inout,$inout,$tweak
+       vxor            $inout,$inout,$rndkey0
+       lvx             $rndkey0,$idx,$key1
+       addi            $idx,$idx,16
+
+       mtctr           $rounds
+       ${UCMP}i        $len,16
+       bge             Loop_xts_dec
+
+Ltail_xts_dec:
+       vsrab           $tmp,$tweak,$seven              # next tweak value
+       vaddubm         $tweak1,$tweak,$tweak
+       vsldoi          $tmp,$tmp,$tmp,15
+       vand            $tmp,$tmp,$eighty7
+       vxor            $tweak1,$tweak1,$tmp
+
+       subi            $inp,$inp,16
+       add             $inp,$inp,$len
+
+       vxor            $inout,$inout,$tweak            # :-(
+       vxor            $inout,$inout,$tweak1           # :-)
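+                                                       # decrypt-side stealing:
+                                                       # this full block uses
+                                                       # the *next* tweak
+                                                       # ($tweak1); $tweak is
+                                                       # kept for the final
+                                                       # short block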
+
+Loop_xts_dec_short:
+       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
+       vncipher        $inout,$inout,$rndkey1
+       lvx             $rndkey1,$idx,$key1
+       addi            $idx,$idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vncipher        $inout,$inout,$rndkey0
+       lvx             $rndkey0,$idx,$key1
+       addi            $idx,$idx,16
+       bdnz            Loop_xts_dec_short
+
+       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
+       vncipher        $inout,$inout,$rndkey1
+       lvx             $rndkey1,$idx,$key1
+       li              $idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vxor            $rndkey0,$rndkey0,$tweak1
+       vncipherlast    $output,$inout,$rndkey0
+
+       le?vperm        $tmp,$output,$output,$leperm
+       be?nop
+       le?stvx_u       $tmp,0,$out
+       be?stvx_u       $output,0,$out
+
+       vmr             $inout,$inptail
+       lvx             $inptail,0,$inp
+       #addi           $inp,$inp,16
+       lvx             $rndkey0,0,$key1
+       lvx             $rndkey1,$idx,$key1
+       addi            $idx,$idx,16
+       vperm           $inout,$inout,$inptail,$inpperm
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+
+       lvsr            $inpperm,0,$len                 # $inpperm is no longer needed
+       vxor            $inptail,$inptail,$inptail      # $inptail is no longer needed
+       vspltisb        $tmp,-1
+       vperm           $inptail,$inptail,$tmp,$inpperm
+       vsel            $inout,$inout,$output,$inptail
+
+       vxor            $rndkey0,$rndkey0,$tweak
+       vxor            $inout,$inout,$rndkey0
+       lvx             $rndkey0,$idx,$key1
+       addi            $idx,$idx,16
+
+       subi            r11,$out,1
+       mtctr           $len
+       li              $len,16
+Loop_xts_dec_steal:
+       lbzu            r0,1(r11)
+       stb             r0,16(r11)
+       bdnz            Loop_xts_dec_steal
+
+       mtctr           $rounds
+       b               Loop_xts_dec                    # one more time...
+
+Lxts_dec_done:
+       ${UCMP}i        $ivp,0
+       beq             Lxts_dec_ret
+
+       vsrab           $tmp,$tweak,$seven              # next tweak value
+       vaddubm         $tweak,$tweak,$tweak
+       vsldoi          $tmp,$tmp,$tmp,15
+       vand            $tmp,$tmp,$eighty7
+       vxor            $tweak,$tweak,$tmp
+
+       le?vperm        $tweak,$tweak,$tweak,$leperm
+       stvx_u          $tweak,0,$ivp
+
+Lxts_dec_ret:
+       mtspr           256,r12                         # restore vrsave
+       li              r3,0
+       blr
+       .long           0
+       .byte           0,12,0x04,0,0x80,6,6,0
+       .long           0
+.size  .${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
+___
+#########################################################################
+{{     # Optimized XTS procedures                                      #
+my $key_=$key2;
+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31));
+    $x00=0 if ($flavour =~ /osx/);
+my ($in0,  $in1,  $in2,  $in3,  $in4,  $in5 )=map("v$_",(0..5));
+my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
+my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
+my $rndkey0="v23";     # v24-v25 rotating buffer for the first round keys
+                       # v26-v31 last 6 round keys
+my ($keyperm)=($out0); # aliases with "caller", redundant assignment
+my $taillen=$x70;
+
+$code.=<<___;
+.align 5
+_aesp8_xts_encrypt6x:
+       $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+       mflr            r11
+       li              r7,`$FRAME+8*16+15`
+       li              r3,`$FRAME+8*16+31`
+       $PUSH           r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
+       stvx            v20,r7,$sp              # ABI says so
+       addi            r7,r7,32
+       stvx            v21,r3,$sp
+       addi            r3,r3,32
+       stvx            v22,r7,$sp
+       addi            r7,r7,32
+       stvx            v23,r3,$sp
+       addi            r3,r3,32
+       stvx            v24,r7,$sp
+       addi            r7,r7,32
+       stvx            v25,r3,$sp
+       addi            r3,r3,32
+       stvx            v26,r7,$sp
+       addi            r7,r7,32
+       stvx            v27,r3,$sp
+       addi            r3,r3,32
+       stvx            v28,r7,$sp
+       addi            r7,r7,32
+       stvx            v29,r3,$sp
+       addi            r3,r3,32
+       stvx            v30,r7,$sp
+       stvx            v31,r3,$sp
+       li              r0,-1
+       stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
+       li              $x10,0x10
+       $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+       li              $x20,0x20
+       $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+       li              $x30,0x30
+       $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+       li              $x40,0x40
+       $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+       li              $x50,0x50
+       $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+       li              $x60,0x60
+       $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+       li              $x70,0x70
+       mtspr           256,r0
+
+       xxlor           2, 32+$eighty7, 32+$eighty7
+       vsldoi          $eighty7,$tmp,$eighty7,1        # 0x010101..87
+       xxlor           1, 32+$eighty7, 32+$eighty7
+
+       # Load the vpermxor control constant from Lconsts.
+       mr              $x70, r6                # save r6 ($key1)
+       bl              Lconsts
+       lxvw4x          0, $x40, r6             # control constant into vsr0
+       mr              r6, $x70                # restore r6
+       li              $x70,0x70
+
+       subi            $rounds,$rounds,3       # -4 in total
+
+       lvx             $rndkey0,$x00,$key1     # load key schedule
+       lvx             v30,$x10,$key1
+       addi            $key1,$key1,0x20
+       lvx             v31,$x00,$key1
+       ?vperm          $rndkey0,$rndkey0,v30,$keyperm
+       addi            $key_,$sp,$FRAME+15
+       mtctr           $rounds
+
+Load_xts_enc_key:
+       ?vperm          v24,v30,v31,$keyperm
+       lvx             v30,$x10,$key1
+       addi            $key1,$key1,0x20
+       stvx            v24,$x00,$key_          # off-load round[1]
+       ?vperm          v25,v31,v30,$keyperm
+       lvx             v31,$x00,$key1
+       stvx            v25,$x10,$key_          # off-load round[2]
+       addi            $key_,$key_,0x20
+       bdnz            Load_xts_enc_key
+
+       lvx             v26,$x10,$key1
+       ?vperm          v24,v30,v31,$keyperm
+       lvx             v27,$x20,$key1
+       stvx            v24,$x00,$key_          # off-load round[3]
+       ?vperm          v25,v31,v26,$keyperm
+       lvx             v28,$x30,$key1
+       stvx            v25,$x10,$key_          # off-load round[4]
+       addi            $key_,$sp,$FRAME+15     # rewind $key_
+       ?vperm          v26,v26,v27,$keyperm
+       lvx             v29,$x40,$key1
+       ?vperm          v27,v27,v28,$keyperm
+       lvx             v30,$x50,$key1
+       ?vperm          v28,v28,v29,$keyperm
+       lvx             v31,$x60,$key1
+       ?vperm          v29,v29,v30,$keyperm
+       lvx             $twk5,$x70,$key1        # borrow $twk5
+       ?vperm          v30,v30,v31,$keyperm
+       lvx             v24,$x00,$key_          # pre-load round[1]
+       ?vperm          v31,v31,$twk5,$keyperm
+       lvx             v25,$x10,$key_          # pre-load round[2]
+
+       # Switch to the following code, which uses 0x010101..87 to generate the tweak.
+       #     eighty7 = 0x010101..87
+       # vsrab         tmp, tweak, seven       # next tweak value, right shift 7 bits
+       # vand          tmp, tmp, eighty7       # last byte with carry
+       # vaddubm       tweak, tweak, tweak     # left shift 1 bit (x2)
+       # xxlor         vsx, 0, 0
+       # vpermxor      tweak, tweak, tmp, vsx
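+       # (vpermxor folds the old vsldoi rotate and vxor into one
+       # instruction, keyed by the constant loaded from Lconsts above.)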
+
+        vperm          $in0,$inout,$inptail,$inpperm
+        subi           $inp,$inp,31            # undo "caller"
+       vxor            $twk0,$tweak,$rndkey0
+       vsrab           $tmp,$tweak,$seven      # next tweak value
+       vaddubm         $tweak,$tweak,$tweak
+       vand            $tmp,$tmp,$eighty7
+        vxor           $out0,$in0,$twk0
+       xxlor           32+$in1, 0, 0
+       vpermxor        $tweak, $tweak, $tmp, $in1
+
+        lvx_u          $in1,$x10,$inp
+       vxor            $twk1,$tweak,$rndkey0
+       vsrab           $tmp,$tweak,$seven      # next tweak value
+       vaddubm         $tweak,$tweak,$tweak
+        le?vperm       $in1,$in1,$in1,$leperm
+       vand            $tmp,$tmp,$eighty7
+        vxor           $out1,$in1,$twk1
+       xxlor           32+$in2, 0, 0
+       vpermxor        $tweak, $tweak, $tmp, $in2
+
+        lvx_u          $in2,$x20,$inp
+        andi.          $taillen,$len,15
+       vxor            $twk2,$tweak,$rndkey0
+       vsrab           $tmp,$tweak,$seven      # next tweak value
+       vaddubm         $tweak,$tweak,$tweak
+        le?vperm       $in2,$in2,$in2,$leperm
+       vand            $tmp,$tmp,$eighty7
+        vxor           $out2,$in2,$twk2
+       xxlor           32+$in3, 0, 0
+       vpermxor        $tweak, $tweak, $tmp, $in3
+
+        lvx_u          $in3,$x30,$inp
+        sub            $len,$len,$taillen
+       vxor            $twk3,$tweak,$rndkey0
+       vsrab           $tmp,$tweak,$seven      # next tweak value
+       vaddubm         $tweak,$tweak,$tweak
+        le?vperm       $in3,$in3,$in3,$leperm
+       vand            $tmp,$tmp,$eighty7
+        vxor           $out3,$in3,$twk3
+       xxlor           32+$in4, 0, 0
+       vpermxor        $tweak, $tweak, $tmp, $in4
+
+        lvx_u          $in4,$x40,$inp
+        subi           $len,$len,0x60
+       vxor            $twk4,$tweak,$rndkey0
+       vsrab           $tmp,$tweak,$seven      # next tweak value
+       vaddubm         $tweak,$tweak,$tweak
+        le?vperm       $in4,$in4,$in4,$leperm
+       vand            $tmp,$tmp,$eighty7
+        vxor           $out4,$in4,$twk4
+       xxlor           32+$in5, 0, 0
+       vpermxor        $tweak, $tweak, $tmp, $in5
+
+        lvx_u          $in5,$x50,$inp
+        addi           $inp,$inp,0x60
+       vxor            $twk5,$tweak,$rndkey0
+       vsrab           $tmp,$tweak,$seven      # next tweak value
+       vaddubm         $tweak,$tweak,$tweak
+        le?vperm       $in5,$in5,$in5,$leperm
+       vand            $tmp,$tmp,$eighty7
+        vxor           $out5,$in5,$twk5
+       xxlor           32+$in0, 0, 0
+       vpermxor        $tweak, $tweak, $tmp, $in0
+
+       vxor            v31,v31,$rndkey0
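+                                               # $twkN already carries
+                                               # rndkey[0], so folding it into
+                                               # v31 makes rndkey[0] cancel in
+                                               # the vcipherlast operand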
+       mtctr           $rounds
+       b               Loop_xts_enc6x
+
+.align 5
+Loop_xts_enc6x:
+       vcipher         $out0,$out0,v24
+       vcipher         $out1,$out1,v24
+       vcipher         $out2,$out2,v24
+       vcipher         $out3,$out3,v24
+       vcipher         $out4,$out4,v24
+       vcipher         $out5,$out5,v24
+       lvx             v24,$x20,$key_          # round[3]
+       addi            $key_,$key_,0x20
+
+       vcipher         $out0,$out0,v25
+       vcipher         $out1,$out1,v25
+       vcipher         $out2,$out2,v25
+       vcipher         $out3,$out3,v25
+       vcipher         $out4,$out4,v25
+       vcipher         $out5,$out5,v25
+       lvx             v25,$x10,$key_          # round[4]
+       bdnz            Loop_xts_enc6x
+
+       xxlor           32+$eighty7, 1, 1       # 0x010101..87
+
+       subic           $len,$len,96            # $len-=96
+        vxor           $in0,$twk0,v31          # xor with last round key
+       vcipher         $out0,$out0,v24
+       vcipher         $out1,$out1,v24
+        vsrab          $tmp,$tweak,$seven      # next tweak value
+        vxor           $twk0,$tweak,$rndkey0
+        vaddubm        $tweak,$tweak,$tweak
+       vcipher         $out2,$out2,v24
+       vcipher         $out3,$out3,v24
+       vcipher         $out4,$out4,v24
+       vcipher         $out5,$out5,v24
+
+       subfe.          r0,r0,r0                # borrow?-1:0
+        vand           $tmp,$tmp,$eighty7
+       vcipher         $out0,$out0,v25
+       vcipher         $out1,$out1,v25
+        xxlor          32+$in1, 0, 0
+        vpermxor       $tweak, $tweak, $tmp, $in1
+       vcipher         $out2,$out2,v25
+       vcipher         $out3,$out3,v25
+        vxor           $in1,$twk1,v31
+        vsrab          $tmp,$tweak,$seven      # next tweak value
+        vxor           $twk1,$tweak,$rndkey0
+       vcipher         $out4,$out4,v25
+       vcipher         $out5,$out5,v25
+
+       and             r0,r0,$len
+        vaddubm        $tweak,$tweak,$tweak
+       vcipher         $out0,$out0,v26
+       vcipher         $out1,$out1,v26
+        vand           $tmp,$tmp,$eighty7
+       vcipher         $out2,$out2,v26
+       vcipher         $out3,$out3,v26
+        xxlor          32+$in2, 0, 0
+        vpermxor       $tweak, $tweak, $tmp, $in2
+       vcipher         $out4,$out4,v26
+       vcipher         $out5,$out5,v26
+
+       add             $inp,$inp,r0            # $inp is adjusted in such a
+                                               # way that at exit from the
+                                               # loop inX-in5 are loaded
+                                               # with the last "words"
+        vxor           $in2,$twk2,v31
+        vsrab          $tmp,$tweak,$seven      # next tweak value
+        vxor           $twk2,$tweak,$rndkey0
+        vaddubm        $tweak,$tweak,$tweak
+       vcipher         $out0,$out0,v27
+       vcipher         $out1,$out1,v27
+       vcipher         $out2,$out2,v27
+       vcipher         $out3,$out3,v27
+        vand           $tmp,$tmp,$eighty7
+       vcipher         $out4,$out4,v27
+       vcipher         $out5,$out5,v27
+
+       addi            $key_,$sp,$FRAME+15     # rewind $key_
+        xxlor          32+$in3, 0, 0
+        vpermxor       $tweak, $tweak, $tmp, $in3
+       vcipher         $out0,$out0,v28
+       vcipher         $out1,$out1,v28
+        vxor           $in3,$twk3,v31
+        vsrab          $tmp,$tweak,$seven      # next tweak value
+        vxor           $twk3,$tweak,$rndkey0
+       vcipher         $out2,$out2,v28
+       vcipher         $out3,$out3,v28
+        vaddubm        $tweak,$tweak,$tweak
+       vcipher         $out4,$out4,v28
+       vcipher         $out5,$out5,v28
+       lvx             v24,$x00,$key_          # re-pre-load round[1]
+        vand           $tmp,$tmp,$eighty7
+
+       vcipher         $out0,$out0,v29
+       vcipher         $out1,$out1,v29
+        xxlor          32+$in4, 0, 0
+        vpermxor       $tweak, $tweak, $tmp, $in4
+       vcipher         $out2,$out2,v29
+       vcipher         $out3,$out3,v29
+        vxor           $in4,$twk4,v31
+        vsrab          $tmp,$tweak,$seven      # next tweak value
+        vxor           $twk4,$tweak,$rndkey0
+       vcipher         $out4,$out4,v29
+       vcipher         $out5,$out5,v29
+       lvx             v25,$x10,$key_          # re-pre-load round[2]
+        vaddubm        $tweak,$tweak,$tweak
+
+       vcipher         $out0,$out0,v30
+       vcipher         $out1,$out1,v30
+        vand           $tmp,$tmp,$eighty7
+       vcipher         $out2,$out2,v30
+       vcipher         $out3,$out3,v30
+        xxlor          32+$in5, 0, 0
+        vpermxor       $tweak, $tweak, $tmp, $in5
+       vcipher         $out4,$out4,v30
+       vcipher         $out5,$out5,v30
+        vxor           $in5,$twk5,v31
+        vsrab          $tmp,$tweak,$seven      # next tweak value
+        vxor           $twk5,$tweak,$rndkey0
+
+       vcipherlast     $out0,$out0,$in0
+        lvx_u          $in0,$x00,$inp          # load next input block
+        vaddubm        $tweak,$tweak,$tweak
+       vcipherlast     $out1,$out1,$in1
+        lvx_u          $in1,$x10,$inp
+       vcipherlast     $out2,$out2,$in2
+        le?vperm       $in0,$in0,$in0,$leperm
+        lvx_u          $in2,$x20,$inp
+        vand           $tmp,$tmp,$eighty7
+       vcipherlast     $out3,$out3,$in3
+        le?vperm       $in1,$in1,$in1,$leperm
+        lvx_u          $in3,$x30,$inp
+       vcipherlast     $out4,$out4,$in4
+        le?vperm       $in2,$in2,$in2,$leperm
+        lvx_u          $in4,$x40,$inp
+        xxlor          10, 32+$in0, 32+$in0
+        xxlor          32+$in0, 0, 0
+        vpermxor       $tweak, $tweak, $tmp, $in0
+        xxlor          32+$in0, 10, 10
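+                                               # ($in0 was parked in vs10
+                                               # while borrowed as the
+                                               # vpermxor control operand)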
+       vcipherlast     $tmp,$out5,$in5         # last block might be needed
+                                               # in stealing mode
+        le?vperm       $in3,$in3,$in3,$leperm
+        lvx_u          $in5,$x50,$inp
+        addi           $inp,$inp,0x60
+        le?vperm       $in4,$in4,$in4,$leperm
+        le?vperm       $in5,$in5,$in5,$leperm
+
+       le?vperm        $out0,$out0,$out0,$leperm
+       le?vperm        $out1,$out1,$out1,$leperm
+       stvx_u          $out0,$x00,$out         # store output
+        vxor           $out0,$in0,$twk0
+       le?vperm        $out2,$out2,$out2,$leperm
+       stvx_u          $out1,$x10,$out
+        vxor           $out1,$in1,$twk1
+       le?vperm        $out3,$out3,$out3,$leperm
+       stvx_u          $out2,$x20,$out
+        vxor           $out2,$in2,$twk2
+       le?vperm        $out4,$out4,$out4,$leperm
+       stvx_u          $out3,$x30,$out
+        vxor           $out3,$in3,$twk3
+       le?vperm        $out5,$tmp,$tmp,$leperm
+       stvx_u          $out4,$x40,$out
+        vxor           $out4,$in4,$twk4
+       le?stvx_u       $out5,$x50,$out
+       be?stvx_u       $tmp, $x50,$out
+        vxor           $out5,$in5,$twk5
+       addi            $out,$out,0x60
+
+       mtctr           $rounds
+       beq             Loop_xts_enc6x          # did $len-=96 borrow?
+
+       xxlor           32+$eighty7, 2, 2       # 0x010101..87
+
+       addic.          $len,$len,0x60
+       beq             Lxts_enc6x_zero
+       cmpwi           $len,0x20
+       blt             Lxts_enc6x_one
+       nop
+       beq             Lxts_enc6x_two
+       cmpwi           $len,0x40
+       blt             Lxts_enc6x_three
+       nop
+       beq             Lxts_enc6x_four
+
+Lxts_enc6x_five:
+       vxor            $out0,$in1,$twk0
+       vxor            $out1,$in2,$twk1
+       vxor            $out2,$in3,$twk2
+       vxor            $out3,$in4,$twk3
+       vxor            $out4,$in5,$twk4
+
+       bl              _aesp8_xts_enc5x
+
+       le?vperm        $out0,$out0,$out0,$leperm
+       vmr             $twk0,$twk5             # unused tweak
+       le?vperm        $out1,$out1,$out1,$leperm
+       stvx_u          $out0,$x00,$out         # store output
+       le?vperm        $out2,$out2,$out2,$leperm
+       stvx_u          $out1,$x10,$out
+       le?vperm        $out3,$out3,$out3,$leperm
+       stvx_u          $out2,$x20,$out
+       vxor            $tmp,$out4,$twk5        # last block prep for stealing
+       le?vperm        $out4,$out4,$out4,$leperm
+       stvx_u          $out3,$x30,$out
+       stvx_u          $out4,$x40,$out
+       addi            $out,$out,0x50
+       bne             Lxts_enc6x_steal
+       b               Lxts_enc6x_done
+
+.align 4
+Lxts_enc6x_four:
+       vxor            $out0,$in2,$twk0
+       vxor            $out1,$in3,$twk1
+       vxor            $out2,$in4,$twk2
+       vxor            $out3,$in5,$twk3
+       vxor            $out4,$out4,$out4
+
+       bl              _aesp8_xts_enc5x
+
+       le?vperm        $out0,$out0,$out0,$leperm
+       vmr             $twk0,$twk4             # unused tweak
+       le?vperm        $out1,$out1,$out1,$leperm
+       stvx_u          $out0,$x00,$out         # store output
+       le?vperm        $out2,$out2,$out2,$leperm
+       stvx_u          $out1,$x10,$out
+       vxor            $tmp,$out3,$twk4        # last block prep for stealing
+       le?vperm        $out3,$out3,$out3,$leperm
+       stvx_u          $out2,$x20,$out
+       stvx_u          $out3,$x30,$out
+       addi            $out,$out,0x40
+       bne             Lxts_enc6x_steal
+       b               Lxts_enc6x_done
+
+.align 4
+Lxts_enc6x_three:
+       vxor            $out0,$in3,$twk0
+       vxor            $out1,$in4,$twk1
+       vxor            $out2,$in5,$twk2
+       vxor            $out3,$out3,$out3
+       vxor            $out4,$out4,$out4
+
+       bl              _aesp8_xts_enc5x
+
+       le?vperm        $out0,$out0,$out0,$leperm
+       vmr             $twk0,$twk3             # unused tweak
+       le?vperm        $out1,$out1,$out1,$leperm
+       stvx_u          $out0,$x00,$out         # store output
+       vxor            $tmp,$out2,$twk3        # last block prep for stealing
+       le?vperm        $out2,$out2,$out2,$leperm
+       stvx_u          $out1,$x10,$out
+       stvx_u          $out2,$x20,$out
+       addi            $out,$out,0x30
+       bne             Lxts_enc6x_steal
+       b               Lxts_enc6x_done
+
+.align 4
+Lxts_enc6x_two:
+       vxor            $out0,$in4,$twk0
+       vxor            $out1,$in5,$twk1
+       vxor            $out2,$out2,$out2
+       vxor            $out3,$out3,$out3
+       vxor            $out4,$out4,$out4
+
+       bl              _aesp8_xts_enc5x
+
+       le?vperm        $out0,$out0,$out0,$leperm
+       vmr             $twk0,$twk2             # unused tweak
+       vxor            $tmp,$out1,$twk2        # last block prep for stealing
+       le?vperm        $out1,$out1,$out1,$leperm
+       stvx_u          $out0,$x00,$out         # store output
+       stvx_u          $out1,$x10,$out
+       addi            $out,$out,0x20
+       bne             Lxts_enc6x_steal
+       b               Lxts_enc6x_done
+
+.align 4
+Lxts_enc6x_one:
+       vxor            $out0,$in5,$twk0
+       nop
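+	# Encrypt the one remaining full block, pre-loading the last 16
+	# input bytes during the rounds in case a partial tail block follows.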
+Loop_xts_enc1x:
+       vcipher         $out0,$out0,v24
+       lvx             v24,$x20,$key_          # round[3]
+       addi            $key_,$key_,0x20
+
+       vcipher         $out0,$out0,v25
+       lvx             v25,$x10,$key_          # round[4]
+       bdnz            Loop_xts_enc1x
+
+       add             $inp,$inp,$taillen
+       cmpwi           $taillen,0
+       vcipher         $out0,$out0,v24
+
+       subi            $inp,$inp,16
+       vcipher         $out0,$out0,v25
+
+       lvsr            $inpperm,0,$taillen
+       vcipher         $out0,$out0,v26
+
+       lvx_u           $in0,0,$inp
+       vcipher         $out0,$out0,v27
+
+       addi            $key_,$sp,$FRAME+15     # rewind $key_
+       vcipher         $out0,$out0,v28
+       lvx             v24,$x00,$key_          # re-pre-load round[1]
+
+       vcipher         $out0,$out0,v29
+       lvx             v25,$x10,$key_          # re-pre-load round[2]
+        vxor           $twk0,$twk0,v31
+
+       le?vperm        $in0,$in0,$in0,$leperm
+       vcipher         $out0,$out0,v30
+
+       vperm           $in0,$in0,$in0,$inpperm
+       vcipherlast     $out0,$out0,$twk0
+
+       vmr             $twk0,$twk1             # unused tweak
+       vxor            $tmp,$out0,$twk1        # last block prep for stealing
+       le?vperm        $out0,$out0,$out0,$leperm
+       stvx_u          $out0,$x00,$out         # store output
+       addi            $out,$out,0x10
+       bne             Lxts_enc6x_steal
+       b               Lxts_enc6x_done
+
+.align 4
+Lxts_enc6x_zero:
+       cmpwi           $taillen,0
+       beq             Lxts_enc6x_done
+
+       add             $inp,$inp,$taillen
+       subi            $inp,$inp,16
+       lvx_u           $in0,0,$inp
+       lvsr            $inpperm,0,$taillen     # $in5 is no more
+       le?vperm        $in0,$in0,$in0,$leperm
+       vperm           $in0,$in0,$in0,$inpperm
+       vxor            $tmp,$tmp,$twk0
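+	# Ciphertext stealing: build one final full block from the $taillen
+	# trailing input bytes plus bytes stolen from the last ciphertext
+	# block ($tmp), selected with a mask derived from $inpperm.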
+Lxts_enc6x_steal:
+       vxor            $in0,$in0,$twk0
+       vxor            $out0,$out0,$out0
+       vspltisb        $out1,-1
+       vperm           $out0,$out0,$out1,$inpperm
+       vsel            $out0,$in0,$tmp,$out0   # $tmp is last block, remember?
+
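+	# Copy the first $taillen bytes of the just-stored ciphertext block
+	# one block forward to form the partial tail, then rewind $out so the
+	# stolen block is re-encrypted into the last full-block slot.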
+       subi            r30,$out,17
+       subi            $out,$out,16
+       mtctr           $taillen
+Loop_xts_enc6x_steal:
+       lbzu            r0,1(r30)
+       stb             r0,16(r30)
+       bdnz            Loop_xts_enc6x_steal
+
+       li              $taillen,0
+       mtctr           $rounds
+       b               Loop_xts_enc1x          # one more time...
+
+.align 4
+Lxts_enc6x_done:
+       ${UCMP}i        $ivp,0
+       beq             Lxts_enc6x_ret
+
+       vxor            $tweak,$twk0,$rndkey0
+       le?vperm        $tweak,$tweak,$tweak,$leperm
+       stvx_u          $tweak,0,$ivp
+
+Lxts_enc6x_ret:
+       mtlr            r11
+       li              r10,`$FRAME+15`
+       li              r11,`$FRAME+31`
+       stvx            $seven,r10,$sp          # wipe copies of round keys
+       addi            r10,r10,32
+       stvx            $seven,r11,$sp
+       addi            r11,r11,32
+       stvx            $seven,r10,$sp
+       addi            r10,r10,32
+       stvx            $seven,r11,$sp
+       addi            r11,r11,32
+       stvx            $seven,r10,$sp
+       addi            r10,r10,32
+       stvx            $seven,r11,$sp
+       addi            r11,r11,32
+       stvx            $seven,r10,$sp
+       addi            r10,r10,32
+       stvx            $seven,r11,$sp
+       addi            r11,r11,32
+
+       mtspr           256,$vrsave
+       lvx             v20,r10,$sp             # ABI says so
+       addi            r10,r10,32
+       lvx             v21,r11,$sp
+       addi            r11,r11,32
+       lvx             v22,r10,$sp
+       addi            r10,r10,32
+       lvx             v23,r11,$sp
+       addi            r11,r11,32
+       lvx             v24,r10,$sp
+       addi            r10,r10,32
+       lvx             v25,r11,$sp
+       addi            r11,r11,32
+       lvx             v26,r10,$sp
+       addi            r10,r10,32
+       lvx             v27,r11,$sp
+       addi            r11,r11,32
+       lvx             v28,r10,$sp
+       addi            r10,r10,32
+       lvx             v29,r11,$sp
+       addi            r11,r11,32
+       lvx             v30,r10,$sp
+       lvx             v31,r11,$sp
+       $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+       $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+       $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+       $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+       $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+       $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+       addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+       blr
+       .long           0
+       .byte           0,12,0x04,1,0x80,6,6,0
+       .long           0
+
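+# Encrypt five blocks in parallel using the round keys cached on the stack.
+# The last round key is folded into the tweaks (via v31), so vcipherlast
+# performs the final round and the output tweak XOR in one step.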
+.align 5
+_aesp8_xts_enc5x:
+       vcipher         $out0,$out0,v24
+       vcipher         $out1,$out1,v24
+       vcipher         $out2,$out2,v24
+       vcipher         $out3,$out3,v24
+       vcipher         $out4,$out4,v24
+       lvx             v24,$x20,$key_          # round[3]
+       addi            $key_,$key_,0x20
+
+       vcipher         $out0,$out0,v25
+       vcipher         $out1,$out1,v25
+       vcipher         $out2,$out2,v25
+       vcipher         $out3,$out3,v25
+       vcipher         $out4,$out4,v25
+       lvx             v25,$x10,$key_          # round[4]
+       bdnz            _aesp8_xts_enc5x
+
+       add             $inp,$inp,$taillen
+       cmpwi           $taillen,0
+       vcipher         $out0,$out0,v24
+       vcipher         $out1,$out1,v24
+       vcipher         $out2,$out2,v24
+       vcipher         $out3,$out3,v24
+       vcipher         $out4,$out4,v24
+
+       subi            $inp,$inp,16
+       vcipher         $out0,$out0,v25
+       vcipher         $out1,$out1,v25
+       vcipher         $out2,$out2,v25
+       vcipher         $out3,$out3,v25
+       vcipher         $out4,$out4,v25
+        vxor           $twk0,$twk0,v31
+
+       vcipher         $out0,$out0,v26
+       lvsr            $inpperm,r0,$taillen    # $in5 is no more
+       vcipher         $out1,$out1,v26
+       vcipher         $out2,$out2,v26
+       vcipher         $out3,$out3,v26
+       vcipher         $out4,$out4,v26
+        vxor           $in1,$twk1,v31
+
+       vcipher         $out0,$out0,v27
+       lvx_u           $in0,0,$inp
+       vcipher         $out1,$out1,v27
+       vcipher         $out2,$out2,v27
+       vcipher         $out3,$out3,v27
+       vcipher         $out4,$out4,v27
+        vxor           $in2,$twk2,v31
+
+       addi            $key_,$sp,$FRAME+15     # rewind $key_
+       vcipher         $out0,$out0,v28
+       vcipher         $out1,$out1,v28
+       vcipher         $out2,$out2,v28
+       vcipher         $out3,$out3,v28
+       vcipher         $out4,$out4,v28
+       lvx             v24,$x00,$key_          # re-pre-load round[1]
+        vxor           $in3,$twk3,v31
+
+       vcipher         $out0,$out0,v29
+       le?vperm        $in0,$in0,$in0,$leperm
+       vcipher         $out1,$out1,v29
+       vcipher         $out2,$out2,v29
+       vcipher         $out3,$out3,v29
+       vcipher         $out4,$out4,v29
+       lvx             v25,$x10,$key_          # re-pre-load round[2]
+        vxor           $in4,$twk4,v31
+
+       vcipher         $out0,$out0,v30
+       vperm           $in0,$in0,$in0,$inpperm
+       vcipher         $out1,$out1,v30
+       vcipher         $out2,$out2,v30
+       vcipher         $out3,$out3,v30
+       vcipher         $out4,$out4,v30
+
+       vcipherlast     $out0,$out0,$twk0
+       vcipherlast     $out1,$out1,$in1
+       vcipherlast     $out2,$out2,$in2
+       vcipherlast     $out3,$out3,$in3
+       vcipherlast     $out4,$out4,$in4
+       blr
+        .long          0
+        .byte          0,12,0x14,0,0,0,0,0
+
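+# Decryption analogue of _aesp8_xts_encrypt6x: same structure, but built on
+# vncipher, and ciphertext stealing keeps two tweaks live because the last
+# full block and the stolen partial block use them in swapped order.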
+.align 5
+_aesp8_xts_decrypt6x:
+       $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+       mflr            r11
+       li              r7,`$FRAME+8*16+15`
+       li              r3,`$FRAME+8*16+31`
+       $PUSH           r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
+       stvx            v20,r7,$sp              # ABI says so
+       addi            r7,r7,32
+       stvx            v21,r3,$sp
+       addi            r3,r3,32
+       stvx            v22,r7,$sp
+       addi            r7,r7,32
+       stvx            v23,r3,$sp
+       addi            r3,r3,32
+       stvx            v24,r7,$sp
+       addi            r7,r7,32
+       stvx            v25,r3,$sp
+       addi            r3,r3,32
+       stvx            v26,r7,$sp
+       addi            r7,r7,32
+       stvx            v27,r3,$sp
+       addi            r3,r3,32
+       stvx            v28,r7,$sp
+       addi            r7,r7,32
+       stvx            v29,r3,$sp
+       addi            r3,r3,32
+       stvx            v30,r7,$sp
+       stvx            v31,r3,$sp
+       li              r0,-1
+       stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
+       li              $x10,0x10
+       $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+       li              $x20,0x20
+       $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+       li              $x30,0x30
+       $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+       li              $x40,0x40
+       $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+       li              $x50,0x50
+       $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+       li              $x60,0x60
+       $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+       li              $x70,0x70
+       mtspr           256,r0
+
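+	# Stash the tweak-reduction constant in vs1/vs2 so it can be restored
+	# cheaply with xxlor inside and after the main loop.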
+       xxlor           2, 32+$eighty7, 32+$eighty7
+       vsldoi          $eighty7,$tmp,$eighty7,1        # 0x010101..87
+       xxlor           1, 32+$eighty7, 32+$eighty7
+
+       # Load XOR Lconsts.
+       mr              $x70, r6
+       bl              Lconsts
+       lxvw4x          0, $x40, r6             # load XOR contents
+       mr              r6, $x70
+       li              $x70,0x70
+
+       subi            $rounds,$rounds,3       # -4 in total
+
+       lvx             $rndkey0,$x00,$key1     # load key schedule
+       lvx             v30,$x10,$key1
+       addi            $key1,$key1,0x20
+       lvx             v31,$x00,$key1
+       ?vperm          $rndkey0,$rndkey0,v30,$keyperm
+       addi            $key_,$sp,$FRAME+15
+       mtctr           $rounds
+
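+	# Copy the key schedule to the stack, fixing up alignment with
+	# ?vperm, so the rounds loop can reload round keys with plain lvx.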
+Load_xts_dec_key:
+       ?vperm          v24,v30,v31,$keyperm
+       lvx             v30,$x10,$key1
+       addi            $key1,$key1,0x20
+       stvx            v24,$x00,$key_          # off-load round[1]
+       ?vperm          v25,v31,v30,$keyperm
+       lvx             v31,$x00,$key1
+       stvx            v25,$x10,$key_          # off-load round[2]
+       addi            $key_,$key_,0x20
+       bdnz            Load_xts_dec_key
+
+       lvx             v26,$x10,$key1
+       ?vperm          v24,v30,v31,$keyperm
+       lvx             v27,$x20,$key1
+       stvx            v24,$x00,$key_          # off-load round[3]
+       ?vperm          v25,v31,v26,$keyperm
+       lvx             v28,$x30,$key1
+       stvx            v25,$x10,$key_          # off-load round[4]
+       addi            $key_,$sp,$FRAME+15     # rewind $key_
+       ?vperm          v26,v26,v27,$keyperm
+       lvx             v29,$x40,$key1
+       ?vperm          v27,v27,v28,$keyperm
+       lvx             v30,$x50,$key1
+       ?vperm          v28,v28,v29,$keyperm
+       lvx             v31,$x60,$key1
+       ?vperm          v29,v29,v30,$keyperm
+       lvx             $twk5,$x70,$key1        # borrow $twk5
+       ?vperm          v30,v30,v31,$keyperm
+       lvx             v24,$x00,$key_          # pre-load round[1]
+       ?vperm          v31,v31,$twk5,$keyperm
+       lvx             v25,$x10,$key_          # pre-load round[2]
+
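+	# Compute the six per-block tweaks.  Each step multiplies the running
+	# tweak by x in GF(2^128): vaddubm doubles every byte, vsrab/vand
+	# extract the carry bytes masked with the 0x87 reduction constant,
+	# and vpermxor (using the Lconsts XOR table) folds the carries in.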
+        vperm          $in0,$inout,$inptail,$inpperm
+        subi           $inp,$inp,31            # undo "caller"
+       vxor            $twk0,$tweak,$rndkey0
+       vsrab           $tmp,$tweak,$seven      # next tweak value
+       vaddubm         $tweak,$tweak,$tweak
+       vand            $tmp,$tmp,$eighty7
+        vxor           $out0,$in0,$twk0
+       xxlor           32+$in1, 0, 0
+       vpermxor        $tweak, $tweak, $tmp, $in1
+
+        lvx_u          $in1,$x10,$inp
+       vxor            $twk1,$tweak,$rndkey0
+       vsrab           $tmp,$tweak,$seven      # next tweak value
+       vaddubm         $tweak,$tweak,$tweak
+        le?vperm       $in1,$in1,$in1,$leperm
+       vand            $tmp,$tmp,$eighty7
+        vxor           $out1,$in1,$twk1
+       xxlor           32+$in2, 0, 0
+       vpermxor        $tweak, $tweak, $tmp, $in2
+
+        lvx_u          $in2,$x20,$inp
+        andi.          $taillen,$len,15
+       vxor            $twk2,$tweak,$rndkey0
+       vsrab           $tmp,$tweak,$seven      # next tweak value
+       vaddubm         $tweak,$tweak,$tweak
+        le?vperm       $in2,$in2,$in2,$leperm
+       vand            $tmp,$tmp,$eighty7
+        vxor           $out2,$in2,$twk2
+       xxlor           32+$in3, 0, 0
+       vpermxor        $tweak, $tweak, $tmp, $in3
+
+        lvx_u          $in3,$x30,$inp
+        sub            $len,$len,$taillen
+       vxor            $twk3,$tweak,$rndkey0
+       vsrab           $tmp,$tweak,$seven      # next tweak value
+       vaddubm         $tweak,$tweak,$tweak
+        le?vperm       $in3,$in3,$in3,$leperm
+       vand            $tmp,$tmp,$eighty7
+        vxor           $out3,$in3,$twk3
+       xxlor           32+$in4, 0, 0
+       vpermxor        $tweak, $tweak, $tmp, $in4
+
+        lvx_u          $in4,$x40,$inp
+        subi           $len,$len,0x60
+       vxor            $twk4,$tweak,$rndkey0
+       vsrab           $tmp,$tweak,$seven      # next tweak value
+       vaddubm         $tweak,$tweak,$tweak
+        le?vperm       $in4,$in4,$in4,$leperm
+       vand            $tmp,$tmp,$eighty7
+        vxor           $out4,$in4,$twk4
+       xxlor           32+$in5, 0, 0
+       vpermxor        $tweak, $tweak, $tmp, $in5
+
+        lvx_u          $in5,$x50,$inp
+        addi           $inp,$inp,0x60
+       vxor            $twk5,$tweak,$rndkey0
+       vsrab           $tmp,$tweak,$seven      # next tweak value
+       vaddubm         $tweak,$tweak,$tweak
+        le?vperm       $in5,$in5,$in5,$leperm
+       vand            $tmp,$tmp,$eighty7
+        vxor           $out5,$in5,$twk5
+       xxlor           32+$in0, 0, 0
+       vpermxor        $tweak, $tweak, $tmp, $in0
+
+       vxor            v31,v31,$rndkey0
+       mtctr           $rounds
+       b               Loop_xts_dec6x
+
+.align 5
+Loop_xts_dec6x:
+       vncipher        $out0,$out0,v24
+       vncipher        $out1,$out1,v24
+       vncipher        $out2,$out2,v24
+       vncipher        $out3,$out3,v24
+       vncipher        $out4,$out4,v24
+       vncipher        $out5,$out5,v24
+       lvx             v24,$x20,$key_          # round[3]
+       addi            $key_,$key_,0x20
+
+       vncipher        $out0,$out0,v25
+       vncipher        $out1,$out1,v25
+       vncipher        $out2,$out2,v25
+       vncipher        $out3,$out3,v25
+       vncipher        $out4,$out4,v25
+       vncipher        $out5,$out5,v25
+       lvx             v25,$x10,$key_          # round[4]
+       bdnz            Loop_xts_dec6x
+
+       xxlor           32+$eighty7, 1, 1       # 0x010101..87
+
+       subic           $len,$len,96            # $len-=96
+        vxor           $in0,$twk0,v31          # xor with last round key
+       vncipher        $out0,$out0,v24
+       vncipher        $out1,$out1,v24
+        vsrab          $tmp,$tweak,$seven      # next tweak value
+        vxor           $twk0,$tweak,$rndkey0
+        vaddubm        $tweak,$tweak,$tweak
+       vncipher        $out2,$out2,v24
+       vncipher        $out3,$out3,v24
+       vncipher        $out4,$out4,v24
+       vncipher        $out5,$out5,v24
+
+       subfe.          r0,r0,r0                # borrow?-1:0
+        vand           $tmp,$tmp,$eighty7
+       vncipher        $out0,$out0,v25
+       vncipher        $out1,$out1,v25
+        xxlor          32+$in1, 0, 0
+        vpermxor       $tweak, $tweak, $tmp, $in1
+       vncipher        $out2,$out2,v25
+       vncipher        $out3,$out3,v25
+        vxor           $in1,$twk1,v31
+        vsrab          $tmp,$tweak,$seven      # next tweak value
+        vxor           $twk1,$tweak,$rndkey0
+       vncipher        $out4,$out4,v25
+       vncipher        $out5,$out5,v25
+
+       and             r0,r0,$len
+        vaddubm        $tweak,$tweak,$tweak
+       vncipher        $out0,$out0,v26
+       vncipher        $out1,$out1,v26
+        vand           $tmp,$tmp,$eighty7
+       vncipher        $out2,$out2,v26
+       vncipher        $out3,$out3,v26
+        xxlor          32+$in2, 0, 0
+        vpermxor       $tweak, $tweak, $tmp, $in2
+       vncipher        $out4,$out4,v26
+       vncipher        $out5,$out5,v26
+
+	add		$inp,$inp,r0		# $inp is adjusted in such a
+						# way that at exit from the
+						# loop inX-in5 are loaded
+						# with the last "words"
+        vxor           $in2,$twk2,v31
+        vsrab          $tmp,$tweak,$seven      # next tweak value
+        vxor           $twk2,$tweak,$rndkey0
+        vaddubm        $tweak,$tweak,$tweak
+       vncipher        $out0,$out0,v27
+       vncipher        $out1,$out1,v27
+       vncipher        $out2,$out2,v27
+       vncipher        $out3,$out3,v27
+        vand           $tmp,$tmp,$eighty7
+       vncipher        $out4,$out4,v27
+       vncipher        $out5,$out5,v27
+
+       addi            $key_,$sp,$FRAME+15     # rewind $key_
+        xxlor          32+$in3, 0, 0
+        vpermxor       $tweak, $tweak, $tmp, $in3
+       vncipher        $out0,$out0,v28
+       vncipher        $out1,$out1,v28
+        vxor           $in3,$twk3,v31
+        vsrab          $tmp,$tweak,$seven      # next tweak value
+        vxor           $twk3,$tweak,$rndkey0
+       vncipher        $out2,$out2,v28
+       vncipher        $out3,$out3,v28
+        vaddubm        $tweak,$tweak,$tweak
+       vncipher        $out4,$out4,v28
+       vncipher        $out5,$out5,v28
+       lvx             v24,$x00,$key_          # re-pre-load round[1]
+        vand           $tmp,$tmp,$eighty7
+
+       vncipher        $out0,$out0,v29
+       vncipher        $out1,$out1,v29
+        xxlor          32+$in4, 0, 0
+        vpermxor       $tweak, $tweak, $tmp, $in4
+       vncipher        $out2,$out2,v29
+       vncipher        $out3,$out3,v29
+        vxor           $in4,$twk4,v31
+        vsrab          $tmp,$tweak,$seven      # next tweak value
+        vxor           $twk4,$tweak,$rndkey0
+       vncipher        $out4,$out4,v29
+       vncipher        $out5,$out5,v29
+       lvx             v25,$x10,$key_          # re-pre-load round[2]
+        vaddubm        $tweak,$tweak,$tweak
+
+       vncipher        $out0,$out0,v30
+       vncipher        $out1,$out1,v30
+        vand           $tmp,$tmp,$eighty7
+       vncipher        $out2,$out2,v30
+       vncipher        $out3,$out3,v30
+        xxlor          32+$in5, 0, 0
+        vpermxor       $tweak, $tweak, $tmp, $in5
+       vncipher        $out4,$out4,v30
+       vncipher        $out5,$out5,v30
+        vxor           $in5,$twk5,v31
+        vsrab          $tmp,$tweak,$seven      # next tweak value
+        vxor           $twk5,$tweak,$rndkey0
+
+       vncipherlast    $out0,$out0,$in0
+        lvx_u          $in0,$x00,$inp          # load next input block
+        vaddubm        $tweak,$tweak,$tweak
+       vncipherlast    $out1,$out1,$in1
+        lvx_u          $in1,$x10,$inp
+       vncipherlast    $out2,$out2,$in2
+        le?vperm       $in0,$in0,$in0,$leperm
+        lvx_u          $in2,$x20,$inp
+        vand           $tmp,$tmp,$eighty7
+       vncipherlast    $out3,$out3,$in3
+        le?vperm       $in1,$in1,$in1,$leperm
+        lvx_u          $in3,$x30,$inp
+       vncipherlast    $out4,$out4,$in4
+        le?vperm       $in2,$in2,$in2,$leperm
+        lvx_u          $in4,$x40,$inp
+        xxlor          10, 32+$in0, 32+$in0
+        xxlor          32+$in0, 0, 0
+        vpermxor       $tweak, $tweak, $tmp, $in0
+        xxlor          32+$in0, 10, 10
+       vncipherlast    $out5,$out5,$in5
+        le?vperm       $in3,$in3,$in3,$leperm
+        lvx_u          $in5,$x50,$inp
+        addi           $inp,$inp,0x60
+        le?vperm       $in4,$in4,$in4,$leperm
+        le?vperm       $in5,$in5,$in5,$leperm
+
+       le?vperm        $out0,$out0,$out0,$leperm
+       le?vperm        $out1,$out1,$out1,$leperm
+       stvx_u          $out0,$x00,$out         # store output
+        vxor           $out0,$in0,$twk0
+       le?vperm        $out2,$out2,$out2,$leperm
+       stvx_u          $out1,$x10,$out
+        vxor           $out1,$in1,$twk1
+       le?vperm        $out3,$out3,$out3,$leperm
+       stvx_u          $out2,$x20,$out
+        vxor           $out2,$in2,$twk2
+       le?vperm        $out4,$out4,$out4,$leperm
+       stvx_u          $out3,$x30,$out
+        vxor           $out3,$in3,$twk3
+       le?vperm        $out5,$out5,$out5,$leperm
+       stvx_u          $out4,$x40,$out
+        vxor           $out4,$in4,$twk4
+       stvx_u          $out5,$x50,$out
+        vxor           $out5,$in5,$twk5
+       addi            $out,$out,0x60
+
+       mtctr           $rounds
+       beq             Loop_xts_dec6x          # did $len-=96 borrow?
+
+       xxlor           32+$eighty7, 2, 2       # 0x010101..87
+
+       addic.          $len,$len,0x60
+       beq             Lxts_dec6x_zero
+       cmpwi           $len,0x20
+       blt             Lxts_dec6x_one
+       nop
+       beq             Lxts_dec6x_two
+       cmpwi           $len,0x40
+       blt             Lxts_dec6x_three
+       nop
+       beq             Lxts_dec6x_four
+
+Lxts_dec6x_five:
+       vxor            $out0,$in1,$twk0
+       vxor            $out1,$in2,$twk1
+       vxor            $out2,$in3,$twk2
+       vxor            $out3,$in4,$twk3
+       vxor            $out4,$in5,$twk4
+
+       bl              _aesp8_xts_dec5x
+
+       le?vperm        $out0,$out0,$out0,$leperm
+       vmr             $twk0,$twk5             # unused tweak
+       vxor            $twk1,$tweak,$rndkey0
+       le?vperm        $out1,$out1,$out1,$leperm
+       stvx_u          $out0,$x00,$out         # store output
+       vxor            $out0,$in0,$twk1
+       le?vperm        $out2,$out2,$out2,$leperm
+       stvx_u          $out1,$x10,$out
+       le?vperm        $out3,$out3,$out3,$leperm
+       stvx_u          $out2,$x20,$out
+       le?vperm        $out4,$out4,$out4,$leperm
+       stvx_u          $out3,$x30,$out
+       stvx_u          $out4,$x40,$out
+       addi            $out,$out,0x50
+       bne             Lxts_dec6x_steal
+       b               Lxts_dec6x_done
+
+.align 4
+Lxts_dec6x_four:
+       vxor            $out0,$in2,$twk0
+       vxor            $out1,$in3,$twk1
+       vxor            $out2,$in4,$twk2
+       vxor            $out3,$in5,$twk3
+       vxor            $out4,$out4,$out4
+
+       bl              _aesp8_xts_dec5x
+
+       le?vperm        $out0,$out0,$out0,$leperm
+       vmr             $twk0,$twk4             # unused tweak
+       vmr             $twk1,$twk5
+       le?vperm        $out1,$out1,$out1,$leperm
+       stvx_u          $out0,$x00,$out         # store output
+       vxor            $out0,$in0,$twk5
+       le?vperm        $out2,$out2,$out2,$leperm
+       stvx_u          $out1,$x10,$out
+       le?vperm        $out3,$out3,$out3,$leperm
+       stvx_u          $out2,$x20,$out
+       stvx_u          $out3,$x30,$out
+       addi            $out,$out,0x40
+       bne             Lxts_dec6x_steal
+       b               Lxts_dec6x_done
+
+.align 4
+Lxts_dec6x_three:
+       vxor            $out0,$in3,$twk0
+       vxor            $out1,$in4,$twk1
+       vxor            $out2,$in5,$twk2
+       vxor            $out3,$out3,$out3
+       vxor            $out4,$out4,$out4
+
+       bl              _aesp8_xts_dec5x
+
+       le?vperm        $out0,$out0,$out0,$leperm
+       vmr             $twk0,$twk3             # unused tweak
+       vmr             $twk1,$twk4
+       le?vperm        $out1,$out1,$out1,$leperm
+       stvx_u          $out0,$x00,$out         # store output
+       vxor            $out0,$in0,$twk4
+       le?vperm        $out2,$out2,$out2,$leperm
+       stvx_u          $out1,$x10,$out
+       stvx_u          $out2,$x20,$out
+       addi            $out,$out,0x30
+       bne             Lxts_dec6x_steal
+       b               Lxts_dec6x_done
+
+.align 4
+Lxts_dec6x_two:
+       vxor            $out0,$in4,$twk0
+       vxor            $out1,$in5,$twk1
+       vxor            $out2,$out2,$out2
+       vxor            $out3,$out3,$out3
+       vxor            $out4,$out4,$out4
+
+       bl              _aesp8_xts_dec5x
+
+       le?vperm        $out0,$out0,$out0,$leperm
+       vmr             $twk0,$twk2             # unused tweak
+       vmr             $twk1,$twk3
+       le?vperm        $out1,$out1,$out1,$leperm
+       stvx_u          $out0,$x00,$out         # store output
+       vxor            $out0,$in0,$twk3
+       stvx_u          $out1,$x10,$out
+       addi            $out,$out,0x20
+       bne             Lxts_dec6x_steal
+       b               Lxts_dec6x_done
+
+.align 4
+Lxts_dec6x_one:
+       vxor            $out0,$in5,$twk0
+       nop
+Loop_xts_dec1x:
+       vncipher        $out0,$out0,v24
+       lvx             v24,$x20,$key_          # round[3]
+       addi            $key_,$key_,0x20
+
+       vncipher        $out0,$out0,v25
+       lvx             v25,$x10,$key_          # round[4]
+       bdnz            Loop_xts_dec1x
+
+       subi            r0,$taillen,1
+       vncipher        $out0,$out0,v24
+
+       andi.           r0,r0,16
+       cmpwi           $taillen,0
+       vncipher        $out0,$out0,v25
+
+       sub             $inp,$inp,r0
+       vncipher        $out0,$out0,v26
+
+       lvx_u           $in0,0,$inp
+       vncipher        $out0,$out0,v27
+
+       addi            $key_,$sp,$FRAME+15     # rewind $key_
+       vncipher        $out0,$out0,v28
+       lvx             v24,$x00,$key_          # re-pre-load round[1]
+
+       vncipher        $out0,$out0,v29
+       lvx             v25,$x10,$key_          # re-pre-load round[2]
+        vxor           $twk0,$twk0,v31
+
+       le?vperm        $in0,$in0,$in0,$leperm
+       vncipher        $out0,$out0,v30
+
+       mtctr           $rounds
+       vncipherlast    $out0,$out0,$twk0
+
+       vmr             $twk0,$twk1             # unused tweak
+       vmr             $twk1,$twk2
+       le?vperm        $out0,$out0,$out0,$leperm
+       stvx_u          $out0,$x00,$out         # store output
+       addi            $out,$out,0x10
+       vxor            $out0,$in0,$twk2
+       bne             Lxts_dec6x_steal
+       b               Lxts_dec6x_done
+
+.align 4
+Lxts_dec6x_zero:
+       cmpwi           $taillen,0
+       beq             Lxts_dec6x_done
+
+       lvx_u           $in0,0,$inp
+       le?vperm        $in0,$in0,$in0,$leperm
+       vxor            $out0,$in0,$twk1
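+	# Decrypt-side stealing: the last full block is finished with the
+	# second remaining tweak ($twk1); the stolen partial block then
+	# reuses the first ($twk0) in Loop_xts_dec1x.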
+Lxts_dec6x_steal:
+       vncipher        $out0,$out0,v24
+       lvx             v24,$x20,$key_          # round[3]
+       addi            $key_,$key_,0x20
+
+       vncipher        $out0,$out0,v25
+       lvx             v25,$x10,$key_          # round[4]
+       bdnz            Lxts_dec6x_steal
+
+       add             $inp,$inp,$taillen
+       vncipher        $out0,$out0,v24
+
+       cmpwi           $taillen,0
+       vncipher        $out0,$out0,v25
+
+       lvx_u           $in0,0,$inp
+       vncipher        $out0,$out0,v26
+
+       lvsr            $inpperm,0,$taillen     # $in5 is no more
+       vncipher        $out0,$out0,v27
+
+       addi            $key_,$sp,$FRAME+15     # rewind $key_
+       vncipher        $out0,$out0,v28
+       lvx             v24,$x00,$key_          # re-pre-load round[1]
+
+       vncipher        $out0,$out0,v29
+       lvx             v25,$x10,$key_          # re-pre-load round[2]
+        vxor           $twk1,$twk1,v31
+
+       le?vperm        $in0,$in0,$in0,$leperm
+       vncipher        $out0,$out0,v30
+
+       vperm           $in0,$in0,$in0,$inpperm
+       vncipherlast    $tmp,$out0,$twk1
+
+       le?vperm        $out0,$tmp,$tmp,$leperm
+       le?stvx_u       $out0,0,$out
+       be?stvx_u       $tmp,0,$out
+
+       vxor            $out0,$out0,$out0
+       vspltisb        $out1,-1
+       vperm           $out0,$out0,$out1,$inpperm
+       vsel            $out0,$in0,$tmp,$out0
+       vxor            $out0,$out0,$twk0
+
+       subi            r30,$out,1
+       mtctr           $taillen
+Loop_xts_dec6x_steal:
+       lbzu            r0,1(r30)
+       stb             r0,16(r30)
+       bdnz            Loop_xts_dec6x_steal
+
+       li              $taillen,0
+       mtctr           $rounds
+       b               Loop_xts_dec1x          # one more time...
+
+.align 4
+Lxts_dec6x_done:
+       ${UCMP}i        $ivp,0
+       beq             Lxts_dec6x_ret
+
+       vxor            $tweak,$twk0,$rndkey0
+       le?vperm        $tweak,$tweak,$tweak,$leperm
+       stvx_u          $tweak,0,$ivp
+
+Lxts_dec6x_ret:
+       mtlr            r11
+       li              r10,`$FRAME+15`
+       li              r11,`$FRAME+31`
+       stvx            $seven,r10,$sp          # wipe copies of round keys
+       addi            r10,r10,32
+       stvx            $seven,r11,$sp
+       addi            r11,r11,32
+       stvx            $seven,r10,$sp
+       addi            r10,r10,32
+       stvx            $seven,r11,$sp
+       addi            r11,r11,32
+       stvx            $seven,r10,$sp
+       addi            r10,r10,32
+       stvx            $seven,r11,$sp
+       addi            r11,r11,32
+       stvx            $seven,r10,$sp
+       addi            r10,r10,32
+       stvx            $seven,r11,$sp
+       addi            r11,r11,32
+
+       mtspr           256,$vrsave
+       lvx             v20,r10,$sp             # ABI says so
+       addi            r10,r10,32
+       lvx             v21,r11,$sp
+       addi            r11,r11,32
+       lvx             v22,r10,$sp
+       addi            r10,r10,32
+       lvx             v23,r11,$sp
+       addi            r11,r11,32
+       lvx             v24,r10,$sp
+       addi            r10,r10,32
+       lvx             v25,r11,$sp
+       addi            r11,r11,32
+       lvx             v26,r10,$sp
+       addi            r10,r10,32
+       lvx             v27,r11,$sp
+       addi            r11,r11,32
+       lvx             v28,r10,$sp
+       addi            r10,r10,32
+       lvx             v29,r11,$sp
+       addi            r11,r11,32
+       lvx             v30,r10,$sp
+       lvx             v31,r11,$sp
+       $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+       $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+       $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+       $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+       $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+       $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+       addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+       blr
+       .long           0
+       .byte           0,12,0x04,1,0x80,6,6,0
+       .long           0
+
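+# Decryption analogue of _aesp8_xts_enc5x.  The $taillen test steps $inp
+# back one block when no tail remains, so the speculative pre-load of $in0
+# cannot read past the end of the input.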
+.align 5
+_aesp8_xts_dec5x:
+       vncipher        $out0,$out0,v24
+       vncipher        $out1,$out1,v24
+       vncipher        $out2,$out2,v24
+       vncipher        $out3,$out3,v24
+       vncipher        $out4,$out4,v24
+       lvx             v24,$x20,$key_          # round[3]
+       addi            $key_,$key_,0x20
+
+       vncipher        $out0,$out0,v25
+       vncipher        $out1,$out1,v25
+       vncipher        $out2,$out2,v25
+       vncipher        $out3,$out3,v25
+       vncipher        $out4,$out4,v25
+       lvx             v25,$x10,$key_          # round[4]
+       bdnz            _aesp8_xts_dec5x
+
+       subi            r0,$taillen,1
+       vncipher        $out0,$out0,v24
+       vncipher        $out1,$out1,v24
+       vncipher        $out2,$out2,v24
+       vncipher        $out3,$out3,v24
+       vncipher        $out4,$out4,v24
+
+       andi.           r0,r0,16
+       cmpwi           $taillen,0
+       vncipher        $out0,$out0,v25
+       vncipher        $out1,$out1,v25
+       vncipher        $out2,$out2,v25
+       vncipher        $out3,$out3,v25
+       vncipher        $out4,$out4,v25
+        vxor           $twk0,$twk0,v31
+
+       sub             $inp,$inp,r0
+       vncipher        $out0,$out0,v26
+       vncipher        $out1,$out1,v26
+       vncipher        $out2,$out2,v26
+       vncipher        $out3,$out3,v26
+       vncipher        $out4,$out4,v26
+        vxor           $in1,$twk1,v31
+
+       vncipher        $out0,$out0,v27
+       lvx_u           $in0,0,$inp
+       vncipher        $out1,$out1,v27
+       vncipher        $out2,$out2,v27
+       vncipher        $out3,$out3,v27
+       vncipher        $out4,$out4,v27
+        vxor           $in2,$twk2,v31
+
+       addi            $key_,$sp,$FRAME+15     # rewind $key_
+       vncipher        $out0,$out0,v28
+       vncipher        $out1,$out1,v28
+       vncipher        $out2,$out2,v28
+       vncipher        $out3,$out3,v28
+       vncipher        $out4,$out4,v28
+       lvx             v24,$x00,$key_          # re-pre-load round[1]
+        vxor           $in3,$twk3,v31
+
+       vncipher        $out0,$out0,v29
+       le?vperm        $in0,$in0,$in0,$leperm
+       vncipher        $out1,$out1,v29
+       vncipher        $out2,$out2,v29
+       vncipher        $out3,$out3,v29
+       vncipher        $out4,$out4,v29
+       lvx             v25,$x10,$key_          # re-pre-load round[2]
+        vxor           $in4,$twk4,v31
+
+       vncipher        $out0,$out0,v30
+       vncipher        $out1,$out1,v30
+       vncipher        $out2,$out2,v30
+       vncipher        $out3,$out3,v30
+       vncipher        $out4,$out4,v30
+
+       vncipherlast    $out0,$out0,$twk0
+       vncipherlast    $out1,$out1,$in1
+       vncipherlast    $out2,$out2,$in2
+       vncipherlast    $out3,$out3,$in3
+       vncipherlast    $out4,$out4,$in4
+       mtctr           $rounds
+       blr
+        .long          0
+        .byte          0,12,0x14,0,0,0,0,0
+___
+}}     }}}
+
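+# Post-process $code: evaluate `...` expressions, emit the constants table
+# as endian-corrected .byte lists, and rewrite the '?'-prefixed
+# endian-specific instructions for the selected flavour.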
+my $consts=1;
+foreach(split("\n",$code)) {
+        s/\`([^\`]*)\`/eval($1)/geo;
+
+       # constants table endian-specific conversion
+       if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
+           my $conv=$3;
+           my @bytes=();
+
+           # convert to endian-agnostic format
+           if ($1 eq "long") {
+             foreach (split(/,\s*/,$2)) {
+               my $l = /^0/?oct:int;
+               push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
+             }
+           } else {
+               @bytes = map(/^0/?oct:int,split(/,\s*/,$2));
+           }
+
+           # little-endian conversion
+           if ($flavour =~ /le$/o) {
+               SWITCH: for($conv)  {
+                   /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
+                   /\?rev/ && do   { @bytes=reverse(@bytes);    last; };
+               }
+           }
+
+           #emit
+           print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
+           next;
+       }
+       $consts=0 if (m/Lconsts:/o);    # end of table
+
+       # instructions prefixed with '?' are endian-specific and need
+       # to be adjusted accordingly...
+       if ($flavour =~ /le$/o) {       # little-endian
+           s/le\?//o           or
+           s/be\?/#be#/o       or
+           s/\?lvsr/lvsl/o     or
+           s/\?lvsl/lvsr/o     or
+           s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
+           s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
+           s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
+       } else {                        # big-endian
+           s/le\?/#le#/o       or
+           s/be\?//o           or
+           s/\?([a-z]+)/$1/o;
+       }
+
+        print $_,"\n";
+}
+
+close STDOUT or die "error closing STDOUT: $!";