s390/crypto: add SIMD implementation for ChaCha20
author     Patrick Steuer <patrick.steuer@de.ibm.com>
           Thu, 4 Nov 2021 14:58:51 +0000 (15:58 +0100)
committer  Heiko Carstens <hca@linux.ibm.com>
           Mon, 6 Dec 2021 13:42:24 +0000 (14:42 +0100)
Add an implementation of the ChaCha20 stream cipher (see e.g. RFC 7539)
that makes use of z13's vector instruction set extension.

The original implementation was written by Andy Polyakov and has been
adapted for kernel use.

Four to six blocks are processed in parallel, resulting in a performance
gain for inputs >= 256 bytes.

chacha20-generic

1 operation in 622 cycles (256 bytes)
1 operation in 2346 cycles (1024 bytes)

chacha20-s390

1 operation in 218 cycles (256 bytes)
1 operation in 647 cycles (1024 bytes)
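
In cycles per byte this is roughly 2.4 vs. 0.85 at 256 bytes and 2.3 vs.
0.63 at 1024 bytes (generic vs. s390), i.e. a speedup of about 2.9x and
3.6x respectively.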

Cc: Andy Polyakov <appro@openssl.org>
Reviewed-by: Harald Freudenberger <freude@de.ibm.com>
Signed-off-by: Patrick Steuer <patrick.steuer@de.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
arch/s390/configs/debug_defconfig
arch/s390/configs/defconfig
arch/s390/crypto/Makefile
arch/s390/crypto/chacha-glue.c [new file with mode: 0644]
arch/s390/crypto/chacha-s390.S [new file with mode: 0644]
arch/s390/crypto/chacha-s390.h [new file with mode: 0644]
arch/s390/include/asm/vx-insn.h
drivers/crypto/Kconfig

index b626bc6..416257b 100644 (file)
@@ -768,6 +768,7 @@ CONFIG_CRYPTO_SHA3_256_S390=m
 CONFIG_CRYPTO_SHA3_512_S390=m
 CONFIG_CRYPTO_DES_S390=m
 CONFIG_CRYPTO_AES_S390=m
+CONFIG_CRYPTO_CHACHA_S390=m
 CONFIG_CRYPTO_GHASH_S390=m
 CONFIG_CRYPTO_CRC32_S390=y
 CONFIG_CRYPTO_DEV_VIRTIO=m
index 0056cab..03ab608 100644 (file)
@@ -755,6 +755,7 @@ CONFIG_CRYPTO_SHA3_256_S390=m
 CONFIG_CRYPTO_SHA3_512_S390=m
 CONFIG_CRYPTO_DES_S390=m
 CONFIG_CRYPTO_AES_S390=m
+CONFIG_CRYPTO_CHACHA_S390=m
 CONFIG_CRYPTO_GHASH_S390=m
 CONFIG_CRYPTO_CRC32_S390=y
 CONFIG_CRYPTO_DEV_VIRTIO=m
index 12889d4..c63abfe 100644 (file)
@@ -11,9 +11,11 @@ obj-$(CONFIG_CRYPTO_SHA3_512_S390) += sha3_512_s390.o sha_common.o
 obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o
 obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o
 obj-$(CONFIG_CRYPTO_PAES_S390) += paes_s390.o
+obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o
 obj-$(CONFIG_S390_PRNG) += prng.o
 obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o
 obj-$(CONFIG_CRYPTO_CRC32_S390) += crc32-vx_s390.o
 obj-$(CONFIG_ARCH_RANDOM) += arch_random.o
 
 crc32-vx_s390-y := crc32-vx.o crc32le-vx.o crc32be-vx.o
+chacha_s390-y := chacha-glue.o chacha-s390.o
diff --git a/arch/s390/crypto/chacha-glue.c b/arch/s390/crypto/chacha-glue.c
new file mode 100644 (file)
index 0000000..ccfff73
--- /dev/null
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * s390 ChaCha stream cipher.
+ *
+ * Copyright IBM Corp. 2021
+ */
+
+#define KMSG_COMPONENT "chacha_s390"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <crypto/internal/chacha.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/algapi.h>
+#include <linux/cpufeature.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sizes.h>
+#include <asm/fpu/api.h>
+#include "chacha-s390.h"
+
+static void chacha20_crypt_s390(u32 *state, u8 *dst, const u8 *src,
+                               unsigned int nbytes, const u32 *key,
+                               u32 *counter)
+{
+       struct kernel_fpu vxstate;
+
+       kernel_fpu_begin(&vxstate, KERNEL_VXR);
+       chacha20_vx(dst, src, nbytes, key, counter);
+       kernel_fpu_end(&vxstate, KERNEL_VXR);
+
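+       /* a partial trailing block still consumes one full counter value */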
+       *counter += round_up(nbytes, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
+}
+
+static int chacha20_s390(struct skcipher_request *req)
+{
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+       u32 state[CHACHA_STATE_WORDS] __aligned(16);
+       struct skcipher_walk walk;
+       unsigned int nbytes;
+       int rc;
+
+       rc = skcipher_walk_virt(&walk, req, false);
+       chacha_init_generic(state, ctx->key, req->iv);
+
+       while (walk.nbytes > 0) {
+               nbytes = walk.nbytes;
+               if (nbytes < walk.total)
+                       nbytes = round_down(nbytes, walk.stride);
+
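+               /* a single (possibly partial) block goes to the generic code */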
+               if (nbytes <= CHACHA_BLOCK_SIZE) {
+                       chacha_crypt_generic(state, walk.dst.virt.addr,
+                                            walk.src.virt.addr, nbytes,
+                                            ctx->nrounds);
+               } else {
+                       chacha20_crypt_s390(state, walk.dst.virt.addr,
+                                           walk.src.virt.addr, nbytes,
+                                           &state[4], &state[12]);
+               }
+               rc = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+       }
+       return rc;
+}
+
+static struct skcipher_alg chacha_algs[] = {
+       {
+               .base.cra_name          = "chacha20",
+               .base.cra_driver_name   = "chacha20-s390",
+               .base.cra_priority      = 900,
+               .base.cra_blocksize     = 1,
+               .base.cra_ctxsize       = sizeof(struct chacha_ctx),
+               .base.cra_module        = THIS_MODULE,
+
+               .min_keysize            = CHACHA_KEY_SIZE,
+               .max_keysize            = CHACHA_KEY_SIZE,
+               .ivsize                 = CHACHA_IV_SIZE,
+               .chunksize              = CHACHA_BLOCK_SIZE,
+               .setkey                 = chacha20_setkey,
+               .encrypt                = chacha20_s390,
+               .decrypt                = chacha20_s390,
+       }
+};
+
+static int __init chacha_mod_init(void)
+{
+       return crypto_register_skciphers(chacha_algs, ARRAY_SIZE(chacha_algs));
+}
+
+static void __exit chacha_mod_fini(void)
+{
+       crypto_unregister_skciphers(chacha_algs, ARRAY_SIZE(chacha_algs));
+}
+
+module_cpu_feature_match(VXRS, chacha_mod_init);
+module_exit(chacha_mod_fini);
+
+MODULE_DESCRIPTION("ChaCha20 stream cipher");
+MODULE_LICENSE("GPL v2");
+
+MODULE_ALIAS_CRYPTO("chacha20");
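
As a usage illustration (not part of this patch): the registered "chacha20"
skcipher can be exercised from user space through the AF_ALG socket
interface. The following is a minimal sketch with all error handling
omitted, using an all-zero demo key and IV; the 16-byte IV is the 32-bit
little-endian initial block counter followed by the 96-bit nonce:

    #include <string.h>
    #include <unistd.h>
    #include <sys/socket.h>
    #include <sys/uio.h>
    #include <linux/if_alg.h>

    #ifndef SOL_ALG
    #define SOL_ALG 279                     /* not exported by every libc */
    #endif

    int main(void)
    {
            struct sockaddr_alg sa = {
                    .salg_family = AF_ALG,
                    .salg_type   = "skcipher",
                    .salg_name   = "chacha20",
            };
            unsigned char key[32]  = { 0 }; /* all-zero demo key */
            unsigned char iv[16]   = { 0 }; /* LE32 counter || 96-bit nonce */
            unsigned char buf[256] = { 0 }; /* plaintext in, ciphertext out */
            char cbuf[CMSG_SPACE(4) + CMSG_SPACE(4 + sizeof(iv))] = { 0 };
            struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
            struct msghdr msg = {
                    .msg_control    = cbuf,
                    .msg_controllen = sizeof(cbuf),
                    .msg_iov        = &iov,
                    .msg_iovlen     = 1,
            };
            struct af_alg_iv *aiv;
            struct cmsghdr *cmsg;
            int tfmfd, opfd;

            tfmfd = socket(AF_ALG, SOCK_SEQPACKET, 0);
            bind(tfmfd, (struct sockaddr *)&sa, sizeof(sa));
            setsockopt(tfmfd, SOL_ALG, ALG_SET_KEY, key, sizeof(key));
            opfd = accept(tfmfd, NULL, 0);

            /* first control message: operation (encrypt) */
            cmsg = CMSG_FIRSTHDR(&msg);
            cmsg->cmsg_level = SOL_ALG;
            cmsg->cmsg_type  = ALG_SET_OP;
            cmsg->cmsg_len   = CMSG_LEN(4);
            *(__u32 *)CMSG_DATA(cmsg) = ALG_OP_ENCRYPT;

            /* second control message: the IV */
            cmsg = CMSG_NXTHDR(&msg, cmsg);
            cmsg->cmsg_level = SOL_ALG;
            cmsg->cmsg_type  = ALG_SET_IV;
            cmsg->cmsg_len   = CMSG_LEN(4 + sizeof(iv));
            aiv = (struct af_alg_iv *)CMSG_DATA(cmsg);
            aiv->ivlen = sizeof(iv);
            memcpy(aiv->iv, iv, sizeof(iv));

            sendmsg(opfd, &msg, 0);         /* submit the plaintext */
            read(opfd, buf, sizeof(buf));   /* read back the ciphertext */

            close(opfd);
            close(tfmfd);
            return 0;
    }
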
diff --git a/arch/s390/crypto/chacha-s390.S b/arch/s390/crypto/chacha-s390.S
new file mode 100644 (file)
index 0000000..9b03362
--- /dev/null
@@ -0,0 +1,907 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Original implementation written by Andy Polyakov, @dot-asm.
+ * This is an adaptation of the original code for kernel use.
+ *
+ * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+#include <asm/nospec-insn.h>
+#include <asm/vx-insn.h>
+
+#define SP     %r15
+#define FRAME  (16 * 8 + 4 * 8)
+
+.data
+.align 32
+
+.Lsigma:
+.long  0x61707865,0x3320646e,0x79622d32,0x6b206574     # endian-neutral
+.long  1,0,0,0
+.long  2,0,0,0
+.long  3,0,0,0
+.long  0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c     # byte swap
+
+.long  0,1,2,3
+.long  0x61707865,0x61707865,0x61707865,0x61707865     # smashed sigma
+.long  0x3320646e,0x3320646e,0x3320646e,0x3320646e
+.long  0x79622d32,0x79622d32,0x79622d32,0x79622d32
+.long  0x6b206574,0x6b206574,0x6b206574,0x6b206574
+
+.previous
+
+       GEN_BR_THUNK %r14
+
+.text
+
+#############################################################################
+# void chacha20_vx_4x(u8 *out, const u8 *inp, size_t len,
+#                     const u32 *key, const u32 *counter)
+
+#define        OUT             %r2
+#define        INP             %r3
+#define        LEN             %r4
+#define        KEY             %r5
+#define        COUNTER         %r6
+
+#define BEPERM         %v31
+#define CTR            %v26
+
+#define K0             %v16
+#define K1             %v17
+#define K2             %v18
+#define K3             %v19
+
+#define XA0            %v0
+#define XA1            %v1
+#define XA2            %v2
+#define XA3            %v3
+
+#define XB0            %v4
+#define XB1            %v5
+#define XB2            %v6
+#define XB3            %v7
+
+#define XC0            %v8
+#define XC1            %v9
+#define XC2            %v10
+#define XC3            %v11
+
+#define XD0            %v12
+#define XD1            %v13
+#define XD2            %v14
+#define XD3            %v15
+
+#define XT0            %v27
+#define XT1            %v28
+#define XT2            %v29
+#define XT3            %v30
+
+ENTRY(chacha20_vx_4x)
+       stmg    %r6,%r7,6*8(SP)
+
+       larl    %r7,.Lsigma
+       lhi     %r0,10
+       lhi     %r1,0
+
+       VL      K0,0,,%r7               # load sigma
+       VL      K1,0,,KEY               # load key
+       VL      K2,16,,KEY
+       VL      K3,0,,COUNTER           # load counter
+
+       VL      BEPERM,0x40,,%r7
+       VL      CTR,0x50,,%r7
+
+       VLM     XA0,XA3,0x60,%r7,4      # load [smashed] sigma
+
+       VREPF   XB0,K1,0                # smash the key
+       VREPF   XB1,K1,1
+       VREPF   XB2,K1,2
+       VREPF   XB3,K1,3
+
+       VREPF   XD0,K3,0
+       VREPF   XD1,K3,1
+       VREPF   XD2,K3,2
+       VREPF   XD3,K3,3
+       VAF     XD0,XD0,CTR
+
+       VREPF   XC0,K2,0
+       VREPF   XC1,K2,1
+       VREPF   XC2,K2,2
+       VREPF   XC3,K2,3
+
+.Loop_4x:
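+       # Each VAF/VX/VERLLF triplet below is one step of the ChaCha
+       # quarter-round  a += b; d ^= a; d <<<= 16;  c += d; b ^= c; b <<<= 12;
+       #                a += b; d ^= a; d <<<=  8;  c += d; b ^= c; b <<<=  7;
+       # applied to four interleaved states at once (column round first).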
+       VAF     XA0,XA0,XB0
+       VX      XD0,XD0,XA0
+       VERLLF  XD0,XD0,16
+
+       VAF     XA1,XA1,XB1
+       VX      XD1,XD1,XA1
+       VERLLF  XD1,XD1,16
+
+       VAF     XA2,XA2,XB2
+       VX      XD2,XD2,XA2
+       VERLLF  XD2,XD2,16
+
+       VAF     XA3,XA3,XB3
+       VX      XD3,XD3,XA3
+       VERLLF  XD3,XD3,16
+
+       VAF     XC0,XC0,XD0
+       VX      XB0,XB0,XC0
+       VERLLF  XB0,XB0,12
+
+       VAF     XC1,XC1,XD1
+       VX      XB1,XB1,XC1
+       VERLLF  XB1,XB1,12
+
+       VAF     XC2,XC2,XD2
+       VX      XB2,XB2,XC2
+       VERLLF  XB2,XB2,12
+
+       VAF     XC3,XC3,XD3
+       VX      XB3,XB3,XC3
+       VERLLF  XB3,XB3,12
+
+       VAF     XA0,XA0,XB0
+       VX      XD0,XD0,XA0
+       VERLLF  XD0,XD0,8
+
+       VAF     XA1,XA1,XB1
+       VX      XD1,XD1,XA1
+       VERLLF  XD1,XD1,8
+
+       VAF     XA2,XA2,XB2
+       VX      XD2,XD2,XA2
+       VERLLF  XD2,XD2,8
+
+       VAF     XA3,XA3,XB3
+       VX      XD3,XD3,XA3
+       VERLLF  XD3,XD3,8
+
+       VAF     XC0,XC0,XD0
+       VX      XB0,XB0,XC0
+       VERLLF  XB0,XB0,7
+
+       VAF     XC1,XC1,XD1
+       VX      XB1,XB1,XC1
+       VERLLF  XB1,XB1,7
+
+       VAF     XC2,XC2,XD2
+       VX      XB2,XB2,XC2
+       VERLLF  XB2,XB2,7
+
+       VAF     XC3,XC3,XD3
+       VX      XB3,XB3,XC3
+       VERLLF  XB3,XB3,7
+
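+       # diagonal round: same quarter-round, B/C/D lanes rotated by one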
+       VAF     XA0,XA0,XB1
+       VX      XD3,XD3,XA0
+       VERLLF  XD3,XD3,16
+
+       VAF     XA1,XA1,XB2
+       VX      XD0,XD0,XA1
+       VERLLF  XD0,XD0,16
+
+       VAF     XA2,XA2,XB3
+       VX      XD1,XD1,XA2
+       VERLLF  XD1,XD1,16
+
+       VAF     XA3,XA3,XB0
+       VX      XD2,XD2,XA3
+       VERLLF  XD2,XD2,16
+
+       VAF     XC2,XC2,XD3
+       VX      XB1,XB1,XC2
+       VERLLF  XB1,XB1,12
+
+       VAF     XC3,XC3,XD0
+       VX      XB2,XB2,XC3
+       VERLLF  XB2,XB2,12
+
+       VAF     XC0,XC0,XD1
+       VX      XB3,XB3,XC0
+       VERLLF  XB3,XB3,12
+
+       VAF     XC1,XC1,XD2
+       VX      XB0,XB0,XC1
+       VERLLF  XB0,XB0,12
+
+       VAF     XA0,XA0,XB1
+       VX      XD3,XD3,XA0
+       VERLLF  XD3,XD3,8
+
+       VAF     XA1,XA1,XB2
+       VX      XD0,XD0,XA1
+       VERLLF  XD0,XD0,8
+
+       VAF     XA2,XA2,XB3
+       VX      XD1,XD1,XA2
+       VERLLF  XD1,XD1,8
+
+       VAF     XA3,XA3,XB0
+       VX      XD2,XD2,XA3
+       VERLLF  XD2,XD2,8
+
+       VAF     XC2,XC2,XD3
+       VX      XB1,XB1,XC2
+       VERLLF  XB1,XB1,7
+
+       VAF     XC3,XC3,XD0
+       VX      XB2,XB2,XC3
+       VERLLF  XB2,XB2,7
+
+       VAF     XC0,XC0,XD1
+       VX      XB3,XB3,XC0
+       VERLLF  XB3,XB3,7
+
+       VAF     XC1,XC1,XD2
+       VX      XB0,XB0,XC1
+       VERLLF  XB0,XB0,7
+       brct    %r0,.Loop_4x
+
+       VAF     XD0,XD0,CTR
+
+       VMRHF   XT0,XA0,XA1             # transpose data
+       VMRHF   XT1,XA2,XA3
+       VMRLF   XT2,XA0,XA1
+       VMRLF   XT3,XA2,XA3
+       VPDI    XA0,XT0,XT1,0b0000
+       VPDI    XA1,XT0,XT1,0b0101
+       VPDI    XA2,XT2,XT3,0b0000
+       VPDI    XA3,XT2,XT3,0b0101
+
+       VMRHF   XT0,XB0,XB1
+       VMRHF   XT1,XB2,XB3
+       VMRLF   XT2,XB0,XB1
+       VMRLF   XT3,XB2,XB3
+       VPDI    XB0,XT0,XT1,0b0000
+       VPDI    XB1,XT0,XT1,0b0101
+       VPDI    XB2,XT2,XT3,0b0000
+       VPDI    XB3,XT2,XT3,0b0101
+
+       VMRHF   XT0,XC0,XC1
+       VMRHF   XT1,XC2,XC3
+       VMRLF   XT2,XC0,XC1
+       VMRLF   XT3,XC2,XC3
+       VPDI    XC0,XT0,XT1,0b0000
+       VPDI    XC1,XT0,XT1,0b0101
+       VPDI    XC2,XT2,XT3,0b0000
+       VPDI    XC3,XT2,XT3,0b0101
+
+       VMRHF   XT0,XD0,XD1
+       VMRHF   XT1,XD2,XD3
+       VMRLF   XT2,XD0,XD1
+       VMRLF   XT3,XD2,XD3
+       VPDI    XD0,XT0,XT1,0b0000
+       VPDI    XD1,XT0,XT1,0b0101
+       VPDI    XD2,XT2,XT3,0b0000
+       VPDI    XD3,XT2,XT3,0b0101
+
+       VAF     XA0,XA0,K0
+       VAF     XB0,XB0,K1
+       VAF     XC0,XC0,K2
+       VAF     XD0,XD0,K3
+
+       VPERM   XA0,XA0,XA0,BEPERM
+       VPERM   XB0,XB0,XB0,BEPERM
+       VPERM   XC0,XC0,XC0,BEPERM
+       VPERM   XD0,XD0,XD0,BEPERM
+
+       VLM     XT0,XT3,0,INP,0
+
+       VX      XT0,XT0,XA0
+       VX      XT1,XT1,XB0
+       VX      XT2,XT2,XC0
+       VX      XT3,XT3,XD0
+
+       VSTM    XT0,XT3,0,OUT,0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+
+       VAF     XA0,XA1,K0
+       VAF     XB0,XB1,K1
+       VAF     XC0,XC1,K2
+       VAF     XD0,XD1,K3
+
+       VPERM   XA0,XA0,XA0,BEPERM
+       VPERM   XB0,XB0,XB0,BEPERM
+       VPERM   XC0,XC0,XC0,BEPERM
+       VPERM   XD0,XD0,XD0,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_4x
+
+       VLM     XT0,XT3,0,INP,0
+
+       VX      XT0,XT0,XA0
+       VX      XT1,XT1,XB0
+       VX      XT2,XT2,XC0
+       VX      XT3,XT3,XD0
+
+       VSTM    XT0,XT3,0,OUT,0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+       je      .Ldone_4x
+
+       VAF     XA0,XA2,K0
+       VAF     XB0,XB2,K1
+       VAF     XC0,XC2,K2
+       VAF     XD0,XD2,K3
+
+       VPERM   XA0,XA0,XA0,BEPERM
+       VPERM   XB0,XB0,XB0,BEPERM
+       VPERM   XC0,XC0,XC0,BEPERM
+       VPERM   XD0,XD0,XD0,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_4x
+
+       VLM     XT0,XT3,0,INP,0
+
+       VX      XT0,XT0,XA0
+       VX      XT1,XT1,XB0
+       VX      XT2,XT2,XC0
+       VX      XT3,XT3,XD0
+
+       VSTM    XT0,XT3,0,OUT,0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+       je      .Ldone_4x
+
+       VAF     XA0,XA3,K0
+       VAF     XB0,XB3,K1
+       VAF     XC0,XC3,K2
+       VAF     XD0,XD3,K3
+
+       VPERM   XA0,XA0,XA0,BEPERM
+       VPERM   XB0,XB0,XB0,BEPERM
+       VPERM   XC0,XC0,XC0,BEPERM
+       VPERM   XD0,XD0,XD0,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_4x
+
+       VLM     XT0,XT3,0,INP,0
+
+       VX      XT0,XT0,XA0
+       VX      XT1,XT1,XB0
+       VX      XT2,XT2,XC0
+       VX      XT3,XT3,XD0
+
+       VSTM    XT0,XT3,0,OUT,0
+
+.Ldone_4x:
+       lmg     %r6,%r7,6*8(SP)
+       BR_EX   %r14
+
+.Ltail_4x:
+       VLR     XT0,XC0
+       VLR     XT1,XD0
+
+       VST     XA0,8*8+0x00,,SP
+       VST     XB0,8*8+0x10,,SP
+       VST     XT0,8*8+0x20,,SP
+       VST     XT1,8*8+0x30,,SP
+
+       lghi    %r1,0
+
+.Loop_tail_4x:
+       llgc    %r5,0(%r1,INP)
+       llgc    %r6,8*8(%r1,SP)
+       xr      %r6,%r5
+       stc     %r6,0(%r1,OUT)
+       la      %r1,1(%r1)
+       brct    LEN,.Loop_tail_4x
+
+       lmg     %r6,%r7,6*8(SP)
+       BR_EX   %r14
+ENDPROC(chacha20_vx_4x)
+
+#undef OUT
+#undef INP
+#undef LEN
+#undef KEY
+#undef COUNTER
+
+#undef BEPERM
+
+#undef K0
+#undef K1
+#undef K2
+#undef K3
+
+
+#############################################################################
+# void chacha20_vx(u8 *out, const u8 *inp, size_t len,
+#                  const u32 *key, const u32 *counter)
+
+#define        OUT             %r2
+#define        INP             %r3
+#define        LEN             %r4
+#define        KEY             %r5
+#define        COUNTER         %r6
+
+#define BEPERM         %v31
+
+#define K0             %v27
+#define K1             %v24
+#define K2             %v25
+#define K3             %v26
+
+#define A0             %v0
+#define B0             %v1
+#define C0             %v2
+#define D0             %v3
+
+#define A1             %v4
+#define B1             %v5
+#define C1             %v6
+#define D1             %v7
+
+#define A2             %v8
+#define B2             %v9
+#define C2             %v10
+#define D2             %v11
+
+#define A3             %v12
+#define B3             %v13
+#define C3             %v14
+#define D3             %v15
+
+#define A4             %v16
+#define B4             %v17
+#define C4             %v18
+#define D4             %v19
+
+#define A5             %v20
+#define B5             %v21
+#define C5             %v22
+#define D5             %v23
+
+#define T0             %v27
+#define T1             %v28
+#define T2             %v29
+#define T3             %v30
+
+ENTRY(chacha20_vx)
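+       # inputs of up to 256 bytes (four 64-byte blocks) go to the 4x code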
+       clgfi   LEN,256
+       jle     chacha20_vx_4x
+       stmg    %r6,%r7,6*8(SP)
+
+       lghi    %r1,-FRAME
+       lgr     %r0,SP
+       la      SP,0(%r1,SP)
+       stg     %r0,0(SP)               # back-chain
+
+       larl    %r7,.Lsigma
+       lhi     %r0,10
+
+       VLM     K1,K2,0,KEY,0           # load key
+       VL      K3,0,,COUNTER           # load counter
+
+       VLM     K0,BEPERM,0,%r7,4       # load sigma, increments, ...
+
+.Loop_outer_vx:
+       VLR     A0,K0
+       VLR     B0,K1
+       VLR     A1,K0
+       VLR     B1,K1
+       VLR     A2,K0
+       VLR     B2,K1
+       VLR     A3,K0
+       VLR     B3,K1
+       VLR     A4,K0
+       VLR     B4,K1
+       VLR     A5,K0
+       VLR     B5,K1
+
+       VLR     D0,K3
+       VAF     D1,K3,T1                # K[3]+1
+       VAF     D2,K3,T2                # K[3]+2
+       VAF     D3,K3,T3                # K[3]+3
+       VAF     D4,D2,T2                # K[3]+4
+       VAF     D5,D2,T3                # K[3]+5
+
+       VLR     C0,K2
+       VLR     C1,K2
+       VLR     C2,K2
+       VLR     C3,K2
+       VLR     C4,K2
+       VLR     C5,K2
+
+       VLR     T1,D1
+       VLR     T2,D2
+       VLR     T3,D3
+
+.Loop_vx:
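+       # six states are processed at once, one 4-word row per vector register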
+       VAF     A0,A0,B0
+       VAF     A1,A1,B1
+       VAF     A2,A2,B2
+       VAF     A3,A3,B3
+       VAF     A4,A4,B4
+       VAF     A5,A5,B5
+       VX      D0,D0,A0
+       VX      D1,D1,A1
+       VX      D2,D2,A2
+       VX      D3,D3,A3
+       VX      D4,D4,A4
+       VX      D5,D5,A5
+       VERLLF  D0,D0,16
+       VERLLF  D1,D1,16
+       VERLLF  D2,D2,16
+       VERLLF  D3,D3,16
+       VERLLF  D4,D4,16
+       VERLLF  D5,D5,16
+
+       VAF     C0,C0,D0
+       VAF     C1,C1,D1
+       VAF     C2,C2,D2
+       VAF     C3,C3,D3
+       VAF     C4,C4,D4
+       VAF     C5,C5,D5
+       VX      B0,B0,C0
+       VX      B1,B1,C1
+       VX      B2,B2,C2
+       VX      B3,B3,C3
+       VX      B4,B4,C4
+       VX      B5,B5,C5
+       VERLLF  B0,B0,12
+       VERLLF  B1,B1,12
+       VERLLF  B2,B2,12
+       VERLLF  B3,B3,12
+       VERLLF  B4,B4,12
+       VERLLF  B5,B5,12
+
+       VAF     A0,A0,B0
+       VAF     A1,A1,B1
+       VAF     A2,A2,B2
+       VAF     A3,A3,B3
+       VAF     A4,A4,B4
+       VAF     A5,A5,B5
+       VX      D0,D0,A0
+       VX      D1,D1,A1
+       VX      D2,D2,A2
+       VX      D3,D3,A3
+       VX      D4,D4,A4
+       VX      D5,D5,A5
+       VERLLF  D0,D0,8
+       VERLLF  D1,D1,8
+       VERLLF  D2,D2,8
+       VERLLF  D3,D3,8
+       VERLLF  D4,D4,8
+       VERLLF  D5,D5,8
+
+       VAF     C0,C0,D0
+       VAF     C1,C1,D1
+       VAF     C2,C2,D2
+       VAF     C3,C3,D3
+       VAF     C4,C4,D4
+       VAF     C5,C5,D5
+       VX      B0,B0,C0
+       VX      B1,B1,C1
+       VX      B2,B2,C2
+       VX      B3,B3,C3
+       VX      B4,B4,C4
+       VX      B5,B5,C5
+       VERLLF  B0,B0,7
+       VERLLF  B1,B1,7
+       VERLLF  B2,B2,7
+       VERLLF  B3,B3,7
+       VERLLF  B4,B4,7
+       VERLLF  B5,B5,7
+
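+       # rotate rows B/C/D by one/two/three words: columns become diagonals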
+       VSLDB   C0,C0,C0,8
+       VSLDB   C1,C1,C1,8
+       VSLDB   C2,C2,C2,8
+       VSLDB   C3,C3,C3,8
+       VSLDB   C4,C4,C4,8
+       VSLDB   C5,C5,C5,8
+       VSLDB   B0,B0,B0,4
+       VSLDB   B1,B1,B1,4
+       VSLDB   B2,B2,B2,4
+       VSLDB   B3,B3,B3,4
+       VSLDB   B4,B4,B4,4
+       VSLDB   B5,B5,B5,4
+       VSLDB   D0,D0,D0,12
+       VSLDB   D1,D1,D1,12
+       VSLDB   D2,D2,D2,12
+       VSLDB   D3,D3,D3,12
+       VSLDB   D4,D4,D4,12
+       VSLDB   D5,D5,D5,12
+
+       VAF     A0,A0,B0
+       VAF     A1,A1,B1
+       VAF     A2,A2,B2
+       VAF     A3,A3,B3
+       VAF     A4,A4,B4
+       VAF     A5,A5,B5
+       VX      D0,D0,A0
+       VX      D1,D1,A1
+       VX      D2,D2,A2
+       VX      D3,D3,A3
+       VX      D4,D4,A4
+       VX      D5,D5,A5
+       VERLLF  D0,D0,16
+       VERLLF  D1,D1,16
+       VERLLF  D2,D2,16
+       VERLLF  D3,D3,16
+       VERLLF  D4,D4,16
+       VERLLF  D5,D5,16
+
+       VAF     C0,C0,D0
+       VAF     C1,C1,D1
+       VAF     C2,C2,D2
+       VAF     C3,C3,D3
+       VAF     C4,C4,D4
+       VAF     C5,C5,D5
+       VX      B0,B0,C0
+       VX      B1,B1,C1
+       VX      B2,B2,C2
+       VX      B3,B3,C3
+       VX      B4,B4,C4
+       VX      B5,B5,C5
+       VERLLF  B0,B0,12
+       VERLLF  B1,B1,12
+       VERLLF  B2,B2,12
+       VERLLF  B3,B3,12
+       VERLLF  B4,B4,12
+       VERLLF  B5,B5,12
+
+       VAF     A0,A0,B0
+       VAF     A1,A1,B1
+       VAF     A2,A2,B2
+       VAF     A3,A3,B3
+       VAF     A4,A4,B4
+       VAF     A5,A5,B5
+       VX      D0,D0,A0
+       VX      D1,D1,A1
+       VX      D2,D2,A2
+       VX      D3,D3,A3
+       VX      D4,D4,A4
+       VX      D5,D5,A5
+       VERLLF  D0,D0,8
+       VERLLF  D1,D1,8
+       VERLLF  D2,D2,8
+       VERLLF  D3,D3,8
+       VERLLF  D4,D4,8
+       VERLLF  D5,D5,8
+
+       VAF     C0,C0,D0
+       VAF     C1,C1,D1
+       VAF     C2,C2,D2
+       VAF     C3,C3,D3
+       VAF     C4,C4,D4
+       VAF     C5,C5,D5
+       VX      B0,B0,C0
+       VX      B1,B1,C1
+       VX      B2,B2,C2
+       VX      B3,B3,C3
+       VX      B4,B4,C4
+       VX      B5,B5,C5
+       VERLLF  B0,B0,7
+       VERLLF  B1,B1,7
+       VERLLF  B2,B2,7
+       VERLLF  B3,B3,7
+       VERLLF  B4,B4,7
+       VERLLF  B5,B5,7
+
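+       # rotate back: diagonals become columns again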
+       VSLDB   C0,C0,C0,8
+       VSLDB   C1,C1,C1,8
+       VSLDB   C2,C2,C2,8
+       VSLDB   C3,C3,C3,8
+       VSLDB   C4,C4,C4,8
+       VSLDB   C5,C5,C5,8
+       VSLDB   B0,B0,B0,12
+       VSLDB   B1,B1,B1,12
+       VSLDB   B2,B2,B2,12
+       VSLDB   B3,B3,B3,12
+       VSLDB   B4,B4,B4,12
+       VSLDB   B5,B5,B5,12
+       VSLDB   D0,D0,D0,4
+       VSLDB   D1,D1,D1,4
+       VSLDB   D2,D2,D2,4
+       VSLDB   D3,D3,D3,4
+       VSLDB   D4,D4,D4,4
+       VSLDB   D5,D5,D5,4
+       brct    %r0,.Loop_vx
+
+       VAF     A0,A0,K0
+       VAF     B0,B0,K1
+       VAF     C0,C0,K2
+       VAF     D0,D0,K3
+       VAF     A1,A1,K0
+       VAF     D1,D1,T1                # +K[3]+1
+
+       VPERM   A0,A0,A0,BEPERM
+       VPERM   B0,B0,B0,BEPERM
+       VPERM   C0,C0,C0,BEPERM
+       VPERM   D0,D0,D0,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_vx
+
+       VAF     D2,D2,T2                # +K[3]+2
+       VAF     D3,D3,T3                # +K[3]+3
+       VLM     T0,T3,0,INP,0
+
+       VX      A0,A0,T0
+       VX      B0,B0,T1
+       VX      C0,C0,T2
+       VX      D0,D0,T3
+
+       VLM     K0,T3,0,%r7,4           # re-load sigma and increments
+
+       VSTM    A0,D0,0,OUT,0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+       je      .Ldone_vx
+
+       VAF     B1,B1,K1
+       VAF     C1,C1,K2
+
+       VPERM   A0,A1,A1,BEPERM
+       VPERM   B0,B1,B1,BEPERM
+       VPERM   C0,C1,C1,BEPERM
+       VPERM   D0,D1,D1,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_vx
+
+       VLM     A1,D1,0,INP,0
+
+       VX      A0,A0,A1
+       VX      B0,B0,B1
+       VX      C0,C0,C1
+       VX      D0,D0,D1
+
+       VSTM    A0,D0,0,OUT,0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+       je      .Ldone_vx
+
+       VAF     A2,A2,K0
+       VAF     B2,B2,K1
+       VAF     C2,C2,K2
+
+       VPERM   A0,A2,A2,BEPERM
+       VPERM   B0,B2,B2,BEPERM
+       VPERM   C0,C2,C2,BEPERM
+       VPERM   D0,D2,D2,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_vx
+
+       VLM     A1,D1,0,INP,0
+
+       VX      A0,A0,A1
+       VX      B0,B0,B1
+       VX      C0,C0,C1
+       VX      D0,D0,D1
+
+       VSTM    A0,D0,0,OUT,0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+       je      .Ldone_vx
+
+       VAF     A3,A3,K0
+       VAF     B3,B3,K1
+       VAF     C3,C3,K2
+       VAF     D2,K3,T3                # K[3]+3
+
+       VPERM   A0,A3,A3,BEPERM
+       VPERM   B0,B3,B3,BEPERM
+       VPERM   C0,C3,C3,BEPERM
+       VPERM   D0,D3,D3,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_vx
+
+       VAF     D3,D2,T1                # K[3]+4
+       VLM     A1,D1,0,INP,0
+
+       VX      A0,A0,A1
+       VX      B0,B0,B1
+       VX      C0,C0,C1
+       VX      D0,D0,D1
+
+       VSTM    A0,D0,0,OUT,0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+       je      .Ldone_vx
+
+       VAF     A4,A4,K0
+       VAF     B4,B4,K1
+       VAF     C4,C4,K2
+       VAF     D4,D4,D3                # +K[3]+4
+       VAF     D3,D3,T1                # K[3]+5
+       VAF     K3,D2,T3                # K[3]+=6
+
+       VPERM   A0,A4,A4,BEPERM
+       VPERM   B0,B4,B4,BEPERM
+       VPERM   C0,C4,C4,BEPERM
+       VPERM   D0,D4,D4,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_vx
+
+       VLM     A1,D1,0,INP,0
+
+       VX      A0,A0,A1
+       VX      B0,B0,B1
+       VX      C0,C0,C1
+       VX      D0,D0,D1
+
+       VSTM    A0,D0,0,OUT,0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+       je      .Ldone_vx
+
+       VAF     A5,A5,K0
+       VAF     B5,B5,K1
+       VAF     C5,C5,K2
+       VAF     D5,D5,D3                # +K[3]+5
+
+       VPERM   A0,A5,A5,BEPERM
+       VPERM   B0,B5,B5,BEPERM
+       VPERM   C0,C5,C5,BEPERM
+       VPERM   D0,D5,D5,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_vx
+
+       VLM     A1,D1,0,INP,0
+
+       VX      A0,A0,A1
+       VX      B0,B0,B1
+       VX      C0,C0,C1
+       VX      D0,D0,D1
+
+       VSTM    A0,D0,0,OUT,0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       lhi     %r0,10
+       aghi    LEN,-0x40
+       jne     .Loop_outer_vx
+
+.Ldone_vx:
+       lmg     %r6,%r7,FRAME+6*8(SP)
+       la      SP,FRAME(SP)
+       BR_EX   %r14
+
+.Ltail_vx:
+       VSTM    A0,D0,8*8,SP,3
+       lghi    %r1,0
+
+.Loop_tail_vx:
+       llgc    %r5,0(%r1,INP)
+       llgc    %r6,8*8(%r1,SP)
+       xr      %r6,%r5
+       stc     %r6,0(%r1,OUT)
+       la      %r1,1(%r1)
+       brct    LEN,.Loop_tail_vx
+
+       lmg     %r6,%r7,FRAME+6*8(SP)
+       la      SP,FRAME(SP)
+       BR_EX   %r14
+ENDPROC(chacha20_vx)
+
+.previous
diff --git a/arch/s390/crypto/chacha-s390.h b/arch/s390/crypto/chacha-s390.h
new file mode 100644 (file)
index 0000000..733744c
--- /dev/null
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * s390 ChaCha stream cipher.
+ *
+ * Copyright IBM Corp. 2021
+ */
+
+#ifndef _CHACHA_S390_H
+#define _CHACHA_S390_H
+
+void chacha20_vx(u8 *out, const u8 *inp, size_t len, const u32 *key,
+                const u32 *counter);
+
+#endif /* _CHACHA_S390_H */
index 0c05a67..87e6cc2 100644 (file)
        MRXBOPC \hint, 0x36, v1, v3
 .endm
 
+/* VECTOR STORE */
+.macro VST     vr1, disp, index="%r0", base
+       VX_NUM  v1, \vr1
+       GR_NUM  x2, \index
+       GR_NUM  b2, \base           /* Base register */
+       .word   0xE700 | ((v1&15) << 4) | (x2&15)
+       .word   (b2 << 12) | (\disp)
+       MRXBOPC 0, 0x0E, v1
+.endm
+
 /* VECTOR STORE MULTIPLE */
 .macro VSTM    vfrom, vto, disp, base, hint=3
        VX_NUM  v1, \vfrom
        VUPLL   \vr1, \vr2, 2
 .endm
 
+/* VECTOR PERMUTE DOUBLEWORD IMMEDIATE */
+.macro VPDI    vr1, vr2, vr3, m4
+       VX_NUM  v1, \vr1
+       VX_NUM  v2, \vr2
+       VX_NUM  v3, \vr3
+       .word   0xE700 | ((v1&15) << 4) | (v2&15)
+       .word   ((v3&15) << 12)
+       MRXBOPC \m4, 0x84, v1, v2, v3
+.endm
+
+/* VECTOR REPLICATE */
+.macro VREP    vr1, vr3, imm2, m4
+       VX_NUM  v1, \vr1
+       VX_NUM  v3, \vr3
+       .word   0xE700 | ((v1&15) << 4) | (v3&15)
+       .word   \imm2
+       MRXBOPC \m4, 0x4D, v1, v3
+.endm
+.macro VREPB   vr1, vr3, imm2
+       VREP    \vr1, \vr3, \imm2, 0
+.endm
+.macro VREPH   vr1, vr3, imm2
+       VREP    \vr1, \vr3, \imm2, 1
+.endm
+.macro VREPF   vr1, vr3, imm2
+       VREP    \vr1, \vr3, \imm2, 2
+.endm
+.macro VREPG   vr1, vr3, imm2
+       VREP    \vr1, \vr3, \imm2, 3
+.endm
+
+/* VECTOR MERGE HIGH */
+.macro VMRH    vr1, vr2, vr3, m4
+       VX_NUM  v1, \vr1
+       VX_NUM  v2, \vr2
+       VX_NUM  v3, \vr3
+       .word   0xE700 | ((v1&15) << 4) | (v2&15)
+       .word   ((v3&15) << 12)
+       MRXBOPC \m4, 0x61, v1, v2, v3
+.endm
+.macro VMRHB   vr1, vr2, vr3
+       VMRH    \vr1, \vr2, \vr3, 0
+.endm
+.macro VMRHH   vr1, vr2, vr3
+       VMRH    \vr1, \vr2, \vr3, 1
+.endm
+.macro VMRHF   vr1, vr2, vr3
+       VMRH    \vr1, \vr2, \vr3, 2
+.endm
+.macro VMRHG   vr1, vr2, vr3
+       VMRH    \vr1, \vr2, \vr3, 3
+.endm
+
+/* VECTOR MERGE LOW */
+.macro VMRL    vr1, vr2, vr3, m4
+       VX_NUM  v1, \vr1
+       VX_NUM  v2, \vr2
+       VX_NUM  v3, \vr3
+       .word   0xE700 | ((v1&15) << 4) | (v2&15)
+       .word   ((v3&15) << 12)
+       MRXBOPC \m4, 0x60, v1, v2, v3
+.endm
+.macro VMRLB   vr1, vr2, vr3
+       VMRL    \vr1, \vr2, \vr3, 0
+.endm
+.macro VMRLH   vr1, vr2, vr3
+       VMRL    \vr1, \vr2, \vr3, 1
+.endm
+.macro VMRLF   vr1, vr2, vr3
+       VMRL    \vr1, \vr2, \vr3, 2
+.endm
+.macro VMRLG   vr1, vr2, vr3
+       VMRL    \vr1, \vr2, \vr3, 3
+.endm
+
 
 /* Vector integer instructions */
 
        VESRAV  \vr1, \vr2, \vr3, 3
 .endm
 
+/* VECTOR ELEMENT ROTATE LEFT LOGICAL */
+.macro VERLL   vr1, vr3, disp, base="%r0", m4
+       VX_NUM  v1, \vr1
+       VX_NUM  v3, \vr3
+       GR_NUM  b2, \base
+       .word   0xE700 | ((v1&15) << 4) | (v3&15)
+       .word   (b2 << 12) | (\disp)
+       MRXBOPC \m4, 0x33, v1, v3
+.endm
+.macro VERLLB  vr1, vr3, disp, base="%r0"
+       VERLL   \vr1, \vr3, \disp, \base, 0
+.endm
+.macro VERLLH  vr1, vr3, disp, base="%r0"
+       VERLL   \vr1, \vr3, \disp, \base, 1
+.endm
+.macro VERLLF  vr1, vr3, disp, base="%r0"
+       VERLL   \vr1, \vr3, \disp, \base, 2
+.endm
+.macro VERLLG  vr1, vr3, disp, base="%r0"
+       VERLL   \vr1, \vr3, \disp, \base, 3
+.endm
+
+/* VECTOR SHIFT LEFT DOUBLE BY BYTE */
+.macro VSLDB   vr1, vr2, vr3, imm4
+       VX_NUM  v1, \vr1
+       VX_NUM  v2, \vr2
+       VX_NUM  v3, \vr3
+       .word   0xE700 | ((v1&15) << 4) | (v2&15)
+       .word   ((v3&15) << 12) | (\imm4)
+       MRXBOPC 0, 0x77, v1, v2, v3
+.endm
+
 #endif /* __ASSEMBLY__ */
 #endif /* __ASM_S390_VX_INSN_H */
index 51690e7..4f70567 100644 (file)
@@ -213,6 +213,18 @@ config CRYPTO_AES_S390
          key sizes and XTS mode is hardware accelerated for 256 and
          512 bit keys.
 
+config CRYPTO_CHACHA_S390
+       tristate "ChaCha20 stream cipher"
+       depends on S390
+       select CRYPTO_ALGAPI
+       select CRYPTO_SKCIPHER
+       select CRYPTO_CHACHA20
+       help
+         This is the s390 SIMD implementation of the ChaCha20 stream
+         cipher (RFC 7539).
+
+         It is available as of z13.
+
 config S390_PRNG
        tristate "Pseudo random number generator device driver"
        depends on S390