lib/crypto: arm/nh: Migrate optimized code into library
author Eric Biggers <ebiggers@kernel.org>
Thu, 11 Dec 2025 01:18:35 +0000 (17:18 -0800)
committer Eric Biggers <ebiggers@kernel.org>
Mon, 12 Jan 2026 19:07:49 +0000 (11:07 -0800)
Migrate the arm32 NEON implementation of NH into lib/crypto/.  This
makes the nh() function be optimized on arm32 kernels.

Note: this temporarily makes the adiantum template not utilize the arm32
optimized NH code.  This is resolved in a later commit that converts the
adiantum template to use nh() instead of "nhpoly1305".

Link: https://lore.kernel.org/r/20251211011846.8179-4-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
arch/arm/crypto/Kconfig
arch/arm/crypto/Makefile
arch/arm/crypto/nh-neon-core.S [deleted file]
arch/arm/crypto/nhpoly1305-neon-glue.c [deleted file]
lib/crypto/Kconfig
lib/crypto/Makefile
lib/crypto/arm/nh-neon-core.S [new file with mode: 0644]
lib/crypto/arm/nh.h [new file with mode: 0644]

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index f30d743..3eb5071 100644 (file)
@@ -23,16 +23,6 @@ config CRYPTO_GHASH_ARM_CE
          that is part of the ARMv8 Crypto Extensions, or a slower variant that
          uses the vmull.p8 instruction that is part of the basic NEON ISA.
 
-config CRYPTO_NHPOLY1305_NEON
-       tristate "Hash functions: NHPoly1305 (NEON)"
-       depends on KERNEL_MODE_NEON
-       select CRYPTO_NHPOLY1305
-       help
-         NHPoly1305 hash function (Adiantum)
-
-         Architecture: arm using:
-         - NEON (Advanced SIMD) extensions
-
 config CRYPTO_AES_ARM
        tristate "Ciphers: AES"
        select CRYPTO_ALGAPI
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 86dd433..d6683e9 100644 (file)
@@ -5,7 +5,6 @@
 
 obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
 obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
-obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
 
 obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
@@ -14,4 +13,3 @@ aes-arm-y     := aes-cipher-core.o aes-cipher-glue.o
 aes-arm-bs-y   := aes-neonbs-core.o aes-neonbs-glue.o
 aes-arm-ce-y   := aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
-nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
diff --git a/arch/arm/crypto/nh-neon-core.S b/arch/arm/crypto/nh-neon-core.S
deleted file mode 100644 (file)
index 01620a0..0000000
+++ /dev/null
@@ -1,116 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * NH - ε-almost-universal hash function, NEON accelerated version
- *
- * Copyright 2018 Google LLC
- *
- * Author: Eric Biggers <ebiggers@google.com>
- */
-
-#include <linux/linkage.h>
-
-       .text
-       .fpu            neon
-
-       KEY             .req    r0
-       MESSAGE         .req    r1
-       MESSAGE_LEN     .req    r2
-       HASH            .req    r3
-
-       PASS0_SUMS      .req    q0
-       PASS0_SUM_A     .req    d0
-       PASS0_SUM_B     .req    d1
-       PASS1_SUMS      .req    q1
-       PASS1_SUM_A     .req    d2
-       PASS1_SUM_B     .req    d3
-       PASS2_SUMS      .req    q2
-       PASS2_SUM_A     .req    d4
-       PASS2_SUM_B     .req    d5
-       PASS3_SUMS      .req    q3
-       PASS3_SUM_A     .req    d6
-       PASS3_SUM_B     .req    d7
-       K0              .req    q4
-       K1              .req    q5
-       K2              .req    q6
-       K3              .req    q7
-       T0              .req    q8
-       T0_L            .req    d16
-       T0_H            .req    d17
-       T1              .req    q9
-       T1_L            .req    d18
-       T1_H            .req    d19
-       T2              .req    q10
-       T2_L            .req    d20
-       T2_H            .req    d21
-       T3              .req    q11
-       T3_L            .req    d22
-       T3_H            .req    d23
-
-.macro _nh_stride      k0, k1, k2, k3
-
-       // Load next message stride
-       vld1.8          {T3}, [MESSAGE]!
-
-       // Load next key stride
-       vld1.32         {\k3}, [KEY]!
-
-       // Add message words to key words
-       vadd.u32        T0, T3, \k0
-       vadd.u32        T1, T3, \k1
-       vadd.u32        T2, T3, \k2
-       vadd.u32        T3, T3, \k3
-
-       // Multiply 32x32 => 64 and accumulate
-       vmlal.u32       PASS0_SUMS, T0_L, T0_H
-       vmlal.u32       PASS1_SUMS, T1_L, T1_H
-       vmlal.u32       PASS2_SUMS, T2_L, T2_H
-       vmlal.u32       PASS3_SUMS, T3_L, T3_H
-.endm
-
-/*
- * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
- *             __le64 hash[NH_NUM_PASSES])
- *
- * It's guaranteed that message_len % 16 == 0.
- */
-ENTRY(nh_neon)
-
-       vld1.32         {K0,K1}, [KEY]!
-         vmov.u64      PASS0_SUMS, #0
-         vmov.u64      PASS1_SUMS, #0
-       vld1.32         {K2}, [KEY]!
-         vmov.u64      PASS2_SUMS, #0
-         vmov.u64      PASS3_SUMS, #0
-
-       subs            MESSAGE_LEN, MESSAGE_LEN, #64
-       blt             .Lloop4_done
-.Lloop4:
-       _nh_stride      K0, K1, K2, K3
-       _nh_stride      K1, K2, K3, K0
-       _nh_stride      K2, K3, K0, K1
-       _nh_stride      K3, K0, K1, K2
-       subs            MESSAGE_LEN, MESSAGE_LEN, #64
-       bge             .Lloop4
-
-.Lloop4_done:
-       ands            MESSAGE_LEN, MESSAGE_LEN, #63
-       beq             .Ldone
-       _nh_stride      K0, K1, K2, K3
-
-       subs            MESSAGE_LEN, MESSAGE_LEN, #16
-       beq             .Ldone
-       _nh_stride      K1, K2, K3, K0
-
-       subs            MESSAGE_LEN, MESSAGE_LEN, #16
-       beq             .Ldone
-       _nh_stride      K2, K3, K0, K1
-
-.Ldone:
-       // Sum the accumulators for each pass, then store the sums to 'hash'
-       vadd.u64        T0_L, PASS0_SUM_A, PASS0_SUM_B
-       vadd.u64        T0_H, PASS1_SUM_A, PASS1_SUM_B
-       vadd.u64        T1_L, PASS2_SUM_A, PASS2_SUM_B
-       vadd.u64        T1_H, PASS3_SUM_A, PASS3_SUM_B
-       vst1.8          {T0-T1}, [HASH]
-       bx              lr
-ENDPROC(nh_neon)
diff --git a/arch/arm/crypto/nhpoly1305-neon-glue.c b/arch/arm/crypto/nhpoly1305-neon-glue.c
deleted file mode 100644 (file)
index 62cf7cc..0000000
+++ /dev/null
@@ -1,80 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
- * (NEON accelerated version)
- *
- * Copyright 2018 Google LLC
- */
-
-#include <asm/neon.h>
-#include <asm/simd.h>
-#include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
-#include <crypto/nhpoly1305.h>
-#include <linux/module.h>
-
-asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len,
-                       __le64 hash[NH_NUM_PASSES]);
-
-static int nhpoly1305_neon_update(struct shash_desc *desc,
-                                 const u8 *src, unsigned int srclen)
-{
-       if (srclen < 64 || !crypto_simd_usable())
-               return crypto_nhpoly1305_update(desc, src, srclen);
-
-       do {
-               unsigned int n = min_t(unsigned int, srclen, SZ_4K);
-
-               kernel_neon_begin();
-               crypto_nhpoly1305_update_helper(desc, src, n, nh_neon);
-               kernel_neon_end();
-               src += n;
-               srclen -= n;
-       } while (srclen);
-       return 0;
-}
-
-static int nhpoly1305_neon_digest(struct shash_desc *desc,
-                                 const u8 *src, unsigned int srclen, u8 *out)
-{
-       return crypto_nhpoly1305_init(desc) ?:
-              nhpoly1305_neon_update(desc, src, srclen) ?:
-              crypto_nhpoly1305_final(desc, out);
-}
-
-static struct shash_alg nhpoly1305_alg = {
-       .base.cra_name          = "nhpoly1305",
-       .base.cra_driver_name   = "nhpoly1305-neon",
-       .base.cra_priority      = 200,
-       .base.cra_ctxsize       = sizeof(struct nhpoly1305_key),
-       .base.cra_module        = THIS_MODULE,
-       .digestsize             = POLY1305_DIGEST_SIZE,
-       .init                   = crypto_nhpoly1305_init,
-       .update                 = nhpoly1305_neon_update,
-       .final                  = crypto_nhpoly1305_final,
-       .digest                 = nhpoly1305_neon_digest,
-       .setkey                 = crypto_nhpoly1305_setkey,
-       .descsize               = sizeof(struct nhpoly1305_state),
-};
-
-static int __init nhpoly1305_mod_init(void)
-{
-       if (!(elf_hwcap & HWCAP_NEON))
-               return -ENODEV;
-
-       return crypto_register_shash(&nhpoly1305_alg);
-}
-
-static void __exit nhpoly1305_mod_exit(void)
-{
-       crypto_unregister_shash(&nhpoly1305_alg);
-}
-
-module_init(nhpoly1305_mod_init);
-module_exit(nhpoly1305_mod_exit);
-
-MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (NEON-accelerated)");
-MODULE_LICENSE("GPL v2");
-MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
-MODULE_ALIAS_CRYPTO("nhpoly1305");
-MODULE_ALIAS_CRYPTO("nhpoly1305-neon");
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index f14c9f5..c6ee7ca 100644 (file)
@@ -117,6 +117,7 @@ config CRYPTO_LIB_NH
 config CRYPTO_LIB_NH_ARCH
        bool
        depends on CRYPTO_LIB_NH && !UML
+       default y if ARM && KERNEL_MODE_NEON
 
 config CRYPTO_LIB_POLY1305
        tristate
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 929b845..6dae7e1 100644 (file)
@@ -135,6 +135,7 @@ obj-$(CONFIG_CRYPTO_LIB_NH) += libnh.o
 libnh-y := nh.o
 ifeq ($(CONFIG_CRYPTO_LIB_NH_ARCH),y)
 CFLAGS_nh.o += -I$(src)/$(SRCARCH)
+libnh-$(CONFIG_ARM) += arm/nh-neon-core.o
 endif
 
 ################################################################################
diff --git a/lib/crypto/arm/nh-neon-core.S b/lib/crypto/arm/nh-neon-core.S
new file mode 100644 (file)
index 0000000..01620a0
--- /dev/null
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, NEON accelerated version
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+       .text
+       .fpu            neon
+       // Core registers r0-r3, named after the nh_neon() parameters they carry
+       KEY             .req    r0
+       MESSAGE         .req    r1
+       MESSAGE_LEN     .req    r2
+       HASH            .req    r3
+       // NEON aliases; each _A/_B and _L/_H pair names the two d-halves of the q alias above it
+       PASS0_SUMS      .req    q0
+       PASS0_SUM_A     .req    d0
+       PASS0_SUM_B     .req    d1
+       PASS1_SUMS      .req    q1
+       PASS1_SUM_A     .req    d2
+       PASS1_SUM_B     .req    d3
+       PASS2_SUMS      .req    q2
+       PASS2_SUM_A     .req    d4
+       PASS2_SUM_B     .req    d5
+       PASS3_SUMS      .req    q3
+       PASS3_SUM_A     .req    d6
+       PASS3_SUM_B     .req    d7
+       K0              .req    q4
+       K1              .req    q5
+       K2              .req    q6
+       K3              .req    q7
+       T0              .req    q8
+       T0_L            .req    d16
+       T0_H            .req    d17
+       T1              .req    q9
+       T1_L            .req    d18
+       T1_H            .req    d19
+       T2              .req    q10
+       T2_L            .req    d20
+       T2_H            .req    d21
+       T3              .req    q11
+       T3_L            .req    d22
+       T3_H            .req    d23
+
+.macro _nh_stride      k0, k1, k2, k3
+       // One stride: 16 message bytes are combined with \k0..\k3, one key register per pass
+       // Load next message stride (16 bytes into T3; MESSAGE is post-incremented)
+       vld1.8          {T3}, [MESSAGE]!
+
+       // Load next key stride (16 bytes into \k3; KEY is post-incremented)
+       vld1.32         {\k3}, [KEY]!
+
+       // Add message words to key words (T3 holds the message, so it is overwritten last)
+       vadd.u32        T0, T3, \k0
+       vadd.u32        T1, T3, \k1
+       vadd.u32        T2, T3, \k2
+       vadd.u32        T3, T3, \k3
+
+       // Multiply 32x32 => 64 (low d-half by high d-half of each sum) and accumulate
+       vmlal.u32       PASS0_SUMS, T0_L, T0_H
+       vmlal.u32       PASS1_SUMS, T1_L, T1_H
+       vmlal.u32       PASS2_SUMS, T2_L, T2_H
+       vmlal.u32       PASS3_SUMS, T3_L, T3_H
+.endm
+
+/*
+ * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
+ *             __le64 hash[NH_NUM_PASSES])
+ * Accumulates four 64-bit pass sums over 'message' and stores them (32 bytes) to 'hash'.
+ * It's guaranteed that message_len % 16 == 0.
+ */
+ENTRY(nh_neon)
+       // Preload K0-K2 (the first stride loads K3) and zero the four accumulators
+       vld1.32         {K0,K1}, [KEY]!
+         vmov.u64      PASS0_SUMS, #0
+         vmov.u64      PASS1_SUMS, #0
+       vld1.32         {K2}, [KEY]!
+         vmov.u64      PASS2_SUMS, #0
+         vmov.u64      PASS3_SUMS, #0
+       // Main loop: four strides (64 bytes) per iteration; skipped when fewer than 64 bytes remain
+       subs            MESSAGE_LEN, MESSAGE_LEN, #64
+       blt             .Lloop4_done
+.Lloop4:
+       _nh_stride      K0, K1, K2, K3
+       _nh_stride      K1, K2, K3, K0
+       _nh_stride      K2, K3, K0, K1
+       _nh_stride      K3, K0, K1, K2
+       subs            MESSAGE_LEN, MESSAGE_LEN, #64
+       bge             .Lloop4
+       // Tail: MESSAGE_LEN & 63 is the leftover 0, 16, 32 or 48 bytes (given the % 16 guarantee)
+.Lloop4_done:
+       ands            MESSAGE_LEN, MESSAGE_LEN, #63
+       beq             .Ldone
+       _nh_stride      K0, K1, K2, K3
+
+       subs            MESSAGE_LEN, MESSAGE_LEN, #16
+       beq             .Ldone
+       _nh_stride      K1, K2, K3, K0
+
+       subs            MESSAGE_LEN, MESSAGE_LEN, #16
+       beq             .Ldone
+       _nh_stride      K2, K3, K0, K1
+
+.Ldone:
+       // Sum the accumulators for each pass, then store the sums to 'hash'
+       vadd.u64        T0_L, PASS0_SUM_A, PASS0_SUM_B
+       vadd.u64        T0_H, PASS1_SUM_A, PASS1_SUM_B
+       vadd.u64        T1_L, PASS2_SUM_A, PASS2_SUM_B
+       vadd.u64        T1_H, PASS3_SUM_A, PASS3_SUM_B
+       vst1.8          {T0-T1}, [HASH]
+       bx              lr
+ENDPROC(nh_neon)
diff --git a/lib/crypto/arm/nh.h b/lib/crypto/arm/nh.h
new file mode 100644 (file)
index 0000000..c9f39d8
--- /dev/null
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * ARM32 accelerated implementation of NH
+ *
+ * Copyright 2018 Google LLC
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+/* Static key gating the NEON path: starts false, enabled by nh_mod_init_arch(). */
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+/* NEON NH routine, defined with ENTRY(nh_neon) in nh-neon-core.S. */
+asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len,
+                       __le64 hash[NH_NUM_PASSES]);
+/* Try the NEON NH code; returns true only if nh_neon() ran and wrote 'hash'. */
+static bool nh_arch(const u32 *key, const u8 *message, size_t message_len,
+                   __le64 hash[NH_NUM_PASSES])
+{
+       if (static_branch_likely(&have_neon) && message_len >= 64 &&
+           may_use_simd()) {
+               scoped_ksimd()  /* NEON is only touched inside this scope */
+                       nh_neon(key, message, message_len, hash);
+               return true;
+       }
+       return false;   /* no NEON, message under 64 bytes, or SIMD not usable; 'hash' untouched */
+}
+/* NOTE(review): the self-named macro presumably lets the includer #ifdef this hook; confirm. */
+#define nh_mod_init_arch nh_mod_init_arch
+static void nh_mod_init_arch(void)
+{
+       if (elf_hwcap & HWCAP_NEON)     /* CPU advertises NEON */
+               static_branch_enable(&have_neon);
+}