crypto: arm64/crc-t10dif - move NEON yield to C code

author Ard Biesheuvel <ardb@kernel.org>
Wed, 3 Feb 2021 11:36:25 +0000 (12:36 +0100)
committer Herbert Xu <herbert@gondor.apana.org.au>
Wed, 10 Feb 2021 06:55:58 +0000 (17:55 +1100)
diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S

index 111d9c9..dce6dce 100644 (file)
--- a/arch/arm64/crypto/crct10dif-ce-core.S
+++ b/arch/arm64/crypto/crct10dif-ce-core.S
@@ -68,10 +68,10 @@
         .text
         .arch           armv8-a+crypto
  
-       init_crc        .req    w19
-       buf             .req    x20
-       len             .req    x21
-       fold_consts_ptr .req    x22
+       init_crc        .req    w0
+       buf             .req    x1
+       len             .req    x2
+       fold_consts_ptr .req    x3
  
         fold_consts     .req    v10
  
@@ -257,12 +257,6 @@ CPU_LE(    ext             v12.16b, v12.16b, v12.16b, #8   )
         .endm
  
         .macro          crc_t10dif_pmull, p
-       frame_push      4, 128
-
-       mov             init_crc, w0
-       mov             buf, x1
-       mov             len, x2
-
         __pmull_init_\p
  
         // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
@@ -317,26 +311,7 @@ CPU_LE(    ext             v7.16b, v7.16b, v7.16b, #8      )
         fold_32_bytes   \p, v6, v7
  
         subs            len, len, #128
-       b.lt            .Lfold_128_bytes_loop_done_\@
-
-       if_will_cond_yield_neon
-       stp             q0, q1, [sp, #.Lframe_local_offset]
-       stp             q2, q3, [sp, #.Lframe_local_offset + 32]
-       stp             q4, q5, [sp, #.Lframe_local_offset + 64]
-       stp             q6, q7, [sp, #.Lframe_local_offset + 96]
-       do_cond_yield_neon
-       ldp             q0, q1, [sp, #.Lframe_local_offset]
-       ldp             q2, q3, [sp, #.Lframe_local_offset + 32]
-       ldp             q4, q5, [sp, #.Lframe_local_offset + 64]
-       ldp             q6, q7, [sp, #.Lframe_local_offset + 96]
-       ld1             {fold_consts.2d}, [fold_consts_ptr]
-       __pmull_init_\p
-       __pmull_pre_\p  fold_consts
-       endif_yield_neon
-
-       b               .Lfold_128_bytes_loop_\@
-
-.Lfold_128_bytes_loop_done_\@:
+       b.ge            .Lfold_128_bytes_loop_\@
  
         // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
  
@@ -453,7 +428,9 @@ CPU_LE(     ext             v0.16b, v0.16b, v0.16b, #8      )
         // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
  
         umov            w0, v0.h[0]
-       frame_pop
+       .ifc            \p, p8
+       ldp             x29, x30, [sp], #16
+       .endif
         ret
  
  .Lless_than_256_bytes_\@:
@@ -489,7 +466,9 @@ CPU_LE(     ext             v7.16b, v7.16b, v7.16b, #8      )
  // Assumes len >= 16.
  //
  SYM_FUNC_START(crc_t10dif_pmull_p8)
-       crc_t10dif_pmull        p8
+       stp             x29, x30, [sp, #-16]!
+       mov             x29, sp
+       crc_t10dif_pmull p8
  SYM_FUNC_END(crc_t10dif_pmull_p8)
  
         .align          5
diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c

index ccc3f60..09eb145 100644 (file)
--- a/arch/arm64/crypto/crct10dif-ce-glue.c
+++ b/arch/arm64/crypto/crct10dif-ce-glue.c
@@ -37,9 +37,18 @@ static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data,
         u16 *crc = shash_desc_ctx(desc);
  
         if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
-               kernel_neon_begin();
-               *crc = crc_t10dif_pmull_p8(*crc, data, length);
-               kernel_neon_end();
+               do {
+                       unsigned int chunk = length;
+
+                       if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
+                               chunk = SZ_4K;
+
+                       kernel_neon_begin();
+                       *crc = crc_t10dif_pmull_p8(*crc, data, chunk);
+                       kernel_neon_end();
+                       data += chunk;
+                       length -= chunk;
+               } while (length);
         } else {
                 *crc = crc_t10dif_generic(*crc, data, length);
         }
@@ -53,9 +62,18 @@ static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data,
         u16 *crc = shash_desc_ctx(desc);
  
         if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
-               kernel_neon_begin();
-               *crc = crc_t10dif_pmull_p64(*crc, data, length);
-               kernel_neon_end();
+               do {
+                       unsigned int chunk = length;
+
+                       if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
+                               chunk = SZ_4K;
+
+                       kernel_neon_begin();
+                       *crc = crc_t10dif_pmull_p64(*crc, data, chunk);
+                       kernel_neon_end();
+                       data += chunk;
+                       length -= chunk;
+               } while (length);
         } else {
                 *crc = crc_t10dif_generic(*crc, data, length);
         }
author	Ard Biesheuvel <ardb@kernel.org>
	Wed, 3 Feb 2021 11:36:25 +0000 (12:36 +0100)
committer	Herbert Xu <herbert@gondor.apana.org.au>
	Wed, 10 Feb 2021 06:55:58 +0000 (17:55 +1100)
arch/arm64/crypto/crct10dif-ce-core.S		patch | blob | history
arch/arm64/crypto/crct10dif-ce-glue.c		patch | blob | history