crypto: arm64/aes-ce - implement 5 way interleave for ECB, CBC and CTR
author     Ard Biesheuvel <ard.biesheuvel@linaro.org>    Mon, 24 Jun 2019 17:38:31 +0000 (19:38 +0200)
committer  Herbert Xu <herbert@gondor.apana.org.au>      Wed, 3 Jul 2019 14:13:12 +0000 (22:13 +0800)
This implements 5-way interleaving for ECB, CBC decryption and CTR,
resulting in a speedup of ~11% on Marvell ThunderX2, which has a
very deep pipeline and therefore a high issue latency for NEON
instructions operating on the same registers.
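
To make the latency argument concrete, here is a minimal C sketch (not
the kernel implementation; aes_encrypt_one_block() and
ecb_encrypt_walk() are hypothetical stand-ins) of why processing
several independent blocks per loop iteration helps on a deeply
pipelined core: the blocks within one batch have no data dependencies
on each other, so their instructions can issue back to back instead of
each block stalling on the result of the previous one.

/*
 * Illustrative sketch only -- not the kernel code.  The helper below is
 * a stand-in for the CE-accelerated block routine; the point is the
 * loop structure.
 */
#include <stddef.h>
#include <stdint.h>

#define MAX_STRIDE 5    /* 5-way interleave, as in the CE build */

static void aes_encrypt_one_block(uint8_t out[16], const uint8_t in[16],
                                  const uint8_t *round_keys, int rounds)
{
        /* placeholder: the real code runs the per-round AES instructions */
        for (int i = 0; i < 16; i++)
                out[i] = in[i] ^ round_keys[i];
        (void)rounds;
}

static void ecb_encrypt_walk(uint8_t *dst, const uint8_t *src,
                             size_t nblocks, const uint8_t *round_keys,
                             int rounds)
{
        size_t i = 0;

        /*
         * Main loop: MAX_STRIDE independent blocks per iteration, so a
         * deeply pipelined core can overlap their latencies.  With one
         * block per iteration, every instruction would wait for the
         * result of the previous one.
         */
        for (; i + MAX_STRIDE <= nblocks; i += MAX_STRIDE)
                for (int lane = 0; lane < MAX_STRIDE; lane++)
                        aes_encrypt_one_block(dst + (i + lane) * 16,
                                              src + (i + lane) * 16,
                                              round_keys, rounds);

        /* tail: leftover blocks one at a time (mirrors .Lecbencloop) */
        for (; i < nblocks; i++)
                aes_encrypt_one_block(dst + i * 16, src + i * 16,
                                      round_keys, rounds);
}

In the assembly the same overlap is obtained explicitly, by interleaving
the per-round instructions of the four or five blocks inside the
block4x/block5x helpers.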

Note that XTS is left alone: implementing 5-way interleave there would
require either spilling the calculated tweaks to the stack or
recalculating them after the encryption operation, and doing either of
those would most likely penalize low-end cores.
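
For context, here is a plain C sketch (not the kernel code;
xts_next_tweak() is a made-up helper name) of the per-block tweak
update that makes XTS awkward here: every block needs its tweak both
before and after the block cipher, and each tweak is derived from the
previous one, so a 5-way pass would have to keep five tweaks live
across the cipher call or recompute the chain afterwards.

/*
 * Illustrative sketch only.  In XTS, block i is processed as
 *     C_i = T_i ^ E_K1(P_i ^ T_i)
 * so the tweak T_i is needed both before and after the block cipher,
 * and T_{i+1} is obtained from T_i by a GF(2^128) multiplication by x.
 */
#include <stdint.h>

/* advance a little-endian XTS tweak: multiply by x in GF(2^128) */
static void xts_next_tweak(uint8_t t[16])
{
        uint8_t carry = 0;

        for (int i = 0; i < 16; i++) {
                uint8_t next_carry = t[i] >> 7;

                t[i] = (uint8_t)((t[i] << 1) | carry);
                carry = next_carry;
        }
        if (carry)
                t[0] ^= 0x87;   /* reduction by the XTS polynomial */
}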

For ECB, this is not a concern at all, given that we have plenty of
spare registers. For CTR and CBC decryption, we take advantage of the
fact that v16 is not used by the CE version of the code (which is the
only one targeted by this optimization), so we can reshuffle the code a
bit and avoid having to spill to memory (with the exception of one
extra reload in the CBC routine).
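
The build-time selection of the interleave width works as sketched
below: the ST4()/ST5() definitions mirror the ones this patch adds to
aes-modes.S (see the first hunk for that file further down), while the
MAX_STRIDE default and the demo main() are illustrative assumptions;
per the commit message, only the Crypto Extensions build takes the
5-way path.

/*
 * aes-modes.S is included by both aes-ce.S and aes-neon.S, so it is
 * assembled twice with different MAX_STRIDE settings.  ST4()/ST5()
 * keep their argument only in the 4-way or 5-way configuration,
 * letting one copy of each mode loop carry both variants.
 */
#include <stdio.h>

#ifndef MAX_STRIDE
#define MAX_STRIDE 5            /* assumption: model the 5-way (CE) build */
#endif

#if MAX_STRIDE == 4
#define ST4(x...) x
#define ST5(x...)
#else
#define ST4(x...)
#define ST5(x...) x
#endif

int main(void)
{
        /* only one of the two lines below survives preprocessing */
        ST4(puts("4-way path: bl aes_encrypt_block4x"));
        ST5(puts("5-way path: bl aes_encrypt_block5x"));
        return 0;
}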

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/arm64/crypto/aes-ce.S
arch/arm64/crypto/aes-modes.S
arch/arm64/crypto/aes-neon.S

diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S
index 0fca5f4..dbfc04e 100644
@@ -18,6 +18,8 @@
        .arch           armv8-a+crypto
 
        xtsmask         .req    v16
+       cbciv           .req    v16
+       vctr            .req    v16
 
        .macro          xts_reload_mask, tmp
        .endm
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index add6267..3f60f7b 100644
 #define MAX_STRIDE     4
 #endif
 
+#if MAX_STRIDE == 4
+#define ST4(x...) x
+#define ST5(x...)
+#else
+#define ST4(x...)
+#define ST5(x...) x
+#endif
+
 aes_encrypt_block4x:
        encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
        ret
@@ -53,14 +61,17 @@ AES_ENTRY(aes_ecb_encrypt)
        enc_prepare     w3, x2, x5
 
 .LecbencloopNx:
-       subs            w4, w4, #4
+       subs            w4, w4, #MAX_STRIDE
        bmi             .Lecbenc1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
-       bl              aes_encrypt_block4x
+ST4(   bl              aes_encrypt_block4x             )
+ST5(   ld1             {v4.16b}, [x1], #16             )
+ST5(   bl              aes_encrypt_block5x             )
        st1             {v0.16b-v3.16b}, [x0], #64
+ST5(   st1             {v4.16b}, [x0], #16             )
        b               .LecbencloopNx
 .Lecbenc1x:
-       adds            w4, w4, #4
+       adds            w4, w4, #MAX_STRIDE
        beq             .Lecbencout
 .Lecbencloop:
        ld1             {v0.16b}, [x1], #16             /* get next pt block */
@@ -81,14 +92,17 @@ AES_ENTRY(aes_ecb_decrypt)
        dec_prepare     w3, x2, x5
 
 .LecbdecloopNx:
-       subs            w4, w4, #4
+       subs            w4, w4, #MAX_STRIDE
        bmi             .Lecbdec1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
-       bl              aes_decrypt_block4x
+ST4(   bl              aes_decrypt_block4x             )
+ST5(   ld1             {v4.16b}, [x1], #16             )
+ST5(   bl              aes_decrypt_block5x             )
        st1             {v0.16b-v3.16b}, [x0], #64
+ST5(   st1             {v4.16b}, [x0], #16             )
        b               .LecbdecloopNx
 .Lecbdec1x:
-       adds            w4, w4, #4
+       adds            w4, w4, #MAX_STRIDE
        beq             .Lecbdecout
 .Lecbdecloop:
        ld1             {v0.16b}, [x1], #16             /* get next ct block */
@@ -148,39 +162,56 @@ AES_ENTRY(aes_cbc_decrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp
 
-       ld1             {v7.16b}, [x5]                  /* get iv */
+       ld1             {cbciv.16b}, [x5]               /* get iv */
        dec_prepare     w3, x2, x6
 
 .LcbcdecloopNx:
-       subs            w4, w4, #4
+       subs            w4, w4, #MAX_STRIDE
        bmi             .Lcbcdec1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
+#if MAX_STRIDE == 5
+       ld1             {v4.16b}, [x1], #16             /* get 1 ct block */
+       mov             v5.16b, v0.16b
+       mov             v6.16b, v1.16b
+       mov             v7.16b, v2.16b
+       bl              aes_decrypt_block5x
+       sub             x1, x1, #32
+       eor             v0.16b, v0.16b, cbciv.16b
+       eor             v1.16b, v1.16b, v5.16b
+       ld1             {v5.16b}, [x1], #16             /* reload 1 ct block */
+       ld1             {cbciv.16b}, [x1], #16          /* reload 1 ct block */
+       eor             v2.16b, v2.16b, v6.16b
+       eor             v3.16b, v3.16b, v7.16b
+       eor             v4.16b, v4.16b, v5.16b
+#else
        mov             v4.16b, v0.16b
        mov             v5.16b, v1.16b
        mov             v6.16b, v2.16b
        bl              aes_decrypt_block4x
        sub             x1, x1, #16
-       eor             v0.16b, v0.16b, v7.16b
+       eor             v0.16b, v0.16b, cbciv.16b
        eor             v1.16b, v1.16b, v4.16b
-       ld1             {v7.16b}, [x1], #16             /* reload 1 ct block */
+       ld1             {cbciv.16b}, [x1], #16          /* reload 1 ct block */
        eor             v2.16b, v2.16b, v5.16b
        eor             v3.16b, v3.16b, v6.16b
+#endif
        st1             {v0.16b-v3.16b}, [x0], #64
+ST5(   st1             {v4.16b}, [x0], #16             )
        b               .LcbcdecloopNx
 .Lcbcdec1x:
-       adds            w4, w4, #4
+       adds            w4, w4, #MAX_STRIDE
        beq             .Lcbcdecout
 .Lcbcdecloop:
        ld1             {v1.16b}, [x1], #16             /* get next ct block */
        mov             v0.16b, v1.16b                  /* ...and copy to v0 */
        decrypt_block   v0, w3, x2, x6, w7
-       eor             v0.16b, v0.16b, v7.16b          /* xor with iv => pt */
-       mov             v7.16b, v1.16b                  /* ct is next iv */
+       eor             v0.16b, v0.16b, cbciv.16b       /* xor with iv => pt */
+       mov             cbciv.16b, v1.16b               /* ct is next iv */
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lcbcdecloop
 .Lcbcdecout:
-       st1             {v7.16b}, [x5]                  /* return iv */
+       st1             {cbciv.16b}, [x5]               /* return iv */
        ldp             x29, x30, [sp], #16
        ret
 AES_ENDPROC(aes_cbc_decrypt)
@@ -274,51 +305,60 @@ AES_ENTRY(aes_ctr_encrypt)
        mov             x29, sp
 
        enc_prepare     w3, x2, x6
-       ld1             {v4.16b}, [x5]
+       ld1             {vctr.16b}, [x5]
 
-       umov            x6, v4.d[1]             /* keep swabbed ctr in reg */
+       umov            x6, vctr.d[1]           /* keep swabbed ctr in reg */
        rev             x6, x6
        cmn             w6, w4                  /* 32 bit overflow? */
        bcs             .Lctrloop
 .LctrloopNx:
-       subs            w4, w4, #4
+       subs            w4, w4, #MAX_STRIDE
        bmi             .Lctr1x
        add             w7, w6, #1
-       mov             v0.16b, v4.16b
+       mov             v0.16b, vctr.16b
        add             w8, w6, #2
-       mov             v1.16b, v4.16b
+       mov             v1.16b, vctr.16b
+       add             w9, w6, #3
+       mov             v2.16b, vctr.16b
        add             w9, w6, #3
-       mov             v2.16b, v4.16b
        rev             w7, w7
-       mov             v3.16b, v4.16b
+       mov             v3.16b, vctr.16b
        rev             w8, w8
+ST5(   mov             v4.16b, vctr.16b                )
        mov             v1.s[3], w7
        rev             w9, w9
+ST5(   add             w10, w6, #4                     )
        mov             v2.s[3], w8
+ST5(   rev             w10, w10                        )
        mov             v3.s[3], w9
+ST5(   mov             v4.s[3], w10                    )
        ld1             {v5.16b-v7.16b}, [x1], #48      /* get 3 input blocks */
-       bl              aes_encrypt_block4x
+ST4(   bl              aes_encrypt_block4x             )
+ST5(   bl              aes_encrypt_block5x             )
        eor             v0.16b, v5.16b, v0.16b
-       ld1             {v5.16b}, [x1], #16             /* get 1 input block  */
+ST4(   ld1             {v5.16b}, [x1], #16             )
        eor             v1.16b, v6.16b, v1.16b
+ST5(   ld1             {v5.16b-v6.16b}, [x1], #32      )
        eor             v2.16b, v7.16b, v2.16b
        eor             v3.16b, v5.16b, v3.16b
+ST5(   eor             v4.16b, v6.16b, v4.16b          )
        st1             {v0.16b-v3.16b}, [x0], #64
-       add             x6, x6, #4
+ST5(   st1             {v4.16b}, [x0], #16             )
+       add             x6, x6, #MAX_STRIDE
        rev             x7, x6
-       ins             v4.d[1], x7
+       ins             vctr.d[1], x7
        cbz             w4, .Lctrout
        b               .LctrloopNx
 .Lctr1x:
-       adds            w4, w4, #4
+       adds            w4, w4, #MAX_STRIDE
        beq             .Lctrout
 .Lctrloop:
-       mov             v0.16b, v4.16b
+       mov             v0.16b, vctr.16b
        encrypt_block   v0, w3, x2, x8, w7
 
        adds            x6, x6, #1              /* increment BE ctr */
        rev             x7, x6
-       ins             v4.d[1], x7
+       ins             vctr.d[1], x7
        bcs             .Lctrcarry              /* overflow? */
 
 .Lctrcarrydone:
@@ -330,7 +370,7 @@ AES_ENTRY(aes_ctr_encrypt)
        bne             .Lctrloop
 
 .Lctrout:
-       st1             {v4.16b}, [x5]          /* return next CTR value */
+       st1             {vctr.16b}, [x5]        /* return next CTR value */
        ldp             x29, x30, [sp], #16
        ret
 
@@ -339,11 +379,11 @@ AES_ENTRY(aes_ctr_encrypt)
        b               .Lctrout
 
 .Lctrcarry:
-       umov            x7, v4.d[0]             /* load upper word of ctr  */
+       umov            x7, vctr.d[0]           /* load upper word of ctr  */
        rev             x7, x7                  /* ... to handle the carry */
        add             x7, x7, #1
        rev             x7, x7
-       ins             v4.d[0], x7
+       ins             vctr.d[0], x7
        b               .Lctrcarrydone
 AES_ENDPROC(aes_ctr_encrypt)
 
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S
index 33bb6af..8bd66a6 100644
@@ -15,6 +15,8 @@
 #define AES_ENDPROC(func)      ENDPROC(neon_ ## func)
 
        xtsmask         .req    v7
+       cbciv           .req    v7
+       vctr            .req    v4
 
        .macro          xts_reload_mask, tmp
        xts_load_mask   \tmp