/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/* included by aes-ce.S and aes-neon.S */

        .text
        .align          4

#ifndef MAX_STRIDE
#define MAX_STRIDE      4
#endif

#if MAX_STRIDE == 4
#define ST4(x...) x
#define ST5(x...)
#else
#define ST4(x...)
#define ST5(x...) x
#endif

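/*
 * Helpers wrapping the 4-way/5-way interleaved cipher cores: they
 * transform v0-v3 (plus v4 for the 5-way variants) in place, taking
 * the round count in w3 and the round key schedule in x2.
 */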
SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
        encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
        ret
SYM_FUNC_END(aes_encrypt_block4x)

SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
        decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
        ret
SYM_FUNC_END(aes_decrypt_block4x)

#if MAX_STRIDE == 5
SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
        encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
        ret
SYM_FUNC_END(aes_encrypt_block5x)

SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
        decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
        ret
SYM_FUNC_END(aes_decrypt_block5x)
#endif

        /*
         * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks)
         * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks)
         */

AES_FUNC_START(aes_ecb_encrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        enc_prepare     w3, x2, x5

.LecbencloopNx:
        subs            w4, w4, #MAX_STRIDE
        bmi             .Lecbenc1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
ST4(    bl              aes_encrypt_block4x             )
ST5(    ld1             {v4.16b}, [x1], #16             )
ST5(    bl              aes_encrypt_block5x             )
        st1             {v0.16b-v3.16b}, [x0], #64
ST5(    st1             {v4.16b}, [x0], #16             )
        b               .LecbencloopNx
.Lecbenc1x:
        adds            w4, w4, #MAX_STRIDE
        beq             .Lecbencout
.Lecbencloop:
        ld1             {v0.16b}, [x1], #16             /* get next pt block */
        encrypt_block   v0, w3, x2, x5, w6
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lecbencloop
.Lecbencout:
        ldp             x29, x30, [sp], #16
        ret
AES_FUNC_END(aes_ecb_encrypt)


AES_FUNC_START(aes_ecb_decrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        dec_prepare     w3, x2, x5

.LecbdecloopNx:
        subs            w4, w4, #MAX_STRIDE
        bmi             .Lecbdec1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
ST4(    bl              aes_decrypt_block4x             )
ST5(    ld1             {v4.16b}, [x1], #16             )
ST5(    bl              aes_decrypt_block5x             )
        st1             {v0.16b-v3.16b}, [x0], #64
ST5(    st1             {v4.16b}, [x0], #16             )
        b               .LecbdecloopNx
.Lecbdec1x:
        adds            w4, w4, #MAX_STRIDE
        beq             .Lecbdecout
.Lecbdecloop:
        ld1             {v0.16b}, [x1], #16             /* get next ct block */
        decrypt_block   v0, w3, x2, x5, w6
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lecbdecloop
.Lecbdecout:
        ldp             x29, x30, [sp], #16
        ret
AES_FUNC_END(aes_ecb_decrypt)


        /*
         * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks, u8 iv[])
         * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks, u8 iv[])
         * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
         *                       int rounds, int blocks, u8 iv[],
         *                       u32 const rk2[]);
         * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
         *                       int rounds, int blocks, u8 iv[],
         *                       u32 const rk2[]);
         */

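        /*
         * The ESSIV entry points first encrypt the IV with the second
         * key schedule (rk2), which is always an AES-256 key (hence the
         * hardcoded 14 rounds), to derive the per-sector IV, and then
         * fall through to the regular CBC code using the first key (rk1).
         */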
AES_FUNC_START(aes_essiv_cbc_encrypt)
        ld1             {v4.16b}, [x5]                  /* get iv */

        mov             w8, #14                         /* AES-256: 14 rounds */
        enc_prepare     w8, x6, x7
        encrypt_block   v4, w8, x6, x7, w9
        enc_switch_key  w3, x2, x6
        b               .Lcbcencloop4x

AES_FUNC_START(aes_cbc_encrypt)
        ld1             {v4.16b}, [x5]                  /* get iv */
        enc_prepare     w3, x2, x6

.Lcbcencloop4x:
        subs            w4, w4, #4
        bmi             .Lcbcenc1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
        eor             v0.16b, v0.16b, v4.16b          /* ..and xor with iv */
        encrypt_block   v0, w3, x2, x6, w7
        eor             v1.16b, v1.16b, v0.16b
        encrypt_block   v1, w3, x2, x6, w7
        eor             v2.16b, v2.16b, v1.16b
        encrypt_block   v2, w3, x2, x6, w7
        eor             v3.16b, v3.16b, v2.16b
        encrypt_block   v3, w3, x2, x6, w7
        st1             {v0.16b-v3.16b}, [x0], #64
        mov             v4.16b, v3.16b
        b               .Lcbcencloop4x
.Lcbcenc1x:
        adds            w4, w4, #4
        beq             .Lcbcencout
.Lcbcencloop:
        ld1             {v0.16b}, [x1], #16             /* get next pt block */
        eor             v4.16b, v4.16b, v0.16b          /* ..and xor with iv */
        encrypt_block   v4, w3, x2, x6, w7
        st1             {v4.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lcbcencloop
.Lcbcencout:
        st1             {v4.16b}, [x5]                  /* return iv */
        ret
AES_FUNC_END(aes_cbc_encrypt)
AES_FUNC_END(aes_essiv_cbc_encrypt)

AES_FUNC_START(aes_essiv_cbc_decrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        ld1             {cbciv.16b}, [x5]               /* get iv */

        mov             w8, #14                         /* AES-256: 14 rounds */
        enc_prepare     w8, x6, x7
        encrypt_block   cbciv, w8, x6, x7, w9
        b               .Lessivcbcdecstart

AES_FUNC_START(aes_cbc_decrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        ld1             {cbciv.16b}, [x5]               /* get iv */
.Lessivcbcdecstart:
        dec_prepare     w3, x2, x6

.LcbcdecloopNx:
        subs            w4, w4, #MAX_STRIDE
        bmi             .Lcbcdec1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
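        /*
         * The interleaved decrypt helpers clobber v0-v4, so stash copies
         * of the ciphertext blocks first: they are needed afterwards as
         * the CBC chaining values.  Blocks that cannot be kept in spare
         * registers are simply reloaded from the source buffer.
         */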
#if MAX_STRIDE == 5
        ld1             {v4.16b}, [x1], #16             /* get 1 ct block */
        mov             v5.16b, v0.16b
        mov             v6.16b, v1.16b
        mov             v7.16b, v2.16b
        bl              aes_decrypt_block5x
        sub             x1, x1, #32
        eor             v0.16b, v0.16b, cbciv.16b
        eor             v1.16b, v1.16b, v5.16b
        ld1             {v5.16b}, [x1], #16             /* reload 1 ct block */
        ld1             {cbciv.16b}, [x1], #16          /* reload 1 ct block */
        eor             v2.16b, v2.16b, v6.16b
        eor             v3.16b, v3.16b, v7.16b
        eor             v4.16b, v4.16b, v5.16b
#else
        mov             v4.16b, v0.16b
        mov             v5.16b, v1.16b
        mov             v6.16b, v2.16b
        bl              aes_decrypt_block4x
        sub             x1, x1, #16
        eor             v0.16b, v0.16b, cbciv.16b
        eor             v1.16b, v1.16b, v4.16b
        ld1             {cbciv.16b}, [x1], #16          /* reload 1 ct block */
        eor             v2.16b, v2.16b, v5.16b
        eor             v3.16b, v3.16b, v6.16b
#endif
        st1             {v0.16b-v3.16b}, [x0], #64
ST5(    st1             {v4.16b}, [x0], #16             )
        b               .LcbcdecloopNx
.Lcbcdec1x:
        adds            w4, w4, #MAX_STRIDE
        beq             .Lcbcdecout
.Lcbcdecloop:
        ld1             {v1.16b}, [x1], #16             /* get next ct block */
        mov             v0.16b, v1.16b                  /* ...and copy to v0 */
        decrypt_block   v0, w3, x2, x6, w7
        eor             v0.16b, v0.16b, cbciv.16b       /* xor with iv => pt */
        mov             cbciv.16b, v1.16b               /* ct is next iv */
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lcbcdecloop
.Lcbcdecout:
        st1             {cbciv.16b}, [x5]               /* return iv */
        ldp             x29, x30, [sp], #16
        ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)


        /*
         * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
         *                     int rounds, int bytes, u8 const iv[])
         * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
         *                     int rounds, int bytes, u8 const iv[])
         */

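        /*
         * Ciphertext stealing (CS3 style) for the final two blocks: the
         * input is read with overlapping loads and permuted via
         * .Lcts_permute_table so that the partial block is zero-padded,
         * and the last two ciphertext blocks are emitted in swapped
         * order through overlapping stores.
         */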
AES_FUNC_START(aes_cbc_cts_encrypt)
        adr_l           x8, .Lcts_permute_table
        sub             x4, x4, #16
        add             x9, x8, #32
        add             x8, x8, x4
        sub             x9, x9, x4
        ld1             {v3.16b}, [x8]
        ld1             {v4.16b}, [x9]

        ld1             {v0.16b}, [x1], x4              /* overlapping loads */
        ld1             {v1.16b}, [x1]

        ld1             {v5.16b}, [x5]                  /* get iv */
        enc_prepare     w3, x2, x6

        eor             v0.16b, v0.16b, v5.16b          /* xor with iv */
        tbl             v1.16b, {v1.16b}, v4.16b
        encrypt_block   v0, w3, x2, x6, w7

        eor             v1.16b, v1.16b, v0.16b
        tbl             v0.16b, {v0.16b}, v3.16b
        encrypt_block   v1, w3, x2, x6, w7

        add             x4, x0, x4
        st1             {v0.16b}, [x4]                  /* overlapping stores */
        st1             {v1.16b}, [x0]
        ret
AES_FUNC_END(aes_cbc_cts_encrypt)

AES_FUNC_START(aes_cbc_cts_decrypt)
        adr_l           x8, .Lcts_permute_table
        sub             x4, x4, #16
        add             x9, x8, #32
        add             x8, x8, x4
        sub             x9, x9, x4
        ld1             {v3.16b}, [x8]
        ld1             {v4.16b}, [x9]

        ld1             {v0.16b}, [x1], x4              /* overlapping loads */
        ld1             {v1.16b}, [x1]

        ld1             {v5.16b}, [x5]                  /* get iv */
        dec_prepare     w3, x2, x6

        decrypt_block   v0, w3, x2, x6, w7
        tbl             v2.16b, {v0.16b}, v3.16b
        eor             v2.16b, v2.16b, v1.16b

        tbx             v0.16b, {v1.16b}, v4.16b
        decrypt_block   v0, w3, x2, x6, w7
        eor             v0.16b, v0.16b, v5.16b          /* xor with iv */

        add             x4, x0, x4
        st1             {v2.16b}, [x4]                  /* overlapping stores */
        st1             {v0.16b}, [x0]
        ret
AES_FUNC_END(aes_cbc_cts_decrypt)

        .section        ".rodata", "a"
        .align          6
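        /*
         * Sliding window of byte indexes: a 16-byte vector loaded at
         * offset n (0 < n <= 16) yields 16 - n leading 0xff bytes
         * followed by the indexes 0 .. n-1, while offset 32 - n shifts
         * the indexes the other way.  Since tbl/tbx treat index 0xff as
         * "produce zero" / "leave the destination byte", this gives the
         * byte shifts and zero padding needed for CTS and for partial
         * final blocks in CTR/XCTR mode.
         */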
.Lcts_permute_table:
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .previous

        /*
         * This macro generates the code for CTR and XCTR mode.  CTR
         * treats the whole 16-byte IV as a big-endian counter, while
         * XCTR (the CTR variant used by HCTR2) XORs a little-endian
         * 64-bit block counter into the low half of the IV.
         */
.macro ctr_encrypt xctr
        // Arguments
        OUT             .req x0
        IN              .req x1
        KEY             .req x2
        ROUNDS_W        .req w3
        BYTES_W         .req w4
        IV              .req x5
        BYTE_CTR_W      .req w6         // XCTR only
        // Intermediate values
        CTR_W           .req w11        // XCTR only
        CTR             .req x11        // XCTR only
        IV_PART         .req x12
        BLOCKS          .req x13
        BLOCKS_W        .req w13

        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        enc_prepare     ROUNDS_W, KEY, IV_PART
        ld1             {vctr.16b}, [IV]

        /*
         * Keep 64 bits of the IV in a register.  For CTR mode this lets us
         * easily increment the IV.  For XCTR mode this lets us efficiently XOR
         * the 64-bit counter with the IV.
         */
        .if \xctr
                umov            IV_PART, vctr.d[0]
                lsr             CTR_W, BYTE_CTR_W, #4
        .else
                umov            IV_PART, vctr.d[1]
                rev             IV_PART, IV_PART
        .endif

.LctrloopNx\xctr:
        add             BLOCKS_W, BYTES_W, #15
        sub             BYTES_W, BYTES_W, #MAX_STRIDE << 4
        lsr             BLOCKS_W, BLOCKS_W, #4
        mov             w8, #MAX_STRIDE
        cmp             BLOCKS_W, w8
        csel            BLOCKS_W, BLOCKS_W, w8, lt
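        /* i.e. BLOCKS = min(DIV_ROUND_UP(bytes, 16), MAX_STRIDE) */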

        /*
         * Set up the counter values in v0-v{MAX_STRIDE-1}.
         *
         * If we are encrypting less than MAX_STRIDE blocks, the tail block
         * handling code expects the last keystream block to be in
         * v{MAX_STRIDE-1}.  For example: if encrypting two blocks with
         * MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks.
         */
        .if \xctr
                add             CTR, CTR, BLOCKS
        .else
                adds            IV_PART, IV_PART, BLOCKS
        .endif
        mov             v0.16b, vctr.16b
        mov             v1.16b, vctr.16b
        mov             v2.16b, vctr.16b
        mov             v3.16b, vctr.16b
ST5(    mov             v4.16b, vctr.16b                )
        .if \xctr
                sub             x6, CTR, #MAX_STRIDE - 1
                sub             x7, CTR, #MAX_STRIDE - 2
                sub             x8, CTR, #MAX_STRIDE - 3
                sub             x9, CTR, #MAX_STRIDE - 4
ST5(            sub             x10, CTR, #MAX_STRIDE - 5       )
                eor             x6, x6, IV_PART
                eor             x7, x7, IV_PART
                eor             x8, x8, IV_PART
                eor             x9, x9, IV_PART
ST5(            eor             x10, x10, IV_PART               )
                mov             v0.d[0], x6
                mov             v1.d[0], x7
                mov             v2.d[0], x8
                mov             v3.d[0], x9
ST5(            mov             v4.d[0], x10                    )
        .else
                bcs             0f
                .subsection     1
                /*
                 * This subsection handles carries.
                 *
                 * Conditional branching here is allowed with respect to time
                 * invariance since the branches are dependent on the IV instead
                 * of the plaintext or key.  This code is rarely executed in
                 * practice anyway.
                 */

                /* Apply carry to outgoing counter. */
0:              umov            x8, vctr.d[0]
                rev             x8, x8
                add             x8, x8, #1
                rev             x8, x8
                ins             vctr.d[0], x8

                /*
                 * Apply carry to counter blocks if needed.
                 *
                 * Since the carry flag was set, we know 0 <= IV_PART <
                 * MAX_STRIDE.  Using the value of IV_PART we can determine how
                 * many counter blocks need to be updated.
                 */
                cbz             IV_PART, 2f
                adr             x16, 1f
                sub             x16, x16, IV_PART, lsl #3
                br              x16
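                /*
                 * Each (bti, mov) pair below is 8 bytes, so the computed
                 * branch above lands such that exactly the last IV_PART
                 * pairs execute, giving the incremented upper counter
                 * word to precisely the blocks whose lower 64 counter
                 * bits wrapped.
                 */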
                bti             c
                mov             v0.d[0], vctr.d[0]
                bti             c
                mov             v1.d[0], vctr.d[0]
                bti             c
                mov             v2.d[0], vctr.d[0]
                bti             c
                mov             v3.d[0], vctr.d[0]
ST5(            bti             c                               )
ST5(            mov             v4.d[0], vctr.d[0]              )
1:              b               2f
                .previous

2:              rev             x7, IV_PART
                ins             vctr.d[1], x7
                sub             x7, IV_PART, #MAX_STRIDE - 1
                sub             x8, IV_PART, #MAX_STRIDE - 2
                sub             x9, IV_PART, #MAX_STRIDE - 3
                rev             x7, x7
                rev             x8, x8
                mov             v1.d[1], x7
                rev             x9, x9
ST5(            sub             x10, IV_PART, #MAX_STRIDE - 4   )
                mov             v2.d[1], x8
ST5(            rev             x10, x10                        )
                mov             v3.d[1], x9
ST5(            mov             v4.d[1], x10                    )
        .endif

        /*
         * If there are at least MAX_STRIDE blocks left, XOR the data with
         * keystream and store.  Otherwise jump to tail handling.
         */
        tbnz            BYTES_W, #31, .Lctrtail\xctr
        ld1             {v5.16b-v7.16b}, [IN], #48
ST4(    bl              aes_encrypt_block4x             )
ST5(    bl              aes_encrypt_block5x             )
        eor             v0.16b, v5.16b, v0.16b
ST4(    ld1             {v5.16b}, [IN], #16             )
        eor             v1.16b, v6.16b, v1.16b
ST5(    ld1             {v5.16b-v6.16b}, [IN], #32      )
        eor             v2.16b, v7.16b, v2.16b
        eor             v3.16b, v5.16b, v3.16b
ST5(    eor             v4.16b, v6.16b, v4.16b          )
        st1             {v0.16b-v3.16b}, [OUT], #64
ST5(    st1             {v4.16b}, [OUT], #16            )
        cbz             BYTES_W, .Lctrout\xctr
        b               .LctrloopNx\xctr

.Lctrout\xctr:
        .if !\xctr
                st1             {vctr.16b}, [IV] /* return next CTR value */
        .endif
        ldp             x29, x30, [sp], #16
        ret

.Lctrtail\xctr:
        /*
         * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
         *
         * This code expects the last keystream block to be in v{MAX_STRIDE-1}.
         * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and
         * v4 should have the next two counter blocks.
         *
         * This allows us to store the ciphertext by writing to overlapping
         * regions of memory.  Any invalid ciphertext blocks get overwritten by
         * correctly computed blocks.  This approach greatly simplifies the
         * logic for storing the ciphertext.
         */
        mov             x16, #16
        ands            w7, BYTES_W, #0xf
        csel            x13, x7, x16, ne

ST5(    cmp             BYTES_W, #64 - (MAX_STRIDE << 4))
ST5(    csel            x14, x16, xzr, gt               )
        cmp             BYTES_W, #48 - (MAX_STRIDE << 4)
        csel            x15, x16, xzr, gt
        cmp             BYTES_W, #32 - (MAX_STRIDE << 4)
        csel            x16, x16, xzr, gt
        cmp             BYTES_W, #16 - (MAX_STRIDE << 4)
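        /*
         * x13 is now the size of the final (possibly partial) block,
         * and x14-x16 are 16 or 0 depending on whether a full block
         * remains for the corresponding slot, so the loads and stores
         * below only advance past blocks that actually exist.
         */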

        adr_l           x9, .Lcts_permute_table
        add             x9, x9, x13
        ble             .Lctrtail1x\xctr

ST5(    ld1             {v5.16b}, [IN], x14             )
        ld1             {v6.16b}, [IN], x15
        ld1             {v7.16b}, [IN], x16

ST4(    bl              aes_encrypt_block4x             )
ST5(    bl              aes_encrypt_block5x             )

        ld1             {v8.16b}, [IN], x13
        ld1             {v9.16b}, [IN]
        ld1             {v10.16b}, [x9]

ST4(    eor             v6.16b, v6.16b, v0.16b          )
ST4(    eor             v7.16b, v7.16b, v1.16b          )
ST4(    tbl             v3.16b, {v3.16b}, v10.16b       )
ST4(    eor             v8.16b, v8.16b, v2.16b          )
ST4(    eor             v9.16b, v9.16b, v3.16b          )

ST5(    eor             v5.16b, v5.16b, v0.16b          )
ST5(    eor             v6.16b, v6.16b, v1.16b          )
ST5(    tbl             v4.16b, {v4.16b}, v10.16b       )
ST5(    eor             v7.16b, v7.16b, v2.16b          )
ST5(    eor             v8.16b, v8.16b, v3.16b          )
ST5(    eor             v9.16b, v9.16b, v4.16b          )

ST5(    st1             {v5.16b}, [OUT], x14            )
        st1             {v6.16b}, [OUT], x15
        st1             {v7.16b}, [OUT], x16
        add             x13, x13, OUT
        st1             {v9.16b}, [x13]         // overlapping stores
        st1             {v8.16b}, [OUT]
        b               .Lctrout\xctr

.Lctrtail1x\xctr:
        /*
         * Handle <= 16 bytes of plaintext
         *
         * This code always reads and writes 16 bytes.  To avoid out of bounds
         * accesses, XCTR and CTR modes must use a temporary buffer when
         * encrypting/decrypting less than 16 bytes.
         *
         * This code is unusual in that it loads the input and stores the output
         * relative to the end of the buffers rather than relative to the start.
         * This causes unusual behaviour when encrypting/decrypting less than 16
         * bytes; the end of the data is expected to be at the end of the
         * temporary buffer rather than the start of the data being at the start
         * of the temporary buffer.
         */
        sub             x8, x7, #16
        csel            x7, x7, x8, eq
        add             IN, IN, x7
        add             OUT, OUT, x7
        ld1             {v5.16b}, [IN]
        ld1             {v6.16b}, [OUT]
ST5(    mov             v3.16b, v4.16b                  )
        encrypt_block   v3, ROUNDS_W, KEY, x8, w7
        ld1             {v10.16b-v11.16b}, [x9]
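        /*
         * v10 aligns the keystream with the data sitting at the end of
         * the 16-byte window; v11, arithmetic-shifted into all-ones /
         * all-zeroes bytes, becomes the mask that makes bif preserve
         * the bytes preceding the data.
         */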
        tbl             v3.16b, {v3.16b}, v10.16b
        sshr            v11.16b, v11.16b, #7
        eor             v5.16b, v5.16b, v3.16b
        bif             v5.16b, v6.16b, v11.16b
        st1             {v5.16b}, [OUT]
        b               .Lctrout\xctr

        // Arguments
        .unreq OUT
        .unreq IN
        .unreq KEY
        .unreq ROUNDS_W
        .unreq BYTES_W
        .unreq IV
        .unreq BYTE_CTR_W       // XCTR only
        // Intermediate values
        .unreq CTR_W            // XCTR only
        .unreq CTR              // XCTR only
        .unreq IV_PART
        .unreq BLOCKS
        .unreq BLOCKS_W
.endm

        /*
         * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int bytes, u8 ctr[])
         *
         * The input and output buffers must always be at least 16 bytes even if
         * encrypting/decrypting less than 16 bytes.  Otherwise out of bounds
         * accesses will occur.  The data to be encrypted/decrypted is expected
         * to be at the end of this 16-byte temporary buffer rather than the
         * start.
         */

AES_FUNC_START(aes_ctr_encrypt)
        ctr_encrypt 0
AES_FUNC_END(aes_ctr_encrypt)

        /*
         * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int bytes, u8 const iv[], int byte_ctr)
         *
         * The input and output buffers must always be at least 16 bytes even if
         * encrypting/decrypting less than 16 bytes.  Otherwise out of bounds
         * accesses will occur.  The data to be encrypted/decrypted is expected
         * to be at the end of this 16-byte temporary buffer rather than the
         * start.
         */

AES_FUNC_START(aes_xctr_encrypt)
        ctr_encrypt 1
AES_FUNC_END(aes_xctr_encrypt)


        /*
         * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
         *                 int bytes, u8 const rk2[], u8 iv[], int first)
         * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
         *                 int bytes, u8 const rk2[], u8 iv[], int first)
         */

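        /*
         * next_tweak computes the next XTS tweak, i.e. multiplies the
         * 128-bit tweak by x in GF(2^128) with reduction polynomial
         * x^128 + x^7 + x^2 + x + 1: both 64-bit halves are doubled,
         * the bit shifted out of the low half is carried into the high
         * half, and the bit shifted out of the high half is folded back
         * into the low byte as 0x87.  xtsmask (set up by xts_load_mask)
         * holds the {0x1, 0x87} constants this requires.
         */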
        .macro          next_tweak, out, in, tmp
        sshr            \tmp\().2d,  \in\().2d,   #63
        and             \tmp\().16b, \tmp\().16b, xtsmask.16b
        add             \out\().2d,  \in\().2d,   \in\().2d
        ext             \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
        eor             \out\().16b, \out\().16b, \tmp\().16b
        .endm

        .macro          xts_load_mask, tmp
        movi            xtsmask.2s, #0x1
        movi            \tmp\().2s, #0x87
        uzp1            xtsmask.4s, xtsmask.4s, \tmp\().4s
        .endm

AES_FUNC_START(aes_xts_encrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        ld1             {v4.16b}, [x6]
        xts_load_mask   v8
        cbz             w7, .Lxtsencnotfirst

        enc_prepare     w3, x5, x8
        xts_cts_skip_tw w7, .LxtsencNx
        encrypt_block   v4, w3, x5, x8, w7              /* first tweak */
        enc_switch_key  w3, x2, x8
        b               .LxtsencNx

.Lxtsencnotfirst:
        enc_prepare     w3, x2, x8
.LxtsencloopNx:
        next_tweak      v4, v4, v8
.LxtsencNx:
        subs            w4, w4, #64
        bmi             .Lxtsenc1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
        next_tweak      v5, v4, v8
        eor             v0.16b, v0.16b, v4.16b
        next_tweak      v6, v5, v8
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        next_tweak      v7, v6, v8
        eor             v3.16b, v3.16b, v7.16b
        bl              aes_encrypt_block4x
        eor             v3.16b, v3.16b, v7.16b
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        st1             {v0.16b-v3.16b}, [x0], #64
        mov             v4.16b, v7.16b
        cbz             w4, .Lxtsencret
        xts_reload_mask v8
        b               .LxtsencloopNx
.Lxtsenc1x:
        adds            w4, w4, #64
        beq             .Lxtsencout
        subs            w4, w4, #16
        bmi             .LxtsencctsNx
.Lxtsencloop:
        ld1             {v0.16b}, [x1], #16
.Lxtsencctsout:
        eor             v0.16b, v0.16b, v4.16b
        encrypt_block   v0, w3, x2, x8, w7
        eor             v0.16b, v0.16b, v4.16b
        cbz             w4, .Lxtsencout
        subs            w4, w4, #16
        next_tweak      v4, v4, v8
        bmi             .Lxtsenccts
        st1             {v0.16b}, [x0], #16
        b               .Lxtsencloop
.Lxtsencout:
        st1             {v0.16b}, [x0]
.Lxtsencret:
        st1             {v4.16b}, [x6]
        ldp             x29, x30, [sp], #16
        ret

.LxtsencctsNx:
        mov             v0.16b, v3.16b
        sub             x0, x0, #16
.Lxtsenccts:
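        /*
         * Ciphertext stealing: v0 holds the previous ciphertext block.
         * Its leading bytes are emitted as the final partial block, its
         * stolen tail is merged (tbx) with the remaining plaintext
         * bytes, and the merged block is encrypted with the last tweak
         * via .Lxtsencctsout.
         */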
        adr_l           x8, .Lcts_permute_table

        add             x1, x1, w4, sxtw        /* rewind input pointer */
        add             w4, w4, #16             /* # bytes in final block */
        add             x9, x8, #32
        add             x8, x8, x4
        sub             x9, x9, x4
        add             x4, x0, x4              /* output address of final block */

        ld1             {v1.16b}, [x1]          /* load final block */
        ld1             {v2.16b}, [x8]
        ld1             {v3.16b}, [x9]

        tbl             v2.16b, {v0.16b}, v2.16b
        tbx             v0.16b, {v1.16b}, v3.16b
        st1             {v2.16b}, [x4]                  /* overlapping stores */
        mov             w4, wzr
        b               .Lxtsencctsout
AES_FUNC_END(aes_xts_encrypt)

AES_FUNC_START(aes_xts_decrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        /* subtract 16 bytes if we are doing CTS */
        sub             w8, w4, #0x10
        tst             w4, #0xf
        csel            w4, w4, w8, eq

        ld1             {v4.16b}, [x6]
        xts_load_mask   v8
        xts_cts_skip_tw w7, .Lxtsdecskiptw
        cbz             w7, .Lxtsdecnotfirst

        enc_prepare     w3, x5, x8
        encrypt_block   v4, w3, x5, x8, w7              /* first tweak */
.Lxtsdecskiptw:
        dec_prepare     w3, x2, x8
        b               .LxtsdecNx

.Lxtsdecnotfirst:
        dec_prepare     w3, x2, x8
.LxtsdecloopNx:
        next_tweak      v4, v4, v8
.LxtsdecNx:
        subs            w4, w4, #64
        bmi             .Lxtsdec1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
        next_tweak      v5, v4, v8
        eor             v0.16b, v0.16b, v4.16b
        next_tweak      v6, v5, v8
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        next_tweak      v7, v6, v8
        eor             v3.16b, v3.16b, v7.16b
        bl              aes_decrypt_block4x
        eor             v3.16b, v3.16b, v7.16b
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        st1             {v0.16b-v3.16b}, [x0], #64
        mov             v4.16b, v7.16b
        cbz             w4, .Lxtsdecout
        xts_reload_mask v8
        b               .LxtsdecloopNx
.Lxtsdec1x:
        adds            w4, w4, #64
        beq             .Lxtsdecout
        subs            w4, w4, #16
.Lxtsdecloop:
        ld1             {v0.16b}, [x1], #16
        bmi             .Lxtsdeccts
.Lxtsdecctsout:
        eor             v0.16b, v0.16b, v4.16b
        decrypt_block   v0, w3, x2, x8, w7
        eor             v0.16b, v0.16b, v4.16b
        st1             {v0.16b}, [x0], #16
        cbz             w4, .Lxtsdecout
        subs            w4, w4, #16
        next_tweak      v4, v4, v8
        b               .Lxtsdecloop
.Lxtsdecout:
        st1             {v4.16b}, [x6]
        ldp             x29, x30, [sp], #16
        ret

.Lxtsdeccts:
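        /*
         * Ciphertext stealing: the tweak order is swapped.  The last
         * complete ciphertext block (v0) is decrypted with the final
         * tweak (v5), ciphertext is stolen from it to pad the partial
         * block, and the merged block is then decrypted with the
         * preceding tweak (v4) via .Lxtsdecctsout.
         */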
        adr_l           x8, .Lcts_permute_table

        add             x1, x1, w4, sxtw        /* rewind input pointer */
        add             w4, w4, #16             /* # bytes in final block */
        add             x9, x8, #32
        add             x8, x8, x4
        sub             x9, x9, x4
        add             x4, x0, x4              /* output address of final block */

        next_tweak      v5, v4, v8

        ld1             {v1.16b}, [x1]          /* load final block */
        ld1             {v2.16b}, [x8]
        ld1             {v3.16b}, [x9]

        eor             v0.16b, v0.16b, v5.16b
        decrypt_block   v0, w3, x2, x8, w7
        eor             v0.16b, v0.16b, v5.16b

        tbl             v2.16b, {v0.16b}, v2.16b
        tbx             v0.16b, {v1.16b}, v3.16b

        st1             {v2.16b}, [x4]                  /* overlapping stores */
        mov             w4, wzr
        b               .Lxtsdecctsout
AES_FUNC_END(aes_xts_decrypt)

        /*
         * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
         *                int blocks, u8 dg[], int enc_before, int enc_after)
         */
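        /*
         * CBC-MAC core: dg is XORed with each input block and encrypted.
         * enc_before forces an extra encryption of dg on entry; when
         * enc_after is zero, encryption of the final XOR result is left
         * to the caller (as needed for CMAC/XCBC finalization).  Returns
         * the number of blocks still to be processed if the loop had to
         * yield early.
         */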
AES_FUNC_START(aes_mac_update)
        ld1             {v0.16b}, [x4]                  /* get dg */
        enc_prepare     w2, x1, x7
        cbz             w5, .Lmacloop4x

        encrypt_block   v0, w2, x1, x7, w8

.Lmacloop4x:
        subs            w3, w3, #4
        bmi             .Lmac1x
        ld1             {v1.16b-v4.16b}, [x0], #64      /* get next 4 pt blocks */
        eor             v0.16b, v0.16b, v1.16b          /* ..and xor with dg */
        encrypt_block   v0, w2, x1, x7, w8
        eor             v0.16b, v0.16b, v2.16b
        encrypt_block   v0, w2, x1, x7, w8
        eor             v0.16b, v0.16b, v3.16b
        encrypt_block   v0, w2, x1, x7, w8
        eor             v0.16b, v0.16b, v4.16b
        cmp             w3, wzr
        csinv           x5, x6, xzr, eq
        cbz             w5, .Lmacout
        encrypt_block   v0, w2, x1, x7, w8
        st1             {v0.16b}, [x4]                  /* return dg */
        cond_yield      .Lmacout, x7, x8
        b               .Lmacloop4x
.Lmac1x:
        add             w3, w3, #4
.Lmacloop:
        cbz             w3, .Lmacout
        ld1             {v1.16b}, [x0], #16             /* get next pt block */
        eor             v0.16b, v0.16b, v1.16b          /* ..and xor with dg */

        subs            w3, w3, #1
        csinv           x5, x6, xzr, eq
        cbz             w5, .Lmacout

.Lmacenc:
        encrypt_block   v0, w2, x1, x7, w8
        b               .Lmacloop

.Lmacout:
        st1             {v0.16b}, [x4]                  /* return dg */
        mov             w0, w3
        ret
AES_FUNC_END(aes_mac_update)