arch/arm64/crypto/ghash-ce-core.S

   1 /* SPDX-License-Identifier: GPL-2.0-only */
   2 /*
   3  * Accelerated GHASH implementation with ARMv8 PMULL instructions.
   4  *
   5  * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
   6  */
   7
   8 #include <linux/linkage.h>
   9 #include <asm/assembler.h>
  10
  11         SHASH           .req    v0
  12         SHASH2          .req    v1
  13         T1              .req    v2
  14         T2              .req    v3
  15         MASK            .req    v4
  16         XM              .req    v5
  17         XL              .req    v6
  18         XH              .req    v7
  19         IN1             .req    v7
  20
  21         k00_16          .req    v8
  22         k32_48          .req    v9
  23
  24         t3              .req    v10
  25         t4              .req    v11
  26         t5              .req    v12
  27         t6              .req    v13
  28         t7              .req    v14
  29         t8              .req    v15
  30         t9              .req    v16
  31
  32         perm1           .req    v17
  33         perm2           .req    v18
  34         perm3           .req    v19
  35
  36         sh1             .req    v20
  37         sh2             .req    v21
  38         sh3             .req    v22
  39         sh4             .req    v23
  40
  41         ss1             .req    v24
  42         ss2             .req    v25
  43         ss3             .req    v26
  44         ss4             .req    v27
  45
  46         XL2             .req    v8
  47         XM2             .req    v9
  48         XH2             .req    v10
  49         XL3             .req    v11
  50         XM3             .req    v12
  51         XH3             .req    v13
  52         TT3             .req    v14
  53         TT4             .req    v15
  54         HH              .req    v16
  55         HH3             .req    v17
  56         HH4             .req    v18
  57         HH34            .req    v19
  58
  59         .text
  60         .arch           armv8-a+crypto
  61
  62         .macro          __pmull_p64, rd, rn, rm
  63         pmull           \rd\().1q, \rn\().1d, \rm\().1d
  64         .endm
  65
  66         .macro          __pmull2_p64, rd, rn, rm
  67         pmull2          \rd\().1q, \rn\().2d, \rm\().2d
  68         .endm
  69
  70         .macro          __pmull_p8, rq, ad, bd
  71         ext             t3.8b, \ad\().8b, \ad\().8b, #1         // A1
  72         ext             t5.8b, \ad\().8b, \ad\().8b, #2         // A2
  73         ext             t7.8b, \ad\().8b, \ad\().8b, #3         // A3
  74
  75         __pmull_p8_\bd  \rq, \ad
  76         .endm
  77
  78         .macro          __pmull2_p8, rq, ad, bd
  79         tbl             t3.16b, {\ad\().16b}, perm1.16b         // A1
  80         tbl             t5.16b, {\ad\().16b}, perm2.16b         // A2
  81         tbl             t7.16b, {\ad\().16b}, perm3.16b         // A3
  82
  83         __pmull2_p8_\bd \rq, \ad
  84         .endm
  85
  86         .macro          __pmull_p8_SHASH, rq, ad
  87         __pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
  88         .endm
  89
  90         .macro          __pmull_p8_SHASH2, rq, ad
  91         __pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
  92         .endm
  93
  94         .macro          __pmull2_p8_SHASH, rq, ad
  95         __pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
  96         .endm
  97
  98         .macro          __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
  99         pmull\t         t3.8h, t3.\nb, \bd                      // F = A1*B
 100         pmull\t         t4.8h, \ad, \b1\().\nb                  // E = A*B1
 101         pmull\t         t5.8h, t5.\nb, \bd                      // H = A2*B
 102         pmull\t         t6.8h, \ad, \b2\().\nb                  // G = A*B2
 103         pmull\t         t7.8h, t7.\nb, \bd                      // J = A3*B
 104         pmull\t         t8.8h, \ad, \b3\().\nb                  // I = A*B3
 105         pmull\t         t9.8h, \ad, \b4\().\nb                  // K = A*B4
 106         pmull\t         \rq\().8h, \ad, \bd                     // D = A*B
 107
 108         eor             t3.16b, t3.16b, t4.16b                  // L = E + F
 109         eor             t5.16b, t5.16b, t6.16b                  // M = G + H
 110         eor             t7.16b, t7.16b, t8.16b                  // N = I + J
 111
 112         uzp1            t4.2d, t3.2d, t5.2d
 113         uzp2            t3.2d, t3.2d, t5.2d
 114         uzp1            t6.2d, t7.2d, t9.2d
 115         uzp2            t7.2d, t7.2d, t9.2d
 116
 117         // t3 = (L) (P0 + P1) << 8
 118         // t5 = (M) (P2 + P3) << 16
 119         eor             t4.16b, t4.16b, t3.16b
 120         and             t3.16b, t3.16b, k32_48.16b
 121
 122         // t7 = (N) (P4 + P5) << 24
 123         // t9 = (K) (P6 + P7) << 32
 124         eor             t6.16b, t6.16b, t7.16b
 125         and             t7.16b, t7.16b, k00_16.16b
 126
 127         eor             t4.16b, t4.16b, t3.16b
 128         eor             t6.16b, t6.16b, t7.16b
 129
 130         zip2            t5.2d, t4.2d, t3.2d
 131         zip1            t3.2d, t4.2d, t3.2d
 132         zip2            t9.2d, t6.2d, t7.2d
 133         zip1            t7.2d, t6.2d, t7.2d
 134
 135         ext             t3.16b, t3.16b, t3.16b, #15
 136         ext             t5.16b, t5.16b, t5.16b, #14
 137         ext             t7.16b, t7.16b, t7.16b, #13
 138         ext             t9.16b, t9.16b, t9.16b, #12
 139
 140         eor             t3.16b, t3.16b, t5.16b
 141         eor             t7.16b, t7.16b, t9.16b
 142         eor             \rq\().16b, \rq\().16b, t3.16b
 143         eor             \rq\().16b, \rq\().16b, t7.16b
 144         .endm
 145
 146         .macro          __pmull_pre_p64
 147         add             x8, x3, #16
 148         ld1             {HH.2d-HH4.2d}, [x8]
 149
 150         trn1            SHASH2.2d, SHASH.2d, HH.2d
 151         trn2            T1.2d, SHASH.2d, HH.2d
 152         eor             SHASH2.16b, SHASH2.16b, T1.16b
 153
 154         trn1            HH34.2d, HH3.2d, HH4.2d
 155         trn2            T1.2d, HH3.2d, HH4.2d
 156         eor             HH34.16b, HH34.16b, T1.16b
 157
 158         movi            MASK.16b, #0xe1
 159         shl             MASK.2d, MASK.2d, #57
 160         .endm
 161
 162         .macro          __pmull_pre_p8
 163         ext             SHASH2.16b, SHASH.16b, SHASH.16b, #8
 164         eor             SHASH2.16b, SHASH2.16b, SHASH.16b
 165
 166         // k00_16 := 0x0000000000000000_000000000000ffff
 167         // k32_48 := 0x00000000ffffffff_0000ffffffffffff
 168         movi            k32_48.2d, #0xffffffff
 169         mov             k32_48.h[2], k32_48.h[0]
 170         ushr            k00_16.2d, k32_48.2d, #32
 171
 172         // prepare the permutation vectors
 173         mov_q           x5, 0x080f0e0d0c0b0a09
 174         movi            T1.8b, #8
 175         dup             perm1.2d, x5
 176         eor             perm1.16b, perm1.16b, T1.16b
 177         ushr            perm2.2d, perm1.2d, #8
 178         ushr            perm3.2d, perm1.2d, #16
 179         ushr            T1.2d, perm1.2d, #24
 180         sli             perm2.2d, perm1.2d, #56
 181         sli             perm3.2d, perm1.2d, #48
 182         sli             T1.2d, perm1.2d, #40
 183
 184         // precompute loop invariants
 185         tbl             sh1.16b, {SHASH.16b}, perm1.16b
 186         tbl             sh2.16b, {SHASH.16b}, perm2.16b
 187         tbl             sh3.16b, {SHASH.16b}, perm3.16b
 188         tbl             sh4.16b, {SHASH.16b}, T1.16b
 189         ext             ss1.8b, SHASH2.8b, SHASH2.8b, #1
 190         ext             ss2.8b, SHASH2.8b, SHASH2.8b, #2
 191         ext             ss3.8b, SHASH2.8b, SHASH2.8b, #3
 192         ext             ss4.8b, SHASH2.8b, SHASH2.8b, #4
 193         .endm
 194
 195         //
 196         // PMULL (64x64->128) based reduction for CPUs that can do
 197         // it in a single instruction.
 198         //
 199         .macro          __pmull_reduce_p64
 200         pmull           T2.1q, XL.1d, MASK.1d
 201         eor             XM.16b, XM.16b, T1.16b
 202
 203         mov             XH.d[0], XM.d[1]
 204         mov             XM.d[1], XL.d[0]
 205
 206         eor             XL.16b, XM.16b, T2.16b
 207         ext             T2.16b, XL.16b, XL.16b, #8
 208         pmull           XL.1q, XL.1d, MASK.1d
 209         .endm
 210
 211         //
 212         // Alternative reduction for CPUs that lack support for the
 213         // 64x64->128 PMULL instruction
 214         //
 215         .macro          __pmull_reduce_p8
 216         eor             XM.16b, XM.16b, T1.16b
 217
 218         mov             XL.d[1], XM.d[0]
 219         mov             XH.d[0], XM.d[1]
 220
 221         shl             T1.2d, XL.2d, #57
 222         shl             T2.2d, XL.2d, #62
 223         eor             T2.16b, T2.16b, T1.16b
 224         shl             T1.2d, XL.2d, #63
 225         eor             T2.16b, T2.16b, T1.16b
 226         ext             T1.16b, XL.16b, XH.16b, #8
 227         eor             T2.16b, T2.16b, T1.16b
 228
 229         mov             XL.d[1], T2.d[0]
 230         mov             XH.d[0], T2.d[1]
 231
 232         ushr            T2.2d, XL.2d, #1
 233         eor             XH.16b, XH.16b, XL.16b
 234         eor             XL.16b, XL.16b, T2.16b
 235         ushr            T2.2d, T2.2d, #6
 236         ushr            XL.2d, XL.2d, #1
 237         .endm
 238
 239         .macro          __pmull_ghash, pn
 240         ld1             {SHASH.2d}, [x3]
 241         ld1             {XL.2d}, [x1]
 242
 243         __pmull_pre_\pn
 244
 245         /* do the head block first, if supplied */
 246         cbz             x4, 0f
 247         ld1             {T1.2d}, [x4]
 248         mov             x4, xzr
 249         b               3f
 250
 251 0:      .ifc            \pn, p64
 252         tbnz            w0, #0, 2f              // skip until #blocks is a
 253         tbnz            w0, #1, 2f              // round multiple of 4
 254
 255 1:      ld1             {XM3.16b-TT4.16b}, [x2], #64
 256
 257         sub             w0, w0, #4
 258
 259         rev64           T1.16b, XM3.16b
 260         rev64           T2.16b, XH3.16b
 261         rev64           TT4.16b, TT4.16b
 262         rev64           TT3.16b, TT3.16b
 263
 264         ext             IN1.16b, TT4.16b, TT4.16b, #8
 265         ext             XL3.16b, TT3.16b, TT3.16b, #8
 266
 267         eor             TT4.16b, TT4.16b, IN1.16b
 268         pmull2          XH2.1q, SHASH.2d, IN1.2d        // a1 * b1
 269         pmull           XL2.1q, SHASH.1d, IN1.1d        // a0 * b0
 270         pmull           XM2.1q, SHASH2.1d, TT4.1d       // (a1 + a0)(b1 + b0)
 271
 272         eor             TT3.16b, TT3.16b, XL3.16b
 273         pmull2          XH3.1q, HH.2d, XL3.2d           // a1 * b1
 274         pmull           XL3.1q, HH.1d, XL3.1d           // a0 * b0
 275         pmull2          XM3.1q, SHASH2.2d, TT3.2d       // (a1 + a0)(b1 + b0)
 276
 277         ext             IN1.16b, T2.16b, T2.16b, #8
 278         eor             XL2.16b, XL2.16b, XL3.16b
 279         eor             XH2.16b, XH2.16b, XH3.16b
 280         eor             XM2.16b, XM2.16b, XM3.16b
 281
 282         eor             T2.16b, T2.16b, IN1.16b
 283         pmull2          XH3.1q, HH3.2d, IN1.2d          // a1 * b1
 284         pmull           XL3.1q, HH3.1d, IN1.1d          // a0 * b0
 285         pmull           XM3.1q, HH34.1d, T2.1d          // (a1 + a0)(b1 + b0)
 286
 287         eor             XL2.16b, XL2.16b, XL3.16b
 288         eor             XH2.16b, XH2.16b, XH3.16b
 289         eor             XM2.16b, XM2.16b, XM3.16b
 290
 291         ext             IN1.16b, T1.16b, T1.16b, #8
 292         ext             TT3.16b, XL.16b, XL.16b, #8
 293         eor             XL.16b, XL.16b, IN1.16b
 294         eor             T1.16b, T1.16b, TT3.16b
 295
 296         pmull2          XH.1q, HH4.2d, XL.2d            // a1 * b1
 297         eor             T1.16b, T1.16b, XL.16b
 298         pmull           XL.1q, HH4.1d, XL.1d            // a0 * b0
 299         pmull2          XM.1q, HH34.2d, T1.2d           // (a1 + a0)(b1 + b0)
 300
 301         eor             XL.16b, XL.16b, XL2.16b
 302         eor             XH.16b, XH.16b, XH2.16b
 303         eor             XM.16b, XM.16b, XM2.16b
 304
 305         eor             T2.16b, XL.16b, XH.16b
 306         ext             T1.16b, XL.16b, XH.16b, #8
 307         eor             XM.16b, XM.16b, T2.16b
 308
 309         __pmull_reduce_p64
 310
 311         eor             T2.16b, T2.16b, XH.16b
 312         eor             XL.16b, XL.16b, T2.16b
 313
 314         cbz             w0, 5f
 315         b               1b
 316         .endif
 317
 318 2:      ld1             {T1.2d}, [x2], #16
 319         sub             w0, w0, #1
 320
 321 3:      /* multiply XL by SHASH in GF(2^128) */
 322 CPU_LE( rev64           T1.16b, T1.16b  )
 323
 324         ext             T2.16b, XL.16b, XL.16b, #8
 325         ext             IN1.16b, T1.16b, T1.16b, #8
 326         eor             T1.16b, T1.16b, T2.16b
 327         eor             XL.16b, XL.16b, IN1.16b
 328
 329         __pmull2_\pn    XH, XL, SHASH                   // a1 * b1
 330         eor             T1.16b, T1.16b, XL.16b
 331         __pmull_\pn     XL, XL, SHASH                   // a0 * b0
 332         __pmull_\pn     XM, T1, SHASH2                  // (a1 + a0)(b1 + b0)
 333
 334 4:      eor             T2.16b, XL.16b, XH.16b
 335         ext             T1.16b, XL.16b, XH.16b, #8
 336         eor             XM.16b, XM.16b, T2.16b
 337
 338         __pmull_reduce_\pn
 339
 340         eor             T2.16b, T2.16b, XH.16b
 341         eor             XL.16b, XL.16b, T2.16b
 342
 343         cbnz            w0, 0b
 344
 345 5:      st1             {XL.2d}, [x1]
 346         ret
 347         .endm
 348
 349         /*
 350          * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
 351          *                         struct ghash_key const *k, const char *head)
 352          */
 353 SYM_FUNC_START(pmull_ghash_update_p64)
 354         __pmull_ghash   p64
 355 SYM_FUNC_END(pmull_ghash_update_p64)
 356
 357 SYM_FUNC_START(pmull_ghash_update_p8)
 358         __pmull_ghash   p8
 359 SYM_FUNC_END(pmull_ghash_update_p8)
 360
 361         KS0             .req    v8
 362         KS1             .req    v9
 363         KS2             .req    v10
 364         KS3             .req    v11
 365
 366         INP0            .req    v21
 367         INP1            .req    v22
 368         INP2            .req    v23
 369         INP3            .req    v24
 370
 371         K0              .req    v25
 372         K1              .req    v26
 373         K2              .req    v27
 374         K3              .req    v28
 375         K4              .req    v12
 376         K5              .req    v13
 377         K6              .req    v4
 378         K7              .req    v5
 379         K8              .req    v14
 380         K9              .req    v15
 381         KK              .req    v29
 382         KL              .req    v30
 383         KM              .req    v31
 384
 385         .macro          load_round_keys, rounds, rk, tmp
 386         add             \tmp, \rk, #64
 387         ld1             {K0.4s-K3.4s}, [\rk]
 388         ld1             {K4.4s-K5.4s}, [\tmp]
 389         add             \tmp, \rk, \rounds, lsl #4
 390         sub             \tmp, \tmp, #32
 391         ld1             {KK.4s-KM.4s}, [\tmp]
 392         .endm
 393
 394         .macro          enc_round, state, key
 395         aese            \state\().16b, \key\().16b
 396         aesmc           \state\().16b, \state\().16b
 397         .endm
 398
 399         .macro          enc_qround, s0, s1, s2, s3, key
 400         enc_round       \s0, \key
 401         enc_round       \s1, \key
 402         enc_round       \s2, \key
 403         enc_round       \s3, \key
 404         .endm
 405
 406         .macro          enc_block, state, rounds, rk, tmp
 407         add             \tmp, \rk, #96
 408         ld1             {K6.4s-K7.4s}, [\tmp], #32
 409         .irp            key, K0, K1, K2, K3, K4 K5
 410         enc_round       \state, \key
 411         .endr
 412
 413         tbnz            \rounds, #2, .Lnot128_\@
 414 .Lout256_\@:
 415         enc_round       \state, K6
 416         enc_round       \state, K7
 417
 418 .Lout192_\@:
 419         enc_round       \state, KK
 420         aese            \state\().16b, KL.16b
 421         eor             \state\().16b, \state\().16b, KM.16b
 422
 423         .subsection     1
 424 .Lnot128_\@:
 425         ld1             {K8.4s-K9.4s}, [\tmp], #32
 426         enc_round       \state, K6
 427         enc_round       \state, K7
 428         ld1             {K6.4s-K7.4s}, [\tmp]
 429         enc_round       \state, K8
 430         enc_round       \state, K9
 431         tbz             \rounds, #1, .Lout192_\@
 432         b               .Lout256_\@
 433         .previous
 434         .endm
 435
 436         .align          6
 437         .macro          pmull_gcm_do_crypt, enc
 438         stp             x29, x30, [sp, #-32]!
 439         mov             x29, sp
 440         str             x19, [sp, #24]
 441
 442         load_round_keys x7, x6, x8
 443
 444         ld1             {SHASH.2d}, [x3], #16
 445         ld1             {HH.2d-HH4.2d}, [x3]
 446
 447         trn1            SHASH2.2d, SHASH.2d, HH.2d
 448         trn2            T1.2d, SHASH.2d, HH.2d
 449         eor             SHASH2.16b, SHASH2.16b, T1.16b
 450
 451         trn1            HH34.2d, HH3.2d, HH4.2d
 452         trn2            T1.2d, HH3.2d, HH4.2d
 453         eor             HH34.16b, HH34.16b, T1.16b
 454
 455         ld1             {XL.2d}, [x4]
 456
 457         cbz             x0, 3f                          // tag only?
 458
 459         ldr             w8, [x5, #12]                   // load lower counter
 460 CPU_LE( rev             w8, w8          )
 461
 462 0:      mov             w9, #4                          // max blocks per round
 463         add             x10, x0, #0xf
 464         lsr             x10, x10, #4                    // remaining blocks
 465
 466         subs            x0, x0, #64
 467         csel            w9, w10, w9, mi
 468         add             w8, w8, w9
 469
 470         bmi             1f
 471         ld1             {INP0.16b-INP3.16b}, [x2], #64
 472         .subsection     1
 473         /*
 474          * Populate the four input registers right to left with up to 63 bytes
 475          * of data, using overlapping loads to avoid branches.
 476          *
 477          *                INP0     INP1     INP2     INP3
 478          *  1 byte     |        |        |        |x       |
 479          * 16 bytes    |        |        |        |xxxxxxxx|
 480          * 17 bytes    |        |        |xxxxxxxx|x       |
 481          * 47 bytes    |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
 482          * etc etc
 483          *
 484          * Note that this code may read up to 15 bytes before the start of
 485          * the input. It is up to the calling code to ensure this is safe if
 486          * this happens in the first iteration of the loop (i.e., when the
 487          * input size is < 16 bytes)
 488          */
 489 1:      mov             x15, #16
 490         ands            x19, x0, #0xf
 491         csel            x19, x19, x15, ne
 492         adr_l           x17, .Lpermute_table + 16
 493
 494         sub             x11, x15, x19
 495         add             x12, x17, x11
 496         sub             x17, x17, x11
 497         ld1             {T1.16b}, [x12]
 498         sub             x10, x1, x11
 499         sub             x11, x2, x11
 500
 501         cmp             x0, #-16
 502         csel            x14, x15, xzr, gt
 503         cmp             x0, #-32
 504         csel            x15, x15, xzr, gt
 505         cmp             x0, #-48
 506         csel            x16, x19, xzr, gt
 507         csel            x1, x1, x10, gt
 508         csel            x2, x2, x11, gt
 509
 510         ld1             {INP0.16b}, [x2], x14
 511         ld1             {INP1.16b}, [x2], x15
 512         ld1             {INP2.16b}, [x2], x16
 513         ld1             {INP3.16b}, [x2]
 514         tbl             INP3.16b, {INP3.16b}, T1.16b
 515         b               2f
 516         .previous
 517
 518 2:      .if             \enc == 0
 519         bl              pmull_gcm_ghash_4x
 520         .endif
 521
 522         bl              pmull_gcm_enc_4x
 523
 524         tbnz            x0, #63, 6f
 525         st1             {INP0.16b-INP3.16b}, [x1], #64
 526         .if             \enc == 1
 527         bl              pmull_gcm_ghash_4x
 528         .endif
 529         bne             0b
 530
 531 3:      ldp             x19, x10, [sp, #24]
 532         cbz             x10, 5f                         // output tag?
 533
 534         ld1             {INP3.16b}, [x10]               // load lengths[]
 535         mov             w9, #1
 536         bl              pmull_gcm_ghash_4x
 537
 538         mov             w11, #(0x1 << 24)               // BE '1U'
 539         ld1             {KS0.16b}, [x5]
 540         mov             KS0.s[3], w11
 541
 542         enc_block       KS0, x7, x6, x12
 543
 544         ext             XL.16b, XL.16b, XL.16b, #8
 545         rev64           XL.16b, XL.16b
 546         eor             XL.16b, XL.16b, KS0.16b
 547
 548         .if             \enc == 1
 549         st1             {XL.16b}, [x10]                 // store tag
 550         .else
 551         ldp             x11, x12, [sp, #40]             // load tag pointer and authsize
 552         adr_l           x17, .Lpermute_table
 553         ld1             {KS0.16b}, [x11]                // load supplied tag
 554         add             x17, x17, x12
 555         ld1             {KS1.16b}, [x17]                // load permute vector
 556
 557         cmeq            XL.16b, XL.16b, KS0.16b         // compare tags
 558         mvn             XL.16b, XL.16b                  // -1 for fail, 0 for pass
 559         tbl             XL.16b, {XL.16b}, KS1.16b       // keep authsize bytes only
 560         sminv           b0, XL.16b                      // signed minimum across XL
 561         smov            w0, v0.b[0]                     // return b0
 562         .endif
 563
 564 4:      ldp             x29, x30, [sp], #32
 565         ret
 566
 567 5:
 568 CPU_LE( rev             w8, w8          )
 569         str             w8, [x5, #12]                   // store lower counter
 570         st1             {XL.2d}, [x4]
 571         b               4b
 572
 573 6:      ld1             {T1.16b-T2.16b}, [x17], #32     // permute vectors
 574         sub             x17, x17, x19, lsl #1
 575
 576         cmp             w9, #1
 577         beq             7f
 578         .subsection     1
 579 7:      ld1             {INP2.16b}, [x1]
 580         tbx             INP2.16b, {INP3.16b}, T1.16b
 581         mov             INP3.16b, INP2.16b
 582         b               8f
 583         .previous
 584
 585         st1             {INP0.16b}, [x1], x14
 586         st1             {INP1.16b}, [x1], x15
 587         st1             {INP2.16b}, [x1], x16
 588         tbl             INP3.16b, {INP3.16b}, T1.16b
 589         tbx             INP3.16b, {INP2.16b}, T2.16b
 590 8:      st1             {INP3.16b}, [x1]
 591
 592         .if             \enc == 1
 593         ld1             {T1.16b}, [x17]
 594         tbl             INP3.16b, {INP3.16b}, T1.16b    // clear non-data bits
 595         bl              pmull_gcm_ghash_4x
 596         .endif
 597         b               3b
 598         .endm
 599
 600         /*
 601          * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
 602          *                        struct ghash_key const *k, u64 dg[], u8 ctr[],
 603          *                        int rounds, u8 tag)
 604          */
 605 SYM_FUNC_START(pmull_gcm_encrypt)
 606         pmull_gcm_do_crypt      1
 607 SYM_FUNC_END(pmull_gcm_encrypt)
 608
 609         /*
 610          * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
 611          *                        struct ghash_key const *k, u64 dg[], u8 ctr[],
 612          *                        int rounds, u8 tag)
 613          */
 614 SYM_FUNC_START(pmull_gcm_decrypt)
 615         pmull_gcm_do_crypt      0
 616 SYM_FUNC_END(pmull_gcm_decrypt)
 617
 618 SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
 619         movi            MASK.16b, #0xe1
 620         shl             MASK.2d, MASK.2d, #57
 621
 622         rev64           T1.16b, INP0.16b
 623         rev64           T2.16b, INP1.16b
 624         rev64           TT3.16b, INP2.16b
 625         rev64           TT4.16b, INP3.16b
 626
 627         ext             XL.16b, XL.16b, XL.16b, #8
 628
 629         tbz             w9, #2, 0f                      // <4 blocks?
 630         .subsection     1
 631 0:      movi            XH2.16b, #0
 632         movi            XM2.16b, #0
 633         movi            XL2.16b, #0
 634
 635         tbz             w9, #0, 1f                      // 2 blocks?
 636         tbz             w9, #1, 2f                      // 1 block?
 637
 638         eor             T2.16b, T2.16b, XL.16b
 639         ext             T1.16b, T2.16b, T2.16b, #8
 640         b               .Lgh3
 641
 642 1:      eor             TT3.16b, TT3.16b, XL.16b
 643         ext             T2.16b, TT3.16b, TT3.16b, #8
 644         b               .Lgh2
 645
 646 2:      eor             TT4.16b, TT4.16b, XL.16b
 647         ext             IN1.16b, TT4.16b, TT4.16b, #8
 648         b               .Lgh1
 649         .previous
 650
 651         eor             T1.16b, T1.16b, XL.16b
 652         ext             IN1.16b, T1.16b, T1.16b, #8
 653
 654         pmull2          XH2.1q, HH4.2d, IN1.2d          // a1 * b1
 655         eor             T1.16b, T1.16b, IN1.16b
 656         pmull           XL2.1q, HH4.1d, IN1.1d          // a0 * b0
 657         pmull2          XM2.1q, HH34.2d, T1.2d          // (a1 + a0)(b1 + b0)
 658
 659         ext             T1.16b, T2.16b, T2.16b, #8
 660 .Lgh3:  eor             T2.16b, T2.16b, T1.16b
 661         pmull2          XH.1q, HH3.2d, T1.2d            // a1 * b1
 662         pmull           XL.1q, HH3.1d, T1.1d            // a0 * b0
 663         pmull           XM.1q, HH34.1d, T2.1d           // (a1 + a0)(b1 + b0)
 664
 665         eor             XH2.16b, XH2.16b, XH.16b
 666         eor             XL2.16b, XL2.16b, XL.16b
 667         eor             XM2.16b, XM2.16b, XM.16b
 668
 669         ext             T2.16b, TT3.16b, TT3.16b, #8
 670 .Lgh2:  eor             TT3.16b, TT3.16b, T2.16b
 671         pmull2          XH.1q, HH.2d, T2.2d             // a1 * b1
 672         pmull           XL.1q, HH.1d, T2.1d             // a0 * b0
 673         pmull2          XM.1q, SHASH2.2d, TT3.2d        // (a1 + a0)(b1 + b0)
 674
 675         eor             XH2.16b, XH2.16b, XH.16b
 676         eor             XL2.16b, XL2.16b, XL.16b
 677         eor             XM2.16b, XM2.16b, XM.16b
 678
 679         ext             IN1.16b, TT4.16b, TT4.16b, #8
 680 .Lgh1:  eor             TT4.16b, TT4.16b, IN1.16b
 681         pmull           XL.1q, SHASH.1d, IN1.1d         // a0 * b0
 682         pmull2          XH.1q, SHASH.2d, IN1.2d         // a1 * b1
 683         pmull           XM.1q, SHASH2.1d, TT4.1d        // (a1 + a0)(b1 + b0)
 684
 685         eor             XH.16b, XH.16b, XH2.16b
 686         eor             XL.16b, XL.16b, XL2.16b
 687         eor             XM.16b, XM.16b, XM2.16b
 688
 689         eor             T2.16b, XL.16b, XH.16b
 690         ext             T1.16b, XL.16b, XH.16b, #8
 691         eor             XM.16b, XM.16b, T2.16b
 692
 693         __pmull_reduce_p64
 694
 695         eor             T2.16b, T2.16b, XH.16b
 696         eor             XL.16b, XL.16b, T2.16b
 697
 698         ret
 699 SYM_FUNC_END(pmull_gcm_ghash_4x)
 700
 701 SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
 702         ld1             {KS0.16b}, [x5]                 // load upper counter
 703         sub             w10, w8, #4
 704         sub             w11, w8, #3
 705         sub             w12, w8, #2
 706         sub             w13, w8, #1
 707         rev             w10, w10
 708         rev             w11, w11
 709         rev             w12, w12
 710         rev             w13, w13
 711         mov             KS1.16b, KS0.16b
 712         mov             KS2.16b, KS0.16b
 713         mov             KS3.16b, KS0.16b
 714         ins             KS0.s[3], w10                   // set lower counter
 715         ins             KS1.s[3], w11
 716         ins             KS2.s[3], w12
 717         ins             KS3.s[3], w13
 718
 719         add             x10, x6, #96                    // round key pointer
 720         ld1             {K6.4s-K7.4s}, [x10], #32
 721         .irp            key, K0, K1, K2, K3, K4, K5
 722         enc_qround      KS0, KS1, KS2, KS3, \key
 723         .endr
 724
 725         tbnz            x7, #2, .Lnot128
 726         .subsection     1
 727 .Lnot128:
 728         ld1             {K8.4s-K9.4s}, [x10], #32
 729         .irp            key, K6, K7
 730         enc_qround      KS0, KS1, KS2, KS3, \key
 731         .endr
 732         ld1             {K6.4s-K7.4s}, [x10]
 733         .irp            key, K8, K9
 734         enc_qround      KS0, KS1, KS2, KS3, \key
 735         .endr
 736         tbz             x7, #1, .Lout192
 737         b               .Lout256
 738         .previous
 739
 740 .Lout256:
 741         .irp            key, K6, K7
 742         enc_qround      KS0, KS1, KS2, KS3, \key
 743         .endr
 744
 745 .Lout192:
 746         enc_qround      KS0, KS1, KS2, KS3, KK
 747
 748         aese            KS0.16b, KL.16b
 749         aese            KS1.16b, KL.16b
 750         aese            KS2.16b, KL.16b
 751         aese            KS3.16b, KL.16b
 752
 753         eor             KS0.16b, KS0.16b, KM.16b
 754         eor             KS1.16b, KS1.16b, KM.16b
 755         eor             KS2.16b, KS2.16b, KM.16b
 756         eor             KS3.16b, KS3.16b, KM.16b
 757
 758         eor             INP0.16b, INP0.16b, KS0.16b
 759         eor             INP1.16b, INP1.16b, KS1.16b
 760         eor             INP2.16b, INP2.16b, KS2.16b
 761         eor             INP3.16b, INP3.16b, KS3.16b
 762
 763         ret
 764 SYM_FUNC_END(pmull_gcm_enc_4x)
 765
 766         .section        ".rodata", "a"
 767         .align          6
 768 .Lpermute_table:
 769         .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 770         .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 771         .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
 772         .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
 773         .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 774         .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 775         .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
 776         .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
 777         .previous