arch/arm/crypto/chacha-scalar-core.S

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 /*
   3  * Copyright (C) 2018 Google, Inc.
   4  */
   5
   6 #include <linux/linkage.h>
   7 #include <asm/assembler.h>
   8
   9 /*
  10  * Design notes:
  11  *
  12  * 16 registers would be needed to hold the state matrix, but only 14 are
  13  * available because 'sp' and 'pc' cannot be used.  So we spill the elements
  14  * (x8, x9) to the stack and swap them out with (x10, x11).  This adds one
  15  * 'ldrd' and one 'strd' instruction per round.
  16  *
  17  * All rotates are performed using the implicit rotate operand accepted by the
  18  * 'add' and 'eor' instructions.  This is faster than using explicit rotate
  19  * instructions.  To make this work, we allow the values in the second and last
  20  * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
  21  * wrong rotation amount.  The rotation amount is then fixed up just in time
  22  * when the values are used.  'brot' is the number of bits the values in row 'b'
  23  * need to be rotated right to arrive at the correct values, and 'drot'
  24  * similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
  25  * that they end up as (25, 24) after every round.
  26  */
  27
  28         // ChaCha state registers
             // Each alias binds ChaCha state word xN to the ARM register that
             // carries it through the permutation.  Only 14 registers are bound
             // (r13/sp is skipped), so two aliases each serve a pair of words.
  29         X0      .req    r0
  30         X1      .req    r1
  31         X2      .req    r2
  32         X3      .req    r3
  33         X4      .req    r4
  34         X5      .req    r5
  35         X6      .req    r6
  36         X7      .req    r7
  37         X8_X10  .req    r8      // shared by x8 and x10
  38         X9_X11  .req    r9      // shared by x9 and x11
  39         X12     .req    r10
  40         X13     .req    r11
  41         X14     .req    r12
  42         X15     .req    r14     // r14 is lr; both entry points push lr first
  43
  44 .macro __rev            out, in,  t0, t1, t2
     // Byte-reverse the 32-bit value in 'in' and write the result to 'out'.
     // t0-t2 are scratch registers; they are only written by the fallback
     // sequence assembled when __LINUX_ARM_ARCH__ < 6.
  45 .if __LINUX_ARM_ARCH__ >= 6
  46         rev             \out, \in
  47 .else
             // Rebuild the word byte by byte.  t1 and t2 are captured before
             // 'out' is first written, so 'out' may be the same register as 'in'.
  48         lsl             \t0, \in, #24           // t0 = byte 0 moved up to byte 3
  49         and             \t1, \in, #0xff00       // t1 = byte 1, still in place
  50         and             \t2, \in, #0xff0000     // t2 = byte 2, still in place
  51         orr             \out, \t0, \in, lsr #24 // byte 3 moved down to byte 0
  52         orr             \out, \out, \t1, lsl #8 // byte 1 moved up to byte 2
  53         orr             \out, \out, \t2, lsr #8 // byte 2 moved down to byte 1
  54 .endif
  55 .endm
  56
  57 .macro _le32_bswap      x,  t0, t1, t2
     // Byte-swap register 'x' in place when __ARMEB__ is defined (big-endian
     // build); expands to nothing otherwise.  t0-t2 are scratch registers that
     // are handed through to __rev.
  58 #ifdef __ARMEB__
  59         __rev           \x, \x,  \t0, \t1, \t2
  60 #endif
  61 .endm
  62
  63 .macro _le32_bswap_4x   a, b, c, d,  t0, t1, t2
     // Apply _le32_bswap to each of the four registers a, b, c, d in turn,
     // reusing the same three scratch registers t0-t2 for every swap.
  64         _le32_bswap     \a,  \t0, \t1, \t2
  65         _le32_bswap     \b,  \t0, \t1, \t2
  66         _le32_bswap     \c,  \t0, \t1, \t2
  67         _le32_bswap     \d,  \t0, \t1, \t2
  68 .endm
  69
  70 .macro __ldrd           a, b, src, offset
     // Load the word at [src + offset] into 'a' and the word at
     // [src + offset + 4] into 'b'.  A single 'ldrd' is used when
     // __LINUX_ARM_ARCH__ >= 6, two 'ldr's otherwise.
     // 'offset' is a bare number; the macro supplies the '#'.
  71 #if __LINUX_ARM_ARCH__ >= 6
  72         ldrd            \a, \b, [\src, #\offset]
  73 #else
  74         ldr             \a, [\src, #\offset]
  75         ldr             \b, [\src, #\offset + 4]
  76 #endif
  77 .endm
  78
  79 .macro __strd           a, b, dst, offset
     // Store 'a' to [dst + offset] and 'b' to [dst + offset + 4].  A single
     // 'strd' is used when __LINUX_ARM_ARCH__ >= 6, two 'str's otherwise.
     // 'offset' is a bare number; the macro supplies the '#'.
  80 #if __LINUX_ARM_ARCH__ >= 6
  81         strd            \a, \b, [\dst, #\offset]
  82 #else
  83         str             \a, [\dst, #\offset]
  84         str             \b, [\dst, #\offset + 4]
  85 #endif
  86 .endm
  87
  88 .macro _halfround       a1, b1, c1, d1,  a2, b2, c2, d2
     // Perform two ChaCha quarter-rounds, (a1, b1, c1, d1) and (a2, b2, c2, d2),
     // with their instructions interleaved pairwise.
     //
     // 'brot' and 'drot' are assembly-time symbols giving how far the incoming
     // b and d values still have to be rotated right.  No value is rotated when
     // it is produced: each rotate is applied through the 'ror' operand of the
     // next instruction that reads the value.  The b and d outputs are therefore
     // left needing 'ror 25' and 'ror 24' respectively; this macro does not
     // update brot/drot itself.
  89
  90         // a += b; d ^= a; d = rol(d, 16);
  91         add             \a1, \a1, \b1, ror #brot
  92         add             \a2, \a2, \b2, ror #brot
  93         eor             \d1, \a1, \d1, ror #drot
  94         eor             \d2, \a2, \d2, ror #drot
  95         // drot == 32 - 16 == 16
  96
  97         // c += d; b ^= c; b = rol(b, 12);
  98         add             \c1, \c1, \d1, ror #16
  99         add             \c2, \c2, \d2, ror #16
 100         eor             \b1, \c1, \b1, ror #brot
 101         eor             \b2, \c2, \b2, ror #brot
 102         // brot == 32 - 12 == 20
 103
 104         // a += b; d ^= a; d = rol(d, 8);
 105         add             \a1, \a1, \b1, ror #20
 106         add             \a2, \a2, \b2, ror #20
 107         eor             \d1, \a1, \d1, ror #16
 108         eor             \d2, \a2, \d2, ror #16
 109         // drot == 32 - 8 == 24
 110
 111         // c += d; b ^= c; b = rol(b, 7);
 112         add             \c1, \c1, \d1, ror #24
 113         add             \c2, \c2, \d2, ror #24
 114         eor             \b1, \c1, \b1, ror #20
 115         eor             \b2, \c2, \b2, ror #20
 116         // brot == 32 - 7 == 25
 117 .endm
 118
 119 .macro _doubleround
     // One ChaCha double round: a column round followed by a diagonal round,
     // each made of two _halfround expansions (four quarter-rounds).
     //
     // X8_X10/X9_X11 hold (x8, x9) on entry and again on exit.  The pair that is
     // not in registers lives on the stack: (x8, x9) at sp+0 and (x10, x11) at
     // sp+8, so those 16 bytes must be available to this macro.
 120
 121         // column round
 122
 123         // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
 124         _halfround      X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13
 125
 126         // save (x8, x9); restore (x10, x11)
 127         __strd          X8_X10, X9_X11, sp, 0
 128         __ldrd          X8_X10, X9_X11, sp, 8
 129
 130         // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
 131         _halfround      X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15
 132
             // Both column-round halves above were assembled with the brot/drot
             // values in effect on entry.  From here on, rows b and d carry the
             // post-round rotation debt, which is also what the next double
             // round's column round will see.
 133         .set brot, 25
 134         .set drot, 24
 135
 136         // diagonal round
 137
 138         // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
 139         _halfround      X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12
 140
 141         // save (x10, x11); restore (x8, x9)
 142         __strd          X8_X10, X9_X11, sp, 8
 143         __ldrd          X8_X10, X9_X11, sp, 0
 144
 145         // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
 146         _halfround      X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
 147 .endm
 148
 149 .macro _chacha_permute  nrounds
     // Expand the full ChaCha permutation as straight-line code: nrounds / 2
     // copies of _doubleround.
     //
     // brot/drot are reset to 0 first because the state words arrive
     // unrotated.  After the expansion they are left at (25, 24), so code
     // assembled after this macro must still apply 'ror #brot' to x4-x7 and
     // 'ror #drot' to x12-x15 before using them.
 150         .set brot, 0
 151         .set drot, 0
 152         .rept \nrounds / 2
 153          _doubleround
 154         .endr
 155 .endm
 156
 157 .macro _chacha          nrounds
     // Generate ChaCha keystream and XOR it with IN into OUT, 64 bytes per
     // iteration, until LEN bytes have been processed.  Per block: run the
     // nrounds-round permutation, add the original state words kept on the
     // stack, byte-swap on big-endian builds, and XOR with the input.  The
     // block counter (x12) in the on-stack state is then incremented before
     // the next block.
     //
     // On entry the stack and registers must be laid out as described at
     // .Lnext_block below.  On exit (.Ldone) the stack holds:
     // x0-x15 OUT IN LEN.
     //
     // LEN is an unsigned byte count, so every comparison against it uses the
     // unsigned condition codes (lo/ls/hi) rather than the signed ones; a
     // length of 2^31 or more must not be mistaken for a negative number.
 158
 159 .Lnext_block\@:
 160         // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
 161         // Registers contain x0-x9,x12-x15.
 162
 163         // Do the core ChaCha permutation to update x0-x15.
 164         _chacha_permute \nrounds
 165
 166         add             sp, #8
 167         // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
 168         // Registers contain x0-x9,x12-x15.
 169         // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
 170
 171         // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
 172         push            {X8_X10, X9_X11, X12, X13, X14, X15}
 173
 174         // Load (OUT, IN, LEN).
 175         ldr             r14, [sp, #96]
 176         ldr             r12, [sp, #100]
 177         ldr             r11, [sp, #104]
 178
 179         orr             r10, r14, r12
 180
 181         // Use slow path if fewer than 64 bytes remain.
             // (unsigned compare: LEN is an unsigned byte count)
 182         cmp             r11, #64
 183         blo             .Lxor_slowpath\@
 184
 185         // Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
 186         // ARMv6+, since ldmia and stmia (used below) still require alignment.
 187         tst             r10, #3
 188         bne             .Lxor_slowpath\@
 189
 190         // Fast path: XOR 64 bytes of aligned data.
 191
 192         // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
 193         // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
 194         // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
 195
 196         // x0-x3
 197         __ldrd          r8, r9, sp, 32
 198         __ldrd          r10, r11, sp, 40
 199         add             X0, X0, r8
 200         add             X1, X1, r9
 201         add             X2, X2, r10
 202         add             X3, X3, r11
 203         _le32_bswap_4x  X0, X1, X2, X3,  r8, r9, r10
 204         ldmia           r12!, {r8-r11}
 205         eor             X0, X0, r8
 206         eor             X1, X1, r9
 207         eor             X2, X2, r10
 208         eor             X3, X3, r11
 209         stmia           r14!, {X0-X3}
 210
 211         // x4-x7
 212         __ldrd          r8, r9, sp, 48
 213         __ldrd          r10, r11, sp, 56
 214         add             X4, r8, X4, ror #brot
 215         add             X5, r9, X5, ror #brot
 216         ldmia           r12!, {X0-X3}
 217         add             X6, r10, X6, ror #brot
 218         add             X7, r11, X7, ror #brot
 219         _le32_bswap_4x  X4, X5, X6, X7,  r8, r9, r10
 220         eor             X4, X4, X0
 221         eor             X5, X5, X1
 222         eor             X6, X6, X2
 223         eor             X7, X7, X3
 224         stmia           r14!, {X4-X7}
 225
 226         // x8-x15
 227         pop             {r0-r7}                 // (x8-x9,x12-x15,x10-x11)
 228         __ldrd          r8, r9, sp, 32
 229         __ldrd          r10, r11, sp, 40
 230         add             r0, r0, r8              // x8
 231         add             r1, r1, r9              // x9
 232         add             r6, r6, r10             // x10
 233         add             r7, r7, r11             // x11
 234         _le32_bswap_4x  r0, r1, r6, r7,  r8, r9, r10
 235         ldmia           r12!, {r8-r11}
 236         eor             r0, r0, r8              // x8
 237         eor             r1, r1, r9              // x9
 238         eor             r6, r6, r10             // x10
 239         eor             r7, r7, r11             // x11
 240         stmia           r14!, {r0,r1,r6,r7}
 241         ldmia           r12!, {r0,r1,r6,r7}
             // r8 receives orig_x12 (the block counter) here and is not used as
             // a scratch register below, so it is still valid when
             // .Lprepare_for_next_block increments it.
 242         __ldrd          r8, r9, sp, 48
 243         __ldrd          r10, r11, sp, 56
 244         add             r2, r8, r2, ror #drot   // x12
 245         add             r3, r9, r3, ror #drot   // x13
 246         add             r4, r10, r4, ror #drot  // x14
 247         add             r5, r11, r5, ror #drot  // x15
 248         _le32_bswap_4x  r2, r3, r4, r5,  r9, r10, r11
 249           ldr           r9, [sp, #72]           // load LEN
 250         eor             r2, r2, r0              // x12
 251         eor             r3, r3, r1              // x13
 252         eor             r4, r4, r6              // x14
 253         eor             r5, r5, r7              // x15
 254           subs          r9, #64                 // decrement and check LEN
 255         stmia           r14!, {r2-r5}
 256
 257         beq             .Ldone\@
 258
 259 .Lprepare_for_next_block\@:
 260
 261         // Stack: x0-x15 OUT IN LEN
 262
 263         // Increment block counter (x12)
 264         add             r8, #1
 265
 266         // Store updated (OUT, IN, LEN)
 267         str             r14, [sp, #64]
 268         str             r12, [sp, #68]
 269         str             r9, [sp, #72]
 270
 271           mov           r14, sp
 272
 273         // Store updated block counter (x12)
 274         str             r8, [sp, #48]
 275
 276           sub           sp, #16
 277
 278         // Reload state and do next block
 279         ldmia           r14!, {r0-r11}          // load x0-x11
 280         __strd          r10, r11, sp, 8         // store x10-x11 before state
 281         ldmia           r14, {r10-r12,r14}      // load x12-x15
 282         b               .Lnext_block\@
 283
 284 .Lxor_slowpath\@:
 285         // Slow path: < 64 bytes remaining, or unaligned input or output buffer.
 286         // We handle it by storing the 64 bytes of keystream to the stack, then
 287         // XOR-ing the needed portion with the data.
 288
 289         // Allocate keystream buffer
 290         sub             sp, #64
 291         mov             r14, sp
 292
 293         // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
 294         // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
 295         // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
 296
 297         // Save keystream for x0-x3
 298         __ldrd          r8, r9, sp, 96
 299         __ldrd          r10, r11, sp, 104
 300         add             X0, X0, r8
 301         add             X1, X1, r9
 302         add             X2, X2, r10
 303         add             X3, X3, r11
 304         _le32_bswap_4x  X0, X1, X2, X3,  r8, r9, r10
 305         stmia           r14!, {X0-X3}
 306
 307         // Save keystream for x4-x7
 308         __ldrd          r8, r9, sp, 112
 309         __ldrd          r10, r11, sp, 120
 310         add             X4, r8, X4, ror #brot
 311         add             X5, r9, X5, ror #brot
 312         add             X6, r10, X6, ror #brot
 313         add             X7, r11, X7, ror #brot
 314         _le32_bswap_4x  X4, X5, X6, X7,  r8, r9, r10
 315           add           r8, sp, #64
 316         stmia           r14!, {X4-X7}
 317
 318         // Save keystream for x8-x15
 319         ldm             r8, {r0-r7}             // (x8-x9,x12-x15,x10-x11)
 320         __ldrd          r8, r9, sp, 128
 321         __ldrd          r10, r11, sp, 136
 322         add             r0, r0, r8              // x8
 323         add             r1, r1, r9              // x9
 324         add             r6, r6, r10             // x10
 325         add             r7, r7, r11             // x11
 326         _le32_bswap_4x  r0, r1, r6, r7,  r8, r9, r10
 327         stmia           r14!, {r0,r1,r6,r7}
 328         __ldrd          r8, r9, sp, 144
 329         __ldrd          r10, r11, sp, 152
 330         add             r2, r8, r2, ror #drot   // x12
 331         add             r3, r9, r3, ror #drot   // x13
 332         add             r4, r10, r4, ror #drot  // x14
 333         add             r5, r11, r5, ror #drot  // x15
 334         _le32_bswap_4x  r2, r3, r4, r5,  r9, r10, r11
 335         stmia           r14, {r2-r5}
 336
 337         // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
 338         // Registers: r8 is block counter, r12 is IN.
 339
 340         ldr             r9, [sp, #168]          // LEN
 341         ldr             r14, [sp, #160]         // OUT
 342         cmp             r9, #64
 343           mov           r0, sp
             // r1 = min(LEN, 64), computed with unsigned conditions
 344         movls           r1, r9
 345         movhi           r1, #64
 346         // r1 is number of bytes to XOR, in range [1, 64]
 347
 348 .if __LINUX_ARM_ARCH__ < 6
 349         orr             r2, r12, r14
 350         tst             r2, #3                  // IN or OUT misaligned?
 351         bne             .Lxor_next_byte\@
 352 .endif
 353
 354         // XOR a word at a time
 355 .rept 16
 356         subs            r1, #4
 357         blt             .Lxor_words_done\@
 358         ldr             r2, [r12], #4
 359         ldr             r3, [r0], #4
 360         eor             r2, r2, r3
 361         str             r2, [r14], #4
 362 .endr
 363         b               .Lxor_slowpath_done\@
 364 .Lxor_words_done\@:
             // r1 is (bytes left - 4) here; its low two bits are the tail length.
 365         ands            r1, r1, #3
 366         beq             .Lxor_slowpath_done\@
 367
 368         // XOR a byte at a time
 369 .Lxor_next_byte\@:
 370         ldrb            r2, [r12], #1
 371         ldrb            r3, [r0], #1
 372         eor             r2, r2, r3
 373         strb            r2, [r14], #1
 374         subs            r1, #1
 375         bne             .Lxor_next_byte\@
 376
 377 .Lxor_slowpath_done\@:
             // Continue only if more than 64 bytes were left before this block.
             // 'hi' tests the flags from 'subs'; the 'add' to sp in between does
             // not set flags.
 378         subs            r9, #64
 379         add             sp, #96
 380         bhi             .Lprepare_for_next_block\@
 381
 382 .Ldone\@:
 383 .endm   // _chacha
 384
 385 /*
 386  * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
 387  *                   const u32 *state, int nrounds);
      *
      * XOR 'bytes' bytes of 'src' with ChaCha keystream and write the result to
      * 'dst'.  Returns immediately when bytes == 0.
      *
      * Arguments arrive as r0 = dst, r1 = src, r2 = bytes, r3 = state, with
      * nrounds read from the first stack slot.  nrounds == 12 selects the
      * 12-round code; any other value runs the 20-round code.
      *
      * The 16 state words are copied to the stack and only that copy is
      * updated (the block counter is incremented there), so the caller's
      * 'state' array is read but not written by this function.
 388  */
 389 ENTRY(chacha_doarm)
 390         cmp             r2, #0                  // len == 0?
 391         reteq           lr
 392
             // Fetch nrounds before the push below moves sp.  The flags set by
             // this 'cmp' are consumed by the 'beq 1f' further down; none of
             // the instructions in between set flags.
 393         ldr             ip, [sp]
 394         cmp             ip, #12
 395
             // r0-r2 land in the stack slots the _chacha macro calls
             // OUT, IN and LEN.
 396         push            {r0-r2,r4-r11,lr}
 397
 398         // Push state x0-x15 onto stack.
 399         // Also store an extra copy of x10-x11 just before the state.
 400
 401         add             X12, r3, #48
 402         ldm             X12, {X12,X13,X14,X15}
 403         push            {X12,X13,X14,X15}
             // Room for x0-x11 (48 bytes), the extra x10-x11 copy (8 bytes) and
             // the 8-byte slot at sp+0 that _doubleround uses to spill (x8, x9).
 404         sub             sp, sp, #64
 405
 406         __ldrd          X8_X10, X9_X11, r3, 40
 407         __strd          X8_X10, X9_X11, sp, 8
 408         __strd          X8_X10, X9_X11, sp, 56
 409         ldm             r3, {X0-X9_X11}
 410         __strd          X0, X1, sp, 16
 411         __strd          X2, X3, sp, 24
 412         __strd          X4, X5, sp, 32
 413         __strd          X6, X7, sp, 40
 414         __strd          X8_X10, X9_X11, sp, 48
 415
 416         beq             1f
 417         _chacha         20
 418
             // Drop x0-x15 (64 bytes) and the saved r0-r2 (12 bytes), then
             // restore the callee-saved registers and return.
 419 0:      add             sp, #76
 420         pop             {r4-r11, pc}
 421
 422 1:      _chacha         12
 423         b               0b
 424 ENDPROC(chacha_doarm)
 425
 426 /*
 427  * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
      *
      * Run the ChaCha permutation on the 16 words of 'state' and store words
      * x0-x3 and x12-x15 of the permuted state to 'out'.  The original state is
      * not added back in, and 'state' itself is only read.
      *
      * nrounds == 12 selects the 12-round code; any other value runs the
      * 20-round code.
 428  */
 429 ENTRY(hchacha_block_arm)
             // r1 ('out') is pushed along with the callee-saved registers because
             // every low register is about to be loaded with state words.
 430         push            {r1,r4-r11,lr}
 431
             // Test nrounds now: r2 is overwritten by the state load below, and
             // nothing before the 'beq' changes the flags.
 432         cmp             r2, #12                 // ChaCha12 ?
 433
 434         mov             r14, r0
 435         ldmia           r14!, {r0-r11}          // load x0-x11
 436         push            {r10-r11}               // store x10-x11 to stack
 437         ldm             r14, {r10-r12,r14}      // load x12-x15
             // Reserve the 8-byte slot at sp+0 that _doubleround uses to spill
             // (x8, x9); x10-x11 now sit at sp+8.
 438         sub             sp, #8
 439
 440         beq             1f
 441         _chacha_permute 20
 442
 443         // Skip over (unused0-unused1, x10-x11)
 444 0:      add             sp, #16
 445
 446         // Fix up rotations of x12-x15
             // 'drot' is an assembly-time symbol.  Its value here is the one left
             // by the 20-round expansion above (24), which is also correct for
             // the 12-round path because drot is 24 after every round.
 447         ror             X12, X12, #drot
 448         ror             X13, X13, #drot
 449           pop           {r4}                    // load 'out'
 450         ror             X14, X14, #drot
 451         ror             X15, X15, #drot
 452
 453         // Store (x0-x3,x12-x15) to 'out'
 454         stm             r4, {X0,X1,X2,X3,X12,X13,X14,X15}
 455
 456         pop             {r4-r11,pc}
 457
 458 1:      _chacha_permute 12
 459         b               0b
 460 ENDPROC(hchacha_block_arm)