arch/arm/crypto/blake2s-core.S

   1 /* SPDX-License-Identifier: GPL-2.0-or-later */
   2 /*
   3  * BLAKE2s digest algorithm, ARM scalar implementation
   4  *
   5  * Copyright 2020 Google LLC
   6  *
   7  * Author: Eric Biggers <ebiggers@google.com>
   8  */
   9
  10 #include <linux/linkage.h>
  11 #include <asm/assembler.h>
  12
  13         // Registers used to hold message words temporarily.  There aren't
  14         // enough ARM registers to hold the whole message block, so we have to
  15         // load the words on-demand.
  16         M_0             .req    r12
  17         M_1             .req    r14
  18
  19 // The BLAKE2s initialization vector
  20 .Lblake2s_IV:
  21         .word   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
  22         .word   0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
  23
  24 .macro __ldrd           a, b, src, offset
  25 #if __LINUX_ARM_ARCH__ >= 6
  26         ldrd            \a, \b, [\src, #\offset]
  27 #else
  28         ldr             \a, [\src, #\offset]
  29         ldr             \b, [\src, #\offset + 4]
  30 #endif
  31 .endm
  32
  33 .macro __strd           a, b, dst, offset
  34 #if __LINUX_ARM_ARCH__ >= 6
  35         strd            \a, \b, [\dst, #\offset]
  36 #else
  37         str             \a, [\dst, #\offset]
  38         str             \b, [\dst, #\offset + 4]
  39 #endif
  40 .endm
  41
  42 .macro _le32_bswap      a, tmp
  43 #ifdef __ARMEB__
  44         rev_l           \a, \tmp
  45 #endif
  46 .endm
  47
  48 .macro _le32_bswap_8x   a, b, c, d, e, f, g, h,  tmp
  49         _le32_bswap     \a, \tmp
  50         _le32_bswap     \b, \tmp
  51         _le32_bswap     \c, \tmp
  52         _le32_bswap     \d, \tmp
  53         _le32_bswap     \e, \tmp
  54         _le32_bswap     \f, \tmp
  55         _le32_bswap     \g, \tmp
  56         _le32_bswap     \h, \tmp
  57 .endm
  58
  59 // Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
  60 // (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
  61 // columns/diagonals.  s0-s1 are the word offsets to the message words the first
  62 // column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
  63 // M_0 and M_1 are free to use, and the message block can be found at sp + 32.
  64 //
  65 // Note that to save instructions, the rotations don't happen when the
  66 // pseudocode says they should, but rather they are delayed until the values are
  67 // used.  See the comment above _blake2s_round().
  68 .macro _blake2s_quarterround  a0, b0, c0, d0,  a1, b1, c1, d1,  s0, s1, s2, s3
  69
  70         ldr             M_0, [sp, #32 + 4 * \s0]
  71         ldr             M_1, [sp, #32 + 4 * \s2]
  72
  73         // a += b + m[blake2s_sigma[r][2*i + 0]];
  74         add             \a0, \a0, \b0, ror #brot
  75         add             \a1, \a1, \b1, ror #brot
  76         add             \a0, \a0, M_0
  77         add             \a1, \a1, M_1
  78
  79         // d = ror32(d ^ a, 16);
  80         eor             \d0, \a0, \d0, ror #drot
  81         eor             \d1, \a1, \d1, ror #drot
  82
  83         // c += d;
  84         add             \c0, \c0, \d0, ror #16
  85         add             \c1, \c1, \d1, ror #16
  86
  87         // b = ror32(b ^ c, 12);
  88         eor             \b0, \c0, \b0, ror #brot
  89         eor             \b1, \c1, \b1, ror #brot
  90
  91         ldr             M_0, [sp, #32 + 4 * \s1]
  92         ldr             M_1, [sp, #32 + 4 * \s3]
  93
  94         // a += b + m[blake2s_sigma[r][2*i + 1]];
  95         add             \a0, \a0, \b0, ror #12
  96         add             \a1, \a1, \b1, ror #12
  97         add             \a0, \a0, M_0
  98         add             \a1, \a1, M_1
  99
 100         // d = ror32(d ^ a, 8);
 101         eor             \d0, \a0, \d0, ror#16
 102         eor             \d1, \a1, \d1, ror#16
 103
 104         // c += d;
 105         add             \c0, \c0, \d0, ror#8
 106         add             \c1, \c1, \d1, ror#8
 107
 108         // b = ror32(b ^ c, 7);
 109         eor             \b0, \c0, \b0, ror#12
 110         eor             \b1, \c1, \b1, ror#12
 111 .endm
 112
 113 // Execute one round of BLAKE2s by updating the state matrix v[0..15].  v[0..9]
 114 // are in r0..r9.  The stack pointer points to 8 bytes of scratch space for
 115 // spilling v[8..9], then to v[9..15], then to the message block.  r10-r12 and
 116 // r14 are free to use.  The macro arguments s0-s15 give the order in which the
 117 // message words are used in this round.
 118 //
 119 // All rotates are performed using the implicit rotate operand accepted by the
 120 // 'add' and 'eor' instructions.  This is faster than using explicit rotate
 121 // instructions.  To make this work, we allow the values in the second and last
 122 // rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
 123 // wrong rotation amount.  The rotation amount is then fixed up just in time
 124 // when the values are used.  'brot' is the number of bits the values in row 'b'
 125 // need to be rotated right to arrive at the correct values, and 'drot'
 126 // similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
 127 // that they end up as (7, 8) after every round.
 128 .macro  _blake2s_round  s0, s1, s2, s3, s4, s5, s6, s7, \
 129                         s8, s9, s10, s11, s12, s13, s14, s15
 130
 131         // Mix first two columns:
 132         // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
 133         __ldrd          r10, r11, sp, 16        // load v[12] and v[13]
 134         _blake2s_quarterround   r0, r4, r8, r10,  r1, r5, r9, r11, \
 135                                 \s0, \s1, \s2, \s3
 136         __strd          r8, r9, sp, 0
 137         __strd          r10, r11, sp, 16
 138
 139         // Mix second two columns:
 140         // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
 141         __ldrd          r8, r9, sp, 8           // load v[10] and v[11]
 142         __ldrd          r10, r11, sp, 24        // load v[14] and v[15]
 143         _blake2s_quarterround   r2, r6, r8, r10,  r3, r7, r9, r11, \
 144                                 \s4, \s5, \s6, \s7
 145         str             r10, [sp, #24]          // store v[14]
 146         // v[10], v[11], and v[15] are used below, so no need to store them yet.
 147
 148         .set brot, 7
 149         .set drot, 8
 150
 151         // Mix first two diagonals:
 152         // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
 153         ldr             r10, [sp, #16]          // load v[12]
 154         _blake2s_quarterround   r0, r5, r8, r11,  r1, r6, r9, r10, \
 155                                 \s8, \s9, \s10, \s11
 156         __strd          r8, r9, sp, 8
 157         str             r11, [sp, #28]
 158         str             r10, [sp, #16]
 159
 160         // Mix second two diagonals:
 161         // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
 162         __ldrd          r8, r9, sp, 0           // load v[8] and v[9]
 163         __ldrd          r10, r11, sp, 20        // load v[13] and v[14]
 164         _blake2s_quarterround   r2, r7, r8, r10,  r3, r4, r9, r11, \
 165                                 \s12, \s13, \s14, \s15
 166         __strd          r10, r11, sp, 20
 167 .endm
 168
 169 //
 170 // void blake2s_compress_arch(struct blake2s_state *state,
 171 //                            const u8 *block, size_t nblocks, u32 inc);
 172 //
 173 // Only the first three fields of struct blake2s_state are used:
 174 //      u32 h[8];       (inout)
 175 //      u32 t[2];       (inout)
 176 //      u32 f[2];       (in)
 177 //
 178         .align          5
 179 ENTRY(blake2s_compress_arch)
 180         push            {r0-r2,r4-r11,lr}       // keep this an even number
 181
 182 .Lnext_block:
 183         // r0 is 'state'
 184         // r1 is 'block'
 185         // r3 is 'inc'
 186
 187         // Load and increment the counter t[0..1].
 188         __ldrd          r10, r11, r0, 32
 189         adds            r10, r10, r3
 190         adc             r11, r11, #0
 191         __strd          r10, r11, r0, 32
 192
 193         // _blake2s_round is very short on registers, so copy the message block
 194         // to the stack to save a register during the rounds.  This also has the
 195         // advantage that misalignment only needs to be dealt with in one place.
 196         sub             sp, sp, #64
 197         mov             r12, sp
 198         tst             r1, #3
 199         bne             .Lcopy_block_misaligned
 200         ldmia           r1!, {r2-r9}
 201         _le32_bswap_8x  r2, r3, r4, r5, r6, r7, r8, r9,  r14
 202         stmia           r12!, {r2-r9}
 203         ldmia           r1!, {r2-r9}
 204         _le32_bswap_8x  r2, r3, r4, r5, r6, r7, r8, r9,  r14
 205         stmia           r12, {r2-r9}
 206 .Lcopy_block_done:
 207         str             r1, [sp, #68]           // Update message pointer
 208
 209         // Calculate v[8..15].  Push v[9..15] onto the stack, and leave space
 210         // for spilling v[8..9].  Leave v[8..9] in r8-r9.
 211         mov             r14, r0                 // r14 = state
 212         adr             r12, .Lblake2s_IV
 213         ldmia           r12!, {r8-r9}           // load IV[0..1]
 214         __ldrd          r0, r1, r14, 40         // load f[0..1]
 215         ldm             r12, {r2-r7}            // load IV[3..7]
 216         eor             r4, r4, r10             // v[12] = IV[4] ^ t[0]
 217         eor             r5, r5, r11             // v[13] = IV[5] ^ t[1]
 218         eor             r6, r6, r0              // v[14] = IV[6] ^ f[0]
 219         eor             r7, r7, r1              // v[15] = IV[7] ^ f[1]
 220         push            {r2-r7}                 // push v[9..15]
 221         sub             sp, sp, #8              // leave space for v[8..9]
 222
 223         // Load h[0..7] == v[0..7].
 224         ldm             r14, {r0-r7}
 225
 226         // Execute the rounds.  Each round is provided the order in which it
 227         // needs to use the message words.
 228         .set brot, 0
 229         .set drot, 0
 230         _blake2s_round  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 231         _blake2s_round  14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
 232         _blake2s_round  11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
 233         _blake2s_round  7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
 234         _blake2s_round  9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
 235         _blake2s_round  2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
 236         _blake2s_round  12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
 237         _blake2s_round  13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
 238         _blake2s_round  6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
 239         _blake2s_round  10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
 240
 241         // Fold the final state matrix into the hash chaining value:
 242         //
 243         //      for (i = 0; i < 8; i++)
 244         //              h[i] ^= v[i] ^ v[i + 8];
 245         //
 246         ldr             r14, [sp, #96]          // r14 = &h[0]
 247         add             sp, sp, #8              // v[8..9] are already loaded.
 248         pop             {r10-r11}               // load v[10..11]
 249         eor             r0, r0, r8
 250         eor             r1, r1, r9
 251         eor             r2, r2, r10
 252         eor             r3, r3, r11
 253         ldm             r14, {r8-r11}           // load h[0..3]
 254         eor             r0, r0, r8
 255         eor             r1, r1, r9
 256         eor             r2, r2, r10
 257         eor             r3, r3, r11
 258         stmia           r14!, {r0-r3}           // store new h[0..3]
 259         ldm             r14, {r0-r3}            // load old h[4..7]
 260         pop             {r8-r11}                // load v[12..15]
 261         eor             r0, r0, r4, ror #brot
 262         eor             r1, r1, r5, ror #brot
 263         eor             r2, r2, r6, ror #brot
 264         eor             r3, r3, r7, ror #brot
 265         eor             r0, r0, r8, ror #drot
 266         eor             r1, r1, r9, ror #drot
 267         eor             r2, r2, r10, ror #drot
 268         eor             r3, r3, r11, ror #drot
 269           add           sp, sp, #64             // skip copy of message block
 270         stm             r14, {r0-r3}            // store new h[4..7]
 271
 272         // Advance to the next block, if there is one.  Note that if there are
 273         // multiple blocks, then 'inc' (the counter increment amount) must be
 274         // 64.  So we can simply set it to 64 without re-loading it.
 275         ldm             sp, {r0, r1, r2}        // load (state, block, nblocks)
 276         mov             r3, #64                 // set 'inc'
 277         subs            r2, r2, #1              // nblocks--
 278         str             r2, [sp, #8]
 279         bne             .Lnext_block            // nblocks != 0?
 280
 281         pop             {r0-r2,r4-r11,pc}
 282
 283         // The next message block (pointed to by r1) isn't 4-byte aligned, so it
 284         // can't be loaded using ldmia.  Copy it to the stack buffer (pointed to
 285         // by r12) using an alternative method.  r2-r9 are free to use.
 286 .Lcopy_block_misaligned:
 287         mov             r2, #64
 288 1:
 289 #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
 290         ldr             r3, [r1], #4
 291         _le32_bswap     r3, r4
 292 #else
 293         ldrb            r3, [r1, #0]
 294         ldrb            r4, [r1, #1]
 295         ldrb            r5, [r1, #2]
 296         ldrb            r6, [r1, #3]
 297         add             r1, r1, #4
 298         orr             r3, r3, r4, lsl #8
 299         orr             r3, r3, r5, lsl #16
 300         orr             r3, r3, r6, lsl #24
 301 #endif
 302         subs            r2, r2, #4
 303         str             r3, [r12], #4
 304         bne             1b
 305         b               .Lcopy_block_done
 306 ENDPROC(blake2s_compress_arch)