arch/riscv/lib/memmove.S

   1 /* SPDX-License-Identifier: GPL-2.0-only */
   2 /*
   3  * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com>
   4  */
   5
   6 #include <linux/linkage.h>
   7 #include <asm/asm.h>
   8
   9 SYM_FUNC_START(__memmove)
  10         /*
  11          * Copy a2 bytes from src (a1) to dest (a0).  The copy direction
  12          * is chosen so overlapping regions work.  Returns dest in a0.
  13          *
  14          * Parameters
  15          *   a0 - Inclusive first byte of dest
  16          *   a1 - Inclusive first byte of src
  17          *   a2 - Length of copy n, in bytes
  18          *
  19          * Because the return matches the parameter register a0,
  20          * we will not clobber or modify that register.
  21          *
  22          * Note: This currently only works on little-endian.
  23          * To port to big-endian, reverse the direction of shifts
  24          * in the 2 misaligned fixup copy loops.
  25          */
  26
  27         /* Return if nothing to do: dest == src, or n == 0 */
  28         beq a0, a1, .Lreturn_from_memmove
  29         beqz a2, .Lreturn_from_memmove
  30
  31         /*
  32          * Register Uses
  33          *      Forward Copy: a1 - Index counter of src
  34          *      Reverse Copy: a4 - Index counter of src
  35          *      Forward Copy: t3 - Index counter of dest
  36          *      Reverse Copy: t4 - Index counter of dest
  37          *   Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest
  38          *   Both Copy Modes: t6 - Non-Inclusive last multibyte/aligned of dest
  39          *   Both Copy Modes: t0 - Link / Temporary for load-store
  40          *   Both Copy Modes: t1 - Temporary for load-store
  41          *   Both Copy Modes: t2 - Temporary for load-store
  42          *   Both Copy Modes: a5 - Offset from dest to src (src - dest)
  43          *   Both Copy Modes: a6 - Shift amount
  44          *   Both Copy Modes: a7 - Inverse Shift amount
  45          *   Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
  46          */
  47
  48         /*
  49          * Solve for some register values now.
  50          * Byte copy does not need t5 or t6.
  51          */
  52         mv   t3, a0 /* t3 = first byte of dest */
  53         add  t4, a0, a2 /* t4 = one past the last byte of dest */
  54         add  a4, a1, a2 /* a4 = one past the last byte of src */
  55
  56         /*
  57          * Byte copy if copying less than (2 * SZREG) bytes. This can
  58          * cause problems with the bulk copy implementation and is
  59          * small enough not to bother.
  60          */
  61         andi t0, a2, -(2 * SZREG) /* Clear the low bits of n; 0 means n is small */
  62         beqz t0, .Lbyte_copy
  63
  64         /*
  65          * Now solve for t5 and t6.
  66          */
  67         andi t5, t3, -SZREG
  68         andi t6, t4, -SZREG
  69         /*
  70          * If dest(Register t3) rounded down to the nearest naturally
  71          * aligned SZREG address, does not equal dest, then add SZREG
  72          * to find the low-bound of SZREG alignment in the dest memory
  73          * region.  Note that this could overshoot the dest memory
  74          * region if n is less than SZREG.  This is one reason why
  75          * small copies (n < 2 * SZREG, see above) are byte copied.
  76          * Otherwise, dest is already naturally aligned to SZREG.
  77          */
  78         beq  t5, t3, 1f
  79                 addi t5, t5, SZREG
  80         1:
  81
  82         /*
  83          * If the dest and src are co-aligned to SZREG, then there is
  84          * no need for the full rigmarole of a full misaligned fixup copy.
  85          * Instead, do a simpler co-aligned copy.
  86          */
  87         xor  t0, a0, a1
  88         andi t1, t0, (SZREG - 1) /* Zero iff dest and src share the same low bits */
  89         beqz t1, .Lcoaligned_copy
  90         /* Fall through to misaligned fixup copy */
  91
  92 .Lmisaligned_fixup_copy:
  93         bltu a1, a0, .Lmisaligned_fixup_copy_reverse /* src below dest: copy from the end */
  94
  95 .Lmisaligned_fixup_copy_forward:
  96         jal  t0, .Lbyte_copy_until_aligned_forward /* t0 = return address; t3 == t5 on return */
  97
  98         andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1); nonzero on this path */
  99         slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
 100         sub  a5, a1, t3 /* Find the difference between src and dest */
 101         andi a1, a1, -SZREG /* Align the src pointer */
 102         addi a2, t6, SZREG /* The other breakpoint for the unrolled loop*/
 103
 104         /*
 105          * Compute The Inverse Shift
 106          * a7 = XLEN - a6 = XLEN + -a6
 107          * 2s complement negation to find the negative: -a6 = ~a6 + 1
 108          * Add that to XLEN.  XLEN = SZREG * 8.
 109          */
 110         not  a7, a6
 111         addi a7, a7, (SZREG * 8 + 1)
 112
 113         /*
 114          * Fix Misalignment Copy Loop - Forward
 115          * load_val0 = load_ptr[0];
 116          * do {
 117          *      load_val1 = load_ptr[1];
 118          *      store_ptr += 2;
 119          *      store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
 120          *
 121          *      if (store_ptr == {a2})
 122          *              break;
 123          *
 124          *      load_val0 = load_ptr[2];
 125          *      load_ptr += 2;
 126          *      store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
 127          *
 128          * } while (store_ptr != store_ptr_end);
 129          * store_ptr = store_ptr_end;
 130          */
 131
 132         REG_L t0, (0 * SZREG)(a1) /* load_val0 = load_ptr[0] */
 133         1:
 134         REG_L t1, (1 * SZREG)(a1) /* load_val1 = load_ptr[1] */
 135         addi  t3, t3, (2 * SZREG)
 136         srl   t0, t0, a6
 137         sll   t2, t1, a7
 138         or    t2, t0, t2
 139         REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)
 140
 141         beq   t3, a2, 2f /* t3 == t6 + SZREG: the word just stored was the last one */
 142
 143         REG_L t0, (2 * SZREG)(a1) /* load_val0 = load_ptr[2] */
 144         addi  a1, a1, (2 * SZREG)
 145         srl   t1, t1, a6
 146         sll   t2, t0, a7
 147         or    t2, t1, t2
 148         REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)
 149
 150         bne   t3, t6, 1b
 151         2:
 152         mv    t3, t6 /* Fix the dest pointer in case the loop was broken */
 153
 154         add  a1, t3, a5 /* Restore the src pointer */
 155         j .Lbyte_copy_forward /* Copy any remaining bytes */
 156
 157 .Lmisaligned_fixup_copy_reverse:
 158         jal  t0, .Lbyte_copy_until_aligned_reverse /* t0 = return address; t4 == t6 on return */
 159
 160         andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4); nonzero on this path */
 161         slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
 162         sub  a5, a4, t4 /* Find the difference between src and dest */
 163         andi a4, a4, -SZREG /* Align the src pointer */
 164         addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop*/
 165
 166         /*
 167          * Compute The Inverse Shift
 168          * a7 = XLEN - a6 = XLEN + -a6
 169          * 2s complement negation to find the negative: -a6 = ~a6 + 1
 170          * Add that to XLEN.  XLEN = SZREG * 8.
 171          */
 172         not  a7, a6
 173         addi a7, a7, (SZREG * 8 + 1)
 174
 175         /*
 176          * Fix Misalignment Copy Loop - Reverse
 177          * load_val1 = load_ptr[0];
 178          * do {
 179          *      load_val0 = load_ptr[-1];
 180          *      store_ptr -= 2;
 181          *      store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
 182          *
 183          *      if (store_ptr == {a2})
 184          *              break;
 185          *
 186          *      load_val1 = load_ptr[-2];
 187          *      load_ptr -= 2;
 188          *      store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
 189          *
 190          * } while (store_ptr != store_ptr_end);
 191          * store_ptr = store_ptr_end;
 192          */
 193
 194         REG_L t1, ( 0 * SZREG)(a4) /* load_val1 = load_ptr[0] */
 195         1:
 196         REG_L t0, (-1 * SZREG)(a4) /* load_val0 = load_ptr[-1] */
 197         addi  t4, t4, (-2 * SZREG)
 198         sll   t1, t1, a7
 199         srl   t2, t0, a6
 200         or    t2, t1, t2
 201         REG_S t2, ( 1 * SZREG)(t4)
 202
 203         beq   t4, a2, 2f /* t4 == t5 - SZREG: the word just stored was the last one */
 204
 205         REG_L t1, (-2 * SZREG)(a4) /* load_val1 = load_ptr[-2] */
 206         addi  a4, a4, (-2 * SZREG)
 207         sll   t0, t0, a7
 208         srl   t2, t1, a6
 209         or    t2, t0, t2
 210         REG_S t2, ( 0 * SZREG)(t4)
 211
 212         bne   t4, t5, 1b
 213         2:
 214         mv    t4, t5 /* Fix the dest pointer in case the loop was broken */
 215
 216         add  a4, t4, a5 /* Restore the src pointer */
 217         j .Lbyte_copy_reverse /* Copy any remaining bytes */
 218
 219 /*
 220  * Simple copy loops for SZREG co-aligned memory locations.
 221  * These also make calls to do byte copies for any unaligned
 222  * data at their terminations.
 223  */
 224 .Lcoaligned_copy:
 225         bltu a1, a0, .Lcoaligned_copy_reverse /* src below dest: copy from the end */
 226
 227 .Lcoaligned_copy_forward:
 228         jal t0, .Lbyte_copy_until_aligned_forward /* t0 = return address; t3 == t5 on return */
 229
 230         1:
 231         REG_L t1, ( 0 * SZREG)(a1)
 232         addi  a1, a1, SZREG
 233         addi  t3, t3, SZREG
 234         REG_S t1, (-1 * SZREG)(t3) /* Store to the word t3 pointed at before the increment */
 235         bne   t3, t6, 1b /* Loop until dest reaches the aligned upper bound */
 236
 237         j .Lbyte_copy_forward /* Copy any remaining bytes */
 238
 239 .Lcoaligned_copy_reverse:
 240         jal t0, .Lbyte_copy_until_aligned_reverse /* t0 = return address; t4 == t6 on return */
 241
 242         1:
 243         REG_L t1, (-1 * SZREG)(a4)
 244         addi  a4, a4, -SZREG
 245         addi  t4, t4, -SZREG
 246         REG_S t1, ( 0 * SZREG)(t4) /* Store to the word t4 points at after the decrement */
 247         bne   t4, t5, 1b /* Loop until dest reaches the aligned lower bound */
 248
 249         j .Lbyte_copy_reverse /* Copy any remaining bytes */
 250
 251 /*
 252  * These are basically sub-functions within the function.  They
 253  * are used to byte copy until the dest pointer is in alignment.
 254  * At which point, a bulk copy method can be used by the
 255  * calling code.  These work on the same registers as the bulk
 256  * copy loops.  Therefore, the register values can be picked
 257  * up from where they were left and we avoid code duplication
 258  * without any overhead except the call in and return jumps.
 259  */
 260 .Lbyte_copy_until_aligned_forward:
 261         beq  t3, t5, 2f /* dest already aligned: nothing to copy */
 262         1:
 263         lb   t1,  0(a1)
 264         addi a1, a1, 1
 265         addi t3, t3, 1
 266         sb   t1, -1(t3)
 267         bne  t3, t5, 1b
 268         2:
 269         jalr zero, 0x0(t0) /* Return to multibyte copy loop */
 270
 271 .Lbyte_copy_until_aligned_reverse:
 272         beq  t4, t6, 2f /* dest end already aligned: nothing to copy */
 273         1:
 274         lb   t1, -1(a4)
 275         addi a4, a4, -1
 276         addi t4, t4, -1
 277         sb   t1,  0(t4)
 278         bne  t4, t6, 1b
 279         2:
 280         jalr zero, 0x0(t0) /* Return to multibyte copy loop */
 281
 282 /*
 283  * Simple byte copy loops.
 284  * These will byte copy until they reach the end of data to copy.
 285  * At that point, they will call to return from memmove.
 286  */
 287 .Lbyte_copy:
 288         bltu a1, a0, .Lbyte_copy_reverse /* src below dest: copy from the end */
 289
 290 .Lbyte_copy_forward:
 291         beq  t3, t4, 2f /* No bytes left between t3 and the end of dest */
 292         1:
 293         lb   t1,  0(a1)
 294         addi a1, a1, 1
 295         addi t3, t3, 1
 296         sb   t1, -1(t3)
 297         bne  t3, t4, 1b
 298         2:
 299         ret
 300
 301 .Lbyte_copy_reverse:
 302         beq  t4, t3, 2f /* No bytes left between the start of dest and t4 */
 303         1:
 304         lb   t1, -1(a4)
 305         addi a4, a4, -1
 306         addi t4, t4, -1
 307         sb   t1,  0(t4)
 308         bne  t4, t3, 1b
 309         2:
 310         /* Fall through to the shared return below */
 311 .Lreturn_from_memmove:
 312         ret
 313
 314 SYM_FUNC_END(__memmove)
 315 SYM_FUNC_ALIAS_WEAK(memmove, __memmove) /* NOTE(review): presumably exports memmove as a weak alias of __memmove so another definition can override it - confirm in linux/linkage.h */
 316 SYM_FUNC_ALIAS(__pi_memmove, __memmove) /* NOTE(review): purpose of the __pi_ prefixed aliases is not visible in this file - confirm with their users */
 317 SYM_FUNC_ALIAS(__pi___memmove, __memmove)