arch/x86/lib/memmove_64.S

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 /*
   3  * Normally compiler builtins are used, but sometimes the compiler emits a call
   4  * to out-of-line code. Based on asm-i386/string.h.
   5  *
   6  * This assembly file is re-written from memmove_64.c file.
   7  *      - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
   8  */
   9 #include <linux/linkage.h>
  10 #include <asm/cpufeatures.h>
  11 #include <asm/alternative.h>
  12 #include <asm/export.h>
  13
  14 #undef memmove /* drop any macro named memmove; presumably so the alias/export at the end of the file use the plain symbol name - confirm */
  15
  16 /*
  17  * Implement memmove(). This can handle overlap between src and dst.
  18  * Copies forward unless src < dest < src + count; in that case it copies backward.
  19  * Input:
  20  * rdi: dest
  21  * rsi: src
  22  * rdx: count (in bytes)
  23  * The inputs, rcx and r8-r11 may be modified; the flags are not preserved.
  24  * Output:
  25  * rax: dest (the value rdi had on entry)
  26  */
  27 SYM_FUNC_START(__memmove)
  28
  29         mov %rdi, %rax /* return value: the original dest */
  30
  31         /* Decide forward/backward copy mode */
  32         cmp %rdi, %rsi
  33         jge .Lmemmove_begin_forward /* src >= dest (signed compare): copy forward */
  34         mov %rsi, %r8
  35         add %rdx, %r8 /* r8 = src + count */
  36         cmp %rdi, %r8
  37         jg 2f /* src + count > dest (signed compare): regions overlap, copy backward */
  38
  39         /* FSRM implies ERMS => no length checks, do the copy directly */
  40 .Lmemmove_begin_forward:
  41         ALTERNATIVE "cmp $0x20, %rdx; jb 1f", "", X86_FEATURE_FSRM
  42         ALTERNATIVE "", "jmp .Lmemmove_erms", X86_FEATURE_ERMS
  43
  44         /*
  45          * The movsq instruction has a high startup latency,
  46          * so sizes below 680 bytes are copied through general registers.
  47          */
  48         cmp  $680, %rdx
  49         jb      3f
  50         /*
  51          * The movsq instruction is only good for the aligned case: use it only when the low bytes of src and dest are equal.
  52          */
  53
  54         cmpb %dil, %sil
  55         je 4f
  56 3:
  57         sub $0x20, %rdx /* pre-bias: rdx = count - 32 */
  58         /*
  59          * We gobble 32 bytes forward in each loop.
  60          */
  61 5:
  62         sub $0x20, %rdx
  63         movq 0*8(%rsi), %r11
  64         movq 1*8(%rsi), %r10
  65         movq 2*8(%rsi), %r9
  66         movq 3*8(%rsi), %r8
  67         leaq 4*8(%rsi), %rsi
  68
  69         movq %r11, 0*8(%rdi)
  70         movq %r10, 1*8(%rdi)
  71         movq %r9, 2*8(%rdi)
  72         movq %r8, 3*8(%rdi)
  73         leaq 4*8(%rdi), %rdi
  74         jae 5b /* mov/lea leave the flags alone, so this tests the sub at 5: loop while it did not borrow */
  75         addq $0x20, %rdx /* undo the bias: rdx = bytes still to copy (0..31) */
  76         jmp 1f
  77         /*
  78          * Handle data forward by movsq.
  79          */
  80         .p2align 4
  81 4:
  82         movq %rdx, %rcx
  83         movq -8(%rsi, %rdx), %r11 /* r11 = last 8 source bytes, read before the copy */
  84         lea -8(%rdi, %rdx), %r10 /* r10 = address of the last 8 dest bytes */
  85         shrq $3, %rcx /* rcx = count / 8 quadwords */
  86         rep movsq
  87         movq %r11, (%r10) /* write the saved tail: covers the count % 8 bytes rep movsq left out */
  88         jmp 13f
  89 .Lmemmove_end_forward: /* no reference to this label is visible in this file */
  90
  91         /*
  92          * Handle data backward by movsq.
  93          */
  94         .p2align 4
  95 7:
  96         movq %rdx, %rcx
  97         movq (%rsi), %r11 /* r11 = first 8 source bytes, read before the copy */
  98         movq %rdi, %r10 /* r10 = start of dest */
  99         leaq -8(%rsi, %rdx), %rsi
 100         leaq -8(%rdi, %rdx), %rdi /* rsi/rdi now point at the last quadword of each buffer */
 101         shrq $3, %rcx
 102         std /* set DF so rep movsq walks downward through memory */
 103         rep movsq
 104         cld /* clear DF again before leaving */
 105         movq %r11, (%r10) /* write the saved head: covers the count % 8 bytes rep movsq left out */
 106         jmp 13f
 107
 108         /*
 109          * Start to prepare for backward copy.
 110          */
 111         .p2align 4
 112 2:
 113         cmp $0x20, %rdx
 114         jb 1f /* fewer than 32 bytes: handled by the small-size code at 1 */
 115         cmp $680, %rdx
 116         jb 6f
 117         cmp %dil, %sil
 118         je 7b /* low bytes of src and dest are equal: use the backward movsq path */
 119 6:
 120         /*
 121          * Calculate copy position to tail.
 122          */
 123         addq %rdx, %rsi
 124         addq %rdx, %rdi
 125         subq $0x20, %rdx /* pre-bias: rdx = count - 32, as in the forward loop */
 126         /*
 127          * We gobble 32 bytes backward in each loop.
 128          */
 129 8:
 130         subq $0x20, %rdx
 131         movq -1*8(%rsi), %r11
 132         movq -2*8(%rsi), %r10
 133         movq -3*8(%rsi), %r9
 134         movq -4*8(%rsi), %r8
 135         leaq -4*8(%rsi), %rsi
 136
 137         movq %r11, -1*8(%rdi)
 138         movq %r10, -2*8(%rdi)
 139         movq %r9, -3*8(%rdi)
 140         movq %r8, -4*8(%rdi)
 141         leaq -4*8(%rdi), %rdi
 142         jae 8b /* flags are still those of the subq at 8: loop while it did not borrow */
 143         /*
 144          * Calculate copy position to head.
 145          */
 146         addq $0x20, %rdx /* undo the bias: rdx = bytes still to copy (0..31) */
 147         subq %rdx, %rsi
 148         subq %rdx, %rdi /* rsi/rdi now point at the start of the uncopied head bytes */
 149 1:
 150         cmpq $16, %rdx
 151         jb 9f
 152         /*
 153          * Move data from 16 bytes to 31 bytes: the first 16 and the last 16 bytes, all loaded before any store.
 154          */
 155         movq 0*8(%rsi), %r11
 156         movq 1*8(%rsi), %r10
 157         movq -2*8(%rsi, %rdx), %r9
 158         movq -1*8(%rsi, %rdx), %r8
 159         movq %r11, 0*8(%rdi)
 160         movq %r10, 1*8(%rdi)
 161         movq %r9, -2*8(%rdi, %rdx)
 162         movq %r8, -1*8(%rdi, %rdx)
 163         jmp 13f
 164         .p2align 4
 165 9:
 166         cmpq $8, %rdx
 167         jb 10f
 168         /*
 169          * Move data from 8 bytes to 15 bytes: the first 8 and the last 8 bytes.
 170          */
 171         movq 0*8(%rsi), %r11
 172         movq -1*8(%rsi, %rdx), %r10
 173         movq %r11, 0*8(%rdi)
 174         movq %r10, -1*8(%rdi, %rdx)
 175         jmp 13f
 176 10:
 177         cmpq $4, %rdx
 178         jb 11f
 179         /*
 180          * Move data from 4 bytes to 7 bytes: the first 4 and the last 4 bytes.
 181          */
 182         movl (%rsi), %r11d
 183         movl -4(%rsi, %rdx), %r10d
 184         movl %r11d, (%rdi)
 185         movl %r10d, -4(%rdi, %rdx)
 186         jmp 13f
 187 11:
 188         cmp $2, %rdx
 189         jb 12f
 190         /*
 191          * Move data from 2 bytes to 3 bytes: the first 2 and the last 2 bytes.
 192          */
 193         movw (%rsi), %r11w
 194         movw -2(%rsi, %rdx), %r10w
 195         movw %r11w, (%rdi)
 196         movw %r10w, -2(%rdi, %rdx)
 197         jmp 13f
 198 12:
 199         cmp $1, %rdx
 200         jb 13f /* rdx == 0: nothing left to copy */
 201         /*
 202          * Move data for 1 byte.
 203          */
 204         movb (%rsi), %r11b
 205         movb %r11b, (%rdi)
 206 13:
 207         RET
 208
 209 .Lmemmove_erms: /* target of the ERMS alternative above: copy rdx bytes with rep movsb */
 210         movq %rdx, %rcx
 211         rep movsb
 212         RET
 213 SYM_FUNC_END(__memmove)
 214 EXPORT_SYMBOL(__memmove) /* presumably makes __memmove available to loadable modules - confirm against asm/export.h */
 215
 216 SYM_FUNC_ALIAS_WEAK(memmove, __memmove) /* going by the macro name: memmove is a weak alias of __memmove */
 217 EXPORT_SYMBOL(memmove)