arch/x86/lib/memmove_64.S

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 /*
   3  * Normally compiler builtins are used, but sometimes the compiler calls out
   4  * of line code. Based on asm-i386/string.h.
   5  *
   6  * This assembly file is re-written from memmove_64.c file.
   7  *      - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
   8  */
   9 #include <linux/linkage.h>
  10 #include <asm/cpufeatures.h>
  11 #include <asm/alternative.h>
  12 #include <asm/export.h>
  13
  14 #undef memmove
  15
  16 /*
  17  * Implement memmove(). This can handle overlap between src and dst.
  18  *
  19  * Input:
  20  * rdi: dest
  21  * rsi: src
  22  * rdx: count
  23  *
  24  * Output:
  25  * rax: dest
  26  *
  27  * Clobbers rcx, r8, r9, r10, r11 and flags; rdi, rsi and rdx are
  28  * modified during the copy.
  29  *
  30  * The copy runs forward when src >= dest or when the source range ends
  31  * at or before dest; otherwise it runs backward, so overlapping bytes
  32  * are read before they are overwritten.
  33  */
  34 SYM_FUNC_START_WEAK(memmove)
  35 SYM_FUNC_START(__memmove)
  36
  37         mov %rdi, %rax                  /* return value is dest */
  38
  39         /* Decide forward/backward copy mode */
  40         cmp %rdi, %rsi
  41         jge .Lmemmove_begin_forward     /* src >= dest */
  42         mov %rsi, %r8
  43         add %rdx, %r8                   /* r8 = src + count */
  44         cmp %rdi, %r8
  45         jg 2f                           /* src < dest < src + count: copy backward */
  46
  47         /*
  48          * Forward copy entry. The two alternatives are independent, so no
  49          * "FSRM implies ERMS" assumption is needed:
  50          *
  51          *  - FSRM:      rep movsb for every length, then return.
  52          *  - ERMS only: lengths below 32 go to the tail code at 1:,
  53          *               anything else uses rep movsb, then returns.
  54          *  - neither:   lengths below 32 go to the tail code at 1:,
  55          *               anything else falls through to the code below.
  56          *
  57          * If FSRM merely dropped the length check and left rep movsb to the
  58          * ERMS alternative, a CPU reporting FSRM without ERMS would reach
  59          * "sub $0x20, %rdx" at 3: with count < 32 and wrap the counter.
  60          */
  61 .Lmemmove_begin_forward:
  62         ALTERNATIVE "cmp $0x20, %rdx; jb 1f", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_FSRM
  63         ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
  64
  65         /*
  66          * The movsq instruction has a high startup latency,
  67          * so smaller sizes are handled with general registers.
  68          */
  69         cmp  $680, %rdx
  70         jb      3f
  71         /*
  72          * movsq is only good for the aligned case: take it only when
  73          * src and dest have the same low address byte.
  74          */
  75
  76         cmpb %dil, %sil
  77         je 4f
  78 3:
  79         sub $0x20, %rdx
  80         /*
  81          * We gobble 32 bytes forward in each loop.
  82          * mov and lea do not modify flags, so the jae at the bottom
  83          * still tests the borrow from the sub at the top.
  84          */
  85 5:
  86         sub $0x20, %rdx
  87         movq 0*8(%rsi), %r11
  88         movq 1*8(%rsi), %r10
  89         movq 2*8(%rsi), %r9
  90         movq 3*8(%rsi), %r8
  91         leaq 4*8(%rsi), %rsi
  92
  93         movq %r11, 0*8(%rdi)
  94         movq %r10, 1*8(%rdi)
  95         movq %r9, 2*8(%rdi)
  96         movq %r8, 3*8(%rdi)
  97         leaq 4*8(%rdi), %rdi
  98         jae 5b
  99         addq $0x20, %rdx                /* rdx = bytes left, below 32 */
 100         jmp 1f
 101         /*
 102          * Handle data forward by movsq.
 103          */
 104         .p2align 4
 105 4:
 106         movq %rdx, %rcx
 107         movq -8(%rsi, %rdx), %r11       /* save last 8 bytes of src */
 108         lea -8(%rdi, %rdx), %r10        /* r10 = dest + count - 8 */
 109         shrq $3, %rcx                   /* rcx = count / 8 */
 110         rep movsq
 111         movq %r11, (%r10)               /* store tail, covers count % 8 */
 112         jmp 13f
 113 .Lmemmove_end_forward:
 114
 115         /*
 116          * Handle data backward by movsq.
 117          */
 118         .p2align 4
 119 7:
 120         movq %rdx, %rcx
 121         movq (%rsi), %r11               /* save first 8 bytes of src */
 122         movq %rdi, %r10                 /* r10 = dest */
 123         leaq -8(%rsi, %rdx), %rsi       /* point at the last qword */
 124         leaq -8(%rdi, %rdx), %rdi
 125         shrq $3, %rcx                   /* rcx = count / 8 */
 126         std                             /* copy downward */
 127         rep movsq
 128         cld
 129         movq %r11, (%r10)               /* store head, covers count % 8 */
 130         jmp 13f
 131
 132         /*
 133          * Start to prepare for backward copy.
 134          */
 135         .p2align 4
 136 2:
 137         cmp $0x20, %rdx
 138         jb 1f                           /* below 32 bytes: tail code */
 139         cmp $680, %rdx
 140         jb 6f
 141         cmpb %dil, %sil
 142         je 7b                           /* same low address byte: movsq */
 143 6:
 144         /*
 145          * Calculate copy position to tail.
 146          */
 147         addq %rdx, %rsi
 148         addq %rdx, %rdi
 149         subq $0x20, %rdx
 150         /*
 151          * We gobble 32 bytes backward in each loop.
 152          */
 153 8:
 154         subq $0x20, %rdx
 155         movq -1*8(%rsi), %r11
 156         movq -2*8(%rsi), %r10
 157         movq -3*8(%rsi), %r9
 158         movq -4*8(%rsi), %r8
 159         leaq -4*8(%rsi), %rsi
 160
 161         movq %r11, -1*8(%rdi)
 162         movq %r10, -2*8(%rdi)
 163         movq %r9, -3*8(%rdi)
 164         movq %r8, -4*8(%rdi)
 165         leaq -4*8(%rdi), %rdi
 166         jae 8b
 167         /*
 168          * Calculate copy position to head.
 169          */
 170         addq $0x20, %rdx
 171         subq %rdx, %rsi
 172         subq %rdx, %rdi
 173         /*
 174          * Tail code: rdx is below 32 here. Each size class loads the
 175          * first and last chunk of the range before storing either one,
 176          * so it is correct for overlapping buffers in both directions.
 177          */
 178 1:
 179         cmpq $16, %rdx
 180         jb 9f
 181         /*
 182          * Move data from 16 bytes to 31 bytes.
 183          */
 184         movq 0*8(%rsi), %r11
 185         movq 1*8(%rsi), %r10
 186         movq -2*8(%rsi, %rdx), %r9
 187         movq -1*8(%rsi, %rdx), %r8
 188         movq %r11, 0*8(%rdi)
 189         movq %r10, 1*8(%rdi)
 190         movq %r9, -2*8(%rdi, %rdx)
 191         movq %r8, -1*8(%rdi, %rdx)
 192         jmp 13f
 193         .p2align 4
 194 9:
 195         cmpq $8, %rdx
 196         jb 10f
 197         /*
 198          * Move data from 8 bytes to 15 bytes.
 199          */
 200         movq 0*8(%rsi), %r11
 201         movq -1*8(%rsi, %rdx), %r10
 202         movq %r11, 0*8(%rdi)
 203         movq %r10, -1*8(%rdi, %rdx)
 204         jmp 13f
 205 10:
 206         cmpq $4, %rdx
 207         jb 11f
 208         /*
 209          * Move data from 4 bytes to 7 bytes.
 210          */
 211         movl (%rsi), %r11d
 212         movl -4(%rsi, %rdx), %r10d
 213         movl %r11d, (%rdi)
 214         movl %r10d, -4(%rdi, %rdx)
 215         jmp 13f
 216 11:
 217         cmp $2, %rdx
 218         jb 12f
 219         /*
 220          * Move data from 2 bytes to 3 bytes.
 221          */
 222         movw (%rsi), %r11w
 223         movw -2(%rsi, %rdx), %r10w
 224         movw %r11w, (%rdi)
 225         movw %r10w, -2(%rdi, %rdx)
 226         jmp 13f
 227 12:
 228         cmp $1, %rdx
 229         jb 13f
 230         /*
 231          * Move data for 1 byte.
 232          */
 233         movb (%rsi), %r11b
 234         movb %r11b, (%rdi)
 235 13:
 236         retq
 237 SYM_FUNC_END(__memmove)
 238 SYM_FUNC_END_ALIAS(memmove)
 239 EXPORT_SYMBOL(__memmove)
 240 EXPORT_SYMBOL(memmove)