arch/x86/lib/memmove_64.S

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 /*
   3  * Normally compiler builtins are used, but sometimes the compiler calls out
   4  * of line code. Based on asm-i386/string.h.
   5  *
   6  * This assembly file is re-written from memmove_64.c file.
   7  *      - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
   8  */
   9 #include <linux/linkage.h>
  10 #include <asm/cpufeatures.h>
  11 #include <asm/alternative-asm.h>
  12 #include <asm/export.h>
  13
  14 #undef memmove
  15
  16 /*
  17  * Implement memmove(). This can handle overlap between src and dst.
  18  *
  19  * Input:
  20  * rdi: dest
  21  * rsi: src
  22  * rdx: count
  23  *
  24  * Output:
  25  * rax: dest
  26  *
  27  * Scratch registers written: rcx, rdx, rsi, rdi, r8-r11 and the flags.
  28  * The direction flag is set only around the backward "rep movsq" and is
  29  * cleared again before that path returns.
  30  *
  31  * Strategy:
  32  *  - Copy backward (tail first) only when src < dest < src + count;
  33  *    copy forward in every other case.
  34  *  - With ERMS patched in, forward copies are done by "rep movsb"
  35  *    (counts below 32 only when FSRM is patched in as well).
  36  *  - Otherwise counts >= 680 whose src and dest have identical low
  37  *    address bytes use "rep movsq"; all other counts >= 32 go through
  38  *    a 32-byte unrolled loop using general purpose registers.
  39  *  - The remaining 0..31 bytes are handled at label 1, which does all
  40  *    of its loads before any store and so works for both directions.
  41  */
  42 .weak memmove
  43
  44 SYM_FUNC_START_ALIAS(memmove)
  45 SYM_FUNC_START(__memmove)
  46
  47         mov %rdi, %rax                  /* return value: dest */
  48
  49         /*
  50          * Decide forward/backward copy mode (signed compares): go
  51          * backward only when src < dest < src + count.
  52          */
  53         cmp %rdi, %rsi
  54         jge .Lmemmove_begin_forward
  55         mov %rsi, %r8
  56         add %rdx, %r8
  57         cmp %rdi, %r8
  58         jg 2f
  59
  60         /*
  61          * Without FSRM, counts below 32 go straight to the tail code.
  62          * With ERMS, "rep movsb" copies everything and returns.
  63          */
  64 .Lmemmove_begin_forward:
  65         ALTERNATIVE "cmp $0x20, %rdx; jb 1f", "", X86_FEATURE_FSRM
  66         ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
  67
  68         /*
  69          * The two alternatives above are patched independently. If FSRM
  70          * is set while ERMS is clear, the length check is removed but
  71          * "rep movsb" is not installed, and a count below 32 would reach
  72          * the 32-byte loop, underflow %rdx and copy far past the buffers.
  73          * Check the length again on this fall-through path.
  74          */
  75         cmp $0x20, %rdx
  76         jb 1f
  77
  78         /*
  79          * movsq has a high startup latency, so smaller sizes are
  80          * copied through general purpose registers instead.
  81          */
  82         cmp  $680, %rdx
  83         jb      3f
  84         /*
  85          * movsq is only used when the low bytes of the src and dest
  86          * addresses are equal.
  87          */
  88
  89         cmpb %dil, %sil
  90         je 4f
  91 3:
  92         /* Pre-bias the count so the loop can branch on the borrow. */
  93         sub $0x20, %rdx
  94         /*
  95          * We gobble 32 bytes forward in each loop.
  96          * The moves and leas below leave the flags alone, so "jae"
  97          * still sees the carry produced by the "sub" at the loop top.
  98          */
  99 5:
 100         sub $0x20, %rdx
 101         movq 0*8(%rsi), %r11
 102         movq 1*8(%rsi), %r10
 103         movq 2*8(%rsi), %r9
 104         movq 3*8(%rsi), %r8
 105         leaq 4*8(%rsi), %rsi
 106
 107         movq %r11, 0*8(%rdi)
 108         movq %r10, 1*8(%rdi)
 109         movq %r9, 2*8(%rdi)
 110         movq %r8, 3*8(%rdi)
 111         leaq 4*8(%rdi), %rdi
 112         jae 5b
 113         addq $0x20, %rdx                /* undo the bias: 0..31 bytes left */
 114         jmp 1f
 115         /*
 116          * Handle data forward by movsq.
 117          * The last 8 source bytes are loaded up front and stored last,
 118          * which covers the count % 8 bytes "rep movsq" does not copy.
 119          */
 120         .p2align 4
 121 4:
 122         movq %rdx, %rcx
 123         movq -8(%rsi, %rdx), %r11       /* last 8 bytes of src */
 124         lea -8(%rdi, %rdx), %r10        /* where they go in dest */
 125         shrq $3, %rcx                   /* number of whole quadwords */
 126         rep movsq
 127         movq %r11, (%r10)
 128         jmp 13f
 129 .Lmemmove_end_forward:
 130
 131         /*
 132          * Handle data backward by movsq.
 133          * The first 8 source bytes are loaded up front and stored last,
 134          * which covers the count % 8 bytes "rep movsq" does not copy.
 135          */
 136         .p2align 4
 137 7:
 138         movq %rdx, %rcx
 139         movq (%rsi), %r11               /* first 8 bytes of src */
 140         movq %rdi, %r10                 /* start of dest */
 141         leaq -8(%rsi, %rdx), %rsi       /* last quadword of src */
 142         leaq -8(%rdi, %rdx), %rdi       /* last quadword of dest */
 143         shrq $3, %rcx
 144         std                             /* copy downwards */
 145         rep movsq
 146         cld
 147         movq %r11, (%r10)
 148         jmp 13f
 149
 150         /*
 151          * Start to prepare for backward copy.
 152          * Counts below 32 are left to the tail code at label 1.
 153          */
 154         .p2align 4
 155 2:
 156         cmp $0x20, %rdx
 157         jb 1f
 158         cmp $680, %rdx
 159         jb 6f
 160         cmp %dil, %sil
 161         je 7b
 162 6:
 163         /*
 164          * Calculate copy position to tail.
 165          */
 166         addq %rdx, %rsi
 167         addq %rdx, %rdi
 168         subq $0x20, %rdx
 169         /*
 170          * We gobble 32 bytes backward in each loop.
 171          */
 172 8:
 173         subq $0x20, %rdx
 174         movq -1*8(%rsi), %r11
 175         movq -2*8(%rsi), %r10
 176         movq -3*8(%rsi), %r9
 177         movq -4*8(%rsi), %r8
 178         leaq -4*8(%rsi), %rsi
 179
 180         movq %r11, -1*8(%rdi)
 181         movq %r10, -2*8(%rdi)
 182         movq %r9, -3*8(%rdi)
 183         movq %r8, -4*8(%rdi)
 184         leaq -4*8(%rdi), %rdi
 185         jae 8b
 186         /*
 187          * Calculate copy position to head: %rdx is restored to the
 188          * 0..31 bytes still to copy, which sit at the start of both
 189          * buffers.
 190          */
 191         addq $0x20, %rdx
 192         subq %rdx, %rsi
 193         subq %rdx, %rdi
 194 1:
 195         cmpq $16, %rdx
 196         jb 9f
 197         /*
 198          * Move data from 16 bytes to 31 bytes.
 199          * Both 16-byte halves are loaded before anything is stored;
 200          * they may overlap in the middle.
 201          */
 202         movq 0*8(%rsi), %r11
 203         movq 1*8(%rsi), %r10
 204         movq -2*8(%rsi, %rdx), %r9
 205         movq -1*8(%rsi, %rdx), %r8
 206         movq %r11, 0*8(%rdi)
 207         movq %r10, 1*8(%rdi)
 208         movq %r9, -2*8(%rdi, %rdx)
 209         movq %r8, -1*8(%rdi, %rdx)
 210         jmp 13f
 211         .p2align 4
 212 9:
 213         cmpq $8, %rdx
 214         jb 10f
 215         /*
 216          * Move data from 8 bytes to 15 bytes.
 217          */
 218         movq 0*8(%rsi), %r11
 219         movq -1*8(%rsi, %rdx), %r10
 220         movq %r11, 0*8(%rdi)
 221         movq %r10, -1*8(%rdi, %rdx)
 222         jmp 13f
 223 10:
 224         cmpq $4, %rdx
 225         jb 11f
 226         /*
 227          * Move data from 4 bytes to 7 bytes.
 228          */
 229         movl (%rsi), %r11d
 230         movl -4(%rsi, %rdx), %r10d
 231         movl %r11d, (%rdi)
 232         movl %r10d, -4(%rdi, %rdx)
 233         jmp 13f
 234 11:
 235         cmp $2, %rdx
 236         jb 12f
 237         /*
 238          * Move data from 2 bytes to 3 bytes.
 239          */
 240         movw (%rsi), %r11w
 241         movw -2(%rsi, %rdx), %r10w
 242         movw %r11w, (%rdi)
 243         movw %r10w, -2(%rdi, %rdx)
 244         jmp 13f
 245 12:
 246         cmp $1, %rdx
 247         jb 13f
 248         /*
 249          * Move data for 1 byte.
 250          */
 251         movb (%rsi), %r11b
 252         movb %r11b, (%rdi)
 253 13:
 254         retq
 255 SYM_FUNC_END(__memmove)
 256 SYM_FUNC_END_ALIAS(memmove)
 257 EXPORT_SYMBOL(__memmove)
 258 EXPORT_SYMBOL(memmove)