arch/x86/lib/memcpy_64.S

   1 /* SPDX-License-Identifier: GPL-2.0-only */
   2 /* Copyright 2002 Andi Kleen */
   3
   4 #include <linux/linkage.h>
   5 #include <asm/errno.h>
   6 #include <asm/cpufeatures.h>
   7 #include <asm/alternative-asm.h>
   8 #include <asm/export.h>
   9
  10 .pushsection .noinstr.text, "ax"
  11
/*
 * memcpy starts with a jump to memcpy_orig by default. That jump is NOPped
 * out on the majority of x86 CPUs, which set REP_GOOD. On CPUs which also
 * have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs are in turn
 * replaced by a jmp to memcpy_erms, which does the copy with REP MOVSB.
 */
  18
  19 .weak memcpy
  20
  21 /*
  22  * memcpy - Copy a memory block.
  23  *
  24  * Input:
  25  *  rdi destination
  26  *  rsi source
  27  *  rdx count
  28  *
  29  * Output:
  30  * rax original destination
  31  */
  32 SYM_FUNC_START_ALIAS(__memcpy)
  33 SYM_FUNC_START_LOCAL(memcpy)
  34         ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
  35                       "jmp memcpy_erms", X86_FEATURE_ERMS
  36
  37         movq %rdi, %rax
  38         movq %rdx, %rcx
  39         shrq $3, %rcx
  40         andl $7, %edx
  41         rep movsq
  42         movl %edx, %ecx
  43         rep movsb
  44         ret
  45 SYM_FUNC_END(memcpy)
  46 SYM_FUNC_END_ALIAS(__memcpy)
  47 EXPORT_SYMBOL(memcpy)
  48 EXPORT_SYMBOL(__memcpy)
  49
  50 /*
  51  * memcpy_erms() - enhanced fast string memcpy. This is faster and
  52  * simpler than memcpy. Use memcpy_erms when possible.
  53  */
  54 SYM_FUNC_START_LOCAL(memcpy_erms)
  55         movq %rdi, %rax
  56         movq %rdx, %rcx
  57         rep movsb
  58         ret
  59 SYM_FUNC_END(memcpy_erms)
  60
  61 SYM_FUNC_START_LOCAL(memcpy_orig)
  62         movq %rdi, %rax
  63
  64         cmpq $0x20, %rdx
  65         jb .Lhandle_tail
  66
  67         /*
  68          * We check whether memory false dependence could occur,
  69          * then jump to corresponding copy mode.
  70          */
  71         cmp  %dil, %sil
  72         jl .Lcopy_backward
  73         subq $0x20, %rdx
  74 .Lcopy_forward_loop:
  75         subq $0x20,     %rdx
  76
  77         /*
  78          * Move in blocks of 4x8 bytes:
  79          */
  80         movq 0*8(%rsi), %r8
  81         movq 1*8(%rsi), %r9
  82         movq 2*8(%rsi), %r10
  83         movq 3*8(%rsi), %r11
  84         leaq 4*8(%rsi), %rsi
  85
  86         movq %r8,       0*8(%rdi)
  87         movq %r9,       1*8(%rdi)
  88         movq %r10,      2*8(%rdi)
  89         movq %r11,      3*8(%rdi)
  90         leaq 4*8(%rdi), %rdi
  91         jae  .Lcopy_forward_loop
  92         addl $0x20,     %edx
  93         jmp  .Lhandle_tail
  94
  95 .Lcopy_backward:
  96         /*
  97          * Calculate copy position to tail.
  98          */
  99         addq %rdx,      %rsi
 100         addq %rdx,      %rdi
 101         subq $0x20,     %rdx
 102         /*
 103          * At most 3 ALU operations in one cycle,
 104          * so append NOPS in the same 16 bytes trunk.
 105          */
 106         .p2align 4
 107 .Lcopy_backward_loop:
 108         subq $0x20,     %rdx
 109         movq -1*8(%rsi),        %r8
 110         movq -2*8(%rsi),        %r9
 111         movq -3*8(%rsi),        %r10
 112         movq -4*8(%rsi),        %r11
 113         leaq -4*8(%rsi),        %rsi
 114         movq %r8,               -1*8(%rdi)
 115         movq %r9,               -2*8(%rdi)
 116         movq %r10,              -3*8(%rdi)
 117         movq %r11,              -4*8(%rdi)
 118         leaq -4*8(%rdi),        %rdi
 119         jae  .Lcopy_backward_loop
 120
 121         /*
 122          * Calculate copy position to head.
 123          */
 124         addl $0x20,     %edx
 125         subq %rdx,      %rsi
 126         subq %rdx,      %rdi
 127 .Lhandle_tail:
 128         cmpl $16,       %edx
 129         jb   .Lless_16bytes
 130
 131         /*
 132          * Move data from 16 bytes to 31 bytes.
 133          */
 134         movq 0*8(%rsi), %r8
 135         movq 1*8(%rsi), %r9
 136         movq -2*8(%rsi, %rdx),  %r10
 137         movq -1*8(%rsi, %rdx),  %r11
 138         movq %r8,       0*8(%rdi)
 139         movq %r9,       1*8(%rdi)
 140         movq %r10,      -2*8(%rdi, %rdx)
 141         movq %r11,      -1*8(%rdi, %rdx)
 142         retq
 143         .p2align 4
 144 .Lless_16bytes:
 145         cmpl $8,        %edx
 146         jb   .Lless_8bytes
 147         /*
 148          * Move data from 8 bytes to 15 bytes.
 149          */
 150         movq 0*8(%rsi), %r8
 151         movq -1*8(%rsi, %rdx),  %r9
 152         movq %r8,       0*8(%rdi)
 153         movq %r9,       -1*8(%rdi, %rdx)
 154         retq
 155         .p2align 4
 156 .Lless_8bytes:
 157         cmpl $4,        %edx
 158         jb   .Lless_3bytes
 159
 160         /*
 161          * Move data from 4 bytes to 7 bytes.
 162          */
 163         movl (%rsi), %ecx
 164         movl -4(%rsi, %rdx), %r8d
 165         movl %ecx, (%rdi)
 166         movl %r8d, -4(%rdi, %rdx)
 167         retq
 168         .p2align 4
 169 .Lless_3bytes:
 170         subl $1, %edx
 171         jb .Lend
 172         /*
 173          * Move data from 1 bytes to 3 bytes.
 174          */
 175         movzbl (%rsi), %ecx
 176         jz .Lstore_1byte
 177         movzbq 1(%rsi), %r8
 178         movzbq (%rsi, %rdx), %r9
 179         movb %r8b, 1(%rdi)
 180         movb %r9b, (%rdi, %rdx)
 181 .Lstore_1byte:
 182         movb %cl, (%rdi)
 183
 184 .Lend:
 185         retq
 186 SYM_FUNC_END(memcpy_orig)
 187
 188 .popsection