tools/arch/x86/lib/memcpy_64.S

   1 /* SPDX-License-Identifier: GPL-2.0-only */
   2 /* Copyright 2002 Andi Kleen */
   3
   4 #include <linux/linkage.h>
   5 #include <asm/errno.h>
   6 #include <asm/cpufeatures.h>
   7 #include <asm/alternative-asm.h>
   8 #include <asm/export.h>
   9
  10 .pushsection .noinstr.text, "ax"
  11
  12 /*
  13  * memcpy() starts with a jump to memcpy_orig by default. That jump is NOPped
  14  * out on the majority of x86 CPUs, which set REP_GOOD, so they fall through
  15  * to the REP MOVSQ copy. CPUs which also have the enhanced REP MOVSB/STOSB
  16  * feature (ERMS) turn those NOPs into a jmp to memcpy_erms (a REP MOVSB copy).
  17  */
  18
  19 /*
  20  * memcpy - copy %rdx bytes from (%rsi) to (%rdi).
  21  *
  22  * In:
  23  *  rdi  destination pointer
  24  *  rsi  source pointer
  25  *  rdx  number of bytes to copy
  26  *
  27  * Out:
  28  *  rax  destination pointer as passed in
  29  */
  30 SYM_FUNC_START_ALIAS(__memcpy)
  31 SYM_FUNC_START_WEAK(memcpy)
  32         ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
  33                       "jmp memcpy_erms", X86_FEATURE_ERMS
  34
  35         movq %rdx, %rcx                 /* rcx = byte count ... */
  36         shrq $3, %rcx                   /* ... converted to a count of 8-byte words */
  37         movq %rdi, %rax                 /* return value: destination as passed in */
  38         andl $7, %edx                   /* edx = bytes left over after the words (0..7) */
  39         rep movsq                       /* copy the 8-byte words */
  40         movl %edx, %ecx                 /* rcx = leftover byte count */
  41         rep movsb                       /* copy the leftover bytes */
  42         retq
  43 SYM_FUNC_END(memcpy)
  44 SYM_FUNC_END_ALIAS(__memcpy)
  45 EXPORT_SYMBOL(memcpy)
  46 EXPORT_SYMBOL(__memcpy)
  47
  48 /*
  49  * memcpy_erms() - memcpy via enhanced fast strings (ERMS): one REP MOVSB
  50  * copies all rdx bytes. Simpler than memcpy; preferred when ERMS is present.
  51  */
  52 SYM_FUNC_START_LOCAL(memcpy_erms)
  53         movq %rdx, %rcx                 /* rcx = number of bytes for REP MOVSB */
  54         movq %rdi, %rax                 /* return value: destination as passed in */
  55         rep movsb
  56         retq
  57 SYM_FUNC_END(memcpy_erms)
  58
  59 SYM_FUNC_START_LOCAL(memcpy_orig)      /* copy with plain moves; in: rdi dst, rsi src, rdx count */
  60         movq %rdi, %rax                 /* rax = original destination (return value) */
  61
  62         cmpq $0x20, %rdx
  63         jb .Lhandle_tail                /* fewer than 32 bytes: skip the block loops */
  64
  65         /*
  66          * Check whether a memory false dependence could occur (signed compare
  67          * of the low bytes of src and dst), then jump to the matching copy mode.
  68          */
  69         cmp  %dil, %sil
  70         jl .Lcopy_backward
  71         subq $0x20, %rdx                /* bias count by -32 so the loop can test the borrow */
  72 .Lcopy_forward_loop:
  73         subq $0x20,     %rdx            /* CF set when < 32 bytes will remain after this block */
  74
  75         /*
  76          * Move in blocks of 4x8 bytes (all four loads, then all four stores):
  77          */
  78         movq 0*8(%rsi), %r8
  79         movq 1*8(%rsi), %r9
  80         movq 2*8(%rsi), %r10
  81         movq 3*8(%rsi), %r11
  82         leaq 4*8(%rsi), %rsi            /* advance src without touching the flags */
  83
  84         movq %r8,       0*8(%rdi)
  85         movq %r9,       1*8(%rdi)
  86         movq %r10,      2*8(%rdi)
  87         movq %r11,      3*8(%rdi)
  88         leaq 4*8(%rdi), %rdi            /* advance dst without touching the flags */
  89         jae  .Lcopy_forward_loop        /* still the flags of the subq at the loop top */
  90         addl $0x20,     %edx            /* undo the bias: edx = bytes still to copy (0..31) */
  91         jmp  .Lhandle_tail
  92
  93 .Lcopy_backward:
  94         /*
  95          * Point rsi/rdi one past the last byte so the loop can copy downwards.
  96          */
  97         addq %rdx,      %rsi            /* rsi = src + count */
  98         addq %rdx,      %rdi            /* rdi = dst + count */
  99         subq $0x20,     %rdx            /* bias count by -32 so the loop can test the borrow */
 100         /*
 101          * At most 3 ALU operations in one cycle,
 102          * so append NOPs in the same 16-byte chunk.
 103          */
 104         .p2align 4
 105 .Lcopy_backward_loop:
 106         subq $0x20,     %rdx            /* CF set when < 32 bytes will remain after this block */
 107         movq -1*8(%rsi),        %r8
 108         movq -2*8(%rsi),        %r9
 109         movq -3*8(%rsi),        %r10
 110         movq -4*8(%rsi),        %r11
 111         leaq -4*8(%rsi),        %rsi    /* step src down without touching the flags */
 112         movq %r8,               -1*8(%rdi)
 113         movq %r9,               -2*8(%rdi)
 114         movq %r10,              -3*8(%rdi)
 115         movq %r11,              -4*8(%rdi)
 116         leaq -4*8(%rdi),        %rdi    /* step dst down without touching the flags */
 117         jae  .Lcopy_backward_loop       /* still the flags of the subq at the loop top */
 118
 119         /*
 120          * Move rsi/rdi back to the head, where the uncopied bytes are.
 121          */
 122         addl $0x20,     %edx            /* undo the bias: edx = bytes still to copy (0..31) */
 123         subq %rdx,      %rsi
 124         subq %rdx,      %rdi
 125 .Lhandle_tail:                          /* edx = 0..31 bytes left, starting at rsi/rdi */
 126         cmpl $16,       %edx
 127         jb   .Lless_16bytes
 128
 129         /*
 130          * Copy 16 to 31 bytes: the first 16 and the last 16, which may overlap.
 131          */
 132         movq 0*8(%rsi), %r8
 133         movq 1*8(%rsi), %r9
 134         movq -2*8(%rsi, %rdx),  %r10
 135         movq -1*8(%rsi, %rdx),  %r11
 136         movq %r8,       0*8(%rdi)
 137         movq %r9,       1*8(%rdi)
 138         movq %r10,      -2*8(%rdi, %rdx)
 139         movq %r11,      -1*8(%rdi, %rdx)
 140         retq
 141         .p2align 4
 142 .Lless_16bytes:
 143         cmpl $8,        %edx
 144         jb   .Lless_8bytes
 145         /*
 146          * Copy 8 to 15 bytes: the first 8 and the last 8, which may overlap.
 147          */
 148         movq 0*8(%rsi), %r8
 149         movq -1*8(%rsi, %rdx),  %r9
 150         movq %r8,       0*8(%rdi)
 151         movq %r9,       -1*8(%rdi, %rdx)
 152         retq
 153         .p2align 4
 154 .Lless_8bytes:
 155         cmpl $4,        %edx
 156         jb   .Lless_3bytes
 157
 158         /*
 159          * Copy 4 to 7 bytes: the first 4 and the last 4, which may overlap.
 160          */
 161         movl (%rsi), %ecx
 162         movl -4(%rsi, %rdx), %r8d
 163         movl %ecx, (%rdi)
 164         movl %r8d, -4(%rdi, %rdx)
 165         retq
 166         .p2align 4
 167 .Lless_3bytes:                          /* 0 to 3 bytes left */
 168         subl $1, %edx                   /* edx = count - 1; a borrow means count was 0 */
 169         jb .Lend
 170         /*
 171          * Copy 1 to 3 bytes.
 172          */
 173         movzbl (%rsi), %ecx             /* first byte; the flags of the subl stay intact */
 174         jz .Lstore_1byte                /* count was 1: only the first byte to store */
 175         movzbq 1(%rsi), %r8             /* second byte */
 176         movzbq (%rsi, %rdx), %r9        /* last byte, at index count - 1 */
 177         movb %r8b, 1(%rdi)
 178         movb %r9b, (%rdi, %rdx)
 179 .Lstore_1byte:
 180         movb %cl, (%rdi)
 181
 182 .Lend:
 183         retq
 184 SYM_FUNC_END(memcpy_orig)
 185
 186 .popsection