arch/x86/lib/memcpy_64.S

   1 /* SPDX-License-Identifier: GPL-2.0-only */
   2 /* Copyright 2002 Andi Kleen */
   3
   4 #include <linux/linkage.h>
   5 #include <asm/errno.h>
   6 #include <asm/cpufeatures.h>
   7 #include <asm/mcsafe_test.h>
   8 #include <asm/alternative-asm.h>
   9 #include <asm/export.h>
  10
  11 .pushsection .noinstr.text, "ax"
  12
   13 /*
   14  * memcpy starts with a jump to memcpy_orig by default. That jump is NOPped
   15  * out on the majority of x86 CPUs, which set REP_GOOD. On CPUs that also
   16  * have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs are instead
   17  * patched to a jmp to memcpy_erms, which does the copy with REP MOVSB.
   18  */
  19
  20 .weak memcpy
  21
  22 /*
  23  * memcpy - Copy a memory block.
  24  *
  25  * Input:
  26  *  rdi destination
  27  *  rsi source
  28  *  rdx count
  29  *
  30  * Output:
  31  * rax original destination
  32  */
  33 SYM_FUNC_START_ALIAS(__memcpy)
  34 SYM_FUNC_START_LOCAL(memcpy)
  35         ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
  36                       "jmp memcpy_erms", X86_FEATURE_ERMS
  37
  38         movq %rdi, %rax
  39         movq %rdx, %rcx
  40         shrq $3, %rcx
  41         andl $7, %edx
  42         rep movsq
  43         movl %edx, %ecx
  44         rep movsb
  45         ret
  46 SYM_FUNC_END(memcpy)
  47 SYM_FUNC_END_ALIAS(__memcpy)
  48 EXPORT_SYMBOL(memcpy)
  49 EXPORT_SYMBOL(__memcpy)
  50
  51 /*
  52  * memcpy_erms() - enhanced fast string memcpy. This is faster and
  53  * simpler than memcpy. Use memcpy_erms when possible.
  54  */
  55 SYM_FUNC_START_LOCAL(memcpy_erms)
  56         movq %rdi, %rax
  57         movq %rdx, %rcx
  58         rep movsb
  59         ret
  60 SYM_FUNC_END(memcpy_erms)
  61
   62 SYM_FUNC_START_LOCAL(memcpy_orig)
              /*
               * memcpy_orig - copy rdx bytes from rsi to rdi using plain
               * loads and stores, with no REP string instructions.
               *
               * Input:
               *  rdi destination
               *  rsi source
               *  rdx count
               *
               * Output:
               *  rax original destination
               *
               * May also write rcx, rdx, rsi, rdi, r8-r11 and the flags.
               *
               * Counts of 0x20 and up go through a loop that moves 0x20 bytes
               * per iteration (forward or backward). Whatever is left, 0..0x1f
               * bytes, is finished at .Lhandle_tail with at most four loads
               * and four stores taken from both ends of the remainder.
               */
   63         movq %rdi, %rax
   64
   65         cmpq $0x20, %rdx
   66         jb .Lhandle_tail                /* fewer than 0x20 bytes: skip the loops */
   67
   68         /*
   69          * We check whether memory false dependence could occur,
   70          * then jump to corresponding copy mode.
               *
               * Only the low bytes of the two pointers are compared, as
               * signed values: the backward copy is taken when the low
               * byte of rsi is less than the low byte of rdi.
   71          */
   72         cmp  %dil, %sil
   73         jl .Lcopy_backward
   74         subq $0x20, %rdx                /* bias the count: rdx = count - 0x20 */
   75 .Lcopy_forward_loop:
   76         subq $0x20,     %rdx            /* borrow (CF) marks the last full block */
   77
   78         /*
   79          * Move in blocks of 4x8 bytes:
               * (movq and leaq leave the flags alone, so the jae at the
               * bottom still tests the carry produced by the subq above.)
   80          */
   81         movq 0*8(%rsi), %r8
   82         movq 1*8(%rsi), %r9
   83         movq 2*8(%rsi), %r10
   84         movq 3*8(%rsi), %r11
   85         leaq 4*8(%rsi), %rsi
   86
   87         movq %r8,       0*8(%rdi)
   88         movq %r9,       1*8(%rdi)
   89         movq %r10,      2*8(%rdi)
   90         movq %r11,      3*8(%rdi)
   91         leaq 4*8(%rdi), %rdi
   92         jae  .Lcopy_forward_loop        /* no borrow: at least one more full block */
   93         addl $0x20,     %edx            /* undo the bias: edx = bytes left, 0..0x1f */
   94         jmp  .Lhandle_tail
   95
   96 .Lcopy_backward:
   97         /*
   98          * Calculate copy position to tail.
   99          */
  100         addq %rdx,      %rsi            /* rsi = one past the end of the source */
  101         addq %rdx,      %rdi            /* rdi = one past the end of the destination */
  102         subq $0x20,     %rdx            /* same count bias as the forward path */
  103         /*
  104          * At most 3 ALU operations in one cycle,
  105          * so append NOPs in the same 16-byte chunk.
  106          */
  107         .p2align 4
  108 .Lcopy_backward_loop:
  109         subq $0x20,     %rdx            /* borrow (CF) marks the last full block */
  110         movq -1*8(%rsi),        %r8
  111         movq -2*8(%rsi),        %r9
  112         movq -3*8(%rsi),        %r10
  113         movq -4*8(%rsi),        %r11
  114         leaq -4*8(%rsi),        %rsi
  115         movq %r8,               -1*8(%rdi)
  116         movq %r9,               -2*8(%rdi)
  117         movq %r10,              -3*8(%rdi)
  118         movq %r11,              -4*8(%rdi)
  119         leaq -4*8(%rdi),        %rdi
  120         jae  .Lcopy_backward_loop       /* CF still comes from the subq at the loop top */
  121
  122         /*
  123          * Calculate copy position to head.
  124          */
  125         addl $0x20,     %edx            /* undo the bias: edx = bytes left, 0..0x1f */
  126         subq %rdx,      %rsi            /* the uncopied bytes sit at the start of the */
  127         subq %rdx,      %rdi            /* buffers, so step both pointers back to it  */
  128 .Lhandle_tail:
              /* Here edx = bytes still to copy (0..0x1f), starting at rsi/rdi. */
  129         cmpl $16,       %edx
  130         jb   .Lless_16bytes
  131
  132         /*
  133          * Move data from 16 bytes to 31 bytes.
               * The first 16 and the last 16 bytes are all loaded before
               * anything is stored; the two halves overlap in the middle.
  134          */
  135         movq 0*8(%rsi), %r8
  136         movq 1*8(%rsi), %r9
  137         movq -2*8(%rsi, %rdx),  %r10
  138         movq -1*8(%rsi, %rdx),  %r11
  139         movq %r8,       0*8(%rdi)
  140         movq %r9,       1*8(%rdi)
  141         movq %r10,      -2*8(%rdi, %rdx)
  142         movq %r11,      -1*8(%rdi, %rdx)
  143         retq
  144         .p2align 4
  145 .Lless_16bytes:
  146         cmpl $8,        %edx
  147         jb   .Lless_8bytes
  148         /*
  149          * Move data from 8 bytes to 15 bytes.
               * Same scheme: the first 8 and the last 8 bytes.
  150          */
  151         movq 0*8(%rsi), %r8
  152         movq -1*8(%rsi, %rdx),  %r9
  153         movq %r8,       0*8(%rdi)
  154         movq %r9,       -1*8(%rdi, %rdx)
  155         retq
  156         .p2align 4
  157 .Lless_8bytes:
  158         cmpl $4,        %edx
  159         jb   .Lless_3bytes
  160
  161         /*
  162          * Move data from 4 bytes to 7 bytes.
               * Same scheme: the first 4 and the last 4 bytes.
  163          */
  164         movl (%rsi), %ecx
  165         movl -4(%rsi, %rdx), %r8d
  166         movl %ecx, (%rdi)
  167         movl %r8d, -4(%rdi, %rdx)
  168         retq
  169         .p2align 4
  170 .Lless_3bytes:
              /* Despite the label name, edx is 0..3 here (anything below 4). */
  171         subl $1, %edx                   /* edx = count - 1; borrows if count was 0 */
  172         jb .Lend
  173         /*
  174          * Move data from 1 byte to 3 bytes.
  175          */
  176         movzbl (%rsi), %ecx             /* first byte; movzbl leaves the flags alone */
  177         jz .Lstore_1byte                /* ZF from the subl: count was exactly 1 */
  178         movzbq 1(%rsi), %r8             /* second byte */
  179         movzbq (%rsi, %rdx), %r9        /* last byte, at index count - 1 */
  180         movb %r8b, 1(%rdi)
  181         movb %r9b, (%rdi, %rdx)
  182 .Lstore_1byte:
  183         movb %cl, (%rdi)
  184
  185 .Lend:
  186         retq
  187 SYM_FUNC_END(memcpy_orig)
 188
 189 .popsection
 190
  191 #ifndef CONFIG_UML
  192
  193 MCSAFE_TEST_CTL
  194
  195 /*
  196  * __memcpy_mcsafe - memory copy with machine check exception handling
  197  * Note that we only catch machine checks when reading the source addresses.
  198  * Writes to target are posted and don't generate machine checks.
        *
        * Input:
        *  rdi destination
        *  rsi source
        *  edx count
        *
        * Output:
        *  eax 0 if everything was copied. On a fault taken through the
        *      fixup code below, the number of bytes not copied. A fault
        *      while writing a whole word is handed to mcsafe_handle_tail.
        *
        * The copy runs in up to three phases: single bytes until the source
        * is 8-byte aligned, then 8-byte words, then single trailing bytes.
        * rcx, rdx, rsi, rdi, r8, al and the flags are used as scratch.
        *
        * NOTE(review): the count is only ever read through %edx, so bits
        * 32..63 of %rdx are ignored. Confirm that no caller depends on
        * counts of 4 GiB or more.
  199  */
  200 SYM_FUNC_START(__memcpy_mcsafe)
  201         cmpl $8, %edx
  202         /* Less than 8 bytes? Go to byte copy loop */
  203         jb .L_no_whole_words
  204
  205         /* Check for bad alignment of source */
  206         testl $7, %esi
  207         /* Already aligned */
  208         jz .L_8byte_aligned
  209
  210         /* Copy one byte at a time until source is 8-byte aligned */
  211         movl %esi, %ecx
  212         andl $7, %ecx
  213         subl $8, %ecx
  214         negl %ecx                       /* ecx = 8 - (src & 7): leading bytes, 1..7 */
  215         subl %ecx, %edx                 /* edx = bytes left for the later phases */
  216 .L_read_leading_bytes:
  217         movb (%rsi), %al
              /*
               * NOTE(review): MCSAFE_TEST_SRC/MCSAFE_TEST_DST are macros defined
               * outside this file (presumably <asm/mcsafe_test.h>). They are
               * passed a pointer register, an access size and a fixup label;
               * what they expand to cannot be seen here.
               */
  218         MCSAFE_TEST_SRC %rsi 1 .E_leading_bytes
  219         MCSAFE_TEST_DST %rdi 1 .E_leading_bytes
  220 .L_write_leading_bytes:
  221         movb %al, (%rdi)
  222         incq %rsi
  223         incq %rdi
  224         decl %ecx
  225         jnz .L_read_leading_bytes
  226
  227 .L_8byte_aligned:
  228         movl %edx, %ecx
  229         andl $7, %edx                   /* edx = trailing byte count */
  230         shrl $3, %ecx                   /* ecx = number of whole 8-byte words */
  231         jz .L_no_whole_words
  232
  233 .L_read_words:
  234         movq (%rsi), %r8
  235         MCSAFE_TEST_SRC %rsi 8 .E_read_words
  236         MCSAFE_TEST_DST %rdi 8 .E_write_words
  237 .L_write_words:
  238         movq %r8, (%rdi)
  239         addq $8, %rsi
  240         addq $8, %rdi
  241         decl %ecx
  242         jnz .L_read_words
  243
  244         /* Any trailing bytes? */
  245 .L_no_whole_words:
  246         andl %edx, %edx                 /* sets ZF when edx == 0; edx is unchanged */
  247         jz .L_done_memcpy_trap
  248
  249         /* Copy trailing bytes */
  250         movl %edx, %ecx
  251 .L_read_trailing_bytes:
  252         movb (%rsi), %al
  253         MCSAFE_TEST_SRC %rsi 1 .E_trailing_bytes
  254         MCSAFE_TEST_DST %rdi 1 .E_trailing_bytes
  255 .L_write_trailing_bytes:
  256         movb %al, (%rdi)
  257         incq %rsi
  258         incq %rdi
  259         decl %ecx
  260         jnz .L_read_trailing_bytes
  261
  262         /* Copy successful. Return zero */
  263 .L_done_memcpy_trap:
  264         xorl %eax, %eax
  265 .L_done:
  266         ret
  267 SYM_FUNC_END(__memcpy_mcsafe)
  268 EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
  269
  270         .section .fixup, "ax"
  271         /*
  272          * Return number of bytes not copied for any failure. Note that
  273          * there is no "tail" handling since the source buffer is 8-byte
  274          * aligned and poison is cacheline aligned.
               *
               * The three labels fall through into each other:
               *   .E_read_words:     ecx (words left) is scaled to bytes,
               *   .E_leading_bytes:  edx (bytes of the later phases) is added,
               *   .E_trailing_bytes: the total is returned in eax.
               * ecx is only decremented after a successful store, so the unit
               * that faulted is still counted as not copied.
  275          */
  276 .E_read_words:
  277         shll    $3, %ecx
  278 .E_leading_bytes:
  279         addl    %edx, %ecx
  280 .E_trailing_bytes:
  281         mov     %ecx, %eax
  282         jmp     .L_done
  283
  284         /*
  285          * For write fault handling, given the destination is unaligned,
  286          * we handle faults on multi-byte writes with a byte-by-byte
  287          * copy up to the write-protected page.
  288          */
  289 .E_write_words:
  290         shll    $3, %ecx                /* words left -> bytes */
  291         addl    %edx, %ecx              /* plus the trailing bytes */
  292         movl    %ecx, %edx              /* edx = bytes not yet copied */
              /*
               * Tail jump, so mcsafe_handle_tail returns straight to our caller.
               * It is not defined in this file; rdi and rsi still point at the
               * word whose store faulted. NOTE(review): presumably it takes
               * (rdi, rsi, edx) as dest, src, length. Confirm at its definition.
               */
  293         jmp mcsafe_handle_tail
  294
  295         .previous
  296
              /*
               * Exception table entries, each pairing a label inside the function
               * with the fixup label to resume at. The loads use _ASM_EXTABLE_FAULT
               * and the stores use _ASM_EXTABLE; both macros are defined elsewhere.
               */
  297         _ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes)
  298         _ASM_EXTABLE_FAULT(.L_read_words, .E_read_words)
  299         _ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes)
  300         _ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes)
  301         _ASM_EXTABLE(.L_write_words, .E_write_words)
  302         _ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
  303 #endif