arch/x86/lib/csum-copy_64.S

   1 /*
   2  * Copyright 2002, 2003 Andi Kleen, SuSE Labs.
   3  *
   4  * This file is subject to the terms and conditions of the GNU General Public
   5  * License.  See the file COPYING in the main directory of this archive
   6  * for more details. No warranty for anything given at all.
   7  */
   8 #include <linux/linkage.h>
   9 #include <asm/errno.h>
  10 #include <asm/asm.h>
  11
  12 /*
  13  * Checksum copy with exception handling.
  14  * On a faulting access the routine returns 0 in eax (see .Lfault); it takes
  15  * no error pointers and does not zero the destination itself.
  16  *
  17  * Input
  18  * rdi  source
  19  * rsi  destination
  20  * edx  len (32bit)
  21  *
  22  * Output
  23  * eax  32bit sum, folded from the 64bit accumulator. 0 in case of exception.
  24  *
  25  * Wrappers need to take care of valid exception sum and zeroing.
  26  * They also should align source or destination to 8 bytes.
  27  */
  28
  29         .macro source           /* placed directly before each load from (%rdi) in this file */
  30 10:                             /* numeric local label; resolves to the address of whatever is emitted next in this section */
  31         _ASM_EXTABLE_UA(10b, .Lfault)   /* pairs label 10 with .Lfault; presumably an exception-table fixup entry (macro defined outside this file) */
  32         .endm
  33
  34         .macro dest             /* placed directly before each store to (%rsi) in this file */
  35 20:                             /* numeric local label; resolves to the address of whatever is emitted next in this section */
  36         _ASM_EXTABLE_UA(20b, .Lfault)   /* pairs label 20 with .Lfault; presumably an exception-table fixup entry (macro defined outside this file) */
  37         .endm
  38
  39 SYM_FUNC_START(csum_partial_copy_generic)
             /*
              * Copies edx bytes from (%rdi) to (%rsi) and, in the same pass,
              * sums the copied data with add-with-carry arithmetic.  The
              * result is returned in eax; a faulting access returns 0.
              *
              * Register roles outside the main loop:
              *   rdi  source cursor            rsi  destination cursor
              *   rax  running sum, seeded with 0x00000000ffffffff
              *   rcx  bytes still to process; bit 63 is set by .Lodd to
              *        request the final rol8
              *   r9   constant zero, used as "adc 0" to add in a pending carry
              *   r10  copy of rcx taken at .Lhandle_tail / .Lshort
              *        (it is temp5 inside the main loop)
              *   rbx  scratch for the data being moved
              *
              * Flag invariant: inside each loop CF carries from one adc to
              * the next.  Only mov, lea, prefetch, dec and jnz sit between
              * them, and none of those write CF, so do not reorder or replace
              * them with add/sub.
              *
              * Every load from (%rdi) is preceded by "source" and every
              * store to (%rsi) by "dest", both of which name .Lfault.
              */
  40         subq  $5*8, %rsp                /* frame: five 8-byte save slots */
  41         movq  %rbx, 0*8(%rsp)
  42         movq  %r12, 1*8(%rsp)
  43         movq  %r14, 2*8(%rsp)
  44         movq  %r13, 3*8(%rsp)
  45         movq  %r15, 4*8(%rsp)
  46
  47         movl  $-1, %eax                 /* initial sum; 32-bit write zeroes the upper half of rax */
  48         xorl  %r9d, %r9d                /* r9 = 0 for the carry folds */
  49         movl  %edx, %ecx                /* rcx = len, zero-extended */
  50         cmpl  $8, %ecx
  51         jb    .Lshort                   /* fewer than 8 bytes: word/byte tail only */
  52
  53         testb  $7, %sil
  54         jne   .Lunaligned               /* destination is not 8-byte aligned */
  55 .Laligned:
  56         movl  %ecx, %r12d               /* low 32 bits only: drops the bit-63 marker from .Lodd */
  57
  58         shrq  $6, %r12                  /* r12 = number of whole 64-byte blocks */
  59         jz      .Lhandle_tail       /* < 64 */
  60
  61         clc
  62
  63         /* main loop. copy and sum in 64 byte blocks */
  64         /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
  65         /* r11: temp3, rdx: temp4, r12 loopcnt */
  66         /* r10: temp5, r15: temp6, r14 temp7, r13 temp8 */
  67         .p2align 4
  68 .Lloop:
  69         source
  70         movq  (%rdi), %rbx
  71         source
  72         movq  8(%rdi), %r8
  73         source
  74         movq  16(%rdi), %r11
  75         source
  76         movq  24(%rdi), %rdx
  77
  78         source
  79         movq  32(%rdi), %r10
  80         source
  81         movq  40(%rdi), %r15
  82         source
  83         movq  48(%rdi), %r14
  84         source
  85         movq  56(%rdi), %r13
  86
  87 30:
  88         /*
  89          * No _ASM_EXTABLE_UA; this is used for intentional prefetch on a
  90          * potentially unmapped kernel address.
  91          */
  92         _ASM_EXTABLE(30b, 2f)
  93         prefetcht0 5*64(%rdi)
  94 2:
  95         adcq  %rbx, %rax                /* CF chains through all eight adds */
  96         adcq  %r8, %rax
  97         adcq  %r11, %rax
  98         adcq  %rdx, %rax
  99         adcq  %r10, %rax
 100         adcq  %r15, %rax
 101         adcq  %r14, %rax
 102         adcq  %r13, %rax
 103
 104         decl %r12d                      /* sets ZF for the jnz below; leaves CF alone */
 105
 106         dest
 107         movq %rbx, (%rsi)
 108         dest
 109         movq %r8, 8(%rsi)
 110         dest
 111         movq %r11, 16(%rsi)
 112         dest
 113         movq %rdx, 24(%rsi)
 114
 115         dest
 116         movq %r10, 32(%rsi)
 117         dest
 118         movq %r15, 40(%rsi)
 119         dest
 120         movq %r14, 48(%rsi)
 121         dest
 122         movq %r13, 56(%rsi)
 123
 124         leaq 64(%rdi), %rdi             /* lea, not add: flags must survive */
 125         leaq 64(%rsi), %rsi
 126
 127         jnz     .Lloop                  /* tests ZF from the decl above */
 128
 129         adcq  %r9, %rax                 /* add in the carry left by the last block */
 130
 131         /* do last up to 56 bytes */
 132 .Lhandle_tail:
 133         /* ecx: count, rcx.63: the end result needs to be rol8 */
 134         movq %rcx, %r10                 /* keep the full count, including bit 63, for later stages */
 135         andl $63, %ecx
 136         shrl $3, %ecx                   /* ecx = whole quadwords left (0..7) */
 137         jz      .Lfold
 138         clc
 139         .p2align 4
 140 .Lloop_8:
 141         source
 142         movq (%rdi), %rbx
 143         adcq %rbx, %rax
 144         decl %ecx
 145         dest
 146         movq %rbx, (%rsi)
 147         leaq 8(%rsi), %rsi /* preserve carry */
 148         leaq 8(%rdi), %rdi
 149         jnz     .Lloop_8
 150         adcq %r9, %rax  /* add in carry */
 151
 152 .Lfold:
 153         /* reduce checksum to 32bits */
 154         movl %eax, %ebx                 /* ebx = low half */
 155         shrq $32, %rax                  /* eax = high half */
 156         addl %ebx, %eax
 157         adcl %r9d, %eax                 /* add in the carry of the fold */
 158
 159         /* do last up to 6 bytes */
 160 .Lhandle_7:
 161         movl %r10d, %ecx
 162         andl $7, %ecx
 163 .L1:                            /* .Lshort rejoins the common path here */
 164         shrl $1, %ecx                   /* ecx = whole 16-bit words left (0..3) */
 165         jz   .Lhandle_1
 167         xorl %ebx, %ebx                 /* movw below writes only bx; keep the upper half zero */
 168         clc
 169         .p2align 4
 170 .Lloop_1:
 171         source
 172         movw (%rdi), %bx
 173         adcl %ebx, %eax
 174         decl %ecx
 175         dest
 176         movw %bx, (%rsi)
 177         leaq 2(%rdi), %rdi
 178         leaq 2(%rsi), %rsi
 179         jnz .Lloop_1
 180         adcl %r9d, %eax /* add in carry */
 181
 182         /* handle last odd byte */
 183 .Lhandle_1:
 184         testb $1, %r10b                 /* bit 0 of the saved count: one byte left over? */
 185         jz    .Lende
 186         xorl  %ebx, %ebx
 187         source
 188         movb (%rdi), %bl
 189         dest
 190         movb %bl, (%rsi)
 191         addl %ebx, %eax
 192         adcl %r9d, %eax         /* carry */
 193
 194 .Lende:
 195         testq %r10, %r10
 196         js  .Lwas_odd                   /* bit 63 set by .Lodd: result needs the rol8 */
 197 .Lout:
 198         movq 0*8(%rsp), %rbx            /* restore in the same slot order as the prologue */
 199         movq 1*8(%rsp), %r12
 200         movq 2*8(%rsp), %r14
 201         movq 3*8(%rsp), %r13
 202         movq 4*8(%rsp), %r15
 203         addq $5*8, %rsp
 204         ret
 205 .Lshort:
 206         movl %ecx, %r10d                /* r10 = len (< 8), bit 63 clear */
 207         jmp  .L1
 208 .Lunaligned:
             /*
              * Consume 1, 2 and/or 4 leading bytes until the destination is
              * 8-byte aligned.  The plain addq below cannot carry out of 64
              * bits: rax still holds the 32-bit seed plus at most one
              * shifted byte, one word and one dword.
              */
 209         xorl %ebx, %ebx
 210         testb $1, %sil
 211         jne  .Lodd
 212 1:      testb $2, %sil
 213         je   2f
 214         source
 215         movw (%rdi), %bx
 216         dest
 217         movw %bx, (%rsi)
 218         leaq 2(%rdi), %rdi
 219         subq $2, %rcx
 220         leaq 2(%rsi), %rsi
 221         addq %rbx, %rax
 222 2:      testb $4, %sil
 223         je .Laligned
 224         source
 225         movl (%rdi), %ebx
 226         dest
 227         movl %ebx, (%rsi)
 228         leaq 4(%rdi), %rdi
 229         subq $4, %rcx
 230         leaq 4(%rsi), %rsi
 231         addq %rbx, %rax
 232         jmp .Laligned
 233
 234 .Lodd:
 235         source
 236         movb (%rdi), %bl
 237         dest
 238         movb %bl, (%rsi)
 239         leaq 1(%rdi), %rdi
 240         leaq 1(%rsi), %rsi
 241         /* decrement, set MSB */
 242         leaq -1(%rcx, %rcx), %rcx       /* rcx = 2*len - 1, so bit 0 is 1 */
 243         rorq $1, %rcx                   /* rcx = len - 1 with bit 63 set */
 244         shll $8, %ebx                   /* byte is added shifted up by 8; .Lwas_odd rotates the final sum */
 245         addq %rbx, %rax
 246         jmp 1b
 247
 248 .Lwas_odd:
 249         roll $8, %eax
 250         jmp .Lout
 251
 252         /* Exception: just return 0 */
 253 .Lfault:
 254         xorl %eax, %eax
 255         jmp  .Lout
 256 SYM_FUNC_END(csum_partial_copy_generic)