leaq K256+0*32(%rip), INP ## reuse INP as scratch reg
vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED _XFER + 0*32
+ FOUR_ROUNDS_AND_SCHED (_XFER + 0*32)
leaq K256+1*32(%rip), INP
vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED _XFER + 1*32
+ FOUR_ROUNDS_AND_SCHED (_XFER + 1*32)
leaq K256+2*32(%rip), INP
vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED _XFER + 2*32
+ FOUR_ROUNDS_AND_SCHED (_XFER + 2*32)
leaq K256+3*32(%rip), INP
vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED _XFER + 3*32
+ FOUR_ROUNDS_AND_SCHED (_XFER + 3*32)
add $4*32, SRND
cmp $3*4*32, SRND
leaq K256+0*32(%rip), INP
vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
- DO_4ROUNDS _XFER + 0*32
+ DO_4ROUNDS (_XFER + 0*32)
leaq K256+1*32(%rip), INP
vpaddd (INP, SRND), X1, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
- DO_4ROUNDS _XFER + 1*32
+ DO_4ROUNDS (_XFER + 1*32)
add $2*32, SRND
vmovdqa X2, X0
xor SRND, SRND
.align 16
.Lloop3:
- DO_4ROUNDS _XFER + 0*32 + 16
- DO_4ROUNDS _XFER + 1*32 + 16
+ DO_4ROUNDS (_XFER + 0*32 + 16)
+ DO_4ROUNDS (_XFER + 1*32 + 16)
add $2*32, SRND
cmp $4*4*32, SRND
jb .Lloop3