KVM: x86/svm: Clear reserved bits written to PerfEvtSeln MSRs
[linux-2.6-microblaze.git] / arch / s390 / crypto / chacha-s390.S
1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * Original implementation written by Andy Polyakov, @dot-asm.
4  * This is an adaptation of the original code for kernel use.
5  *
6  * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
7  */
8
9 #include <linux/linkage.h>
10 #include <asm/nospec-insn.h>
11 #include <asm/vx-insn.h>
12
13 #define SP      %r15                    /* stack pointer */
14 #define FRAME   (16 * 8 + 4 * 8)        /* 160: bytes chacha20_vx subtracts from SP for its frame */
15
16 .data
17 .align  32
18 # Constant table. The hex offsets below are the displacements the code uses relative to .Lsigma.
19 .Lsigma:
20 .long   0x61707865,0x3320646e,0x79622d32,0x6b206574     # 0x00: sigma, endian-neutral
21 .long   1,0,0,0                                         # 0x10: adds 1 to word 0 (block counter)
22 .long   2,0,0,0                                         # 0x20: adds 2 to word 0
23 .long   3,0,0,0                                         # 0x30: adds 3 to word 0
24 .long   0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c     # 0x40: byte swap (VPERM control, loaded as BEPERM)
25
26 .long   0,1,2,3                                         # 0x50: per-lane counter offsets (CTR in the 4x routine)
27 .long   0x61707865,0x61707865,0x61707865,0x61707865     # 0x60: smashed sigma, word 0 in every lane
28 .long   0x3320646e,0x3320646e,0x3320646e,0x3320646e     # 0x70: sigma word 1 in every lane
29 .long   0x79622d32,0x79622d32,0x79622d32,0x79622d32     # 0x80: sigma word 2 in every lane
30 .long   0x6b206574,0x6b206574,0x6b206574,0x6b206574     # 0x90: sigma word 3 in every lane
31
32 .previous
33
34         GEN_BR_THUNK %r14               # presumably emits the thunk that BR_EX %r14 goes through - confirm in asm/nospec-insn.h
35
36 .text
37
38 #############################################################################
39 # void chacha20_vx_4x(u8 *out, const u8 *inp, size_t len,
40 #                     const u32 *key, const u32 *counter)
41
42 #define OUT             %r2             /* output pointer, advanced by 0x40 per block */
43 #define INP             %r3             /* input pointer, advanced by 0x40 per block */
44 #define LEN             %r4             /* bytes still to process */
45 #define KEY             %r5             /* key pointer; reused as scratch in the tail loop */
46 #define COUNTER         %r6             /* counter pointer; reused as scratch in the tail loop */
47
48 #define BEPERM          %v31            /* VPERM control loaded from .Lsigma+0x40 (byte swap) */
49 #define CTR             %v26            /* lane offsets 0,1,2,3 loaded from .Lsigma+0x50 */
50
51 #define K0              %v16            /* sigma row */
52 #define K1              %v17            /* 16 bytes at KEY+0 */
53 #define K2              %v18            /* 16 bytes at KEY+16 */
54 #define K3              %v19            /* 16 bytes at COUNTER */
55
56 #define XA0             %v0             /* XA0-XA3: sigma words 0-3, each replicated across the four lanes */
57 #define XA1             %v1
58 #define XA2             %v2
59 #define XA3             %v3
60
61 #define XB0             %v4             /* XB0-XB3: words 0-3 of K1, each replicated across the four lanes */
62 #define XB1             %v5
63 #define XB2             %v6
64 #define XB3             %v7
65
66 #define XC0             %v8             /* XC0-XC3: words 0-3 of K2, each replicated across the four lanes */
67 #define XC1             %v9
68 #define XC2             %v10
69 #define XC3             %v11
70
71 #define XD0             %v12            /* XD0-XD3: words 0-3 of K3 replicated; XD0 also gets CTR added */
72 #define XD1             %v13
73 #define XD2             %v14
74 #define XD3             %v15
75
76 #define XT0             %v27            /* XT0-XT3: transpose scratch, later the 64-byte input block */
77 #define XT1             %v28
78 #define XT2             %v29
79 #define XT3             %v30
80
81 ENTRY(chacha20_vx_4x)   # XORs up to four keystream blocks into the data, one block per 32-bit lane
82         stmg    %r6,%r7,6*8(SP)         # save r6/r7; reloaded by lmg before each return
83
84         larl    %r7,.Lsigma             # r7 = base of the constant table
85         lhi     %r0,10                  # trip count of .Loop_4x (one column + one diagonal pass per trip)
86         lhi     %r1,0
87
88         VL      K0,0,,%r7               # load sigma
89         VL      K1,0,,KEY               # load key
90         VL      K2,16,,KEY
91         VL      K3,0,,COUNTER           # load counter
92
93         VL      BEPERM,0x40,,%r7        # byte swap pattern
94         VL      CTR,0x50,,%r7           # lane offsets 0,1,2,3
95
96         VLM     XA0,XA3,0x60,%r7,4      # load [smashed] sigma
97
98         VREPF   XB0,K1,0                # smash the key
99         VREPF   XB1,K1,1
100         VREPF   XB2,K1,2
101         VREPF   XB3,K1,3
102
103         VREPF   XD0,K3,0
104         VREPF   XD1,K3,1
105         VREPF   XD2,K3,2
106         VREPF   XD3,K3,3
107         VAF     XD0,XD0,CTR             # lane i of XD0 = counter word 0 + i
108
109         VREPF   XC0,K2,0
110         VREPF   XC1,K2,1
111         VREPF   XC2,K2,2
112         VREPF   XC3,K2,3
113
114 .Loop_4x:                               # column pass: quarter-round on (XAn,XBn,XCn,XDn), n = 0..3
115         VAF     XA0,XA0,XB0             # a += b
116         VX      XD0,XD0,XA0             # d ^= a
117         VERLLF  XD0,XD0,16              # d <<<= 16
118
119         VAF     XA1,XA1,XB1
120         VX      XD1,XD1,XA1
121         VERLLF  XD1,XD1,16
122
123         VAF     XA2,XA2,XB2
124         VX      XD2,XD2,XA2
125         VERLLF  XD2,XD2,16
126
127         VAF     XA3,XA3,XB3
128         VX      XD3,XD3,XA3
129         VERLLF  XD3,XD3,16
130
131         VAF     XC0,XC0,XD0             # c += d
132         VX      XB0,XB0,XC0             # b ^= c
133         VERLLF  XB0,XB0,12              # b <<<= 12
134
135         VAF     XC1,XC1,XD1
136         VX      XB1,XB1,XC1
137         VERLLF  XB1,XB1,12
138
139         VAF     XC2,XC2,XD2
140         VX      XB2,XB2,XC2
141         VERLLF  XB2,XB2,12
142
143         VAF     XC3,XC3,XD3
144         VX      XB3,XB3,XC3
145         VERLLF  XB3,XB3,12
146
147         VAF     XA0,XA0,XB0             # a += b
148         VX      XD0,XD0,XA0             # d ^= a
149         VERLLF  XD0,XD0,8               # d <<<= 8
150
151         VAF     XA1,XA1,XB1
152         VX      XD1,XD1,XA1
153         VERLLF  XD1,XD1,8
154
155         VAF     XA2,XA2,XB2
156         VX      XD2,XD2,XA2
157         VERLLF  XD2,XD2,8
158
159         VAF     XA3,XA3,XB3
160         VX      XD3,XD3,XA3
161         VERLLF  XD3,XD3,8
162
163         VAF     XC0,XC0,XD0             # c += d
164         VX      XB0,XB0,XC0             # b ^= c
165         VERLLF  XB0,XB0,7               # b <<<= 7
166
167         VAF     XC1,XC1,XD1
168         VX      XB1,XB1,XC1
169         VERLLF  XB1,XB1,7
170
171         VAF     XC2,XC2,XD2
172         VX      XB2,XB2,XC2
173         VERLLF  XB2,XB2,7
174
175         VAF     XC3,XC3,XD3
176         VX      XB3,XB3,XC3
177         VERLLF  XB3,XB3,7
178 # Diagonal pass: same steps on (XA0,XB1,XC2,XD3), (XA1,XB2,XC3,XD0), (XA2,XB3,XC0,XD1), (XA3,XB0,XC1,XD2).
179         VAF     XA0,XA0,XB1
180         VX      XD3,XD3,XA0
181         VERLLF  XD3,XD3,16
182
183         VAF     XA1,XA1,XB2
184         VX      XD0,XD0,XA1
185         VERLLF  XD0,XD0,16
186
187         VAF     XA2,XA2,XB3
188         VX      XD1,XD1,XA2
189         VERLLF  XD1,XD1,16
190
191         VAF     XA3,XA3,XB0
192         VX      XD2,XD2,XA3
193         VERLLF  XD2,XD2,16
194
195         VAF     XC2,XC2,XD3
196         VX      XB1,XB1,XC2
197         VERLLF  XB1,XB1,12
198
199         VAF     XC3,XC3,XD0
200         VX      XB2,XB2,XC3
201         VERLLF  XB2,XB2,12
202
203         VAF     XC0,XC0,XD1
204         VX      XB3,XB3,XC0
205         VERLLF  XB3,XB3,12
206
207         VAF     XC1,XC1,XD2
208         VX      XB0,XB0,XC1
209         VERLLF  XB0,XB0,12
210
211         VAF     XA0,XA0,XB1
212         VX      XD3,XD3,XA0
213         VERLLF  XD3,XD3,8
214
215         VAF     XA1,XA1,XB2
216         VX      XD0,XD0,XA1
217         VERLLF  XD0,XD0,8
218
219         VAF     XA2,XA2,XB3
220         VX      XD1,XD1,XA2
221         VERLLF  XD1,XD1,8
222
223         VAF     XA3,XA3,XB0
224         VX      XD2,XD2,XA3
225         VERLLF  XD2,XD2,8
226
227         VAF     XC2,XC2,XD3
228         VX      XB1,XB1,XC2
229         VERLLF  XB1,XB1,7
230
231         VAF     XC3,XC3,XD0
232         VX      XB2,XB2,XC3
233         VERLLF  XB2,XB2,7
234
235         VAF     XC0,XC0,XD1
236         VX      XB3,XB3,XC0
237         VERLLF  XB3,XB3,7
238
239         VAF     XC1,XC1,XD2
240         VX      XB0,XB0,XC1
241         VERLLF  XB0,XB0,7
242         brct    %r0,.Loop_4x            # repeat until the count loaded into r0 runs out
243
244         VAF     XD0,XD0,CTR             # add the lane offsets again; K3 added below carries only the base counter
245
246         VMRHF   XT0,XA0,XA1             # transpose data
247         VMRHF   XT1,XA2,XA3
248         VMRLF   XT2,XA0,XA1
249         VMRLF   XT3,XA2,XA3
250         VPDI    XA0,XT0,XT1,0b0000      # XA0 = lane 0 of the four A registers (row A of block 0)
251         VPDI    XA1,XT0,XT1,0b0101      # XA1 = lane 1 (block 1)
252         VPDI    XA2,XT2,XT3,0b0000      # XA2 = lane 2 (block 2)
253         VPDI    XA3,XT2,XT3,0b0101      # XA3 = lane 3 (block 3)
254 # Same transposition for rows B, C and D.
255         VMRHF   XT0,XB0,XB1
256         VMRHF   XT1,XB2,XB3
257         VMRLF   XT2,XB0,XB1
258         VMRLF   XT3,XB2,XB3
259         VPDI    XB0,XT0,XT1,0b0000
260         VPDI    XB1,XT0,XT1,0b0101
261         VPDI    XB2,XT2,XT3,0b0000
262         VPDI    XB3,XT2,XT3,0b0101
263
264         VMRHF   XT0,XC0,XC1
265         VMRHF   XT1,XC2,XC3
266         VMRLF   XT2,XC0,XC1
267         VMRLF   XT3,XC2,XC3
268         VPDI    XC0,XT0,XT1,0b0000
269         VPDI    XC1,XT0,XT1,0b0101
270         VPDI    XC2,XT2,XT3,0b0000
271         VPDI    XC3,XT2,XT3,0b0101
272
273         VMRHF   XT0,XD0,XD1
274         VMRHF   XT1,XD2,XD3
275         VMRLF   XT2,XD0,XD1
276         VMRLF   XT3,XD2,XD3
277         VPDI    XD0,XT0,XT1,0b0000
278         VPDI    XD1,XT0,XT1,0b0101
279         VPDI    XD2,XT2,XT3,0b0000
280         VPDI    XD3,XT2,XT3,0b0101
281 # Block 0: add the input rows K0..K3 to the permuted state, then byte swap each word.
282         VAF     XA0,XA0,K0
283         VAF     XB0,XB0,K1
284         VAF     XC0,XC0,K2
285         VAF     XD0,XD0,K3
286
287         VPERM   XA0,XA0,XA0,BEPERM
288         VPERM   XB0,XB0,XB0,BEPERM
289         VPERM   XC0,XC0,XC0,BEPERM
290         VPERM   XD0,XD0,XD0,BEPERM
291 # NOTE(review): block 0 is loaded and stored without any length check - presumably callers pass len > 0x40; confirm.
292         VLM     XT0,XT3,0,INP,0         # 64 bytes of input
293
294         VX      XT0,XT0,XA0             # input ^ keystream
295         VX      XT1,XT1,XB0
296         VX      XT2,XT2,XC0
297         VX      XT3,XT3,XD0
298
299         VSTM    XT0,XT3,0,OUT,0         # 64 bytes of output
300
301         la      INP,0x40(INP)
302         la      OUT,0x40(OUT)
303         aghi    LEN,-0x40               # NOTE(review): no je .Ldone_4x follows here, unlike the later blocks
304 # Block 1: keystream rows go into XA0..XD0 so the tail path can pick them up from there.
305         VAF     XA0,XA1,K0
306         VAF     XB0,XB1,K1
307         VAF     XC0,XC1,K2
308         VAF     XD0,XD1,K3
309
310         VPERM   XA0,XA0,XA0,BEPERM
311         VPERM   XB0,XB0,XB0,BEPERM
312         VPERM   XC0,XC0,XC0,BEPERM
313         VPERM   XD0,XD0,XD0,BEPERM
314
315         .insn   rilu,0xc20e00000000,LEN,0x40    # clgfi LEN,0x40
316         jl      .Ltail_4x               # fewer than 0x40 bytes left: byte-wise tail
317
318         VLM     XT0,XT3,0,INP,0
319
320         VX      XT0,XT0,XA0
321         VX      XT1,XT1,XB0
322         VX      XT2,XT2,XC0
323         VX      XT3,XT3,XD0
324
325         VSTM    XT0,XT3,0,OUT,0
326
327         la      INP,0x40(INP)
328         la      OUT,0x40(OUT)
329         aghi    LEN,-0x40
330         je      .Ldone_4x               # nothing left
331 # Block 2.
332         VAF     XA0,XA2,K0
333         VAF     XB0,XB2,K1
334         VAF     XC0,XC2,K2
335         VAF     XD0,XD2,K3
336
337         VPERM   XA0,XA0,XA0,BEPERM
338         VPERM   XB0,XB0,XB0,BEPERM
339         VPERM   XC0,XC0,XC0,BEPERM
340         VPERM   XD0,XD0,XD0,BEPERM
341
342         .insn   rilu,0xc20e00000000,LEN,0x40    # clgfi LEN,0x40
343         jl      .Ltail_4x
344
345         VLM     XT0,XT3,0,INP,0
346
347         VX      XT0,XT0,XA0
348         VX      XT1,XT1,XB0
349         VX      XT2,XT2,XC0
350         VX      XT3,XT3,XD0
351
352         VSTM    XT0,XT3,0,OUT,0
353
354         la      INP,0x40(INP)
355         la      OUT,0x40(OUT)
356         aghi    LEN,-0x40
357         je      .Ldone_4x
358 # Block 3 (last one this routine produces).
359         VAF     XA0,XA3,K0
360         VAF     XB0,XB3,K1
361         VAF     XC0,XC3,K2
362         VAF     XD0,XD3,K3
363
364         VPERM   XA0,XA0,XA0,BEPERM
365         VPERM   XB0,XB0,XB0,BEPERM
366         VPERM   XC0,XC0,XC0,BEPERM
367         VPERM   XD0,XD0,XD0,BEPERM
368
369         .insn   rilu,0xc20e00000000,LEN,0x40    # clgfi LEN,0x40
370         jl      .Ltail_4x
371
372         VLM     XT0,XT3,0,INP,0
373
374         VX      XT0,XT0,XA0
375         VX      XT1,XT1,XB0
376         VX      XT2,XT2,XC0
377         VX      XT3,XT3,XD0
378
379         VSTM    XT0,XT3,0,OUT,0
380
381 .Ldone_4x:
382         lmg     %r6,%r7,6*8(SP)         # restore r6/r7 saved at entry
383         BR_EX   %r14                    # return
384
385 .Ltail_4x:                              # partial block: keystream rows are in XA0, XB0, XC0, XD0
386         VLR     XT0,XC0
387         VLR     XT1,XD0
388 # Spill the 64 keystream bytes to 8*8(SP). No frame is allocated here - presumably this is caller-provided save area; confirm.
389         VST     XA0,8*8+0x00,,SP
390         VST     XB0,8*8+0x10,,SP
391         VST     XT0,8*8+0x20,,SP
392         VST     XT1,8*8+0x30,,SP
393
394         lghi    %r1,0                   # r1 = byte index
395 # NOTE(review): brct decrements LEN before testing it, so this loop expects LEN >= 1 on entry.
396 .Loop_tail_4x:
397         llgc    %r5,0(%r1,INP)          # input byte (KEY register reused as scratch)
398         llgc    %r6,8*8(%r1,SP)         # keystream byte (COUNTER register reused as scratch)
399         xr      %r6,%r5
400         stc     %r6,0(%r1,OUT)
401         la      %r1,1(%r1)
402         brct    LEN,.Loop_tail_4x
403
404         lmg     %r6,%r7,6*8(SP)
405         BR_EX   %r14
406 ENDPROC(chacha20_vx_4x)
407
408 #undef  OUT                            /* these names are defined again below for chacha20_vx */
409 #undef  INP
410 #undef  LEN
411 #undef  KEY
412 #undef  COUNTER
413
414 #undef BEPERM
415
416 #undef K0                              /* K0..K3 get different vector registers in chacha20_vx */
417 #undef K1
418 #undef K2
419 #undef K3
420
421
422 #############################################################################
423 # void chacha20_vx(u8 *out, const u8 *inp, size_t len,
424 #                  const u32 *key, const u32 *counter)
425
426 #define OUT             %r2             /* output pointer, advanced by 0x40 per block */
427 #define INP             %r3             /* input pointer, advanced by 0x40 per block */
428 #define LEN             %r4             /* bytes still to process */
429 #define KEY             %r5             /* key pointer; reused as scratch in the tail loop */
430 #define COUNTER         %r6             /* counter pointer; reused as scratch in the tail loop */
431
432 #define BEPERM          %v31            /* VPERM control (byte swap), last register of the table load */
433
434 #define K0              %v27            /* sigma row; same register as T0 */
435 #define K1              %v24            /* 16 bytes at KEY+0 */
436 #define K2              %v25            /* 16 bytes at KEY+16 */
437 #define K3              %v26            /* 16 bytes at COUNTER; word 0 is stepped by 6 per full outer iteration */
438
439 #define A0              %v0             /* An/Bn/Cn/Dn: the four state rows of block n (n = 0..5) */
440 #define B0              %v1
441 #define C0              %v2
442 #define D0              %v3
443
444 #define A1              %v4             /* A1..D1 also serve as the input buffer for blocks 1..5 */
445 #define B1              %v5
446 #define C1              %v6
447 #define D1              %v7
448
449 #define A2              %v8
450 #define B2              %v9
451 #define C2              %v10
452 #define D2              %v11
453
454 #define A3              %v12
455 #define B3              %v13
456 #define C3              %v14
457 #define D3              %v15
458
459 #define A4              %v16
460 #define B4              %v17
461 #define C4              %v18
462 #define D4              %v19
463
464 #define A5              %v20
465 #define B5              %v21
466 #define C5              %v22
467 #define D5              %v23
468
469 #define T0              %v27            /* same register as K0; T0..T3 take the input for block 0 */
470 #define T1              %v28            /* T1..T3: increments 1,2,3 from the table, or saved counter rows */
471 #define T2              %v29
472 #define T3              %v30
473
474 ENTRY(chacha20_vx)      # six blocks per outer iteration, one state row per vector register
475         .insn   rilu,0xc20e00000000,LEN,256     # clgfi LEN,256
476         jle     chacha20_vx_4x          # len <= 256: branch (not call) to the 4x routine
477         stmg    %r6,%r7,6*8(SP)         # save r6/r7 before SP is moved
478 # Allocate FRAME bytes of stack and store the previous SP at offset 0.
479         lghi    %r1,-FRAME
480         lgr     %r0,SP
481         la      SP,0(%r1,SP)
482         stg     %r0,0(SP)               # back-chain
483
484         larl    %r7,.Lsigma             # r7 = base of the constant table
485         lhi     %r0,10                  # trip count of .Loop_vx
486
487         VLM     K1,K2,0,KEY,0           # load key
488         VL      K3,0,,COUNTER           # load counter
489
490         VLM     K0,BEPERM,0,%r7,4       # load sigma, increments, ...
491
492 .Loop_outer_vx:                         # build six copies of the state; only the D rows differ
493         VLR     A0,K0
494         VLR     B0,K1
495         VLR     A1,K0
496         VLR     B1,K1
497         VLR     A2,K0
498         VLR     B2,K1
499         VLR     A3,K0
500         VLR     B3,K1
501         VLR     A4,K0
502         VLR     B4,K1
503         VLR     A5,K0
504         VLR     B5,K1
505
506         VLR     D0,K3
507         VAF     D1,K3,T1                # K[3]+1
508         VAF     D2,K3,T2                # K[3]+2
509         VAF     D3,K3,T3                # K[3]+3
510         VAF     D4,D2,T2                # K[3]+4
511         VAF     D5,D2,T3                # K[3]+5
512
513         VLR     C0,K2
514         VLR     C1,K2
515         VLR     C2,K2
516         VLR     C3,K2
517         VLR     C4,K2
518         VLR     C5,K2
519 # Keep K[3]+1..K[3]+3 for the additions after the rounds; this overwrites the increments held in T1..T3.
520         VLR     T1,D1
521         VLR     T2,D2
522         VLR     T3,D3
523
524 .Loop_vx:                               # column pass on all six blocks
525         VAF     A0,A0,B0                # a += b
526         VAF     A1,A1,B1
527         VAF     A2,A2,B2
528         VAF     A3,A3,B3
529         VAF     A4,A4,B4
530         VAF     A5,A5,B5
531         VX      D0,D0,A0                # d ^= a
532         VX      D1,D1,A1
533         VX      D2,D2,A2
534         VX      D3,D3,A3
535         VX      D4,D4,A4
536         VX      D5,D5,A5
537         VERLLF  D0,D0,16                # d <<<= 16
538         VERLLF  D1,D1,16
539         VERLLF  D2,D2,16
540         VERLLF  D3,D3,16
541         VERLLF  D4,D4,16
542         VERLLF  D5,D5,16
543
544         VAF     C0,C0,D0                # c += d
545         VAF     C1,C1,D1
546         VAF     C2,C2,D2
547         VAF     C3,C3,D3
548         VAF     C4,C4,D4
549         VAF     C5,C5,D5
550         VX      B0,B0,C0                # b ^= c
551         VX      B1,B1,C1
552         VX      B2,B2,C2
553         VX      B3,B3,C3
554         VX      B4,B4,C4
555         VX      B5,B5,C5
556         VERLLF  B0,B0,12                # b <<<= 12
557         VERLLF  B1,B1,12
558         VERLLF  B2,B2,12
559         VERLLF  B3,B3,12
560         VERLLF  B4,B4,12
561         VERLLF  B5,B5,12
562
563         VAF     A0,A0,B0                # a += b
564         VAF     A1,A1,B1
565         VAF     A2,A2,B2
566         VAF     A3,A3,B3
567         VAF     A4,A4,B4
568         VAF     A5,A5,B5
569         VX      D0,D0,A0                # d ^= a
570         VX      D1,D1,A1
571         VX      D2,D2,A2
572         VX      D3,D3,A3
573         VX      D4,D4,A4
574         VX      D5,D5,A5
575         VERLLF  D0,D0,8                 # d <<<= 8
576         VERLLF  D1,D1,8
577         VERLLF  D2,D2,8
578         VERLLF  D3,D3,8
579         VERLLF  D4,D4,8
580         VERLLF  D5,D5,8
581
582         VAF     C0,C0,D0                # c += d
583         VAF     C1,C1,D1
584         VAF     C2,C2,D2
585         VAF     C3,C3,D3
586         VAF     C4,C4,D4
587         VAF     C5,C5,D5
588         VX      B0,B0,C0                # b ^= c
589         VX      B1,B1,C1
590         VX      B2,B2,C2
591         VX      B3,B3,C3
592         VX      B4,B4,C4
593         VX      B5,B5,C5
594         VERLLF  B0,B0,7                 # b <<<= 7
595         VERLLF  B1,B1,7
596         VERLLF  B2,B2,7
597         VERLLF  B3,B3,7
598         VERLLF  B4,B4,7
599         VERLLF  B5,B5,7
600 # Rotate rows C, B, D left by 8, 4 and 12 bytes so the same lane-wise steps now act on the diagonals.
601         VSLDB   C0,C0,C0,8
602         VSLDB   C1,C1,C1,8
603         VSLDB   C2,C2,C2,8
604         VSLDB   C3,C3,C3,8
605         VSLDB   C4,C4,C4,8
606         VSLDB   C5,C5,C5,8
607         VSLDB   B0,B0,B0,4
608         VSLDB   B1,B1,B1,4
609         VSLDB   B2,B2,B2,4
610         VSLDB   B3,B3,B3,4
611         VSLDB   B4,B4,B4,4
612         VSLDB   B5,B5,B5,4
613         VSLDB   D0,D0,D0,12
614         VSLDB   D1,D1,D1,12
615         VSLDB   D2,D2,D2,12
616         VSLDB   D3,D3,D3,12
617         VSLDB   D4,D4,D4,12
618         VSLDB   D5,D5,D5,12
619 # Diagonal pass: the same instruction sequence as the column pass, on the rotated rows.
620         VAF     A0,A0,B0
621         VAF     A1,A1,B1
622         VAF     A2,A2,B2
623         VAF     A3,A3,B3
624         VAF     A4,A4,B4
625         VAF     A5,A5,B5
626         VX      D0,D0,A0
627         VX      D1,D1,A1
628         VX      D2,D2,A2
629         VX      D3,D3,A3
630         VX      D4,D4,A4
631         VX      D5,D5,A5
632         VERLLF  D0,D0,16
633         VERLLF  D1,D1,16
634         VERLLF  D2,D2,16
635         VERLLF  D3,D3,16
636         VERLLF  D4,D4,16
637         VERLLF  D5,D5,16
638
639         VAF     C0,C0,D0
640         VAF     C1,C1,D1
641         VAF     C2,C2,D2
642         VAF     C3,C3,D3
643         VAF     C4,C4,D4
644         VAF     C5,C5,D5
645         VX      B0,B0,C0
646         VX      B1,B1,C1
647         VX      B2,B2,C2
648         VX      B3,B3,C3
649         VX      B4,B4,C4
650         VX      B5,B5,C5
651         VERLLF  B0,B0,12
652         VERLLF  B1,B1,12
653         VERLLF  B2,B2,12
654         VERLLF  B3,B3,12
655         VERLLF  B4,B4,12
656         VERLLF  B5,B5,12
657
658         VAF     A0,A0,B0
659         VAF     A1,A1,B1
660         VAF     A2,A2,B2
661         VAF     A3,A3,B3
662         VAF     A4,A4,B4
663         VAF     A5,A5,B5
664         VX      D0,D0,A0
665         VX      D1,D1,A1
666         VX      D2,D2,A2
667         VX      D3,D3,A3
668         VX      D4,D4,A4
669         VX      D5,D5,A5
670         VERLLF  D0,D0,8
671         VERLLF  D1,D1,8
672         VERLLF  D2,D2,8
673         VERLLF  D3,D3,8
674         VERLLF  D4,D4,8
675         VERLLF  D5,D5,8
676
677         VAF     C0,C0,D0
678         VAF     C1,C1,D1
679         VAF     C2,C2,D2
680         VAF     C3,C3,D3
681         VAF     C4,C4,D4
682         VAF     C5,C5,D5
683         VX      B0,B0,C0
684         VX      B1,B1,C1
685         VX      B2,B2,C2
686         VX      B3,B3,C3
687         VX      B4,B4,C4
688         VX      B5,B5,C5
689         VERLLF  B0,B0,7
690         VERLLF  B1,B1,7
691         VERLLF  B2,B2,7
692         VERLLF  B3,B3,7
693         VERLLF  B4,B4,7
694         VERLLF  B5,B5,7
695 # Undo the row rotation: C by 8 again, B by 12 and D by 4 bytes (each totals 16).
696         VSLDB   C0,C0,C0,8
697         VSLDB   C1,C1,C1,8
698         VSLDB   C2,C2,C2,8
699         VSLDB   C3,C3,C3,8
700         VSLDB   C4,C4,C4,8
701         VSLDB   C5,C5,C5,8
702         VSLDB   B0,B0,B0,12
703         VSLDB   B1,B1,B1,12
704         VSLDB   B2,B2,B2,12
705         VSLDB   B3,B3,B3,12
706         VSLDB   B4,B4,B4,12
707         VSLDB   B5,B5,B5,12
708         VSLDB   D0,D0,D0,4
709         VSLDB   D1,D1,D1,4
710         VSLDB   D2,D2,D2,4
711         VSLDB   D3,D3,D3,4
712         VSLDB   D4,D4,D4,4
713         VSLDB   D5,D5,D5,4
714         brct    %r0,.Loop_vx            # repeat until the count loaded into r0 runs out
715 # Block 0: add the input rows. A1 and D1 are finished here too, before K0 (= T0) and T1 are overwritten by the input load.
716         VAF     A0,A0,K0
717         VAF     B0,B0,K1
718         VAF     C0,C0,K2
719         VAF     D0,D0,K3
720         VAF     A1,A1,K0
721         VAF     D1,D1,T1                # +K[3]+1
722
723         VPERM   A0,A0,A0,BEPERM         # byte swap each word
724         VPERM   B0,B0,B0,BEPERM
725         VPERM   C0,C0,C0,BEPERM
726         VPERM   D0,D0,D0,BEPERM
727
728         .insn   rilu,0xc20e00000000,LEN,0x40    # clgfi LEN,0x40
729         jl      .Ltail_vx               # fewer than 0x40 bytes left: byte-wise tail
730
731         VAF     D2,D2,T2                # +K[3]+2
732         VAF     D3,D3,T3                # +K[3]+3
733         VLM     T0,T3,0,INP,0           # 64 bytes of input; overwrites K0 and the saved counter rows
734
735         VX      A0,A0,T0
736         VX      B0,B0,T1
737         VX      C0,C0,T2
738         VX      D0,D0,T3
739
740         VLM     K0,T3,0,%r7,4           # re-load sigma and increments
741
742         VSTM    A0,D0,0,OUT,0
743
744         la      INP,0x40(INP)
745         la      OUT,0x40(OUT)
746         aghi    LEN,-0x40
747         je      .Ldone_vx               # nothing left
748 # Block 1: its keystream is moved into A0..D0, which is where the tail path reads it.
749         VAF     B1,B1,K1
750         VAF     C1,C1,K2
751
752         VPERM   A0,A1,A1,BEPERM
753         VPERM   B0,B1,B1,BEPERM
754         VPERM   C0,C1,C1,BEPERM
755         VPERM   D0,D1,D1,BEPERM
756
757         .insn   rilu,0xc20e00000000,LEN,0x40    # clgfi LEN,0x40
758         jl      .Ltail_vx
759
760         VLM     A1,D1,0,INP,0           # from here on A1..D1 hold the input block
761
762         VX      A0,A0,A1
763         VX      B0,B0,B1
764         VX      C0,C0,C1
765         VX      D0,D0,D1
766
767         VSTM    A0,D0,0,OUT,0
768
769         la      INP,0x40(INP)
770         la      OUT,0x40(OUT)
771         aghi    LEN,-0x40
772         je      .Ldone_vx
773 # Block 2 (D2 already had K[3]+2 added above).
774         VAF     A2,A2,K0
775         VAF     B2,B2,K1
776         VAF     C2,C2,K2
777
778         VPERM   A0,A2,A2,BEPERM
779         VPERM   B0,B2,B2,BEPERM
780         VPERM   C0,C2,C2,BEPERM
781         VPERM   D0,D2,D2,BEPERM
782
783         .insn   rilu,0xc20e00000000,LEN,0x40    # clgfi LEN,0x40
784         jl      .Ltail_vx
785
786         VLM     A1,D1,0,INP,0
787
788         VX      A0,A0,A1
789         VX      B0,B0,B1
790         VX      C0,C0,C1
791         VX      D0,D0,D1
792
793         VSTM    A0,D0,0,OUT,0
794
795         la      INP,0x40(INP)
796         la      OUT,0x40(OUT)
797         aghi    LEN,-0x40
798         je      .Ldone_vx
799 # Block 3. D2 has been consumed, so it is reused below as scratch for counter arithmetic.
800         VAF     A3,A3,K0
801         VAF     B3,B3,K1
802         VAF     C3,C3,K2
803         VAF     D2,K3,T3                # K[3]+3
804
805         VPERM   A0,A3,A3,BEPERM
806         VPERM   B0,B3,B3,BEPERM
807         VPERM   C0,C3,C3,BEPERM
808         VPERM   D0,D3,D3,BEPERM
809
810         .insn   rilu,0xc20e00000000,LEN,0x40    # clgfi LEN,0x40
811         jl      .Ltail_vx
812
813         VAF     D3,D2,T1                # K[3]+4
814         VLM     A1,D1,0,INP,0
815
816         VX      A0,A0,A1
817         VX      B0,B0,B1
818         VX      C0,C0,C1
819         VX      D0,D0,D1
820
821         VSTM    A0,D0,0,OUT,0
822
823         la      INP,0x40(INP)
824         la      OUT,0x40(OUT)
825         aghi    LEN,-0x40
826         je      .Ldone_vx
827 # Block 4. K3 is also stepped here to the counter row for the next outer iteration.
828         VAF     A4,A4,K0
829         VAF     B4,B4,K1
830         VAF     C4,C4,K2
831         VAF     D4,D4,D3                # +K[3]+4
832         VAF     D3,D3,T1                # K[3]+5
833         VAF     K3,D2,T3                # K[3]+=6
834
835         VPERM   A0,A4,A4,BEPERM
836         VPERM   B0,B4,B4,BEPERM
837         VPERM   C0,C4,C4,BEPERM
838         VPERM   D0,D4,D4,BEPERM
839
840         .insn   rilu,0xc20e00000000,LEN,0x40    # clgfi LEN,0x40
841         jl      .Ltail_vx
842
843         VLM     A1,D1,0,INP,0
844
845         VX      A0,A0,A1
846         VX      B0,B0,B1
847         VX      C0,C0,C1
848         VX      D0,D0,D1
849
850         VSTM    A0,D0,0,OUT,0
851
852         la      INP,0x40(INP)
853         la      OUT,0x40(OUT)
854         aghi    LEN,-0x40
855         je      .Ldone_vx
856 # Block 5.
857         VAF     A5,A5,K0
858         VAF     B5,B5,K1
859         VAF     C5,C5,K2
860         VAF     D5,D5,D3                # +K[3]+5
861
862         VPERM   A0,A5,A5,BEPERM
863         VPERM   B0,B5,B5,BEPERM
864         VPERM   C0,C5,C5,BEPERM
865         VPERM   D0,D5,D5,BEPERM
866
867         .insn   rilu,0xc20e00000000,LEN,0x40    # clgfi LEN,0x40
868         jl      .Ltail_vx
869
870         VLM     A1,D1,0,INP,0
871
872         VX      A0,A0,A1
873         VX      B0,B0,B1
874         VX      C0,C0,C1
875         VX      D0,D0,D1
876
877         VSTM    A0,D0,0,OUT,0
878
879         la      INP,0x40(INP)
880         la      OUT,0x40(OUT)
881         lhi     %r0,10                  # reload the .Loop_vx trip count for the next outer iteration
882         aghi    LEN,-0x40
883         jne     .Loop_outer_vx          # more data: produce another six blocks
884
885 .Ldone_vx:
886         lmg     %r6,%r7,FRAME+6*8(SP)   # restore r6/r7 from the slots written at entry
887         la      SP,FRAME(SP)            # release the frame
888         BR_EX   %r14                    # return
889
890 .Ltail_vx:                              # partial block: keystream rows are in A0..D0
891         VSTM    A0,D0,8*8,SP,3          # spill the 64 keystream bytes to 8*8(SP), inside the frame allocated at entry
892         lghi    %r1,0                   # r1 = byte index
893 # brct decrements LEN before testing it, so this loop expects LEN >= 1 on entry.
894 .Loop_tail_vx:
895         llgc    %r5,0(%r1,INP)          # input byte (KEY register reused as scratch)
896         llgc    %r6,8*8(%r1,SP)         # keystream byte (COUNTER register reused as scratch)
897         xr      %r6,%r5
898         stc     %r6,0(%r1,OUT)
899         la      %r1,1(%r1)
900         brct    LEN,.Loop_tail_vx
901
902         lmg     %r6,%r7,FRAME+6*8(SP)
903         la      SP,FRAME(SP)
904         BR_EX   %r14
905 ENDPROC(chacha20_vx)
906
907 .previous