// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright (C) 2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation
 */

#include <crypto/curve25519.h>
#include <crypto/internal/kpp.h>

#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/scatterlist.h>

#include <asm/cpufeature.h>
#include <asm/processor.h>

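/* Constant-time comparison helper: returns the all-ones mask
 * 0xffffffffffffffff when a == b and 0 otherwise, without branching. With
 * x = a ^ b, the top bit of x | -x is set exactly when x is nonzero, so
 * shifting it down and subtracting 1 turns it into a full-width mask.
 * For example, eq_mask(5, 5) == 0xffffffffffffffffULL and eq_mask(5, 6) == 0. */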
static __always_inline u64 eq_mask(u64 a, u64 b)
{
	u64 x = a ^ b;
	u64 minus_x = ~x + (u64)1U;
	u64 x_or_minus_x = x | minus_x;
	u64 xnx = x_or_minus_x >> (u32)63U;
	return xnx - (u64)1U;
}

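/* Constant-time unsigned comparison: returns the all-ones mask when a >= b
 * and 0 otherwise. The top bit of a ^ ((a ^ b) | ((a - b) ^ b)) is a
 * branch-free computation of the borrow of a - b, so it is clear exactly
 * when a >= b. */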
static __always_inline u64 gte_mask(u64 a, u64 b)
{
	u64 x = a;
	u64 y = b;
	u64 x_xor_y = x ^ y;
	u64 x_sub_y = x - y;
	u64 x_sub_y_xor_y = x_sub_y ^ y;
	u64 q = x_xor_y | x_sub_y_xor_y;
	u64 x_xor_q = x ^ q;
	u64 x_xor_q_ = x_xor_q >> (u32)63U;
	return x_xor_q_ - (u64)1U;
}

/* Computes the addition of the four-limb element f1 and the scalar f2,
 * returning the carry out of the top limb (if any) */
static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
{
	u64 carry_r;

	asm volatile(
		/* Clear registers to propagate the carry bit */
		"  xor %%r8d, %%r8d;"
		"  xor %%r9d, %%r9d;"
		"  xor %%r10d, %%r10d;"
		"  xor %%r11d, %%r11d;"
		"  xor %k1, %k1;"

		/* Begin addition chain */
		"  addq 0(%3), %0;"
		"  movq %0, 0(%2);"
		"  adcxq 8(%3), %%r8;"
		"  movq %%r8, 8(%2);"
		"  adcxq 16(%3), %%r9;"
		"  movq %%r9, 16(%2);"
		"  adcxq 24(%3), %%r10;"
		"  movq %%r10, 24(%2);"

		/* Return the carry bit in a register */
		"  adcx %%r11, %1;"
		: "+&r"(f2), "=&r"(carry_r)
		: "r"(out), "r"(f1)
		: "%r8", "%r9", "%r10", "%r11", "memory", "cc");

	return carry_r;
}

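/* Field elements below are little-endian four-limb representations of values
 * in GF(2^255 - 19). Since 2^256 mod (2^255 - 19) = 38, a carry (or borrow)
 * of 1 out of the top limb is worth 38 in the low limb, so the "wrap the
 * result back into the field" steps in the routines below fold any overflow
 * back in by adding (or subtracting) carry * 38. */
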
/* Computes the field addition of two field elements */
static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
{
	asm volatile(
		/* Compute the raw addition of f1 + f2 */
		"  movq 0(%0), %%r8;"
		"  addq 0(%2), %%r8;"
		"  movq 8(%0), %%r9;"
		"  adcxq 8(%2), %%r9;"
		"  movq 16(%0), %%r10;"
		"  adcxq 16(%2), %%r10;"
		"  movq 24(%0), %%r11;"
		"  adcxq 24(%2), %%r11;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 */
		"  mov $0, %%rax;"
		"  mov $38, %0;"
		"  cmovc %0, %%rax;"

		/* Step 2: Add carry*38 to the original sum */
		"  xor %%ecx, %%ecx;"
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %0, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%1);"
		: "+&r"(f2)
		: "r"(out), "r"(f1)
		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}

/* Computes the field subtraction of two field elements */
static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
{
	asm volatile(
		/* Compute the raw subtraction of f1-f2 */
		"  movq 0(%1), %%r8;"
		"  subq 0(%2), %%r8;"
		"  movq 8(%1), %%r9;"
		"  sbbq 8(%2), %%r9;"
		"  movq 16(%1), %%r10;"
		"  sbbq 16(%2), %%r10;"
		"  movq 24(%1), %%r11;"
		"  sbbq 24(%2), %%r11;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 */
		"  mov $0, %%rax;"
		"  mov $38, %%rcx;"
		"  cmovc %%rcx, %%rax;"

		/* Step 2: Subtract carry*38 from the original difference */
		"  sub %%rax, %%r8;"
		"  sbb $0, %%r9;"
		"  sbb $0, %%r10;"
		"  sbb $0, %%r11;"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rcx, %%rax;"
		"  sub %%rax, %%r8;"

		/* Store the result */
		"  movq %%r8, 0(%0);"
		"  movq %%r9, 8(%0);"
		"  movq %%r10, 16(%0);"
		"  movq %%r11, 24(%0);"
		:
		: "r"(out), "r"(f1), "r"(f2)
		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}

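/* The multiplication routines below rely on the BMI2 and ADX instruction set
 * extensions: "mulxq s, lo, hi" computes hi:lo = rdx * s without touching
 * any flags, while adcx and adox add with carry through the CF and OF flags
 * respectively, so two independent carry chains can be interleaved in a
 * single pass over the partial products. */
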
/* Computes a field multiplication: out <- f1 * f2
 * Uses the 8-element buffer tmp for intermediate results */
static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
	asm volatile(

		/* Compute the raw multiplication: tmp <- src1 * src2 */

		/* Compute src1[0] * src2 */
		"  movq 0(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  movq %%r8, 0(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  movq %%r10, 8(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"

		/* Compute src1[1] * src2 */
		"  movq 8(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 8(%2), %%r8;"
		"  movq %%r8, 8(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 16(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[2] * src2 */
		"  movq 16(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 16(%2), %%r8;"
		"  movq %%r8, 16(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 24(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[3] * src2 */
		"  movq 24(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 24(%2), %%r8;"
		"  movq %%r8, 24(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 32(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  movq %%rbx, 40(%2);"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  movq %%r14, 48(%2);"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"
		"  movq %%rax, 56(%2);"

		/* Line up pointers */
		"  mov %2, %0;"
		"  mov %3, %2;"

		/* Wrap the result back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%0), %%r8, %%r13;"
		"  xor %k1, %k1;"
		"  adoxq 0(%0), %%r8;"
		"  mulxq 40(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%0), %%r9;"
		"  mulxq 48(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%0), %%r10;"
		"  mulxq 56(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%0), %%r11;"
		"  adcx %1, %%rax;"
		"  adox %1, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %1, %%r9;"
		"  movq %%r9, 8(%2);"
		"  adcx %1, %%r10;"
		"  movq %%r10, 16(%2);"
		"  adcx %1, %%r11;"
		"  movq %%r11, 24(%2);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%2);"
		: "+&r"(f1), "+&r"(f2), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
		  "%r14", "memory", "cc");
}

/* Computes two field multiplications:
 *   out[0] <- f1[0] * f2[0]
 *   out[1] <- f1[1] * f2[1]
 * Uses the 16-element buffer tmp for intermediate results: */
static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
	asm volatile(

		/* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */

		/* Compute src1[0] * src2 */
		"  movq 0(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  movq %%r8, 0(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  movq %%r10, 8(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"

		/* Compute src1[1] * src2 */
		"  movq 8(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 8(%2), %%r8;"
		"  movq %%r8, 8(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 16(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[2] * src2 */
		"  movq 16(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 16(%2), %%r8;"
		"  movq %%r8, 16(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 24(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[3] * src2 */
		"  movq 24(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 24(%2), %%r8;"
		"  movq %%r8, 24(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 32(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  movq %%rbx, 40(%2);"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  movq %%r14, 48(%2);"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"
		"  movq %%rax, 56(%2);"

		/* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */

		/* Compute src1[0] * src2 */
		"  movq 32(%0), %%rdx;"
		"  mulxq 32(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  movq %%r8, 64(%2);"
		"  mulxq 40(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  movq %%r10, 72(%2);"
		"  mulxq 48(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  mulxq 56(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"

		/* Compute src1[1] * src2 */
		"  movq 40(%0), %%rdx;"
		"  mulxq 32(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 72(%2), %%r8;"
		"  movq %%r8, 72(%2);"
		"  mulxq 40(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 80(%2);"
		"  mulxq 48(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 56(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[2] * src2 */
		"  movq 48(%0), %%rdx;"
		"  mulxq 32(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 80(%2), %%r8;"
		"  movq %%r8, 80(%2);"
		"  mulxq 40(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 88(%2);"
		"  mulxq 48(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 56(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[3] * src2 */
		"  movq 56(%0), %%rdx;"
		"  mulxq 32(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 88(%2), %%r8;"
		"  movq %%r8, 88(%2);"
		"  mulxq 40(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 96(%2);"
		"  mulxq 48(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  movq %%rbx, 104(%2);"
		"  mov $0, %%r8;"
		"  mulxq 56(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  movq %%r14, 112(%2);"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"
		"  movq %%rax, 120(%2);"

		/* Line up pointers */
		"  mov %2, %0;"
		"  mov %3, %2;"

		/* Wrap the results back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%0), %%r8, %%r13;"
		"  xor %k1, %k1;"
		"  adoxq 0(%0), %%r8;"
		"  mulxq 40(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%0), %%r9;"
		"  mulxq 48(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%0), %%r10;"
		"  mulxq 56(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%0), %%r11;"
		"  adcx %1, %%rax;"
		"  adox %1, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %1, %%r9;"
		"  movq %%r9, 8(%2);"
		"  adcx %1, %%r10;"
		"  movq %%r10, 16(%2);"
		"  adcx %1, %%r11;"
		"  movq %%r11, 24(%2);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%2);"

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 96(%0), %%r8, %%r13;"
		"  xor %k1, %k1;"
		"  adoxq 64(%0), %%r8;"
		"  mulxq 104(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 72(%0), %%r9;"
		"  mulxq 112(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 80(%0), %%r10;"
		"  mulxq 120(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 88(%0), %%r11;"
		"  adcx %1, %%rax;"
		"  adox %1, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %1, %%r9;"
		"  movq %%r9, 40(%2);"
		"  adcx %1, %%r10;"
		"  movq %%r10, 48(%2);"
		"  adcx %1, %%r11;"
		"  movq %%r11, 56(%2);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 32(%2);"
		: "+&r"(f1), "+&r"(f2), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
		  "%r14", "memory", "cc");
}

/* Computes the field multiplication of the four-limb element f1 by the
 * scalar f2; requires f2 to be smaller than 2^17 */
static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
{
	register u64 f2_r asm("rdx") = f2;

	asm volatile(
		/* Compute the raw multiplication of f1*f2 */
		"  mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
		"  mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */
		"  add %%rcx, %%r9;"
		"  mov $0, %%rcx;"
		"  mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
		"  adcx %%rbx, %%r10;"
		"  mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
		"  adcx %%r13, %%r11;"
		"  adcx %%rcx, %%rax;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 */
		"  mov $38, %%rdx;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%1);"
		: "+&r"(f2_r)
		: "r"(out), "r"(f1)
		: "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13",
		  "memory", "cc");
}

/* Conditionally swaps the 8-limb buffers p1 and p2 in constant time:
 * (p1, p2) <- bit ? (p2, p1) : (p1, p2) */
static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
{
	asm volatile(
		/* Transfer bit into the CF flag: adding 2^64 - 1 carries
		 * exactly when bit is nonzero */
		"  add $18446744073709551615, %0;"

		/* cswap p1[0], p2[0] */
		"  movq 0(%1), %%r8;"
		"  movq 0(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 0(%1);"
		"  movq %%r9, 0(%2);"

		/* cswap p1[1], p2[1] */
		"  movq 8(%1), %%r8;"
		"  movq 8(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 8(%1);"
		"  movq %%r9, 8(%2);"

		/* cswap p1[2], p2[2] */
		"  movq 16(%1), %%r8;"
		"  movq 16(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 16(%1);"
		"  movq %%r9, 16(%2);"

		/* cswap p1[3], p2[3] */
		"  movq 24(%1), %%r8;"
		"  movq 24(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 24(%1);"
		"  movq %%r9, 24(%2);"

		/* cswap p1[4], p2[4] */
		"  movq 32(%1), %%r8;"
		"  movq 32(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 32(%1);"
		"  movq %%r9, 32(%2);"

		/* cswap p1[5], p2[5] */
		"  movq 40(%1), %%r8;"
		"  movq 40(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 40(%1);"
		"  movq %%r9, 40(%2);"

		/* cswap p1[6], p2[6] */
		"  movq 48(%1), %%r8;"
		"  movq 48(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 48(%1);"
		"  movq %%r9, 48(%2);"

		/* cswap p1[7], p2[7] */
		"  movq 56(%1), %%r8;"
		"  movq 56(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 56(%1);"
		"  movq %%r9, 56(%2);"
		: "+&r"(bit)
		: "r"(p1), "r"(p2)
		: "%r8", "%r9", "%r10", "memory", "cc");
}

/* Computes the square of a field element: out <- f * f
 * Uses the 8-element buffer tmp for intermediate results */
static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
{
	asm volatile(
		/* Compute the raw multiplication: tmp <- f * f */

		/* Step 1: Compute all partial products */
		"  movq 0(%0), %%rdx;" /* f[0] */
		"  mulxq 8(%0), %%r8, %%r14;" /* f[1]*f[0] */
		"  xor %%r15d, %%r15d;"
		"  mulxq 16(%0), %%r9, %%r10;" /* f[2]*f[0] */
		"  adcx %%r14, %%r9;"
		"  mulxq 24(%0), %%rax, %%rcx;" /* f[3]*f[0] */
		"  adcx %%rax, %%r10;"
		"  movq 24(%0), %%rdx;" /* f[3] */
		"  mulxq 8(%0), %%r11, %%rbx;" /* f[1]*f[3] */
		"  adcx %%rcx, %%r11;"
		"  mulxq 16(%0), %%rax, %%r13;" /* f[2]*f[3] */
		"  adcx %%rax, %%rbx;"
		"  movq 8(%0), %%rdx;" /* f[1] */
		"  adcx %%r15, %%r13;"
		"  mulxq 16(%0), %%rax, %%rcx;" /* f[2]*f[1] */
		"  mov $0, %%r14;"

		/* Step 2: Compute two parallel carry chains */
		"  xor %%r15d, %%r15d;"
		"  adox %%rax, %%r10;"
		"  adcx %%r8, %%r8;"
		"  adox %%rcx, %%r11;"
		"  adcx %%r9, %%r9;"
		"  adox %%r15, %%rbx;"
		"  adcx %%r10, %%r10;"
		"  adox %%r15, %%r13;"
		"  adcx %%r11, %%r11;"
		"  adox %%r15, %%r14;"
		"  adcx %%rbx, %%rbx;"
		"  adcx %%r13, %%r13;"
		"  adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		"  movq 0(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
		"  movq %%rax, 0(%1);"
		"  add %%rcx, %%r8;"
		"  movq %%r8, 8(%1);"
		"  movq 8(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
		"  adcx %%rax, %%r9;"
		"  movq %%r9, 16(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 24(%1);"
		"  movq 16(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
		"  adcx %%rax, %%r11;"
		"  movq %%r11, 32(%1);"
		"  adcx %%rcx, %%rbx;"
		"  movq %%rbx, 40(%1);"
		"  movq 24(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
		"  adcx %%rax, %%r13;"
		"  movq %%r13, 48(%1);"
		"  adcx %%rcx, %%r14;"
		"  movq %%r14, 56(%1);"

		/* Line up pointers */
		"  mov %1, %0;"
		"  mov %2, %1;"

		/* Wrap the result back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%0), %%r8, %%r13;"
		"  xor %%ecx, %%ecx;"
		"  adoxq 0(%0), %%r8;"
		"  mulxq 40(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%0), %%r9;"
		"  mulxq 48(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%0), %%r10;"
		"  mulxq 56(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%0), %%r11;"
		"  adcx %%rcx, %%rax;"
		"  adox %%rcx, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%1);"
		: "+&r"(f), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
		  "%r13", "%r14", "%r15", "memory", "cc");
}

/* Computes two field squarings:
 *   out[0] <- f[0] * f[0]
 *   out[1] <- f[1] * f[1]
 * Uses the 16-element buffer tmp for intermediate results */
static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
{
	asm volatile(
		/* Step 1: Compute all partial products */
		"  movq 0(%0), %%rdx;" /* f[0] */
		"  mulxq 8(%0), %%r8, %%r14;" /* f[1]*f[0] */
		"  xor %%r15d, %%r15d;"
		"  mulxq 16(%0), %%r9, %%r10;" /* f[2]*f[0] */
		"  adcx %%r14, %%r9;"
		"  mulxq 24(%0), %%rax, %%rcx;" /* f[3]*f[0] */
		"  adcx %%rax, %%r10;"
		"  movq 24(%0), %%rdx;" /* f[3] */
		"  mulxq 8(%0), %%r11, %%rbx;" /* f[1]*f[3] */
		"  adcx %%rcx, %%r11;"
		"  mulxq 16(%0), %%rax, %%r13;" /* f[2]*f[3] */
		"  adcx %%rax, %%rbx;"
		"  movq 8(%0), %%rdx;" /* f[1] */
		"  adcx %%r15, %%r13;"
		"  mulxq 16(%0), %%rax, %%rcx;" /* f[2]*f[1] */
		"  mov $0, %%r14;"

		/* Step 2: Compute two parallel carry chains */
		"  xor %%r15d, %%r15d;"
		"  adox %%rax, %%r10;"
		"  adcx %%r8, %%r8;"
		"  adox %%rcx, %%r11;"
		"  adcx %%r9, %%r9;"
		"  adox %%r15, %%rbx;"
		"  adcx %%r10, %%r10;"
		"  adox %%r15, %%r13;"
		"  adcx %%r11, %%r11;"
		"  adox %%r15, %%r14;"
		"  adcx %%rbx, %%rbx;"
		"  adcx %%r13, %%r13;"
		"  adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		"  movq 0(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
		"  movq %%rax, 0(%1);"
		"  add %%rcx, %%r8;"
		"  movq %%r8, 8(%1);"
		"  movq 8(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
		"  adcx %%rax, %%r9;"
		"  movq %%r9, 16(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 24(%1);"
		"  movq 16(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
		"  adcx %%rax, %%r11;"
		"  movq %%r11, 32(%1);"
		"  adcx %%rcx, %%rbx;"
		"  movq %%rbx, 40(%1);"
		"  movq 24(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
		"  adcx %%rax, %%r13;"
		"  movq %%r13, 48(%1);"
		"  adcx %%rcx, %%r14;"
		"  movq %%r14, 56(%1);"

		/* Step 1: Compute all partial products */
		"  movq 32(%0), %%rdx;" /* f[0] */
		"  mulxq 40(%0), %%r8, %%r14;" /* f[1]*f[0] */
		"  xor %%r15d, %%r15d;"
		"  mulxq 48(%0), %%r9, %%r10;" /* f[2]*f[0] */
		"  adcx %%r14, %%r9;"
		"  mulxq 56(%0), %%rax, %%rcx;" /* f[3]*f[0] */
		"  adcx %%rax, %%r10;"
		"  movq 56(%0), %%rdx;" /* f[3] */
		"  mulxq 40(%0), %%r11, %%rbx;" /* f[1]*f[3] */
		"  adcx %%rcx, %%r11;"
		"  mulxq 48(%0), %%rax, %%r13;" /* f[2]*f[3] */
		"  adcx %%rax, %%rbx;"
		"  movq 40(%0), %%rdx;" /* f[1] */
		"  adcx %%r15, %%r13;"
		"  mulxq 48(%0), %%rax, %%rcx;" /* f[2]*f[1] */
		"  mov $0, %%r14;"

		/* Step 2: Compute two parallel carry chains */
		"  xor %%r15d, %%r15d;"
		"  adox %%rax, %%r10;"
		"  adcx %%r8, %%r8;"
		"  adox %%rcx, %%r11;"
		"  adcx %%r9, %%r9;"
		"  adox %%r15, %%rbx;"
		"  adcx %%r10, %%r10;"
		"  adox %%r15, %%r13;"
		"  adcx %%r11, %%r11;"
		"  adox %%r15, %%r14;"
		"  adcx %%rbx, %%rbx;"
		"  adcx %%r13, %%r13;"
		"  adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		"  movq 32(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
		"  movq %%rax, 64(%1);"
		"  add %%rcx, %%r8;"
		"  movq %%r8, 72(%1);"
		"  movq 40(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
		"  adcx %%rax, %%r9;"
		"  movq %%r9, 80(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 88(%1);"
		"  movq 48(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
		"  adcx %%rax, %%r11;"
		"  movq %%r11, 96(%1);"
		"  adcx %%rcx, %%rbx;"
		"  movq %%rbx, 104(%1);"
		"  movq 56(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
		"  adcx %%rax, %%r13;"
		"  movq %%r13, 112(%1);"
		"  adcx %%rcx, %%r14;"
		"  movq %%r14, 120(%1);"

		/* Line up pointers */
		"  mov %1, %0;"
		"  mov %2, %1;"

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%0), %%r8, %%r13;"
		"  xor %%ecx, %%ecx;"
		"  adoxq 0(%0), %%r8;"
		"  mulxq 40(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%0), %%r9;"
		"  mulxq 48(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%0), %%r10;"
		"  mulxq 56(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%0), %%r11;"
		"  adcx %%rcx, %%rax;"
		"  adox %%rcx, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%1);"

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 96(%0), %%r8, %%r13;"
		"  xor %%ecx, %%ecx;"
		"  adoxq 64(%0), %%r8;"
		"  mulxq 104(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 72(%0), %%r9;"
		"  mulxq 112(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 80(%0), %%r10;"
		"  mulxq 120(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 88(%0), %%r11;"
		"  adcx %%rcx, %%rax;"
		"  adox %%rcx, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 40(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 48(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 56(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 32(%1);"
		: "+&r"(f), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
		  "%r13", "%r14", "%r15", "memory", "cc");
}

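/* One Montgomery ladder step: doubles the projective point p0 and computes
 * the differential addition p0 + p1, both in place in p01_tmp1, using the
 * base-point x-coordinate in q and tmp2 as multiplication scratch. */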
static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
{
	u64 *nq = p01_tmp1;
	u64 *nq_p1 = p01_tmp1 + (u32)8U;
	u64 *tmp1 = p01_tmp1 + (u32)16U;
	u64 *x1 = q;
	u64 *x2 = nq;
	u64 *z2 = nq + (u32)4U;
	u64 *z3 = nq_p1 + (u32)4U;
	u64 *a = tmp1;
	u64 *b = tmp1 + (u32)4U;
	u64 *ab = tmp1;
	u64 *dc = tmp1 + (u32)8U;
	u64 *x3;
	u64 *z31;
	u64 *d0;
	u64 *c0;
	u64 *a1;
	u64 *b1;
	u64 *d;
	u64 *c;
	u64 *ab1;
	u64 *dc1;
	fadd(a, x2, z2);
	fsub(b, x2, z2);
	x3 = nq_p1;
	z31 = nq_p1 + (u32)4U;
	d0 = dc;
	c0 = dc + (u32)4U;
	fadd(c0, x3, z31);
	fsub(d0, x3, z31);
	fmul2(dc, dc, ab, tmp2);
	fadd(x3, d0, c0);
	fsub(z31, d0, c0);
	a1 = tmp1;
	b1 = tmp1 + (u32)4U;
	d = tmp1 + (u32)8U;
	c = tmp1 + (u32)12U;
	ab1 = tmp1;
	dc1 = tmp1 + (u32)8U;
	fsqr2(dc1, ab1, tmp2);
	fsqr2(nq_p1, nq_p1, tmp2);
	a1[0U] = c[0U];
	a1[1U] = c[1U];
	a1[2U] = c[2U];
	a1[3U] = c[3U];
	fsub(c, d, c);
	fmul_scalar(b1, c, (u64)121665U);
	fadd(b1, b1, d);
	fmul2(nq, dc1, ab1, tmp2);
	fmul(z3, z3, x1, tmp2);
}

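/* Doubles the projective point nq in place; tmp1 holds the intermediate
 * a/b/d/c values and tmp2 is multiplication scratch. */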
static void point_double(u64 *nq, u64 *tmp1, u64 *tmp2)
{
	u64 *x2 = nq;
	u64 *z2 = nq + (u32)4U;
	u64 *a = tmp1;
	u64 *b = tmp1 + (u32)4U;
	u64 *d = tmp1 + (u32)8U;
	u64 *c = tmp1 + (u32)12U;
	u64 *ab = tmp1;
	u64 *dc = tmp1 + (u32)8U;
	fadd(a, x2, z2);
	fsub(b, x2, z2);
	fsqr2(dc, ab, tmp2);
	a[0U] = c[0U];
	a[1U] = c[1U];
	a[2U] = c[2U];
	a[3U] = c[3U];
	fsub(c, d, c);
	fmul_scalar(b, c, (u64)121665U);
	fadd(b, b, d);
	fmul2(nq, dc, ab, tmp2);
}

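/* Montgomery ladder computing out <- key * (point with x-coordinate init1).
 * Scalar clamping is built into the ladder itself: bit 254 is treated as
 * always set (the initial cswap2 plus first ladder step), the loop walks key
 * bits 253 down to 3, and the three trailing point_double calls multiply by
 * 8, matching the three cleared low bits of a clamped scalar; bits 255 and
 * 254 of the key are never read. */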
static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1)
{
	u64 tmp2[16U] = { 0U };
	u64 p01_tmp1_swap[33U] = { 0U };
	u64 *p0 = p01_tmp1_swap;
	u64 *p01 = p01_tmp1_swap;
	u64 *p03 = p01;
	u64 *p11 = p01 + (u32)8U;
	u64 *x0;
	u64 *z0;
	u64 *p01_tmp1;
	u64 *p01_tmp11;
	u64 *nq10;
	u64 *nq_p11;
	u64 *swap1;
	u64 sw0;
	u64 *nq1;
	u64 *tmp1;
	memcpy(p11, init1, (u32)8U * sizeof(init1[0U]));
	x0 = p03;
	z0 = p03 + (u32)4U;
	x0[0U] = (u64)1U;
	x0[1U] = (u64)0U;
	x0[2U] = (u64)0U;
	x0[3U] = (u64)0U;
	z0[0U] = (u64)0U;
	z0[1U] = (u64)0U;
	z0[2U] = (u64)0U;
	z0[3U] = (u64)0U;
	p01_tmp1 = p01_tmp1_swap;
	p01_tmp11 = p01_tmp1_swap;
	nq10 = p01_tmp1_swap;
	nq_p11 = p01_tmp1_swap + (u32)8U;
	swap1 = p01_tmp1_swap + (u32)32U;
	cswap2((u64)1U, nq10, nq_p11);
	point_add_and_double(init1, p01_tmp11, tmp2);
	swap1[0U] = (u64)1U;
	{
		u32 i;
		for (i = (u32)0U; i < (u32)251U; i = i + (u32)1U) {
			u64 *p01_tmp12 = p01_tmp1_swap;
			u64 *swap2 = p01_tmp1_swap + (u32)32U;
			u64 *nq2 = p01_tmp12;
			u64 *nq_p12 = p01_tmp12 + (u32)8U;
			u64 bit = (u64)(key[((u32)253U - i) / (u32)8U] >> ((u32)253U - i) % (u32)8U & (u8)1U);
			u64 sw = swap2[0U] ^ bit;
			cswap2(sw, nq2, nq_p12);
			point_add_and_double(init1, p01_tmp12, tmp2);
			swap2[0U] = bit;
		}
	}
	sw0 = swap1[0U];
	cswap2(sw0, nq10, nq_p11);
	nq1 = p01_tmp1;
	tmp1 = p01_tmp1 + (u32)16U;
	point_double(nq1, tmp1, tmp2);
	point_double(nq1, tmp1, tmp2);
	point_double(nq1, tmp1, tmp2);
	memcpy(out, p0, (u32)8U * sizeof(p0[0U]));

	memzero_explicit(tmp2, sizeof(tmp2));
	memzero_explicit(p01_tmp1_swap, sizeof(p01_tmp1_swap));
}

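/* Computes o <- inp^(2^n1) by n1 repeated squarings; requires n1 >= 1 */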
static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1)
{
	u32 i;
	fsqr(o, inp, tmp);
	for (i = (u32)0U; i < n1 - (u32)1U; i = i + (u32)1U)
		fsqr(o, o, tmp);
}

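/* Computes the multiplicative inverse o <- i^(p - 2) mod p for
 * p = 2^255 - 19 via Fermat's little theorem, using an addition chain of
 * 254 squarings and 11 multiplications. */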
static void finv(u64 *o, const u64 *i, u64 *tmp)
{
	u64 t1[16U] = { 0U };
	u64 *a0 = t1;
	u64 *b = t1 + (u32)4U;
	u64 *c = t1 + (u32)8U;
	u64 *t00 = t1 + (u32)12U;
	u64 *tmp1 = tmp;
	u64 *a;
	u64 *t0;
	fsquare_times(a0, i, tmp1, (u32)1U);
	fsquare_times(t00, a0, tmp1, (u32)2U);
	fmul(b, t00, i, tmp);
	fmul(a0, b, a0, tmp);
	fsquare_times(t00, a0, tmp1, (u32)1U);
	fmul(b, t00, b, tmp);
	fsquare_times(t00, b, tmp1, (u32)5U);
	fmul(b, t00, b, tmp);
	fsquare_times(t00, b, tmp1, (u32)10U);
	fmul(c, t00, b, tmp);
	fsquare_times(t00, c, tmp1, (u32)20U);
	fmul(t00, t00, c, tmp);
	fsquare_times(t00, t00, tmp1, (u32)10U);
	fmul(b, t00, b, tmp);
	fsquare_times(t00, b, tmp1, (u32)50U);
	fmul(c, t00, b, tmp);
	fsquare_times(t00, c, tmp1, (u32)100U);
	fmul(t00, t00, c, tmp);
	fsquare_times(t00, t00, tmp1, (u32)50U);
	fmul(t00, t00, b, tmp);
	fsquare_times(t00, t00, tmp1, (u32)5U);
	a = t1;
	t0 = t1 + (u32)12U;
	fmul(o, t0, a, tmp);
}

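/* Fully reduces f modulo p = 2^255 - 19 and stores the canonical four-limb
 * result in b: the top bit is folded back in as 19 twice, then p is
 * subtracted in constant time whenever f is still >= p, using the mask
 * helpers above. */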
static void store_felem(u64 *b, u64 *f)
{
	u64 f30 = f[3U];
	u64 top_bit0 = f30 >> (u32)63U;
	u64 f31;
	u64 top_bit;
	u64 f0;
	u64 f1;
	u64 f2;
	u64 f3;
	u64 m0;
	u64 m1;
	u64 m2;
	u64 m3;
	u64 mask;
	u64 f0_;
	u64 f1_;
	u64 f2_;
	u64 f3_;
	u64 o0;
	u64 o1;
	u64 o2;
	u64 o3;
	f[3U] = f30 & (u64)0x7fffffffffffffffU;
	add_scalar(f, f, (u64)19U * top_bit0);
	f31 = f[3U];
	top_bit = f31 >> (u32)63U;
	f[3U] = f31 & (u64)0x7fffffffffffffffU;
	add_scalar(f, f, (u64)19U * top_bit);
	f0 = f[0U];
	f1 = f[1U];
	f2 = f[2U];
	f3 = f[3U];
	m0 = gte_mask(f0, (u64)0xffffffffffffffedU);
	m1 = eq_mask(f1, (u64)0xffffffffffffffffU);
	m2 = eq_mask(f2, (u64)0xffffffffffffffffU);
	m3 = eq_mask(f3, (u64)0x7fffffffffffffffU);
	mask = ((m0 & m1) & m2) & m3;
	f0_ = f0 - (mask & (u64)0xffffffffffffffedU);
	f1_ = f1 - (mask & (u64)0xffffffffffffffffU);
	f2_ = f2 - (mask & (u64)0xffffffffffffffffU);
	f3_ = f3 - (mask & (u64)0x7fffffffffffffffU);
	o0 = f0_;
	o1 = f1_;
	o2 = f2_;
	o3 = f3_;
	b[0U] = o0;
	b[1U] = o1;
	b[2U] = o2;
	b[3U] = o3;
}

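/* Converts the projective point (X : Z) in i to the affine coordinate
 * x = X / Z, computing 1/Z with finv, and stores it as 32 little-endian
 * bytes in o. */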
static void encode_point(u8 *o, const u64 *i)
{
	const u64 *x = i;
	const u64 *z = i + (u32)4U;
	u64 tmp[4U] = { 0U };
	u64 tmp_w[16U] = { 0U };
	finv(tmp, z, tmp_w);
	fmul(tmp, tmp, x, tmp_w);
	store_felem((u64 *)o, tmp);
}

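/* Computes the X25519 function: out <- x-coordinate of priv * (point with
 * x-coordinate pub). The top bit of the incoming u-coordinate is masked off,
 * per RFC 7748, before the ladder runs. */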
static void curve25519_ever64(u8 *out, const u8 *priv, const u8 *pub)
{
	u64 init1[8U] = { 0U };
	u64 tmp[4U] = { 0U };
	u64 tmp3;
	u64 *x;
	u64 *z;
	{
		u32 i;
		for (i = (u32)0U; i < (u32)4U; i = i + (u32)1U) {
			u64 *os = tmp;
			const u8 *bj = pub + i * (u32)8U;
			u64 u = *(u64 *)bj;
			u64 r = u;
			u64 x0 = r;
			os[i] = x0;
		}
	}
	tmp3 = tmp[3U];
	tmp[3U] = tmp3 & (u64)0x7fffffffffffffffU;
	x = init1;
	z = init1 + (u32)4U;
	z[0U] = (u64)1U;
	z[1U] = (u64)0U;
	z[2U] = (u64)0U;
	z[3U] = (u64)0U;
	x[0U] = tmp[0U];
	x[1U] = tmp[1U];
	x[2U] = tmp[2U];
	x[3U] = tmp[3U];
	montgomery_ladder(init1, priv, init1);
	encode_point(out, init1);
}

/* The below constants were generated using this sage script:
 *
 * #!/usr/bin/env sage
 * import sys
 * from sage.all import *
 * def limbs(n):
 *	n = int(n)
 *	l = ((n >> 0) % 2^64, (n >> 64) % 2^64, (n >> 128) % 2^64, (n >> 192) % 2^64)
 *	return "0x%016xULL, 0x%016xULL, 0x%016xULL, 0x%016xULL" % l
 * ec = EllipticCurve(GF(2^255 - 19), [0, 486662, 0, 1, 0])
 * p_minus_s = (ec.lift_x(9) - ec.lift_x(1))[0]
 * print("static const u64 p_minus_s[] = { %s };\n" % limbs(p_minus_s))
 * print("static const u64 table_ladder[] = {")
 * p = ec.lift_x(9)
 * for i in range(252):
 *	l = (p[0] + p[2]) / (p[0] - p[2])
 *	print(("\t%s" + ("," if i != 251 else "")) % limbs(l))
 *	p = p * 2
 * print("};")
 *
 */

static const u64 p_minus_s[] = { 0x816b1e0137d48290ULL, 0x440f6a51eb4d1207ULL, 0x52385f46dca2b71dULL, 0x215132111d8354cbULL };

static const u64 table_ladder[] = {
	0xfffffffffffffff3ULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x5fffffffffffffffULL,
	0x6b8220f416aafe96ULL, 0x82ebeb2b4f566a34ULL, 0xd5a9a5b075a5950fULL, 0x5142b2cf4b2488f4ULL,
	0x6aaebc750069680cULL, 0x89cf7820a0f99c41ULL, 0x2a58d9183b56d0f4ULL, 0x4b5aca80e36011a4ULL,
	0x329132348c29745dULL, 0xf4a2e616e1642fd7ULL, 0x1e45bb03ff67bc34ULL, 0x306912d0f42a9b4aULL,
	0xff886507e6af7154ULL, 0x04f50e13dfeec82fULL, 0xaa512fe82abab5ceULL, 0x174e251a68d5f222ULL,
	0xcf96700d82028898ULL, 0x1743e3370a2c02c5ULL, 0x379eec98b4e86eaaULL, 0x0c59888a51e0482eULL,
	0xfbcbf1d699b5d189ULL, 0xacaef0d58e9fdc84ULL, 0xc1c20d06231f7614ULL, 0x2938218da274f972ULL,
	0xf6af49beff1d7f18ULL, 0xcc541c22387ac9c2ULL, 0x96fcc9ef4015c56bULL, 0x69c1627c690913a9ULL,
	0x7a86fd2f4733db0eULL, 0xfdb8c4f29e087de9ULL, 0x095e4b1a8ea2a229ULL, 0x1ad7a7c829b37a79ULL,
	0x342d89cad17ea0c0ULL, 0x67bedda6cced2051ULL, 0x19ca31bf2bb42f74ULL, 0x3df7b4c84980acbbULL,
	0xa8c6444dc80ad883ULL, 0xb91e440366e3ab85ULL, 0xc215cda00164f6d8ULL, 0x3d867c6ef247e668ULL,
	0xc7dd582bcc3e658cULL, 0xfd2c4748ee0e5528ULL, 0xa0fd9b95cc9f4f71ULL, 0x7529d871b0675ddfULL,
	0xb8f568b42d3cbd78ULL, 0x1233011b91f3da82ULL, 0x2dce6ccd4a7c3b62ULL, 0x75e7fc8e9e498603ULL,
	0x2f4f13f1fcd0b6ecULL, 0xf1a8ca1f29ff7a45ULL, 0xc249c1a72981e29bULL, 0x6ebe0dbb8c83b56aULL,
	0x7114fa8d170bb222ULL, 0x65a2dcd5bf93935fULL, 0xbdc41f68b59c979aULL, 0x2f0eef79a2ce9289ULL,
	0x42ecbf0c083c37ceULL, 0x2930bc09ec496322ULL, 0xf294b0c19cfeac0dULL, 0x3780aa4bedfabb80ULL,
	0x56c17d3e7cead929ULL, 0xe7cb4beb2e5722c5ULL, 0x0ce931732dbfe15aULL, 0x41b883c7621052f8ULL,
	0xdbf75ca0c3d25350ULL, 0x2936be086eb1e351ULL, 0xc936e03cb4a9b212ULL, 0x1d45bf82322225aaULL,
	0xe81ab1036a024cc5ULL, 0xe212201c304c9a72ULL, 0xc5d73fba6832b1fcULL, 0x20ffdb5a4d839581ULL,
	0xa283d367be5d0fadULL, 0x6c2b25ca8b164475ULL, 0x9d4935467caaf22eULL, 0x5166408eee85ff49ULL,
	0x3c67baa2fab4e361ULL, 0xb3e433c67ef35cefULL, 0x5259729241159b1cULL, 0x6a621892d5b0ab33ULL,
	0x20b74a387555cdcbULL, 0x532aa10e1208923fULL, 0xeaa17b7762281dd1ULL, 0x61ab3443f05c44bfULL,
	0x257a6c422324def8ULL, 0x131c6c1017e3cf7fULL, 0x23758739f630a257ULL, 0x295a407a01a78580ULL,
	0xf8c443246d5da8d9ULL, 0x19d775450c52fa5dULL, 0x2afcfc92731bf83dULL, 0x7d10c8e81b2b4700ULL,
	0xc8e0271f70baa20bULL, 0x993748867ca63957ULL, 0x5412efb3cb7ed4bbULL, 0x3196d36173e62975ULL,
	0xde5bcad141c7dffcULL, 0x47cc8cd2b395c848ULL, 0xa34cd942e11af3cbULL, 0x0256dbf2d04ecec2ULL,
	0x875ab7e94b0e667fULL, 0xcad4dd83c0850d10ULL, 0x47f12e8f4e72c79fULL, 0x5f1a87bb8c85b19bULL,
	0x7ae9d0b6437f51b8ULL, 0x12c7ce5518879065ULL, 0x2ade09fe5cf77aeeULL, 0x23a05a2f7d2c5627ULL,
	0x5908e128f17c169aULL, 0xf77498dd8ad0852dULL, 0x74b4c4ceab102f64ULL, 0x183abadd10139845ULL,
	0xb165ba8daa92aaacULL, 0xd5c5ef9599386705ULL, 0xbe2f8f0cf8fc40d1ULL, 0x2701e635ee204514ULL,
	0x629fa80020156514ULL, 0xf223868764a8c1ceULL, 0x5b894fff0b3f060eULL, 0x60d9944cf708a3faULL,
	0xaeea001a1c7a201fULL, 0xebf16a633ee2ce63ULL, 0x6f7709594c7a07e1ULL, 0x79b958150d0208cbULL,
	0x24b55e5301d410e7ULL, 0xe3a34edff3fdc84dULL, 0xd88768e4904032d8ULL, 0x131384427b3aaeecULL,
	0x8405e51286234f14ULL, 0x14dc4739adb4c529ULL, 0xb8a2b5b250634ffdULL, 0x2fe2a94ad8a7ff93ULL,
	0xec5c57efe843faddULL, 0x2843ce40f0bb9918ULL, 0xa4b561d6cf3d6305ULL, 0x743629bde8fb777eULL,
	0x343edd46bbaf738fULL, 0xed981828b101a651ULL, 0xa401760b882c797aULL, 0x1fc223e28dc88730ULL,
	0x48604e91fc0fba0eULL, 0xb637f78f052c6fa4ULL, 0x91ccac3d09e9239cULL, 0x23f7eed4437a687cULL,
	0x5173b1118d9bd800ULL, 0x29d641b63189d4a7ULL, 0xfdbf177988bbc586ULL, 0x2959894fcad81df5ULL,
	0xaebc8ef3b4bbc899ULL, 0x4148995ab26992b9ULL, 0x24e20b0134f92cfbULL, 0x40d158894a05dee8ULL,
	0x46b00b1185af76f6ULL, 0x26bac77873187a79ULL, 0x3dc0bf95ab8fff5fULL, 0x2a608bd8945524d7ULL,
	0x26449588bd446302ULL, 0x7c4bc21c0388439cULL, 0x8e98a4f383bd11b2ULL, 0x26218d7bc9d876b9ULL,
	0xe3081542997c178aULL, 0x3c2d29a86fb6606fULL, 0x5c217736fa279374ULL, 0x7dde05734afeb1faULL,
	0x3bf10e3906d42babULL, 0xe4f7803e1980649cULL, 0xe6053bf89595bf7aULL, 0x394faf38da245530ULL,
	0x7a8efb58896928f4ULL, 0xfbc778e9cc6a113cULL, 0x72670ce330af596fULL, 0x48f222a81d3d6cf7ULL,
	0xf01fce410d72caa7ULL, 0x5a20ecc7213b5595ULL, 0x7bc21165c1fa1483ULL, 0x07f89ae31da8a741ULL,
	0x05d2c2b4c6830ff9ULL, 0xd43e330fc6316293ULL, 0xa5a5590a96d3a904ULL, 0x705edb91a65333b6ULL,
	0x048ee15e0bb9a5f7ULL, 0x3240cfca9e0aaf5dULL, 0x8f4b71ceedc4a40bULL, 0x621c0da3de544a6dULL,
	0x92872836a08c4091ULL, 0xce8375b010c91445ULL, 0x8a72eb524f276394ULL, 0x2667fcfa7ec83635ULL,
	0x7f4c173345e8752aULL, 0x061b47feee7079a5ULL, 0x25dd9afa9f86ff34ULL, 0x3780cef5425dc89cULL,
	0x1a46035a513bb4e9ULL, 0x3e1ef379ac575adaULL, 0xc78c5f1c5fa24b50ULL, 0x321a967634fd9f22ULL,
	0x946707b8826e27faULL, 0x3dca84d64c506fd0ULL, 0xc189218075e91436ULL, 0x6d9284169b3b8484ULL,
	0x3a67e840383f2ddfULL, 0x33eec9a30c4f9b75ULL, 0x3ec7c86fa783ef47ULL, 0x26ec449fbac9fbc4ULL,
	0x5c0f38cba09b9e7dULL, 0x81168cc762a3478cULL, 0x3e23b0d306fc121cULL, 0x5a238aa0a5efdcddULL,
	0x1ba26121c4ea43ffULL, 0x36f8c77f7c8832b5ULL, 0x88fbea0b0adcf99aULL, 0x5ca9938ec25bebf9ULL,
	0xd5436a5e51fccda0ULL, 0x1dbc4797c2cd893bULL, 0x19346a65d3224a08ULL, 0x0f5034e49b9af466ULL,
	0xf23c3967a1e0b96eULL, 0xe58b08fa867a4d88ULL, 0xfb2fabc6a7341679ULL, 0x2a75381eb6026946ULL,
	0xc80a3be4c19420acULL, 0x66b1f6c681f2b6dcULL, 0x7cf7036761e93388ULL, 0x25abbbd8a660a4c4ULL,
	0x91ea12ba14fd5198ULL, 0x684950fc4a3cffa9ULL, 0xf826842130f5ad28ULL, 0x3ea988f75301a441ULL,
	0xc978109a695f8c6fULL, 0x1746eb4a0530c3f3ULL, 0x444d6d77b4459995ULL, 0x75952b8c054e5cc7ULL,
	0xa3703f7915f4d6aaULL, 0x66c346202f2647d8ULL, 0xd01469df811d644bULL, 0x77fea47d81a5d71fULL,
	0xc5e9529ef57ca381ULL, 0x6eeeb4b9ce2f881aULL, 0xb6e91a28e8009bd6ULL, 0x4b80be3e9afc3fecULL,
	0x7e3773c526aed2c5ULL, 0x1b4afcb453c9a49dULL, 0xa920bdd7baffb24dULL, 0x7c54699f122d400eULL,
	0xef46c8e14fa94bc8ULL, 0xe0b074ce2952ed5eULL, 0xbea450e1dbd885d5ULL, 0x61b68649320f712cULL,
	0x8a485f7309ccbdd1ULL, 0xbd06320d7d4d1a2dULL, 0x25232973322dbef4ULL, 0x445dc4758c17f770ULL,
	0xdb0434177cc8933cULL, 0xed6fe82175ea059fULL, 0x1efebefdc053db34ULL, 0x4adbe867c65daf99ULL,
	0x3acd71a2a90609dfULL, 0xe5e991856dd04050ULL, 0x1ec69b688157c23cULL, 0x697427f6885cfe4dULL,
	0xd7be7b9b65e1a851ULL, 0xa03d28d522c536ddULL, 0x28399d658fd2b645ULL, 0x49e5b7e17c2641e1ULL,
	0x6f8c3a98700457a4ULL, 0x5078f0a25ebb6778ULL, 0xd13c3ccbc382960fULL, 0x2e003258a7df84b1ULL,
	0x8ad1f39be6296a1cULL, 0xc1eeaa652a5fbfb2ULL, 0x33ee0673fd26f3cbULL, 0x59256173a69d2cccULL,
	0x41ea07aa4e18fc41ULL, 0xd9fc19527c87a51eULL, 0xbdaacb805831ca6fULL, 0x445b652dc916694fULL,
	0xce92a3a7f2172315ULL, 0x1edc282de11b9964ULL, 0xa1823aafe04c314aULL, 0x790a2d94437cf586ULL,
	0x71c447fb93f6e009ULL, 0x8922a56722845276ULL, 0xbf70903b204f5169ULL, 0x2f7a89891ba319feULL,
	0x02a08eb577e2140cULL, 0xed9a4ed4427bdcf4ULL, 0x5253ec44e4323cd1ULL, 0x3e88363c14e9355bULL,
	0xaa66c14277110b8cULL, 0x1ae0391610a23390ULL, 0x2030bd12c93fc2a2ULL, 0x3ee141579555c7abULL,
	0x9214de3a6d6e7d41ULL, 0x3ccdd88607f17efeULL, 0x674f1288f8e11217ULL, 0x5682250f329f93d0ULL,
	0x6cf00b136d2e396eULL, 0x6e4cf86f1014debfULL, 0x5930b1b5bfcc4e83ULL, 0x047069b48aba16b6ULL,
	0x0d4ce4ab69b20793ULL, 0xb24db91a97d0fb9eULL, 0xcdfa50f54e00d01dULL, 0x221b1085368bddb5ULL,
	0xe7e59468b1e3d8d2ULL, 0x53c56563bd122f93ULL, 0xeee8a903e0663f09ULL, 0x61efa662cbbe3d42ULL,
	0x2cf8ddddde6eab2aULL, 0x9bf80ad51435f231ULL, 0x5deadacec9f04973ULL, 0x29275b5d41d29b27ULL,
	0xcfde0f0895ebf14fULL, 0xb9aab96b054905a7ULL, 0xcae80dd9a1c420fdULL, 0x0a63bf2f1673bbc7ULL,
	0x092f6e11958fbc8cULL, 0x672a81e804822fadULL, 0xcac8351560d52517ULL, 0x6f3f7722c8f192f8ULL,
	0xf8ba90ccc2e894b7ULL, 0x2c7557a438ff9f0dULL, 0x894d1d855ae52359ULL, 0x68e122157b743d69ULL,
	0xd87e5570cfb919f3ULL, 0x3f2cdecd95798db9ULL, 0x2121154710c0a2ceULL, 0x3c66a115246dc5b2ULL,
	0xcbedc562294ecb72ULL, 0xba7143c36a280b16ULL, 0x9610c2efd4078b67ULL, 0x6144735d946a4b1eULL,
	0x536f111ed75b3350ULL, 0x0211db8c2041d81bULL, 0xf93cb1000e10413cULL, 0x149dfd3c039e8876ULL,
	0xd479dde46b63155bULL, 0xb66e15e93c837976ULL, 0xdafde43b1f13e038ULL, 0x5fafda1a2e4b0b35ULL,
	0x3600bbdf17197581ULL, 0x3972050bbe3cd2c2ULL, 0x5938906dbdd5be86ULL, 0x34fce5e43f9b860fULL,
	0x75a8a4cd42d14d02ULL, 0x828dabc53441df65ULL, 0x33dcabedd2e131d3ULL, 0x3ebad76fb814d25fULL,
	0xd4906f566f70e10fULL, 0x5d12f7aa51690f5aULL, 0x45adb16e76cefcf2ULL, 0x01f768aead232999ULL,
1370         0x2b6cc77b6248febdULL, 0x3cd30628ec3aaffdULL, 0xce1c0b80d4ef486aULL, 0x4c3bff2ea6f66c23ULL,
1371         0x3f2ec4094aeaeb5fULL, 0x61b19b286e372ca7ULL, 0x5eefa966de2a701dULL, 0x23b20565de55e3efULL,
1372         0xe301ca5279d58557ULL, 0x07b2d4ce27c2874fULL, 0xa532cd8a9dcf1d67ULL, 0x2a52fee23f2bff56ULL,
1373         0x8624efb37cd8663dULL, 0xbbc7ac20ffbd7594ULL, 0x57b85e9c82d37445ULL, 0x7b3052cb86a6ec66ULL,
1374         0x3482f0ad2525e91eULL, 0x2cb68043d28edca0ULL, 0xaf4f6d052e1b003aULL, 0x185f8c2529781b0aULL,
1375         0xaa41de5bd80ce0d6ULL, 0x9407b2416853e9d6ULL, 0x563ec36e357f4c3aULL, 0x4cc4b8dd0e297bceULL,
1376         0xa2fc1a52ffb8730eULL, 0x1811f16e67058e37ULL, 0x10f9a366cddf4ee1ULL, 0x72f4a0c4a0b9f099ULL,
1377         0x8c16c06f663f4ea7ULL, 0x693b3af74e970fbaULL, 0x2102e7f1d69ec345ULL, 0x0ba53cbc968a8089ULL,
1378         0xca3d9dc7fea15537ULL, 0x4c6824bb51536493ULL, 0xb9886314844006b1ULL, 0x40d2a72ab454cc60ULL,
1379         0x5936a1b712570975ULL, 0x91b9d648debda657ULL, 0x3344094bb64330eaULL, 0x006ba10d12ee51d0ULL,
1380         0x19228468f5de5d58ULL, 0x0eb12f4c38cc05b0ULL, 0xa1039f9dd5601990ULL, 0x4502d4ce4fff0e0bULL,
1381         0xeb2054106837c189ULL, 0xd0f6544c6dd3b93cULL, 0x40727064c416d74fULL, 0x6e15c6114b502ef0ULL,
1382         0x4df2a398cfb1a76bULL, 0x11256c7419f2f6b1ULL, 0x4a497962066e6043ULL, 0x705b3aab41355b44ULL,
1383         0x365ef536d797b1d8ULL, 0x00076bd622ddf0dbULL, 0x3bbf33b0e0575a88ULL, 0x3777aa05c8e4ca4dULL,
1384         0x392745c85578db5fULL, 0x6fda4149dbae5ae2ULL, 0xb1f0b00b8adc9867ULL, 0x09963437d36f1da3ULL,
1385         0x7e824e90a5dc3853ULL, 0xccb5f6641f135cbdULL, 0x6736d86c87ce8fccULL, 0x625f3ce26604249fULL,
1386         0xaf8ac8059502f63fULL, 0x0c05e70a2e351469ULL, 0x35292e9c764b6305ULL, 0x1a394360c7e23ac3ULL,
1387         0xd5c6d53251183264ULL, 0x62065abd43c2b74fULL, 0xb5fbf5d03b973f9bULL, 0x13a3da3661206e5eULL,
1388         0xc6bd5837725d94e5ULL, 0x18e30912205016c5ULL, 0x2088ce1570033c68ULL, 0x7fba1f495c837987ULL,
1389         0x5a8c7423f2f9079dULL, 0x1735157b34023fc5ULL, 0xe4f9b49ad2fab351ULL, 0x6691ff72c878e33cULL,
1390         0x122c2adedc5eff3eULL, 0xf8dd4bf1d8956cf4ULL, 0xeb86205d9e9e5bdaULL, 0x049b92b9d975c743ULL,
1391         0xa5379730b0f6c05aULL, 0x72a0ffacc6f3a553ULL, 0xb0032c34b20dcd6dULL, 0x470e9dbc88d5164aULL,
1392         0xb19cf10ca237c047ULL, 0xb65466711f6c81a2ULL, 0xb3321bd16dd80b43ULL, 0x48c14f600c5fbe8eULL,
1393         0x66451c264aa6c803ULL, 0xb66e3904a4fa7da6ULL, 0xd45f19b0b3128395ULL, 0x31602627c3c9bc10ULL,
1394         0x3120dc4832e4e10dULL, 0xeb20c46756c717f7ULL, 0x00f52e3f67280294ULL, 0x566d4fc14730c509ULL,
1395         0x7e3a5d40fd837206ULL, 0xc1e926dc7159547aULL, 0x216730fba68d6095ULL, 0x22e8c3843f69cea7ULL,
1396         0x33d074e8930e4b2bULL, 0xb6e4350e84d15816ULL, 0x5534c26ad6ba2365ULL, 0x7773c12f89f1f3f3ULL,
1397         0x8cba404da57962aaULL, 0x5b9897a81999ce56ULL, 0x508e862f121692fcULL, 0x3a81907fa093c291ULL,
1398         0x0dded0ff4725a510ULL, 0x10d8cc10673fc503ULL, 0x5b9d151c9f1f4e89ULL, 0x32a5c1d5cb09a44cULL,
1399         0x1e0aa442b90541fbULL, 0x5f85eb7cc1b485dbULL, 0xbee595ce8a9df2e5ULL, 0x25e496c722422236ULL,
1400         0x5edf3c46cd0fe5b9ULL, 0x34e75a7ed2a43388ULL, 0xe488de11d761e352ULL, 0x0e878a01a085545cULL,
1401         0xba493c77e021bb04ULL, 0x2b4d1843c7df899aULL, 0x9ea37a487ae80d67ULL, 0x67a9958011e41794ULL,
1402         0x4b58051a6697b065ULL, 0x47e33f7d8d6ba6d4ULL, 0xbb4da8d483ca46c1ULL, 0x68becaa181c2db0dULL,
1403         0x8d8980e90b989aa5ULL, 0xf95eb14a2c93c99bULL, 0x51c6c7c4796e73a2ULL, 0x6e228363b5efb569ULL,
1404         0xc6bbc0b02dd624c8ULL, 0x777eb47dec8170eeULL, 0x3cde15a004cfafa9ULL, 0x1dc6bc087160bf9bULL,
1405         0x2e07e043eec34002ULL, 0x18e9fc677a68dc7fULL, 0xd8da03188bd15b9aULL, 0x48fbc3bb00568253ULL,
1406         0x57547d4cfb654ce1ULL, 0xd3565b82a058e2adULL, 0xf63eaf0bbf154478ULL, 0x47531ef114dfbb18ULL,
1407         0xe1ec630a4278c587ULL, 0x5507d546ca8e83f3ULL, 0x85e135c63adc0c2bULL, 0x0aa7efa85682844eULL,
1408         0x72691ba8b3e1f615ULL, 0x32b4e9701fbe3ffaULL, 0x97b6d92e39bb7868ULL, 0x2cfe53dea02e39e8ULL,
1409         0x687392cd85cd52b0ULL, 0x27ff66c910e29831ULL, 0x97134556a9832d06ULL, 0x269bb0360a84f8a0ULL,
1410         0x706e55457643f85cULL, 0x3734a48c9b597d1bULL, 0x7aee91e8c6efa472ULL, 0x5cd6abc198a9d9e0ULL,
1411         0x0e04de06cb3ce41aULL, 0xd8c6eb893402e138ULL, 0x904659bb686e3772ULL, 0x7215c371746ba8c8ULL,
1412         0xfd12a97eeae4a2d9ULL, 0x9514b7516394f2c5ULL, 0x266fd5809208f294ULL, 0x5c847085619a26b9ULL,
1413         0x52985410fed694eaULL, 0x3c905b934a2ed254ULL, 0x10bb47692d3be467ULL, 0x063b3d2d69e5e9e1ULL,
1414         0x472726eedda57debULL, 0xefb6c4ae10f41891ULL, 0x2b1641917b307614ULL, 0x117c554fc4f45b7cULL,
1415         0xc07cf3118f9d8812ULL, 0x01dbd82050017939ULL, 0xd7e803f4171b2827ULL, 0x1015e87487d225eaULL,
1416         0xc58de3fed23acc4dULL, 0x50db91c294a7be2dULL, 0x0b94d43d1c9cf457ULL, 0x6b1640fa6e37524aULL,
1417         0x692f346c5fda0d09ULL, 0x200b1c59fa4d3151ULL, 0xb8c46f760777a296ULL, 0x4b38395f3ffdfbcfULL,
1418         0x18d25e00be54d671ULL, 0x60d50582bec8aba6ULL, 0x87ad8f263b78b982ULL, 0x50fdf64e9cda0432ULL,
1419         0x90f567aac578dcf0ULL, 0xef1e9b0ef2a3133bULL, 0x0eebba9242d9de71ULL, 0x15473c9bf03101c7ULL,
1420         0x7c77e8ae56b78095ULL, 0xb678e7666e6f078eULL, 0x2da0b9615348ba1fULL, 0x7cf931c1ff733f0bULL,
1421         0x26b357f50a0a366cULL, 0xe9708cf42b87d732ULL, 0xc13aeea5f91cb2c0ULL, 0x35d90c991143bb4cULL,
1422         0x47c1c404a9a0d9dcULL, 0x659e58451972d251ULL, 0x3875a8c473b38c31ULL, 0x1fbd9ed379561f24ULL,
1423         0x11fabc6fd41ec28dULL, 0x7ef8dfe3cd2a2dcaULL, 0x72e73b5d8c404595ULL, 0x6135fa4954b72f27ULL,
1424         0xccfc32a2de24b69cULL, 0x3f55698c1f095d88ULL, 0xbe3350ed5ac3f929ULL, 0x5e9bf806ca477eebULL,
1425         0xe9ce8fb63c309f68ULL, 0x5376f63565e1f9f4ULL, 0xd1afcfb35a6393f1ULL, 0x6632a1ede5623506ULL,
1426         0x0b7d6c390c2ded4cULL, 0x56cb3281df04cb1fULL, 0x66305a1249ecc3c7ULL, 0x5d588b60a38ca72aULL,
1427         0xa6ecbf78e8e5f42dULL, 0x86eeb44b3c8a3eecULL, 0xec219c48fbd21604ULL, 0x1aaf1af517c36731ULL,
1428         0xc306a2836769bde7ULL, 0x208280622b1e2adbULL, 0x8027f51ffbff94a6ULL, 0x76cfa1ce1124f26bULL,
1429         0x18eb00562422abb6ULL, 0xf377c4d58f8c29c3ULL, 0x4dbbc207f531561aULL, 0x0253b7f082128a27ULL,
1430         0x3d1f091cb62c17e0ULL, 0x4860e1abd64628a9ULL, 0x52d17436309d4253ULL, 0x356f97e13efae576ULL,
1431         0xd351e11aa150535bULL, 0x3e6b45bb1dd878ccULL, 0x0c776128bed92c98ULL, 0x1d34ae93032885b8ULL,
1432         0x4ba0488ca85ba4c3ULL, 0x985348c33c9ce6ceULL, 0x66124c6f97bda770ULL, 0x0f81a0290654124aULL,
1433         0x9ed09ca6569b86fdULL, 0x811009fd18af9a2dULL, 0xff08d03f93d8c20aULL, 0x52a148199faef26bULL,
1434         0x3e03f9dc2d8d1b73ULL, 0x4205801873961a70ULL, 0xc0d987f041a35970ULL, 0x07aa1f15a1c0d549ULL,
1435         0xdfd46ce08cd27224ULL, 0x6d0a024f934e4239ULL, 0x808a7a6399897b59ULL, 0x0a4556e9e13d95a2ULL,
1436         0xd21a991fe9c13045ULL, 0x9b0e8548fe7751b8ULL, 0x5da643cb4bf30035ULL, 0x77db28d63940f721ULL,
1437         0xfc5eeb614adc9011ULL, 0x5229419ae8c411ebULL, 0x9ec3e7787d1dcf74ULL, 0x340d053e216e4cb5ULL,
1438         0xcac7af39b48df2b4ULL, 0xc0faec2871a10a94ULL, 0x140a69245ca575edULL, 0x0cf1c37134273a4cULL,
1439         0xc8ee306ac224b8a5ULL, 0x57eaee7ccb4930b0ULL, 0xa1e806bdaacbe74fULL, 0x7d9a62742eeb657dULL,
1440         0x9eb6b6ef546c4830ULL, 0x885cca1fddb36e2eULL, 0xe6b9f383ef0d7105ULL, 0x58654fef9d2e0412ULL,
1441         0xa905c4ffbe0e8e26ULL, 0x942de5df9b31816eULL, 0x497d723f802e88e1ULL, 0x30684dea602f408dULL,
1442         0x21e5a278a3e6cb34ULL, 0xaefb6e6f5b151dc4ULL, 0xb30b8e049d77ca15ULL, 0x28c3c9cf53b98981ULL,
1443         0x287fb721556cdd2aULL, 0x0d317ca897022274ULL, 0x7468c7423a543258ULL, 0x4a7f11464eb5642fULL,
1444         0xa237a4774d193aa6ULL, 0xd865986ea92129a1ULL, 0x24c515ecf87c1a88ULL, 0x604003575f39f5ebULL,
1445         0x47b9f189570a9b27ULL, 0x2b98cede465e4b78ULL, 0x026df551dbb85c20ULL, 0x74fcd91047e21901ULL,
1446         0x13e2a90a23c1bfa3ULL, 0x0cb0074e478519f6ULL, 0x5ff1cbbe3af6cf44ULL, 0x67fe5438be812dbeULL,
1447         0xd13cf64fa40f05b0ULL, 0x054dfb2f32283787ULL, 0x4173915b7f0d2aeaULL, 0x482f144f1f610d4eULL,
1448         0xf6210201b47f8234ULL, 0x5d0ae1929e70b990ULL, 0xdcd7f455b049567cULL, 0x7e93d0f1f0916f01ULL,
1449         0xdd79cbf18a7db4faULL, 0xbe8391bf6f74c62fULL, 0x027145d14b8291bdULL, 0x585a73ea2cbf1705ULL,
1450         0x485ca03e928a0db2ULL, 0x10fc01a5742857e7ULL, 0x2f482edbd6d551a7ULL, 0x0f0433b5048fdb8aULL,
1451         0x60da2e8dd7dc6247ULL, 0x88b4c9d38cd4819aULL, 0x13033ac001f66697ULL, 0x273b24fe3b367d75ULL,
1452         0xc6e8f66a31b3b9d4ULL, 0x281514a494df49d5ULL, 0xd1726fdfc8b23da7ULL, 0x4b3ae7d103dee548ULL,
1453         0xc6256e19ce4b9d7eULL, 0xff5c5cf186e3c61cULL, 0xacc63ca34b8ec145ULL, 0x74621888fee66574ULL,
1454         0x956f409645290a1eULL, 0xef0bf8e3263a962eULL, 0xed6a50eb5ec2647bULL, 0x0694283a9dca7502ULL,
1455         0x769b963643a2dcd1ULL, 0x42b7c8ea09fc5353ULL, 0x4f002aee13397eabULL, 0x63005e2c19b7d63aULL,
1456         0xca6736da63023beaULL, 0x966c7f6db12a99b7ULL, 0xace09390c537c5e1ULL, 0x0b696063a1aa89eeULL,
1457         0xebb03e97288c56e5ULL, 0x432a9f9f938c8be8ULL, 0xa6a5a93d5b717f71ULL, 0x1a5fb4c3e18f9d97ULL,
1458         0x1c94e7ad1c60cdceULL, 0xee202a43fc02c4a0ULL, 0x8dafe4d867c46a20ULL, 0x0a10263c8ac27b58ULL,
1459         0xd0dea9dfe4432a4aULL, 0x856af87bbe9277c5ULL, 0xce8472acc212c71aULL, 0x6f151b6d9bbb1e91ULL,
1460         0x26776c527ceed56aULL, 0x7d211cb7fbf8faecULL, 0x37ae66a6fd4609ccULL, 0x1f81b702d2770c42ULL,
1461         0x2fb0b057eac58392ULL, 0xe1dd89fe29744e9dULL, 0xc964f8eb17beb4f8ULL, 0x29571073c9a2d41eULL,
1462         0xa948a18981c0e254ULL, 0x2df6369b65b22830ULL, 0xa33eb2d75fcfd3c6ULL, 0x078cd6ec4199a01fULL,
1463         0x4a584a41ad900d2fULL, 0x32142b78e2c74c52ULL, 0x68c4e8338431c978ULL, 0x7f69ea9008689fc2ULL,
1464         0x52f2c81e46a38265ULL, 0xfd78072d04a832fdULL, 0x8cd7d5fa25359e94ULL, 0x4de71b7454cc29d2ULL,
1465         0x42eb60ad1eda6ac9ULL, 0x0aad37dfdbc09c3aULL, 0x81004b71e33cc191ULL, 0x44e6be345122803cULL,
1466         0x03fe8388ba1920dbULL, 0xf5d57c32150db008ULL, 0x49c8c4281af60c29ULL, 0x21edb518de701aeeULL,
1467         0x7fb63e418f06dc99ULL, 0xa4460d99c166d7b8ULL, 0x24dd5248ce520a83ULL, 0x5ec3ad712b928358ULL,
1468         0x15022a5fbd17930fULL, 0xa4f64a77d82570e3ULL, 0x12bc8d6915783712ULL, 0x498194c0fc620abbULL,
1469         0x38a2d9d255686c82ULL, 0x785c6bd9193e21f0ULL, 0xe4d5c81ab24a5484ULL, 0x56307860b2e20989ULL,
1470         0x429d55f78b4d74c4ULL, 0x22f1834643350131ULL, 0x1e60c24598c71fffULL, 0x59f2f014979983efULL,
1471         0x46a47d56eb494a44ULL, 0x3e22a854d636a18eULL, 0xb346e15274491c3bULL, 0x2ceafd4e5390cde7ULL,
1472         0xba8a8538be0d6675ULL, 0x4b9074bb50818e23ULL, 0xcbdab89085d304c3ULL, 0x61a24fe0e56192c4ULL,
1473         0xcb7615e6db525bcbULL, 0xdd7d8c35a567e4caULL, 0xe6b4153acafcdd69ULL, 0x2d668e097f3c9766ULL,
1474         0xa57e7e265ce55ef0ULL, 0x5d9f4e527cd4b967ULL, 0xfbc83606492fd1e5ULL, 0x090d52beb7c3f7aeULL,
1475         0x09b9515a1e7b4d7cULL, 0x1f266a2599da44c0ULL, 0xa1c49548e2c55504ULL, 0x7ef04287126f15ccULL,
1476         0xfed1659dbd30ef15ULL, 0x8b4ab9eec4e0277bULL, 0x884d6236a5df3291ULL, 0x1fd96ea6bf5cf788ULL,
1477         0x42a161981f190d9aULL, 0x61d849507e6052c1ULL, 0x9fe113bf285a2cd5ULL, 0x7c22d676dbad85d8ULL,
1478         0x82e770ed2bfbd27dULL, 0x4c05b2ece996f5a5ULL, 0xcd40a9c2b0900150ULL, 0x5895319213d9bf64ULL,
1479         0xe7cc5d703fea2e08ULL, 0xb50c491258e2188cULL, 0xcce30baa48205bf0ULL, 0x537c659ccfa32d62ULL,
1480         0x37b6623a98cfc088ULL, 0xfe9bed1fa4d6aca4ULL, 0x04d29b8e56a8d1b0ULL, 0x725f71c40b519575ULL,
1481         0x28c7f89cd0339ce6ULL, 0x8367b14469ddc18bULL, 0x883ada83a6a1652cULL, 0x585f1974034d6c17ULL,
1482         0x89cfb266f1b19188ULL, 0xe63b4863e7c35217ULL, 0xd88c9da6b4c0526aULL, 0x3e035c9df0954635ULL,
1483         0xdd9d5412fb45de9dULL, 0xdd684532e4cff40dULL, 0x4b5c999b151d671cULL, 0x2d8c2cc811e7f690ULL,
1484         0x7f54be1d90055d40ULL, 0xa464c5df464aaf40ULL, 0x33979624f0e917beULL, 0x2c018dc527356b30ULL,
1485         0xa5415024e330b3d4ULL, 0x73ff3d96691652d3ULL, 0x94ec42c4ef9b59f1ULL, 0x0747201618d08e5aULL,
1486         0x4d6ca48aca411c53ULL, 0x66415f2fcfa66119ULL, 0x9c4dd40051e227ffULL, 0x59810bc09a02f7ebULL,
1487         0x2a7eb171b3dc101dULL, 0x441c5ab99ffef68eULL, 0x32025c9b93b359eaULL, 0x5e8ce0a71e9d112fULL,
1488         0xbfcccb92429503fdULL, 0xd271ba752f095d55ULL, 0x345ead5e972d091eULL, 0x18c8df11a83103baULL,
1489         0x90cd949a9aed0f4cULL, 0xc5d1f4cb6660e37eULL, 0xb8cac52d56c52e0bULL, 0x6e42e400c5808e0dULL,
1490         0xa3b46966eeaefd23ULL, 0x0c4f1f0be39ecdcaULL, 0x189dc8c9d683a51dULL, 0x51f27f054c09351bULL,
1491         0x4c487ccd2a320682ULL, 0x587ea95bb3df1c96ULL, 0xc8ccf79e555cb8e8ULL, 0x547dc829a206d73dULL,
1492         0xb822a6cd80c39b06ULL, 0xe96d54732000d4c6ULL, 0x28535b6f91463b4dULL, 0x228f4660e2486e1dULL,
1493         0x98799538de8d3abfULL, 0x8cd8330045ebca6eULL, 0x79952a008221e738ULL, 0x4322e1a7535cd2bbULL,
1494         0xb114c11819d1801cULL, 0x2016e4d84f3f5ec7ULL, 0xdd0e2df409260f4cULL, 0x5ec362c0ae5f7266ULL,
1495         0xc0462b18b8b2b4eeULL, 0x7cc8d950274d1afbULL, 0xf25f7105436b02d2ULL, 0x43bbf8dcbff9ccd3ULL,
1496         0xb6ad1767a039e9dfULL, 0xb0714da8f69d3583ULL, 0x5e55fa18b42931f5ULL, 0x4ed5558f33c60961ULL,
1497         0x1fe37901c647a5ddULL, 0x593ddf1f8081d357ULL, 0x0249a4fd813fd7a6ULL, 0x69acca274e9caf61ULL,
1498         0x047ba3ea330721c9ULL, 0x83423fc20e7e1ea0ULL, 0x1df4c0af01314a60ULL, 0x09a62dab89289527ULL,
1499         0xa5b325a49cc6cb00ULL, 0xe94b5dc654b56cb6ULL, 0x3be28779adc994a0ULL, 0x4296e8f8ba3a4aadULL,
1500         0x328689761e451eabULL, 0x2e4d598bff59594aULL, 0x49b96853d7a7084aULL, 0x4980a319601420a8ULL,
1501         0x9565b9e12f552c42ULL, 0x8a5318db7100fe96ULL, 0x05c90b4d43add0d7ULL, 0x538b4cd66a5d4edaULL,
1502         0xf4e94fc3e89f039fULL, 0x592c9af26f618045ULL, 0x08a36eb5fd4b9550ULL, 0x25fffaf6c2ed1419ULL,
1503         0x34434459cc79d354ULL, 0xeeecbfb4b1d5476bULL, 0xddeb34a061615d99ULL, 0x5129cecceb64b773ULL,
1504         0xee43215894993520ULL, 0x772f9c7cf14c0b3bULL, 0xd2e2fce306bedad5ULL, 0x715f42b546f06a97ULL,
1505         0x434ecdceda5b5f1aULL, 0x0da17115a49741a9ULL, 0x680bd77c73edad2eULL, 0x487c02354edd9041ULL,
1506         0xb8efeff3a70ed9c4ULL, 0x56a32aa3e857e302ULL, 0xdf3a68bd48a2a5a0ULL, 0x07f650b73176c444ULL,
1507         0xe38b9b1626e0ccb1ULL, 0x79e053c18b09fb36ULL, 0x56d90319c9f94964ULL, 0x1ca941e7ac9ff5c4ULL,
1508         0x49c4df29162fa0bbULL, 0x8488cf3282b33305ULL, 0x95dfda14cabb437dULL, 0x3391f78264d5ad86ULL,
1509         0x729ae06ae2b5095dULL, 0xd58a58d73259a946ULL, 0xe9834262d13921edULL, 0x27fedafaa54bb592ULL,
1510         0xa99dc5b829ad48bbULL, 0x5f025742499ee260ULL, 0x802c8ecd5d7513fdULL, 0x78ceb3ef3f6dd938ULL,
1511         0xc342f44f8a135d94ULL, 0x7b9edb44828cdda3ULL, 0x9436d11a0537cfe7ULL, 0x5064b164ec1ab4c8ULL,
1512         0x7020eccfd37eb2fcULL, 0x1f31ea3ed90d25fcULL, 0x1b930d7bdfa1bb34ULL, 0x5344467a48113044ULL,
1513         0x70073170f25e6dfbULL, 0xe385dc1a50114cc8ULL, 0x2348698ac8fc4f00ULL, 0x2a77a55284dd40d8ULL,
1514         0xfe06afe0c98c6ce4ULL, 0xc235df96dddfd6e4ULL, 0x1428d01e33bf1ed3ULL, 0x785768ec9300bdafULL,
1515         0x9702e57a91deb63bULL, 0x61bdb8bfe5ce8b80ULL, 0x645b426f3d1d58acULL, 0x4804a82227a557bcULL,
1516         0x8e57048ab44d2601ULL, 0x68d6501a4b3a6935ULL, 0xc39c9ec3f9e1c293ULL, 0x4172f257d4de63e2ULL,
1517         0xd368b450330c6401ULL, 0x040d3017418f2391ULL, 0x2c34bb6090b7d90dULL, 0x16f649228fdfd51fULL,
1518         0xbea6818e2b928ef5ULL, 0xe28ccf91cdc11e72ULL, 0x594aaa68e77a36cdULL, 0x313034806c7ffd0fULL,
1519         0x8a9d27ac2249bd65ULL, 0x19a3b464018e9512ULL, 0xc26ccff352b37ec7ULL, 0x056f68341d797b21ULL,
1520         0x5e79d6757efd2327ULL, 0xfabdbcb6553afe15ULL, 0xd3e7222c6eaf5a60ULL, 0x7046c76d4dae743bULL,
1521         0x660be872b18d4a55ULL, 0x19992518574e1496ULL, 0xc103053a302bdcbbULL, 0x3ed8e9800b218e8eULL,
1522         0x7b0b9239fa75e03eULL, 0xefe9fb684633c083ULL, 0x98a35fbe391a7793ULL, 0x6065510fe2d0fe34ULL,
1523         0x55cb668548abad0cULL, 0xb4584548da87e527ULL, 0x2c43ecea0107c1ddULL, 0x526028809372de35ULL,
1524         0x3415c56af9213b1fULL, 0x5bee1a4d017e98dbULL, 0x13f6b105b5cf709bULL, 0x5ff20e3482b29ab6ULL,
1525         0x0aa29c75cc2e6c90ULL, 0xfc7d73ca3a70e206ULL, 0x899fc38fc4b5c515ULL, 0x250386b124ffc207ULL,
1526         0x54ea28d5ae3d2b56ULL, 0x9913149dd6de60ceULL, 0x16694fc58f06d6c1ULL, 0x46b23975eb018fc7ULL,
1527         0x470a6a0fb4b7b4e2ULL, 0x5d92475a8f7253deULL, 0xabeee5b52fbd3adbULL, 0x7fa20801a0806968ULL,
1528         0x76f3faf19f7714d2ULL, 0xb3e840c12f4660c3ULL, 0x0fb4cd8df212744eULL, 0x4b065a251d3a2dd2ULL,
1529         0x5cebde383d77cd4aULL, 0x6adf39df882c9cb1ULL, 0xa2dd242eb09af759ULL, 0x3147c0e50e5f6422ULL,
1530         0x164ca5101d1350dbULL, 0xf8d13479c33fc962ULL, 0xe640ce4d13e5da08ULL, 0x4bdee0c45061f8baULL,
1531         0xd7c46dc1a4edb1c9ULL, 0x5514d7b6437fd98aULL, 0x58942f6bb2a1c00bULL, 0x2dffb2ab1d70710eULL,
1532         0xccdfcf2fc18b6d68ULL, 0xa8ebcba8b7806167ULL, 0x980697f95e2937e3ULL, 0x02fbba1cd0126e8cULL
1533 };
1534
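/* table_ladder above is consumed as &table_ladder[4 * k] in the fixed-base
 * ladder below, i.e. one 4-limb field element per processed scalar bit
 * (252 steps in total), evidently precomputed for the curve25519 base
 * point */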
static void curve25519_ever64_base(u8 *out, const u8 *priv)
{
	u64 swap = 1;
	int i, j, k;
	u64 tmp[16 + 32 + 4];
	u64 *x1 = &tmp[0];
	u64 *z1 = &tmp[4];
	u64 *x2 = &tmp[8];
	u64 *z2 = &tmp[12];
	u64 *xz1 = &tmp[0];
	u64 *xz2 = &tmp[8];
	u64 *a = &tmp[0 + 16];
	u64 *b = &tmp[4 + 16];
	u64 *c = &tmp[8 + 16];
	u64 *ab = &tmp[0 + 16];
	u64 *abcd = &tmp[0 + 16];
	u64 *ef = &tmp[16 + 16];
	u64 *efgh = &tmp[16 + 16];
	u64 *key = &tmp[0 + 16 + 32];

	/* Clamp the scalar per RFC 7748: clear bits 0-2 and bit 255,
	 * set bit 254 */
	memcpy(key, priv, 32);
	((u8 *)key)[0] &= 248;
	((u8 *)key)[31] = (((u8 *)key)[31] & 127) | 64;

	x1[0] = 1, x1[1] = x1[2] = x1[3] = 0;
	z1[0] = 1, z1[1] = z1[2] = z1[3] = 0;
	z2[0] = 1, z2[1] = z2[2] = z2[3] = 0;
	memcpy(x2, p_minus_s, sizeof(p_minus_s));

	/* Process scalar bits 3..254 (clamping fixed the rest), consuming
	 * one precomputed table element per bit */
	j = 3;
	for (i = 0; i < 4; ++i) {
		while (j < (const int[]){ 64, 64, 64, 63 }[i]) {
			u64 bit = (key[i] >> j) & 1;
			k = (64 * i + j - 3);
			/* Constant-time conditional swap, driven by the XOR
			 * of successive scalar bits, as in a Montgomery
			 * ladder */
			swap = swap ^ bit;
			cswap2(swap, xz1, xz2);
			swap = bit;
			fsub(b, x1, z1);
			fadd(a, x1, z1);
			fmul(c, &table_ladder[4 * k], b, ef);
			fsub(b, a, c);
			fadd(a, a, c);
			fsqr2(ab, ab, efgh);
			fmul2(xz1, xz2, ab, efgh);
			++j;
		}
		j = 0;
	}

	/* Account for the three cleared low bits with three doublings,
	 * then serialize x/z to affine form */
	point_double(xz1, abcd, efgh);
	point_double(xz1, abcd, efgh);
	point_double(xz1, abcd, efgh);
	encode_point(out, xz1);

	/* Wipe key material and intermediate state from the stack */
	memzero_explicit(tmp, sizeof(tmp));
}
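
/* A minimal cross-check sketch, not part of the original file: assuming
 * get_random_bytes() from <linux/random.h> (not currently included here)
 * and a CPU with BMI2/ADX (see curve25519_use_bmi2_adx below), the
 * fixed-base path can be compared against the generic implementation,
 * since both clamp the scalar internally */
static int __maybe_unused curve25519_base_selftest(void)
{
	u8 secret[CURVE25519_KEY_SIZE];
	u8 fast[CURVE25519_KEY_SIZE];
	u8 ref[CURVE25519_KEY_SIZE];

	get_random_bytes(secret, sizeof(secret));
	curve25519_ever64_base(fast, secret);
	curve25519_generic(ref, secret, curve25519_base_point);
	/* Both paths must agree on every scalar */
	return crypto_memneq(fast, ref, CURVE25519_KEY_SIZE) ? -EINVAL : 0;
}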

/* Enabled at module init if the CPU supports both BMI2 and ADX */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2_adx);

void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
		     const u8 secret[CURVE25519_KEY_SIZE],
		     const u8 basepoint[CURVE25519_KEY_SIZE])
{
	if (static_branch_likely(&curve25519_use_bmi2_adx))
		curve25519_ever64(mypublic, secret, basepoint);
	else
		curve25519_generic(mypublic, secret, basepoint);
}
EXPORT_SYMBOL(curve25519_arch);

void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
			  const u8 secret[CURVE25519_KEY_SIZE])
{
	if (static_branch_likely(&curve25519_use_bmi2_adx))
		curve25519_ever64_base(pub, secret);
	else
		curve25519_generic(pub, secret, curve25519_base_point);
}
EXPORT_SYMBOL(curve25519_base_arch);
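
/* A usage sketch, not from this file: callers normally reach these entry
 * points through the curve25519() and curve25519_generate_public() helpers
 * in <crypto/curve25519.h>. A full Diffie-Hellman exchange could look like
 * the following, with curve25519_dh_example() being a hypothetical name */
static bool __maybe_unused curve25519_dh_example(u8 shared[CURVE25519_KEY_SIZE],
						 const u8 peer_public[CURVE25519_KEY_SIZE])
{
	u8 secret[CURVE25519_KEY_SIZE];
	u8 pub[CURVE25519_KEY_SIZE];

	/* Draws random bytes and clamps them */
	curve25519_generate_secret(secret);
	/* Fails on degenerate all-zero input or output */
	if (!curve25519_generate_public(pub, secret))
		return false;
	/* 'pub' would be sent to the peer; derive the shared point */
	return curve25519(shared, secret, peer_public);
}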

static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
				 unsigned int len)
{
	u8 *secret = kpp_tfm_ctx(tfm);

	/* A zero-length buffer requests a freshly generated secret;
	 * otherwise the key must be full-size and non-zero */
	if (!len)
		curve25519_generate_secret(secret);
	else if (len == CURVE25519_KEY_SIZE &&
		 crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
		memcpy(secret, buf, CURVE25519_KEY_SIZE);
	else
		return -EINVAL;
	return 0;
}

static int curve25519_generate_public_key(struct kpp_request *req)
{
	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
	const u8 *secret = kpp_tfm_ctx(tfm);
	u8 buf[CURVE25519_KEY_SIZE];
	int copied, nbytes;

	if (req->src)
		return -EINVAL;

	curve25519_base_arch(buf, secret);

	/* The caller may ask for fewer bytes than a full key */
	nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
	copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
								nbytes),
				     buf, nbytes);
	if (copied != nbytes)
		return -EINVAL;
	return 0;
}

static int curve25519_compute_shared_secret(struct kpp_request *req)
{
	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
	const u8 *secret = kpp_tfm_ctx(tfm);
	u8 public_key[CURVE25519_KEY_SIZE];
	u8 buf[CURVE25519_KEY_SIZE];
	int copied, nbytes;

	if (!req->src)
		return -EINVAL;

	copied = sg_copy_to_buffer(req->src,
				   sg_nents_for_len(req->src,
						    CURVE25519_KEY_SIZE),
				   public_key, CURVE25519_KEY_SIZE);
	if (copied != CURVE25519_KEY_SIZE)
		return -EINVAL;

	curve25519_arch(buf, secret, public_key);

	/* The caller may ask for fewer bytes than a full key */
	nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
	copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
								nbytes),
				     buf, nbytes);
	if (copied != nbytes)
		return -EINVAL;
	return 0;
}

static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
{
	return CURVE25519_KEY_SIZE;
}

static struct kpp_alg curve25519_alg = {
	.base.cra_name		= "curve25519",
	.base.cra_driver_name	= "curve25519-x86",
	.base.cra_priority	= 200,
	.base.cra_module	= THIS_MODULE,
	.base.cra_ctxsize	= CURVE25519_KEY_SIZE,

	.set_secret		= curve25519_set_secret,
	.generate_public_key	= curve25519_generate_public_key,
	.compute_shared_secret	= curve25519_compute_shared_secret,
	.max_size		= curve25519_max_size,
};
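
/* A consumer-side sketch with a hypothetical helper name: once the module
 * has registered "curve25519", a kernel user could drive it through the
 * generic kpp API along these lines. The output buffer must be addressable
 * by a scatterlist, so no stack memory in real code */
static int __maybe_unused curve25519_kpp_example(u8 *pub /* 32 bytes, heap */,
						 const u8 key[CURVE25519_KEY_SIZE])
{
	struct crypto_kpp *tfm;
	struct kpp_request *req;
	struct scatterlist dst;
	int err;

	tfm = crypto_alloc_kpp("curve25519", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_kpp_set_secret(tfm, key, CURVE25519_KEY_SIZE);
	if (err)
		goto out_free_tfm;

	req = kpp_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_free_tfm;
	}

	/* No src scatterlist means "generate my public key" */
	kpp_request_set_input(req, NULL, 0);
	sg_init_one(&dst, pub, CURVE25519_KEY_SIZE);
	kpp_request_set_output(req, &dst, CURVE25519_KEY_SIZE);
	err = crypto_kpp_generate_public_key(req);

	kpp_request_free(req);
out_free_tfm:
	crypto_free_kpp(tfm);
	return err;
}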
static int __init curve25519_mod_init(void)
{
	/* The fast paths need both BMI2 (mulx) and ADX (adcx/adox); without
	 * them, leave the static key off so the library calls above fall
	 * back to curve25519_generic(), and don't register the kpp
	 * algorithm at all */
	if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
		static_branch_enable(&curve25519_use_bmi2_adx);
	else
		return 0;
	return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
		crypto_register_kpp(&curve25519_alg) : 0;
}

static void __exit curve25519_mod_exit(void)
{
	if (IS_REACHABLE(CONFIG_CRYPTO_KPP) &&
	    static_branch_likely(&curve25519_use_bmi2_adx))
		crypto_unregister_kpp(&curve25519_alg);
}

module_init(curve25519_mod_init);
module_exit(curve25519_mod_exit);

MODULE_ALIAS_CRYPTO("curve25519");
MODULE_ALIAS_CRYPTO("curve25519-x86");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");