// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright (C) 2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation
 */

#include <crypto/curve25519.h>
#include <crypto/internal/kpp.h>

#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>

#include <asm/cpufeature.h>
#include <asm/processor.h>

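/* Returns the all-ones mask when a == b and 0 otherwise, without branching:
 * x = a ^ b is zero exactly on equality, and (x | -x) has its top bit set
 * exactly when x is nonzero. */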
static __always_inline u64 eq_mask(u64 a, u64 b)
{
        u64 x = a ^ b;
        u64 minus_x = ~x + (u64)1U;
        u64 x_or_minus_x = x | minus_x;
        u64 xnx = x_or_minus_x >> (u32)63U;
        return xnx - (u64)1U;
}

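/* Returns the all-ones mask when a >= b and 0 otherwise: the borrow of the
 * unsigned subtraction a - b is recovered branch-free and widened into a
 * mask. */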
static __always_inline u64 gte_mask(u64 a, u64 b)
{
        u64 x = a;
        u64 y = b;
        u64 x_xor_y = x ^ y;
        u64 x_sub_y = x - y;
        u64 x_sub_y_xor_y = x_sub_y ^ y;
        u64 q = x_xor_y | x_sub_y_xor_y;
        u64 x_xor_q = x ^ q;
        u64 x_xor_q_ = x_xor_q >> (u32)63U;
        return x_xor_q_ - (u64)1U;
}

/* Computes the addition of the four-limb value f1 and the scalar f2,
 * storing the result in out and returning the carry (if any) */
static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
{
        u64 carry_r;

        asm volatile(
                /* Clear registers to propagate the carry bit */
                "  xor %%r8, %%r8;"
                "  xor %%r9, %%r9;"
                "  xor %%r10, %%r10;"
                "  xor %%r11, %%r11;"
                "  xor %1, %1;"

                /* Begin addition chain */
                "  addq 0(%3), %0;"
                "  movq %0, 0(%2);"
                "  adcxq 8(%3), %%r8;"
                "  movq %%r8, 8(%2);"
                "  adcxq 16(%3), %%r9;"
                "  movq %%r9, 16(%2);"
                "  adcxq 24(%3), %%r10;"
                "  movq %%r10, 24(%2);"

                /* Return the carry bit in a register */
                "  adcx %%r11, %1;"
        : "+&r" (f2), "=&r" (carry_r)
        : "r" (out), "r" (f1)
        : "%r8", "%r9", "%r10", "%r11", "memory", "cc"
        );

        return carry_r;
}

/* Computes the field addition of two field elements */
static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
{
        asm volatile(
                /* Compute the raw addition of f1 + f2 */
                "  movq 0(%0), %%r8;"
                "  addq 0(%2), %%r8;"
                "  movq 8(%0), %%r9;"
                "  adcxq 8(%2), %%r9;"
                "  movq 16(%0), %%r10;"
                "  adcxq 16(%2), %%r10;"
                "  movq 24(%0), %%r11;"
                "  adcxq 24(%2), %%r11;"

                /* Wrap the result back into the field */
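                /* Since 2^256 == 38 (mod 2^255 - 19), any carry out of the
                 * top limb is reabsorbed by adding carry * 38 back in below. */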

                /* Step 1: Compute carry*38 */
                "  mov $0, %%rax;"
                "  mov $38, %0;"
                "  cmovc %0, %%rax;"

                /* Step 2: Add carry*38 to the original sum */
                "  xor %%rcx, %%rcx;"
                "  add %%rax, %%r8;"
                "  adcx %%rcx, %%r9;"
                "  movq %%r9, 8(%1);"
                "  adcx %%rcx, %%r10;"
                "  movq %%r10, 16(%1);"
                "  adcx %%rcx, %%r11;"
                "  movq %%r11, 24(%1);"

                /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
                "  mov $0, %%rax;"
                "  cmovc %0, %%rax;"
                "  add %%rax, %%r8;"
                "  movq %%r8, 0(%1);"
        : "+&r" (f2)
        : "r" (out), "r" (f1)
        : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
        );
}

/* Computes the field subtraction of two field elements */
static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
{
        asm volatile(
                /* Compute the raw subtraction of f1-f2 */
                "  movq 0(%1), %%r8;"
                "  subq 0(%2), %%r8;"
                "  movq 8(%1), %%r9;"
                "  sbbq 8(%2), %%r9;"
                "  movq 16(%1), %%r10;"
                "  sbbq 16(%2), %%r10;"
                "  movq 24(%1), %%r11;"
                "  sbbq 24(%2), %%r11;"

                /* Wrap the result back into the field */

                /* Step 1: Compute carry*38 */
                "  mov $0, %%rax;"
                "  mov $38, %%rcx;"
                "  cmovc %%rcx, %%rax;"

                /* Step 2: Subtract carry*38 from the original difference */
                "  sub %%rax, %%r8;"
                "  sbb $0, %%r9;"
                "  sbb $0, %%r10;"
                "  sbb $0, %%r11;"

                /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
                "  mov $0, %%rax;"
                "  cmovc %%rcx, %%rax;"
                "  sub %%rax, %%r8;"

                /* Store the result */
                "  movq %%r8, 0(%0);"
                "  movq %%r9, 8(%0);"
                "  movq %%r10, 16(%0);"
                "  movq %%r11, 24(%0);"
        :
        : "r" (out), "r" (f1), "r" (f2)
        : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
        );
}

/* Computes a field multiplication: out <- f1 * f2
 * Uses the 8-element buffer tmp for intermediate results */
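/* The full 512-bit product is accumulated in tmp and then reduced as
 * out <- tmp_lo + 38 * tmp_hi, with one final carry fold. */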
static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
        asm volatile(
                /* Compute the raw multiplication: tmp <- src1 * src2 */

                /* Compute src1[0] * src2 */
                "  movq 0(%1), %%rdx;"
                "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
                "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
                "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
                "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
                                                   "  adox %%rdx, %%rax;"
                /* Compute src1[1] * src2 */
                "  movq 8(%1), %%rdx;"
                "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
                "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
                "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
                "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
                                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
                /* Compute src1[2] * src2 */
                "  movq 16(%1), %%rdx;"
                "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
                "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
                "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
                "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
                                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
                /* Compute src1[3] * src2 */
                "  movq 24(%1), %%rdx;"
                "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
                "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
                "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
                "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
                                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 56(%0);"
                /* Line up pointers */
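                /* After the multiply, repoint %1 at tmp and %0 at out so the
                 * reduction below can reuse the same operand slots. */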
                "  mov %0, %1;"
                "  mov %2, %0;"

                /* Wrap the result back into the field */

                /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
                "  mov $38, %%rdx;"
                "  mulxq 32(%1), %%r8, %%r13;"
                "  xor %3, %3;"
                "  adoxq 0(%1), %%r8;"
                "  mulxq 40(%1), %%r9, %%rbx;"
                "  adcx %%r13, %%r9;"
                "  adoxq 8(%1), %%r9;"
                "  mulxq 48(%1), %%r10, %%r13;"
                "  adcx %%rbx, %%r10;"
                "  adoxq 16(%1), %%r10;"
                "  mulxq 56(%1), %%r11, %%rax;"
                "  adcx %%r13, %%r11;"
                "  adoxq 24(%1), %%r11;"
                "  adcx %3, %%rax;"
                "  adox %3, %%rax;"
                "  imul %%rdx, %%rax;"

                /* Step 2: Fold the carry back into dst */
                "  add %%rax, %%r8;"
                "  adcx %3, %%r9;"
                "  movq %%r9, 8(%0);"
                "  adcx %3, %%r10;"
                "  movq %%r10, 16(%0);"
                "  adcx %3, %%r11;"
                "  movq %%r11, 24(%0);"

                /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
                "  mov $0, %%rax;"
                "  cmovc %%rdx, %%rax;"
                "  add %%rax, %%r8;"
                "  movq %%r8, 0(%0);"
        : "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
        :
        : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
        );
}

/* Computes two field multiplications:
 * out[0] <- f1[0] * f2[0]
 * out[1] <- f1[1] * f2[1]
 * Uses the 16-element buffer tmp for intermediate results. */
static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
        asm volatile(
                /* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */

                /* Compute src1[0] * src2 */
                "  movq 0(%1), %%rdx;"
                "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
                "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
                "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
                "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
                                                   "  adox %%rdx, %%rax;"
                /* Compute src1[1] * src2 */
                "  movq 8(%1), %%rdx;"
                "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
                "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
                "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
                "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
                                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
                /* Compute src1[2] * src2 */
                "  movq 16(%1), %%rdx;"
                "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
                "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
                "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
                "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
                                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
                /* Compute src1[3] * src2 */
                "  movq 24(%1), %%rdx;"
                "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
                "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
                "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
                "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
                                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 56(%0);"

                /* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */

                /* Compute src1[0] * src2 */
                "  movq 32(%1), %%rdx;"
                "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 64(%0);"
                "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
                "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
                "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
                                                   "  adox %%rdx, %%rax;"
                /* Compute src1[1] * src2 */
                "  movq 40(%1), %%rdx;"
                "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 72(%0), %%r8;"    "  movq %%r8, 72(%0);"
                "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
                "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
                "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
                                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
                /* Compute src1[2] * src2 */
                "  movq 48(%1), %%rdx;"
                "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 80(%0), %%r8;"    "  movq %%r8, 80(%0);"
                "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
                "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
                "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
                                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
                /* Compute src1[3] * src2 */
                "  movq 56(%1), %%rdx;"
                "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 88(%0), %%r8;"    "  movq %%r8, 88(%0);"
                "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
                "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 104(%0);"    "  mov $0, %%r8;"
                "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 112(%0);"    "  mov $0, %%rax;"
                                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 120(%0);"
                /* Line up pointers */
                "  mov %0, %1;"
                "  mov %2, %0;"

                /* Wrap the results back into the field */

                /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
                "  mov $38, %%rdx;"
                "  mulxq 32(%1), %%r8, %%r13;"
                "  xor %3, %3;"
                "  adoxq 0(%1), %%r8;"
                "  mulxq 40(%1), %%r9, %%rbx;"
                "  adcx %%r13, %%r9;"
                "  adoxq 8(%1), %%r9;"
                "  mulxq 48(%1), %%r10, %%r13;"
                "  adcx %%rbx, %%r10;"
                "  adoxq 16(%1), %%r10;"
                "  mulxq 56(%1), %%r11, %%rax;"
                "  adcx %%r13, %%r11;"
                "  adoxq 24(%1), %%r11;"
                "  adcx %3, %%rax;"
                "  adox %3, %%rax;"
                "  imul %%rdx, %%rax;"

                /* Step 2: Fold the carry back into dst */
                "  add %%rax, %%r8;"
                "  adcx %3, %%r9;"
                "  movq %%r9, 8(%0);"
                "  adcx %3, %%r10;"
                "  movq %%r10, 16(%0);"
                "  adcx %3, %%r11;"
                "  movq %%r11, 24(%0);"

                /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
                "  mov $0, %%rax;"
                "  cmovc %%rdx, %%rax;"
                "  add %%rax, %%r8;"
                "  movq %%r8, 0(%0);"

                /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
                "  mov $38, %%rdx;"
                "  mulxq 96(%1), %%r8, %%r13;"
                "  xor %3, %3;"
                "  adoxq 64(%1), %%r8;"
                "  mulxq 104(%1), %%r9, %%rbx;"
                "  adcx %%r13, %%r9;"
                "  adoxq 72(%1), %%r9;"
                "  mulxq 112(%1), %%r10, %%r13;"
                "  adcx %%rbx, %%r10;"
                "  adoxq 80(%1), %%r10;"
                "  mulxq 120(%1), %%r11, %%rax;"
                "  adcx %%r13, %%r11;"
                "  adoxq 88(%1), %%r11;"
                "  adcx %3, %%rax;"
                "  adox %3, %%rax;"
                "  imul %%rdx, %%rax;"

                /* Step 2: Fold the carry back into dst */
                "  add %%rax, %%r8;"
                "  adcx %3, %%r9;"
                "  movq %%r9, 40(%0);"
                "  adcx %3, %%r10;"
                "  movq %%r10, 48(%0);"
                "  adcx %3, %%r11;"
                "  movq %%r11, 56(%0);"

                /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
                "  mov $0, %%rax;"
                "  cmovc %%rdx, %%rax;"
                "  add %%rax, %%r8;"
                "  movq %%r8, 32(%0);"
        : "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
        :
        : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
        );
}

/* Computes the field multiplication of the four-limb field element f1 by the scalar f2 */
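/* (The reduction below folds the 64-bit overflow limb with a single imul
 * by 38, which is only safe for small scalars; the only caller passes
 * 121665.) */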
static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
{
        register u64 f2_r asm("rdx") = f2;

        asm volatile(
                /* Compute the raw multiplication of f1*f2 */
                "  mulxq 0(%2), %%r8, %%rcx;"      /* f1[0]*f2 */
                "  mulxq 8(%2), %%r9, %%rbx;"      /* f1[1]*f2 */
                "  add %%rcx, %%r9;"
                "  mov $0, %%rcx;"
                "  mulxq 16(%2), %%r10, %%r13;"    /* f1[2]*f2 */
                "  adcx %%rbx, %%r10;"
                "  mulxq 24(%2), %%r11, %%rax;"    /* f1[3]*f2 */
                "  adcx %%r13, %%r11;"
                "  adcx %%rcx, %%rax;"

                /* Wrap the result back into the field */

                /* Step 1: Compute carry*38 */
                "  mov $38, %%rdx;"
                "  imul %%rdx, %%rax;"

                /* Step 2: Fold the carry back into dst */
                "  add %%rax, %%r8;"
                "  adcx %%rcx, %%r9;"
                "  movq %%r9, 8(%1);"
                "  adcx %%rcx, %%r10;"
                "  movq %%r10, 16(%1);"
                "  adcx %%rcx, %%r11;"
                "  movq %%r11, 24(%1);"

                /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
                "  mov $0, %%rax;"
                "  cmovc %%rdx, %%rax;"
                "  add %%rax, %%r8;"
                "  movq %%r8, 0(%1);"
        : "+&r" (f2_r)
        : "r" (out), "r" (f1)
        : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "memory", "cc"
        );
}

/* Conditionally swaps the two eight-limb values p1 and p2 in constant time:
 * when bit is 1 their contents are exchanged, when bit is 0 both are untouched */
static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
{
        asm volatile(
                /* Invert the polarity of bit to match cmov expectations */
                "  add $18446744073709551615, %0;"
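                /* bit + (2^64 - 1) sets CF exactly when bit == 1, so each
                 * cmovc below fires only when a swap was requested */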

                /* cswap p1[0], p2[0] */
                "  movq 0(%1), %%r8;"
                "  movq 0(%2), %%r9;"
                "  mov %%r8, %%r10;"
                "  cmovc %%r9, %%r8;"
                "  cmovc %%r10, %%r9;"
                "  movq %%r8, 0(%1);"
                "  movq %%r9, 0(%2);"

                /* cswap p1[1], p2[1] */
                "  movq 8(%1), %%r8;"
                "  movq 8(%2), %%r9;"
                "  mov %%r8, %%r10;"
                "  cmovc %%r9, %%r8;"
                "  cmovc %%r10, %%r9;"
                "  movq %%r8, 8(%1);"
                "  movq %%r9, 8(%2);"

                /* cswap p1[2], p2[2] */
                "  movq 16(%1), %%r8;"
                "  movq 16(%2), %%r9;"
                "  mov %%r8, %%r10;"
                "  cmovc %%r9, %%r8;"
                "  cmovc %%r10, %%r9;"
                "  movq %%r8, 16(%1);"
                "  movq %%r9, 16(%2);"

                /* cswap p1[3], p2[3] */
                "  movq 24(%1), %%r8;"
                "  movq 24(%2), %%r9;"
                "  mov %%r8, %%r10;"
                "  cmovc %%r9, %%r8;"
                "  cmovc %%r10, %%r9;"
                "  movq %%r8, 24(%1);"
                "  movq %%r9, 24(%2);"

                /* cswap p1[4], p2[4] */
                "  movq 32(%1), %%r8;"
                "  movq 32(%2), %%r9;"
                "  mov %%r8, %%r10;"
                "  cmovc %%r9, %%r8;"
                "  cmovc %%r10, %%r9;"
                "  movq %%r8, 32(%1);"
                "  movq %%r9, 32(%2);"

                /* cswap p1[5], p2[5] */
                "  movq 40(%1), %%r8;"
                "  movq 40(%2), %%r9;"
                "  mov %%r8, %%r10;"
                "  cmovc %%r9, %%r8;"
                "  cmovc %%r10, %%r9;"
                "  movq %%r8, 40(%1);"
                "  movq %%r9, 40(%2);"

                /* cswap p1[6], p2[6] */
                "  movq 48(%1), %%r8;"
                "  movq 48(%2), %%r9;"
                "  mov %%r8, %%r10;"
                "  cmovc %%r9, %%r8;"
                "  cmovc %%r10, %%r9;"
                "  movq %%r8, 48(%1);"
                "  movq %%r9, 48(%2);"

                /* cswap p1[7], p2[7] */
                "  movq 56(%1), %%r8;"
                "  movq 56(%2), %%r9;"
                "  mov %%r8, %%r10;"
                "  cmovc %%r9, %%r8;"
                "  cmovc %%r10, %%r9;"
                "  movq %%r8, 56(%1);"
                "  movq %%r9, 56(%2);"
        : "+&r" (bit)
        : "r" (p1), "r" (p2)
        : "%r8", "%r9", "%r10", "memory", "cc"
        );
}

/* Computes the square of a field element: out <- f * f
 * Uses the 8-element buffer tmp for intermediate results */
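/* Each cross product f[i]*f[j] (i < j) is computed once, doubled, and the
 * diagonal squares f[i]^2 are then added in. */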
static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
{
        asm volatile(
                /* Compute the raw multiplication: tmp <- f * f */

                /* Step 1: Compute all partial products */
                "  movq 0(%1), %%rdx;"                                       /* f[0] */
                "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
                "  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
                "  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
                "  movq 24(%1), %%rdx;"                                      /* f[3] */
                "  mulxq 8(%1), %%r11, %%rbx;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
                "  mulxq 16(%1), %%rax, %%r13;"    "  adcx %%rax, %%rbx;"    /* f[2]*f[3] */
                "  movq 8(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f[1] */
                "  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */

                /* Step 2: Compute two parallel carry chains */
                "  xor %%r15, %%r15;"
                "  adox %%rax, %%r10;"
                "  adcx %%r8, %%r8;"
                "  adox %%rcx, %%r11;"
                "  adcx %%r9, %%r9;"
                "  adox %%r15, %%rbx;"
                "  adcx %%r10, %%r10;"
                "  adox %%r15, %%r13;"
                "  adcx %%r11, %%r11;"
                "  adox %%r15, %%r14;"
                "  adcx %%rbx, %%rbx;"
                "  adcx %%r13, %%r13;"
                "  adcx %%r14, %%r14;"

                /* Step 3: Compute intermediate squares */
                "  movq 0(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[0]^2 */
                                           "  movq %%rax, 0(%0);"
                "  add %%rcx, %%r8;"       "  movq %%r8, 8(%0);"
                "  movq 8(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[1]^2 */
                "  adcx %%rax, %%r9;"      "  movq %%r9, 16(%0);"
                "  adcx %%rcx, %%r10;"     "  movq %%r10, 24(%0);"
                "  movq 16(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
                "  adcx %%rax, %%r11;"     "  movq %%r11, 32(%0);"
                "  adcx %%rcx, %%rbx;"     "  movq %%rbx, 40(%0);"
                "  movq 24(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
                "  adcx %%rax, %%r13;"     "  movq %%r13, 48(%0);"
                "  adcx %%rcx, %%r14;"     "  movq %%r14, 56(%0);"

                /* Line up pointers */
                "  mov %0, %1;"
                "  mov %2, %0;"

                /* Wrap the result back into the field */

                /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
                "  mov $38, %%rdx;"
                "  mulxq 32(%1), %%r8, %%r13;"
                "  xor %%rcx, %%rcx;"
                "  adoxq 0(%1), %%r8;"
                "  mulxq 40(%1), %%r9, %%rbx;"
                "  adcx %%r13, %%r9;"
                "  adoxq 8(%1), %%r9;"
                "  mulxq 48(%1), %%r10, %%r13;"
                "  adcx %%rbx, %%r10;"
                "  adoxq 16(%1), %%r10;"
                "  mulxq 56(%1), %%r11, %%rax;"
                "  adcx %%r13, %%r11;"
                "  adoxq 24(%1), %%r11;"
                "  adcx %%rcx, %%rax;"
                "  adox %%rcx, %%rax;"
                "  imul %%rdx, %%rax;"

                /* Step 2: Fold the carry back into dst */
                "  add %%rax, %%r8;"
                "  adcx %%rcx, %%r9;"
                "  movq %%r9, 8(%0);"
                "  adcx %%rcx, %%r10;"
                "  movq %%r10, 16(%0);"
                "  adcx %%rcx, %%r11;"
                "  movq %%r11, 24(%0);"

                /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
                "  mov $0, %%rax;"
                "  cmovc %%rdx, %%rax;"
                "  add %%rax, %%r8;"
                "  movq %%r8, 0(%0);"
        : "+&r" (tmp), "+&r" (f), "+&r" (out)
        :
        : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
        );
}

/* Computes two field squarings:
 * out[0] <- f[0] * f[0]
 * out[1] <- f[1] * f[1]
 * Uses the 16-element buffer tmp for intermediate results */
static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
{
        asm volatile(
                /* Step 1: Compute all partial products */
                "  movq 0(%1), %%rdx;"                                       /* f[0] */
                "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
                "  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
                "  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
                "  movq 24(%1), %%rdx;"                                      /* f[3] */
                "  mulxq 8(%1), %%r11, %%rbx;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
                "  mulxq 16(%1), %%rax, %%r13;"    "  adcx %%rax, %%rbx;"    /* f[2]*f[3] */
                "  movq 8(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f[1] */
                "  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */

                /* Step 2: Compute two parallel carry chains */
                "  xor %%r15, %%r15;"
                "  adox %%rax, %%r10;"
                "  adcx %%r8, %%r8;"
                "  adox %%rcx, %%r11;"
                "  adcx %%r9, %%r9;"
                "  adox %%r15, %%rbx;"
                "  adcx %%r10, %%r10;"
                "  adox %%r15, %%r13;"
                "  adcx %%r11, %%r11;"
                "  adox %%r15, %%r14;"
                "  adcx %%rbx, %%rbx;"
                "  adcx %%r13, %%r13;"
                "  adcx %%r14, %%r14;"

                /* Step 3: Compute intermediate squares */
                "  movq 0(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[0]^2 */
                                           "  movq %%rax, 0(%0);"
                "  add %%rcx, %%r8;"       "  movq %%r8, 8(%0);"
                "  movq 8(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[1]^2 */
                "  adcx %%rax, %%r9;"      "  movq %%r9, 16(%0);"
                "  adcx %%rcx, %%r10;"     "  movq %%r10, 24(%0);"
                "  movq 16(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
                "  adcx %%rax, %%r11;"     "  movq %%r11, 32(%0);"
                "  adcx %%rcx, %%rbx;"     "  movq %%rbx, 40(%0);"
                "  movq 24(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
                "  adcx %%rax, %%r13;"     "  movq %%r13, 48(%0);"
                "  adcx %%rcx, %%r14;"     "  movq %%r14, 56(%0);"

                /* Step 1: Compute all partial products */
                "  movq 32(%1), %%rdx;"                                       /* f[0] */
                "  mulxq 40(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
                "  mulxq 48(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
                "  mulxq 56(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
                "  movq 56(%1), %%rdx;"                                      /* f[3] */
                "  mulxq 40(%1), %%r11, %%rbx;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
                "  mulxq 48(%1), %%rax, %%r13;"    "  adcx %%rax, %%rbx;"    /* f[2]*f[3] */
                "  movq 40(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f[1] */
                "  mulxq 48(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */

                /* Step 2: Compute two parallel carry chains */
                "  xor %%r15, %%r15;"
                "  adox %%rax, %%r10;"
                "  adcx %%r8, %%r8;"
                "  adox %%rcx, %%r11;"
                "  adcx %%r9, %%r9;"
                "  adox %%r15, %%rbx;"
                "  adcx %%r10, %%r10;"
                "  adox %%r15, %%r13;"
                "  adcx %%r11, %%r11;"
                "  adox %%r15, %%r14;"
                "  adcx %%rbx, %%rbx;"
                "  adcx %%r13, %%r13;"
                "  adcx %%r14, %%r14;"

                /* Step 3: Compute intermediate squares */
                "  movq 32(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[0]^2 */
                                           "  movq %%rax, 64(%0);"
                "  add %%rcx, %%r8;"       "  movq %%r8, 72(%0);"
                "  movq 40(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[1]^2 */
                "  adcx %%rax, %%r9;"      "  movq %%r9, 80(%0);"
                "  adcx %%rcx, %%r10;"     "  movq %%r10, 88(%0);"
                "  movq 48(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
                "  adcx %%rax, %%r11;"     "  movq %%r11, 96(%0);"
                "  adcx %%rcx, %%rbx;"     "  movq %%rbx, 104(%0);"
                "  movq 56(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
                "  adcx %%rax, %%r13;"     "  movq %%r13, 112(%0);"
                "  adcx %%rcx, %%r14;"     "  movq %%r14, 120(%0);"

                /* Line up pointers */
                "  mov %0, %1;"
                "  mov %2, %0;"

                /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
                "  mov $38, %%rdx;"
                "  mulxq 32(%1), %%r8, %%r13;"
                "  xor %%rcx, %%rcx;"
                "  adoxq 0(%1), %%r8;"
                "  mulxq 40(%1), %%r9, %%rbx;"
                "  adcx %%r13, %%r9;"
                "  adoxq 8(%1), %%r9;"
                "  mulxq 48(%1), %%r10, %%r13;"
                "  adcx %%rbx, %%r10;"
                "  adoxq 16(%1), %%r10;"
                "  mulxq 56(%1), %%r11, %%rax;"
                "  adcx %%r13, %%r11;"
                "  adoxq 24(%1), %%r11;"
                "  adcx %%rcx, %%rax;"
                "  adox %%rcx, %%rax;"
                "  imul %%rdx, %%rax;"

                /* Step 2: Fold the carry back into dst */
                "  add %%rax, %%r8;"
                "  adcx %%rcx, %%r9;"
                "  movq %%r9, 8(%0);"
                "  adcx %%rcx, %%r10;"
                "  movq %%r10, 16(%0);"
                "  adcx %%rcx, %%r11;"
                "  movq %%r11, 24(%0);"

                /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
                "  mov $0, %%rax;"
                "  cmovc %%rdx, %%rax;"
                "  add %%rax, %%r8;"
                "  movq %%r8, 0(%0);"

                /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
                "  mov $38, %%rdx;"
                "  mulxq 96(%1), %%r8, %%r13;"
                "  xor %%rcx, %%rcx;"
                "  adoxq 64(%1), %%r8;"
                "  mulxq 104(%1), %%r9, %%rbx;"
                "  adcx %%r13, %%r9;"
                "  adoxq 72(%1), %%r9;"
                "  mulxq 112(%1), %%r10, %%r13;"
                "  adcx %%rbx, %%r10;"
                "  adoxq 80(%1), %%r10;"
                "  mulxq 120(%1), %%r11, %%rax;"
                "  adcx %%r13, %%r11;"
                "  adoxq 88(%1), %%r11;"
                "  adcx %%rcx, %%rax;"
                "  adox %%rcx, %%rax;"
                "  imul %%rdx, %%rax;"

                /* Step 2: Fold the carry back into dst */
                "  add %%rax, %%r8;"
                "  adcx %%rcx, %%r9;"
                "  movq %%r9, 40(%0);"
                "  adcx %%rcx, %%r10;"
                "  movq %%r10, 48(%0);"
                "  adcx %%rcx, %%r11;"
                "  movq %%r11, 56(%0);"

                /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
                "  mov $0, %%rax;"
                "  cmovc %%rdx, %%rax;"
                "  add %%rax, %%r8;"
                "  movq %%r8, 32(%0);"
        : "+&r" (tmp), "+&r" (f), "+&r" (out)
        :
        : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
        );
}

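/* One step of the Montgomery ladder: simultaneously doubles the point in
 * (x2:z2) and differentially adds it to the point in (x3:z3), using the
 * base point's affine x-coordinate x1 = q as the known difference. */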
static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
{
        u64 *nq = p01_tmp1;
        u64 *nq_p1 = p01_tmp1 + (u32)8U;
        u64 *tmp1 = p01_tmp1 + (u32)16U;
        u64 *x1 = q;
        u64 *x2 = nq;
        u64 *z2 = nq + (u32)4U;
        u64 *z3 = nq_p1 + (u32)4U;
        u64 *a = tmp1;
        u64 *b = tmp1 + (u32)4U;
        u64 *ab = tmp1;
        u64 *dc = tmp1 + (u32)8U;
        u64 *x3;
        u64 *z31;
        u64 *d0;
        u64 *c0;
        u64 *a1;
        u64 *b1;
        u64 *d;
        u64 *c;
        u64 *ab1;
        u64 *dc1;
        fadd(a, x2, z2);
        fsub(b, x2, z2);
        x3 = nq_p1;
        z31 = nq_p1 + (u32)4U;
        d0 = dc;
        c0 = dc + (u32)4U;
        fadd(c0, x3, z31);
        fsub(d0, x3, z31);
        fmul2(dc, dc, ab, tmp2);
        fadd(x3, d0, c0);
        fsub(z31, d0, c0);
        a1 = tmp1;
        b1 = tmp1 + (u32)4U;
        d = tmp1 + (u32)8U;
        c = tmp1 + (u32)12U;
        ab1 = tmp1;
        dc1 = tmp1 + (u32)8U;
        fsqr2(dc1, ab1, tmp2);
        fsqr2(nq_p1, nq_p1, tmp2);
        a1[0U] = c[0U];
        a1[1U] = c[1U];
        a1[2U] = c[2U];
        a1[3U] = c[3U];
        fsub(c, d, c);
        fmul_scalar(b1, c, (u64)121665U);
        fadd(b1, b1, d);
        fmul2(nq, dc1, ab1, tmp2);
        fmul(z3, z3, x1, tmp2);
}

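/* Doubles the point held in (x2:z2); 121665 = (486662 - 2) / 4 is the
 * constant used by the Montgomery doubling formulas for this curve. */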
static void point_double(u64 *nq, u64 *tmp1, u64 *tmp2)
{
        u64 *x2 = nq;
        u64 *z2 = nq + (u32)4U;
        u64 *a = tmp1;
        u64 *b = tmp1 + (u32)4U;
        u64 *d = tmp1 + (u32)8U;
        u64 *c = tmp1 + (u32)12U;
        u64 *ab = tmp1;
        u64 *dc = tmp1 + (u32)8U;
        fadd(a, x2, z2);
        fsub(b, x2, z2);
        fsqr2(dc, ab, tmp2);
        a[0U] = c[0U];
        a[1U] = c[1U];
        a[2U] = c[2U];
        a[3U] = c[3U];
        fsub(c, d, c);
        fmul_scalar(b, c, (u64)121665U);
        fadd(b, b, d);
        fmul2(nq, dc, ab, tmp2);
}

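/* Montgomery ladder computing out <- key * init1. Clamping is built into
 * the ladder structure: bit 254 is treated as always set (hence the
 * unconditional first step), bits 255 and 2..0 are never consumed by the
 * loop, and the three trailing doublings account for the cleared low bits. */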
static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1)
{
        u64 tmp2[16U] = { 0U };
        u64 p01_tmp1_swap[33U] = { 0U };
        u64 *p0 = p01_tmp1_swap;
        u64 *p01 = p01_tmp1_swap;
        u64 *p03 = p01;
        u64 *p11 = p01 + (u32)8U;
        u64 *x0;
        u64 *z0;
        u64 *p01_tmp1;
        u64 *p01_tmp11;
        u64 *nq10;
        u64 *nq_p11;
        u64 *swap1;
        u64 sw0;
        u64 *nq1;
        u64 *tmp1;
        memcpy(p11, init1, (u32)8U * sizeof(init1[0U]));
        x0 = p03;
        z0 = p03 + (u32)4U;
        x0[0U] = (u64)1U;
        x0[1U] = (u64)0U;
        x0[2U] = (u64)0U;
        x0[3U] = (u64)0U;
        z0[0U] = (u64)0U;
        z0[1U] = (u64)0U;
        z0[2U] = (u64)0U;
        z0[3U] = (u64)0U;
        p01_tmp1 = p01_tmp1_swap;
        p01_tmp11 = p01_tmp1_swap;
        nq10 = p01_tmp1_swap;
        nq_p11 = p01_tmp1_swap + (u32)8U;
        swap1 = p01_tmp1_swap + (u32)32U;
        cswap2((u64)1U, nq10, nq_p11);
        point_add_and_double(init1, p01_tmp11, tmp2);
        swap1[0U] = (u64)1U;
        {
                u32 i;
                for (i = (u32)0U; i < (u32)251U; i = i + (u32)1U) {
                        u64 *p01_tmp12 = p01_tmp1_swap;
                        u64 *swap2 = p01_tmp1_swap + (u32)32U;
                        u64 *nq2 = p01_tmp12;
                        u64 *nq_p12 = p01_tmp12 + (u32)8U;
                        u64 bit = (u64)(key[((u32)253U - i) / (u32)8U] >> ((u32)253U - i) % (u32)8U & (u8)1U);
                        u64 sw = swap2[0U] ^ bit;
                        cswap2(sw, nq2, nq_p12);
                        point_add_and_double(init1, p01_tmp12, tmp2);
                        swap2[0U] = bit;
                }
        }
        sw0 = swap1[0U];
        cswap2(sw0, nq10, nq_p11);
        nq1 = p01_tmp1;
        tmp1 = p01_tmp1 + (u32)16U;
        point_double(nq1, tmp1, tmp2);
        point_double(nq1, tmp1, tmp2);
        point_double(nq1, tmp1, tmp2);
        memcpy(out, p0, (u32)8U * sizeof(p0[0U]));

        memzero_explicit(tmp2, sizeof(tmp2));
        memzero_explicit(p01_tmp1_swap, sizeof(p01_tmp1_swap));
}

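/* Computes n1 successive field squarings: o <- inp^(2^n1) */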
static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1)
{
        u32 i;
        fsqr(o, inp, tmp);
        for (i = (u32)0U; i < n1 - (u32)1U; i = i + (u32)1U)
                fsqr(o, o, tmp);
}

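/* Computes the multiplicative inverse o <- i^(p - 2) mod p = 2^255 - 19
 * via Fermat's little theorem, using an addition chain of 254 squarings
 * and 11 multiplications */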
static void finv(u64 *o, const u64 *i, u64 *tmp)
{
        u64 t1[16U] = { 0U };
        u64 *a0 = t1;
        u64 *b = t1 + (u32)4U;
        u64 *c = t1 + (u32)8U;
        u64 *t00 = t1 + (u32)12U;
        u64 *tmp1 = tmp;
        u64 *a;
        u64 *t0;
        fsquare_times(a0, i, tmp1, (u32)1U);
        fsquare_times(t00, a0, tmp1, (u32)2U);
        fmul(b, t00, i, tmp);
        fmul(a0, b, a0, tmp);
        fsquare_times(t00, a0, tmp1, (u32)1U);
        fmul(b, t00, b, tmp);
        fsquare_times(t00, b, tmp1, (u32)5U);
        fmul(b, t00, b, tmp);
        fsquare_times(t00, b, tmp1, (u32)10U);
        fmul(c, t00, b, tmp);
        fsquare_times(t00, c, tmp1, (u32)20U);
        fmul(t00, t00, c, tmp);
        fsquare_times(t00, t00, tmp1, (u32)10U);
        fmul(b, t00, b, tmp);
        fsquare_times(t00, b, tmp1, (u32)50U);
        fmul(c, t00, b, tmp);
        fsquare_times(t00, c, tmp1, (u32)100U);
        fmul(t00, t00, c, tmp);
        fsquare_times(t00, t00, tmp1, (u32)50U);
        fmul(t00, t00, b, tmp);
        fsquare_times(t00, t00, tmp1, (u32)5U);
        a = t1;
        t0 = t1 + (u32)12U;
        fmul(o, t0, a, tmp);
}

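/* Fully reduces f modulo p = 2^255 - 19 and stores the canonical
 * four-limb little-endian representation in b */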
static void store_felem(u64 *b, u64 *f)
{
        u64 f30 = f[3U];
        u64 top_bit0 = f30 >> (u32)63U;
        u64 carry0;
        u64 f31;
        u64 top_bit;
        u64 carry;
        u64 f0;
        u64 f1;
        u64 f2;
        u64 f3;
        u64 m0;
        u64 m1;
        u64 m2;
        u64 m3;
        u64 mask;
        u64 f0_;
        u64 f1_;
        u64 f2_;
        u64 f3_;
        u64 o0;
        u64 o1;
        u64 o2;
        u64 o3;
        f[3U] = f30 & (u64)0x7fffffffffffffffU;
        carry0 = add_scalar(f, f, (u64)19U * top_bit0);
        f31 = f[3U];
        top_bit = f31 >> (u32)63U;
        f[3U] = f31 & (u64)0x7fffffffffffffffU;
        carry = add_scalar(f, f, (u64)19U * top_bit);
        f0 = f[0U];
        f1 = f[1U];
        f2 = f[2U];
        f3 = f[3U];
        m0 = gte_mask(f0, (u64)0xffffffffffffffedU);
        m1 = eq_mask(f1, (u64)0xffffffffffffffffU);
        m2 = eq_mask(f2, (u64)0xffffffffffffffffU);
        m3 = eq_mask(f3, (u64)0x7fffffffffffffffU);
        mask = ((m0 & m1) & m2) & m3;
        f0_ = f0 - (mask & (u64)0xffffffffffffffedU);
        f1_ = f1 - (mask & (u64)0xffffffffffffffffU);
        f2_ = f2 - (mask & (u64)0xffffffffffffffffU);
        f3_ = f3 - (mask & (u64)0x7fffffffffffffffU);
        o0 = f0_;
        o1 = f1_;
        o2 = f2_;
        o3 = f3_;
        b[0U] = o0;
        b[1U] = o1;
        b[2U] = o2;
        b[3U] = o3;
}

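/* Converts the projective point (X:Z) in i to its affine x-coordinate
 * u = X / Z and stores the canonical 32-byte encoding in o */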
static void encode_point(u8 *o, const u64 *i)
{
        const u64 *x = i;
        const u64 *z = i + (u32)4U;
        u64 tmp[4U] = { 0U };
        u64 tmp_w[16U] = { 0U };
        finv(tmp, z, tmp_w);
        fmul(tmp, tmp, x, tmp_w);
        store_felem((u64 *)o, tmp);
}

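/* Scalar multiplication out <- priv * pub: decodes the little-endian
 * u-coordinate (masking the unused top bit, as specified by RFC 7748),
 * runs the Montgomery ladder, and encodes the resulting point */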
static void curve25519_ever64(u8 *out, const u8 *priv, const u8 *pub)
{
        u64 init1[8U] = { 0U };
        u64 tmp[4U] = { 0U };
        u64 tmp3;
        u64 *x;
        u64 *z;
        {
                u32 i;
                for (i = (u32)0U; i < (u32)4U; i = i + (u32)1U) {
                        u64 *os = tmp;
                        const u8 *bj = pub + i * (u32)8U;
                        u64 u = *(u64 *)bj;
                        u64 r = u;
                        u64 x0 = r;
                        os[i] = x0;
                }
        }
        tmp3 = tmp[3U];
        tmp[3U] = tmp3 & (u64)0x7fffffffffffffffU;
        x = init1;
        z = init1 + (u32)4U;
        z[0U] = (u64)1U;
        z[1U] = (u64)0U;
        z[2U] = (u64)0U;
        z[3U] = (u64)0U;
        x[0U] = tmp[0U];
        x[1U] = tmp[1U];
        x[2U] = tmp[2U];
        x[3U] = tmp[3U];
        montgomery_ladder(init1, priv, init1);
        encode_point(out, init1);
}

/* The below constants were generated using this sage script:
 *
 * #!/usr/bin/env sage
 * import sys
 * from sage.all import *
 * def limbs(n):
 *      n = int(n)
 *      l = ((n >> 0) % 2^64, (n >> 64) % 2^64, (n >> 128) % 2^64, (n >> 192) % 2^64)
 *      return "0x%016xULL, 0x%016xULL, 0x%016xULL, 0x%016xULL" % l
 * ec = EllipticCurve(GF(2^255 - 19), [0, 486662, 0, 1, 0])
 * p_minus_s = (ec.lift_x(9) - ec.lift_x(1))[0]
 * print("static const u64 p_minus_s[] = { %s };\n" % limbs(p_minus_s))
 * print("static const u64 table_ladder[] = {")
 * p = ec.lift_x(9)
 * for i in range(252):
 *      l = (p[0] + p[2]) / (p[0] - p[2])
 *      print(("\t%s" + ("," if i != 251 else "")) % limbs(l))
 *      p = p * 2
 * print("};")
 *
 */

1068 static const u64 p_minus_s[] = { 0x816b1e0137d48290ULL, 0x440f6a51eb4d1207ULL, 0x52385f46dca2b71dULL, 0x215132111d8354cbULL };
1069
1070 static const u64 table_ladder[] = {
1071         0xfffffffffffffff3ULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x5fffffffffffffffULL,
1072         0x6b8220f416aafe96ULL, 0x82ebeb2b4f566a34ULL, 0xd5a9a5b075a5950fULL, 0x5142b2cf4b2488f4ULL,
1073         0x6aaebc750069680cULL, 0x89cf7820a0f99c41ULL, 0x2a58d9183b56d0f4ULL, 0x4b5aca80e36011a4ULL,
1074         0x329132348c29745dULL, 0xf4a2e616e1642fd7ULL, 0x1e45bb03ff67bc34ULL, 0x306912d0f42a9b4aULL,
1075         0xff886507e6af7154ULL, 0x04f50e13dfeec82fULL, 0xaa512fe82abab5ceULL, 0x174e251a68d5f222ULL,
1076         0xcf96700d82028898ULL, 0x1743e3370a2c02c5ULL, 0x379eec98b4e86eaaULL, 0x0c59888a51e0482eULL,
1077         0xfbcbf1d699b5d189ULL, 0xacaef0d58e9fdc84ULL, 0xc1c20d06231f7614ULL, 0x2938218da274f972ULL,
1078         0xf6af49beff1d7f18ULL, 0xcc541c22387ac9c2ULL, 0x96fcc9ef4015c56bULL, 0x69c1627c690913a9ULL,
1079         0x7a86fd2f4733db0eULL, 0xfdb8c4f29e087de9ULL, 0x095e4b1a8ea2a229ULL, 0x1ad7a7c829b37a79ULL,
1080         0x342d89cad17ea0c0ULL, 0x67bedda6cced2051ULL, 0x19ca31bf2bb42f74ULL, 0x3df7b4c84980acbbULL,
1081         0xa8c6444dc80ad883ULL, 0xb91e440366e3ab85ULL, 0xc215cda00164f6d8ULL, 0x3d867c6ef247e668ULL,
1082         0xc7dd582bcc3e658cULL, 0xfd2c4748ee0e5528ULL, 0xa0fd9b95cc9f4f71ULL, 0x7529d871b0675ddfULL,
1083         0xb8f568b42d3cbd78ULL, 0x1233011b91f3da82ULL, 0x2dce6ccd4a7c3b62ULL, 0x75e7fc8e9e498603ULL,
1084         0x2f4f13f1fcd0b6ecULL, 0xf1a8ca1f29ff7a45ULL, 0xc249c1a72981e29bULL, 0x6ebe0dbb8c83b56aULL,
1085         0x7114fa8d170bb222ULL, 0x65a2dcd5bf93935fULL, 0xbdc41f68b59c979aULL, 0x2f0eef79a2ce9289ULL,
1086         0x42ecbf0c083c37ceULL, 0x2930bc09ec496322ULL, 0xf294b0c19cfeac0dULL, 0x3780aa4bedfabb80ULL,
1087         0x56c17d3e7cead929ULL, 0xe7cb4beb2e5722c5ULL, 0x0ce931732dbfe15aULL, 0x41b883c7621052f8ULL,
1088         0xdbf75ca0c3d25350ULL, 0x2936be086eb1e351ULL, 0xc936e03cb4a9b212ULL, 0x1d45bf82322225aaULL,
1089         0xe81ab1036a024cc5ULL, 0xe212201c304c9a72ULL, 0xc5d73fba6832b1fcULL, 0x20ffdb5a4d839581ULL,
1090         0xa283d367be5d0fadULL, 0x6c2b25ca8b164475ULL, 0x9d4935467caaf22eULL, 0x5166408eee85ff49ULL,
1091         0x3c67baa2fab4e361ULL, 0xb3e433c67ef35cefULL, 0x5259729241159b1cULL, 0x6a621892d5b0ab33ULL,
1092         0x20b74a387555cdcbULL, 0x532aa10e1208923fULL, 0xeaa17b7762281dd1ULL, 0x61ab3443f05c44bfULL,
1093         0x257a6c422324def8ULL, 0x131c6c1017e3cf7fULL, 0x23758739f630a257ULL, 0x295a407a01a78580ULL,
1094         0xf8c443246d5da8d9ULL, 0x19d775450c52fa5dULL, 0x2afcfc92731bf83dULL, 0x7d10c8e81b2b4700ULL,
1095         0xc8e0271f70baa20bULL, 0x993748867ca63957ULL, 0x5412efb3cb7ed4bbULL, 0x3196d36173e62975ULL,
1096         0xde5bcad141c7dffcULL, 0x47cc8cd2b395c848ULL, 0xa34cd942e11af3cbULL, 0x0256dbf2d04ecec2ULL,
1097         0x875ab7e94b0e667fULL, 0xcad4dd83c0850d10ULL, 0x47f12e8f4e72c79fULL, 0x5f1a87bb8c85b19bULL,
1098         0x7ae9d0b6437f51b8ULL, 0x12c7ce5518879065ULL, 0x2ade09fe5cf77aeeULL, 0x23a05a2f7d2c5627ULL,
1099         0x5908e128f17c169aULL, 0xf77498dd8ad0852dULL, 0x74b4c4ceab102f64ULL, 0x183abadd10139845ULL,
1100         0xb165ba8daa92aaacULL, 0xd5c5ef9599386705ULL, 0xbe2f8f0cf8fc40d1ULL, 0x2701e635ee204514ULL,
1101         0x629fa80020156514ULL, 0xf223868764a8c1ceULL, 0x5b894fff0b3f060eULL, 0x60d9944cf708a3faULL,
1102         0xaeea001a1c7a201fULL, 0xebf16a633ee2ce63ULL, 0x6f7709594c7a07e1ULL, 0x79b958150d0208cbULL,
1103         0x24b55e5301d410e7ULL, 0xe3a34edff3fdc84dULL, 0xd88768e4904032d8ULL, 0x131384427b3aaeecULL,
1104         0x8405e51286234f14ULL, 0x14dc4739adb4c529ULL, 0xb8a2b5b250634ffdULL, 0x2fe2a94ad8a7ff93ULL,
1105         0xec5c57efe843faddULL, 0x2843ce40f0bb9918ULL, 0xa4b561d6cf3d6305ULL, 0x743629bde8fb777eULL,
1106         0x343edd46bbaf738fULL, 0xed981828b101a651ULL, 0xa401760b882c797aULL, 0x1fc223e28dc88730ULL,
1107         0x48604e91fc0fba0eULL, 0xb637f78f052c6fa4ULL, 0x91ccac3d09e9239cULL, 0x23f7eed4437a687cULL,
1108         0x5173b1118d9bd800ULL, 0x29d641b63189d4a7ULL, 0xfdbf177988bbc586ULL, 0x2959894fcad81df5ULL,
1109         0xaebc8ef3b4bbc899ULL, 0x4148995ab26992b9ULL, 0x24e20b0134f92cfbULL, 0x40d158894a05dee8ULL,
1110         0x46b00b1185af76f6ULL, 0x26bac77873187a79ULL, 0x3dc0bf95ab8fff5fULL, 0x2a608bd8945524d7ULL,
1111         0x26449588bd446302ULL, 0x7c4bc21c0388439cULL, 0x8e98a4f383bd11b2ULL, 0x26218d7bc9d876b9ULL,
1112         0xe3081542997c178aULL, 0x3c2d29a86fb6606fULL, 0x5c217736fa279374ULL, 0x7dde05734afeb1faULL,
1113         0x3bf10e3906d42babULL, 0xe4f7803e1980649cULL, 0xe6053bf89595bf7aULL, 0x394faf38da245530ULL,
1114         0x7a8efb58896928f4ULL, 0xfbc778e9cc6a113cULL, 0x72670ce330af596fULL, 0x48f222a81d3d6cf7ULL,
1115         0xf01fce410d72caa7ULL, 0x5a20ecc7213b5595ULL, 0x7bc21165c1fa1483ULL, 0x07f89ae31da8a741ULL,
1116         0x05d2c2b4c6830ff9ULL, 0xd43e330fc6316293ULL, 0xa5a5590a96d3a904ULL, 0x705edb91a65333b6ULL,
1117         0x048ee15e0bb9a5f7ULL, 0x3240cfca9e0aaf5dULL, 0x8f4b71ceedc4a40bULL, 0x621c0da3de544a6dULL,
1118         0x92872836a08c4091ULL, 0xce8375b010c91445ULL, 0x8a72eb524f276394ULL, 0x2667fcfa7ec83635ULL,
        0x7f4c173345e8752aULL, 0x061b47feee7079a5ULL, 0x25dd9afa9f86ff34ULL, 0x3780cef5425dc89cULL,
        0x1a46035a513bb4e9ULL, 0x3e1ef379ac575adaULL, 0xc78c5f1c5fa24b50ULL, 0x321a967634fd9f22ULL,
        0x946707b8826e27faULL, 0x3dca84d64c506fd0ULL, 0xc189218075e91436ULL, 0x6d9284169b3b8484ULL,
        0x3a67e840383f2ddfULL, 0x33eec9a30c4f9b75ULL, 0x3ec7c86fa783ef47ULL, 0x26ec449fbac9fbc4ULL,
        0x5c0f38cba09b9e7dULL, 0x81168cc762a3478cULL, 0x3e23b0d306fc121cULL, 0x5a238aa0a5efdcddULL,
        0x1ba26121c4ea43ffULL, 0x36f8c77f7c8832b5ULL, 0x88fbea0b0adcf99aULL, 0x5ca9938ec25bebf9ULL,
        0xd5436a5e51fccda0ULL, 0x1dbc4797c2cd893bULL, 0x19346a65d3224a08ULL, 0x0f5034e49b9af466ULL,
        0xf23c3967a1e0b96eULL, 0xe58b08fa867a4d88ULL, 0xfb2fabc6a7341679ULL, 0x2a75381eb6026946ULL,
        0xc80a3be4c19420acULL, 0x66b1f6c681f2b6dcULL, 0x7cf7036761e93388ULL, 0x25abbbd8a660a4c4ULL,
        0x91ea12ba14fd5198ULL, 0x684950fc4a3cffa9ULL, 0xf826842130f5ad28ULL, 0x3ea988f75301a441ULL,
        0xc978109a695f8c6fULL, 0x1746eb4a0530c3f3ULL, 0x444d6d77b4459995ULL, 0x75952b8c054e5cc7ULL,
        0xa3703f7915f4d6aaULL, 0x66c346202f2647d8ULL, 0xd01469df811d644bULL, 0x77fea47d81a5d71fULL,
        0xc5e9529ef57ca381ULL, 0x6eeeb4b9ce2f881aULL, 0xb6e91a28e8009bd6ULL, 0x4b80be3e9afc3fecULL,
        0x7e3773c526aed2c5ULL, 0x1b4afcb453c9a49dULL, 0xa920bdd7baffb24dULL, 0x7c54699f122d400eULL,
        0xef46c8e14fa94bc8ULL, 0xe0b074ce2952ed5eULL, 0xbea450e1dbd885d5ULL, 0x61b68649320f712cULL,
        0x8a485f7309ccbdd1ULL, 0xbd06320d7d4d1a2dULL, 0x25232973322dbef4ULL, 0x445dc4758c17f770ULL,
        0xdb0434177cc8933cULL, 0xed6fe82175ea059fULL, 0x1efebefdc053db34ULL, 0x4adbe867c65daf99ULL,
        0x3acd71a2a90609dfULL, 0xe5e991856dd04050ULL, 0x1ec69b688157c23cULL, 0x697427f6885cfe4dULL,
        0xd7be7b9b65e1a851ULL, 0xa03d28d522c536ddULL, 0x28399d658fd2b645ULL, 0x49e5b7e17c2641e1ULL,
        0x6f8c3a98700457a4ULL, 0x5078f0a25ebb6778ULL, 0xd13c3ccbc382960fULL, 0x2e003258a7df84b1ULL,
        0x8ad1f39be6296a1cULL, 0xc1eeaa652a5fbfb2ULL, 0x33ee0673fd26f3cbULL, 0x59256173a69d2cccULL,
        0x41ea07aa4e18fc41ULL, 0xd9fc19527c87a51eULL, 0xbdaacb805831ca6fULL, 0x445b652dc916694fULL,
        0xce92a3a7f2172315ULL, 0x1edc282de11b9964ULL, 0xa1823aafe04c314aULL, 0x790a2d94437cf586ULL,
        0x71c447fb93f6e009ULL, 0x8922a56722845276ULL, 0xbf70903b204f5169ULL, 0x2f7a89891ba319feULL,
        0x02a08eb577e2140cULL, 0xed9a4ed4427bdcf4ULL, 0x5253ec44e4323cd1ULL, 0x3e88363c14e9355bULL,
        0xaa66c14277110b8cULL, 0x1ae0391610a23390ULL, 0x2030bd12c93fc2a2ULL, 0x3ee141579555c7abULL,
        0x9214de3a6d6e7d41ULL, 0x3ccdd88607f17efeULL, 0x674f1288f8e11217ULL, 0x5682250f329f93d0ULL,
        0x6cf00b136d2e396eULL, 0x6e4cf86f1014debfULL, 0x5930b1b5bfcc4e83ULL, 0x047069b48aba16b6ULL,
        0x0d4ce4ab69b20793ULL, 0xb24db91a97d0fb9eULL, 0xcdfa50f54e00d01dULL, 0x221b1085368bddb5ULL,
        0xe7e59468b1e3d8d2ULL, 0x53c56563bd122f93ULL, 0xeee8a903e0663f09ULL, 0x61efa662cbbe3d42ULL,
        0x2cf8ddddde6eab2aULL, 0x9bf80ad51435f231ULL, 0x5deadacec9f04973ULL, 0x29275b5d41d29b27ULL,
        0xcfde0f0895ebf14fULL, 0xb9aab96b054905a7ULL, 0xcae80dd9a1c420fdULL, 0x0a63bf2f1673bbc7ULL,
        0x092f6e11958fbc8cULL, 0x672a81e804822fadULL, 0xcac8351560d52517ULL, 0x6f3f7722c8f192f8ULL,
        0xf8ba90ccc2e894b7ULL, 0x2c7557a438ff9f0dULL, 0x894d1d855ae52359ULL, 0x68e122157b743d69ULL,
        0xd87e5570cfb919f3ULL, 0x3f2cdecd95798db9ULL, 0x2121154710c0a2ceULL, 0x3c66a115246dc5b2ULL,
        0xcbedc562294ecb72ULL, 0xba7143c36a280b16ULL, 0x9610c2efd4078b67ULL, 0x6144735d946a4b1eULL,
        0x536f111ed75b3350ULL, 0x0211db8c2041d81bULL, 0xf93cb1000e10413cULL, 0x149dfd3c039e8876ULL,
        0xd479dde46b63155bULL, 0xb66e15e93c837976ULL, 0xdafde43b1f13e038ULL, 0x5fafda1a2e4b0b35ULL,
        0x3600bbdf17197581ULL, 0x3972050bbe3cd2c2ULL, 0x5938906dbdd5be86ULL, 0x34fce5e43f9b860fULL,
        0x75a8a4cd42d14d02ULL, 0x828dabc53441df65ULL, 0x33dcabedd2e131d3ULL, 0x3ebad76fb814d25fULL,
        0xd4906f566f70e10fULL, 0x5d12f7aa51690f5aULL, 0x45adb16e76cefcf2ULL, 0x01f768aead232999ULL,
        0x2b6cc77b6248febdULL, 0x3cd30628ec3aaffdULL, 0xce1c0b80d4ef486aULL, 0x4c3bff2ea6f66c23ULL,
        0x3f2ec4094aeaeb5fULL, 0x61b19b286e372ca7ULL, 0x5eefa966de2a701dULL, 0x23b20565de55e3efULL,
        0xe301ca5279d58557ULL, 0x07b2d4ce27c2874fULL, 0xa532cd8a9dcf1d67ULL, 0x2a52fee23f2bff56ULL,
        0x8624efb37cd8663dULL, 0xbbc7ac20ffbd7594ULL, 0x57b85e9c82d37445ULL, 0x7b3052cb86a6ec66ULL,
        0x3482f0ad2525e91eULL, 0x2cb68043d28edca0ULL, 0xaf4f6d052e1b003aULL, 0x185f8c2529781b0aULL,
        0xaa41de5bd80ce0d6ULL, 0x9407b2416853e9d6ULL, 0x563ec36e357f4c3aULL, 0x4cc4b8dd0e297bceULL,
        0xa2fc1a52ffb8730eULL, 0x1811f16e67058e37ULL, 0x10f9a366cddf4ee1ULL, 0x72f4a0c4a0b9f099ULL,
        0x8c16c06f663f4ea7ULL, 0x693b3af74e970fbaULL, 0x2102e7f1d69ec345ULL, 0x0ba53cbc968a8089ULL,
        0xca3d9dc7fea15537ULL, 0x4c6824bb51536493ULL, 0xb9886314844006b1ULL, 0x40d2a72ab454cc60ULL,
        0x5936a1b712570975ULL, 0x91b9d648debda657ULL, 0x3344094bb64330eaULL, 0x006ba10d12ee51d0ULL,
        0x19228468f5de5d58ULL, 0x0eb12f4c38cc05b0ULL, 0xa1039f9dd5601990ULL, 0x4502d4ce4fff0e0bULL,
        0xeb2054106837c189ULL, 0xd0f6544c6dd3b93cULL, 0x40727064c416d74fULL, 0x6e15c6114b502ef0ULL,
        0x4df2a398cfb1a76bULL, 0x11256c7419f2f6b1ULL, 0x4a497962066e6043ULL, 0x705b3aab41355b44ULL,
        0x365ef536d797b1d8ULL, 0x00076bd622ddf0dbULL, 0x3bbf33b0e0575a88ULL, 0x3777aa05c8e4ca4dULL,
        0x392745c85578db5fULL, 0x6fda4149dbae5ae2ULL, 0xb1f0b00b8adc9867ULL, 0x09963437d36f1da3ULL,
        0x7e824e90a5dc3853ULL, 0xccb5f6641f135cbdULL, 0x6736d86c87ce8fccULL, 0x625f3ce26604249fULL,
        0xaf8ac8059502f63fULL, 0x0c05e70a2e351469ULL, 0x35292e9c764b6305ULL, 0x1a394360c7e23ac3ULL,
        0xd5c6d53251183264ULL, 0x62065abd43c2b74fULL, 0xb5fbf5d03b973f9bULL, 0x13a3da3661206e5eULL,
        0xc6bd5837725d94e5ULL, 0x18e30912205016c5ULL, 0x2088ce1570033c68ULL, 0x7fba1f495c837987ULL,
        0x5a8c7423f2f9079dULL, 0x1735157b34023fc5ULL, 0xe4f9b49ad2fab351ULL, 0x6691ff72c878e33cULL,
        0x122c2adedc5eff3eULL, 0xf8dd4bf1d8956cf4ULL, 0xeb86205d9e9e5bdaULL, 0x049b92b9d975c743ULL,
        0xa5379730b0f6c05aULL, 0x72a0ffacc6f3a553ULL, 0xb0032c34b20dcd6dULL, 0x470e9dbc88d5164aULL,
        0xb19cf10ca237c047ULL, 0xb65466711f6c81a2ULL, 0xb3321bd16dd80b43ULL, 0x48c14f600c5fbe8eULL,
        0x66451c264aa6c803ULL, 0xb66e3904a4fa7da6ULL, 0xd45f19b0b3128395ULL, 0x31602627c3c9bc10ULL,
        0x3120dc4832e4e10dULL, 0xeb20c46756c717f7ULL, 0x00f52e3f67280294ULL, 0x566d4fc14730c509ULL,
        0x7e3a5d40fd837206ULL, 0xc1e926dc7159547aULL, 0x216730fba68d6095ULL, 0x22e8c3843f69cea7ULL,
        0x33d074e8930e4b2bULL, 0xb6e4350e84d15816ULL, 0x5534c26ad6ba2365ULL, 0x7773c12f89f1f3f3ULL,
        0x8cba404da57962aaULL, 0x5b9897a81999ce56ULL, 0x508e862f121692fcULL, 0x3a81907fa093c291ULL,
        0x0dded0ff4725a510ULL, 0x10d8cc10673fc503ULL, 0x5b9d151c9f1f4e89ULL, 0x32a5c1d5cb09a44cULL,
        0x1e0aa442b90541fbULL, 0x5f85eb7cc1b485dbULL, 0xbee595ce8a9df2e5ULL, 0x25e496c722422236ULL,
        0x5edf3c46cd0fe5b9ULL, 0x34e75a7ed2a43388ULL, 0xe488de11d761e352ULL, 0x0e878a01a085545cULL,
        0xba493c77e021bb04ULL, 0x2b4d1843c7df899aULL, 0x9ea37a487ae80d67ULL, 0x67a9958011e41794ULL,
        0x4b58051a6697b065ULL, 0x47e33f7d8d6ba6d4ULL, 0xbb4da8d483ca46c1ULL, 0x68becaa181c2db0dULL,
        0x8d8980e90b989aa5ULL, 0xf95eb14a2c93c99bULL, 0x51c6c7c4796e73a2ULL, 0x6e228363b5efb569ULL,
        0xc6bbc0b02dd624c8ULL, 0x777eb47dec8170eeULL, 0x3cde15a004cfafa9ULL, 0x1dc6bc087160bf9bULL,
        0x2e07e043eec34002ULL, 0x18e9fc677a68dc7fULL, 0xd8da03188bd15b9aULL, 0x48fbc3bb00568253ULL,
        0x57547d4cfb654ce1ULL, 0xd3565b82a058e2adULL, 0xf63eaf0bbf154478ULL, 0x47531ef114dfbb18ULL,
        0xe1ec630a4278c587ULL, 0x5507d546ca8e83f3ULL, 0x85e135c63adc0c2bULL, 0x0aa7efa85682844eULL,
        0x72691ba8b3e1f615ULL, 0x32b4e9701fbe3ffaULL, 0x97b6d92e39bb7868ULL, 0x2cfe53dea02e39e8ULL,
        0x687392cd85cd52b0ULL, 0x27ff66c910e29831ULL, 0x97134556a9832d06ULL, 0x269bb0360a84f8a0ULL,
        0x706e55457643f85cULL, 0x3734a48c9b597d1bULL, 0x7aee91e8c6efa472ULL, 0x5cd6abc198a9d9e0ULL,
        0x0e04de06cb3ce41aULL, 0xd8c6eb893402e138ULL, 0x904659bb686e3772ULL, 0x7215c371746ba8c8ULL,
        0xfd12a97eeae4a2d9ULL, 0x9514b7516394f2c5ULL, 0x266fd5809208f294ULL, 0x5c847085619a26b9ULL,
        0x52985410fed694eaULL, 0x3c905b934a2ed254ULL, 0x10bb47692d3be467ULL, 0x063b3d2d69e5e9e1ULL,
        0x472726eedda57debULL, 0xefb6c4ae10f41891ULL, 0x2b1641917b307614ULL, 0x117c554fc4f45b7cULL,
        0xc07cf3118f9d8812ULL, 0x01dbd82050017939ULL, 0xd7e803f4171b2827ULL, 0x1015e87487d225eaULL,
        0xc58de3fed23acc4dULL, 0x50db91c294a7be2dULL, 0x0b94d43d1c9cf457ULL, 0x6b1640fa6e37524aULL,
        0x692f346c5fda0d09ULL, 0x200b1c59fa4d3151ULL, 0xb8c46f760777a296ULL, 0x4b38395f3ffdfbcfULL,
        0x18d25e00be54d671ULL, 0x60d50582bec8aba6ULL, 0x87ad8f263b78b982ULL, 0x50fdf64e9cda0432ULL,
        0x90f567aac578dcf0ULL, 0xef1e9b0ef2a3133bULL, 0x0eebba9242d9de71ULL, 0x15473c9bf03101c7ULL,
        0x7c77e8ae56b78095ULL, 0xb678e7666e6f078eULL, 0x2da0b9615348ba1fULL, 0x7cf931c1ff733f0bULL,
        0x26b357f50a0a366cULL, 0xe9708cf42b87d732ULL, 0xc13aeea5f91cb2c0ULL, 0x35d90c991143bb4cULL,
        0x47c1c404a9a0d9dcULL, 0x659e58451972d251ULL, 0x3875a8c473b38c31ULL, 0x1fbd9ed379561f24ULL,
        0x11fabc6fd41ec28dULL, 0x7ef8dfe3cd2a2dcaULL, 0x72e73b5d8c404595ULL, 0x6135fa4954b72f27ULL,
        0xccfc32a2de24b69cULL, 0x3f55698c1f095d88ULL, 0xbe3350ed5ac3f929ULL, 0x5e9bf806ca477eebULL,
        0xe9ce8fb63c309f68ULL, 0x5376f63565e1f9f4ULL, 0xd1afcfb35a6393f1ULL, 0x6632a1ede5623506ULL,
        0x0b7d6c390c2ded4cULL, 0x56cb3281df04cb1fULL, 0x66305a1249ecc3c7ULL, 0x5d588b60a38ca72aULL,
        0xa6ecbf78e8e5f42dULL, 0x86eeb44b3c8a3eecULL, 0xec219c48fbd21604ULL, 0x1aaf1af517c36731ULL,
        0xc306a2836769bde7ULL, 0x208280622b1e2adbULL, 0x8027f51ffbff94a6ULL, 0x76cfa1ce1124f26bULL,
        0x18eb00562422abb6ULL, 0xf377c4d58f8c29c3ULL, 0x4dbbc207f531561aULL, 0x0253b7f082128a27ULL,
        0x3d1f091cb62c17e0ULL, 0x4860e1abd64628a9ULL, 0x52d17436309d4253ULL, 0x356f97e13efae576ULL,
        0xd351e11aa150535bULL, 0x3e6b45bb1dd878ccULL, 0x0c776128bed92c98ULL, 0x1d34ae93032885b8ULL,
        0x4ba0488ca85ba4c3ULL, 0x985348c33c9ce6ceULL, 0x66124c6f97bda770ULL, 0x0f81a0290654124aULL,
        0x9ed09ca6569b86fdULL, 0x811009fd18af9a2dULL, 0xff08d03f93d8c20aULL, 0x52a148199faef26bULL,
        0x3e03f9dc2d8d1b73ULL, 0x4205801873961a70ULL, 0xc0d987f041a35970ULL, 0x07aa1f15a1c0d549ULL,
        0xdfd46ce08cd27224ULL, 0x6d0a024f934e4239ULL, 0x808a7a6399897b59ULL, 0x0a4556e9e13d95a2ULL,
        0xd21a991fe9c13045ULL, 0x9b0e8548fe7751b8ULL, 0x5da643cb4bf30035ULL, 0x77db28d63940f721ULL,
        0xfc5eeb614adc9011ULL, 0x5229419ae8c411ebULL, 0x9ec3e7787d1dcf74ULL, 0x340d053e216e4cb5ULL,
        0xcac7af39b48df2b4ULL, 0xc0faec2871a10a94ULL, 0x140a69245ca575edULL, 0x0cf1c37134273a4cULL,
        0xc8ee306ac224b8a5ULL, 0x57eaee7ccb4930b0ULL, 0xa1e806bdaacbe74fULL, 0x7d9a62742eeb657dULL,
        0x9eb6b6ef546c4830ULL, 0x885cca1fddb36e2eULL, 0xe6b9f383ef0d7105ULL, 0x58654fef9d2e0412ULL,
        0xa905c4ffbe0e8e26ULL, 0x942de5df9b31816eULL, 0x497d723f802e88e1ULL, 0x30684dea602f408dULL,
        0x21e5a278a3e6cb34ULL, 0xaefb6e6f5b151dc4ULL, 0xb30b8e049d77ca15ULL, 0x28c3c9cf53b98981ULL,
        0x287fb721556cdd2aULL, 0x0d317ca897022274ULL, 0x7468c7423a543258ULL, 0x4a7f11464eb5642fULL,
        0xa237a4774d193aa6ULL, 0xd865986ea92129a1ULL, 0x24c515ecf87c1a88ULL, 0x604003575f39f5ebULL,
        0x47b9f189570a9b27ULL, 0x2b98cede465e4b78ULL, 0x026df551dbb85c20ULL, 0x74fcd91047e21901ULL,
        0x13e2a90a23c1bfa3ULL, 0x0cb0074e478519f6ULL, 0x5ff1cbbe3af6cf44ULL, 0x67fe5438be812dbeULL,
        0xd13cf64fa40f05b0ULL, 0x054dfb2f32283787ULL, 0x4173915b7f0d2aeaULL, 0x482f144f1f610d4eULL,
        0xf6210201b47f8234ULL, 0x5d0ae1929e70b990ULL, 0xdcd7f455b049567cULL, 0x7e93d0f1f0916f01ULL,
        0xdd79cbf18a7db4faULL, 0xbe8391bf6f74c62fULL, 0x027145d14b8291bdULL, 0x585a73ea2cbf1705ULL,
        0x485ca03e928a0db2ULL, 0x10fc01a5742857e7ULL, 0x2f482edbd6d551a7ULL, 0x0f0433b5048fdb8aULL,
        0x60da2e8dd7dc6247ULL, 0x88b4c9d38cd4819aULL, 0x13033ac001f66697ULL, 0x273b24fe3b367d75ULL,
        0xc6e8f66a31b3b9d4ULL, 0x281514a494df49d5ULL, 0xd1726fdfc8b23da7ULL, 0x4b3ae7d103dee548ULL,
        0xc6256e19ce4b9d7eULL, 0xff5c5cf186e3c61cULL, 0xacc63ca34b8ec145ULL, 0x74621888fee66574ULL,
        0x956f409645290a1eULL, 0xef0bf8e3263a962eULL, 0xed6a50eb5ec2647bULL, 0x0694283a9dca7502ULL,
        0x769b963643a2dcd1ULL, 0x42b7c8ea09fc5353ULL, 0x4f002aee13397eabULL, 0x63005e2c19b7d63aULL,
        0xca6736da63023beaULL, 0x966c7f6db12a99b7ULL, 0xace09390c537c5e1ULL, 0x0b696063a1aa89eeULL,
        0xebb03e97288c56e5ULL, 0x432a9f9f938c8be8ULL, 0xa6a5a93d5b717f71ULL, 0x1a5fb4c3e18f9d97ULL,
        0x1c94e7ad1c60cdceULL, 0xee202a43fc02c4a0ULL, 0x8dafe4d867c46a20ULL, 0x0a10263c8ac27b58ULL,
        0xd0dea9dfe4432a4aULL, 0x856af87bbe9277c5ULL, 0xce8472acc212c71aULL, 0x6f151b6d9bbb1e91ULL,
        0x26776c527ceed56aULL, 0x7d211cb7fbf8faecULL, 0x37ae66a6fd4609ccULL, 0x1f81b702d2770c42ULL,
        0x2fb0b057eac58392ULL, 0xe1dd89fe29744e9dULL, 0xc964f8eb17beb4f8ULL, 0x29571073c9a2d41eULL,
        0xa948a18981c0e254ULL, 0x2df6369b65b22830ULL, 0xa33eb2d75fcfd3c6ULL, 0x078cd6ec4199a01fULL,
        0x4a584a41ad900d2fULL, 0x32142b78e2c74c52ULL, 0x68c4e8338431c978ULL, 0x7f69ea9008689fc2ULL,
        0x52f2c81e46a38265ULL, 0xfd78072d04a832fdULL, 0x8cd7d5fa25359e94ULL, 0x4de71b7454cc29d2ULL,
        0x42eb60ad1eda6ac9ULL, 0x0aad37dfdbc09c3aULL, 0x81004b71e33cc191ULL, 0x44e6be345122803cULL,
        0x03fe8388ba1920dbULL, 0xf5d57c32150db008ULL, 0x49c8c4281af60c29ULL, 0x21edb518de701aeeULL,
        0x7fb63e418f06dc99ULL, 0xa4460d99c166d7b8ULL, 0x24dd5248ce520a83ULL, 0x5ec3ad712b928358ULL,
        0x15022a5fbd17930fULL, 0xa4f64a77d82570e3ULL, 0x12bc8d6915783712ULL, 0x498194c0fc620abbULL,
        0x38a2d9d255686c82ULL, 0x785c6bd9193e21f0ULL, 0xe4d5c81ab24a5484ULL, 0x56307860b2e20989ULL,
        0x429d55f78b4d74c4ULL, 0x22f1834643350131ULL, 0x1e60c24598c71fffULL, 0x59f2f014979983efULL,
        0x46a47d56eb494a44ULL, 0x3e22a854d636a18eULL, 0xb346e15274491c3bULL, 0x2ceafd4e5390cde7ULL,
        0xba8a8538be0d6675ULL, 0x4b9074bb50818e23ULL, 0xcbdab89085d304c3ULL, 0x61a24fe0e56192c4ULL,
        0xcb7615e6db525bcbULL, 0xdd7d8c35a567e4caULL, 0xe6b4153acafcdd69ULL, 0x2d668e097f3c9766ULL,
        0xa57e7e265ce55ef0ULL, 0x5d9f4e527cd4b967ULL, 0xfbc83606492fd1e5ULL, 0x090d52beb7c3f7aeULL,
        0x09b9515a1e7b4d7cULL, 0x1f266a2599da44c0ULL, 0xa1c49548e2c55504ULL, 0x7ef04287126f15ccULL,
        0xfed1659dbd30ef15ULL, 0x8b4ab9eec4e0277bULL, 0x884d6236a5df3291ULL, 0x1fd96ea6bf5cf788ULL,
        0x42a161981f190d9aULL, 0x61d849507e6052c1ULL, 0x9fe113bf285a2cd5ULL, 0x7c22d676dbad85d8ULL,
        0x82e770ed2bfbd27dULL, 0x4c05b2ece996f5a5ULL, 0xcd40a9c2b0900150ULL, 0x5895319213d9bf64ULL,
        0xe7cc5d703fea2e08ULL, 0xb50c491258e2188cULL, 0xcce30baa48205bf0ULL, 0x537c659ccfa32d62ULL,
        0x37b6623a98cfc088ULL, 0xfe9bed1fa4d6aca4ULL, 0x04d29b8e56a8d1b0ULL, 0x725f71c40b519575ULL,
        0x28c7f89cd0339ce6ULL, 0x8367b14469ddc18bULL, 0x883ada83a6a1652cULL, 0x585f1974034d6c17ULL,
        0x89cfb266f1b19188ULL, 0xe63b4863e7c35217ULL, 0xd88c9da6b4c0526aULL, 0x3e035c9df0954635ULL,
        0xdd9d5412fb45de9dULL, 0xdd684532e4cff40dULL, 0x4b5c999b151d671cULL, 0x2d8c2cc811e7f690ULL,
        0x7f54be1d90055d40ULL, 0xa464c5df464aaf40ULL, 0x33979624f0e917beULL, 0x2c018dc527356b30ULL,
        0xa5415024e330b3d4ULL, 0x73ff3d96691652d3ULL, 0x94ec42c4ef9b59f1ULL, 0x0747201618d08e5aULL,
        0x4d6ca48aca411c53ULL, 0x66415f2fcfa66119ULL, 0x9c4dd40051e227ffULL, 0x59810bc09a02f7ebULL,
        0x2a7eb171b3dc101dULL, 0x441c5ab99ffef68eULL, 0x32025c9b93b359eaULL, 0x5e8ce0a71e9d112fULL,
        0xbfcccb92429503fdULL, 0xd271ba752f095d55ULL, 0x345ead5e972d091eULL, 0x18c8df11a83103baULL,
        0x90cd949a9aed0f4cULL, 0xc5d1f4cb6660e37eULL, 0xb8cac52d56c52e0bULL, 0x6e42e400c5808e0dULL,
        0xa3b46966eeaefd23ULL, 0x0c4f1f0be39ecdcaULL, 0x189dc8c9d683a51dULL, 0x51f27f054c09351bULL,
        0x4c487ccd2a320682ULL, 0x587ea95bb3df1c96ULL, 0xc8ccf79e555cb8e8ULL, 0x547dc829a206d73dULL,
        0xb822a6cd80c39b06ULL, 0xe96d54732000d4c6ULL, 0x28535b6f91463b4dULL, 0x228f4660e2486e1dULL,
        0x98799538de8d3abfULL, 0x8cd8330045ebca6eULL, 0x79952a008221e738ULL, 0x4322e1a7535cd2bbULL,
        0xb114c11819d1801cULL, 0x2016e4d84f3f5ec7ULL, 0xdd0e2df409260f4cULL, 0x5ec362c0ae5f7266ULL,
        0xc0462b18b8b2b4eeULL, 0x7cc8d950274d1afbULL, 0xf25f7105436b02d2ULL, 0x43bbf8dcbff9ccd3ULL,
        0xb6ad1767a039e9dfULL, 0xb0714da8f69d3583ULL, 0x5e55fa18b42931f5ULL, 0x4ed5558f33c60961ULL,
        0x1fe37901c647a5ddULL, 0x593ddf1f8081d357ULL, 0x0249a4fd813fd7a6ULL, 0x69acca274e9caf61ULL,
        0x047ba3ea330721c9ULL, 0x83423fc20e7e1ea0ULL, 0x1df4c0af01314a60ULL, 0x09a62dab89289527ULL,
        0xa5b325a49cc6cb00ULL, 0xe94b5dc654b56cb6ULL, 0x3be28779adc994a0ULL, 0x4296e8f8ba3a4aadULL,
        0x328689761e451eabULL, 0x2e4d598bff59594aULL, 0x49b96853d7a7084aULL, 0x4980a319601420a8ULL,
        0x9565b9e12f552c42ULL, 0x8a5318db7100fe96ULL, 0x05c90b4d43add0d7ULL, 0x538b4cd66a5d4edaULL,
        0xf4e94fc3e89f039fULL, 0x592c9af26f618045ULL, 0x08a36eb5fd4b9550ULL, 0x25fffaf6c2ed1419ULL,
        0x34434459cc79d354ULL, 0xeeecbfb4b1d5476bULL, 0xddeb34a061615d99ULL, 0x5129cecceb64b773ULL,
        0xee43215894993520ULL, 0x772f9c7cf14c0b3bULL, 0xd2e2fce306bedad5ULL, 0x715f42b546f06a97ULL,
        0x434ecdceda5b5f1aULL, 0x0da17115a49741a9ULL, 0x680bd77c73edad2eULL, 0x487c02354edd9041ULL,
        0xb8efeff3a70ed9c4ULL, 0x56a32aa3e857e302ULL, 0xdf3a68bd48a2a5a0ULL, 0x07f650b73176c444ULL,
        0xe38b9b1626e0ccb1ULL, 0x79e053c18b09fb36ULL, 0x56d90319c9f94964ULL, 0x1ca941e7ac9ff5c4ULL,
        0x49c4df29162fa0bbULL, 0x8488cf3282b33305ULL, 0x95dfda14cabb437dULL, 0x3391f78264d5ad86ULL,
        0x729ae06ae2b5095dULL, 0xd58a58d73259a946ULL, 0xe9834262d13921edULL, 0x27fedafaa54bb592ULL,
        0xa99dc5b829ad48bbULL, 0x5f025742499ee260ULL, 0x802c8ecd5d7513fdULL, 0x78ceb3ef3f6dd938ULL,
        0xc342f44f8a135d94ULL, 0x7b9edb44828cdda3ULL, 0x9436d11a0537cfe7ULL, 0x5064b164ec1ab4c8ULL,
        0x7020eccfd37eb2fcULL, 0x1f31ea3ed90d25fcULL, 0x1b930d7bdfa1bb34ULL, 0x5344467a48113044ULL,
        0x70073170f25e6dfbULL, 0xe385dc1a50114cc8ULL, 0x2348698ac8fc4f00ULL, 0x2a77a55284dd40d8ULL,
        0xfe06afe0c98c6ce4ULL, 0xc235df96dddfd6e4ULL, 0x1428d01e33bf1ed3ULL, 0x785768ec9300bdafULL,
        0x9702e57a91deb63bULL, 0x61bdb8bfe5ce8b80ULL, 0x645b426f3d1d58acULL, 0x4804a82227a557bcULL,
        0x8e57048ab44d2601ULL, 0x68d6501a4b3a6935ULL, 0xc39c9ec3f9e1c293ULL, 0x4172f257d4de63e2ULL,
        0xd368b450330c6401ULL, 0x040d3017418f2391ULL, 0x2c34bb6090b7d90dULL, 0x16f649228fdfd51fULL,
        0xbea6818e2b928ef5ULL, 0xe28ccf91cdc11e72ULL, 0x594aaa68e77a36cdULL, 0x313034806c7ffd0fULL,
        0x8a9d27ac2249bd65ULL, 0x19a3b464018e9512ULL, 0xc26ccff352b37ec7ULL, 0x056f68341d797b21ULL,
        0x5e79d6757efd2327ULL, 0xfabdbcb6553afe15ULL, 0xd3e7222c6eaf5a60ULL, 0x7046c76d4dae743bULL,
        0x660be872b18d4a55ULL, 0x19992518574e1496ULL, 0xc103053a302bdcbbULL, 0x3ed8e9800b218e8eULL,
        0x7b0b9239fa75e03eULL, 0xefe9fb684633c083ULL, 0x98a35fbe391a7793ULL, 0x6065510fe2d0fe34ULL,
        0x55cb668548abad0cULL, 0xb4584548da87e527ULL, 0x2c43ecea0107c1ddULL, 0x526028809372de35ULL,
        0x3415c56af9213b1fULL, 0x5bee1a4d017e98dbULL, 0x13f6b105b5cf709bULL, 0x5ff20e3482b29ab6ULL,
        0x0aa29c75cc2e6c90ULL, 0xfc7d73ca3a70e206ULL, 0x899fc38fc4b5c515ULL, 0x250386b124ffc207ULL,
        0x54ea28d5ae3d2b56ULL, 0x9913149dd6de60ceULL, 0x16694fc58f06d6c1ULL, 0x46b23975eb018fc7ULL,
        0x470a6a0fb4b7b4e2ULL, 0x5d92475a8f7253deULL, 0xabeee5b52fbd3adbULL, 0x7fa20801a0806968ULL,
        0x76f3faf19f7714d2ULL, 0xb3e840c12f4660c3ULL, 0x0fb4cd8df212744eULL, 0x4b065a251d3a2dd2ULL,
        0x5cebde383d77cd4aULL, 0x6adf39df882c9cb1ULL, 0xa2dd242eb09af759ULL, 0x3147c0e50e5f6422ULL,
        0x164ca5101d1350dbULL, 0xf8d13479c33fc962ULL, 0xe640ce4d13e5da08ULL, 0x4bdee0c45061f8baULL,
        0xd7c46dc1a4edb1c9ULL, 0x5514d7b6437fd98aULL, 0x58942f6bb2a1c00bULL, 0x2dffb2ab1d70710eULL,
        0xccdfcf2fc18b6d68ULL, 0xa8ebcba8b7806167ULL, 0x980697f95e2937e3ULL, 0x02fbba1cd0126e8cULL
};

static void curve25519_ever64_base(u8 *out, const u8 *priv)
{
        u64 swap = 1;
        int i, j, k;
        u64 tmp[16 + 32 + 4];
        u64 *x1 = &tmp[0];
        u64 *z1 = &tmp[4];
        u64 *x2 = &tmp[8];
        u64 *z2 = &tmp[12];
        u64 *xz1 = &tmp[0];
        u64 *xz2 = &tmp[8];
        u64 *a = &tmp[0 + 16];
        u64 *b = &tmp[4 + 16];
        u64 *c = &tmp[8 + 16];
        u64 *ab = &tmp[0 + 16];
        u64 *abcd = &tmp[0 + 16];
        u64 *ef = &tmp[16 + 16];
        u64 *efgh = &tmp[16 + 16];
        u64 *key = &tmp[0 + 16 + 32];

        memcpy(key, priv, 32);
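        /* Clamp the scalar per RFC 7748: clear the three low bits and
         * bit 255, set bit 254.
         */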
        ((u8 *)key)[0] &= 248;
        ((u8 *)key)[31] = (((u8 *)key)[31] & 127) | 64;

        x1[0] = 1, x1[1] = x1[2] = x1[3] = 0;
        z1[0] = 1, z1[1] = z1[2] = z1[3] = 0;
        z2[0] = 1, z2[1] = z2[2] = z2[3] = 0;
        memcpy(x2, p_minus_s, sizeof(p_minus_s));

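        /* Fixed-base Montgomery ladder over scalar bits 3..254, one
         * precomputed table_ladder entry per bit. Bits 0-2 and 255 were
         * fixed by the clamping above, so they need no ladder steps.
         */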
        j = 3;
        for (i = 0; i < 4; ++i) {
                while (j < (const int[]){ 64, 64, 64, 63 }[i]) {
                        u64 bit = (key[i] >> j) & 1;
                        k = (64 * i + j - 3);
                        swap = swap ^ bit;
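                        /* Branch-free conditional swap keeps the ladder
                         * constant-time with respect to the secret scalar.
                         */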
                        cswap2(swap, xz1, xz2);
                        swap = bit;
                        fsub(b, x1, z1);
                        fadd(a, x1, z1);
                        fmul(c, &table_ladder[4 * k], b, ef);
                        fsub(b, a, c);
                        fadd(a, a, c);
                        fsqr2(ab, ab, efgh);
                        fmul2(xz1, xz2, ab, efgh);
                        ++j;
                }
                j = 0;
        }

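        /* Multiply by the cofactor 8 (three doublings) to account for the
         * three cleared low scalar bits the ladder skipped.
         */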
        point_double(xz1, abcd, efgh);
        point_double(xz1, abcd, efgh);
        point_double(xz1, abcd, efgh);
        encode_point(out, xz1);

        memzero_explicit(tmp, sizeof(tmp));
}

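/* Enabled at module init if and only if the CPU has both BMI2 and ADX. */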
static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2_adx);

void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
                     const u8 secret[CURVE25519_KEY_SIZE],
                     const u8 basepoint[CURVE25519_KEY_SIZE])
{
        if (static_branch_likely(&curve25519_use_bmi2_adx))
                curve25519_ever64(mypublic, secret, basepoint);
        else
                curve25519_generic(mypublic, secret, basepoint);
}
EXPORT_SYMBOL(curve25519_arch);

void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
                          const u8 secret[CURVE25519_KEY_SIZE])
{
        if (static_branch_likely(&curve25519_use_bmi2_adx))
                curve25519_ever64_base(pub, secret);
        else
                curve25519_generic(pub, secret, curve25519_base_point);
}
EXPORT_SYMBOL(curve25519_base_arch);

static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
                                 unsigned int len)
{
        u8 *secret = kpp_tfm_ctx(tfm);

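        /* A zero-length buffer requests a freshly generated secret;
         * otherwise the key must be full-size and must not equal the
         * all-zero point.
         */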
        if (!len)
                curve25519_generate_secret(secret);
        else if (len == CURVE25519_KEY_SIZE &&
                 crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
                memcpy(secret, buf, CURVE25519_KEY_SIZE);
        else
                return -EINVAL;
        return 0;
}

static int curve25519_generate_public_key(struct kpp_request *req)
{
        struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
        const u8 *secret = kpp_tfm_ctx(tfm);
        u8 buf[CURVE25519_KEY_SIZE];
        int copied, nbytes;

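        /* The public-key operation consumes no input scatterlist. */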
        if (req->src)
                return -EINVAL;

        curve25519_base_arch(buf, secret);

        /* might want less than we've got */
        nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
        copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
                                                                nbytes),
                                     buf, nbytes);
        if (copied != nbytes)
                return -EINVAL;
        return 0;
}

static int curve25519_compute_shared_secret(struct kpp_request *req)
{
        struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
        const u8 *secret = kpp_tfm_ctx(tfm);
        u8 public_key[CURVE25519_KEY_SIZE];
        u8 buf[CURVE25519_KEY_SIZE];
        int copied, nbytes;

        if (!req->src)
                return -EINVAL;

        copied = sg_copy_to_buffer(req->src,
                                   sg_nents_for_len(req->src,
                                                    CURVE25519_KEY_SIZE),
                                   public_key, CURVE25519_KEY_SIZE);
        if (copied != CURVE25519_KEY_SIZE)
                return -EINVAL;

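        /* Shared secret: our private scalar times the peer's public point. */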
        curve25519_arch(buf, secret, public_key);

        /* might want less than we've got */
        nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
        copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
                                                                nbytes),
                                     buf, nbytes);
        if (copied != nbytes)
                return -EINVAL;
        return 0;
}

static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
{
        return CURVE25519_KEY_SIZE;
}

static struct kpp_alg curve25519_alg = {
        .base.cra_name          = "curve25519",
        .base.cra_driver_name   = "curve25519-x86",
        .base.cra_priority      = 200,
        .base.cra_module        = THIS_MODULE,
        .base.cra_ctxsize       = CURVE25519_KEY_SIZE,

        .set_secret             = curve25519_set_secret,
        .generate_public_key    = curve25519_generate_public_key,
        .compute_shared_secret  = curve25519_compute_shared_secret,
        .max_size               = curve25519_max_size,
};

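/* Register the KPP algorithm only when the BMI2+ADX code path is usable;
 * the library interface above still works either way via the generic
 * fallback.
 */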
static int __init curve25519_mod_init(void)
{
        if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
                static_branch_enable(&curve25519_use_bmi2_adx);
        else
                return 0;
        return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
                crypto_register_kpp(&curve25519_alg) : 0;
}

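/* Mirror the init condition: the algorithm was registered only when the
 * static key was enabled, so only unregister in that case.
 */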
static void __exit curve25519_mod_exit(void)
{
        if (IS_REACHABLE(CONFIG_CRYPTO_KPP) &&
            static_branch_likely(&curve25519_use_bmi2_adx))
                crypto_unregister_kpp(&curve25519_alg);
}

module_init(curve25519_mod_init);
module_exit(curve25519_mod_exit);

MODULE_ALIAS_CRYPTO("curve25519");
MODULE_ALIAS_CRYPTO("curve25519-x86");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");