1 /* gf128mul.c - GF(2^128) multiplication functions
3 * Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.
4 * Copyright (c) 2006, Rik Snel <rsnel@cube.dyndns.org>
6 * Based on Dr Brian Gladman's (GPL'd) work published at
7 * http://gladman.plushost.co.uk/oldsite/cryptography_technology/index.php
8 * See the original copyright notice below.
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU General Public License as published by the Free
12 * Software Foundation; either version 2 of the License, or (at your option)
17 ---------------------------------------------------------------------------
18 Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.
22 The free distribution and use of this software in both source and binary
23 form is allowed (with or without changes) provided that:
25 1. distributions of this source code include the above copyright
26 notice, this list of conditions and the following disclaimer;
28 2. distributions in binary form include the above copyright
29 notice, this list of conditions and the following disclaimer
30 in the documentation and/or other associated materials;
32 3. the copyright holder's name is not used to endorse products
33 built using this software without specific written permission.
35 ALTERNATIVELY, provided that this notice is retained in full, this product
36 may be distributed under the terms of the GNU General Public License (GPL),
37 in which case the provisions of the GPL apply INSTEAD OF those given above.
41 This software is provided 'as is' with no explicit or implied warranties
42 in respect of its properties, including, but not limited to, correctness
43 and/or fitness for purpose.
44 ---------------------------------------------------------------------------
47 This file provides fast multiplication in GF(2^128) as required by several
48 cryptographic authentication modes
51 #include <crypto/gf128mul.h>
52 #include <linux/kernel.h>
53 #include <linux/module.h>
54 #include <linux/slab.h>
56 #define gf128mul_dat(q) { \
57 q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\
58 q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\
59 q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\
60 q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\
61 q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\
62 q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\
63 q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\
64 q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\
65 q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\
66 q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\
67 q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\
68 q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\
69 q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\
70 q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\
71 q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\
72 q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\
73 q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\
74 q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\
75 q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\
76 q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\
77 q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\
78 q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\
79 q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\
80 q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\
81 q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\
82 q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\
83 q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\
84 q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\
85 q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\
86 q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\
87 q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\
88 q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \
91 /* Given the value i in 0..255 as the byte overflow when a field element
92 in GHASH is multiplied by x^8, this function will return the values that
93 are generated in the lo 16-bit word of the field value by applying the
94 modular polynomial. The values lo_byte and hi_byte are returned via the
95 macro xp_fun(lo_byte, hi_byte) so that the values can be assembled into
96 memory as required by a suitable definition of this macro operating on
100 #define xda_bbe(i) ( \
101 (i & 0x80 ? 0x4380 : 0) ^ (i & 0x40 ? 0x21c0 : 0) ^ \
102 (i & 0x20 ? 0x10e0 : 0) ^ (i & 0x10 ? 0x0870 : 0) ^ \
103 (i & 0x08 ? 0x0438 : 0) ^ (i & 0x04 ? 0x021c : 0) ^ \
104 (i & 0x02 ? 0x010e : 0) ^ (i & 0x01 ? 0x0087 : 0) \
107 #define xda_lle(i) ( \
108 (i & 0x80 ? 0xe100 : 0) ^ (i & 0x40 ? 0x7080 : 0) ^ \
109 (i & 0x20 ? 0x3840 : 0) ^ (i & 0x10 ? 0x1c20 : 0) ^ \
110 (i & 0x08 ? 0x0e10 : 0) ^ (i & 0x04 ? 0x0708 : 0) ^ \
111 (i & 0x02 ? 0x0384 : 0) ^ (i & 0x01 ? 0x01c2 : 0) \
114 static const u16 gf128mul_table_lle[256] = gf128mul_dat(xda_lle);
115 static const u16 gf128mul_table_bbe[256] = gf128mul_dat(xda_bbe);
118 * The following functions multiply a field element by x or by x^8 in
119 * the polynomial field representation. They use 64-bit word operations
120 * to gain speed but compensate for machine endianness and hence work
121 * correctly on both styles of machine.
124 static void gf128mul_x_lle(be128 *r, const be128 *x)
126 u64 a = be64_to_cpu(x->a);
127 u64 b = be64_to_cpu(x->b);
128 u64 _tt = gf128mul_table_lle[(b << 7) & 0xff];
130 r->b = cpu_to_be64((b >> 1) | (a << 63));
131 r->a = cpu_to_be64((a >> 1) ^ (_tt << 48));
134 static void gf128mul_x_bbe(be128 *r, const be128 *x)
136 u64 a = be64_to_cpu(x->a);
137 u64 b = be64_to_cpu(x->b);
138 u64 _tt = gf128mul_table_bbe[a >> 63];
140 r->a = cpu_to_be64((a << 1) | (b >> 63));
141 r->b = cpu_to_be64((b << 1) ^ _tt);
144 void gf128mul_x_ble(be128 *r, const be128 *x)
146 u64 a = le64_to_cpu(x->a);
147 u64 b = le64_to_cpu(x->b);
148 u64 _tt = gf128mul_table_bbe[b >> 63];
150 r->a = cpu_to_le64((a << 1) ^ _tt);
151 r->b = cpu_to_le64((b << 1) | (a >> 63));
153 EXPORT_SYMBOL(gf128mul_x_ble);
155 static void gf128mul_x8_lle(be128 *x)
157 u64 a = be64_to_cpu(x->a);
158 u64 b = be64_to_cpu(x->b);
159 u64 _tt = gf128mul_table_lle[b & 0xff];
161 x->b = cpu_to_be64((b >> 8) | (a << 56));
162 x->a = cpu_to_be64((a >> 8) ^ (_tt << 48));
165 static void gf128mul_x8_bbe(be128 *x)
167 u64 a = be64_to_cpu(x->a);
168 u64 b = be64_to_cpu(x->b);
169 u64 _tt = gf128mul_table_bbe[a >> 56];
171 x->a = cpu_to_be64((a << 8) | (b >> 56));
172 x->b = cpu_to_be64((b << 8) ^ _tt);
175 void gf128mul_lle(be128 *r, const be128 *b)
181 for (i = 0; i < 7; ++i)
182 gf128mul_x_lle(&p[i + 1], &p[i]);
184 memset(r, 0, sizeof(*r));
186 u8 ch = ((u8 *)b)[15 - i];
189 be128_xor(r, r, &p[0]);
191 be128_xor(r, r, &p[1]);
193 be128_xor(r, r, &p[2]);
195 be128_xor(r, r, &p[3]);
197 be128_xor(r, r, &p[4]);
199 be128_xor(r, r, &p[5]);
201 be128_xor(r, r, &p[6]);
203 be128_xor(r, r, &p[7]);
211 EXPORT_SYMBOL(gf128mul_lle);
213 void gf128mul_bbe(be128 *r, const be128 *b)
219 for (i = 0; i < 7; ++i)
220 gf128mul_x_bbe(&p[i + 1], &p[i]);
222 memset(r, 0, sizeof(*r));
224 u8 ch = ((u8 *)b)[i];
227 be128_xor(r, r, &p[7]);
229 be128_xor(r, r, &p[6]);
231 be128_xor(r, r, &p[5]);
233 be128_xor(r, r, &p[4]);
235 be128_xor(r, r, &p[3]);
237 be128_xor(r, r, &p[2]);
239 be128_xor(r, r, &p[1]);
241 be128_xor(r, r, &p[0]);
249 EXPORT_SYMBOL(gf128mul_bbe);
251 /* This version uses 64k bytes of table space.
252 A 16 byte buffer has to be multiplied by a 16 byte key
253 value in GF(2^128). If we consider a GF(2^128) value in
254 the buffer's lowest byte, we can construct a table of
255 the 256 16 byte values that result from the 256 values
256 of this byte. This requires 4096 bytes. But we also
257 need tables for each of the 16 higher bytes in the
258 buffer as well, which makes 64 kbytes in total.
260 /* additional explanation
261 * t[0][BYTE] contains g*BYTE
262 * t[1][BYTE] contains g*x^8*BYTE
264 * t[15][BYTE] contains g*x^120*BYTE */
265 struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g)
267 struct gf128mul_64k *t;
270 t = kzalloc(sizeof(*t), GFP_KERNEL);
274 for (i = 0; i < 16; i++) {
275 t->t[i] = kzalloc(sizeof(*t->t[i]), GFP_KERNEL);
277 gf128mul_free_64k(t);
284 for (j = 1; j <= 64; j <<= 1)
285 gf128mul_x_bbe(&t->t[0]->t[j + j], &t->t[0]->t[j]);
288 for (j = 2; j < 256; j += j)
289 for (k = 1; k < j; ++k)
290 be128_xor(&t->t[i]->t[j + k],
291 &t->t[i]->t[j], &t->t[i]->t[k]);
296 for (j = 128; j > 0; j >>= 1) {
297 t->t[i]->t[j] = t->t[i - 1]->t[j];
298 gf128mul_x8_bbe(&t->t[i]->t[j]);
305 EXPORT_SYMBOL(gf128mul_init_64k_bbe);
307 void gf128mul_free_64k(struct gf128mul_64k *t)
311 for (i = 0; i < 16; i++)
315 EXPORT_SYMBOL(gf128mul_free_64k);
317 void gf128mul_64k_bbe(be128 *a, struct gf128mul_64k *t)
323 *r = t->t[0]->t[ap[15]];
324 for (i = 1; i < 16; ++i)
325 be128_xor(r, r, &t->t[i]->t[ap[15 - i]]);
328 EXPORT_SYMBOL(gf128mul_64k_bbe);
330 /* This version uses 4k bytes of table space.
331 A 16 byte buffer has to be multiplied by a 16 byte key
332 value in GF(2^128). If we consider a GF(2^128) value in a
333 single byte, we can construct a table of the 256 16 byte
334 values that result from the 256 values of this byte.
335 This requires 4096 bytes. If we take the highest byte in
336 the buffer and use this table to get the result, we then
337 have to multiply by x^120 to get the final value. For the
338 next highest byte the result has to be multiplied by x^112
339 and so on. But we can do this by accumulating the result
340 in an accumulator starting with the result for the top
341 byte. We repeatedly multiply the accumulator value by
342 x^8 and then add in (i.e. xor) the 16 bytes of the next
343 lower byte in the buffer, stopping when we reach the
344 lowest byte. This requires a 4096 byte table.
346 struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g)
348 struct gf128mul_4k *t;
351 t = kzalloc(sizeof(*t), GFP_KERNEL);
356 for (j = 64; j > 0; j >>= 1)
357 gf128mul_x_lle(&t->t[j], &t->t[j+j]);
359 for (j = 2; j < 256; j += j)
360 for (k = 1; k < j; ++k)
361 be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);
366 EXPORT_SYMBOL(gf128mul_init_4k_lle);
368 struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g)
370 struct gf128mul_4k *t;
373 t = kzalloc(sizeof(*t), GFP_KERNEL);
378 for (j = 1; j <= 64; j <<= 1)
379 gf128mul_x_bbe(&t->t[j + j], &t->t[j]);
381 for (j = 2; j < 256; j += j)
382 for (k = 1; k < j; ++k)
383 be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);
388 EXPORT_SYMBOL(gf128mul_init_4k_bbe);
390 void gf128mul_4k_lle(be128 *a, struct gf128mul_4k *t)
399 be128_xor(r, r, &t->t[ap[i]]);
403 EXPORT_SYMBOL(gf128mul_4k_lle);
405 void gf128mul_4k_bbe(be128 *a, struct gf128mul_4k *t)
414 be128_xor(r, r, &t->t[ap[i]]);
418 EXPORT_SYMBOL(gf128mul_4k_bbe);
420 MODULE_LICENSE("GPL");
421 MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)");