18c2ecf20Sopenharmony_ci/* gf128mul.h - GF(2^128) multiplication functions
28c2ecf20Sopenharmony_ci *
38c2ecf20Sopenharmony_ci * Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.
48c2ecf20Sopenharmony_ci * Copyright (c) 2006 Rik Snel <rsnel@cube.dyndns.org>
58c2ecf20Sopenharmony_ci *
68c2ecf20Sopenharmony_ci * Based on Dr Brian Gladman's (GPL'd) work published at
78c2ecf20Sopenharmony_ci * http://fp.gladman.plus.com/cryptography_technology/index.htm
88c2ecf20Sopenharmony_ci * See the original copyright notice below.
98c2ecf20Sopenharmony_ci *
108c2ecf20Sopenharmony_ci * This program is free software; you can redistribute it and/or modify it
118c2ecf20Sopenharmony_ci * under the terms of the GNU General Public License as published by the Free
128c2ecf20Sopenharmony_ci * Software Foundation; either version 2 of the License, or (at your option)
138c2ecf20Sopenharmony_ci * any later version.
148c2ecf20Sopenharmony_ci */
158c2ecf20Sopenharmony_ci/*
168c2ecf20Sopenharmony_ci ---------------------------------------------------------------------------
178c2ecf20Sopenharmony_ci Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.
188c2ecf20Sopenharmony_ci
198c2ecf20Sopenharmony_ci LICENSE TERMS
208c2ecf20Sopenharmony_ci
218c2ecf20Sopenharmony_ci The free distribution and use of this software in both source and binary
228c2ecf20Sopenharmony_ci form is allowed (with or without changes) provided that:
238c2ecf20Sopenharmony_ci
248c2ecf20Sopenharmony_ci   1. distributions of this source code include the above copyright
258c2ecf20Sopenharmony_ci      notice, this list of conditions and the following disclaimer;
268c2ecf20Sopenharmony_ci
278c2ecf20Sopenharmony_ci   2. distributions in binary form include the above copyright
288c2ecf20Sopenharmony_ci      notice, this list of conditions and the following disclaimer
298c2ecf20Sopenharmony_ci      in the documentation and/or other associated materials;
308c2ecf20Sopenharmony_ci
318c2ecf20Sopenharmony_ci   3. the copyright holder's name is not used to endorse products
328c2ecf20Sopenharmony_ci      built using this software without specific written permission.
338c2ecf20Sopenharmony_ci
348c2ecf20Sopenharmony_ci ALTERNATIVELY, provided that this notice is retained in full, this product
358c2ecf20Sopenharmony_ci may be distributed under the terms of the GNU General Public License (GPL),
368c2ecf20Sopenharmony_ci in which case the provisions of the GPL apply INSTEAD OF those given above.
378c2ecf20Sopenharmony_ci
388c2ecf20Sopenharmony_ci DISCLAIMER
398c2ecf20Sopenharmony_ci
408c2ecf20Sopenharmony_ci This software is provided 'as is' with no explicit or implied warranties
418c2ecf20Sopenharmony_ci in respect of its properties, including, but not limited to, correctness
428c2ecf20Sopenharmony_ci and/or fitness for purpose.
438c2ecf20Sopenharmony_ci ---------------------------------------------------------------------------
448c2ecf20Sopenharmony_ci Issue Date: 31/01/2006
458c2ecf20Sopenharmony_ci
468c2ecf20Sopenharmony_ci An implementation of field multiplication in Galois Field GF(2^128)
478c2ecf20Sopenharmony_ci*/
488c2ecf20Sopenharmony_ci
498c2ecf20Sopenharmony_ci#ifndef _CRYPTO_GF128MUL_H
508c2ecf20Sopenharmony_ci#define _CRYPTO_GF128MUL_H
518c2ecf20Sopenharmony_ci
528c2ecf20Sopenharmony_ci#include <asm/byteorder.h>
538c2ecf20Sopenharmony_ci#include <crypto/b128ops.h>
548c2ecf20Sopenharmony_ci#include <linux/slab.h>
558c2ecf20Sopenharmony_ci
568c2ecf20Sopenharmony_ci/* Comment by Rik:
578c2ecf20Sopenharmony_ci *
588c2ecf20Sopenharmony_ci * For some background on GF(2^128) see for example:
598c2ecf20Sopenharmony_ci * http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
608c2ecf20Sopenharmony_ci *
618c2ecf20Sopenharmony_ci * The elements of GF(2^128) := GF(2)[X]/(X^128-X^7-X^2-X^1-1) can
628c2ecf20Sopenharmony_ci * be mapped to computer memory in a variety of ways. Let's examine
638c2ecf20Sopenharmony_ci * three common cases.
648c2ecf20Sopenharmony_ci *
658c2ecf20Sopenharmony_ci * Take a look at the 16 binary octets below in memory order. The msb's
668c2ecf20Sopenharmony_ci * are left and the lsb's are right. char b[16] is an array and b[0] is
678c2ecf20Sopenharmony_ci * the first octet.
688c2ecf20Sopenharmony_ci *
698c2ecf20Sopenharmony_ci * 10000000 00000000 00000000 00000000 .... 00000000 00000000 00000000
708c2ecf20Sopenharmony_ci *   b[0]     b[1]     b[2]     b[3]          b[13]    b[14]    b[15]
718c2ecf20Sopenharmony_ci *
728c2ecf20Sopenharmony_ci * Every bit is a coefficient of some power of X. We can store the bits
738c2ecf20Sopenharmony_ci * in every byte in little-endian order and the bytes themselves also in
748c2ecf20Sopenharmony_ci * little endian order. I will call this lle (little-little-endian).
758c2ecf20Sopenharmony_ci * The above buffer represents the polynomial 1, and X^7+X^2+X^1+1 looks
768c2ecf20Sopenharmony_ci * like 11100001 00000000 .... 00000000 = { 0xE1, 0x00, }.
778c2ecf20Sopenharmony_ci * This format was originally implemented in gf128mul and is used
788c2ecf20Sopenharmony_ci * in GCM (Galois/Counter mode) and in ABL (Arbitrary Block Length).
798c2ecf20Sopenharmony_ci *
808c2ecf20Sopenharmony_ci * Another convention says: store the bits in bigendian order and the
818c2ecf20Sopenharmony_ci * bytes also. This is bbe (big-big-endian). Now the buffer above
828c2ecf20Sopenharmony_ci * represents X^127. X^7+X^2+X^1+1 looks like 00000000 .... 10000111,
838c2ecf20Sopenharmony_ci * b[15] = 0x87 and the rest is 0. LRW uses this convention and bbe
848c2ecf20Sopenharmony_ci * is partly implemented.
858c2ecf20Sopenharmony_ci *
868c2ecf20Sopenharmony_ci * Both of the above formats are easy to implement on big-endian
878c2ecf20Sopenharmony_ci * machines.
888c2ecf20Sopenharmony_ci *
898c2ecf20Sopenharmony_ci * XTS and EME (the latter of which is patent encumbered) use the ble
908c2ecf20Sopenharmony_ci * format (bits are stored in big endian order and the bytes in little
918c2ecf20Sopenharmony_ci * endian). The above buffer represents X^7 in this case and the
928c2ecf20Sopenharmony_ci * primitive polynomial is b[0] = 0x87.
938c2ecf20Sopenharmony_ci *
948c2ecf20Sopenharmony_ci * The common machine word-size is smaller than 128 bits, so to make
958c2ecf20Sopenharmony_ci * an efficient implementation we must split into machine word sizes.
968c2ecf20Sopenharmony_ci * This implementation uses 64-bit words for the moment. Machine
978c2ecf20Sopenharmony_ci * endianness comes into play. The lle format in relation to machine
988c2ecf20Sopenharmony_ci * endianness is discussed below by the original author of gf128mul Dr
998c2ecf20Sopenharmony_ci * Brian Gladman.
1008c2ecf20Sopenharmony_ci *
1018c2ecf20Sopenharmony_ci * Let's look at the bbe and ble format on a little endian machine.
1028c2ecf20Sopenharmony_ci *
1038c2ecf20Sopenharmony_ci * bbe on a little endian machine u32 x[4]:
1048c2ecf20Sopenharmony_ci *
1058c2ecf20Sopenharmony_ci *  MS            x[0]           LS  MS            x[1]		  LS
1068c2ecf20Sopenharmony_ci *  ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
1078c2ecf20Sopenharmony_ci *  103..96 111.104 119.112 127.120  71...64 79...72 87...80 95...88
1088c2ecf20Sopenharmony_ci *
1098c2ecf20Sopenharmony_ci *  MS            x[2]           LS  MS            x[3]		  LS
1108c2ecf20Sopenharmony_ci *  ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
1118c2ecf20Sopenharmony_ci *  39...32 47...40 55...48 63...56  07...00 15...08 23...16 31...24
1128c2ecf20Sopenharmony_ci *
1138c2ecf20Sopenharmony_ci * ble on a little endian machine
1148c2ecf20Sopenharmony_ci *
1158c2ecf20Sopenharmony_ci *  MS            x[0]           LS  MS            x[1]		  LS
1168c2ecf20Sopenharmony_ci *  ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
1178c2ecf20Sopenharmony_ci *  31...24 23...16 15...08 07...00  63...56 55...48 47...40 39...32
1188c2ecf20Sopenharmony_ci *
1198c2ecf20Sopenharmony_ci *  MS            x[2]           LS  MS            x[3]		  LS
1208c2ecf20Sopenharmony_ci *  ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
 *  95...88 87...80 79...72 71...64  127.120 119.112 111.104 103..96
1228c2ecf20Sopenharmony_ci *
1238c2ecf20Sopenharmony_ci * Multiplications in GF(2^128) are mostly bit-shifts, so you see why
1248c2ecf20Sopenharmony_ci * ble (and lbe also) are easier to implement on a little-endian
1258c2ecf20Sopenharmony_ci * machine than on a big-endian machine. The converse holds for bbe
1268c2ecf20Sopenharmony_ci * and lle.
1278c2ecf20Sopenharmony_ci *
1288c2ecf20Sopenharmony_ci * Note: to have good alignment, it seems to me that it is sufficient
1298c2ecf20Sopenharmony_ci * to keep elements of GF(2^128) in type u64[2]. On 32-bit wordsize
 * machines this will automatically be aligned to wordsize and on a 64-bit
1318c2ecf20Sopenharmony_ci * machine also.
1328c2ecf20Sopenharmony_ci */
1338c2ecf20Sopenharmony_ci/*	Multiply a GF(2^128) field element by x. Field elements are
1348c2ecf20Sopenharmony_ci    held in arrays of bytes in which field bits 8n..8n + 7 are held in
1358c2ecf20Sopenharmony_ci    byte[n], with lower indexed bits placed in the more numerically
1368c2ecf20Sopenharmony_ci    significant bit positions within bytes.
1378c2ecf20Sopenharmony_ci
1388c2ecf20Sopenharmony_ci    On little endian machines the bit indexes translate into the bit
1398c2ecf20Sopenharmony_ci    positions within four 32-bit words in the following way
1408c2ecf20Sopenharmony_ci
1418c2ecf20Sopenharmony_ci    MS            x[0]           LS  MS            x[1]		  LS
1428c2ecf20Sopenharmony_ci    ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
1438c2ecf20Sopenharmony_ci    24...31 16...23 08...15 00...07  56...63 48...55 40...47 32...39
1448c2ecf20Sopenharmony_ci
1458c2ecf20Sopenharmony_ci    MS            x[2]           LS  MS            x[3]		  LS
1468c2ecf20Sopenharmony_ci    ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
1478c2ecf20Sopenharmony_ci    88...95 80...87 72...79 64...71  120.127 112.119 104.111 96..103
1488c2ecf20Sopenharmony_ci
1498c2ecf20Sopenharmony_ci    On big endian machines the bit indexes translate into the bit
1508c2ecf20Sopenharmony_ci    positions within four 32-bit words in the following way
1518c2ecf20Sopenharmony_ci
1528c2ecf20Sopenharmony_ci    MS            x[0]           LS  MS            x[1]		  LS
1538c2ecf20Sopenharmony_ci    ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
1548c2ecf20Sopenharmony_ci    00...07 08...15 16...23 24...31  32...39 40...47 48...55 56...63
1558c2ecf20Sopenharmony_ci
1568c2ecf20Sopenharmony_ci    MS            x[2]           LS  MS            x[3]		  LS
1578c2ecf20Sopenharmony_ci    ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
1588c2ecf20Sopenharmony_ci    64...71 72...79 80...87 88...95  96..103 104.111 112.119 120.127
1598c2ecf20Sopenharmony_ci*/
1608c2ecf20Sopenharmony_ci
/*
 * A slow generic version of gf_mul, implemented for the lle and bbe
 * representations.  Each function multiplies *a by *b and stores the
 * result back in *a; *b is only read.
 */
void gf128mul_lle(be128 *a, const be128 *b);

void gf128mul_bbe(be128 *a, const be128 *b);
1668c2ecf20Sopenharmony_ci
1678c2ecf20Sopenharmony_ci/*
1688c2ecf20Sopenharmony_ci * The following functions multiply a field element by x in
1698c2ecf20Sopenharmony_ci * the polynomial field representation.  They use 64-bit word operations
1708c2ecf20Sopenharmony_ci * to gain speed but compensate for machine endianness and hence work
1718c2ecf20Sopenharmony_ci * correctly on both styles of machine.
1728c2ecf20Sopenharmony_ci *
1738c2ecf20Sopenharmony_ci * They are defined here for performance.
1748c2ecf20Sopenharmony_ci */
1758c2ecf20Sopenharmony_ci
1768c2ecf20Sopenharmony_cistatic inline u64 gf128mul_mask_from_bit(u64 x, int which)
1778c2ecf20Sopenharmony_ci{
1788c2ecf20Sopenharmony_ci	/* a constant-time version of 'x & ((u64)1 << which) ? (u64)-1 : 0' */
1798c2ecf20Sopenharmony_ci	return ((s64)(x << (63 - which)) >> 63);
1808c2ecf20Sopenharmony_ci}
1818c2ecf20Sopenharmony_ci
1828c2ecf20Sopenharmony_cistatic inline void gf128mul_x_lle(be128 *r, const be128 *x)
1838c2ecf20Sopenharmony_ci{
1848c2ecf20Sopenharmony_ci	u64 a = be64_to_cpu(x->a);
1858c2ecf20Sopenharmony_ci	u64 b = be64_to_cpu(x->b);
1868c2ecf20Sopenharmony_ci
1878c2ecf20Sopenharmony_ci	/* equivalent to gf128mul_table_le[(b << 7) & 0xff] << 48
1888c2ecf20Sopenharmony_ci	 * (see crypto/gf128mul.c): */
1898c2ecf20Sopenharmony_ci	u64 _tt = gf128mul_mask_from_bit(b, 0) & ((u64)0xe1 << 56);
1908c2ecf20Sopenharmony_ci
1918c2ecf20Sopenharmony_ci	r->b = cpu_to_be64((b >> 1) | (a << 63));
1928c2ecf20Sopenharmony_ci	r->a = cpu_to_be64((a >> 1) ^ _tt);
1938c2ecf20Sopenharmony_ci}
1948c2ecf20Sopenharmony_ci
1958c2ecf20Sopenharmony_cistatic inline void gf128mul_x_bbe(be128 *r, const be128 *x)
1968c2ecf20Sopenharmony_ci{
1978c2ecf20Sopenharmony_ci	u64 a = be64_to_cpu(x->a);
1988c2ecf20Sopenharmony_ci	u64 b = be64_to_cpu(x->b);
1998c2ecf20Sopenharmony_ci
2008c2ecf20Sopenharmony_ci	/* equivalent to gf128mul_table_be[a >> 63] (see crypto/gf128mul.c): */
2018c2ecf20Sopenharmony_ci	u64 _tt = gf128mul_mask_from_bit(a, 63) & 0x87;
2028c2ecf20Sopenharmony_ci
2038c2ecf20Sopenharmony_ci	r->a = cpu_to_be64((a << 1) | (b >> 63));
2048c2ecf20Sopenharmony_ci	r->b = cpu_to_be64((b << 1) ^ _tt);
2058c2ecf20Sopenharmony_ci}
2068c2ecf20Sopenharmony_ci
2078c2ecf20Sopenharmony_ci/* needed by XTS */
2088c2ecf20Sopenharmony_cistatic inline void gf128mul_x_ble(le128 *r, const le128 *x)
2098c2ecf20Sopenharmony_ci{
2108c2ecf20Sopenharmony_ci	u64 a = le64_to_cpu(x->a);
2118c2ecf20Sopenharmony_ci	u64 b = le64_to_cpu(x->b);
2128c2ecf20Sopenharmony_ci
2138c2ecf20Sopenharmony_ci	/* equivalent to gf128mul_table_be[b >> 63] (see crypto/gf128mul.c): */
2148c2ecf20Sopenharmony_ci	u64 _tt = gf128mul_mask_from_bit(a, 63) & 0x87;
2158c2ecf20Sopenharmony_ci
2168c2ecf20Sopenharmony_ci	r->a = cpu_to_le64((a << 1) | (b >> 63));
2178c2ecf20Sopenharmony_ci	r->b = cpu_to_le64((b << 1) ^ _tt);
2188c2ecf20Sopenharmony_ci}
2198c2ecf20Sopenharmony_ci
2208c2ecf20Sopenharmony_ci/* 4k table optimization */
2218c2ecf20Sopenharmony_ci
/*
 * Lookup table used by the 4k optimization: 256 be128 entries.
 * NOTE(review): 256 entries of 16 bytes would give the 4 KiB the name
 * refers to (assuming be128 is 16 bytes); how the table is filled and
 * indexed is defined by the implementation, not by this header.
 */
struct gf128mul_4k {
	be128 t[256];
};
2258c2ecf20Sopenharmony_ci
/*
 * 4k-table interface, one variant each for the lle and bbe representations.
 * NOTE(review): the implementations are not in this header (presumably
 * crypto/gf128mul.c); the summaries below are inferred from the names and
 * signatures and should be confirmed there.
 *
 * gf128mul_init_4k_*(): take the constant factor *g and return a table
 *	for it; presumably the result must be released with
 *	gf128mul_free_4k() and may be NULL on failure -- TODO confirm.
 * gf128mul_4k_*(): update *a in place using table *t, which is only read.
 * gf128mul_x8_ble(): writes *r from *x; by its name, presumably a
 *	multiplication by x^8 in the ble representation -- TODO confirm.
 */
struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g);
struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g);
void gf128mul_4k_lle(be128 *a, const struct gf128mul_4k *t);
void gf128mul_4k_bbe(be128 *a, const struct gf128mul_4k *t);
void gf128mul_x8_ble(le128 *r, const le128 *x);
/*
 * Release a 4k multiplication table by handing it to kfree_sensitive().
 * NOTE(review): kfree_sensitive() rather than plain kfree() is presumably
 * chosen so the table contents are wiped before the memory is returned --
 * confirm against the slab API documentation.
 */
static inline void gf128mul_free_4k(struct gf128mul_4k *t)
{
	kfree_sensitive(t);
}
2358c2ecf20Sopenharmony_ci
2368c2ecf20Sopenharmony_ci
2378c2ecf20Sopenharmony_ci/* 64k table optimization, implemented for bbe */
2388c2ecf20Sopenharmony_ci
/*
 * Table set used by the 64k optimization: 16 pointers to 4k tables.
 * NOTE(review): 16 tables of 4 KiB each would give the 64 KiB the name
 * refers to; which table serves which part of the operand is defined by
 * the implementation, not by this header.
 */
struct gf128mul_64k {
	struct gf128mul_4k *t[16];
};
2428c2ecf20Sopenharmony_ci
/*
 * Usage: first initialize a table with the constant factor you want to
 * multiply by (gf128mul_init_64k_bbe), then call gf128mul_64k_bbe with
 * the other factor as the first argument and the table as the second.
 * Afterwards, the result is stored in *a.
 *
 * NOTE(review): gf128mul_free_64k() is presumably the matching release
 * function for a table returned by gf128mul_init_64k_bbe(); its
 * implementation is not visible in this header -- confirm.
 */
struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g);
void gf128mul_free_64k(struct gf128mul_64k *t);
void gf128mul_64k_bbe(be128 *a, const struct gf128mul_64k *t);
2518c2ecf20Sopenharmony_ci
2528c2ecf20Sopenharmony_ci#endif /* _CRYPTO_GF128MUL_H */
253