IBR-DTNSuite 0.10
gf128mul.h
/*
 ---------------------------------------------------------------------------
 Copyright (c) 1998-2006, Brian Gladman, Worcester, UK. All rights reserved.

 LICENSE TERMS

 The free distribution and use of this software in both source and binary
 form is allowed (with or without changes) provided that:

   1. distributions of this source code include the above copyright
      notice, this list of conditions and the following disclaimer;

   2. distributions in binary form include the above copyright
      notice, this list of conditions and the following disclaimer
      in the documentation and/or other associated materials;

   3. the copyright holder's name is not used to endorse products
      built using this software without specific written permission.

 ALTERNATIVELY, provided that this notice is retained in full, this product
 may be distributed under the terms of the GNU General Public License (GPL),
 in which case the provisions of the GPL apply INSTEAD OF those given above.

 DISCLAIMER

 This software is provided 'as is' with no explicit or implied warranties
 in respect of its properties, including, but not limited to, correctness
 and/or fitness for purpose.
 ---------------------------------------------------------------------------
 Issue Date: 13/10/2006

 An implementation of field multiplication in Galois Field GF(128)
*/

#ifndef GF128MUL_H
#define GF128MUL_H

#include <stdlib.h>
#include <string.h>

#include "mode_hdr.h"

/* Table sizes for GF(128) multiply. Larger tables normally give
   higher speed, but cache loading effects may change this. Only
   one table size (or none at all) should be defined here.
*/

#if 0
#  define TABLES_64K
#endif
#if 1
#  define TABLES_8K
#endif
#if 0
#  define TABLES_4K
#endif
#if 0
#  define TABLES_256
#endif

/* The use of inlines is preferred, but the code blocks can also be
   expanded in place using 'defines'. The latter approach typically
   generates a LOT of code and is not recommended.
*/
#if 0
#  define USE_INLINES
#endif

/* Speed-critical loops can be unrolled to gain speed, at the cost
   of more memory.
*/
#if 0
#  define UNROLL_LOOPS
#endif

/* Multiply a GF(128) field element by x. Field elements are held in
   arrays of bytes in which field bits 8n..8n + 7 are held in byte[n],
   with lower indexed bits placed in the more numerically significant
   bit positions within bytes.

   On little endian machines the bit indexes translate into the bit
   positions within four 32-bit words in the following way

   MS            x[0]           LS    MS            x[1]           LS
   ms   ls ms   ls ms   ls ms   ls    ms   ls ms   ls ms   ls ms   ls
   24...31 16...23 08...15 00...07    56...63 48...55 40...47 32...39

   MS            x[2]           LS    MS            x[3]           LS
   ms   ls ms   ls ms   ls ms   ls    ms   ls ms   ls ms   ls ms   ls
   88...95 80...87 72...79 64...71    120.127 112.119 104.111 96..103

   On big endian machines the bit indexes translate into the bit
   positions within four 32-bit words in the following way

   MS            x[0]           LS    MS            x[1]           LS
   ms   ls ms   ls ms   ls ms   ls    ms   ls ms   ls ms   ls ms   ls
   00...07 08...15 16...23 24...31    32...39 40...47 48...55 56...63

   MS            x[2]           LS    MS            x[3]           LS
   ms   ls ms   ls ms   ls ms   ls    ms   ls ms   ls ms   ls ms   ls
   64...71 72...79 80...87 88...95    96..103 104.111 112.119 120.127
*/

#define GF_BYTE_LEN 16
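
/* Example (illustrative sketch, not part of the original header; the
   buffer names are hypothetical and uint_8t comes from mode_hdr.h):
   with the bit ordering above, the field element '1' (field bit 0 set)
   is stored as 0x80 in byte 0, and multiplying by x moves each field
   bit one index higher, i.e. a byte-level right shift with the
   reduction constant 0xe1 folded into byte 0 when bit 127 falls out:

       uint_8t one[GF_BYTE_LEN] = { 0x80 };   // the element '1'
       uint_8t r[GF_BYTE_LEN];
       mul_x(r, one);                         // r = the element 'x'
       // now r[0] == 0x40 and all other bytes of r are zero
*/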

#if defined( USE_INLINES )
#  if defined( _MSC_VER )
#    define gf_inline __inline
#  elif defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
#    define gf_inline static inline
#  else
#    define gf_inline static
#  endif
#endif

#if defined(__cplusplus)
extern "C"
{
#endif

/* These functions multiply a field element by x, by x^4 and by x^8
   in the polynomial field representation. They use 32-bit word
   operations to gain speed but compensate for machine endianness
   and hence work correctly on both styles of machine.
*/
extern const unsigned short gf_tab[256];

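/* Example (illustrative sketch, not part of the original header; 'a',
   'u' and 'v' are hypothetical, suitably aligned 16 byte buffers):
   since mul_x4 multiplies by x^4 in place, applying it twice must
   agree with a single mul_x8 -- a cheap sanity check when porting:

       uint_8t u[GF_BYTE_LEN], v[GF_BYTE_LEN];
       memcpy(u, a, GF_BYTE_LEN); memcpy(v, a, GF_BYTE_LEN);
       mul_x4(u); mul_x4(u);               // u = a * x^8
       mul_x8(v);                          // v = a * x^8
       // memcmp(u, v, GF_BYTE_LEN) == 0 must hold
*/
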
#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN

/* This section is not needed as GF(128) multiplication is now implemented
   but is left in place as it provides a template for an alternative little
   endian implementation approach based on conversion to and from big endian
   format
*/
#if 0

/* This is a template for mul_x. The mul_x4 and mul_x8 little endian
   alternative implementations (and their defined versions) follow the
   big endian functions below in the same way.
*/

gf_inline void mul_x(void *r, const void *x)
{   uint_32t _tt;
    bswap32_block(r, x, 4);
    _tt = gf_tab[(ui32_ptr(r)[3] << 7) & 0xff];
    ui32_ptr(r)[3] = (ui32_ptr(r)[3] >> 1) | (ui32_ptr(r)[2] << 31);
    ui32_ptr(r)[2] = (ui32_ptr(r)[2] >> 1) | (ui32_ptr(r)[1] << 31);
    ui32_ptr(r)[1] = (ui32_ptr(r)[1] >> 1) | (ui32_ptr(r)[0] << 31);
    ui32_ptr(r)[0] = (ui32_ptr(r)[0] >> 1) ^ bswap_32(_tt);
    bswap32_block(r, r, 4);
}

#endif

#define VERSION_1

#define MSK_80 (0x80 * (unit_cast(BFR_UNIT,-1) / 0xff))
#define MSK_F0 (0xf0 * (unit_cast(BFR_UNIT,-1) / 0xff))

#if defined( USE_INLINES )

#if BFR_UNIT == 64

    gf_inline void mul_x(void *r, const void *x)
    {   uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 49) & MSK_80];

        ui64_ptr(r)[1] = (ui64_ptr(x)[1] >> 1) & ~MSK_80 | ((ui64_ptr(x)[1] << 15) | (ui64_ptr(x)[0] >> 49)) & MSK_80;
        ui64_ptr(r)[0] = ((ui64_ptr(x)[0] >> 1) & ~MSK_80 | (ui64_ptr(x)[0] << 15) & MSK_80) ^ _tt;
    }

    #if defined( VERSION_1 )

    gf_inline void mul_x4(void *x)
    {   uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & MSK_F0];

        ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui64_ptr(x)[1] << 12) | (ui64_ptr(x)[0] >> 52)) & MSK_F0;
        ui64_ptr(x)[0] = ((ui64_ptr(x)[0] >> 4) & ~MSK_F0 | (ui64_ptr(x)[0] << 12) & MSK_F0) ^ _tt;
    }

    #else

    gf_inline void mul_x4(void *x)
    {   uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & 0xf0];
        bswap64_block(x, x, 2);
        ui64_ptr(x)[1] = bswap_64((ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60));
        ui64_ptr(x)[0] = bswap_64((ui64_ptr(x)[0] >> 4)) ^ _tt;
    }

    #endif

    gf_inline void mul_x8(void *x)
    {   uint_64t _tt = gf_tab[ui64_ptr(x)[1] >> 56];
        ui64_ptr(x)[1] = (ui64_ptr(x)[1] << 8) | (ui64_ptr(x)[0] >> 56);
        ui64_ptr(x)[0] = (ui64_ptr(x)[0] << 8) ^ _tt;
    }

#elif BFR_UNIT == 32

    gf_inline void mul_x(void *r, const void *x)
    {   uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 17) & MSK_80];

        ui32_ptr(r)[3] = (ui32_ptr(x)[3] >> 1) & ~MSK_80 | ((ui32_ptr(x)[3] << 15) | (ui32_ptr(x)[2] >> 17)) & MSK_80;
        ui32_ptr(r)[2] = (ui32_ptr(x)[2] >> 1) & ~MSK_80 | ((ui32_ptr(x)[2] << 15) | (ui32_ptr(x)[1] >> 17)) & MSK_80;
        ui32_ptr(r)[1] = (ui32_ptr(x)[1] >> 1) & ~MSK_80 | ((ui32_ptr(x)[1] << 15) | (ui32_ptr(x)[0] >> 17)) & MSK_80;
        ui32_ptr(r)[0] = ((ui32_ptr(x)[0] >> 1) & ~MSK_80 | (ui32_ptr(x)[0] << 15) & MSK_80) ^ _tt;
    }

    #if defined( VERSION_1 )

    gf_inline void mul_x4(void *x)
    {   uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & MSK_F0];

        ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[3] << 12) | (ui32_ptr(x)[2] >> 20)) & MSK_F0;
        ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[2] << 12) | (ui32_ptr(x)[1] >> 20)) & MSK_F0;
        ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[1] << 12) | (ui32_ptr(x)[0] >> 20)) & MSK_F0;
        ui32_ptr(x)[0] = ((ui32_ptr(x)[0] >> 4) & ~MSK_F0 | (ui32_ptr(x)[0] << 12) & MSK_F0) ^ _tt;
    }

    #else

    gf_inline void mul_x4(void *x)
    {   uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & 0xf0];
        bswap32_block(x, x, 4);
        ui32_ptr(x)[3] = bswap_32((ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28));
        ui32_ptr(x)[2] = bswap_32((ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28));
        ui32_ptr(x)[1] = bswap_32((ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28));
        ui32_ptr(x)[0] = bswap_32((ui32_ptr(x)[0] >> 4)) ^ _tt;
    }

    #endif

    gf_inline void mul_x8(void *x)
    {   uint_32t _tt = gf_tab[ui32_ptr(x)[3] >> 24];

        ui32_ptr(x)[3] = (ui32_ptr(x)[3] << 8) | (ui32_ptr(x)[2] >> 24);
        ui32_ptr(x)[2] = (ui32_ptr(x)[2] << 8) | (ui32_ptr(x)[1] >> 24);
        ui32_ptr(x)[1] = (ui32_ptr(x)[1] << 8) | (ui32_ptr(x)[0] >> 24);
        ui32_ptr(x)[0] = (ui32_ptr(x)[0] << 8) ^ _tt;
    }

#else

    gf_inline void mul_x(void *r, const void *x)
    {   uint_8t _tt = ui8_ptr(x)[15] & 1;
        ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7);
        ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7);
        ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7);
        ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7);
        ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7);
        ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7);
        ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7);
        ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7);
        ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7);
        ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7);
        ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7);
        ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7);
        ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7);
        ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7);
        ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7);
        ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00);
    }

    gf_inline void mul_x4(void *x)
    {   uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff];
        ui8_ptr(x)[15] = (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4);
        ui8_ptr(x)[14] = (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4);
        ui8_ptr(x)[13] = (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4);
        ui8_ptr(x)[12] = (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4);
        ui8_ptr(x)[11] = (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4);
        ui8_ptr(x)[10] = (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4);
        ui8_ptr(x)[ 9] = (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4);
        ui8_ptr(x)[ 8] = (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4);
        ui8_ptr(x)[ 7] = (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4);
        ui8_ptr(x)[ 6] = (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4);
        ui8_ptr(x)[ 5] = (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4);
        ui8_ptr(x)[ 4] = (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4);
        ui8_ptr(x)[ 3] = (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4);
        ui8_ptr(x)[ 2] = (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4);
        ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt >> 8);
        ui8_ptr(x)[ 0] = (ui8_ptr(x)[ 0] >> 4) ^ (_tt & 0xff);
    }

    gf_inline void mul_x8(void *x)
    {   uint_16t _tt = gf_tab[ui8_ptr(x)[15]];
        memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15);
        ui8_ptr(x)[1] ^= (_tt >> 8);
        ui8_ptr(x)[0] = (_tt & 0xff);
    }

#endif

#else   /* DEFINES */

#if BFR_UNIT == 64

    #define mul_x(r, x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 49) & MSK_80]; \
        ui64_ptr(r)[1] = (ui64_ptr(x)[1] >> 1) & ~MSK_80 \
            | ((ui64_ptr(x)[1] << 15) | (ui64_ptr(x)[0] >> 49)) & MSK_80; \
        ui64_ptr(r)[0] = ((ui64_ptr(x)[0] >> 1) & ~MSK_80 \
            | (ui64_ptr(x)[0] << 15) & MSK_80) ^ _tt; \
    } while(0)

    #if defined( VERSION_1 )

    #define mul_x4(x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & MSK_F0]; \
        ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui64_ptr(x)[1] << 12) \
            | (ui64_ptr(x)[0] >> 52)) & MSK_F0; \
        ui64_ptr(x)[0] = ((ui64_ptr(x)[0] >> 4) & ~MSK_F0 \
            | (ui64_ptr(x)[0] << 12) & MSK_F0) ^ _tt; \
    } while(0)

    #else

    #define mul_x4(x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & 0xf0]; \
        bswap64_block(x, x, 2); \
        ui64_ptr(x)[1] = bswap_64((ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60)); \
        ui64_ptr(x)[0] = bswap_64((ui64_ptr(x)[0] >> 4)) ^ _tt; \
    } while(0)

    #endif

    #define mul_x8(x) do { uint_64t _tt = gf_tab[ui64_ptr(x)[1] >> 56]; \
        ui64_ptr(x)[1] = (ui64_ptr(x)[1] << 8) | (ui64_ptr(x)[0] >> 56); \
        ui64_ptr(x)[0] = (ui64_ptr(x)[0] << 8) ^ _tt; \
    } while(0)

#elif BFR_UNIT == 32

    #define mul_x(r, x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 17) & MSK_80]; \
        ui32_ptr(r)[3] = (ui32_ptr(x)[3] >> 1) & ~MSK_80 | ((ui32_ptr(x)[3] << 15) \
            | (ui32_ptr(x)[2] >> 17)) & MSK_80; \
        ui32_ptr(r)[2] = (ui32_ptr(x)[2] >> 1) & ~MSK_80 | ((ui32_ptr(x)[2] << 15) \
            | (ui32_ptr(x)[1] >> 17)) & MSK_80; \
        ui32_ptr(r)[1] = (ui32_ptr(x)[1] >> 1) & ~MSK_80 | ((ui32_ptr(x)[1] << 15) \
            | (ui32_ptr(x)[0] >> 17)) & MSK_80; \
        ui32_ptr(r)[0] = ((ui32_ptr(x)[0] >> 1) & ~MSK_80 \
            | (ui32_ptr(x)[0] << 15) & MSK_80) ^ _tt; \
    } while(0)

    #if defined( VERSION_1 )

    #define mul_x4(x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & MSK_F0]; \
        ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[3] << 12) \
            | (ui32_ptr(x)[2] >> 20)) & MSK_F0; \
        ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[2] << 12) \
            | (ui32_ptr(x)[1] >> 20)) & MSK_F0; \
        ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[1] << 12) \
            | (ui32_ptr(x)[0] >> 20)) & MSK_F0; \
        ui32_ptr(x)[0] = ((ui32_ptr(x)[0] >> 4) & ~MSK_F0 \
            | (ui32_ptr(x)[0] << 12) & MSK_F0) ^ _tt; \
    } while(0)

    #else

    #define mul_x4(x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & 0xf0]; \
        bswap32_block(x, x, 4); \
        ui32_ptr(x)[3] = bswap_32((ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28)); \
        ui32_ptr(x)[2] = bswap_32((ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28)); \
        ui32_ptr(x)[1] = bswap_32((ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28)); \
        ui32_ptr(x)[0] = bswap_32((ui32_ptr(x)[0] >> 4)) ^ _tt; \
    } while(0)

    #endif

    #define mul_x8(x) do { uint_32t _tt = gf_tab[ui32_ptr(x)[3] >> 24]; \
        ui32_ptr(x)[3] = (ui32_ptr(x)[3] << 8) | (ui32_ptr(x)[2] >> 24); \
        ui32_ptr(x)[2] = (ui32_ptr(x)[2] << 8) | (ui32_ptr(x)[1] >> 24); \
        ui32_ptr(x)[1] = (ui32_ptr(x)[1] << 8) | (ui32_ptr(x)[0] >> 24); \
        ui32_ptr(x)[0] = (ui32_ptr(x)[0] << 8) ^ _tt; \
    } while(0)

#else

    #define mul_x(r, x) do { uint_8t _tt = ui8_ptr(x)[15] & 1; \
        ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7); \
        ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7); \
        ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7); \
        ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7); \
        ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7); \
        ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7); \
        ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7); \
        ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7); \
        ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7); \
        ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7); \
        ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7); \
        ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7); \
        ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7); \
        ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7); \
        ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7); \
        ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00); \
    } while(0)

    #define mul_x4(x) do { uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff]; \
        ui8_ptr(x)[15] = (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4); \
        ui8_ptr(x)[14] = (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4); \
        ui8_ptr(x)[13] = (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4); \
        ui8_ptr(x)[12] = (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4); \
        ui8_ptr(x)[11] = (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4); \
        ui8_ptr(x)[10] = (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4); \
        ui8_ptr(x)[ 9] = (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4); \
        ui8_ptr(x)[ 8] = (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4); \
        ui8_ptr(x)[ 7] = (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4); \
        ui8_ptr(x)[ 6] = (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4); \
        ui8_ptr(x)[ 5] = (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4); \
        ui8_ptr(x)[ 4] = (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4); \
        ui8_ptr(x)[ 3] = (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4); \
        ui8_ptr(x)[ 2] = (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4); \
        ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt >> 8); \
        ui8_ptr(x)[ 0] = (ui8_ptr(x)[ 0] >> 4) ^ (_tt & 0xff); \
    } while(0)

    #define mul_x8(x) do { uint_16t _tt = gf_tab[ui8_ptr(x)[15]]; \
        memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15); \
        ui8_ptr(x)[1] ^= (_tt >> 8); \
        ui8_ptr(x)[0] = (_tt & 0xff); \
    } while(0)

#endif

#endif

#elif PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN

#if defined( USE_INLINES )

#if BFR_UNIT == 64

    gf_inline void mul_x(void *r, const void *x)
    {   uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 7) & 0xff];
        ui64_ptr(r)[1] = (ui64_ptr(x)[1] >> 1) | (ui64_ptr(x)[0] << 63);
        ui64_ptr(r)[0] = (ui64_ptr(x)[0] >> 1) ^ (_tt << 48);
    }

    gf_inline void mul_x4(void *x)
    {   uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 4) & 0xff];
        ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60);
        ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 4) ^ (_tt << 48);
    }

    gf_inline void mul_x8(void *x)
    {   uint_64t _tt = gf_tab[ui64_ptr(x)[1] & 0xff];
        ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 8) | (ui64_ptr(x)[0] << 56);
        ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 8) ^ (_tt << 48);
    }

#elif BFR_UNIT == 32

    gf_inline void mul_x(void *r, const void *x)
    {   uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 7) & 0xff];
        ui32_ptr(r)[3] = (ui32_ptr(x)[3] >> 1) | (ui32_ptr(x)[2] << 31);
        ui32_ptr(r)[2] = (ui32_ptr(x)[2] >> 1) | (ui32_ptr(x)[1] << 31);
        ui32_ptr(r)[1] = (ui32_ptr(x)[1] >> 1) | (ui32_ptr(x)[0] << 31);
        ui32_ptr(r)[0] = (ui32_ptr(x)[0] >> 1) ^ (_tt << 16);
    }

    gf_inline void mul_x4(void *x)
    {   uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 4) & 0xff];
        ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28);
        ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28);
        ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28);
        ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 4) ^ (_tt << 16);
    }

    gf_inline void mul_x8(void *x)
    {   uint_32t _tt = gf_tab[ui32_ptr(x)[3] & 0xff];
        ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 8) | (ui32_ptr(x)[2] << 24);
        ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 8) | (ui32_ptr(x)[1] << 24);
        ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 8) | (ui32_ptr(x)[0] << 24);
        ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 8) ^ (_tt << 16);
    }

#else

    gf_inline void mul_x(void *r, const void *x)
    {   uint_8t _tt = ui8_ptr(x)[15] & 1;
        ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7);
        ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7);
        ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7);
        ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7);
        ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7);
        ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7);
        ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7);
        ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7);
        ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7);
        ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7);
        ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7);
        ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7);
        ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7);
        ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7);
        ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7);
        ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00);
    }

    gf_inline void mul_x4(void *x)
    {   uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff];
        ui8_ptr(x)[15] = (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4);
        ui8_ptr(x)[14] = (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4);
        ui8_ptr(x)[13] = (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4);
        ui8_ptr(x)[12] = (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4);
        ui8_ptr(x)[11] = (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4);
        ui8_ptr(x)[10] = (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4);
        ui8_ptr(x)[ 9] = (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4);
        ui8_ptr(x)[ 8] = (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4);
        ui8_ptr(x)[ 7] = (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4);
        ui8_ptr(x)[ 6] = (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4);
        ui8_ptr(x)[ 5] = (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4);
        ui8_ptr(x)[ 4] = (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4);
        ui8_ptr(x)[ 3] = (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4);
        ui8_ptr(x)[ 2] = (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4);
        ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt & 0xff);
        ui8_ptr(x)[ 0] = (ui8_ptr(x)[ 0] >> 4) ^ (_tt >> 8);
    }

    gf_inline void mul_x8(void *x)
    {   uint_16t _tt = gf_tab[ui8_ptr(x)[15]];
        memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15);
        ui8_ptr(x)[1] ^= (_tt & 0xff);
        ui8_ptr(x)[0] = (_tt >> 8);
    }

#endif

#else   /* DEFINES */

#if BFR_UNIT == 64

    #define mul_x(r, x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 7) & 0xff]; \
        ui64_ptr(r)[1] = (ui64_ptr(x)[1] >> 1) | (ui64_ptr(x)[0] << 63); \
        ui64_ptr(r)[0] = (ui64_ptr(x)[0] >> 1) ^ (_tt << 48); \
    } while(0)

    #define mul_x4(x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 4) & 0xff]; \
        ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60); \
        ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 4) ^ (_tt << 48); \
    } while(0)

    #define mul_x8(x) do { uint_64t _tt = gf_tab[ui64_ptr(x)[1] & 0xff]; \
        ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 8) | (ui64_ptr(x)[0] << 56); \
        ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 8) ^ (_tt << 48); \
    } while(0)

#elif BFR_UNIT == 32

    #define mul_x(r, x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 7) & 0xff]; \
        ui32_ptr(r)[3] = (ui32_ptr(x)[3] >> 1) | (ui32_ptr(x)[2] << 31); \
        ui32_ptr(r)[2] = (ui32_ptr(x)[2] >> 1) | (ui32_ptr(x)[1] << 31); \
        ui32_ptr(r)[1] = (ui32_ptr(x)[1] >> 1) | (ui32_ptr(x)[0] << 31); \
        ui32_ptr(r)[0] = (ui32_ptr(x)[0] >> 1) ^ (_tt << 16); \
    } while(0)

    #define mul_x4(x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 4) & 0xff]; \
        ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28); \
        ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28); \
        ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28); \
        ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 4) ^ (_tt << 16); \
    } while(0)

    #define mul_x8(x) do { uint_32t _tt = gf_tab[ui32_ptr(x)[3] & 0xff]; \
        ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 8) | (ui32_ptr(x)[2] << 24); \
        ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 8) | (ui32_ptr(x)[1] << 24); \
        ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 8) | (ui32_ptr(x)[0] << 24); \
        ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 8) ^ (_tt << 16); \
    } while(0)

#else

    #define mul_x(r, x) do { uint_8t _tt = ui8_ptr(x)[15] & 1; \
        ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7); \
        ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7); \
        ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7); \
        ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7); \
        ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7); \
        ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7); \
        ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7); \
        ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7); \
        ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7); \
        ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7); \
        ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7); \
        ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7); \
        ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7); \
        ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7); \
        ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7); \
        ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00); \
    } while(0)

    #define mul_x4(x) do { uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff]; \
        ui8_ptr(x)[15] = (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4); \
        ui8_ptr(x)[14] = (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4); \
        ui8_ptr(x)[13] = (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4); \
        ui8_ptr(x)[12] = (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4); \
        ui8_ptr(x)[11] = (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4); \
        ui8_ptr(x)[10] = (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4); \
        ui8_ptr(x)[ 9] = (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4); \
        ui8_ptr(x)[ 8] = (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4); \
        ui8_ptr(x)[ 7] = (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4); \
        ui8_ptr(x)[ 6] = (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4); \
        ui8_ptr(x)[ 5] = (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4); \
        ui8_ptr(x)[ 4] = (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4); \
        ui8_ptr(x)[ 3] = (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4); \
        ui8_ptr(x)[ 2] = (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4); \
        ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt & 0xff); \
        ui8_ptr(x)[ 0] = (ui8_ptr(x)[ 0] >> 4) ^ (_tt >> 8); \
    } while(0)

    #define mul_x8(x) do { uint_16t _tt = gf_tab[ui8_ptr(x)[15]]; \
        memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15); \
        ui8_ptr(x)[1] ^= (_tt & 0xff); \
        ui8_ptr(x)[0] = (_tt >> 8); \
    } while(0)

#endif

#endif

#else
#  error Platform byte order has not been set.
#endif


/* A slow generic version of gf_mul (a = a * b) */

void gf_mul(void *a, const void* b);
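
/* Example (illustrative sketch, not part of the original header; 'a'
   and 'b' are hypothetical buffers): gf_mul multiplies in place,
   overwriting its first argument. Multiplying the element '1' (0x80
   in byte 0 under the bit ordering above) by itself should leave it
   unchanged:

       unsigned char a[GF_BYTE_LEN] = { 0x80 };   // the element '1'
       unsigned char b[GF_BYTE_LEN] = { 0x80 };   // the element '1'
       gf_mul(a, b);                              // a = a * b = '1'
       // a[0] == 0x80 and all other bytes of a are zero
*/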

/* This version uses 64k bytes of table space on the stack.
   A 16 byte buffer has to be multiplied by a 16 byte key
   value in GF(128). If we consider a GF(128) value in
   the buffer's lowest byte, we can construct a table of
   the 256 16 byte values that result from the 256 values
   of this byte. This requires 4096 bytes. But we also
   need tables for the 15 higher bytes in the buffer as
   well, which makes 16 tables and hence 64 kbytes in total.
*/

void init_64k_table(unsigned char g[], void *t);
typedef uint_32t (*gf_t64k)[256][GF_BYTE_LEN >> 2];
#define tab64k(x) ((gf_t64k)x)
#define xor_64k(i,a,t,r) xor_block_aligned(r, tab64k(t)[i][a[i]])

#if defined( USE_INLINES )

#if defined( UNROLL_LOOPS )

gf_inline void gf_mul_64k(unsigned char a[], void *t, void *r)
{
    move_block_aligned(r, tab64k(t)[0][a[0]]); xor_64k( 1, a, t, r);
    xor_64k( 2, a, t, r); xor_64k( 3, a, t, r);
    xor_64k( 4, a, t, r); xor_64k( 5, a, t, r);
    xor_64k( 6, a, t, r); xor_64k( 7, a, t, r);
    xor_64k( 8, a, t, r); xor_64k( 9, a, t, r);
    xor_64k(10, a, t, r); xor_64k(11, a, t, r);
    xor_64k(12, a, t, r); xor_64k(13, a, t, r);
    xor_64k(14, a, t, r); xor_64k(15, a, t, r);
    move_block_aligned(a, r);
}

#else

gf_inline void gf_mul_64k(unsigned char a[], void *t, void *r)
{   int i;
    move_block_aligned(r, tab64k(t)[0][a[0]]);
    for(i = 1; i < GF_BYTE_LEN; ++i)
        xor_64k(i, a, t, r);
    move_block_aligned(a, r);
}

#endif

#else

#if defined( UNROLL_LOOPS )

#define gf_mul_64k(a, t, r) do { \
    move_block_aligned(r, tab64k(t)[0][a[0]]); \
    xor_64k( 1, a, t, r); \
    xor_64k( 2, a, t, r); xor_64k( 3, a, t, r); \
    xor_64k( 4, a, t, r); xor_64k( 5, a, t, r); \
    xor_64k( 6, a, t, r); xor_64k( 7, a, t, r); \
    xor_64k( 8, a, t, r); xor_64k( 9, a, t, r); \
    xor_64k(10, a, t, r); xor_64k(11, a, t, r); \
    xor_64k(12, a, t, r); xor_64k(13, a, t, r); \
    xor_64k(14, a, t, r); xor_64k(15, a, t, r); \
    move_block_aligned(a, r); \
} while(0)

#else

#define gf_mul_64k(a, t, r) do { int i; \
    move_block_aligned(r, tab64k(t)[0][a[0]]); \
    for(i = 1; i < GF_BYTE_LEN; ++i) \
    {   xor_64k(i, a, t, r); \
    } \
    move_block_aligned(a, r); \
} while(0)

#endif

#endif
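
/* Example (illustrative sketch, not part of the original header; the
   buffer names are hypothetical): the 64k route builds one table per
   key and then reuses it for every block; 't' and 'r' must be suitably
   aligned for the 32-bit block helpers:

       unsigned char key[GF_BYTE_LEN];                    // fixed multiplier
       unsigned char buf[GF_BYTE_LEN];                    // value to multiply
       uint_32t      tbl[16 * 256 * (GF_BYTE_LEN >> 2)];  // 64 kbytes
       uint_32t      res[GF_BYTE_LEN >> 2];               // working buffer

       init_64k_table(key, tbl);    // once per key
       gf_mul_64k(buf, tbl, res);   // buf = buf * key in GF(2^128)
*/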

/* This version uses 8k bytes of table space on the stack.
   A 16 byte buffer has to be multiplied by a 16 byte key
   value in GF(128). If we consider a GF(128) value in
   the buffer's lowest 4 bits, we can construct a table of
   the 16 16 byte values that result from the 16 values
   of these 4 bits. This requires 256 bytes. But we also
   need tables for the 31 higher 4-bit groups as well,
   which makes 32 tables and hence 8 kbytes in total.
*/

void init_8k_table(unsigned char g[], void *t);

typedef uint_32t (*gf_t8k)[16][GF_BYTE_LEN >> 2];
#define tab8k(x) ((gf_t8k)x)
#define xor_8k(i,a,t,r) \
    xor_block_aligned(r, tab8k(t)[i + i][a[i] & 15]); \
    xor_block_aligned(r, tab8k(t)[i + i + 1][a[i] >> 4])

#if defined( USE_INLINES )

#if defined( UNROLL_LOOPS )

gf_inline void gf_mul_8k(unsigned char a[], void *t, void *r)
{
    move_block_aligned(r, tab8k(t)[0][a[0] & 15]);
    xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]);
    xor_8k( 1, a, t, r); xor_8k( 2, a, t, r); xor_8k( 3, a, t, r);
    xor_8k( 4, a, t, r); xor_8k( 5, a, t, r); xor_8k( 6, a, t, r); xor_8k( 7, a, t, r);
    xor_8k( 8, a, t, r); xor_8k( 9, a, t, r); xor_8k(10, a, t, r); xor_8k(11, a, t, r);
    xor_8k(12, a, t, r); xor_8k(13, a, t, r); xor_8k(14, a, t, r); xor_8k(15, a, t, r);
    move_block_aligned(a, r);
}

#else

gf_inline void gf_mul_8k(unsigned char a[], void *t, void *r)
{   int i;
    memcpy(r, tab8k(t)[0][a[0] & 15], GF_BYTE_LEN);
    xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]);
    for(i = 1; i < GF_BYTE_LEN; ++i)
    {   xor_8k(i, a, t, r);
    }
    memcpy(a, r, GF_BYTE_LEN);
}

#endif

#else

#if defined( UNROLL_LOOPS )

#define gf_mul_8k(a, t, r) do { \
    move_block_aligned(r, tab8k(t)[0][a[0] & 15]); \
    xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]); \
    xor_8k( 1, a, t, r); xor_8k( 2, a, t, r); \
    xor_8k( 3, a, t, r); xor_8k( 4, a, t, r); \
    xor_8k( 5, a, t, r); xor_8k( 6, a, t, r); \
    xor_8k( 7, a, t, r); xor_8k( 8, a, t, r); \
    xor_8k( 9, a, t, r); xor_8k(10, a, t, r); \
    xor_8k(11, a, t, r); xor_8k(12, a, t, r); \
    xor_8k(13, a, t, r); xor_8k(14, a, t, r); \
    xor_8k(15, a, t, r); move_block_aligned(a, r); \
} while(0)

#else

#define gf_mul_8k(a, t, r) do { int i; \
    memcpy(r, tab8k(t)[0][a[0] & 15], GF_BYTE_LEN); \
    xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]); \
    for(i = 1; i < GF_BYTE_LEN; ++i) \
    {   xor_8k(i, a, t, r); \
    } \
    memcpy(a, r, GF_BYTE_LEN); \
} while(0)

#endif

#endif
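
/* Example (illustrative sketch, not part of the original header; the
   buffer names are hypothetical): use of the 8k route mirrors the 64k
   one, with a table an eighth of the size:

       unsigned char key[GF_BYTE_LEN], buf[GF_BYTE_LEN];
       uint_32t      tbl[32 * 16 * (GF_BYTE_LEN >> 2)];   // 8 kbytes
       uint_32t      res[GF_BYTE_LEN >> 2];

       init_8k_table(key, tbl);    // once per key
       gf_mul_8k(buf, tbl, res);   // buf = buf * key in GF(2^128)
*/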

/* This version uses 4k bytes of table space on the stack.
   A 16 byte buffer has to be multiplied by a 16 byte key
   value in GF(128). If we consider a GF(128) value in a
   single byte, we can construct a table of the 256 16 byte
   values that result from the 256 values of this byte.
   This requires 4096 bytes. If we take the highest byte in
   the buffer and use this table to get the result, we then
   have to multiply by x^120 to get the final value. For the
   next highest byte the result has to be multiplied by x^112
   and so on. But we can do this by accumulating the result
   in an accumulator starting with the result for the top
   byte. We repeatedly multiply the accumulator value by
   x^8 and then add in (i.e. xor) the 16 bytes of the next
   lower byte in the buffer, stopping when we reach the
   lowest byte. This requires a 4096 byte table.
*/

void init_4k_table(unsigned char g[], void *t);

typedef uint_32t (*gf_t4k)[GF_BYTE_LEN >> 2];
#define tab4k(x) ((gf_t4k)x)
#define xor_4k(i,a,t,r) mul_x8(r); xor_block_aligned(r, tab4k(t)[a[i]])

#if defined( USE_INLINES )

#if defined( UNROLL_LOOPS )

gf_inline void gf_mul_4k(unsigned char a[], void *t, void *r)
{
    move_block_aligned(r, tab4k(t)[a[15]]);
    xor_4k(14, a, t, r); xor_4k(13, a, t, r); xor_4k(12, a, t, r);
    xor_4k(11, a, t, r); xor_4k(10, a, t, r); xor_4k( 9, a, t, r);
    xor_4k( 8, a, t, r); xor_4k( 7, a, t, r); xor_4k( 6, a, t, r);
    xor_4k( 5, a, t, r); xor_4k( 4, a, t, r); xor_4k( 3, a, t, r);
    xor_4k( 2, a, t, r); xor_4k( 1, a, t, r); xor_4k( 0, a, t, r);
    move_block_aligned(a, r);
}

#else

gf_inline void gf_mul_4k(unsigned char a[], void *t, void *r)
{   int i = 15;
    move_block_aligned(r, tab4k(t)[a[15]]);
    while(i--)
    {   xor_4k(i, a, t, r);
    }
    move_block_aligned(a, r);
}

#endif

#else

#if defined( UNROLL_LOOPS )

#define gf_mul_4k(a, t, r) do { \
    move_block_aligned(r, tab4k(t)[a[15]]); \
    xor_4k(14, a, t, r); xor_4k(13, a, t, r); xor_4k(12, a, t, r); \
    xor_4k(11, a, t, r); xor_4k(10, a, t, r); xor_4k( 9, a, t, r); \
    xor_4k( 8, a, t, r); xor_4k( 7, a, t, r); xor_4k( 6, a, t, r); \
    xor_4k( 5, a, t, r); xor_4k( 4, a, t, r); xor_4k( 3, a, t, r); \
    xor_4k( 2, a, t, r); xor_4k( 1, a, t, r); xor_4k( 0, a, t, r); \
    move_block_aligned(a, r); \
} while(0)

#else

#define gf_mul_4k(a, t, r) do { int i = 15; \
    move_block_aligned(r, tab4k(t)[a[15]]); \
    while(i--) \
    {   xor_4k(i, a, t, r); \
    } \
    move_block_aligned(a, r); \
} while(0)

#endif

#endif
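
/* Example (illustrative sketch, not part of the original header; the
   buffer names are hypothetical): the 4k route is Horner's rule over
   the bytes of the buffer, so each step costs one mul_x8 and one
   table lookup plus xor:

       unsigned char key[GF_BYTE_LEN], buf[GF_BYTE_LEN];
       uint_32t      tbl[256 * (GF_BYTE_LEN >> 2)];   // 4 kbytes
       uint_32t      res[GF_BYTE_LEN >> 2];

       init_4k_table(key, tbl);    // once per key
       gf_mul_4k(buf, tbl, res);   // buf = buf * key in GF(2^128)
*/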

/* This version uses 256 bytes of table space on the stack.
   A 16 byte buffer has to be multiplied by a 16 byte key
   value in GF(128). If we consider a GF(128) value in a
   single 4-bit nibble, we can construct a table of the 16
   16 byte values that result from the 16 values of this
   nibble. This requires 256 bytes. If we take the highest
   4-bit nibble in the buffer and use this table to get the
   result, we then have to multiply by x^124 to get the
   final value. For the next highest nibble the result has
   to be multiplied by x^120 and so on. But we can do this
   by accumulating the result in an accumulator starting
   with the result for the top nibble. We repeatedly
   multiply the accumulator value by x^4 and then add in
   (i.e. xor) the 16 bytes of the next lower nibble in the
   buffer, stopping when we reach the lowest nibble. This
   uses a 256 byte table.
*/

void init_256_table(unsigned char g[], void *t);

typedef uint_32t (*gf_t256)[GF_BYTE_LEN >> 2];
#define tab256(t) ((gf_t256)t)
#define xor_256(i,a,t,r) \
    mul_x4(r); xor_block_aligned(r, tab256(t)[a[i] & 15]); \
    mul_x4(r); xor_block_aligned(r, tab256(t)[a[i] >> 4])

#if defined( USE_INLINES )

#if defined( UNROLL_LOOPS )

gf_inline void gf_mul_256(unsigned char a[], void *t, void *r)
{
    move_block_aligned(r, tab256(t)[a[15] & 15]); mul_x4(r);
    xor_block_aligned(r, tab256(t)[a[15] >> 4]);
    xor_256(14, a, t, r); xor_256(13, a, t, r);
    xor_256(12, a, t, r); xor_256(11, a, t, r);
    xor_256(10, a, t, r); xor_256( 9, a, t, r);
    xor_256( 8, a, t, r); xor_256( 7, a, t, r);
    xor_256( 6, a, t, r); xor_256( 5, a, t, r);
    xor_256( 4, a, t, r); xor_256( 3, a, t, r);
    xor_256( 2, a, t, r); xor_256( 1, a, t, r);
    xor_256( 0, a, t, r); move_block_aligned(a, r);
}

#else

gf_inline void gf_mul_256(unsigned char a[], void *t, void *r)
{   int i = 15;
    move_block_aligned(r, tab256(t)[a[15] & 15]); mul_x4(r);
    xor_block_aligned(r, tab256(t)[a[15] >> 4]);
    while(i--)
    {   xor_256(i, a, t, r);
    }
    move_block_aligned(a, r);
}

#endif

#else

#if defined( UNROLL_LOOPS )

#define gf_mul_256(a, t, r) do { \
    move_block_aligned(r, tab256(t)[a[15] & 15]); mul_x4(r); \
    xor_block_aligned(r, tab256(t)[a[15] >> 4]); \
    xor_256(14, a, t, r); xor_256(13, a, t, r); \
    xor_256(12, a, t, r); xor_256(11, a, t, r); \
    xor_256(10, a, t, r); xor_256( 9, a, t, r); \
    xor_256( 8, a, t, r); xor_256( 7, a, t, r); \
    xor_256( 6, a, t, r); xor_256( 5, a, t, r); \
    xor_256( 4, a, t, r); xor_256( 3, a, t, r); \
    xor_256( 2, a, t, r); xor_256( 1, a, t, r); \
    xor_256( 0, a, t, r); move_block_aligned(a, r); \
} while(0)

#else

#define gf_mul_256(a, t, r) do { int i = 15; \
    move_block_aligned(r, tab256(t)[a[15] & 15]); mul_x4(r); \
    xor_block_aligned(r, tab256(t)[a[15] >> 4]); \
    while(i--) \
    {   xor_256(i, a, t, r); \
    } \
    move_block_aligned(a, r); \
} while(0)

#endif

#endif
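
/* Example (illustrative sketch, not part of the original header; the
   buffer names are hypothetical): the 256 byte route trades speed for
   a table small enough to fit in a few cache lines:

       unsigned char key[GF_BYTE_LEN], buf[GF_BYTE_LEN];
       uint_32t      tbl[16 * (GF_BYTE_LEN >> 2)];   // 256 bytes
       uint_32t      res[GF_BYTE_LEN >> 2];

       init_256_table(key, tbl);    // once per key
       gf_mul_256(buf, tbl, res);   // buf = buf * key in GF(2^128)
*/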

#if defined(__cplusplus)
}
#endif

#endif