/* IBR-DTNSuite 0.8 */
00001 /* 00002 --------------------------------------------------------------------------- 00003 Copyright (c) 1998-2006, Brian Gladman, Worcester, UK. All rights reserved. 00004 00005 LICENSE TERMS 00006 00007 The free distribution and use of this software in both source and binary 00008 form is allowed (with or without changes) provided that: 00009 00010 1. distributions of this source code include the above copyright 00011 notice, this list of conditions and the following disclaimer; 00012 00013 2. distributions in binary form include the above copyright 00014 notice, this list of conditions and the following disclaimer 00015 in the documentation and/or other associated materials; 00016 00017 3. the copyright holder's name is not used to endorse products 00018 built using this software without specific written permission. 00019 00020 ALTERNATIVELY, provided that this notice is retained in full, this product 00021 may be distributed under the terms of the GNU General Public License (GPL), 00022 in which case the provisions of the GPL apply INSTEAD OF those given above. 00023 00024 DISCLAIMER 00025 00026 This software is provided 'as is' with no explicit or implied warranties 00027 in respect of its properties, including, but not limited to, correctness 00028 and/or fitness for purpose. 00029 --------------------------------------------------------------------------- 00030 Issue Date: 13/10/2006 00031 00032 An implementation of field multiplication in Galois Field GF(128) 00033 */ 00034 00035 #ifndef GF128MUL_H 00036 #define GF128MUL_H 00037 00038 #include <stdlib.h> 00039 #include <string.h> 00040 00041 #include "mode_hdr.h" 00042 00043 /* Table sizes for GF(128) Multiply. Normally larger tables give 00044 higher speed but cache loading might change this. 
Normally only 00045 one table size (or none at all) will be specified here 00046 */ 00047 00048 #if 0 00049 # define TABLES_64K 00050 #endif 00051 #if 1 00052 # define TABLES_8K 00053 #endif 00054 #if 0 00055 # define TABLES_4K 00056 #endif 00057 #if 0 00058 # define TABLES_256 00059 #endif 00060 00061 /* Use of inlines is preferred but code blocks can also be expanded inline 00062 using 'defines'. But the latter approach will typically generate a LOT 00063 of code and is not recommended. 00064 */ 00065 #if 0 00066 # define USE_INLINES 00067 #endif 00068 00069 /* Speed critical loops can be unrolled to gain speed but consume more 00070 memory 00071 */ 00072 #if 0 00073 # define UNROLL_LOOPS 00074 #endif 00075 00076 /* Multiply a GF128 field element by x. Field elements are held in arrays 00077 of bytes in which field bits 8n..8n + 7 are held in byte[n], with lower 00078 indexed bits placed in the more numerically significant bit positions 00079 within bytes. 00080 00081 On little endian machines the bit indexes translate into the bit 00082 positions within four 32-bit words in the following way 00083 00084 MS x[0] LS MS x[1] LS 00085 ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls 00086 24...31 16...23 08...15 00...07 56...63 48...55 40...47 32...39 00087 00088 MS x[2] LS MS x[3] LS 00089 ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls 00090 88...95 80...87 72...79 64...71 120.127 112.119 104.111 96..103 00091 00092 On big endian machines the bit indexes translate into the bit 00093 positions within four 32-bit words in the following way 00094 00095 MS x[0] LS MS x[1] LS 00096 ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls 00097 00...07 08...15 16...23 24...31 32...39 40...47 48...55 56...63 00098 00099 MS x[2] LS MS x[3] LS 00100 ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls 00101 64...71 72...79 80...87 88...95 96..103 104.111 112.119 120.127 00102 */ 00103 00104 #define GF_BYTE_LEN 16 00105 00106 #if defined( USE_INLINES ) 00107 # if defined( _MSC_VER ) 00108 
# define gf_inline __inline 00109 # elif defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) 00110 # define gf_inline static inline 00111 # else 00112 # define gf_inline static 00113 # endif 00114 #endif 00115 00116 #if defined(__cplusplus) 00117 extern "C" 00118 { 00119 #endif 00120 00121 /* These functions multiply a field element x, by x^4 and by x^8 in the 00122 polynomial field representation. It uses 32-bit word operations to 00123 gain speed but compensates for machine endianess and hence works 00124 correctly on both styles of machine. 00125 */ 00126 extern const unsigned short gf_tab[256]; 00127 00128 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN 00129 00130 /* This section is not needed as GF(128) multiplication is now implemented 00131 but is left in place as it provides a template for an alternative little 00132 endian implementation approach based on conversion to and from big endian 00133 format 00134 */ 00135 #if 0 00136 00137 /* This is a template for mul_x. The mul_x4 and mul_x8 little endian 00138 alternative implementations (and their defined versions) follow the 00139 big endian functions below in the same way. 
00140 */ 00141 00142 gf_inline void mul_x(void *r, const void *x) 00143 { uint_32t _tt; 00144 bswap32_block(r, x, 4); 00145 _tt = gf_tab[(ui32_ptr(r)[3] << 7) & 0xff]; 00146 ui32_ptr(r)[3] = (ui32_ptr(r)[3] >> 1) | (ui32_ptr(r)[2] << 31); 00147 ui32_ptr(r)[2] = (ui32_ptr(r)[2] >> 1) | (ui32_ptr(r)[1] << 31); 00148 ui32_ptr(r)[1] = (ui32_ptr(r)[1] >> 1) | (ui32_ptr(r)[0] << 31); 00149 ui32_ptr(r)[0] = (ui32_ptr(r)[0] >> 1) ^ bswap_32(_tt); 00150 bswap32_block(r, r, 4); 00151 } 00152 00153 #endif 00154 00155 #define VERSION_1 00156 00157 #define MSK_80 (0x80 * (unit_cast(BFR_UNIT,-1) / 0xff)) 00158 #define MSK_F0 (0xf0 * (unit_cast(BFR_UNIT,-1) / 0xff)) 00159 00160 #if defined( USE_INLINES ) 00161 00162 #if BFR_UNIT == 64 00163 00164 gf_inline void mul_x(void *r, const void *x) 00165 { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 49) & MSK_80]; 00166 00167 ui64_ptr(r)[1] = (ui64_ptr(x)[1] >> 1) & ~MSK_80 | ((ui64_ptr(x)[1] << 15) | (ui64_ptr(x)[0] >> 49)) & MSK_80; 00168 ui64_ptr(r)[0] = ((ui64_ptr(x)[0] >> 1) & ~MSK_80 | (ui64_ptr(x)[0] << 15) & MSK_80) ^ _tt; 00169 } 00170 00171 #if defined( VERSION_1 ) 00172 00173 gf_inline void mul_x4(void *x) 00174 { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & MSK_F0]; 00175 00176 ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui64_ptr(x)[1] << 12) | (ui64_ptr(x)[0] >> 52)) & MSK_F0; 00177 ui64_ptr(x)[0] = ((ui64_ptr(x)[0] >> 4) & ~MSK_F0 | (ui64_ptr(x)[0] << 12) & MSK_F0) ^ _tt; 00178 } 00179 00180 #else 00181 00182 gf_inline void mul_x4(void *x) 00183 { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & 0xf0]; 00184 bswap64_block(x, x, 2); 00185 ui64_ptr(x)[1] = bswap_64((ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60)); 00186 ui64_ptr(x)[0] = bswap_64((ui64_ptr(x)[0] >> 4)) ^ _tt; 00187 } 00188 00189 #endif 00190 00191 gf_inline void mul_x8(void *x) 00192 { uint_64t _tt = gf_tab[ui64_ptr(x)[1] >> 56]; 00193 ui64_ptr(x)[1] = (ui64_ptr(x)[1] << 8) | (ui64_ptr(x)[0] >> 56); 00194 ui64_ptr(x)[0] = (ui64_ptr(x)[0] << 8) ^ _tt; 
00195 } 00196 00197 #elif BFR_UNIT == 32 00198 00199 gf_inline void mul_x(void *r, const void *x) 00200 { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 17) & MSK_80]; 00201 00202 ui32_ptr(r)[3] = (ui32_ptr(x)[3] >> 1) & ~MSK_80 | ((ui32_ptr(x)[3] << 15) | (ui32_ptr(x)[2] >> 17)) & MSK_80; 00203 ui32_ptr(r)[2] = (ui32_ptr(x)[2] >> 1) & ~MSK_80 | ((ui32_ptr(x)[2] << 15) | (ui32_ptr(x)[1] >> 17)) & MSK_80; 00204 ui32_ptr(r)[1] = (ui32_ptr(x)[1] >> 1) & ~MSK_80 | ((ui32_ptr(x)[1] << 15) | (ui32_ptr(x)[0] >> 17)) & MSK_80; 00205 ui32_ptr(r)[0] = ((ui32_ptr(x)[0] >> 1) & ~MSK_80 | (ui32_ptr(x)[0] << 15) & MSK_80) ^ _tt; 00206 } 00207 00208 #if defined( VERSION_1 ) 00209 00210 gf_inline void mul_x4(void *x) 00211 { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & MSK_F0]; 00212 00213 ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[3] << 12) | (ui32_ptr(x)[2] >> 20)) & MSK_F0; 00214 ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[2] << 12) | (ui32_ptr(x)[1] >> 20)) & MSK_F0; 00215 ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[1] << 12) | (ui32_ptr(x)[0] >> 20)) & MSK_F0; 00216 ui32_ptr(x)[0] = ((ui32_ptr(x)[0] >> 4) & ~MSK_F0 | (ui32_ptr(x)[0] << 12) & MSK_F0) ^ _tt; 00217 } 00218 00219 #else 00220 00221 gf_inline void mul_x4(void *x) 00222 { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & 0xf0]; 00223 bswap32_block(x, x, 4); 00224 ui32_ptr(x)[3] = bswap_32((ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28)); 00225 ui32_ptr(x)[2] = bswap_32((ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28)); 00226 ui32_ptr(x)[1] = bswap_32((ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28)); 00227 ui32_ptr(x)[0] = bswap_32((ui32_ptr(x)[0] >> 4)) ^ _tt; 00228 } 00229 00230 #endif 00231 00232 gf_inline void mul_x8(void *x) 00233 { uint_32t _tt = gf_tab[ui32_ptr(x)[3] >> 24]; 00234 00235 ui32_ptr(x)[3] = (ui32_ptr(x)[3] << 8) | (ui32_ptr(x)[2] >> 24); 00236 ui32_ptr(x)[2] = (ui32_ptr(x)[2] << 8) | (ui32_ptr(x)[1] >> 24); 00237 ui32_ptr(x)[1] = (ui32_ptr(x)[1] 
<< 8) | (ui32_ptr(x)[0] >> 24); 00238 ui32_ptr(x)[0] = (ui32_ptr(x)[0] << 8) ^ _tt; 00239 } 00240 00241 #else 00242 00243 gf_inline void mul_x(void *r, const void *x) 00244 { uint_8t _tt = ui8_ptr(x)[15] & 1; 00245 ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7); 00246 ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7); 00247 ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7); 00248 ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7); 00249 ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7); 00250 ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7); 00251 ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7); 00252 ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7); 00253 ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7); 00254 ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7); 00255 ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7); 00256 ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7); 00257 ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7); 00258 ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7); 00259 ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7); 00260 ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 
0xe1 : 0x00); 00261 } 00262 00263 gf_inline void mul_x4(void *x) 00264 { uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff]; 00265 ui8_ptr(x)[15] = (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4); 00266 ui8_ptr(x)[14] = (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4); 00267 ui8_ptr(x)[13] = (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4); 00268 ui8_ptr(x)[12] = (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4); 00269 ui8_ptr(x)[11] = (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4); 00270 ui8_ptr(x)[10] = (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4); 00271 ui8_ptr(x)[ 9] = (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4); 00272 ui8_ptr(x)[ 8] = (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4); 00273 ui8_ptr(x)[ 7] = (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4); 00274 ui8_ptr(x)[ 6] = (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4); 00275 ui8_ptr(x)[ 5] = (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4); 00276 ui8_ptr(x)[ 4] = (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4); 00277 ui8_ptr(x)[ 3] = (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4); 00278 ui8_ptr(x)[ 2] = (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4); 00279 ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt >> 8); 00280 ui8_ptr(x)[ 0] = (ui8_ptr(x)[ 0] >> 4) ^ (_tt & 0xff); 00281 } 00282 00283 gf_inline void mul_x8(void *x) 00284 { uint_16t _tt = gf_tab[ui8_ptr(x)[15]]; 00285 memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15); 00286 ui8_ptr(x)[1] ^= (_tt >> 8); 00287 ui8_ptr(x)[0] = (_tt & 0xff); 00288 } 00289 00290 #endif 00291 00292 #else /* DEFINES */ 00293 00294 #if BFR_UNIT == 64 00295 00296 #define mul_x(r, x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 49) & MSK_80]; \ 00297 ui64_ptr(r)[1] = (ui64_ptr(x)[1] >> 1) & ~MSK_80 \ 00298 | ((ui64_ptr(x)[1] << 15) | (ui64_ptr(x)[0] >> 49)) & MSK_80; \ 00299 ui64_ptr(r)[0] = ((ui64_ptr(x)[0] >> 1) & ~MSK_80 \ 00300 | (ui64_ptr(x)[0] << 15) & MSK_80) ^ _tt; \ 00301 } while(0) 00302 00303 #if defined( VERSION_1 ) 00304 00305 #define mul_x4(x) do { uint_64t _tt = 
gf_tab[(ui64_ptr(x)[1] >> 52) & MSK_F0]; \ 00306 ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui64_ptr(x)[1] << 12) \ 00307 | (ui64_ptr(x)[0] >> 52)) & MSK_F0; \ 00308 ui64_ptr(x)[0] = ((ui64_ptr(x)[0] >> 4) & ~MSK_F0 \ 00309 | (ui64_ptr(x)[0] << 12) & MSK_F0) ^ _tt; \ 00310 } while(0) 00311 00312 #else 00313 00314 #define mul_x4(x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & 0xf0]; \ 00315 bswap64_block(x, x, 2); \ 00316 ui64_ptr(x)[1] = bswap_64((ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60)); \ 00317 ui64_ptr(x)[0] = bswap_64((ui64_ptr(x)[0] >> 4)) ^ _tt; \ 00318 } while(0) 00319 00320 #endif 00321 00322 #define mul_x8(x) do { uint_64t _tt = gf_tab[ui64_ptr(x)[1] >> 56]; \ 00323 ui64_ptr(x)[1] = (ui64_ptr(x)[1] << 8) | (ui64_ptr(x)[0] >> 56); \ 00324 ui64_ptr(x)[0] = (ui64_ptr(x)[0] << 8) ^ _tt; \ 00325 } while(0) 00326 00327 #elif BFR_UNIT == 32 00328 00329 #define mul_x(r, x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 17) & MSK_80]; \ 00330 ui32_ptr(r)[3] = (ui32_ptr(x)[3] >> 1) & ~MSK_80 | ((ui32_ptr(x)[3] << 15) \ 00331 | (ui32_ptr(x)[2] >> 17)) & MSK_80; \ 00332 ui32_ptr(r)[2] = (ui32_ptr(x)[2] >> 1) & ~MSK_80 | ((ui32_ptr(x)[2] << 15) \ 00333 | (ui32_ptr(x)[1] >> 17)) & MSK_80; \ 00334 ui32_ptr(r)[1] = (ui32_ptr(x)[1] >> 1) & ~MSK_80 | ((ui32_ptr(x)[1] << 15) \ 00335 | (ui32_ptr(x)[0] >> 17)) & MSK_80; \ 00336 ui32_ptr(r)[0] = ((ui32_ptr(x)[0] >> 1) & ~MSK_80 \ 00337 | (ui32_ptr(x)[0] << 15) & MSK_80) ^ _tt; \ 00338 } while(0) 00339 00340 #if defined( VERSION_1 ) 00341 00342 #define mul_x4(x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & MSK_F0]; \ 00343 ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[3] << 12) \ 00344 | (ui32_ptr(x)[2] >> 20)) & MSK_F0; \ 00345 ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[2] << 12) \ 00346 | (ui32_ptr(x)[1] >> 20)) & MSK_F0; \ 00347 ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[1] << 12) \ 00348 | (ui32_ptr(x)[0] >> 20)) & MSK_F0; \ 00349 
ui32_ptr(x)[0] = ((ui32_ptr(x)[0] >> 4) & ~MSK_F0 \ 00350 | (ui32_ptr(x)[0] << 12) & MSK_F0) ^ _tt; \ 00351 } while(0) 00352 00353 #else 00354 00355 #define mul_x4(x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & 0xf0]; \ 00356 bswap32_block(x, x, 4); \ 00357 ui32_ptr(x)[3] = bswap_32((ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28)); \ 00358 ui32_ptr(x)[2] = bswap_32((ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28)); \ 00359 ui32_ptr(x)[1] = bswap_32((ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28)); \ 00360 ui32_ptr(x)[0] = bswap_32((ui32_ptr(x)[0] >> 4)) ^ _tt; \ 00361 } while(0) 00362 00363 #endif 00364 00365 #define mul_x8(x) do { uint_32t _tt = gf_tab[ui32_ptr(x)[3] >> 24]; \ 00366 ui32_ptr(x)[3] = (ui32_ptr(x)[3] << 8) | (ui32_ptr(x)[2] >> 24); \ 00367 ui32_ptr(x)[2] = (ui32_ptr(x)[2] << 8) | (ui32_ptr(x)[1] >> 24); \ 00368 ui32_ptr(x)[1] = (ui32_ptr(x)[1] << 8) | (ui32_ptr(x)[0] >> 24); \ 00369 ui32_ptr(x)[0] = (ui32_ptr(x)[0] << 8) ^ _tt; \ 00370 } while(0) 00371 00372 #else 00373 00374 #define mul_x(r, x) do { uint_8t _tt = ui8_ptr(x)[15] & 1; \ 00375 ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7); \ 00376 ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7); \ 00377 ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7); \ 00378 ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7); \ 00379 ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7); \ 00380 ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7); \ 00381 ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7); \ 00382 ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7); \ 00383 ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7); \ 00384 ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7); \ 00385 ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7); \ 00386 ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7); \ 00387 ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] 
<< 7); \ 00388 ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7); \ 00389 ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7); \ 00390 ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00); \ 00391 } while(0) 00392 00393 #define mul_x4(x) do { uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff]; \ 00394 ui8_ptr(x)[15] = (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4); \ 00395 ui8_ptr(x)[14] = (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4); \ 00396 ui8_ptr(x)[13] = (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4); \ 00397 ui8_ptr(x)[12] = (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4); \ 00398 ui8_ptr(x)[11] = (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4); \ 00399 ui8_ptr(x)[10] = (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4); \ 00400 ui8_ptr(x)[ 9] = (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4); \ 00401 ui8_ptr(x)[ 8] = (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4); \ 00402 ui8_ptr(x)[ 7] = (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4); \ 00403 ui8_ptr(x)[ 6] = (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4); \ 00404 ui8_ptr(x)[ 5] = (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4); \ 00405 ui8_ptr(x)[ 4] = (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4); \ 00406 ui8_ptr(x)[ 3] = (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4); \ 00407 ui8_ptr(x)[ 2] = (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4); \ 00408 ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt >> 8); \ 00409 ui8_ptr(x)[ 0] = (ui8_ptr(x)[ 0] >> 4) ^ (_tt & 0xff); \ 00410 } while(0) 00411 00412 #define mul_x8(x) do { uint_16t _tt = gf_tab[ui8_ptr(x)[15]]; \ 00413 memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15); \ 00414 ui8_ptr(x)[1] ^= (_tt >> 8); \ 00415 ui8_ptr(x)[0] = (_tt & 0xff); \ 00416 } while(0) 00417 00418 #endif 00419 00420 #endif 00421 00422 #elif PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN 00423 00424 #if defined( USE_INLINES ) 00425 00426 #if BFR_UNIT == 64 00427 00428 gf_inline void mul_x(void *r, const void *x) 00429 { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 7) & 
0xff]; 00430 ui64_ptr(r)[1] = (ui64_ptr(x)[1] >> 1) | (ui64_ptr(x)[0] << 63); 00431 ui64_ptr(r)[0] = (ui64_ptr(x)[0] >> 1) ^ (_tt << 48); 00432 } 00433 00434 gf_inline void mul_x4(void *x) 00435 { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 4) & 0xff]; 00436 ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60); 00437 ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 4) ^ (_tt << 48); 00438 } 00439 00440 gf_inline void mul_x8(void *x) 00441 { uint_64t _tt = gf_tab[ui64_ptr(x)[1] & 0xff]; 00442 ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 8) | (ui64_ptr(x)[0] << 56); 00443 ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 8) ^ (_tt << 48); 00444 } 00445 00446 #elif BFR_UNIT == 32 00447 00448 gf_inline void mul_x(void *r, const void *x) 00449 { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 7) & 0xff]; 00450 ui32_ptr(r)[3] = (ui32_ptr(x)[3] >> 1) | (ui32_ptr(x)[2] << 31); 00451 ui32_ptr(r)[2] = (ui32_ptr(x)[2] >> 1) | (ui32_ptr(x)[1] << 31); 00452 ui32_ptr(r)[1] = (ui32_ptr(x)[1] >> 1) | (ui32_ptr(x)[0] << 31); 00453 ui32_ptr(r)[0] = (ui32_ptr(x)[0] >> 1) ^ (_tt << 16); 00454 } 00455 00456 gf_inline void mul_x4(void *x) 00457 { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 4) & 0xff]; 00458 ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28); 00459 ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28); 00460 ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28); 00461 ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 4) ^ (_tt << 16); 00462 } 00463 00464 gf_inline void mul_x8(void *x) 00465 { uint_32t _tt = gf_tab[ui32_ptr(x)[3] & 0xff]; 00466 ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 8) | (ui32_ptr(x)[2] << 24); 00467 ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 8) | (ui32_ptr(x)[1] << 24); 00468 ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 8) | (ui32_ptr(x)[0] << 24); 00469 ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 8) ^ (_tt << 16); 00470 } 00471 00472 #else 00473 00474 gf_inline void mul_x(void *r, const void *x) 00475 { uint_8t _tt = ui8_ptr(x)[15] & 1; 00476 ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] 
<< 7); 00477 ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7); 00478 ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7); 00479 ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7); 00480 ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7); 00481 ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7); 00482 ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7); 00483 ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7); 00484 ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7); 00485 ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7); 00486 ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7); 00487 ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7); 00488 ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7); 00489 ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7); 00490 ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7); 00491 ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 
0xe1 : 0x00); 00492 } 00493 00494 gf_inline void mul_x4(void *x) 00495 { 00496 uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff]; 00497 ui8_ptr(x)[15] = (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4); 00498 ui8_ptr(x)[14] = (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4); 00499 ui8_ptr(x)[13] = (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4); 00500 ui8_ptr(x)[12] = (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4); 00501 ui8_ptr(x)[11] = (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4); 00502 ui8_ptr(x)[10] = (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4); 00503 ui8_ptr(x)[ 9] = (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4); 00504 ui8_ptr(x)[ 8] = (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4); 00505 ui8_ptr(x)[ 7] = (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4); 00506 ui8_ptr(x)[ 6] = (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4); 00507 ui8_ptr(x)[ 5] = (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4); 00508 ui8_ptr(x)[ 4] = (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4); 00509 ui8_ptr(x)[ 3] = (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4); 00510 ui8_ptr(x)[ 2] = (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4); 00511 ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt & 0xff); 00512 ui8_ptr(x)[ 0] = (ui8_ptr(x)[ 0] >> 4) ^ (_tt >> 8); 00513 } 00514 00515 gf_inline void mul_x8(void *x) 00516 { uint_16t _tt = gf_tab[ui8_ptr(x)[15]]; 00517 memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15); 00518 ui8_ptr(x)[1] ^= (_tt & 0xff); 00519 ui8_ptr(x)[0] = (_tt >> 8); 00520 } 00521 00522 #endif 00523 00524 #else /* DEFINES */ 00525 00526 #if BFR_UNIT == 64 00527 00528 #define mul_x(r, x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 7) & 0xff]; \ 00529 ui64_ptr(r)[1] = (ui64_ptr(x)[1] >> 1) | (ui64_ptr(x)[0] << 63); \ 00530 ui64_ptr(r)[0] = (ui64_ptr(x)[0] >> 1) ^ (_tt << 48); \ 00531 } while(0) 00532 00533 #define mul_x4(x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 4) & 0xff]; \ 00534 ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60); \ 00535 ui64_ptr(x)[0] = 
(ui64_ptr(x)[0] >> 4) ^ (_tt << 48); \ 00536 } while(0) 00537 00538 #define mul_x8(x) do { uint_64t _tt = gf_tab[ui64_ptr(x)[1] & 0xff]; \ 00539 ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 8) | (ui64_ptr(x)[0] << 56); \ 00540 ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 8) ^ (_tt << 48); \ 00541 } while(0) 00542 00543 #elif BFR_UNIT == 32 00544 00545 #define mul_x(r, x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 7) & 0xff]; \ 00546 ui32_ptr(r)[3] = (ui32_ptr(x)[3] >> 1) | (ui32_ptr(x)[2] << 31); \ 00547 ui32_ptr(r)[2] = (ui32_ptr(x)[2] >> 1) | (ui32_ptr(x)[1] << 31); \ 00548 ui32_ptr(r)[1] = (ui32_ptr(x)[1] >> 1) | (ui32_ptr(x)[0] << 31); \ 00549 ui32_ptr(r)[0] = (ui32_ptr(x)[0] >> 1) ^ (_tt << 16); \ 00550 } while(0) 00551 00552 #define mul_x4(x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 4) & 0xff]; \ 00553 ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28); \ 00554 ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28); \ 00555 ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28); \ 00556 ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 4) ^ (_tt << 16); \ 00557 } while(0) 00558 00559 #define mul_x8(x) do { uint_32t _tt = gf_tab[ui32_ptr(x)[3] & 0xff]; \ 00560 ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 8) | (ui32_ptr(x)[2] << 24); \ 00561 ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 8) | (ui32_ptr(x)[1] << 24); \ 00562 ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 8) | (ui32_ptr(x)[0] << 24); \ 00563 ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 8) ^ (_tt << 16); \ 00564 } while(0) 00565 00566 #else 00567 00568 #define mul_x(r, x) do { uint_8t _tt = ui8_ptr(x)[15] & 1; \ 00569 ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7); \ 00570 ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7); \ 00571 ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7); \ 00572 ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7); \ 00573 ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7); \ 00574 ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 
7); \ 00575 ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7); \ 00576 ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7); \ 00577 ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7); \ 00578 ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7); \ 00579 ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7); \ 00580 ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7); \ 00581 ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7); \ 00582 ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7); \ 00583 ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7); \ 00584 ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00); \ 00585 } while(0) 00586 00587 #define mul_x4(x) do { uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff]; \ 00588 ui8_ptr(x)[15] = (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4); \ 00589 ui8_ptr(x)[14] = (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4); \ 00590 ui8_ptr(x)[13] = (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4); \ 00591 ui8_ptr(x)[12] = (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4); \ 00592 ui8_ptr(x)[11] = (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4); \ 00593 ui8_ptr(x)[10] = (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4); \ 00594 ui8_ptr(x)[ 9] = (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4); \ 00595 ui8_ptr(x)[ 8] = (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4); \ 00596 ui8_ptr(x)[ 7] = (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4); \ 00597 ui8_ptr(x)[ 6] = (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4); \ 00598 ui8_ptr(x)[ 5] = (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4); \ 00599 ui8_ptr(x)[ 4] = (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4); \ 00600 ui8_ptr(x)[ 3] = (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4); \ 00601 ui8_ptr(x)[ 2] = (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4); \ 00602 ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt & 0xff); \ 00603 ui8_ptr(x)[ 0] = (ui8_ptr(x)[ 0] >> 4) ^ (_tt >> 8); \ 00604 } 
while(0) 00605 00606 #define mul_x8(x) do { uint_16t _tt = gf_tab[ui8_ptr(x)[15]]; \ 00607 memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15); \ 00608 ui8_ptr(x)[1] ^= (_tt & 0xff); \ 00609 ui8_ptr(x)[0] = (_tt >> 8); \ 00610 } while(0) 00611 00612 #endif 00613 00614 #endif 00615 00616 #else 00617 # error Platform byte order has not been set. 00618 #endif 00619 00620 /* A slow generic version of gf_mul (a = a * b) */ 00621 00622 void gf_mul(void *a, const void* b); 00623 00624 /* This version uses 64k bytes of table space on the stack. 00625 A 16 byte buffer has to be multiplied by a 16 byte key 00626 value in GF(128). If we consider a GF(128) value in 00627 the buffer's lowest byte, we can construct a table of 00628 the 256 16 byte values that result from the 256 values 00629 of this byte. This requires 4096 bytes. But we also 00630 need tables for each of the 16 higher bytes in the 00631 buffer as well, which makes 64 kbytes in total. 00632 */ 00633 00634 void init_64k_table(unsigned char g[], void *t); 00635 typedef uint_32t (*gf_t64k)[256][GF_BYTE_LEN >> 2]; 00636 #define tab64k(x) ((gf_t64k)x) 00637 #define xor_64k(i,a,t,r) xor_block_aligned(r, tab64k(t)[i][a[i]]) 00638 00639 #if defined( USE_INLINES ) 00640 00641 #if defined( UNROLL_LOOPS ) 00642 00643 gf_inline void gf_mul_64k(unsigned char a[], void *t, void *r) 00644 { 00645 move_block_aligned(r, tab64k(t)[0][a[0]]); xor_64k( 1, a, t, r); 00646 xor_64k( 2, a, t, r); xor_64k( 3, a, t, r); 00647 xor_64k( 4, a, t, r); xor_64k( 5, a, t, r); 00648 xor_64k( 6, a, t, r); xor_64k( 7, a, t, r); 00649 xor_64k( 8, a, t, r); xor_64k( 9, a, t, r); 00650 xor_64k(10, a, t, r); xor_64k(11, a, t, r); 00651 xor_64k(12, a, t, r); xor_64k(13, a, t, r); 00652 xor_64k(14, a, t, r); xor_64k(15, a, t, r); 00653 move_block_aligned(a, r); 00654 } 00655 00656 #else 00657 00658 gf_inline void gf_mul_64k(unsigned char a[], void *t, void *r) 00659 { int i; 00660 move_block_aligned(r, tab64k(t)[0][a[0]]); 00661 for(i = 1; i < GF_BYTE_LEN; ++i) 
00662 xor_64k(i, a, t, r); 00663 move_block_aligned(a, r); 00664 } 00665 00666 #endif 00667 00668 #else 00669 00670 #if !defined( UNROLL_LOOPS ) 00671 00672 #define gf_mul_64k(a, t, r) do { \ 00673 move_block_aligned(r, tab64k(t)[0][a[0]]); \ 00674 xor_64k( 1, a, t, r); \ 00675 xor_64k( 2, a, t, r); xor_64k( 3, a, t, r); \ 00676 xor_64k( 4, a, t, r); xor_64k( 5, a, t, r); \ 00677 xor_64k( 6, a, t, r); xor_64k( 7, a, t, r); \ 00678 xor_64k( 8, a, t, r); xor_64k( 9, a, t, r); \ 00679 xor_64k(10, a, t, r); xor_64k(11, a, t, r); \ 00680 xor_64k(12, a, t, r); xor_64k(13, a, t, r); \ 00681 xor_64k(14, a, t, r); xor_64k(15, a, t, r); \ 00682 move_block_aligned(a, r); \ 00683 } while(0) 00684 00685 #else 00686 00687 #define gf_mul_64k(a, t, r) do { int i; \ 00688 move_block_aligned(r, tab64k(t)[0][a[0]]); \ 00689 for(i = 1; i < GF_BYTE_LEN; ++i) \ 00690 { xor_64k(i, a, t, r); \ 00691 } \ 00692 move_block_aligned(a, r); \ 00693 } while(0) 00694 00695 #endif 00696 00697 #endif 00698 00699 /* This version uses 8k bytes of table space on the stack. 00700 A 16 byte buffer has to be multiplied by a 16 byte key 00701 value in GF(128). If we consider a GF(128) value in 00702 the buffer's lowest 4-bits, we can construct a table of 00703 the 16 16 byte values that result from the 16 values 00704 of these 4 bits. This requires 256 bytes. But we also 00705 need tables for each of the 32 higher 4 bit groups, 00706 which makes 8 kbytes in total. 
00707 */ 00708 00709 void init_8k_table(unsigned char g[], void *t); 00710 00711 typedef uint_32t (*gf_t8k)[16][GF_BYTE_LEN >> 2]; 00712 #define tab8k(x) ((gf_t8k)x) 00713 #define xor_8k(i,a,t,r) \ 00714 xor_block_aligned(r, tab8k(t)[i + i][a[i] & 15]); \ 00715 xor_block_aligned(r, tab8k(t)[i + i + 1][a[i] >> 4]) 00716 00717 #if defined( USE_INLINES ) 00718 00719 #if defined( UNROLL_LOOPS ) 00720 00721 gf_inline void gf_mul_8k(unsigned char a[], void *t, void *r) 00722 { 00723 move_block_aligned(r, tab8k(t)[0][a[0] & 15]); 00724 xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]); 00725 xor_8k( 1, a, t, r); xor_8k( 2, a, t, r); xor_8k( 3, a, t, r); 00726 xor_8k( 4, a, t, r); xor_8k( 5, a, t, r); xor_8k( 6, a, t, r); xor_8k( 7, a, t, r); 00727 xor_8k( 8, a, t, r); xor_8k( 9, a, t, r); xor_8k(10, a, t, r); xor_8k(11, a, t, r); 00728 xor_8k(12, a, t, r); xor_8k(13, a, t, r); xor_8k(14, a, t, r); xor_8k(15, a, t, r); 00729 move_block_aligned(a, r); 00730 } 00731 00732 #else 00733 00734 gf_inline void gf_mul_8k(unsigned char a[], void *t, void *r) 00735 { int i; 00736 memcpy(r, tab8k(t)[0][a[0] & 15], GF_BYTE_LEN); 00737 xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]); 00738 for(i = 1; i < GF_BYTE_LEN; ++i) 00739 { xor_8k(i, a, t, r); 00740 } 00741 memcpy(a, r, GF_BYTE_LEN); 00742 } 00743 00744 #endif 00745 00746 #else 00747 00748 #if defined( UNROLL_LOOPS ) 00749 00750 #define gf_mul_8k(a, t, r) do { \ 00751 move_block_aligned(r, tab8k(t)[0][a[0] & 15]); \ 00752 xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]); \ 00753 xor_8k( 1, a, t, r); xor_8k( 2, a, t, r); \ 00754 xor_8k( 3, a, t, r); xor_8k( 4, a, t, r); \ 00755 xor_8k( 5, a, t, r); xor_8k( 6, a, t, r); \ 00756 xor_8k( 7, a, t, r); xor_8k( 8, a, t, r); \ 00757 xor_8k( 9, a, t, r); xor_8k(10, a, t, r); \ 00758 xor_8k(11, a, t, r); xor_8k(12, a, t, r); \ 00759 xor_8k(13, a, t, r); xor_8k(14, a, t, r); \ 00760 xor_8k(15, a, t, r); move_block_aligned(a, r); \ 00761 } while(0) 00762 00763 #else 00764 00765 #define gf_mul_8k(a, t, r) 
do { int i; \ 00766 memcpy(r, tab8k(t)[0][a[0] & 15], GF_BYTE_LEN); \ 00767 xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]); \ 00768 for(i = 1; i < GF_BYTE_LEN; ++i) \ 00769 { xor_8k(i, a, t, r); \ 00770 } \ 00771 memcpy(a, r, GF_BYTE_LEN); \ 00772 } while(0) 00773 00774 #endif 00775 00776 #endif 00777 00778 /* This version uses 4k bytes of table space on the stack. 00779 A 16 byte buffer has to be multiplied by a 16 byte key 00780 value in GF(128). If we consider a GF(128) value in a 00781 single byte, we can construct a table of the 256 16 byte 00782 values that result from the 256 values of this byte. 00783 This requires 4096 bytes. If we take the highest byte in 00784 the buffer and use this table to get the result, we then 00785 have to multiply by x^120 to get the final value. For the 00786 next highest byte the result has to be multiplied by x^112 00787 and so on. But we can do this by accumulating the result 00788 in an accumulator starting with the result for the top 00789 byte. We repeatedly multiply the accumulator value by 00790 x^8 and then add in (i.e. xor) the 16 bytes of the next 00791 lower byte in the buffer, stopping when we reach the 00792 lowest byte. This requires a 4096 byte table. 
00793 */ 00794 00795 void init_4k_table(unsigned char g[], void *t); 00796 00797 typedef uint_32t (*gf_t4k)[GF_BYTE_LEN >> 2]; 00798 #define tab4k(x) ((gf_t4k)x) 00799 #define xor_4k(i,a,t,r) mul_x8(r); xor_block_aligned(r, tab4k(t)[a[i]]) 00800 00801 #if defined( USE_INLINES ) 00802 00803 #if defined( UNROLL_LOOPS ) 00804 00805 gf_inline void gf_mul_4k(unsigned char a[], void *t, void *r) 00806 { 00807 move_block_aligned(r,tab4k(t)[a[15]]); 00808 xor_4k(14, a, t, r); xor_4k(13, a, t, r); xor_4k(12, a, t, r); 00809 xor_4k(11, a, t, r); xor_4k(10, a, t, r); xor_4k( 9, a, t, r); 00810 xor_4k( 8, a, t, r); xor_4k( 7, a, t, r); xor_4k( 6, a, t, r); 00811 xor_4k( 5, a, t, r); xor_4k( 4, a, t, r); xor_4k( 3, a, t, r); 00812 xor_4k( 2, a, t, r); xor_4k( 1, a, t, r); xor_4k( 0, a, t, r); 00813 move_block_aligned(a, r); 00814 } 00815 00816 #else 00817 00818 gf_inline void gf_mul_4k(unsigned char a[], void *t, void *r) 00819 { int i = 15; 00820 move_block_aligned(r,tab4k(t)[a[15]]); 00821 while(i--) 00822 { 00823 xor_4k(i, a, t, r); 00824 } 00825 move_block_aligned(a, r); 00826 } 00827 00828 #endif 00829 00830 #else 00831 00832 #if defined( UNROLL_LOOPS ) 00833 00834 #define gf_mul_4k(a, t, r) do { \ 00835 move_block_aligned(r,tab4k(t)[a[15]]); \ 00836 xor_4k(14, a, t, r); xor_4k(13, a, t, r); xor_4k(12, a, t, r); \ 00837 xor_4k(11, a, t, r); xor_4k(10, a, t, r); xor_4k( 9, a, t, r); \ 00838 xor_4k( 8, a, t, r); xor_4k( 7, a, t, r); xor_4k( 6, a, t, r); \ 00839 xor_4k( 5, a, t, r); xor_4k( 4, a, t, r); xor_4k( 3, a, t, r); \ 00840 xor_4k( 2, a, t, r); xor_4k( 1, a, t, r); xor_4k( 0, a, t, r); \ 00841 move_block_aligned(a, r); \ 00842 } while(0) 00843 00844 #else 00845 00846 #define gf_mul_4k(a, t, r) do { int i = 15; \ 00847 move_block_aligned(r,tab4k(t)[a[15]]); \ 00848 while(i--) \ 00849 { xor_4k(i, a, t, r); \ 00850 } \ 00851 move_block_aligned(a, r); \ 00852 } while(0) 00853 00854 #endif 00855 00856 #endif 00857 00858 /* This version uses 256 bytes of table space on the 
stack. 00859 A 16 byte buffer has to be multiplied by a 16 byte key 00860 value in GF(128). If we consider a GF(128) value in a 00861 single 4-bit nibble, we can construct a table of the 16 00862 16 byte values that result from the 16 values of this 00863 byte. This requires 256 bytes. If we take the highest 00864 4-bit nibble in the buffer and use this table to get the 00865 result, we then have to multiply by x^124 to get the 00866 final value. For the next highest byte the result has to 00867 be multiplied by x^120 and so on. But we can do this by 00868 accumulating the result in an accumulator starting with 00869 the result for the top nibble. We repeatedly multiply 00870 the accumulator value by x^4 and then add in (i.e. xor) 00871 the 16 bytes of the next lower nibble in the buffer, 00872 stopping when we reach the lowest nibblebyte. This uses 00873 a 256 byte table. 00874 */ 00875 00876 void init_256_table(unsigned char g[], void *t); 00877 00878 typedef uint_32t (*gf_t256)[GF_BYTE_LEN >> 2]; 00879 #define tab256(t) ((gf_t256)t) 00880 #define xor_256(i,a,t,r) \ 00881 mul_x4(r); xor_block_aligned(r, tab256(t)[a[i] & 15]); \ 00882 mul_x4(r); xor_block_aligned(r, tab256(t)[a[i] >> 4]) 00883 00884 #if defined( USE_INLINES ) 00885 00886 #if defined( UNROLL_LOOPS ) 00887 00888 gf_inline void gf_mul_256(unsigned char a[], void *t, void *r) 00889 { 00890 move_block_aligned(r,tab256(t)[a[15] & 15]); mul_x4(r); 00891 xor_block_aligned(r, tab256(t)[a[15] >> 4]); 00892 xor_256(14, a, t, r); xor_256(13, a, t, r); 00893 xor_256(12, a, t, r); xor_256(11, a, t, r); 00894 xor_256(10, a, t, r); xor_256( 9, a, t, r); 00895 xor_256( 8, a, t, r); xor_256( 7, a, t, r); 00896 xor_256( 6, a, t, r); xor_256( 5, a, t, r); 00897 xor_256( 4, a, t, r); xor_256( 3, a, t, r); 00898 xor_256( 2, a, t, r); xor_256( 1, a, t, r); 00899 xor_256( 0, a, t, r); move_block_aligned(a, r); 00900 } 00901 00902 #else 00903 00904 gf_inline void gf_mul_256(unsigned char a[], void *t, void *r) 00905 { int 
i = 15; 00906 move_block_aligned(r,tab256(t)[a[15] & 15]); mul_x4(r); 00907 xor_block_aligned(r, tab256(t)[a[15] >> 4]); 00908 while(i--) 00909 { xor_256(i, a, t, r); 00910 } 00911 move_block_aligned(a, r); 00912 } 00913 00914 #endif 00915 00916 #else 00917 00918 #if defined( UNROLL_LOOPS ) 00919 00920 #define gf_mul_256(a, t, r) do { \ 00921 move_block_aligned(r,tab256(t)[a[15] & 15]); mul_x4(r); \ 00922 xor_block_aligned(r, tab256(t)[a[15] >> 4]); \ 00923 xor_256(14, a, t, r); xor_256(13, a, t, r); \ 00924 xor_256(12, a, t, r); xor_256(11, a, t, r); \ 00925 xor_256(10, a, t, r); xor_256( 9, a, t, r); \ 00926 xor_256( 8, a, t, r); xor_256( 7, a, t, r); \ 00927 xor_256( 6, a, t, r); xor_256( 5, a, t, r); \ 00928 xor_256( 4, a, t, r); xor_256( 3, a, t, r); \ 00929 xor_256( 2, a, t, r); xor_256( 1, a, t, r); \ 00930 xor_256( 0, a, t, r); move_block_aligned(a, r); \ 00931 } while(0) 00932 00933 #else 00934 00935 #define gf_mul_256(a, t, r) do { int i = 15; \ 00936 move_block_aligned(r,tab256(t)[a[15] & 15]); mul_x4(r); \ 00937 xor_block_aligned(r, tab256(t)[a[15] >> 4]); \ 00938 while(i--) \ 00939 { xor_256(i, a, t, r); \ 00940 } \ 00941 move_block_aligned(a, r); \ 00942 } while(0) 00943 00944 #endif 00945 00946 #endif 00947 00948 #if defined(__cplusplus) 00949 } 00950 #endif 00951 00952 #endif