IBR-DTNSuite  0.8
ibrcommon/ibrcommon/ssl/gcm/gf128mul.h
Go to the documentation of this file.
00001 /*
00002  ---------------------------------------------------------------------------
00003  Copyright (c) 1998-2006, Brian Gladman, Worcester, UK. All rights reserved.
00004 
00005  LICENSE TERMS
00006 
00007  The free distribution and use of this software in both source and binary
00008  form is allowed (with or without changes) provided that:
00009 
00010    1. distributions of this source code include the above copyright
00011       notice, this list of conditions and the following disclaimer;
00012 
00013    2. distributions in binary form include the above copyright
00014       notice, this list of conditions and the following disclaimer
00015       in the documentation and/or other associated materials;
00016 
00017    3. the copyright holder's name is not used to endorse products
00018       built using this software without specific written permission.
00019 
00020  ALTERNATIVELY, provided that this notice is retained in full, this product
00021  may be distributed under the terms of the GNU General Public License (GPL),
00022  in which case the provisions of the GPL apply INSTEAD OF those given above.
00023 
00024  DISCLAIMER
00025 
00026  This software is provided 'as is' with no explicit or implied warranties
00027  in respect of its properties, including, but not limited to, correctness
00028  and/or fitness for purpose.
00029  ---------------------------------------------------------------------------
00030  Issue Date: 13/10/2006
00031 
00032  An implementation of field multiplication in Galois Field GF(128)
00033 */
00034 
00035 #ifndef GF128MUL_H
00036 #define GF128MUL_H
00037 
00038 #include <stdlib.h>
00039 #include <string.h>
00040 
00041 #include "mode_hdr.h"
00042 
00043 /*  Table sizes for GF(128) Multiply.  Normally larger tables give 
00044     higher speed but cache loading might change this. Normally only 
00045     one table size (or none at all) will be specified here
00046 */
00047 
00048 #if 0
00049 #  define TABLES_64K
00050 #endif
00051 #if 1
00052 #  define TABLES_8K
00053 #endif
00054 #if 0
00055 #  define TABLES_4K
00056 #endif
00057 #if 0
00058 #  define TABLES_256
00059 #endif
00060 
00061 /*  Use of inlines is preferred but code blocks can also be expanded inline
00062     using 'defines'.  But the latter approach will typically generate a LOT
00063     of code and is not recommended. 
00064 */
00065 #if 0
00066 #  define USE_INLINES
00067 #endif
00068 
00069 /*  Speed critical loops can be unrolled to gain speed but consume more
00070     memory
00071 */
00072 #if 0
00073 #  define UNROLL_LOOPS
00074 #endif
00075 
00076 /*  Multiply a GF128 field element by x. Field elements are held in arrays
00077     of bytes in which field bits 8n..8n + 7 are held in byte[n], with lower
00078     indexed bits placed in the more numerically significant bit positions
00079     within bytes.
00080 
00081     On little endian machines the bit indexes translate into the bit
00082     positions within four 32-bit words in the following way
00083 
00084     MS            x[0]           LS  MS            x[1]           LS
00085     ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
00086     24...31 16...23 08...15 00...07  56...63 48...55 40...47 32...39
00087 
00088     MS            x[2]           LS  MS            x[3]           LS
00089     ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
00090     88...95 80...87 72...79 64...71  120.127 112.119 104.111 96..103
00091 
00092     On big endian machines the bit indexes translate into the bit
00093     positions within four 32-bit words in the following way
00094 
00095     MS            x[0]           LS  MS            x[1]           LS
00096     ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
00097     00...07 08...15 16...23 24...31  32...39 40...47 48...55 56...63
00098 
00099     MS            x[2]           LS  MS            x[3]           LS
00100     ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
00101     64...71 72...79 80...87 88...95  96..103 104.111 112.119 120.127
00102 */
00103 
00104 #define GF_BYTE_LEN 16
00105 
00106 #if defined( USE_INLINES )
00107 #  if defined( _MSC_VER )
00108 #    define gf_inline __inline
00109 #  elif defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
00110 #    define gf_inline static inline
00111 #  else
00112 #    define gf_inline static
00113 #  endif
00114 #endif
00115 
00116 #if defined(__cplusplus)
00117 extern "C"
00118 {
00119 #endif
00120 
00121 /*  These functions multiply a field element by x, by x^4 and by x^8 in
00122     the polynomial field representation. They use word operations to
00123     gain speed but compensate for machine endianness and hence work
00124     correctly on both styles of machine.
00125 */
00126 extern const unsigned short gf_tab[256];
00127 
00128 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
00129 
00130 /*  This section is not needed as GF(128) multiplication is now implemented
00131     but is left in place as it provides a template for an alternative little
00132     endian implementation approach based on conversion to and from big endian
00133     format
00134 */
00135 #if 0
00136 
00137 /*  This is a template for mul_x.  The mul_x4 and mul_x8 little endian
00138     alternative implementations (and their defined versions) follow the 
00139     big endian functions below in the same way.
00140 */
00141 
/*  Template (disabled by the surrounding #if 0): multiply the GF(128)
    element at x by the field generator x, writing the result to r.
    Swaps to big endian word order so a plain 1-bit right shift across
    the four 32-bit words matches the bit layout described above, folds
    the reduction term in, then swaps back.
    NOTE(review): gf_tab is defined in the companion .c file; assumed to
    hold the 16-bit reduction values for each shifted-out byte — confirm.
*/
00142 gf_inline void mul_x(void *r, const void *x)
00143 {   uint_32t _tt;
00144     bswap32_block(r, x, 4); 
          /* reduction term selected by the bit about to fall off the end */
00145     _tt = gf_tab[(ui32_ptr(r)[3] << 7) & 0xff];
00146     ui32_ptr(r)[3] = (ui32_ptr(r)[3] >> 1) | (ui32_ptr(r)[2] << 31);
00147     ui32_ptr(r)[2] = (ui32_ptr(r)[2] >> 1) | (ui32_ptr(r)[1] << 31);
00148     ui32_ptr(r)[1] = (ui32_ptr(r)[1] >> 1) | (ui32_ptr(r)[0] << 31);
00149     ui32_ptr(r)[0] = (ui32_ptr(r)[0] >> 1) ^ bswap_32(_tt);
00150     bswap32_block(r, r, 4);
00151 }
00152 
00153 #endif
00154 
/*  Select the first of the two little endian mul_x4 implementations below */
00155 #define VERSION_1
00156 
/*  SWAR byte masks: unit_cast(BFR_UNIT,-1) / 0xff replicates 0x01 into
    every byte of a BFR_UNIT-wide word, so MSK_80 has 0x80 in every byte
    and MSK_F0 has 0xf0 in every byte.  These isolate the bits that must
    cross byte boundaries when bytes are shifted while still packed in
    little endian word order. */
00157 #define MSK_80   (0x80 * (unit_cast(BFR_UNIT,-1) / 0xff))
00158 #define MSK_F0   (0xf0 * (unit_cast(BFR_UNIT,-1) / 0xff))
00159 
00160 #if defined( USE_INLINES )
00161 
00162 #if BFR_UNIT == 64
00163 
    /*  r = x * P(x) (a 1-bit shift toward higher field bit index) on a
        little endian machine, operating on two 64-bit words in place of
        16 bytes.  Bits that would move between bytes are gathered with
        MSK_80; the reduction term _tt is folded into word 0. */
00164     gf_inline void mul_x(void *r, const void *x)
00165     {   uint_64t  _tt = gf_tab[(ui64_ptr(x)[1] >> 49) & MSK_80];
00166 
00167         ui64_ptr(r)[1] =  (ui64_ptr(x)[1] >> 1) & ~MSK_80 | ((ui64_ptr(x)[1] << 15) | (ui64_ptr(x)[0] >> 49)) & MSK_80;
00168         ui64_ptr(r)[0] = ((ui64_ptr(x)[0] >> 1) & ~MSK_80 |  (ui64_ptr(x)[0] << 15) & MSK_80) ^ _tt;
00169     }
00170 
00171   #if defined( VERSION_1 )
00172 
    /*  x *= P(x)^4 in place: same SWAR scheme as mul_x but a 4-bit shift,
        using the MSK_F0 nibble mask. */
00173     gf_inline void mul_x4(void *x)
00174     {   uint_64t   _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & MSK_F0];
00175 
00176         ui64_ptr(x)[1] =  (ui64_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui64_ptr(x)[1] << 12) | (ui64_ptr(x)[0] >> 52)) & MSK_F0;
00177         ui64_ptr(x)[0] = ((ui64_ptr(x)[0] >> 4) & ~MSK_F0 |  (ui64_ptr(x)[0] << 12) & MSK_F0) ^ _tt;
00178     }
00179 
00180   #else
00181 
    /*  Alternative mul_x4: byte swap to big endian order, shift, swap
        back.  Same result as VERSION_1, different speed trade-off. */
00182     gf_inline void mul_x4(void *x)
00183     {   uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & 0xf0];
00184         bswap64_block(x, x, 2);
00185         ui64_ptr(x)[1] = bswap_64((ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60));
00186         ui64_ptr(x)[0] = bswap_64((ui64_ptr(x)[0] >> 4)) ^ _tt;
00187     }
00188 
00189   #endif
00190 
    /*  x *= P(x)^8 in place: a whole-byte shift, which on little endian
        words is a plain 8-bit left shift across the two words. */
00191     gf_inline void mul_x8(void *x)
00192     {   uint_64t _tt = gf_tab[ui64_ptr(x)[1] >> 56];
00193         ui64_ptr(x)[1] = (ui64_ptr(x)[1] << 8) | (ui64_ptr(x)[0] >> 56); 
00194         ui64_ptr(x)[0] = (ui64_ptr(x)[0] << 8) ^ _tt;
00195     }
00196 
00197 #elif BFR_UNIT == 32
00198 
    /*  r = x * P(x) on a little endian machine using four 32-bit words;
        same SWAR masking scheme as the 64-bit version above. */
00199     gf_inline void mul_x(void *r, const void *x)
00200     {   uint_32t  _tt = gf_tab[(ui32_ptr(x)[3] >> 17) & MSK_80];
00201 
00202         ui32_ptr(r)[3] =  (ui32_ptr(x)[3] >> 1) & ~MSK_80 | ((ui32_ptr(x)[3] << 15) | (ui32_ptr(x)[2] >> 17)) & MSK_80;
00203         ui32_ptr(r)[2] =  (ui32_ptr(x)[2] >> 1) & ~MSK_80 | ((ui32_ptr(x)[2] << 15) | (ui32_ptr(x)[1] >> 17)) & MSK_80;
00204         ui32_ptr(r)[1] =  (ui32_ptr(x)[1] >> 1) & ~MSK_80 | ((ui32_ptr(x)[1] << 15) | (ui32_ptr(x)[0] >> 17)) & MSK_80;
00205         ui32_ptr(r)[0] = ((ui32_ptr(x)[0] >> 1) & ~MSK_80 |  (ui32_ptr(x)[0] << 15) & MSK_80) ^ _tt;
00206     }
00207 
00208   #if defined( VERSION_1 )
00209 
    /*  x *= P(x)^4 in place (4-bit shift, MSK_F0 nibble mask). */
00210     gf_inline void mul_x4(void *x)
00211     {   uint_32t   _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & MSK_F0];
00212 
00213         ui32_ptr(x)[3] =  (ui32_ptr(x)[3] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[3] << 12) | (ui32_ptr(x)[2] >> 20)) & MSK_F0;
00214         ui32_ptr(x)[2] =  (ui32_ptr(x)[2] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[2] << 12) | (ui32_ptr(x)[1] >> 20)) & MSK_F0;
00215         ui32_ptr(x)[1] =  (ui32_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[1] << 12) | (ui32_ptr(x)[0] >> 20)) & MSK_F0;
00216         ui32_ptr(x)[0] = ((ui32_ptr(x)[0] >> 4) & ~MSK_F0 |  (ui32_ptr(x)[0] << 12) & MSK_F0) ^ _tt;
00217     }
00218 
00219   #else
00220 
    /*  Alternative mul_x4: byte swap to big endian order, shift, swap
        back.  Same result as VERSION_1. */
00221     gf_inline void mul_x4(void *x)
00222     {   uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & 0xf0];
00223         bswap32_block(x, x, 4);
00224         ui32_ptr(x)[3] = bswap_32((ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28));
00225         ui32_ptr(x)[2] = bswap_32((ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28));
00226         ui32_ptr(x)[1] = bswap_32((ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28));
00227         ui32_ptr(x)[0] = bswap_32((ui32_ptr(x)[0] >> 4)) ^ _tt;
00228     }
00229 
00230   #endif
00231 
    /*  x *= P(x)^8 in place: whole-byte shift, an 8-bit left shift across
        the four little endian words. */
00232     gf_inline void mul_x8(void *x)
00233     {   uint_32t   _tt = gf_tab[ui32_ptr(x)[3] >> 24];
00234 
00235         ui32_ptr(x)[3] = (ui32_ptr(x)[3] << 8) | (ui32_ptr(x)[2] >> 24);
00236         ui32_ptr(x)[2] = (ui32_ptr(x)[2] << 8) | (ui32_ptr(x)[1] >> 24);
00237         ui32_ptr(x)[1] = (ui32_ptr(x)[1] << 8) | (ui32_ptr(x)[0] >> 24);
00238         ui32_ptr(x)[0] = (ui32_ptr(x)[0] << 8) ^ _tt;
00239     }
00240 
00241 #else
00242 
    /*  Byte-oriented fallback (endian independent in practice): r = x
        times P(x).  Shifts the 16 byte element right by one bit from
        byte 15 up to byte 0; if the bit shifted out of byte 15 was set,
        0xe1 (the reduction constant for the GF(128) field polynomial
        used here) is folded into byte 0.  Unrolled; order matters only
        in that r may alias x byte-for-byte. */
00243     gf_inline void mul_x(void *r, const void *x)
00244     {   uint_8t _tt = ui8_ptr(x)[15] & 1;
00245         ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7);
00246         ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7);
00247         ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7);
00248         ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7);
00249         ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7);
00250         ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7);
00251         ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7);
00252         ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7);
00253         ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7);
00254         ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7);
00255         ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7);
00256         ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7);
00257         ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7);
00258         ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7);
00259         ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7);
00260         ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00);
00261     }
00262 
    /*  x *= P(x)^4 in place, byte at a time.  The nibble shifted out of
        byte 15 selects a 16-bit reduction value from gf_tab which is
        XORed into bytes 1 and 0.  Must run from byte 15 down to byte 0
        because each step reads the not-yet-updated lower-indexed byte. */
00263     gf_inline void mul_x4(void *x)
00264     {   uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff];
00265         ui8_ptr(x)[15] =  (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4);
00266         ui8_ptr(x)[14] =  (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4);
00267         ui8_ptr(x)[13] =  (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4);
00268         ui8_ptr(x)[12] =  (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4);
00269         ui8_ptr(x)[11] =  (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4);
00270         ui8_ptr(x)[10] =  (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4);
00271         ui8_ptr(x)[ 9] =  (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4);
00272         ui8_ptr(x)[ 8] =  (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4);
00273         ui8_ptr(x)[ 7] =  (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4);
00274         ui8_ptr(x)[ 6] =  (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4);
00275         ui8_ptr(x)[ 5] =  (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4);
00276         ui8_ptr(x)[ 4] =  (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4);
00277         ui8_ptr(x)[ 3] =  (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4);
00278         ui8_ptr(x)[ 2] =  (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4);
00279         ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt >> 8);
00280         ui8_ptr(x)[ 0] =  (ui8_ptr(x)[ 0] >> 4) ^ (_tt & 0xff);
00281     }
00282 
    /*  x *= P(x)^8 in place: shift the element one whole byte (memmove
        handles the overlap) and fold the reduction value for the old
        byte 15 into bytes 1 and 0. */
00283     gf_inline void mul_x8(void *x)
00284     {   uint_16t _tt = gf_tab[ui8_ptr(x)[15]];
00285         memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15);
00286         ui8_ptr(x)[1] ^= (_tt >> 8);
00287         ui8_ptr(x)[0] = (_tt & 0xff);
00288     }
00289 
00290 #endif
00291 
00292 #else   /* DEFINES */
00293 
00294 #if BFR_UNIT == 64
00295 
    /*  Macro forms of the 64-bit little endian inlines above — identical
        semantics, expanded inline at each use site.  See the USE_INLINES
        branch for the line-by-line explanation. */
00296     #define mul_x(r, x) do { uint_64t  _tt = gf_tab[(ui64_ptr(x)[1] >> 49) & MSK_80]; \
00297         ui64_ptr(r)[1] =  (ui64_ptr(x)[1] >> 1) & ~MSK_80                             \
00298             | ((ui64_ptr(x)[1] << 15) | (ui64_ptr(x)[0] >> 49)) & MSK_80;             \
00299         ui64_ptr(r)[0] = ((ui64_ptr(x)[0] >> 1) & ~MSK_80                             \
00300             |  (ui64_ptr(x)[0] << 15) & MSK_80) ^ _tt;                                \
00301     } while(0)
00302 
00303   #if defined( VERSION_1 )
00304 
    /*  x *= P(x)^4 in place (4-bit SWAR shift, MSK_F0 nibble mask). */
00305     #define mul_x4(x) do { uint_64t   _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & MSK_F0];  \
00306         ui64_ptr(x)[1] =  (ui64_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui64_ptr(x)[1] << 12)   \
00307             | (ui64_ptr(x)[0] >> 52)) & MSK_F0;                                       \
00308         ui64_ptr(x)[0] = ((ui64_ptr(x)[0] >> 4) & ~MSK_F0                             \
00309             |  (ui64_ptr(x)[0] << 12) & MSK_F0) ^ _tt;                                \
00310     } while(0)
00311 
00312   #else
00313 
    /*  Alternative mul_x4: byte swap to big endian, shift, swap back. */
00314     #define mul_x4(x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & 0xf0];        \
00315         bswap64_block(x, x, 2);                                                         \
00316         ui64_ptr(x)[1] = bswap_64((ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60));      \
00317         ui64_ptr(x)[0] = bswap_64((ui64_ptr(x)[0] >> 4)) ^ _tt;                         \
00318     } while(0)
00319 
00320   #endif
00321 
    /*  x *= P(x)^8 in place: 8-bit left shift across the two words. */
00322     #define mul_x8(x) do { uint_64t _tt = gf_tab[ui64_ptr(x)[1] >> 56];     \
00323         ui64_ptr(x)[1] = (ui64_ptr(x)[1] << 8) | (ui64_ptr(x)[0] >> 56);    \
00324         ui64_ptr(x)[0] = (ui64_ptr(x)[0] << 8) ^ _tt;                       \
00325     } while(0)
00326 
00327 #elif BFR_UNIT == 32
00328 
    /*  Macro forms of the 32-bit little endian inlines above — identical
        semantics, expanded inline at each use site. */
00329     #define mul_x(r, x) do { uint_32t  _tt = gf_tab[(ui32_ptr(x)[3] >> 17) & MSK_80]; \
00330         ui32_ptr(r)[3] =  (ui32_ptr(x)[3] >> 1) & ~MSK_80 | ((ui32_ptr(x)[3] << 15)   \
00331             | (ui32_ptr(x)[2] >> 17)) & MSK_80;                                       \
00332         ui32_ptr(r)[2] =  (ui32_ptr(x)[2] >> 1) & ~MSK_80 | ((ui32_ptr(x)[2] << 15)   \
00333             | (ui32_ptr(x)[1] >> 17)) & MSK_80;                                       \
00334         ui32_ptr(r)[1] =  (ui32_ptr(x)[1] >> 1) & ~MSK_80 | ((ui32_ptr(x)[1] << 15)   \
00335             | (ui32_ptr(x)[0] >> 17)) & MSK_80;                                       \
00336         ui32_ptr(r)[0] = ((ui32_ptr(x)[0] >> 1) & ~MSK_80                             \
00337             | (ui32_ptr(x)[0] << 15) & MSK_80) ^ _tt;                                 \
00338     } while(0)
00339 
00340   #if defined( VERSION_1 )
00341 
    /*  x *= P(x)^4 in place (4-bit SWAR shift, MSK_F0 nibble mask). */
00342     #define mul_x4(x) do { uint_32t   _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & MSK_F0];  \
00343         ui32_ptr(x)[3] =  (ui32_ptr(x)[3] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[3] << 12)   \
00344             | (ui32_ptr(x)[2] >> 20)) & MSK_F0;                                       \
00345         ui32_ptr(x)[2] =  (ui32_ptr(x)[2] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[2] << 12)   \
00346             | (ui32_ptr(x)[1] >> 20)) & MSK_F0;                                       \
00347         ui32_ptr(x)[1] =  (ui32_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[1] << 12)   \
00348             | (ui32_ptr(x)[0] >> 20)) & MSK_F0;                                       \
00349         ui32_ptr(x)[0] = ((ui32_ptr(x)[0] >> 4) & ~MSK_F0                             \
00350             |  (ui32_ptr(x)[0] << 12) & MSK_F0) ^ _tt;                                \
00351     } while(0)
00352 
00353   #else
00354 
    /*  Alternative mul_x4: byte swap to big endian, shift, swap back. */
00355     #define mul_x4(x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & 0xf0];    \
00356         bswap32_block(x, x, 4);                                                     \
00357         ui32_ptr(x)[3] = bswap_32((ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28));  \
00358         ui32_ptr(x)[2] = bswap_32((ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28));  \
00359         ui32_ptr(x)[1] = bswap_32((ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28));  \
00360         ui32_ptr(x)[0] = bswap_32((ui32_ptr(x)[0] >> 4)) ^ _tt;                     \
00361     } while(0)
00362 
00363   #endif
00364 
    /*  x *= P(x)^8 in place: 8-bit left shift across the four words. */
00365 #define mul_x8(x) do { uint_32t   _tt = gf_tab[ui32_ptr(x)[3] >> 24];       \
00366         ui32_ptr(x)[3] = (ui32_ptr(x)[3] << 8) | (ui32_ptr(x)[2] >> 24);    \
00367         ui32_ptr(x)[2] = (ui32_ptr(x)[2] << 8) | (ui32_ptr(x)[1] >> 24);    \
00368         ui32_ptr(x)[1] = (ui32_ptr(x)[1] << 8) | (ui32_ptr(x)[0] >> 24);    \
00369         ui32_ptr(x)[0] = (ui32_ptr(x)[0] << 8) ^ _tt;                       \
00370     } while(0)
00371 
00372 #else
00373 
    /*  Macro forms of the byte-oriented inlines above — identical
        semantics; 0xe1 is the GF(128) reduction constant, and the
        high-to-low update order is required for in-place operation. */
00374     #define mul_x(r, x) do { uint_8t _tt = ui8_ptr(x)[15] & 1;          \
00375         ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7); \
00376         ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7); \
00377         ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7); \
00378         ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7); \
00379         ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7); \
00380         ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7); \
00381         ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7); \
00382         ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7); \
00383         ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7); \
00384         ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7); \
00385         ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7); \
00386         ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7); \
00387         ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7); \
00388         ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7); \
00389         ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7); \
00390         ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00);   \
00391     } while(0)
00392 
    /*  x *= P(x)^4 in place, byte at a time; gf_tab supplies the 16-bit
        reduction value for the nibble shifted out of byte 15. */
00393     #define mul_x4(x) do { uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff];         \
00394         ui8_ptr(x)[15] =  (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4);                \
00395         ui8_ptr(x)[14] =  (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4);                \
00396         ui8_ptr(x)[13] =  (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4);                \
00397         ui8_ptr(x)[12] =  (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4);                \
00398         ui8_ptr(x)[11] =  (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4);                \
00399         ui8_ptr(x)[10] =  (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4);                \
00400         ui8_ptr(x)[ 9] =  (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4);                \
00401         ui8_ptr(x)[ 8] =  (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4);                \
00402         ui8_ptr(x)[ 7] =  (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4);                \
00403         ui8_ptr(x)[ 6] =  (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4);                \
00404         ui8_ptr(x)[ 5] =  (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4);                \
00405         ui8_ptr(x)[ 4] =  (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4);                \
00406         ui8_ptr(x)[ 3] =  (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4);                \
00407         ui8_ptr(x)[ 2] =  (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4);                \
00408         ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt >> 8);  \
00409         ui8_ptr(x)[ 0] =  (ui8_ptr(x)[ 0] >> 4) ^ (_tt & 0xff);                         \
00410     } while(0)
00411 
    /*  x *= P(x)^8 in place: one-byte shift via memmove plus reduction. */
00412     #define mul_x8(x) do { uint_16t _tt = gf_tab[ui8_ptr(x)[15]];   \
00413         memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15);                    \
00414         ui8_ptr(x)[1] ^= (_tt >> 8);                                \
00415         ui8_ptr(x)[0] = (_tt & 0xff);                               \
00416     } while(0)
00417 
00418 #endif 
00419 
00420 #endif
00421 
00422 #elif PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
00423 
00424 #if defined( USE_INLINES )
00425 
00426 #if BFR_UNIT == 64
00427 
    /*  Big endian machines: field bytes already sit in descending
        significance within each word, so the shifts need no per-byte
        masking.  r = x * P(x); the 16-bit reduction value from gf_tab
        is placed in the top two bytes of word 0 (hence << 48). */
00428     gf_inline void mul_x(void *r, const void *x)
00429     {   uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 7) & 0xff];
00430         ui64_ptr(r)[1] = (ui64_ptr(x)[1] >> 1) | (ui64_ptr(x)[0] << 63);
00431         ui64_ptr(r)[0] = (ui64_ptr(x)[0] >> 1) ^ (_tt << 48);
00432     }
00433 
    /*  x *= P(x)^4 in place (4-bit shift). */
00434     gf_inline void mul_x4(void *x)
00435     {   uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 4) & 0xff];
00436         ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60);
00437         ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 4) ^ (_tt << 48);
00438     }
00439 
    /*  x *= P(x)^8 in place (whole-byte shift). */
00440     gf_inline void mul_x8(void *x)
00441     {   uint_64t _tt = gf_tab[ui64_ptr(x)[1] & 0xff];
00442         ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 8) | (ui64_ptr(x)[0] << 56);
00443         ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 8) ^ (_tt << 48);
00444     }
00445 
00446 #elif BFR_UNIT == 32
00447 
    /*  Big endian, 32-bit words: plain multi-word right shifts; the
        16-bit reduction value lands in the top half of word 0. */
00448     gf_inline void mul_x(void *r, const void *x)
00449     {   uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 7) & 0xff];
00450         ui32_ptr(r)[3] = (ui32_ptr(x)[3] >> 1) | (ui32_ptr(x)[2] << 31);
00451         ui32_ptr(r)[2] = (ui32_ptr(x)[2] >> 1) | (ui32_ptr(x)[1] << 31);
00452         ui32_ptr(r)[1] = (ui32_ptr(x)[1] >> 1) | (ui32_ptr(x)[0] << 31);
00453         ui32_ptr(r)[0] = (ui32_ptr(x)[0] >> 1) ^ (_tt << 16);
00454     }
00455 
    /*  x *= P(x)^4 in place (4-bit shift). */
00456     gf_inline void mul_x4(void *x)
00457     {   uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 4) & 0xff];
00458         ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28);
00459         ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28);
00460         ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28);
00461         ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 4) ^ (_tt << 16);
00462     }
00463 
    /*  x *= P(x)^8 in place (whole-byte shift). */
00464     gf_inline void mul_x8(void *x)
00465     {   uint_32t _tt = gf_tab[ui32_ptr(x)[3] & 0xff];
00466         ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 8) | (ui32_ptr(x)[2] << 24);
00467         ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 8) | (ui32_ptr(x)[1] << 24);
00468         ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 8) | (ui32_ptr(x)[0] << 24);
00469         ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 8) ^ (_tt << 16);
00470     }
00471 
00472 #else
00473 
    /*  Big endian byte-oriented fallback: identical body to the little
        endian byte version for mul_x (byte order is the field order);
        mul_x4/mul_x8 differ only in which half of the gf_tab value goes
        into which byte. */
00474     gf_inline void mul_x(void *r, const void *x)
00475     {   uint_8t _tt = ui8_ptr(x)[15] & 1;
00476         ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7);
00477         ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7);
00478         ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7);
00479         ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7);
00480         ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7);
00481         ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7);
00482         ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7);
00483         ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7);
00484         ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7);
00485         ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7);
00486         ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7);
00487         ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7);
00488         ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7);
00489         ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7);
00490         ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7);
00491         ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00);
00492     }
00493 
    /*  x *= P(x)^4 in place, byte at a time; high-to-low update order is
        required because the element is updated in place. */
00494     gf_inline void mul_x4(void *x)
00495     {
00496         uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff];
00497         ui8_ptr(x)[15] =  (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4);
00498         ui8_ptr(x)[14] =  (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4);
00499         ui8_ptr(x)[13] =  (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4);
00500         ui8_ptr(x)[12] =  (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4);
00501         ui8_ptr(x)[11] =  (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4);
00502         ui8_ptr(x)[10] =  (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4);
00503         ui8_ptr(x)[ 9] =  (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4);
00504         ui8_ptr(x)[ 8] =  (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4);
00505         ui8_ptr(x)[ 7] =  (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4);
00506         ui8_ptr(x)[ 6] =  (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4);
00507         ui8_ptr(x)[ 5] =  (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4);
00508         ui8_ptr(x)[ 4] =  (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4);
00509         ui8_ptr(x)[ 3] =  (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4);
00510         ui8_ptr(x)[ 2] =  (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4);
00511         ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt & 0xff);
00512         ui8_ptr(x)[ 0] =  (ui8_ptr(x)[ 0] >> 4) ^ (_tt >> 8);
00513     }
00514 
    /*  x *= P(x)^8 in place: one-byte shift via memmove plus reduction;
        note the opposite gf_tab half assignment vs. the LE version. */
00515     gf_inline void mul_x8(void *x)
00516     {   uint_16t _tt = gf_tab[ui8_ptr(x)[15]];
00517         memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15);
00518         ui8_ptr(x)[1] ^= (_tt & 0xff);
00519         ui8_ptr(x)[0] = (_tt >> 8);
00520     }
00521 
00522 #endif
00523 
00524 #else   /* DEFINES */
00525 
00526 #if BFR_UNIT == 64
00527 
    /*  Macro forms of the big endian 64-bit inlines above — identical
        semantics, expanded inline at each use site. */
00528     #define mul_x(r, x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 7) & 0xff];   \
00529         ui64_ptr(r)[1] = (ui64_ptr(x)[1] >> 1) | (ui64_ptr(x)[0] << 63);            \
00530         ui64_ptr(r)[0] = (ui64_ptr(x)[0] >> 1) ^ (_tt << 48);                       \
00531     } while(0)
00532 
    /*  x *= P(x)^4 in place (4-bit shift). */
00533     #define mul_x4(x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 4) & 0xff]; \
00534         ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60);        \
00535         ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 4) ^ (_tt << 48);                   \
00536     } while(0)
00537 
    /*  x *= P(x)^8 in place (whole-byte shift). */
00538     #define mul_x8(x) do { uint_64t _tt = gf_tab[ui64_ptr(x)[1] & 0xff];    \
00539         ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 8) | (ui64_ptr(x)[0] << 56);    \
00540         ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 8) ^ (_tt << 48);               \
00541     } while(0)
00542 
00543 #elif BFR_UNIT == 32
00544 
    /*  Macro forms of the big endian 32-bit inlines above — identical
        semantics, expanded inline at each use site. */
00545     #define mul_x(r, x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 7) & 0xff];   \
00546         ui32_ptr(r)[3] = (ui32_ptr(x)[3] >> 1) | (ui32_ptr(x)[2] << 31);            \
00547         ui32_ptr(r)[2] = (ui32_ptr(x)[2] >> 1) | (ui32_ptr(x)[1] << 31);            \
00548         ui32_ptr(r)[1] = (ui32_ptr(x)[1] >> 1) | (ui32_ptr(x)[0] << 31);            \
00549         ui32_ptr(r)[0] = (ui32_ptr(x)[0] >> 1) ^ (_tt << 16);                       \
00550     } while(0)
00551 
    /*  x *= P(x)^4 in place (4-bit shift). */
00552     #define mul_x4(x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 4) & 0xff]; \
00553         ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28);        \
00554         ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28);        \
00555         ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28);        \
00556         ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 4) ^ (_tt << 16);                   \
00557     } while(0)
00558 
    /*  x *= P(x)^8 in place (whole-byte shift). */
00559     #define mul_x8(x) do { uint_32t _tt = gf_tab[ui32_ptr(x)[3] & 0xff];    \
00560         ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 8) | (ui32_ptr(x)[2] >> 24);    \
00561         ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 8) | (ui32_ptr(x)[1] << 24);    \
00562         ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 8) | (ui32_ptr(x)[0] << 24);    \
00563         ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 8) ^ (_tt << 16);               \
00564     } while(0)
00565 
00566 #else
00567 
    /*  Macro forms of the big endian byte-oriented inlines above —
        identical semantics, expanded inline at each use site. */
00568     #define mul_x(r, x) do { uint_8t _tt = ui8_ptr(x)[15] & 1;          \
00569         ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7); \
00570         ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7); \
00571         ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7); \
00572         ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7); \
00573         ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7); \
00574         ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7); \
00575         ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7); \
00576         ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7); \
00577         ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7); \
00578         ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7); \
00579         ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7); \
00580         ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7); \
00581         ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7); \
00582         ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7); \
00583         ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7); \
00584         ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00);   \
00585     } while(0)
00586 
    /*  x *= P(x)^4 in place, byte at a time (high-to-low order required
        for in-place operation). */
00587     #define mul_x4(x) do { uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff]; \
00588         ui8_ptr(x)[15] =  (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4);        \
00589         ui8_ptr(x)[14] =  (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4);        \
00590         ui8_ptr(x)[13] =  (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4);        \
00591         ui8_ptr(x)[12] =  (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4);        \
00592         ui8_ptr(x)[11] =  (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4);        \
00593         ui8_ptr(x)[10] =  (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4);        \
00594         ui8_ptr(x)[ 9] =  (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4);        \
00595         ui8_ptr(x)[ 8] =  (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4);        \
00596         ui8_ptr(x)[ 7] =  (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4);        \
00597         ui8_ptr(x)[ 6] =  (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4);        \
00598         ui8_ptr(x)[ 5] =  (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4);        \
00599         ui8_ptr(x)[ 4] =  (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4);        \
00600         ui8_ptr(x)[ 3] =  (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4);        \
00601         ui8_ptr(x)[ 2] =  (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4);        \
00602         ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt & 0xff);    \
00603         ui8_ptr(x)[ 0] =  (ui8_ptr(x)[ 0] >> 4) ^ (_tt >> 8);                   \
00604     } while(0)
00605 
    /*  x *= P(x)^8 in place: one-byte shift via memmove plus reduction. */
00606     #define mul_x8(x) do { uint_16t _tt = gf_tab[ui8_ptr(x)[15]];   \
00607         memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15);                    \
00608         ui8_ptr(x)[1] ^= (_tt & 0xff);                              \
00609         ui8_ptr(x)[0] = (_tt >> 8);                                 \
00610     } while(0)
00611 
00612 #endif
00613 
00614 #endif
00615 
00616 #else
00617 #  error Platform byte order has not been set. 
00618 #endif
00619 
00620 /*  A slow generic version of gf_mul (a = a * b): multiplies the 16 byte
      value at a by the 16 byte value at b in GF(128), writing the product
      back into a.  Needs no table space; presumably implemented bit by
      bit in the companion .c file -- slower than the table versions
      below, but with no per-key setup cost. */
00621 
00622 void gf_mul(void *a, const void* b);
00623 
00624 /*  This version uses 64k bytes of table space on the stack.
00625     A 16 byte buffer has to be multiplied by a 16 byte key
00626     value in GF(128).  If we consider a GF(128) value in
00627     the buffer's lowest byte, we can construct a table of
00628     the 256 16 byte values that result from the 256 values
00629     of this byte.  This requires 4096 bytes. But we also
00630     need tables for each of the 16 higher bytes in the
00631     buffer as well, which makes 64 kbytes in total.
00632 */
00633 
/*  64k table version: init_64k_table builds, from key g, 16 tables of
    256 entries at t, each entry a 16 byte block (stored as uint_32t
    words).  tab64k(t)[i][b] is the product contribution of byte value
    b at buffer position i, so a full multiply is the XOR of 16 rows. */
00634 void init_64k_table(unsigned char g[], void *t);
00635 typedef uint_32t            (*gf_t64k)[256][GF_BYTE_LEN >> 2];
00636 #define tab64k(x)           ((gf_t64k)x)
/*  xor_64k: XOR the table row for byte i of buffer a into accumulator r
    (single statement, so no do-while wrapper is needed). */
00637 #define xor_64k(i,a,t,r)    xor_block_aligned(r, tab64k(t)[i][a[i]])
00638 
00639 #if defined( USE_INLINES )
00640 
00641 #if defined( UNROLL_LOOPS )
00642 
/*  gf_mul_64k(a, t, r): multiply the 16 byte buffer a by the key whose
    64k table is at t, leaving the product in a; r is 16 byte aligned
    scratch space.  Fully unrolled variant: byte 0 seeds r, bytes 1..15
    are XORed in. */
00643 gf_inline void gf_mul_64k(unsigned char a[], void *t, void *r)
00644 {
00645     move_block_aligned(r, tab64k(t)[0][a[0]]); xor_64k( 1, a, t, r);
00646     xor_64k( 2, a, t, r); xor_64k( 3, a, t, r);
00647     xor_64k( 4, a, t, r); xor_64k( 5, a, t, r);
00648     xor_64k( 6, a, t, r); xor_64k( 7, a, t, r);
00649     xor_64k( 8, a, t, r); xor_64k( 9, a, t, r);
00650     xor_64k(10, a, t, r); xor_64k(11, a, t, r);
00651     xor_64k(12, a, t, r); xor_64k(13, a, t, r);
00652     xor_64k(14, a, t, r); xor_64k(15, a, t, r);
00653     move_block_aligned(a, r);
00654 }
00655 
00656 #else
00657 
/*  Loop variant of gf_mul_64k (same contract as above). */
00658 gf_inline void gf_mul_64k(unsigned char a[], void *t, void *r)
00659 {   int i;
00660     move_block_aligned(r, tab64k(t)[0][a[0]]);
00661     for(i = 1; i < GF_BYTE_LEN; ++i)
00662         xor_64k(i, a, t, r);
00663     move_block_aligned(a, r);
00664 }
00665 
00666 #endif
00667 
00668 #else
00669 
/*  FIX(review): this previously tested !defined( UNROLL_LOOPS ), which
    inverted the selection -- the unrolled macro was chosen when loop
    unrolling was NOT requested.  Both bodies compute the same result,
    but every other table size (8k, 4k, 256) tests
    defined( UNROLL_LOOPS ) for the unrolled body, so do the same. */
00670 #if defined( UNROLL_LOOPS )
00671 
/*  Macro forms of gf_mul_64k for builds without inline support. */
00672 #define gf_mul_64k(a, t, r) do {                \
00673     move_block_aligned(r, tab64k(t)[0][a[0]]);  \
00674     xor_64k( 1, a, t, r);                       \
00675     xor_64k( 2, a, t, r); xor_64k( 3, a, t, r); \
00676     xor_64k( 4, a, t, r); xor_64k( 5, a, t, r); \
00677     xor_64k( 6, a, t, r); xor_64k( 7, a, t, r); \
00678     xor_64k( 8, a, t, r); xor_64k( 9, a, t, r); \
00679     xor_64k(10, a, t, r); xor_64k(11, a, t, r); \
00680     xor_64k(12, a, t, r); xor_64k(13, a, t, r); \
00681     xor_64k(14, a, t, r); xor_64k(15, a, t, r); \
00682     move_block_aligned(a, r);                   \
00683 } while(0)
00684 
00685 #else
00686 
00687 #define gf_mul_64k(a, t, r) do { int i;         \
00688     move_block_aligned(r, tab64k(t)[0][a[0]]);  \
00689     for(i = 1; i < GF_BYTE_LEN; ++i)            \
00690     {   xor_64k(i, a, t, r);                    \
00691     }                                           \
00692     move_block_aligned(a, r);                   \
00693 } while(0)
00694 
00695 #endif
00696 
00697 #endif
00698 
00699 /*  This version uses 8k bytes of table space on the stack.
00700     A 16 byte buffer has to be multiplied by a 16 byte key
00701     value in GF(128).  If we consider a GF(128) value in
00702     the buffer's lowest 4-bits, we can construct a table of
00703     the 16 16 byte values that result from the 16 values
00704     of these 4 bits. This requires 256 bytes. But we also
00705     need tables for each of the 32 higher 4 bit groups,
00706     which makes 8 kbytes in total.
00707 */
00708 
00709 void init_8k_table(unsigned char g[], void *t);
00710 
00711 typedef uint_32t    (*gf_t8k)[16][GF_BYTE_LEN >> 2];
00712 #define tab8k(x)    ((gf_t8k)x)
00713 #define xor_8k(i,a,t,r)   \
00714     xor_block_aligned(r, tab8k(t)[i + i][a[i] & 15]); \
00715     xor_block_aligned(r, tab8k(t)[i + i + 1][a[i] >> 4])
00716 
00717 #if defined( USE_INLINES )
00718 
00719 #if defined( UNROLL_LOOPS )
00720 
/*  gf_mul_8k(a, t, r): multiply the 16 byte buffer a by the key whose
    8k table is at t, leaving the product in a; r is 16 byte aligned
    scratch space.  Byte 0's two nibble rows seed r, then bytes 1..15
    are XORed in.  Fully unrolled variant. */
00721 gf_inline void gf_mul_8k(unsigned char a[], void *t, void *r)
00722 {
00723     move_block_aligned(r, tab8k(t)[0][a[0] & 15]);
00724     xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]);
00725                 xor_8k( 1, a, t, r); xor_8k( 2, a, t, r); xor_8k( 3, a, t, r);
00726     xor_8k( 4, a, t, r); xor_8k( 5, a, t, r); xor_8k( 6, a, t, r); xor_8k( 7, a, t, r);
00727     xor_8k( 8, a, t, r); xor_8k( 9, a, t, r); xor_8k(10, a, t, r); xor_8k(11, a, t, r);
00728     xor_8k(12, a, t, r); xor_8k(13, a, t, r); xor_8k(14, a, t, r); xor_8k(15, a, t, r);
00729     move_block_aligned(a, r);
00730 }
00731 
00732 #else
00733 
/*  Loop variant of gf_mul_8k (same contract as above).
    NOTE(review): uses memcpy where the unrolled variant uses
    move_block_aligned -- presumably equivalent for aligned blocks;
    confirm against mode_hdr.h. */
00734 gf_inline void gf_mul_8k(unsigned char a[], void *t, void *r)
00735 {   int i;
00736     memcpy(r, tab8k(t)[0][a[0] & 15], GF_BYTE_LEN);
00737     xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]);
00738     for(i = 1; i < GF_BYTE_LEN; ++i)
00739     {   xor_8k(i, a, t, r);
00740     }
00741     memcpy(a, r, GF_BYTE_LEN);
00742 }
00743 
00744 #endif
00745 
00746 #else
00747 
00748 #if defined( UNROLL_LOOPS )
00749 
/*  Macro forms of gf_mul_8k for builds without inline support
    (same contract as the inline versions above). */
00750 #define gf_mul_8k(a, t, r) do {                     \
00751     move_block_aligned(r, tab8k(t)[0][a[0] & 15]);  \
00752     xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]);   \
00753     xor_8k( 1, a, t, r); xor_8k( 2, a, t, r);       \
00754     xor_8k( 3, a, t, r); xor_8k( 4, a, t, r);       \
00755     xor_8k( 5, a, t, r); xor_8k( 6, a, t, r);       \
00756     xor_8k( 7, a, t, r); xor_8k( 8, a, t, r);       \
00757     xor_8k( 9, a, t, r); xor_8k(10, a, t, r);       \
00758     xor_8k(11, a, t, r); xor_8k(12, a, t, r);       \
00759     xor_8k(13, a, t, r); xor_8k(14, a, t, r);       \
00760     xor_8k(15, a, t, r); move_block_aligned(a, r);  \
00761 } while(0)
00762 
00763 #else
00764 
00765 #define gf_mul_8k(a, t, r) do { int i;              \
00766     memcpy(r, tab8k(t)[0][a[0] & 15], GF_BYTE_LEN); \
00767     xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]);   \
00768     for(i = 1; i < GF_BYTE_LEN; ++i)                \
00769     {   xor_8k(i, a, t, r);                         \
00770     }                                               \
00771     memcpy(a, r, GF_BYTE_LEN);                      \
00772 } while(0)
00773 
00774 #endif
00775 
00776 #endif
00777 
00778 /*  This version uses 4k bytes of table space on the stack.
00779     A 16 byte buffer has to be multiplied by a 16 byte key
00780     value in GF(128).  If we consider a GF(128) value in a
00781     single byte, we can construct a table of the 256 16 byte
00782     values that result from the 256 values of this byte.
00783     This requires 4096 bytes. If we take the highest byte in
00784     the buffer and use this table to get the result, we then
00785     have to multiply by x^120 to get the final value. For the
00786     next highest byte the result has to be multiplied by x^112
00787     and so on. But we can do this by accumulating the result
00788     in an accumulator starting with the result for the top
00789     byte.  We repeatedly multiply the accumulator value by
00790     x^8 and then add in (i.e. xor) the 16 bytes of the next
00791     lower byte in the buffer, stopping when we reach the
00792     lowest byte. This requires a 4096 byte table.
00793 */
00794 
00795 void init_4k_table(unsigned char g[], void *t);
00796 
00797 typedef uint_32t        (*gf_t4k)[GF_BYTE_LEN >> 2];
00798 #define tab4k(x)        ((gf_t4k)x)
00799 #define xor_4k(i,a,t,r) mul_x8(r); xor_block_aligned(r, tab4k(t)[a[i]])
00800 
00801 #if defined( USE_INLINES )
00802 
00803 #if defined( UNROLL_LOOPS )
00804 
/*  gf_mul_4k(a, t, r): multiply the 16 byte buffer a by the key whose
    4k table is at t, leaving the product in a; r is 16 byte aligned
    scratch space.  Accumulates from the top byte down: each xor_4k
    step multiplies r by x^8 and XORs in the row for the next lower
    byte.  Fully unrolled variant. */
00805 gf_inline void gf_mul_4k(unsigned char a[], void *t, void *r)
00806 {
00807     move_block_aligned(r,tab4k(t)[a[15]]);
00808     xor_4k(14, a, t, r); xor_4k(13, a, t, r); xor_4k(12, a, t, r);
00809     xor_4k(11, a, t, r); xor_4k(10, a, t, r); xor_4k( 9, a, t, r);
00810     xor_4k( 8, a, t, r); xor_4k( 7, a, t, r); xor_4k( 6, a, t, r);
00811     xor_4k( 5, a, t, r); xor_4k( 4, a, t, r); xor_4k( 3, a, t, r);
00812     xor_4k( 2, a, t, r); xor_4k( 1, a, t, r); xor_4k( 0, a, t, r);
00813     move_block_aligned(a, r);
00814 }
00815 
00816 #else
00817 
/*  Loop variant of gf_mul_4k (same contract as above); while(i--)
    visits indices 14 down to 0. */
00818 gf_inline void gf_mul_4k(unsigned char a[], void *t, void *r)
00819 {   int i = 15;
00820     move_block_aligned(r,tab4k(t)[a[15]]);
00821     while(i--)
00822     {
00823         xor_4k(i, a, t, r);
00824     }
00825     move_block_aligned(a, r);
00826 }
00827 
00828 #endif
00829 
00830 #else
00831 
00832 #if defined( UNROLL_LOOPS )
00833 
/*  Macro forms of gf_mul_4k for builds without inline support
    (same contract as the inline versions above). */
00834 #define gf_mul_4k(a, t, r) do {                                     \
00835     move_block_aligned(r,tab4k(t)[a[15]]);                          \
00836     xor_4k(14, a, t, r); xor_4k(13, a, t, r); xor_4k(12, a, t, r);  \
00837     xor_4k(11, a, t, r); xor_4k(10, a, t, r); xor_4k( 9, a, t, r);  \
00838     xor_4k( 8, a, t, r); xor_4k( 7, a, t, r); xor_4k( 6, a, t, r);  \
00839     xor_4k( 5, a, t, r); xor_4k( 4, a, t, r); xor_4k( 3, a, t, r);  \
00840     xor_4k( 2, a, t, r); xor_4k( 1, a, t, r); xor_4k( 0, a, t, r);  \
00841     move_block_aligned(a, r);                                       \
00842 } while(0)
00843 
00844 #else
00845 
00846 #define gf_mul_4k(a, t, r) do { int i = 15; \
00847     move_block_aligned(r,tab4k(t)[a[15]]);  \
00848     while(i--)                              \
00849     {   xor_4k(i, a, t, r);                 \
00850     }                                       \
00851     move_block_aligned(a, r);               \
00852 } while(0)
00853 
00854 #endif
00855 
00856 #endif
00857 
00858 /*  This version uses 256 bytes of table space on the stack.
00859     A 16 byte buffer has to be multiplied by a 16 byte key
00860     value in GF(128).  If we consider a GF(128) value in a
00861     single 4-bit nibble, we can construct a table of the 16
00862     16 byte  values that result from the 16 values of this
00863     byte.  This requires 256 bytes. If we take the highest
00864     4-bit nibble in the buffer and use this table to get the
00865     result, we then have to multiply by x^124 to get the
00866     final value. For the next highest byte the result has to
00867     be multiplied by x^120 and so on. But we can do this by
00868     accumulating the result in an accumulator starting with
00869     the result for the top nibble.  We repeatedly multiply
00870     the accumulator value by x^4 and then add in (i.e. xor)
00871     the 16 bytes of the next lower nibble in the buffer,
00872     stopping when we reach the lowest nibble. This uses
00873     a 256 byte table.
00874 */
00875 
00876 void init_256_table(unsigned char g[], void *t);
00877 
00878 typedef uint_32t    (*gf_t256)[GF_BYTE_LEN >> 2];
00879 #define tab256(t)   ((gf_t256)t)
00880 #define xor_256(i,a,t,r)    \
00881     mul_x4(r); xor_block_aligned(r, tab256(t)[a[i] & 15]);  \
00882     mul_x4(r); xor_block_aligned(r, tab256(t)[a[i] >> 4])
00883 
00884 #if defined( USE_INLINES )
00885 
00886 #if defined( UNROLL_LOOPS )
00887 
/*  gf_mul_256(a, t, r): multiply the 16 byte buffer a by the key whose
    256 byte table is at t, leaving the product in a; r is 16 byte
    aligned scratch space.  Accumulates nibble by nibble from the top
    of the buffer, multiplying r by x^4 before each table XOR (see
    xor_256).  Fully unrolled variant. */
00888 gf_inline void gf_mul_256(unsigned char a[], void *t, void *r)
00889 {
00890     move_block_aligned(r,tab256(t)[a[15] & 15]); mul_x4(r);
00891     xor_block_aligned(r, tab256(t)[a[15] >> 4]);
00892     xor_256(14, a, t, r); xor_256(13, a, t, r);
00893     xor_256(12, a, t, r); xor_256(11, a, t, r);
00894     xor_256(10, a, t, r); xor_256( 9, a, t, r);
00895     xor_256( 8, a, t, r); xor_256( 7, a, t, r);
00896     xor_256( 6, a, t, r); xor_256( 5, a, t, r);
00897     xor_256( 4, a, t, r); xor_256( 3, a, t, r);
00898     xor_256( 2, a, t, r); xor_256( 1, a, t, r);
00899     xor_256( 0, a, t, r); move_block_aligned(a, r);
00900 }
00901 
00902 #else
00903 
/*  Loop variant of gf_mul_256 (same contract as above); while(i--)
    visits indices 14 down to 0. */
00904 gf_inline void gf_mul_256(unsigned char a[], void *t, void *r)
00905 {   int i = 15;
00906     move_block_aligned(r,tab256(t)[a[15] & 15]); mul_x4(r);
00907     xor_block_aligned(r, tab256(t)[a[15] >> 4]);
00908     while(i--)
00909     {   xor_256(i, a, t, r);
00910     }
00911     move_block_aligned(a, r);
00912 }
00913 
00914 #endif
00915 
00916 #else
00917 
00918 #if defined( UNROLL_LOOPS )
00919 
/*  Macro forms of gf_mul_256 for builds without inline support
    (same contract as the inline versions above). */
00920 #define gf_mul_256(a, t, r) do {                            \
00921     move_block_aligned(r,tab256(t)[a[15] & 15]); mul_x4(r); \
00922     xor_block_aligned(r, tab256(t)[a[15] >> 4]);            \
00923     xor_256(14, a, t, r); xor_256(13, a, t, r);             \
00924     xor_256(12, a, t, r); xor_256(11, a, t, r);             \
00925     xor_256(10, a, t, r); xor_256( 9, a, t, r);             \
00926     xor_256( 8, a, t, r); xor_256( 7, a, t, r);             \
00927     xor_256( 6, a, t, r); xor_256( 5, a, t, r);             \
00928     xor_256( 4, a, t, r); xor_256( 3, a, t, r);             \
00929     xor_256( 2, a, t, r); xor_256( 1, a, t, r);             \
00930     xor_256( 0, a, t, r); move_block_aligned(a, r);         \
00931 } while(0)
00932 
00933 #else
00934 
00935 #define gf_mul_256(a, t, r) do { int i = 15;                \
00936     move_block_aligned(r,tab256(t)[a[15] & 15]); mul_x4(r); \
00937     xor_block_aligned(r, tab256(t)[a[15] >> 4]);            \
00938     while(i--)                                              \
00939     {   xor_256(i, a, t, r);                                \
00940     }                                                       \
00941     move_block_aligned(a, r);                               \
00942 } while(0)
00943 
00944 #endif
00945 
00946 #endif
00947 
00948 #if defined(__cplusplus)
00949 }
00950 #endif
00951 
00952 #endif