Twofish-256-GCM.c (18158B)
1 /* 2 * Twofish-256-GCM Implementation 3 * 4 * Based on the Twofish specification by Bruce Schneier et al. 5 * Reference: https://www.schneier.com/academic/twofish/ 6 */ 7 8 #include "Twofish-256-GCM.h" 9 #include <stdlib.h> 10 #include <string.h> 11 12 /* Twofish MDS matrix and RS matrix constants */ 13 static const uint8_t Q0[256] = { 14 0xA9, 0x67, 0xB3, 0xE8, 0x04, 0xFD, 0xA3, 0x76, 0x9A, 0x92, 0x80, 0x78, 0xE4, 0xDD, 0xD1, 0x38, 15 0x0D, 0xC6, 0x35, 0x98, 0x18, 0xF7, 0xEC, 0x6C, 0x43, 0x75, 0x37, 0x26, 0xFA, 0x13, 0x94, 0x48, 16 0xF2, 0xD0, 0x8B, 0x30, 0x84, 0x54, 0xDF, 0x23, 0x19, 0x5B, 0x3D, 0x59, 0xF3, 0xAE, 0xA2, 0x82, 17 0x63, 0x01, 0x83, 0x2E, 0xD9, 0x51, 0x9B, 0x7C, 0xA6, 0xEB, 0xA5, 0xBE, 0x16, 0x0C, 0xE3, 0x61, 18 0xC0, 0x8C, 0x3A, 0xF5, 0x73, 0x2C, 0x25, 0x0B, 0xBB, 0x4E, 0x89, 0x6B, 0x53, 0x6A, 0xB4, 0xF1, 19 0xE1, 0xE6, 0xBD, 0x45, 0xE2, 0xF4, 0xB6, 0x66, 0xCC, 0x95, 0x03, 0x56, 0xD4, 0x1C, 0x1E, 0xD7, 20 0xFB, 0xC3, 0x8E, 0xB5, 0xE9, 0xCF, 0xBF, 0xBA, 0xEA, 0x77, 0x39, 0xAF, 0x33, 0xC9, 0x62, 0x71, 21 0x81, 0x79, 0x09, 0xAD, 0x24, 0xCD, 0xF9, 0xD8, 0xE5, 0xC5, 0xB9, 0x4D, 0x44, 0x08, 0x86, 0xE7, 22 0xA1, 0x1D, 0xAA, 0xED, 0x06, 0x70, 0xB2, 0xD2, 0x41, 0x7B, 0xA0, 0x11, 0x31, 0xC2, 0x27, 0x90, 23 0x20, 0xF6, 0x60, 0xFF, 0x96, 0x5C, 0xB1, 0xAB, 0x9E, 0x9C, 0x52, 0x1B, 0x5F, 0x93, 0x0A, 0xEF, 24 0x91, 0x85, 0x49, 0xEE, 0x2D, 0x4F, 0x8F, 0x3B, 0x47, 0x87, 0x6D, 0x46, 0xD6, 0x3E, 0x69, 0x64, 25 0x2A, 0xCE, 0xCB, 0x2F, 0xFC, 0x97, 0x05, 0x7A, 0xAC, 0x7F, 0xD5, 0x1A, 0x4B, 0x0E, 0xA7, 0x5A, 26 0x28, 0x14, 0x3F, 0x29, 0x88, 0x3C, 0x4C, 0x02, 0xB8, 0xDA, 0xB0, 0x17, 0x55, 0x1F, 0x8A, 0x7D, 27 0x57, 0xC7, 0x8D, 0x74, 0xB7, 0xC4, 0x9F, 0x72, 0x7E, 0x15, 0x22, 0x12, 0x58, 0x07, 0x99, 0x34, 28 0x6E, 0x50, 0xDE, 0x68, 0x65, 0xBC, 0xDB, 0xF8, 0xC8, 0xA8, 0x2B, 0x40, 0xDC, 0xFE, 0x32, 0xA4, 29 0xCA, 0x10, 0x21, 0xF0, 0xD3, 0x5D, 0x0F, 0x00, 0x6F, 0x9D, 0x36, 0x42, 0x4A, 0x5E, 0xC1, 0xE0 30 }; 31 32 static const uint8_t Q1[256] = { 33 0x75, 0xF3, 0xC6, 0xF4, 0xDB, 0x7B, 0xFB, 0xC8, 0x4A, 0xD3, 0xE6, 0x6B, 0x45, 0x7D, 0xE8, 0x4B, 34 0xD6, 0x32, 0xD8, 0xFD, 0x37, 0x71, 0xF1, 0xE1, 0x30, 0x0F, 0xF8, 0x1B, 0x87, 0xFA, 0x06, 0x3F, 35 0x5E, 0xBA, 0xAE, 0x5B, 0x8A, 0x00, 0xBC, 0x9D, 0x6D, 0xC1, 0xB1, 0x0E, 0x80, 0x5D, 0xD2, 0xD5, 36 0xA0, 0x84, 0x07, 0x14, 0xB5, 0x90, 0x2C, 0xA3, 0xB2, 0x73, 0x4C, 0x54, 0x92, 0x74, 0x36, 0x51, 37 0x38, 0xB0, 0xBD, 0x5A, 0xFC, 0x60, 0x62, 0x96, 0x6C, 0x42, 0xF7, 0x10, 0x7C, 0x28, 0x27, 0x8C, 38 0x13, 0x95, 0x9C, 0xC7, 0x24, 0x46, 0x3B, 0x70, 0xCA, 0xE3, 0x85, 0xCB, 0x11, 0xD0, 0x93, 0xB8, 39 0xA6, 0x83, 0x20, 0xFF, 0x9F, 0x77, 0xC3, 0xCC, 0x03, 0x6F, 0x08, 0xBF, 0x40, 0xE7, 0x2B, 0xE2, 40 0x79, 0x0C, 0xAA, 0x82, 0x41, 0x3A, 0xEA, 0xB9, 0xE4, 0x9A, 0xA4, 0x97, 0x7E, 0xDA, 0x7A, 0x17, 41 0x66, 0x94, 0xA1, 0x1D, 0x3D, 0xF0, 0xDE, 0xB3, 0x0B, 0x72, 0xA7, 0x1C, 0xEF, 0xD1, 0x53, 0x3E, 42 0x8F, 0x33, 0x26, 0x5F, 0xEC, 0x76, 0x2A, 0x49, 0x81, 0x88, 0xEE, 0x21, 0xC4, 0x1A, 0xEB, 0xD9, 43 0xC5, 0x39, 0x99, 0xCD, 0xAD, 0x31, 0x8B, 0x01, 0x18, 0x23, 0xDD, 0x1F, 0x4E, 0x2D, 0xF9, 0x48, 44 0x4F, 0xF2, 0x65, 0x8E, 0x78, 0x5C, 0x58, 0x19, 0x8D, 0xE5, 0x98, 0x57, 0x67, 0x7F, 0x05, 0x64, 45 0xAF, 0x63, 0xB6, 0xFE, 0xF5, 0xB7, 0x3C, 0xA5, 0xCE, 0xE9, 0x68, 0x44, 0xE0, 0x4D, 0x43, 0x69, 46 0x29, 0x2E, 0xAC, 0x15, 0x59, 0xA8, 0x0A, 0x9E, 0x6E, 0x47, 0xDF, 0x34, 0x35, 0x6A, 0xCF, 0xDC, 47 0x22, 0xC9, 0xC0, 0x9B, 0x89, 0xD4, 0xED, 0xAB, 0x12, 0xA2, 0x0D, 0x52, 0xBB, 0x02, 0x2F, 0xA9, 48 0xD7, 0x61, 0x1E, 0xB4, 0x50, 0x04, 0xF6, 0xC2, 0x16, 0x25, 0x86, 0x56, 0x55, 0x09, 0xBE, 0x91 49 }; 50 51 /* MDS multiplication in GF(2^8) */ 52 static inline uint8_t gf_mult(uint8_t a, uint8_t b, uint8_t poly) { 53 uint8_t result = 0; 54 for (int i = 0; i < 8; i++) { 55 if (b & 1) result ^= a; 56 uint8_t hi_bit = a & 0x80; 57 a <<= 1; 58 if (hi_bit) a ^= poly; 59 b >>= 1; 60 } 61 return result; 62 } 63 64 /* MDS matrix multiply */ 65 static inline uint32_t mds_multiply(uint32_t x) { 66 uint8_t b0 = x & 0xFF; 67 uint8_t b1 = (x >> 8) & 0xFF; 68 uint8_t b2 = (x >> 16) & 0xFF; 69 uint8_t b3 = (x >> 24) & 0xFF; 70 71 /* MDS matrix coefficients */ 72 uint8_t y0 = gf_mult(b0, 0x01, 0x69) ^ gf_mult(b1, 0xEF, 0x69) ^ 73 gf_mult(b2, 0x5B, 0x69) ^ gf_mult(b3, 0x5B, 0x69); 74 uint8_t y1 = gf_mult(b0, 0x5B, 0x69) ^ gf_mult(b1, 0xEF, 0x69) ^ 75 gf_mult(b2, 0xEF, 0x69) ^ gf_mult(b3, 0x01, 0x69); 76 uint8_t y2 = gf_mult(b0, 0xEF, 0x69) ^ gf_mult(b1, 0x5B, 0x69) ^ 77 gf_mult(b2, 0x01, 0x69) ^ gf_mult(b3, 0xEF, 0x69); 78 uint8_t y3 = gf_mult(b0, 0xEF, 0x69) ^ gf_mult(b1, 0x01, 0x69) ^ 79 gf_mult(b2, 0xEF, 0x69) ^ gf_mult(b3, 0x5B, 0x69); 80 81 return y0 | (y1 << 8) | (y2 << 16) | (y3 << 24); 82 } 83 84 /* g function for key schedule */ 85 static uint32_t g_func(uint32_t x, const uint32_t *S, int k) { 86 uint8_t b0 = x & 0xFF; 87 uint8_t b1 = (x >> 8) & 0xFF; 88 uint8_t b2 = (x >> 16) & 0xFF; 89 uint8_t b3 = (x >> 24) & 0xFF; 90 91 if (k == 4) { 92 b0 = Q1[b0] ^ ((S[3] >> 0) & 0xFF); 93 b1 = Q0[b1] ^ ((S[3] >> 8) & 0xFF); 94 b2 = Q0[b2] ^ ((S[3] >> 16) & 0xFF); 95 b3 = Q1[b3] ^ ((S[3] >> 24) & 0xFF); 96 } 97 if (k >= 3) { 98 b0 = Q1[b0] ^ ((S[2] >> 0) & 0xFF); 99 b1 = Q1[b1] ^ ((S[2] >> 8) & 0xFF); 100 b2 = Q0[b2] ^ ((S[2] >> 16) & 0xFF); 101 b3 = Q0[b3] ^ ((S[2] >> 24) & 0xFF); 102 } 103 104 b0 = Q1[Q0[Q0[b0] ^ ((S[1] >> 0) & 0xFF)] ^ ((S[0] >> 0) & 0xFF)]; 105 b1 = Q0[Q0[Q1[b1] ^ ((S[1] >> 8) & 0xFF)] ^ ((S[0] >> 8) & 0xFF)]; 106 b2 = Q1[Q1[Q0[b2] ^ ((S[1] >> 16) & 0xFF)] ^ ((S[0] >> 16) & 0xFF)]; 107 b3 = Q0[Q1[Q1[b3] ^ ((S[1] >> 24) & 0xFF)] ^ ((S[0] >> 24) & 0xFF)]; 108 109 return mds_multiply(b0 | (b1 << 8) | (b2 << 16) | (b3 << 24)); 110 } 111 112 /* Twofish key schedule */ 113 static void twofish_setup_key(twofish_key_schedule *ks, const uint8_t *key) { 114 uint32_t M[8]; 115 uint32_t S[4]; 116 117 /* Split key into 32-bit words */ 118 for (int i = 0; i < 8; i++) { 119 M[i] = ((uint32_t)key[i*4]) | 120 ((uint32_t)key[i*4+1] << 8) | 121 ((uint32_t)key[i*4+2] << 16) | 122 ((uint32_t)key[i*4+3] << 24); 123 } 124 125 /* Generate S-box keys */ 126 for (int i = 0; i < 4; i++) { 127 S[i] = M[2*i] ^ M[2*i+1]; 128 } 129 130 /* Generate subkeys */ 131 for (int i = 0; i < 20; i++) { 132 uint32_t A = g_func(2*i * 0x01010101, S, 4); 133 uint32_t B = g_func((2*i+1) * 0x01010101, S, 4); 134 B = (B << 8) | (B >> 24); 135 ks->K[2*i] = A + B; 136 ks->K[2*i+1] = ((A + 2*B) << 9) | ((A + 2*B) >> 23); 137 } 138 139 /* Build S-boxes */ 140 for (int i = 0; i < 256; i++) { 141 for (int j = 0; j < 4; j++) { 142 ks->S[j][i] = g_func(i, S, 4); 143 } 144 } 145 } 146 147 /* Twofish block encryption */ 148 static void twofish_encrypt_block(const twofish_key_schedule *ks, 149 const uint8_t *in, uint8_t *out) { 150 uint32_t R0, R1, R2, R3; 151 152 /* Input whitening */ 153 R0 = ((uint32_t)in[0]) | ((uint32_t)in[1] << 8) | 154 ((uint32_t)in[2] << 16) | ((uint32_t)in[3] << 24); 155 R1 = ((uint32_t)in[4]) | ((uint32_t)in[5] << 8) | 156 ((uint32_t)in[6] << 16) | ((uint32_t)in[7] << 24); 157 R2 = ((uint32_t)in[8]) | ((uint32_t)in[9] << 8) | 158 ((uint32_t)in[10] << 16) | ((uint32_t)in[11] << 24); 159 R3 = ((uint32_t)in[12]) | ((uint32_t)in[13] << 8) | 160 ((uint32_t)in[14] << 16) | ((uint32_t)in[15] << 24); 161 162 R0 ^= ks->K[0]; 163 R1 ^= ks->K[1]; 164 R2 ^= ks->K[2]; 165 R3 ^= ks->K[3]; 166 167 /* 16 rounds */ 168 for (int r = 0; r < 16; r++) { 169 uint32_t F0 = ks->S[0][R0 & 0xFF] ^ ks->S[1][(R0 >> 8) & 0xFF] ^ 170 ks->S[2][(R0 >> 16) & 0xFF] ^ ks->S[3][(R0 >> 24) & 0xFF]; 171 uint32_t F1 = ks->S[0][R1 & 0xFF] ^ ks->S[1][(R1 >> 8) & 0xFF] ^ 172 ks->S[2][(R1 >> 16) & 0xFF] ^ ks->S[3][(R1 >> 24) & 0xFF]; 173 174 R2 ^= (F0 + F1 + ks->K[8 + 2*r]); 175 R2 = (R2 >> 1) | (R2 << 31); 176 177 R3 = (R3 << 1) | (R3 >> 31); 178 R3 ^= (F0 + 2*F1 + ks->K[8 + 2*r + 1]); 179 180 /* Swap for next round */ 181 if (r < 15) { 182 uint32_t tmp = R0; 183 R0 = R2; 184 R2 = tmp; 185 tmp = R1; 186 R1 = R3; 187 R3 = tmp; 188 } 189 } 190 191 /* Output whitening */ 192 R2 ^= ks->K[4]; 193 R3 ^= ks->K[5]; 194 R0 ^= ks->K[6]; 195 R1 ^= ks->K[7]; 196 197 out[0] = R2 & 0xFF; 198 out[1] = (R2 >> 8) & 0xFF; 199 out[2] = (R2 >> 16) & 0xFF; 200 out[3] = (R2 >> 24) & 0xFF; 201 202 out[4] = R3 & 0xFF; 203 out[5] = (R3 >> 8) & 0xFF; 204 out[6] = (R3 >> 16) & 0xFF; 205 out[7] = (R3 >> 24) & 0xFF; 206 207 out[8] = R0 & 0xFF; 208 out[9] = (R0 >> 8) & 0xFF; 209 out[10] = (R0 >> 16) & 0xFF; 210 out[11] = (R0 >> 24) & 0xFF; 211 212 out[12] = R1 & 0xFF; 213 out[13] = (R1 >> 8) & 0xFF; 214 out[14] = (R1 >> 16) & 0xFF; 215 out[15] = (R1 >> 24) & 0xFF; 216 } 217 218 /* GCM helper functions */ 219 static inline __m128i reverse_bytes(__m128i x) { 220 const __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); 221 return _mm_shuffle_epi8(x, mask); 222 } 223 224 static inline __m128i gf_mult_gcm(__m128i a, __m128i b) { 225 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 226 __m128i tmp8, tmp9, tmp10, tmp11, tmp12; 227 __m128i XMMMASK = _mm_setr_epi32(0xffffffff, 0x0, 0x0, 0x0); 228 229 tmp3 = _mm_clmulepi64_si128(a, b, 0x00); 230 tmp6 = _mm_clmulepi64_si128(a, b, 0x11); 231 tmp4 = _mm_shuffle_epi32(a, 78); 232 tmp5 = _mm_shuffle_epi32(b, 78); 233 tmp4 = _mm_xor_si128(tmp4, a); 234 tmp5 = _mm_xor_si128(tmp5, b); 235 tmp4 = _mm_clmulepi64_si128(tmp4, tmp5, 0x00); 236 tmp4 = _mm_xor_si128(tmp4, tmp3); 237 tmp4 = _mm_xor_si128(tmp4, tmp6); 238 tmp5 = _mm_slli_si128(tmp4, 8); 239 tmp4 = _mm_srli_si128(tmp4, 8); 240 tmp3 = _mm_xor_si128(tmp3, tmp5); 241 tmp6 = _mm_xor_si128(tmp6, tmp4); 242 243 tmp7 = _mm_srli_epi32(tmp3, 31); 244 tmp8 = _mm_srli_epi32(tmp6, 31); 245 tmp3 = _mm_slli_epi32(tmp3, 1); 246 tmp6 = _mm_slli_epi32(tmp6, 1); 247 248 tmp9 = _mm_srli_si128(tmp7, 12); 249 tmp8 = _mm_slli_si128(tmp8, 4); 250 tmp7 = _mm_slli_si128(tmp7, 4); 251 tmp3 = _mm_or_si128(tmp3, tmp7); 252 tmp6 = _mm_or_si128(tmp6, tmp8); 253 tmp6 = _mm_or_si128(tmp6, tmp9); 254 255 tmp7 = _mm_slli_epi32(tmp3, 31); 256 tmp8 = _mm_slli_epi32(tmp3, 30); 257 tmp9 = _mm_slli_epi32(tmp3, 25); 258 tmp7 = _mm_xor_si128(tmp7, tmp8); 259 tmp7 = _mm_xor_si128(tmp7, tmp9); 260 tmp8 = _mm_srli_si128(tmp7, 4); 261 tmp7 = _mm_slli_si128(tmp7, 12); 262 tmp3 = _mm_xor_si128(tmp3, tmp7); 263 264 tmp2 = _mm_srli_epi32(tmp3, 1); 265 tmp4 = _mm_srli_epi32(tmp3, 2); 266 tmp5 = _mm_srli_epi32(tmp3, 7); 267 tmp2 = _mm_xor_si128(tmp2, tmp4); 268 tmp2 = _mm_xor_si128(tmp2, tmp5); 269 tmp2 = _mm_xor_si128(tmp2, tmp8); 270 tmp3 = _mm_xor_si128(tmp3, tmp2); 271 tmp6 = _mm_xor_si128(tmp6, tmp3); 272 273 return tmp6; 274 } 275 276 /* GCM GHASH */ 277 static void gcm_ghash(const __m128i *H_powers, const uint8_t *data, size_t len, __m128i *result) { 278 __m128i hash = _mm_setzero_si128(); 279 280 for (size_t i = 0; i < len / 16; i++) { 281 __m128i block = _mm_loadu_si128((const __m128i*)(data + i*16)); 282 block = reverse_bytes(block); 283 hash = _mm_xor_si128(hash, block); 284 hash = gf_mult_gcm(hash, H_powers[0]); 285 } 286 287 /* Handle remaining bytes */ 288 if (len % 16 != 0) { 289 uint8_t last_block[16] = {0}; 290 memcpy(last_block, data + (len / 16) * 16, len % 16); 291 __m128i block = _mm_loadu_si128((const __m128i*)last_block); 292 block = reverse_bytes(block); 293 hash = _mm_xor_si128(hash, block); 294 hash = gf_mult_gcm(hash, H_powers[0]); 295 } 296 297 *result = hash; 298 } 299 300 int twofish256_gcm_init(twofish256_gcm_context *ctx, const uint8_t *key) { 301 if (!ctx || !key) return -1; 302 303 /* Initialize Twofish key schedule */ 304 twofish_setup_key(&ctx->key_schedule, key); 305 306 /* Compute H = E(K, 0^128) for GCM */ 307 uint8_t zero_block[16] = {0}; 308 uint8_t h_block[16]; 309 twofish_encrypt_block(&ctx->key_schedule, zero_block, h_block); 310 ctx->H = _mm_loadu_si128((const __m128i*)h_block); 311 ctx->H = reverse_bytes(ctx->H); 312 313 /* Precompute powers of H */ 314 ctx->H_powers[0] = ctx->H; 315 for (int i = 1; i < 8; i++) { 316 ctx->H_powers[i] = gf_mult_gcm(ctx->H_powers[i-1], ctx->H); 317 } 318 319 return 0; 320 } 321 322 int twofish256_gcm_encrypt( 323 twofish256_gcm_context *ctx, 324 const uint8_t *iv, size_t iv_len, 325 const uint8_t *aad, size_t aad_len, 326 const uint8_t *plaintext, size_t pt_len, 327 uint8_t *ciphertext, 328 uint8_t *tag, size_t tag_len 329 ) { 330 if (!ctx || !iv || !tag || tag_len != 16) return -1; 331 if (pt_len > 0 && (!plaintext || !ciphertext)) return -1; 332 333 /* Prepare counter */ 334 uint8_t counter[16] = {0}; 335 if (iv_len == 12) { 336 memcpy(counter, iv, 12); 337 counter[15] = 1; 338 } else { 339 /* GHASH the IV */ 340 __m128i iv_hash; 341 gcm_ghash(ctx->H_powers, iv, iv_len, &iv_hash); 342 uint8_t len_block[16] = {0}; 343 uint64_t iv_bits = iv_len * 8; 344 for (int i = 0; i < 8; i++) { 345 len_block[15-i] = (iv_bits >> (i*8)) & 0xFF; 346 } 347 __m128i len_vec = _mm_loadu_si128((const __m128i*)len_block); 348 len_vec = reverse_bytes(len_vec); 349 iv_hash = _mm_xor_si128(iv_hash, len_vec); 350 iv_hash = gf_mult_gcm(iv_hash, ctx->H_powers[0]); 351 _mm_storeu_si128((__m128i*)counter, reverse_bytes(iv_hash)); 352 } 353 354 /* Encrypt tag using counter = 0 */ 355 uint8_t tag_mask[16]; 356 twofish_encrypt_block(&ctx->key_schedule, counter, tag_mask); 357 358 /* CTR mode encryption */ 359 for (size_t i = 0; i < pt_len; i += 16) { 360 /* Increment counter */ 361 for (int j = 15; j >= 0; j--) { 362 if (++counter[j] != 0) break; 363 } 364 365 uint8_t keystream[16]; 366 twofish_encrypt_block(&ctx->key_schedule, counter, keystream); 367 368 size_t block_len = (i + 16 <= pt_len) ? 16 : (pt_len - i); 369 for (size_t j = 0; j < block_len; j++) { 370 ciphertext[i+j] = plaintext[i+j] ^ keystream[j]; 371 } 372 } 373 374 /* Compute GHASH */ 375 __m128i ghash = _mm_setzero_si128(); 376 377 if (aad_len > 0) { 378 gcm_ghash(ctx->H_powers, aad, aad_len, &ghash); 379 } 380 381 if (pt_len > 0) { 382 __m128i ct_hash; 383 gcm_ghash(ctx->H_powers, ciphertext, pt_len, &ct_hash); 384 ghash = _mm_xor_si128(ghash, ct_hash); 385 } 386 387 /* Add length block */ 388 uint8_t len_block[16] = {0}; 389 uint64_t aad_bits = aad_len * 8; 390 uint64_t ct_bits = pt_len * 8; 391 for (int i = 0; i < 8; i++) { 392 len_block[7-i] = (aad_bits >> (i*8)) & 0xFF; 393 len_block[15-i] = (ct_bits >> (i*8)) & 0xFF; 394 } 395 __m128i len_vec = _mm_loadu_si128((const __m128i*)len_block); 396 len_vec = reverse_bytes(len_vec); 397 ghash = _mm_xor_si128(ghash, len_vec); 398 ghash = gf_mult_gcm(ghash, ctx->H_powers[0]); 399 400 /* XOR with tag mask */ 401 ghash = reverse_bytes(ghash); 402 __m128i tag_vec = _mm_xor_si128(ghash, _mm_loadu_si128((const __m128i*)tag_mask)); 403 _mm_storeu_si128((__m128i*)tag, tag_vec); 404 405 return 0; 406 } 407 408 int twofish256_gcm_decrypt( 409 twofish256_gcm_context *ctx, 410 const uint8_t *iv, size_t iv_len, 411 const uint8_t *aad, size_t aad_len, 412 const uint8_t *ciphertext, size_t ct_len, 413 const uint8_t *tag, size_t tag_len, 414 uint8_t *plaintext 415 ) { 416 if (!ctx || !iv || !tag || tag_len != 16) return -1; 417 if (ct_len > 0 && (!ciphertext || !plaintext)) return -1; 418 419 /* Compute expected tag */ 420 uint8_t expected_tag[16]; 421 422 /* Prepare counter */ 423 uint8_t counter[16] = {0}; 424 if (iv_len == 12) { 425 memcpy(counter, iv, 12); 426 counter[15] = 1; 427 } else { 428 __m128i iv_hash; 429 gcm_ghash(ctx->H_powers, iv, iv_len, &iv_hash); 430 uint8_t len_block[16] = {0}; 431 uint64_t iv_bits = iv_len * 8; 432 for (int i = 0; i < 8; i++) { 433 len_block[15-i] = (iv_bits >> (i*8)) & 0xFF; 434 } 435 __m128i len_vec = _mm_loadu_si128((const __m128i*)len_block); 436 len_vec = reverse_bytes(len_vec); 437 iv_hash = _mm_xor_si128(iv_hash, len_vec); 438 iv_hash = gf_mult_gcm(iv_hash, ctx->H_powers[0]); 439 _mm_storeu_si128((__m128i*)counter, reverse_bytes(iv_hash)); 440 } 441 442 /* Encrypt tag using counter = 0 */ 443 uint8_t tag_mask[16]; 444 twofish_encrypt_block(&ctx->key_schedule, counter, tag_mask); 445 446 /* Compute GHASH */ 447 __m128i ghash = _mm_setzero_si128(); 448 449 if (aad_len > 0) { 450 gcm_ghash(ctx->H_powers, aad, aad_len, &ghash); 451 } 452 453 if (ct_len > 0) { 454 __m128i ct_hash; 455 gcm_ghash(ctx->H_powers, ciphertext, ct_len, &ct_hash); 456 ghash = _mm_xor_si128(ghash, ct_hash); 457 } 458 459 /* Add length block */ 460 uint8_t len_block[16] = {0}; 461 uint64_t aad_bits = aad_len * 8; 462 uint64_t ct_bits = ct_len * 8; 463 for (int i = 0; i < 8; i++) { 464 len_block[7-i] = (aad_bits >> (i*8)) & 0xFF; 465 len_block[15-i] = (ct_bits >> (i*8)) & 0xFF; 466 } 467 __m128i len_vec = _mm_loadu_si128((const __m128i*)len_block); 468 len_vec = reverse_bytes(len_vec); 469 ghash = _mm_xor_si128(ghash, len_vec); 470 ghash = gf_mult_gcm(ghash, ctx->H_powers[0]); 471 472 /* XOR with tag mask */ 473 ghash = reverse_bytes(ghash); 474 __m128i tag_vec = _mm_xor_si128(ghash, _mm_loadu_si128((const __m128i*)tag_mask)); 475 _mm_storeu_si128((__m128i*)expected_tag, tag_vec); 476 477 /* Constant-time tag comparison */ 478 int tag_match = 1; 479 for (size_t i = 0; i < 16; i++) { 480 if (tag[i] != expected_tag[i]) tag_match = 0; 481 } 482 483 if (!tag_match) { 484 memset(plaintext, 0, ct_len); 485 return -1; 486 } 487 488 /* CTR mode decryption */ 489 memcpy(counter, iv, (iv_len < 12) ? iv_len : 12); 490 if (iv_len == 12) counter[15] = 1; 491 492 for (size_t i = 0; i < ct_len; i += 16) { 493 /* Increment counter */ 494 for (int j = 15; j >= 0; j--) { 495 if (++counter[j] != 0) break; 496 } 497 498 uint8_t keystream[16]; 499 twofish_encrypt_block(&ctx->key_schedule, counter, keystream); 500 501 size_t block_len = (i + 16 <= ct_len) ? 16 : (ct_len - i); 502 for (size_t j = 0; j < block_len; j++) { 503 plaintext[i+j] = ciphertext[i+j] ^ keystream[j]; 504 } 505 } 506 507 return 0; 508 } 509 510 void twofish256_gcm_cleanup(twofish256_gcm_context *ctx) { 511 if (!ctx) return; 512 memset(ctx, 0, sizeof(twofish256_gcm_context)); 513 }