Serpent-256-GCM.c (19572B)
1 /* 2 * Serpent-256-GCM Implementation with Hardware Acceleration 3 * Serpent: 256-bit key, 128-bit block cipher (AES finalist) 4 * Uses SIMD (SSE/AVX) for performance optimization 5 * GCM mode with PCLMULQDQ acceleration 6 */ 7 8 #include "Serpent-256-GCM.h" 9 #include <stdio.h> 10 #include <stdlib.h> 11 #include <string.h> 12 #include <stdint.h> 13 #include <immintrin.h> 14 15 #ifdef __GNUC__ 16 #include <cpuid.h> 17 #endif 18 19 // CPU feature detection 20 static int has_sse2 = 0; 21 static int has_pclmulqdq = 0; 22 23 static void detect_cpu_features(void) { 24 #ifdef __GNUC__ 25 unsigned int eax, ebx, ecx, edx; 26 if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { 27 has_sse2 = (edx & bit_SSE2) != 0; 28 has_pclmulqdq = (ecx & bit_PCLMUL) != 0; 29 } 30 #endif 31 } 32 33 // Serpent constants (PHI for key schedule) 34 #define PHI 0x9E3779B9 // Golden ratio constant 35 36 // Type definitions are in Serpent-256-GCM.h 37 38 // Official Serpent S-box lookup tables (from specification) 39 static const uint8_t SBOX[8][16] = { 40 {3,8,15,1,10,6,5,11,14,13,4,2,7,0,9,12}, // S0 41 {15,12,2,7,9,0,5,10,1,11,14,8,6,13,3,4}, // S1 42 {8,6,7,9,3,12,10,15,13,1,14,4,0,11,5,2}, // S2 43 {0,15,11,8,12,9,6,3,13,1,2,4,10,7,5,14}, // S3 44 {1,15,8,3,12,0,11,6,2,5,4,10,9,14,7,13}, // S4 45 {15,5,2,11,4,10,9,12,0,3,14,8,13,6,7,1}, // S5 46 {7,2,12,5,8,4,6,11,14,9,1,15,13,3,10,0}, // S6 47 {1,13,15,0,14,8,2,11,7,4,12,10,9,3,5,6} // S7 48 }; 49 50 // Inverse S-box lookup tables 51 static const uint8_t SBOX_INV[8][16] = { 52 {13,3,11,0,10,6,5,12,1,14,4,7,15,9,8,2}, // S0^-1 53 {5,8,2,14,15,6,12,3,11,4,7,9,1,13,10,0}, // S1^-1 54 {12,9,15,4,11,14,1,2,0,3,6,13,5,8,10,7}, // S2^-1 55 {0,9,10,7,11,14,6,13,3,5,12,2,4,8,15,1}, // S3^-1 56 {5,0,8,3,10,9,7,14,2,12,11,6,4,15,13,1}, // S4^-1 57 {8,15,2,9,4,1,13,14,11,6,5,3,7,12,10,0}, // S5^-1 58 {15,10,1,13,5,3,6,0,4,9,14,7,2,12,8,11}, // S6^-1 59 {3,0,6,13,9,14,15,8,5,12,11,7,10,1,4,2} // S7^-1 60 }; 61 62 // Apply S-box to 4 words (bitsliced - 32 parallel 4-bit S-box lookups) 63 static inline void sbox(int box, uint32_t *r0, uint32_t *r1, uint32_t *r2, uint32_t *r3) { 64 uint32_t out0 = 0, out1 = 0, out2 = 0, out3 = 0; 65 for (int i = 0; i < 32; i++) { 66 // Extract 4-bit input from bit position i of each word 67 uint8_t input = ((*r0 >> i) & 1) | 68 (((*r1 >> i) & 1) << 1) | 69 (((*r2 >> i) & 1) << 2) | 70 (((*r3 >> i) & 1) << 3); 71 // Apply S-box lookup 72 uint8_t output = SBOX[box][input]; 73 // Distribute output bits back to words 74 out0 |= ((output >> 0) & 1) << i; 75 out1 |= ((output >> 1) & 1) << i; 76 out2 |= ((output >> 2) & 1) << i; 77 out3 |= ((output >> 3) & 1) << i; 78 } 79 *r0 = out0; *r1 = out1; *r2 = out2; *r3 = out3; 80 } 81 82 // Apply inverse S-box to 4 words (bitsliced) 83 static inline void sbox_inv(int box, uint32_t *r0, uint32_t *r1, uint32_t *r2, uint32_t *r3) { 84 uint32_t out0 = 0, out1 = 0, out2 = 0, out3 = 0; 85 for (int i = 0; i < 32; i++) { 86 uint8_t input = ((*r0 >> i) & 1) | 87 (((*r1 >> i) & 1) << 1) | 88 (((*r2 >> i) & 1) << 2) | 89 (((*r3 >> i) & 1) << 3); 90 uint8_t output = SBOX_INV[box][input]; 91 out0 |= ((output >> 0) & 1) << i; 92 out1 |= ((output >> 1) & 1) << i; 93 out2 |= ((output >> 2) & 1) << i; 94 out3 |= ((output >> 3) & 1) << i; 95 } 96 *r0 = out0; *r1 = out1; *r2 = out2; *r3 = out3; 97 } 98 99 // Linear transformation 100 static inline void linear_transform(uint32_t *r0, uint32_t *r1, uint32_t *r2, uint32_t *r3) { 101 uint32_t t0 = *r0, t1 = *r1, t2 = *r2, t3 = *r3; 102 t0 = ((t0 << 13) | (t0 >> 19)); 103 t2 = ((t2 << 3) | (t2 >> 29)); 104 t1 = t1 ^ t0 ^ t2; 105 t3 = t3 ^ t2 ^ (t0 << 3); 106 t1 = ((t1 << 1) | (t1 >> 31)); 107 t3 = ((t3 << 7) | (t3 >> 25)); 108 t0 = t0 ^ t1 ^ t3; 109 t2 = t2 ^ t3 ^ (t1 << 7); 110 t0 = ((t0 << 5) | (t0 >> 27)); 111 t2 = ((t2 << 22) | (t2 >> 10)); 112 *r0 = t0; *r1 = t1; *r2 = t2; *r3 = t3; 113 } 114 115 // Inverse linear transformation 116 static inline void linear_transform_inv(uint32_t *r0, uint32_t *r1, uint32_t *r2, uint32_t *r3) { 117 uint32_t t0 = *r0, t1 = *r1, t2 = *r2, t3 = *r3; 118 t2 = ((t2 >> 22) | (t2 << 10)); 119 t0 = ((t0 >> 5) | (t0 << 27)); 120 t2 = t2 ^ t3 ^ (t1 << 7); 121 t0 = t0 ^ t1 ^ t3; 122 t3 = ((t3 >> 7) | (t3 << 25)); 123 t1 = ((t1 >> 1) | (t1 << 31)); 124 t3 = t3 ^ t2 ^ (t0 << 3); 125 t1 = t1 ^ t0 ^ t2; 126 t2 = ((t2 >> 3) | (t2 << 29)); 127 t0 = ((t0 >> 13) | (t0 << 19)); 128 *r0 = t0; *r1 = t1; *r2 = t2; *r3 = t3; 129 } 130 131 // Serpent key expansion function 132 static void serpent_key_expansion(const uint8_t *key, serpent_key_schedule *ks) { 133 uint32_t w[140]; // Prekeys 134 int i; 135 136 // Initialize w from key (256-bit key = 8 words) 137 for (i = 0; i < 8; i++) { 138 w[i] = ((uint32_t)key[i*4]) | 139 ((uint32_t)key[i*4+1] << 8) | 140 ((uint32_t)key[i*4+2] << 16) | 141 ((uint32_t)key[i*4+3] << 24); 142 } 143 144 // Generate prekeys 145 for (i = 8; i < 16; i++) { 146 w[i] = 0; 147 } 148 149 for (i = 8; i < 140; i++) { 150 w[i] = w[i-8] ^ w[i-5] ^ w[i-3] ^ w[i-1] ^ PHI ^ (i-8); 151 w[i] = (w[i] << 11) | (w[i] >> 21); 152 } 153 154 // Apply S-boxes to generate subkeys 155 // Key schedule uses S-boxes in order: S3, S2, S1, S0, S7, S6, S5, S4 156 static const int ks_sbox_order[8] = {3, 2, 1, 0, 7, 6, 5, 4}; 157 for (i = 0; i < 33; i++) { 158 uint32_t r0 = w[4*i+8]; 159 uint32_t r1 = w[4*i+9]; 160 uint32_t r2 = w[4*i+10]; 161 uint32_t r3 = w[4*i+11]; 162 163 sbox(ks_sbox_order[i % 8], &r0, &r1, &r2, &r3); 164 165 ks->subkeys[i][0] = r0; 166 ks->subkeys[i][1] = r1; 167 ks->subkeys[i][2] = r2; 168 ks->subkeys[i][3] = r3; 169 } 170 } 171 172 // Serpent encryption of a single block 173 static void serpent_encrypt_block(const uint32_t *input, uint32_t *output, 174 const serpent_key_schedule *ks) { 175 uint32_t r0 = input[0]; 176 uint32_t r1 = input[1]; 177 uint32_t r2 = input[2]; 178 uint32_t r3 = input[3]; 179 180 // Initial permutation (IP) - XOR with first subkey 181 r0 ^= ks->subkeys[0][0]; 182 r1 ^= ks->subkeys[0][1]; 183 r2 ^= ks->subkeys[0][2]; 184 r3 ^= ks->subkeys[0][3]; 185 186 // 31 rounds 187 for (int i = 0; i < 31; i++) { 188 sbox(i % 8, &r0, &r1, &r2, &r3); 189 linear_transform(&r0, &r1, &r2, &r3); 190 191 r0 ^= ks->subkeys[i+1][0]; 192 r1 ^= ks->subkeys[i+1][1]; 193 r2 ^= ks->subkeys[i+1][2]; 194 r3 ^= ks->subkeys[i+1][3]; 195 } 196 197 // Final round (no linear transform) 198 sbox(7, &r0, &r1, &r2, &r3); 199 200 r0 ^= ks->subkeys[32][0]; 201 r1 ^= ks->subkeys[32][1]; 202 r2 ^= ks->subkeys[32][2]; 203 r3 ^= ks->subkeys[32][3]; 204 205 output[0] = r0; 206 output[1] = r1; 207 output[2] = r2; 208 output[3] = r3; 209 } 210 211 // Serpent decryption of a single block 212 static void serpent_decrypt_block(const uint32_t *input, uint32_t *output, 213 const serpent_key_schedule *ks) { 214 uint32_t r0 = input[0]; 215 uint32_t r1 = input[1]; 216 uint32_t r2 = input[2]; 217 uint32_t r3 = input[3]; 218 219 // XOR with last subkey 220 r0 ^= ks->subkeys[32][0]; 221 r1 ^= ks->subkeys[32][1]; 222 r2 ^= ks->subkeys[32][2]; 223 r3 ^= ks->subkeys[32][3]; 224 225 // Inverse final round 226 sbox_inv(7, &r0, &r1, &r2, &r3); 227 228 // 31 inverse rounds 229 for (int i = 31; i > 0; i--) { 230 r0 ^= ks->subkeys[i][0]; 231 r1 ^= ks->subkeys[i][1]; 232 r2 ^= ks->subkeys[i][2]; 233 r3 ^= ks->subkeys[i][3]; 234 235 linear_transform_inv(&r0, &r1, &r2, &r3); 236 sbox_inv((i-1) % 8, &r0, &r1, &r2, &r3); 237 } 238 239 r0 ^= ks->subkeys[0][0]; 240 r1 ^= ks->subkeys[0][1]; 241 r2 ^= ks->subkeys[0][2]; 242 r3 ^= ks->subkeys[0][3]; 243 244 output[0] = r0; 245 output[1] = r1; 246 output[2] = r2; 247 output[3] = r3; 248 } 249 250 // Utility functions for GCM 251 static inline __m128i reverse_bytes(__m128i x) { 252 const __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); 253 return _mm_shuffle_epi8(x, mask); 254 } 255 256 // GHASH multiplication using PCLMULQDQ (same as AES-GCM) 257 static inline __m128i gf_mul(__m128i a, __m128i b) { 258 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 259 __m128i tmp8, tmp9; 260 261 tmp3 = _mm_clmulepi64_si128(a, b, 0x00); 262 tmp6 = _mm_clmulepi64_si128(a, b, 0x11); 263 264 tmp4 = _mm_shuffle_epi32(a, 78); 265 tmp5 = _mm_shuffle_epi32(b, 78); 266 tmp4 = _mm_xor_si128(tmp4, a); 267 tmp5 = _mm_xor_si128(tmp5, b); 268 269 tmp4 = _mm_clmulepi64_si128(tmp4, tmp5, 0x00); 270 tmp4 = _mm_xor_si128(tmp4, tmp3); 271 tmp4 = _mm_xor_si128(tmp4, tmp6); 272 273 tmp5 = _mm_slli_si128(tmp4, 8); 274 tmp4 = _mm_srli_si128(tmp4, 8); 275 tmp3 = _mm_xor_si128(tmp3, tmp5); 276 tmp6 = _mm_xor_si128(tmp6, tmp4); 277 278 // Reduction 279 tmp7 = _mm_srli_epi32(tmp3, 31); 280 tmp8 = _mm_srli_epi32(tmp6, 31); 281 tmp3 = _mm_slli_epi32(tmp3, 1); 282 tmp6 = _mm_slli_epi32(tmp6, 1); 283 284 tmp9 = _mm_srli_si128(tmp7, 12); 285 tmp8 = _mm_slli_si128(tmp8, 4); 286 tmp7 = _mm_slli_si128(tmp7, 4); 287 tmp3 = _mm_or_si128(tmp3, tmp7); 288 tmp6 = _mm_or_si128(tmp6, tmp8); 289 tmp6 = _mm_or_si128(tmp6, tmp9); 290 291 tmp7 = _mm_slli_epi32(tmp3, 31); 292 tmp8 = _mm_slli_epi32(tmp3, 30); 293 tmp9 = _mm_slli_epi32(tmp3, 25); 294 295 tmp7 = _mm_xor_si128(tmp7, tmp8); 296 tmp7 = _mm_xor_si128(tmp7, tmp9); 297 tmp8 = _mm_srli_si128(tmp7, 4); 298 tmp7 = _mm_slli_si128(tmp7, 12); 299 tmp3 = _mm_xor_si128(tmp3, tmp7); 300 301 tmp2 = _mm_srli_epi32(tmp3, 1); 302 tmp4 = _mm_srli_epi32(tmp3, 2); 303 tmp5 = _mm_srli_epi32(tmp3, 7); 304 tmp2 = _mm_xor_si128(tmp2, tmp4); 305 tmp2 = _mm_xor_si128(tmp2, tmp5); 306 tmp2 = _mm_xor_si128(tmp2, tmp8); 307 tmp3 = _mm_xor_si128(tmp3, tmp2); 308 tmp6 = _mm_xor_si128(tmp6, tmp3); 309 310 return tmp6; 311 } 312 313 // Initialize Serpent-GCM context 314 int serpent_gcm_init(serpent_gcm_context *ctx, const uint8_t *key) { 315 if (!has_pclmulqdq) { 316 fprintf(stderr, "Error: CPU does not support PCLMULQDQ\n"); 317 return -1; 318 } 319 320 // Generate key schedule 321 serpent_key_expansion(key, &ctx->key_schedule); 322 323 // Compute H = E(K, 0^128) 324 uint32_t zero[4] = {0, 0, 0, 0}; 325 uint32_t h_block[4]; 326 serpent_encrypt_block(zero, h_block, &ctx->key_schedule); 327 328 // Convert to __m128i and reverse bytes for GCM 329 ctx->H = _mm_set_epi32(h_block[3], h_block[2], h_block[1], h_block[0]); 330 ctx->H = reverse_bytes(ctx->H); 331 332 // Precompute powers of H 333 ctx->H_powers[0] = ctx->H; 334 for (int i = 1; i < 8; i++) { 335 ctx->H_powers[i] = gf_mul(ctx->H_powers[i-1], ctx->H); 336 } 337 338 return 0; 339 } 340 341 // GHASH computation 342 static void ghash(const serpent_gcm_context *ctx, const uint8_t *aad, size_t aad_len, 343 const uint8_t *ciphertext, size_t ct_len, __m128i *tag) { 344 __m128i hash = _mm_setzero_si128(); 345 size_t i; 346 347 // Process AAD 348 for (i = 0; i + 16 <= aad_len; i += 16) { 349 __m128i block = _mm_loadu_si128((__m128i*)(aad + i)); 350 block = reverse_bytes(block); 351 hash = _mm_xor_si128(hash, block); 352 hash = gf_mul(hash, ctx->H); 353 } 354 355 if (i < aad_len) { 356 uint8_t temp[16] = {0}; 357 memcpy(temp, aad + i, aad_len - i); 358 __m128i block = _mm_loadu_si128((__m128i*)temp); 359 block = reverse_bytes(block); 360 hash = _mm_xor_si128(hash, block); 361 hash = gf_mul(hash, ctx->H); 362 } 363 364 // Process ciphertext 365 for (i = 0; i + 16 <= ct_len; i += 16) { 366 __m128i block = _mm_loadu_si128((__m128i*)(ciphertext + i)); 367 block = reverse_bytes(block); 368 hash = _mm_xor_si128(hash, block); 369 hash = gf_mul(hash, ctx->H); 370 } 371 372 if (i < ct_len) { 373 uint8_t temp[16] = {0}; 374 memcpy(temp, ciphertext + i, ct_len - i); 375 __m128i block = _mm_loadu_si128((__m128i*)temp); 376 block = reverse_bytes(block); 377 hash = _mm_xor_si128(hash, block); 378 hash = gf_mul(hash, ctx->H); 379 } 380 381 // Process length block 382 uint64_t aad_bits = aad_len * 8; 383 uint64_t ct_bits = ct_len * 8; 384 __m128i len_block = _mm_set_epi64x(ct_bits, aad_bits); 385 len_block = reverse_bytes(len_block); 386 hash = _mm_xor_si128(hash, len_block); 387 hash = gf_mul(hash, ctx->H); 388 389 *tag = reverse_bytes(hash); 390 } 391 392 // Increment counter 393 static inline __m128i inc_counter(__m128i counter) { 394 uint32_t ctr = _mm_extract_epi32(counter, 0); 395 ctr++; 396 counter = _mm_insert_epi32(counter, ctr, 0); 397 return counter; 398 } 399 400 // Serpent-GCM Encryption 401 int serpent_gcm_encrypt(serpent_gcm_context *ctx, 402 const uint8_t *iv, size_t iv_len, 403 const uint8_t *aad, size_t aad_len, 404 const uint8_t *plaintext, size_t pt_len, 405 uint8_t *ciphertext, 406 uint8_t *tag, size_t tag_len) { 407 if (tag_len > 16) return -1; 408 409 // Prepare initial counter 410 __m128i counter; 411 if (iv_len == 12) { 412 counter = _mm_set_epi32(1, 413 ((uint32_t*)iv)[2], 414 ((uint32_t*)iv)[1], 415 ((uint32_t*)iv)[0]); 416 } else { 417 // Non-standard IV: use GHASH 418 __m128i hash = _mm_setzero_si128(); 419 size_t i; 420 for (i = 0; i + 16 <= iv_len; i += 16) { 421 __m128i block = _mm_loadu_si128((__m128i*)(iv + i)); 422 block = reverse_bytes(block); 423 hash = _mm_xor_si128(hash, block); 424 hash = gf_mul(hash, ctx->H); 425 } 426 if (i < iv_len) { 427 uint8_t temp[16] = {0}; 428 memcpy(temp, iv + i, iv_len - i); 429 __m128i block = _mm_loadu_si128((__m128i*)temp); 430 block = reverse_bytes(block); 431 hash = _mm_xor_si128(hash, block); 432 hash = gf_mul(hash, ctx->H); 433 } 434 uint64_t iv_bits = iv_len * 8; 435 __m128i len_block = _mm_set_epi64x(iv_bits, 0); 436 len_block = reverse_bytes(len_block); 437 hash = _mm_xor_si128(hash, len_block); 438 hash = gf_mul(hash, ctx->H); 439 counter = reverse_bytes(hash); 440 } 441 442 __m128i J0 = counter; 443 444 // Encrypt plaintext 445 size_t i; 446 for (i = 0; i + 16 <= pt_len; i += 16) { 447 counter = inc_counter(counter); 448 449 // Extract counter to uint32_t array 450 uint32_t ctr_block[4]; 451 _mm_storeu_si128((__m128i*)ctr_block, counter); 452 453 // Encrypt counter with Serpent 454 uint32_t keystream[4]; 455 serpent_encrypt_block(ctr_block, keystream, &ctx->key_schedule); 456 457 // XOR with plaintext 458 uint32_t *pt = (uint32_t*)(plaintext + i); 459 uint32_t *ct = (uint32_t*)(ciphertext + i); 460 ct[0] = pt[0] ^ keystream[0]; 461 ct[1] = pt[1] ^ keystream[1]; 462 ct[2] = pt[2] ^ keystream[2]; 463 ct[3] = pt[3] ^ keystream[3]; 464 } 465 466 // Handle remaining bytes 467 if (i < pt_len) { 468 counter = inc_counter(counter); 469 uint32_t ctr_block[4]; 470 _mm_storeu_si128((__m128i*)ctr_block, counter); 471 uint32_t keystream[4]; 472 serpent_encrypt_block(ctr_block, keystream, &ctx->key_schedule); 473 474 uint8_t *ks_bytes = (uint8_t*)keystream; 475 for (size_t j = 0; j < pt_len - i; j++) { 476 ciphertext[i + j] = plaintext[i + j] ^ ks_bytes[j]; 477 } 478 } 479 480 // Compute authentication tag 481 __m128i auth_tag; 482 ghash(ctx, aad, aad_len, ciphertext, pt_len, &auth_tag); 483 484 // Encrypt tag with J0 485 uint32_t j0_block[4]; 486 _mm_storeu_si128((__m128i*)j0_block, J0); 487 uint32_t j0_keystream[4]; 488 serpent_encrypt_block(j0_block, j0_keystream, &ctx->key_schedule); 489 490 __m128i j0_ks = _mm_set_epi32(j0_keystream[3], j0_keystream[2], 491 j0_keystream[1], j0_keystream[0]); 492 auth_tag = _mm_xor_si128(auth_tag, j0_ks); 493 494 _mm_storeu_si128((__m128i*)tag, auth_tag); 495 496 return 0; 497 } 498 499 // Serpent-GCM Decryption 500 int serpent_gcm_decrypt(serpent_gcm_context *ctx, 501 const uint8_t *iv, size_t iv_len, 502 const uint8_t *aad, size_t aad_len, 503 const uint8_t *ciphertext, size_t ct_len, 504 const uint8_t *tag, size_t tag_len, 505 uint8_t *plaintext) { 506 if (tag_len > 16) return -1; 507 508 // Prepare initial counter 509 __m128i counter; 510 if (iv_len == 12) { 511 counter = _mm_set_epi32(1, 512 ((uint32_t*)iv)[2], 513 ((uint32_t*)iv)[1], 514 ((uint32_t*)iv)[0]); 515 } else { 516 __m128i hash = _mm_setzero_si128(); 517 size_t i; 518 for (i = 0; i + 16 <= iv_len; i += 16) { 519 __m128i block = _mm_loadu_si128((__m128i*)(iv + i)); 520 block = reverse_bytes(block); 521 hash = _mm_xor_si128(hash, block); 522 hash = gf_mul(hash, ctx->H); 523 } 524 if (i < iv_len) { 525 uint8_t temp[16] = {0}; 526 memcpy(temp, iv + i, iv_len - i); 527 __m128i block = _mm_loadu_si128((__m128i*)temp); 528 block = reverse_bytes(block); 529 hash = _mm_xor_si128(hash, block); 530 hash = gf_mul(hash, ctx->H); 531 } 532 uint64_t iv_bits = iv_len * 8; 533 __m128i len_block = _mm_set_epi64x(iv_bits, 0); 534 len_block = reverse_bytes(len_block); 535 hash = _mm_xor_si128(hash, len_block); 536 hash = gf_mul(hash, ctx->H); 537 counter = reverse_bytes(hash); 538 } 539 540 __m128i J0 = counter; 541 542 // Verify authentication tag 543 __m128i computed_tag; 544 ghash(ctx, aad, aad_len, ciphertext, ct_len, &computed_tag); 545 546 uint32_t j0_block[4]; 547 _mm_storeu_si128((__m128i*)j0_block, J0); 548 uint32_t j0_keystream[4]; 549 serpent_encrypt_block(j0_block, j0_keystream, &ctx->key_schedule); 550 551 __m128i j0_ks = _mm_set_epi32(j0_keystream[3], j0_keystream[2], 552 j0_keystream[1], j0_keystream[0]); 553 computed_tag = _mm_xor_si128(computed_tag, j0_ks); 554 555 // Constant-time comparison 556 uint8_t computed_tag_bytes[16]; 557 _mm_storeu_si128((__m128i*)computed_tag_bytes, computed_tag); 558 559 int tag_match = 1; 560 for (size_t i = 0; i < tag_len; i++) { 561 tag_match &= (computed_tag_bytes[i] == tag[i]); 562 } 563 564 if (!tag_match) return -1; 565 566 // Decrypt ciphertext 567 size_t i; 568 for (i = 0; i + 16 <= ct_len; i += 16) { 569 counter = inc_counter(counter); 570 571 uint32_t ctr_block[4]; 572 _mm_storeu_si128((__m128i*)ctr_block, counter); 573 574 uint32_t keystream[4]; 575 serpent_encrypt_block(ctr_block, keystream, &ctx->key_schedule); 576 577 uint32_t *ct = (uint32_t*)(ciphertext + i); 578 uint32_t *pt = (uint32_t*)(plaintext + i); 579 pt[0] = ct[0] ^ keystream[0]; 580 pt[1] = ct[1] ^ keystream[1]; 581 pt[2] = ct[2] ^ keystream[2]; 582 pt[3] = ct[3] ^ keystream[3]; 583 } 584 585 if (i < ct_len) { 586 counter = inc_counter(counter); 587 uint32_t ctr_block[4]; 588 _mm_storeu_si128((__m128i*)ctr_block, counter); 589 uint32_t keystream[4]; 590 serpent_encrypt_block(ctr_block, keystream, &ctx->key_schedule); 591 592 uint8_t *ks_bytes = (uint8_t*)keystream; 593 for (size_t j = 0; j < ct_len - i; j++) { 594 plaintext[i + j] = ciphertext[i + j] ^ ks_bytes[j]; 595 } 596 } 597 598 return 0; 599 } 600 601 /** 602 * Clean up Serpent-256-GCM context 603 * Zeros all sensitive key material 604 */ 605 void serpent_gcm_cleanup(serpent_gcm_context *ctx) { 606 if (ctx == NULL) return; 607 608 // Zero all sensitive data using volatile to prevent compiler optimization 609 volatile uint8_t *p = (volatile uint8_t *)ctx; 610 size_t n = sizeof(serpent_gcm_context); 611 while (n--) { 612 *p++ = 0; 613 } 614 }