luajitos

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

Serpent-256-GCM.c (19572B)


      1 /*
      2  * Serpent-256-GCM Implementation with Hardware Acceleration
      3  * Serpent: 256-bit key, 128-bit block cipher (AES finalist)
      4  * Uses SIMD (SSE/AVX) for performance optimization
      5  * GCM mode with PCLMULQDQ acceleration
      6  */
      7 
      8 #include "Serpent-256-GCM.h"
      9 #include <stdio.h>
     10 #include <stdlib.h>
     11 #include <string.h>
     12 #include <stdint.h>
     13 #include <immintrin.h>
     14 
     15 #ifdef __GNUC__
     16 #include <cpuid.h>
     17 #endif
     18 
     19 // CPU feature detection
     20 static int has_sse2 = 0;
     21 static int has_pclmulqdq = 0;
     22 
     23 static void detect_cpu_features(void) {
     24 #ifdef __GNUC__
     25     unsigned int eax, ebx, ecx, edx;
     26     if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
     27         has_sse2 = (edx & bit_SSE2) != 0;
     28         has_pclmulqdq = (ecx & bit_PCLMUL) != 0;
     29     }
     30 #endif
     31 }
     32 
     33 // Serpent constants (PHI for key schedule)
     34 #define PHI 0x9E3779B9  // Golden ratio constant
     35 
     36 // Type definitions are in Serpent-256-GCM.h
     37 
     38 // Official Serpent S-box lookup tables (from specification)
     39 static const uint8_t SBOX[8][16] = {
     40     {3,8,15,1,10,6,5,11,14,13,4,2,7,0,9,12},      // S0
     41     {15,12,2,7,9,0,5,10,1,11,14,8,6,13,3,4},      // S1
     42     {8,6,7,9,3,12,10,15,13,1,14,4,0,11,5,2},      // S2
     43     {0,15,11,8,12,9,6,3,13,1,2,4,10,7,5,14},      // S3
     44     {1,15,8,3,12,0,11,6,2,5,4,10,9,14,7,13},      // S4
     45     {15,5,2,11,4,10,9,12,0,3,14,8,13,6,7,1},      // S5
     46     {7,2,12,5,8,4,6,11,14,9,1,15,13,3,10,0},      // S6
     47     {1,13,15,0,14,8,2,11,7,4,12,10,9,3,5,6}       // S7
     48 };
     49 
     50 // Inverse S-box lookup tables
     51 static const uint8_t SBOX_INV[8][16] = {
     52     {13,3,11,0,10,6,5,12,1,14,4,7,15,9,8,2},      // S0^-1
     53     {5,8,2,14,15,6,12,3,11,4,7,9,1,13,10,0},      // S1^-1
     54     {12,9,15,4,11,14,1,2,0,3,6,13,5,8,10,7},      // S2^-1
     55     {0,9,10,7,11,14,6,13,3,5,12,2,4,8,15,1},      // S3^-1
     56     {5,0,8,3,10,9,7,14,2,12,11,6,4,15,13,1},      // S4^-1
     57     {8,15,2,9,4,1,13,14,11,6,5,3,7,12,10,0},      // S5^-1
     58     {15,10,1,13,5,3,6,0,4,9,14,7,2,12,8,11},      // S6^-1
     59     {3,0,6,13,9,14,15,8,5,12,11,7,10,1,4,2}       // S7^-1
     60 };
     61 
     62 // Apply S-box to 4 words (bitsliced - 32 parallel 4-bit S-box lookups)
     63 static inline void sbox(int box, uint32_t *r0, uint32_t *r1, uint32_t *r2, uint32_t *r3) {
     64     uint32_t out0 = 0, out1 = 0, out2 = 0, out3 = 0;
     65     for (int i = 0; i < 32; i++) {
     66         // Extract 4-bit input from bit position i of each word
     67         uint8_t input = ((*r0 >> i) & 1) |
     68                        (((*r1 >> i) & 1) << 1) |
     69                        (((*r2 >> i) & 1) << 2) |
     70                        (((*r3 >> i) & 1) << 3);
     71         // Apply S-box lookup
     72         uint8_t output = SBOX[box][input];
     73         // Distribute output bits back to words
     74         out0 |= ((output >> 0) & 1) << i;
     75         out1 |= ((output >> 1) & 1) << i;
     76         out2 |= ((output >> 2) & 1) << i;
     77         out3 |= ((output >> 3) & 1) << i;
     78     }
     79     *r0 = out0; *r1 = out1; *r2 = out2; *r3 = out3;
     80 }
     81 
     82 // Apply inverse S-box to 4 words (bitsliced)
     83 static inline void sbox_inv(int box, uint32_t *r0, uint32_t *r1, uint32_t *r2, uint32_t *r3) {
     84     uint32_t out0 = 0, out1 = 0, out2 = 0, out3 = 0;
     85     for (int i = 0; i < 32; i++) {
     86         uint8_t input = ((*r0 >> i) & 1) |
     87                        (((*r1 >> i) & 1) << 1) |
     88                        (((*r2 >> i) & 1) << 2) |
     89                        (((*r3 >> i) & 1) << 3);
     90         uint8_t output = SBOX_INV[box][input];
     91         out0 |= ((output >> 0) & 1) << i;
     92         out1 |= ((output >> 1) & 1) << i;
     93         out2 |= ((output >> 2) & 1) << i;
     94         out3 |= ((output >> 3) & 1) << i;
     95     }
     96     *r0 = out0; *r1 = out1; *r2 = out2; *r3 = out3;
     97 }
     98 
     99 // Linear transformation
    100 static inline void linear_transform(uint32_t *r0, uint32_t *r1, uint32_t *r2, uint32_t *r3) {
    101     uint32_t t0 = *r0, t1 = *r1, t2 = *r2, t3 = *r3;
    102     t0 = ((t0 << 13) | (t0 >> 19));
    103     t2 = ((t2 << 3) | (t2 >> 29));
    104     t1 = t1 ^ t0 ^ t2;
    105     t3 = t3 ^ t2 ^ (t0 << 3);
    106     t1 = ((t1 << 1) | (t1 >> 31));
    107     t3 = ((t3 << 7) | (t3 >> 25));
    108     t0 = t0 ^ t1 ^ t3;
    109     t2 = t2 ^ t3 ^ (t1 << 7);
    110     t0 = ((t0 << 5) | (t0 >> 27));
    111     t2 = ((t2 << 22) | (t2 >> 10));
    112     *r0 = t0; *r1 = t1; *r2 = t2; *r3 = t3;
    113 }
    114 
    115 // Inverse linear transformation
    116 static inline void linear_transform_inv(uint32_t *r0, uint32_t *r1, uint32_t *r2, uint32_t *r3) {
    117     uint32_t t0 = *r0, t1 = *r1, t2 = *r2, t3 = *r3;
    118     t2 = ((t2 >> 22) | (t2 << 10));
    119     t0 = ((t0 >> 5) | (t0 << 27));
    120     t2 = t2 ^ t3 ^ (t1 << 7);
    121     t0 = t0 ^ t1 ^ t3;
    122     t3 = ((t3 >> 7) | (t3 << 25));
    123     t1 = ((t1 >> 1) | (t1 << 31));
    124     t3 = t3 ^ t2 ^ (t0 << 3);
    125     t1 = t1 ^ t0 ^ t2;
    126     t2 = ((t2 >> 3) | (t2 << 29));
    127     t0 = ((t0 >> 13) | (t0 << 19));
    128     *r0 = t0; *r1 = t1; *r2 = t2; *r3 = t3;
    129 }
    130 
    131 // Serpent key expansion function
    132 static void serpent_key_expansion(const uint8_t *key, serpent_key_schedule *ks) {
    133     uint32_t w[140];  // Prekeys
    134     int i;
    135 
    136     // Initialize w from key (256-bit key = 8 words)
    137     for (i = 0; i < 8; i++) {
    138         w[i] = ((uint32_t)key[i*4]) |
    139                ((uint32_t)key[i*4+1] << 8) |
    140                ((uint32_t)key[i*4+2] << 16) |
    141                ((uint32_t)key[i*4+3] << 24);
    142     }
    143 
    144     // Generate prekeys
    145     for (i = 8; i < 16; i++) {
    146         w[i] = 0;
    147     }
    148 
    149     for (i = 8; i < 140; i++) {
    150         w[i] = w[i-8] ^ w[i-5] ^ w[i-3] ^ w[i-1] ^ PHI ^ (i-8);
    151         w[i] = (w[i] << 11) | (w[i] >> 21);
    152     }
    153 
    154     // Apply S-boxes to generate subkeys
    155     // Key schedule uses S-boxes in order: S3, S2, S1, S0, S7, S6, S5, S4
    156     static const int ks_sbox_order[8] = {3, 2, 1, 0, 7, 6, 5, 4};
    157     for (i = 0; i < 33; i++) {
    158         uint32_t r0 = w[4*i+8];
    159         uint32_t r1 = w[4*i+9];
    160         uint32_t r2 = w[4*i+10];
    161         uint32_t r3 = w[4*i+11];
    162 
    163         sbox(ks_sbox_order[i % 8], &r0, &r1, &r2, &r3);
    164 
    165         ks->subkeys[i][0] = r0;
    166         ks->subkeys[i][1] = r1;
    167         ks->subkeys[i][2] = r2;
    168         ks->subkeys[i][3] = r3;
    169     }
    170 }
    171 
    172 // Serpent encryption of a single block
    173 static void serpent_encrypt_block(const uint32_t *input, uint32_t *output,
    174                                    const serpent_key_schedule *ks) {
    175     uint32_t r0 = input[0];
    176     uint32_t r1 = input[1];
    177     uint32_t r2 = input[2];
    178     uint32_t r3 = input[3];
    179 
    180     // Initial permutation (IP) - XOR with first subkey
    181     r0 ^= ks->subkeys[0][0];
    182     r1 ^= ks->subkeys[0][1];
    183     r2 ^= ks->subkeys[0][2];
    184     r3 ^= ks->subkeys[0][3];
    185 
    186     // 31 rounds
    187     for (int i = 0; i < 31; i++) {
    188         sbox(i % 8, &r0, &r1, &r2, &r3);
    189         linear_transform(&r0, &r1, &r2, &r3);
    190 
    191         r0 ^= ks->subkeys[i+1][0];
    192         r1 ^= ks->subkeys[i+1][1];
    193         r2 ^= ks->subkeys[i+1][2];
    194         r3 ^= ks->subkeys[i+1][3];
    195     }
    196 
    197     // Final round (no linear transform)
    198     sbox(7, &r0, &r1, &r2, &r3);
    199 
    200     r0 ^= ks->subkeys[32][0];
    201     r1 ^= ks->subkeys[32][1];
    202     r2 ^= ks->subkeys[32][2];
    203     r3 ^= ks->subkeys[32][3];
    204 
    205     output[0] = r0;
    206     output[1] = r1;
    207     output[2] = r2;
    208     output[3] = r3;
    209 }
    210 
    211 // Serpent decryption of a single block
    212 static void serpent_decrypt_block(const uint32_t *input, uint32_t *output,
    213                                    const serpent_key_schedule *ks) {
    214     uint32_t r0 = input[0];
    215     uint32_t r1 = input[1];
    216     uint32_t r2 = input[2];
    217     uint32_t r3 = input[3];
    218 
    219     // XOR with last subkey
    220     r0 ^= ks->subkeys[32][0];
    221     r1 ^= ks->subkeys[32][1];
    222     r2 ^= ks->subkeys[32][2];
    223     r3 ^= ks->subkeys[32][3];
    224 
    225     // Inverse final round
    226     sbox_inv(7, &r0, &r1, &r2, &r3);
    227 
    228     // 31 inverse rounds
    229     for (int i = 31; i > 0; i--) {
    230         r0 ^= ks->subkeys[i][0];
    231         r1 ^= ks->subkeys[i][1];
    232         r2 ^= ks->subkeys[i][2];
    233         r3 ^= ks->subkeys[i][3];
    234 
    235         linear_transform_inv(&r0, &r1, &r2, &r3);
    236         sbox_inv((i-1) % 8, &r0, &r1, &r2, &r3);
    237     }
    238 
    239     r0 ^= ks->subkeys[0][0];
    240     r1 ^= ks->subkeys[0][1];
    241     r2 ^= ks->subkeys[0][2];
    242     r3 ^= ks->subkeys[0][3];
    243 
    244     output[0] = r0;
    245     output[1] = r1;
    246     output[2] = r2;
    247     output[3] = r3;
    248 }
    249 
    250 // Utility functions for GCM
    251 static inline __m128i reverse_bytes(__m128i x) {
    252     const __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
    253     return _mm_shuffle_epi8(x, mask);
    254 }
    255 
    256 // GHASH multiplication using PCLMULQDQ (same as AES-GCM)
    257 static inline __m128i gf_mul(__m128i a, __m128i b) {
    258     __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    259     __m128i tmp8, tmp9;
    260 
    261     tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
    262     tmp6 = _mm_clmulepi64_si128(a, b, 0x11);
    263 
    264     tmp4 = _mm_shuffle_epi32(a, 78);
    265     tmp5 = _mm_shuffle_epi32(b, 78);
    266     tmp4 = _mm_xor_si128(tmp4, a);
    267     tmp5 = _mm_xor_si128(tmp5, b);
    268 
    269     tmp4 = _mm_clmulepi64_si128(tmp4, tmp5, 0x00);
    270     tmp4 = _mm_xor_si128(tmp4, tmp3);
    271     tmp4 = _mm_xor_si128(tmp4, tmp6);
    272 
    273     tmp5 = _mm_slli_si128(tmp4, 8);
    274     tmp4 = _mm_srli_si128(tmp4, 8);
    275     tmp3 = _mm_xor_si128(tmp3, tmp5);
    276     tmp6 = _mm_xor_si128(tmp6, tmp4);
    277 
    278     // Reduction
    279     tmp7 = _mm_srli_epi32(tmp3, 31);
    280     tmp8 = _mm_srli_epi32(tmp6, 31);
    281     tmp3 = _mm_slli_epi32(tmp3, 1);
    282     tmp6 = _mm_slli_epi32(tmp6, 1);
    283 
    284     tmp9 = _mm_srli_si128(tmp7, 12);
    285     tmp8 = _mm_slli_si128(tmp8, 4);
    286     tmp7 = _mm_slli_si128(tmp7, 4);
    287     tmp3 = _mm_or_si128(tmp3, tmp7);
    288     tmp6 = _mm_or_si128(tmp6, tmp8);
    289     tmp6 = _mm_or_si128(tmp6, tmp9);
    290 
    291     tmp7 = _mm_slli_epi32(tmp3, 31);
    292     tmp8 = _mm_slli_epi32(tmp3, 30);
    293     tmp9 = _mm_slli_epi32(tmp3, 25);
    294 
    295     tmp7 = _mm_xor_si128(tmp7, tmp8);
    296     tmp7 = _mm_xor_si128(tmp7, tmp9);
    297     tmp8 = _mm_srli_si128(tmp7, 4);
    298     tmp7 = _mm_slli_si128(tmp7, 12);
    299     tmp3 = _mm_xor_si128(tmp3, tmp7);
    300 
    301     tmp2 = _mm_srli_epi32(tmp3, 1);
    302     tmp4 = _mm_srli_epi32(tmp3, 2);
    303     tmp5 = _mm_srli_epi32(tmp3, 7);
    304     tmp2 = _mm_xor_si128(tmp2, tmp4);
    305     tmp2 = _mm_xor_si128(tmp2, tmp5);
    306     tmp2 = _mm_xor_si128(tmp2, tmp8);
    307     tmp3 = _mm_xor_si128(tmp3, tmp2);
    308     tmp6 = _mm_xor_si128(tmp6, tmp3);
    309 
    310     return tmp6;
    311 }
    312 
    313 // Initialize Serpent-GCM context
    314 int serpent_gcm_init(serpent_gcm_context *ctx, const uint8_t *key) {
    315     if (!has_pclmulqdq) {
    316         fprintf(stderr, "Error: CPU does not support PCLMULQDQ\n");
    317         return -1;
    318     }
    319 
    320     // Generate key schedule
    321     serpent_key_expansion(key, &ctx->key_schedule);
    322 
    323     // Compute H = E(K, 0^128)
    324     uint32_t zero[4] = {0, 0, 0, 0};
    325     uint32_t h_block[4];
    326     serpent_encrypt_block(zero, h_block, &ctx->key_schedule);
    327 
    328     // Convert to __m128i and reverse bytes for GCM
    329     ctx->H = _mm_set_epi32(h_block[3], h_block[2], h_block[1], h_block[0]);
    330     ctx->H = reverse_bytes(ctx->H);
    331 
    332     // Precompute powers of H
    333     ctx->H_powers[0] = ctx->H;
    334     for (int i = 1; i < 8; i++) {
    335         ctx->H_powers[i] = gf_mul(ctx->H_powers[i-1], ctx->H);
    336     }
    337 
    338     return 0;
    339 }
    340 
    341 // GHASH computation
    342 static void ghash(const serpent_gcm_context *ctx, const uint8_t *aad, size_t aad_len,
    343                   const uint8_t *ciphertext, size_t ct_len, __m128i *tag) {
    344     __m128i hash = _mm_setzero_si128();
    345     size_t i;
    346 
    347     // Process AAD
    348     for (i = 0; i + 16 <= aad_len; i += 16) {
    349         __m128i block = _mm_loadu_si128((__m128i*)(aad + i));
    350         block = reverse_bytes(block);
    351         hash = _mm_xor_si128(hash, block);
    352         hash = gf_mul(hash, ctx->H);
    353     }
    354 
    355     if (i < aad_len) {
    356         uint8_t temp[16] = {0};
    357         memcpy(temp, aad + i, aad_len - i);
    358         __m128i block = _mm_loadu_si128((__m128i*)temp);
    359         block = reverse_bytes(block);
    360         hash = _mm_xor_si128(hash, block);
    361         hash = gf_mul(hash, ctx->H);
    362     }
    363 
    364     // Process ciphertext
    365     for (i = 0; i + 16 <= ct_len; i += 16) {
    366         __m128i block = _mm_loadu_si128((__m128i*)(ciphertext + i));
    367         block = reverse_bytes(block);
    368         hash = _mm_xor_si128(hash, block);
    369         hash = gf_mul(hash, ctx->H);
    370     }
    371 
    372     if (i < ct_len) {
    373         uint8_t temp[16] = {0};
    374         memcpy(temp, ciphertext + i, ct_len - i);
    375         __m128i block = _mm_loadu_si128((__m128i*)temp);
    376         block = reverse_bytes(block);
    377         hash = _mm_xor_si128(hash, block);
    378         hash = gf_mul(hash, ctx->H);
    379     }
    380 
    381     // Process length block
    382     uint64_t aad_bits = aad_len * 8;
    383     uint64_t ct_bits = ct_len * 8;
    384     __m128i len_block = _mm_set_epi64x(ct_bits, aad_bits);
    385     len_block = reverse_bytes(len_block);
    386     hash = _mm_xor_si128(hash, len_block);
    387     hash = gf_mul(hash, ctx->H);
    388 
    389     *tag = reverse_bytes(hash);
    390 }
    391 
    392 // Increment counter
    393 static inline __m128i inc_counter(__m128i counter) {
    394     uint32_t ctr = _mm_extract_epi32(counter, 0);
    395     ctr++;
    396     counter = _mm_insert_epi32(counter, ctr, 0);
    397     return counter;
    398 }
    399 
    400 // Serpent-GCM Encryption
    401 int serpent_gcm_encrypt(serpent_gcm_context *ctx,
    402                         const uint8_t *iv, size_t iv_len,
    403                         const uint8_t *aad, size_t aad_len,
    404                         const uint8_t *plaintext, size_t pt_len,
    405                         uint8_t *ciphertext,
    406                         uint8_t *tag, size_t tag_len) {
    407     if (tag_len > 16) return -1;
    408 
    409     // Prepare initial counter
    410     __m128i counter;
    411     if (iv_len == 12) {
    412         counter = _mm_set_epi32(1,
    413                                  ((uint32_t*)iv)[2],
    414                                  ((uint32_t*)iv)[1],
    415                                  ((uint32_t*)iv)[0]);
    416     } else {
    417         // Non-standard IV: use GHASH
    418         __m128i hash = _mm_setzero_si128();
    419         size_t i;
    420         for (i = 0; i + 16 <= iv_len; i += 16) {
    421             __m128i block = _mm_loadu_si128((__m128i*)(iv + i));
    422             block = reverse_bytes(block);
    423             hash = _mm_xor_si128(hash, block);
    424             hash = gf_mul(hash, ctx->H);
    425         }
    426         if (i < iv_len) {
    427             uint8_t temp[16] = {0};
    428             memcpy(temp, iv + i, iv_len - i);
    429             __m128i block = _mm_loadu_si128((__m128i*)temp);
    430             block = reverse_bytes(block);
    431             hash = _mm_xor_si128(hash, block);
    432             hash = gf_mul(hash, ctx->H);
    433         }
    434         uint64_t iv_bits = iv_len * 8;
    435         __m128i len_block = _mm_set_epi64x(iv_bits, 0);
    436         len_block = reverse_bytes(len_block);
    437         hash = _mm_xor_si128(hash, len_block);
    438         hash = gf_mul(hash, ctx->H);
    439         counter = reverse_bytes(hash);
    440     }
    441 
    442     __m128i J0 = counter;
    443 
    444     // Encrypt plaintext
    445     size_t i;
    446     for (i = 0; i + 16 <= pt_len; i += 16) {
    447         counter = inc_counter(counter);
    448 
    449         // Extract counter to uint32_t array
    450         uint32_t ctr_block[4];
    451         _mm_storeu_si128((__m128i*)ctr_block, counter);
    452 
    453         // Encrypt counter with Serpent
    454         uint32_t keystream[4];
    455         serpent_encrypt_block(ctr_block, keystream, &ctx->key_schedule);
    456 
    457         // XOR with plaintext
    458         uint32_t *pt = (uint32_t*)(plaintext + i);
    459         uint32_t *ct = (uint32_t*)(ciphertext + i);
    460         ct[0] = pt[0] ^ keystream[0];
    461         ct[1] = pt[1] ^ keystream[1];
    462         ct[2] = pt[2] ^ keystream[2];
    463         ct[3] = pt[3] ^ keystream[3];
    464     }
    465 
    466     // Handle remaining bytes
    467     if (i < pt_len) {
    468         counter = inc_counter(counter);
    469         uint32_t ctr_block[4];
    470         _mm_storeu_si128((__m128i*)ctr_block, counter);
    471         uint32_t keystream[4];
    472         serpent_encrypt_block(ctr_block, keystream, &ctx->key_schedule);
    473 
    474         uint8_t *ks_bytes = (uint8_t*)keystream;
    475         for (size_t j = 0; j < pt_len - i; j++) {
    476             ciphertext[i + j] = plaintext[i + j] ^ ks_bytes[j];
    477         }
    478     }
    479 
    480     // Compute authentication tag
    481     __m128i auth_tag;
    482     ghash(ctx, aad, aad_len, ciphertext, pt_len, &auth_tag);
    483 
    484     // Encrypt tag with J0
    485     uint32_t j0_block[4];
    486     _mm_storeu_si128((__m128i*)j0_block, J0);
    487     uint32_t j0_keystream[4];
    488     serpent_encrypt_block(j0_block, j0_keystream, &ctx->key_schedule);
    489 
    490     __m128i j0_ks = _mm_set_epi32(j0_keystream[3], j0_keystream[2],
    491                                    j0_keystream[1], j0_keystream[0]);
    492     auth_tag = _mm_xor_si128(auth_tag, j0_ks);
    493 
    494     _mm_storeu_si128((__m128i*)tag, auth_tag);
    495 
    496     return 0;
    497 }
    498 
    499 // Serpent-GCM Decryption
    500 int serpent_gcm_decrypt(serpent_gcm_context *ctx,
    501                         const uint8_t *iv, size_t iv_len,
    502                         const uint8_t *aad, size_t aad_len,
    503                         const uint8_t *ciphertext, size_t ct_len,
    504                         const uint8_t *tag, size_t tag_len,
    505                         uint8_t *plaintext) {
    506     if (tag_len > 16) return -1;
    507 
    508     // Prepare initial counter
    509     __m128i counter;
    510     if (iv_len == 12) {
    511         counter = _mm_set_epi32(1,
    512                                  ((uint32_t*)iv)[2],
    513                                  ((uint32_t*)iv)[1],
    514                                  ((uint32_t*)iv)[0]);
    515     } else {
    516         __m128i hash = _mm_setzero_si128();
    517         size_t i;
    518         for (i = 0; i + 16 <= iv_len; i += 16) {
    519             __m128i block = _mm_loadu_si128((__m128i*)(iv + i));
    520             block = reverse_bytes(block);
    521             hash = _mm_xor_si128(hash, block);
    522             hash = gf_mul(hash, ctx->H);
    523         }
    524         if (i < iv_len) {
    525             uint8_t temp[16] = {0};
    526             memcpy(temp, iv + i, iv_len - i);
    527             __m128i block = _mm_loadu_si128((__m128i*)temp);
    528             block = reverse_bytes(block);
    529             hash = _mm_xor_si128(hash, block);
    530             hash = gf_mul(hash, ctx->H);
    531         }
    532         uint64_t iv_bits = iv_len * 8;
    533         __m128i len_block = _mm_set_epi64x(iv_bits, 0);
    534         len_block = reverse_bytes(len_block);
    535         hash = _mm_xor_si128(hash, len_block);
    536         hash = gf_mul(hash, ctx->H);
    537         counter = reverse_bytes(hash);
    538     }
    539 
    540     __m128i J0 = counter;
    541 
    542     // Verify authentication tag
    543     __m128i computed_tag;
    544     ghash(ctx, aad, aad_len, ciphertext, ct_len, &computed_tag);
    545 
    546     uint32_t j0_block[4];
    547     _mm_storeu_si128((__m128i*)j0_block, J0);
    548     uint32_t j0_keystream[4];
    549     serpent_encrypt_block(j0_block, j0_keystream, &ctx->key_schedule);
    550 
    551     __m128i j0_ks = _mm_set_epi32(j0_keystream[3], j0_keystream[2],
    552                                    j0_keystream[1], j0_keystream[0]);
    553     computed_tag = _mm_xor_si128(computed_tag, j0_ks);
    554 
    555     // Constant-time comparison
    556     uint8_t computed_tag_bytes[16];
    557     _mm_storeu_si128((__m128i*)computed_tag_bytes, computed_tag);
    558 
    559     int tag_match = 1;
    560     for (size_t i = 0; i < tag_len; i++) {
    561         tag_match &= (computed_tag_bytes[i] == tag[i]);
    562     }
    563 
    564     if (!tag_match) return -1;
    565 
    566     // Decrypt ciphertext
    567     size_t i;
    568     for (i = 0; i + 16 <= ct_len; i += 16) {
    569         counter = inc_counter(counter);
    570 
    571         uint32_t ctr_block[4];
    572         _mm_storeu_si128((__m128i*)ctr_block, counter);
    573 
    574         uint32_t keystream[4];
    575         serpent_encrypt_block(ctr_block, keystream, &ctx->key_schedule);
    576 
    577         uint32_t *ct = (uint32_t*)(ciphertext + i);
    578         uint32_t *pt = (uint32_t*)(plaintext + i);
    579         pt[0] = ct[0] ^ keystream[0];
    580         pt[1] = ct[1] ^ keystream[1];
    581         pt[2] = ct[2] ^ keystream[2];
    582         pt[3] = ct[3] ^ keystream[3];
    583     }
    584 
    585     if (i < ct_len) {
    586         counter = inc_counter(counter);
    587         uint32_t ctr_block[4];
    588         _mm_storeu_si128((__m128i*)ctr_block, counter);
    589         uint32_t keystream[4];
    590         serpent_encrypt_block(ctr_block, keystream, &ctx->key_schedule);
    591 
    592         uint8_t *ks_bytes = (uint8_t*)keystream;
    593         for (size_t j = 0; j < ct_len - i; j++) {
    594             plaintext[i + j] = ciphertext[i + j] ^ ks_bytes[j];
    595         }
    596     }
    597 
    598     return 0;
    599 }
    600 
    601 /**
    602  * Clean up Serpent-256-GCM context
    603  * Zeros all sensitive key material
    604  */
    605 void serpent_gcm_cleanup(serpent_gcm_context *ctx) {
    606     if (ctx == NULL) return;
    607 
    608     // Zero all sensitive data using volatile to prevent compiler optimization
    609     volatile uint8_t *p = (volatile uint8_t *)ctx;
    610     size_t n = sizeof(serpent_gcm_context);
    611     while (n--) {
    612         *p++ = 0;
    613     }
    614 }