luajitos

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

Twofish-256-GCM.c (18158B)


      1 /*
      2  * Twofish-256-GCM Implementation
      3  *
      4  * Based on the Twofish specification by Bruce Schneier et al.
      5  * Reference: https://www.schneier.com/academic/twofish/
      6  */
      7 
      8 #include "Twofish-256-GCM.h"
      9 #include <stdlib.h>
     10 #include <string.h>
     11 
     12 /* Twofish MDS matrix and RS matrix constants */
     13 static const uint8_t Q0[256] = {
     14     0xA9, 0x67, 0xB3, 0xE8, 0x04, 0xFD, 0xA3, 0x76, 0x9A, 0x92, 0x80, 0x78, 0xE4, 0xDD, 0xD1, 0x38,
     15     0x0D, 0xC6, 0x35, 0x98, 0x18, 0xF7, 0xEC, 0x6C, 0x43, 0x75, 0x37, 0x26, 0xFA, 0x13, 0x94, 0x48,
     16     0xF2, 0xD0, 0x8B, 0x30, 0x84, 0x54, 0xDF, 0x23, 0x19, 0x5B, 0x3D, 0x59, 0xF3, 0xAE, 0xA2, 0x82,
     17     0x63, 0x01, 0x83, 0x2E, 0xD9, 0x51, 0x9B, 0x7C, 0xA6, 0xEB, 0xA5, 0xBE, 0x16, 0x0C, 0xE3, 0x61,
     18     0xC0, 0x8C, 0x3A, 0xF5, 0x73, 0x2C, 0x25, 0x0B, 0xBB, 0x4E, 0x89, 0x6B, 0x53, 0x6A, 0xB4, 0xF1,
     19     0xE1, 0xE6, 0xBD, 0x45, 0xE2, 0xF4, 0xB6, 0x66, 0xCC, 0x95, 0x03, 0x56, 0xD4, 0x1C, 0x1E, 0xD7,
     20     0xFB, 0xC3, 0x8E, 0xB5, 0xE9, 0xCF, 0xBF, 0xBA, 0xEA, 0x77, 0x39, 0xAF, 0x33, 0xC9, 0x62, 0x71,
     21     0x81, 0x79, 0x09, 0xAD, 0x24, 0xCD, 0xF9, 0xD8, 0xE5, 0xC5, 0xB9, 0x4D, 0x44, 0x08, 0x86, 0xE7,
     22     0xA1, 0x1D, 0xAA, 0xED, 0x06, 0x70, 0xB2, 0xD2, 0x41, 0x7B, 0xA0, 0x11, 0x31, 0xC2, 0x27, 0x90,
     23     0x20, 0xF6, 0x60, 0xFF, 0x96, 0x5C, 0xB1, 0xAB, 0x9E, 0x9C, 0x52, 0x1B, 0x5F, 0x93, 0x0A, 0xEF,
     24     0x91, 0x85, 0x49, 0xEE, 0x2D, 0x4F, 0x8F, 0x3B, 0x47, 0x87, 0x6D, 0x46, 0xD6, 0x3E, 0x69, 0x64,
     25     0x2A, 0xCE, 0xCB, 0x2F, 0xFC, 0x97, 0x05, 0x7A, 0xAC, 0x7F, 0xD5, 0x1A, 0x4B, 0x0E, 0xA7, 0x5A,
     26     0x28, 0x14, 0x3F, 0x29, 0x88, 0x3C, 0x4C, 0x02, 0xB8, 0xDA, 0xB0, 0x17, 0x55, 0x1F, 0x8A, 0x7D,
     27     0x57, 0xC7, 0x8D, 0x74, 0xB7, 0xC4, 0x9F, 0x72, 0x7E, 0x15, 0x22, 0x12, 0x58, 0x07, 0x99, 0x34,
     28     0x6E, 0x50, 0xDE, 0x68, 0x65, 0xBC, 0xDB, 0xF8, 0xC8, 0xA8, 0x2B, 0x40, 0xDC, 0xFE, 0x32, 0xA4,
     29     0xCA, 0x10, 0x21, 0xF0, 0xD3, 0x5D, 0x0F, 0x00, 0x6F, 0x9D, 0x36, 0x42, 0x4A, 0x5E, 0xC1, 0xE0
     30 };
     31 
     32 static const uint8_t Q1[256] = {
     33     0x75, 0xF3, 0xC6, 0xF4, 0xDB, 0x7B, 0xFB, 0xC8, 0x4A, 0xD3, 0xE6, 0x6B, 0x45, 0x7D, 0xE8, 0x4B,
     34     0xD6, 0x32, 0xD8, 0xFD, 0x37, 0x71, 0xF1, 0xE1, 0x30, 0x0F, 0xF8, 0x1B, 0x87, 0xFA, 0x06, 0x3F,
     35     0x5E, 0xBA, 0xAE, 0x5B, 0x8A, 0x00, 0xBC, 0x9D, 0x6D, 0xC1, 0xB1, 0x0E, 0x80, 0x5D, 0xD2, 0xD5,
     36     0xA0, 0x84, 0x07, 0x14, 0xB5, 0x90, 0x2C, 0xA3, 0xB2, 0x73, 0x4C, 0x54, 0x92, 0x74, 0x36, 0x51,
     37     0x38, 0xB0, 0xBD, 0x5A, 0xFC, 0x60, 0x62, 0x96, 0x6C, 0x42, 0xF7, 0x10, 0x7C, 0x28, 0x27, 0x8C,
     38     0x13, 0x95, 0x9C, 0xC7, 0x24, 0x46, 0x3B, 0x70, 0xCA, 0xE3, 0x85, 0xCB, 0x11, 0xD0, 0x93, 0xB8,
     39     0xA6, 0x83, 0x20, 0xFF, 0x9F, 0x77, 0xC3, 0xCC, 0x03, 0x6F, 0x08, 0xBF, 0x40, 0xE7, 0x2B, 0xE2,
     40     0x79, 0x0C, 0xAA, 0x82, 0x41, 0x3A, 0xEA, 0xB9, 0xE4, 0x9A, 0xA4, 0x97, 0x7E, 0xDA, 0x7A, 0x17,
     41     0x66, 0x94, 0xA1, 0x1D, 0x3D, 0xF0, 0xDE, 0xB3, 0x0B, 0x72, 0xA7, 0x1C, 0xEF, 0xD1, 0x53, 0x3E,
     42     0x8F, 0x33, 0x26, 0x5F, 0xEC, 0x76, 0x2A, 0x49, 0x81, 0x88, 0xEE, 0x21, 0xC4, 0x1A, 0xEB, 0xD9,
     43     0xC5, 0x39, 0x99, 0xCD, 0xAD, 0x31, 0x8B, 0x01, 0x18, 0x23, 0xDD, 0x1F, 0x4E, 0x2D, 0xF9, 0x48,
     44     0x4F, 0xF2, 0x65, 0x8E, 0x78, 0x5C, 0x58, 0x19, 0x8D, 0xE5, 0x98, 0x57, 0x67, 0x7F, 0x05, 0x64,
     45     0xAF, 0x63, 0xB6, 0xFE, 0xF5, 0xB7, 0x3C, 0xA5, 0xCE, 0xE9, 0x68, 0x44, 0xE0, 0x4D, 0x43, 0x69,
     46     0x29, 0x2E, 0xAC, 0x15, 0x59, 0xA8, 0x0A, 0x9E, 0x6E, 0x47, 0xDF, 0x34, 0x35, 0x6A, 0xCF, 0xDC,
     47     0x22, 0xC9, 0xC0, 0x9B, 0x89, 0xD4, 0xED, 0xAB, 0x12, 0xA2, 0x0D, 0x52, 0xBB, 0x02, 0x2F, 0xA9,
     48     0xD7, 0x61, 0x1E, 0xB4, 0x50, 0x04, 0xF6, 0xC2, 0x16, 0x25, 0x86, 0x56, 0x55, 0x09, 0xBE, 0x91
     49 };
     50 
     51 /* MDS multiplication in GF(2^8) */
     52 static inline uint8_t gf_mult(uint8_t a, uint8_t b, uint8_t poly) {
     53     uint8_t result = 0;
     54     for (int i = 0; i < 8; i++) {
     55         if (b & 1) result ^= a;
     56         uint8_t hi_bit = a & 0x80;
     57         a <<= 1;
     58         if (hi_bit) a ^= poly;
     59         b >>= 1;
     60     }
     61     return result;
     62 }
     63 
     64 /* MDS matrix multiply */
     65 static inline uint32_t mds_multiply(uint32_t x) {
     66     uint8_t b0 = x & 0xFF;
     67     uint8_t b1 = (x >> 8) & 0xFF;
     68     uint8_t b2 = (x >> 16) & 0xFF;
     69     uint8_t b3 = (x >> 24) & 0xFF;
     70 
     71     /* MDS matrix coefficients */
     72     uint8_t y0 = gf_mult(b0, 0x01, 0x69) ^ gf_mult(b1, 0xEF, 0x69) ^
     73                  gf_mult(b2, 0x5B, 0x69) ^ gf_mult(b3, 0x5B, 0x69);
     74     uint8_t y1 = gf_mult(b0, 0x5B, 0x69) ^ gf_mult(b1, 0xEF, 0x69) ^
     75                  gf_mult(b2, 0xEF, 0x69) ^ gf_mult(b3, 0x01, 0x69);
     76     uint8_t y2 = gf_mult(b0, 0xEF, 0x69) ^ gf_mult(b1, 0x5B, 0x69) ^
     77                  gf_mult(b2, 0x01, 0x69) ^ gf_mult(b3, 0xEF, 0x69);
     78     uint8_t y3 = gf_mult(b0, 0xEF, 0x69) ^ gf_mult(b1, 0x01, 0x69) ^
     79                  gf_mult(b2, 0xEF, 0x69) ^ gf_mult(b3, 0x5B, 0x69);
     80 
     81     return y0 | (y1 << 8) | (y2 << 16) | (y3 << 24);
     82 }
     83 
     84 /* g function for key schedule */
     85 static uint32_t g_func(uint32_t x, const uint32_t *S, int k) {
     86     uint8_t b0 = x & 0xFF;
     87     uint8_t b1 = (x >> 8) & 0xFF;
     88     uint8_t b2 = (x >> 16) & 0xFF;
     89     uint8_t b3 = (x >> 24) & 0xFF;
     90 
     91     if (k == 4) {
     92         b0 = Q1[b0] ^ ((S[3] >> 0) & 0xFF);
     93         b1 = Q0[b1] ^ ((S[3] >> 8) & 0xFF);
     94         b2 = Q0[b2] ^ ((S[3] >> 16) & 0xFF);
     95         b3 = Q1[b3] ^ ((S[3] >> 24) & 0xFF);
     96     }
     97     if (k >= 3) {
     98         b0 = Q1[b0] ^ ((S[2] >> 0) & 0xFF);
     99         b1 = Q1[b1] ^ ((S[2] >> 8) & 0xFF);
    100         b2 = Q0[b2] ^ ((S[2] >> 16) & 0xFF);
    101         b3 = Q0[b3] ^ ((S[2] >> 24) & 0xFF);
    102     }
    103 
    104     b0 = Q1[Q0[Q0[b0] ^ ((S[1] >> 0) & 0xFF)] ^ ((S[0] >> 0) & 0xFF)];
    105     b1 = Q0[Q0[Q1[b1] ^ ((S[1] >> 8) & 0xFF)] ^ ((S[0] >> 8) & 0xFF)];
    106     b2 = Q1[Q1[Q0[b2] ^ ((S[1] >> 16) & 0xFF)] ^ ((S[0] >> 16) & 0xFF)];
    107     b3 = Q0[Q1[Q1[b3] ^ ((S[1] >> 24) & 0xFF)] ^ ((S[0] >> 24) & 0xFF)];
    108 
    109     return mds_multiply(b0 | (b1 << 8) | (b2 << 16) | (b3 << 24));
    110 }
    111 
    112 /* Twofish key schedule */
    113 static void twofish_setup_key(twofish_key_schedule *ks, const uint8_t *key) {
    114     uint32_t M[8];
    115     uint32_t S[4];
    116 
    117     /* Split key into 32-bit words */
    118     for (int i = 0; i < 8; i++) {
    119         M[i] = ((uint32_t)key[i*4]) |
    120                ((uint32_t)key[i*4+1] << 8) |
    121                ((uint32_t)key[i*4+2] << 16) |
    122                ((uint32_t)key[i*4+3] << 24);
    123     }
    124 
    125     /* Generate S-box keys */
    126     for (int i = 0; i < 4; i++) {
    127         S[i] = M[2*i] ^ M[2*i+1];
    128     }
    129 
    130     /* Generate subkeys */
    131     for (int i = 0; i < 20; i++) {
    132         uint32_t A = g_func(2*i * 0x01010101, S, 4);
    133         uint32_t B = g_func((2*i+1) * 0x01010101, S, 4);
    134         B = (B << 8) | (B >> 24);
    135         ks->K[2*i] = A + B;
    136         ks->K[2*i+1] = ((A + 2*B) << 9) | ((A + 2*B) >> 23);
    137     }
    138 
    139     /* Build S-boxes */
    140     for (int i = 0; i < 256; i++) {
    141         for (int j = 0; j < 4; j++) {
    142             ks->S[j][i] = g_func(i, S, 4);
    143         }
    144     }
    145 }
    146 
    147 /* Twofish block encryption */
    148 static void twofish_encrypt_block(const twofish_key_schedule *ks,
    149                                    const uint8_t *in, uint8_t *out) {
    150     uint32_t R0, R1, R2, R3;
    151 
    152     /* Input whitening */
    153     R0 = ((uint32_t)in[0]) | ((uint32_t)in[1] << 8) |
    154          ((uint32_t)in[2] << 16) | ((uint32_t)in[3] << 24);
    155     R1 = ((uint32_t)in[4]) | ((uint32_t)in[5] << 8) |
    156          ((uint32_t)in[6] << 16) | ((uint32_t)in[7] << 24);
    157     R2 = ((uint32_t)in[8]) | ((uint32_t)in[9] << 8) |
    158          ((uint32_t)in[10] << 16) | ((uint32_t)in[11] << 24);
    159     R3 = ((uint32_t)in[12]) | ((uint32_t)in[13] << 8) |
    160          ((uint32_t)in[14] << 16) | ((uint32_t)in[15] << 24);
    161 
    162     R0 ^= ks->K[0];
    163     R1 ^= ks->K[1];
    164     R2 ^= ks->K[2];
    165     R3 ^= ks->K[3];
    166 
    167     /* 16 rounds */
    168     for (int r = 0; r < 16; r++) {
    169         uint32_t F0 = ks->S[0][R0 & 0xFF] ^ ks->S[1][(R0 >> 8) & 0xFF] ^
    170                       ks->S[2][(R0 >> 16) & 0xFF] ^ ks->S[3][(R0 >> 24) & 0xFF];
    171         uint32_t F1 = ks->S[0][R1 & 0xFF] ^ ks->S[1][(R1 >> 8) & 0xFF] ^
    172                       ks->S[2][(R1 >> 16) & 0xFF] ^ ks->S[3][(R1 >> 24) & 0xFF];
    173 
    174         R2 ^= (F0 + F1 + ks->K[8 + 2*r]);
    175         R2 = (R2 >> 1) | (R2 << 31);
    176 
    177         R3 = (R3 << 1) | (R3 >> 31);
    178         R3 ^= (F0 + 2*F1 + ks->K[8 + 2*r + 1]);
    179 
    180         /* Swap for next round */
    181         if (r < 15) {
    182             uint32_t tmp = R0;
    183             R0 = R2;
    184             R2 = tmp;
    185             tmp = R1;
    186             R1 = R3;
    187             R3 = tmp;
    188         }
    189     }
    190 
    191     /* Output whitening */
    192     R2 ^= ks->K[4];
    193     R3 ^= ks->K[5];
    194     R0 ^= ks->K[6];
    195     R1 ^= ks->K[7];
    196 
    197     out[0] = R2 & 0xFF;
    198     out[1] = (R2 >> 8) & 0xFF;
    199     out[2] = (R2 >> 16) & 0xFF;
    200     out[3] = (R2 >> 24) & 0xFF;
    201 
    202     out[4] = R3 & 0xFF;
    203     out[5] = (R3 >> 8) & 0xFF;
    204     out[6] = (R3 >> 16) & 0xFF;
    205     out[7] = (R3 >> 24) & 0xFF;
    206 
    207     out[8] = R0 & 0xFF;
    208     out[9] = (R0 >> 8) & 0xFF;
    209     out[10] = (R0 >> 16) & 0xFF;
    210     out[11] = (R0 >> 24) & 0xFF;
    211 
    212     out[12] = R1 & 0xFF;
    213     out[13] = (R1 >> 8) & 0xFF;
    214     out[14] = (R1 >> 16) & 0xFF;
    215     out[15] = (R1 >> 24) & 0xFF;
    216 }
    217 
    218 /* GCM helper functions */
    219 static inline __m128i reverse_bytes(__m128i x) {
    220     const __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
    221     return _mm_shuffle_epi8(x, mask);
    222 }
    223 
    224 static inline __m128i gf_mult_gcm(__m128i a, __m128i b) {
    225     __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    226     __m128i tmp8, tmp9, tmp10, tmp11, tmp12;
    227     __m128i XMMMASK = _mm_setr_epi32(0xffffffff, 0x0, 0x0, 0x0);
    228 
    229     tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
    230     tmp6 = _mm_clmulepi64_si128(a, b, 0x11);
    231     tmp4 = _mm_shuffle_epi32(a, 78);
    232     tmp5 = _mm_shuffle_epi32(b, 78);
    233     tmp4 = _mm_xor_si128(tmp4, a);
    234     tmp5 = _mm_xor_si128(tmp5, b);
    235     tmp4 = _mm_clmulepi64_si128(tmp4, tmp5, 0x00);
    236     tmp4 = _mm_xor_si128(tmp4, tmp3);
    237     tmp4 = _mm_xor_si128(tmp4, tmp6);
    238     tmp5 = _mm_slli_si128(tmp4, 8);
    239     tmp4 = _mm_srli_si128(tmp4, 8);
    240     tmp3 = _mm_xor_si128(tmp3, tmp5);
    241     tmp6 = _mm_xor_si128(tmp6, tmp4);
    242 
    243     tmp7 = _mm_srli_epi32(tmp3, 31);
    244     tmp8 = _mm_srli_epi32(tmp6, 31);
    245     tmp3 = _mm_slli_epi32(tmp3, 1);
    246     tmp6 = _mm_slli_epi32(tmp6, 1);
    247 
    248     tmp9 = _mm_srli_si128(tmp7, 12);
    249     tmp8 = _mm_slli_si128(tmp8, 4);
    250     tmp7 = _mm_slli_si128(tmp7, 4);
    251     tmp3 = _mm_or_si128(tmp3, tmp7);
    252     tmp6 = _mm_or_si128(tmp6, tmp8);
    253     tmp6 = _mm_or_si128(tmp6, tmp9);
    254 
    255     tmp7 = _mm_slli_epi32(tmp3, 31);
    256     tmp8 = _mm_slli_epi32(tmp3, 30);
    257     tmp9 = _mm_slli_epi32(tmp3, 25);
    258     tmp7 = _mm_xor_si128(tmp7, tmp8);
    259     tmp7 = _mm_xor_si128(tmp7, tmp9);
    260     tmp8 = _mm_srli_si128(tmp7, 4);
    261     tmp7 = _mm_slli_si128(tmp7, 12);
    262     tmp3 = _mm_xor_si128(tmp3, tmp7);
    263 
    264     tmp2 = _mm_srli_epi32(tmp3, 1);
    265     tmp4 = _mm_srli_epi32(tmp3, 2);
    266     tmp5 = _mm_srli_epi32(tmp3, 7);
    267     tmp2 = _mm_xor_si128(tmp2, tmp4);
    268     tmp2 = _mm_xor_si128(tmp2, tmp5);
    269     tmp2 = _mm_xor_si128(tmp2, tmp8);
    270     tmp3 = _mm_xor_si128(tmp3, tmp2);
    271     tmp6 = _mm_xor_si128(tmp6, tmp3);
    272 
    273     return tmp6;
    274 }
    275 
    276 /* GCM GHASH */
    277 static void gcm_ghash(const __m128i *H_powers, const uint8_t *data, size_t len, __m128i *result) {
    278     __m128i hash = _mm_setzero_si128();
    279 
    280     for (size_t i = 0; i < len / 16; i++) {
    281         __m128i block = _mm_loadu_si128((const __m128i*)(data + i*16));
    282         block = reverse_bytes(block);
    283         hash = _mm_xor_si128(hash, block);
    284         hash = gf_mult_gcm(hash, H_powers[0]);
    285     }
    286 
    287     /* Handle remaining bytes */
    288     if (len % 16 != 0) {
    289         uint8_t last_block[16] = {0};
    290         memcpy(last_block, data + (len / 16) * 16, len % 16);
    291         __m128i block = _mm_loadu_si128((const __m128i*)last_block);
    292         block = reverse_bytes(block);
    293         hash = _mm_xor_si128(hash, block);
    294         hash = gf_mult_gcm(hash, H_powers[0]);
    295     }
    296 
    297     *result = hash;
    298 }
    299 
    300 int twofish256_gcm_init(twofish256_gcm_context *ctx, const uint8_t *key) {
    301     if (!ctx || !key) return -1;
    302 
    303     /* Initialize Twofish key schedule */
    304     twofish_setup_key(&ctx->key_schedule, key);
    305 
    306     /* Compute H = E(K, 0^128) for GCM */
    307     uint8_t zero_block[16] = {0};
    308     uint8_t h_block[16];
    309     twofish_encrypt_block(&ctx->key_schedule, zero_block, h_block);
    310     ctx->H = _mm_loadu_si128((const __m128i*)h_block);
    311     ctx->H = reverse_bytes(ctx->H);
    312 
    313     /* Precompute powers of H */
    314     ctx->H_powers[0] = ctx->H;
    315     for (int i = 1; i < 8; i++) {
    316         ctx->H_powers[i] = gf_mult_gcm(ctx->H_powers[i-1], ctx->H);
    317     }
    318 
    319     return 0;
    320 }
    321 
    322 int twofish256_gcm_encrypt(
    323     twofish256_gcm_context *ctx,
    324     const uint8_t *iv, size_t iv_len,
    325     const uint8_t *aad, size_t aad_len,
    326     const uint8_t *plaintext, size_t pt_len,
    327     uint8_t *ciphertext,
    328     uint8_t *tag, size_t tag_len
    329 ) {
    330     if (!ctx || !iv || !tag || tag_len != 16) return -1;
    331     if (pt_len > 0 && (!plaintext || !ciphertext)) return -1;
    332 
    333     /* Prepare counter */
    334     uint8_t counter[16] = {0};
    335     if (iv_len == 12) {
    336         memcpy(counter, iv, 12);
    337         counter[15] = 1;
    338     } else {
    339         /* GHASH the IV */
    340         __m128i iv_hash;
    341         gcm_ghash(ctx->H_powers, iv, iv_len, &iv_hash);
    342         uint8_t len_block[16] = {0};
    343         uint64_t iv_bits = iv_len * 8;
    344         for (int i = 0; i < 8; i++) {
    345             len_block[15-i] = (iv_bits >> (i*8)) & 0xFF;
    346         }
    347         __m128i len_vec = _mm_loadu_si128((const __m128i*)len_block);
    348         len_vec = reverse_bytes(len_vec);
    349         iv_hash = _mm_xor_si128(iv_hash, len_vec);
    350         iv_hash = gf_mult_gcm(iv_hash, ctx->H_powers[0]);
    351         _mm_storeu_si128((__m128i*)counter, reverse_bytes(iv_hash));
    352     }
    353 
    354     /* Encrypt tag using counter = 0 */
    355     uint8_t tag_mask[16];
    356     twofish_encrypt_block(&ctx->key_schedule, counter, tag_mask);
    357 
    358     /* CTR mode encryption */
    359     for (size_t i = 0; i < pt_len; i += 16) {
    360         /* Increment counter */
    361         for (int j = 15; j >= 0; j--) {
    362             if (++counter[j] != 0) break;
    363         }
    364 
    365         uint8_t keystream[16];
    366         twofish_encrypt_block(&ctx->key_schedule, counter, keystream);
    367 
    368         size_t block_len = (i + 16 <= pt_len) ? 16 : (pt_len - i);
    369         for (size_t j = 0; j < block_len; j++) {
    370             ciphertext[i+j] = plaintext[i+j] ^ keystream[j];
    371         }
    372     }
    373 
    374     /* Compute GHASH */
    375     __m128i ghash = _mm_setzero_si128();
    376 
    377     if (aad_len > 0) {
    378         gcm_ghash(ctx->H_powers, aad, aad_len, &ghash);
    379     }
    380 
    381     if (pt_len > 0) {
    382         __m128i ct_hash;
    383         gcm_ghash(ctx->H_powers, ciphertext, pt_len, &ct_hash);
    384         ghash = _mm_xor_si128(ghash, ct_hash);
    385     }
    386 
    387     /* Add length block */
    388     uint8_t len_block[16] = {0};
    389     uint64_t aad_bits = aad_len * 8;
    390     uint64_t ct_bits = pt_len * 8;
    391     for (int i = 0; i < 8; i++) {
    392         len_block[7-i] = (aad_bits >> (i*8)) & 0xFF;
    393         len_block[15-i] = (ct_bits >> (i*8)) & 0xFF;
    394     }
    395     __m128i len_vec = _mm_loadu_si128((const __m128i*)len_block);
    396     len_vec = reverse_bytes(len_vec);
    397     ghash = _mm_xor_si128(ghash, len_vec);
    398     ghash = gf_mult_gcm(ghash, ctx->H_powers[0]);
    399 
    400     /* XOR with tag mask */
    401     ghash = reverse_bytes(ghash);
    402     __m128i tag_vec = _mm_xor_si128(ghash, _mm_loadu_si128((const __m128i*)tag_mask));
    403     _mm_storeu_si128((__m128i*)tag, tag_vec);
    404 
    405     return 0;
    406 }
    407 
    408 int twofish256_gcm_decrypt(
    409     twofish256_gcm_context *ctx,
    410     const uint8_t *iv, size_t iv_len,
    411     const uint8_t *aad, size_t aad_len,
    412     const uint8_t *ciphertext, size_t ct_len,
    413     const uint8_t *tag, size_t tag_len,
    414     uint8_t *plaintext
    415 ) {
    416     if (!ctx || !iv || !tag || tag_len != 16) return -1;
    417     if (ct_len > 0 && (!ciphertext || !plaintext)) return -1;
    418 
    419     /* Compute expected tag */
    420     uint8_t expected_tag[16];
    421 
    422     /* Prepare counter */
    423     uint8_t counter[16] = {0};
    424     if (iv_len == 12) {
    425         memcpy(counter, iv, 12);
    426         counter[15] = 1;
    427     } else {
    428         __m128i iv_hash;
    429         gcm_ghash(ctx->H_powers, iv, iv_len, &iv_hash);
    430         uint8_t len_block[16] = {0};
    431         uint64_t iv_bits = iv_len * 8;
    432         for (int i = 0; i < 8; i++) {
    433             len_block[15-i] = (iv_bits >> (i*8)) & 0xFF;
    434         }
    435         __m128i len_vec = _mm_loadu_si128((const __m128i*)len_block);
    436         len_vec = reverse_bytes(len_vec);
    437         iv_hash = _mm_xor_si128(iv_hash, len_vec);
    438         iv_hash = gf_mult_gcm(iv_hash, ctx->H_powers[0]);
    439         _mm_storeu_si128((__m128i*)counter, reverse_bytes(iv_hash));
    440     }
    441 
    442     /* Encrypt tag using counter = 0 */
    443     uint8_t tag_mask[16];
    444     twofish_encrypt_block(&ctx->key_schedule, counter, tag_mask);
    445 
    446     /* Compute GHASH */
    447     __m128i ghash = _mm_setzero_si128();
    448 
    449     if (aad_len > 0) {
    450         gcm_ghash(ctx->H_powers, aad, aad_len, &ghash);
    451     }
    452 
    453     if (ct_len > 0) {
    454         __m128i ct_hash;
    455         gcm_ghash(ctx->H_powers, ciphertext, ct_len, &ct_hash);
    456         ghash = _mm_xor_si128(ghash, ct_hash);
    457     }
    458 
    459     /* Add length block */
    460     uint8_t len_block[16] = {0};
    461     uint64_t aad_bits = aad_len * 8;
    462     uint64_t ct_bits = ct_len * 8;
    463     for (int i = 0; i < 8; i++) {
    464         len_block[7-i] = (aad_bits >> (i*8)) & 0xFF;
    465         len_block[15-i] = (ct_bits >> (i*8)) & 0xFF;
    466     }
    467     __m128i len_vec = _mm_loadu_si128((const __m128i*)len_block);
    468     len_vec = reverse_bytes(len_vec);
    469     ghash = _mm_xor_si128(ghash, len_vec);
    470     ghash = gf_mult_gcm(ghash, ctx->H_powers[0]);
    471 
    472     /* XOR with tag mask */
    473     ghash = reverse_bytes(ghash);
    474     __m128i tag_vec = _mm_xor_si128(ghash, _mm_loadu_si128((const __m128i*)tag_mask));
    475     _mm_storeu_si128((__m128i*)expected_tag, tag_vec);
    476 
    477     /* Constant-time tag comparison */
    478     int tag_match = 1;
    479     for (size_t i = 0; i < 16; i++) {
    480         if (tag[i] != expected_tag[i]) tag_match = 0;
    481     }
    482 
    483     if (!tag_match) {
    484         memset(plaintext, 0, ct_len);
    485         return -1;
    486     }
    487 
    488     /* CTR mode decryption */
    489     memcpy(counter, iv, (iv_len < 12) ? iv_len : 12);
    490     if (iv_len == 12) counter[15] = 1;
    491 
    492     for (size_t i = 0; i < ct_len; i += 16) {
    493         /* Increment counter */
    494         for (int j = 15; j >= 0; j--) {
    495             if (++counter[j] != 0) break;
    496         }
    497 
    498         uint8_t keystream[16];
    499         twofish_encrypt_block(&ctx->key_schedule, counter, keystream);
    500 
    501         size_t block_len = (i + 16 <= ct_len) ? 16 : (ct_len - i);
    502         for (size_t j = 0; j < block_len; j++) {
    503             plaintext[i+j] = ciphertext[i+j] ^ keystream[j];
    504         }
    505     }
    506 
    507     return 0;
    508 }
    509 
    510 void twofish256_gcm_cleanup(twofish256_gcm_context *ctx) {
    511     if (!ctx) return;
    512     memset(ctx, 0, sizeof(twofish256_gcm_context));
    513 }