luajitos

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

XChaCha20-Poly1305.c (15424B)


      1 /*
      2  * XChaCha20-Poly1305 AEAD Implementation
      3  * Extended-nonce ChaCha20-Poly1305
      4  * Based on draft-irtf-cfrg-xchacha
      5  */
      6 
      7 #include "XChaCha20-Poly1305.h"
      8 #include <stdio.h>
      9 #include <string.h>
     10 
     11 // Utility: Read 32-bit little-endian
     12 static inline uint32_t read_le32(const uint8_t *p) {
     13     return ((uint32_t)p[0]) |
     14            ((uint32_t)p[1] << 8) |
     15            ((uint32_t)p[2] << 16) |
     16            ((uint32_t)p[3] << 24);
     17 }
     18 
     19 // Utility: Write 32-bit little-endian
     20 static inline void write_le32(uint8_t *p, uint32_t v) {
     21     p[0] = v & 0xff;
     22     p[1] = (v >> 8) & 0xff;
     23     p[2] = (v >> 16) & 0xff;
     24     p[3] = (v >> 24) & 0xff;
     25 }
     26 
     27 // Utility: Write 64-bit little-endian
     28 static inline void write_le64(uint8_t *p, uint64_t v) {
     29     p[0] = v & 0xff;
     30     p[1] = (v >> 8) & 0xff;
     31     p[2] = (v >> 16) & 0xff;
     32     p[3] = (v >> 24) & 0xff;
     33     p[4] = (v >> 32) & 0xff;
     34     p[5] = (v >> 40) & 0xff;
     35     p[6] = (v >> 48) & 0xff;
     36     p[7] = (v >> 56) & 0xff;
     37 }
     38 
     39 // ChaCha20 quarter round
     40 #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
     41 
     42 #define QUARTERROUND(a, b, c, d) \
     43     a += b; d ^= a; d = ROTL32(d, 16); \
     44     c += d; b ^= c; b = ROTL32(b, 12); \
     45     a += b; d ^= a; d = ROTL32(d, 8); \
     46     c += d; b ^= c; b = ROTL32(b, 7)
     47 
     48 // ChaCha20 block function
     49 static void chacha20_block(uint32_t out[16], const uint32_t in[16]) {
     50     int i;
     51     uint32_t x[16];
     52 
     53     // Copy input to working state
     54     for (i = 0; i < 16; i++) {
     55         x[i] = in[i];
     56     }
     57 
     58     // 20 rounds (10 double rounds)
     59     for (i = 0; i < 10; i++) {
     60         // Column rounds
     61         QUARTERROUND(x[0], x[4], x[8], x[12]);
     62         QUARTERROUND(x[1], x[5], x[9], x[13]);
     63         QUARTERROUND(x[2], x[6], x[10], x[14]);
     64         QUARTERROUND(x[3], x[7], x[11], x[15]);
     65 
     66         // Diagonal rounds
     67         QUARTERROUND(x[0], x[5], x[10], x[15]);
     68         QUARTERROUND(x[1], x[6], x[11], x[12]);
     69         QUARTERROUND(x[2], x[7], x[8], x[13]);
     70         QUARTERROUND(x[3], x[4], x[9], x[14]);
     71     }
     72 
     73     // Add original state
     74     for (i = 0; i < 16; i++) {
     75         out[i] = x[i] + in[i];
     76     }
     77 }
     78 
     79 /**
     80  * HChaCha20 - Key derivation for XChaCha20
     81  *
     82  * Takes a 256-bit key and 128-bit nonce, produces a 256-bit subkey.
     83  * This is ChaCha20 without the final addition, outputting only
     84  * the first and last 4 words.
     85  */
     86 void hchacha20(const uint8_t key[32], const uint8_t nonce[16], uint8_t subkey[32]) {
     87     uint32_t state[16];
     88     int i;
     89 
     90     // Constants "expand 32-byte k"
     91     state[0] = 0x61707865;
     92     state[1] = 0x3320646e;
     93     state[2] = 0x79622d32;
     94     state[3] = 0x6b206574;
     95 
     96     // Key (256 bits = 8 words)
     97     state[4] = read_le32(key + 0);
     98     state[5] = read_le32(key + 4);
     99     state[6] = read_le32(key + 8);
    100     state[7] = read_le32(key + 12);
    101     state[8] = read_le32(key + 16);
    102     state[9] = read_le32(key + 20);
    103     state[10] = read_le32(key + 24);
    104     state[11] = read_le32(key + 28);
    105 
    106     // Nonce (128 bits = 4 words)
    107     state[12] = read_le32(nonce + 0);
    108     state[13] = read_le32(nonce + 4);
    109     state[14] = read_le32(nonce + 8);
    110     state[15] = read_le32(nonce + 12);
    111 
    112     // 20 rounds (10 double rounds) - same as ChaCha20
    113     for (i = 0; i < 10; i++) {
    114         // Column rounds
    115         QUARTERROUND(state[0], state[4], state[8], state[12]);
    116         QUARTERROUND(state[1], state[5], state[9], state[13]);
    117         QUARTERROUND(state[2], state[6], state[10], state[14]);
    118         QUARTERROUND(state[3], state[7], state[11], state[15]);
    119 
    120         // Diagonal rounds
    121         QUARTERROUND(state[0], state[5], state[10], state[15]);
    122         QUARTERROUND(state[1], state[6], state[11], state[12]);
    123         QUARTERROUND(state[2], state[7], state[8], state[13]);
    124         QUARTERROUND(state[3], state[4], state[9], state[14]);
    125     }
    126 
    127     // Output subkey: first 4 words + last 4 words (total 256 bits)
    128     write_le32(subkey + 0, state[0]);
    129     write_le32(subkey + 4, state[1]);
    130     write_le32(subkey + 8, state[2]);
    131     write_le32(subkey + 12, state[3]);
    132     write_le32(subkey + 16, state[12]);
    133     write_le32(subkey + 20, state[13]);
    134     write_le32(subkey + 24, state[14]);
    135     write_le32(subkey + 28, state[15]);
    136 
    137     // Clear state
    138     memset(state, 0, sizeof(state));
    139 }
    140 
    141 // Initialize XChaCha20 state
    142 static void xchacha20_init_state(xchacha20_context *ctx,
    143                                   const uint8_t *key,
    144                                   const uint8_t *nonce) {
    145     // Derive subkey using HChaCha20 with first 16 bytes of nonce
    146     uint8_t subkey[32];
    147     hchacha20(key, nonce, subkey);
    148 
    149     // Constants "expand 32-byte k"
    150     ctx->state[0] = 0x61707865;
    151     ctx->state[1] = 0x3320646e;
    152     ctx->state[2] = 0x79622d32;
    153     ctx->state[3] = 0x6b206574;
    154 
    155     // Subkey (256 bits = 8 words)
    156     ctx->state[4] = read_le32(subkey + 0);
    157     ctx->state[5] = read_le32(subkey + 4);
    158     ctx->state[6] = read_le32(subkey + 8);
    159     ctx->state[7] = read_le32(subkey + 12);
    160     ctx->state[8] = read_le32(subkey + 16);
    161     ctx->state[9] = read_le32(subkey + 20);
    162     ctx->state[10] = read_le32(subkey + 24);
    163     ctx->state[11] = read_le32(subkey + 28);
    164 
    165     // Counter (32 bits) - starts at 0
    166     ctx->state[12] = 0;
    167 
    168     // Last 8 bytes of XChaCha nonce become ChaCha nonce (64 bits + 32 bits padding)
    169     ctx->state[13] = read_le32(nonce + 16);
    170     ctx->state[14] = read_le32(nonce + 20);
    171     ctx->state[15] = 0; // Padding
    172 
    173     ctx->keystream_pos = 64; // Force generation on first use
    174     ctx->counter = 0;
    175 
    176     // Clear subkey
    177     memset(subkey, 0, sizeof(subkey));
    178 }
    179 
    180 // Generate next XChaCha20 keystream block
    181 static void xchacha20_generate_block(xchacha20_context *ctx) {
    182     uint32_t block[16];
    183 
    184     chacha20_block(block, ctx->state);
    185 
    186     // Convert to bytes
    187     for (int i = 0; i < 16; i++) {
    188         write_le32(ctx->keystream + i * 4, block[i]);
    189     }
    190 
    191     // Increment counter
    192     ctx->state[12]++;
    193     ctx->keystream_pos = 0;
    194 }
    195 
    196 // Poly1305 clamp function
    197 static void poly1305_clamp(uint32_t r[5], const uint8_t key[16]) {
    198     r[0] = (read_le32(key + 0)) & 0x0fffffff;
    199     r[1] = (read_le32(key + 4)) & 0x0ffffffc;
    200     r[2] = (read_le32(key + 8)) & 0x0ffffffc;
    201     r[3] = (read_le32(key + 12)) & 0x0ffffffc;
    202     r[4] = 0;
    203 }
    204 
    205 // Poly1305 initialization
    206 static void poly1305_init(xchacha20_poly1305_context_mac *ctx, const uint8_t key[32]) {
    207     // Clamp r
    208     poly1305_clamp(ctx->r, key);
    209 
    210     // Set pad
    211     ctx->pad[0] = read_le32(key + 16);
    212     ctx->pad[1] = read_le32(key + 20);
    213     ctx->pad[2] = read_le32(key + 24);
    214     ctx->pad[3] = read_le32(key + 28);
    215 
    216     // Initialize accumulator
    217     ctx->h[0] = ctx->h[1] = ctx->h[2] = ctx->h[3] = ctx->h[4] = 0;
    218     ctx->buffer_len = 0;
    219     ctx->total_len = 0;
    220 }
    221 
    222 // Poly1305 block processing
    223 static void poly1305_block(xchacha20_poly1305_context_mac *ctx, const uint8_t block[16], int final) {
    224     uint64_t h0 = ctx->h[0];
    225     uint64_t h1 = ctx->h[1];
    226     uint64_t h2 = ctx->h[2];
    227     uint64_t h3 = ctx->h[3];
    228     uint64_t h4 = ctx->h[4];
    229 
    230     // Add block (with high bit for non-final blocks)
    231     h0 += (read_le32(block + 0)) & 0xffffffff;
    232     h1 += (read_le32(block + 4)) & 0xffffffff;
    233     h2 += (read_le32(block + 8)) & 0xffffffff;
    234     h3 += (read_le32(block + 12)) & 0xffffffff;
    235     h4 += final ? 0 : (1ULL << 24);
    236 
    237     // Multiply by r
    238     uint64_t r0 = ctx->r[0];
    239     uint64_t r1 = ctx->r[1];
    240     uint64_t r2 = ctx->r[2];
    241     uint64_t r3 = ctx->r[3];
    242 
    243     uint64_t s1 = r1 * 5;
    244     uint64_t s2 = r2 * 5;
    245     uint64_t s3 = r3 * 5;
    246 
    247     uint64_t d0 = h0 * r0 + h1 * s3 + h2 * s2 + h3 * s1;
    248     uint64_t d1 = h0 * r1 + h1 * r0 + h2 * s3 + h3 * s2 + h4 * s1;
    249     uint64_t d2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * s3 + h4 * s2;
    250     uint64_t d3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * s3;
    251     uint64_t d4 = h4 * r0;
    252 
    253     // Carry propagation
    254     uint64_t c;
    255     c = d0 >> 26; h0 = d0 & 0x3ffffff;
    256     d1 += c; c = d1 >> 26; h1 = d1 & 0x3ffffff;
    257     d2 += c; c = d2 >> 26; h2 = d2 & 0x3ffffff;
    258     d3 += c; c = d3 >> 26; h3 = d3 & 0x3ffffff;
    259     d4 += c; c = d4 >> 26; h4 = d4 & 0x3ffffff;
    260     h0 += c * 5; c = h0 >> 26; h0 &= 0x3ffffff;
    261     h1 += c;
    262 
    263     ctx->h[0] = (uint32_t)h0;
    264     ctx->h[1] = (uint32_t)h1;
    265     ctx->h[2] = (uint32_t)h2;
    266     ctx->h[3] = (uint32_t)h3;
    267     ctx->h[4] = (uint32_t)h4;
    268 }
    269 
    270 // Poly1305 update
    271 static void poly1305_update(xchacha20_poly1305_context_mac *ctx, const uint8_t *data, size_t len) {
    272     ctx->total_len += len;
    273 
    274     // Process buffered data
    275     if (ctx->buffer_len > 0) {
    276         size_t to_copy = 16 - ctx->buffer_len;
    277         if (to_copy > len) to_copy = len;
    278         memcpy(ctx->buffer + ctx->buffer_len, data, to_copy);
    279         ctx->buffer_len += to_copy;
    280         data += to_copy;
    281         len -= to_copy;
    282 
    283         if (ctx->buffer_len == 16) {
    284             poly1305_block(ctx, ctx->buffer, 0);
    285             ctx->buffer_len = 0;
    286         }
    287     }
    288 
    289     // Process full blocks
    290     while (len >= 16) {
    291         poly1305_block(ctx, data, 0);
    292         data += 16;
    293         len -= 16;
    294     }
    295 
    296     // Buffer remaining data
    297     if (len > 0) {
    298         memcpy(ctx->buffer, data, len);
    299         ctx->buffer_len = len;
    300     }
    301 }
    302 
    303 // Poly1305 finalization
    304 static void poly1305_finish(xchacha20_poly1305_context_mac *ctx, uint8_t tag[16]) {
    305     // Process final block
    306     if (ctx->buffer_len > 0) {
    307         uint8_t final_block[16] = {0};
    308         memcpy(final_block, ctx->buffer, ctx->buffer_len);
    309         final_block[ctx->buffer_len] = 1;
    310         poly1305_block(ctx, final_block, 1);
    311     }
    312 
    313     // Final reduction
    314     uint64_t h0 = ctx->h[0];
    315     uint64_t h1 = ctx->h[1];
    316     uint64_t h2 = ctx->h[2];
    317     uint64_t h3 = ctx->h[3];
    318     uint64_t h4 = ctx->h[4];
    319 
    320     uint64_t c;
    321     c = h1 >> 26; h1 &= 0x3ffffff;
    322     h2 += c; c = h2 >> 26; h2 &= 0x3ffffff;
    323     h3 += c; c = h3 >> 26; h3 &= 0x3ffffff;
    324     h4 += c; c = h4 >> 26; h4 &= 0x3ffffff;
    325     h0 += c * 5; c = h0 >> 26; h0 &= 0x3ffffff;
    326     h1 += c;
    327 
    328     uint64_t g0 = h0 + 5; c = g0 >> 26; g0 &= 0x3ffffff;
    329     uint64_t g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff;
    330     uint64_t g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff;
    331     uint64_t g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff;
    332     uint64_t g4 = h4 + c - (1ULL << 26);
    333 
    334     uint64_t mask = (g4 >> 63) - 1;
    335     g0 &= mask; g1 &= mask; g2 &= mask; g3 &= mask; g4 &= mask;
    336     mask = ~mask;
    337     h0 = (h0 & mask) | g0;
    338     h1 = (h1 & mask) | g1;
    339     h2 = (h2 & mask) | g2;
    340     h3 = (h3 & mask) | g3;
    341     h4 = (h4 & mask) | g4;
    342 
    343     // Add pad
    344     uint64_t f0 = ((h0) | (h1 << 26)) + ctx->pad[0];
    345     uint64_t f1 = ((h1 >> 6) | (h2 << 20)) + ctx->pad[1] + (f0 >> 32);
    346     uint64_t f2 = ((h2 >> 12) | (h3 << 14)) + ctx->pad[2] + (f1 >> 32);
    347     uint64_t f3 = ((h3 >> 18) | (h4 << 8)) + ctx->pad[3] + (f2 >> 32);
    348 
    349     write_le32(tag + 0, (uint32_t)f0);
    350     write_le32(tag + 4, (uint32_t)f1);
    351     write_le32(tag + 8, (uint32_t)f2);
    352     write_le32(tag + 12, (uint32_t)f3);
    353 }
    354 
    355 // Initialize XChaCha20-Poly1305
    356 int xchacha20_poly1305_init(xchacha20_poly1305_context *ctx,
    357                              const uint8_t *key,
    358                              const uint8_t *nonce) {
    359     if (!ctx || !key || !nonce) return -1;
    360 
    361     // Initialize XChaCha20 with extended nonce
    362     xchacha20_init_state(&ctx->cipher, key, nonce);
    363 
    364     // Generate Poly1305 key (first 32 bytes of keystream with counter=0)
    365     uint8_t poly_key[32];
    366     for (int i = 0; i < 32; i++) {
    367         if (ctx->cipher.keystream_pos >= 64) {
    368             xchacha20_generate_block(&ctx->cipher);
    369         }
    370         poly_key[i] = ctx->cipher.keystream[ctx->cipher.keystream_pos++];
    371     }
    372 
    373     // Initialize Poly1305
    374     poly1305_init(&ctx->mac, poly_key);
    375     memset(poly_key, 0, sizeof(poly_key));
    376 
    377     ctx->aad_len = 0;
    378     ctx->data_len = 0;
    379 
    380     return 0;
    381 }
    382 
    383 // XChaCha20-Poly1305 encryption
    384 int xchacha20_poly1305_encrypt(xchacha20_poly1305_context *ctx,
    385                                 const uint8_t *aad, size_t aad_len,
    386                                 const uint8_t *plaintext, size_t pt_len,
    387                                 uint8_t *ciphertext,
    388                                 uint8_t *tag) {
    389     if (!ctx || !tag) return -1;
    390     if (pt_len > 0 && (!plaintext || !ciphertext)) return -1;
    391     if (aad_len > 0 && !aad) return -1;
    392 
    393     // Authenticate AAD
    394     if (aad_len > 0) {
    395         poly1305_update(&ctx->mac, aad, aad_len);
    396         ctx->aad_len = aad_len;
    397     }
    398 
    399     // Pad AAD to 16 bytes
    400     if (aad_len % 16 != 0) {
    401         uint8_t padding[16] = {0};
    402         poly1305_update(&ctx->mac, padding, 16 - (aad_len % 16));
    403     }
    404 
    405     // Encrypt and authenticate ciphertext
    406     for (size_t i = 0; i < pt_len; i++) {
    407         if (ctx->cipher.keystream_pos >= 64) {
    408             xchacha20_generate_block(&ctx->cipher);
    409         }
    410         ciphertext[i] = plaintext[i] ^ ctx->cipher.keystream[ctx->cipher.keystream_pos++];
    411     }
    412 
    413     if (pt_len > 0) {
    414         poly1305_update(&ctx->mac, ciphertext, pt_len);
    415         ctx->data_len = pt_len;
    416     }
    417 
    418     // Pad ciphertext to 16 bytes
    419     if (pt_len % 16 != 0) {
    420         uint8_t padding[16] = {0};
    421         poly1305_update(&ctx->mac, padding, 16 - (pt_len % 16));
    422     }
    423 
    424     // Authenticate lengths
    425     uint8_t len_block[16];
    426     write_le64(len_block + 0, aad_len);
    427     write_le64(len_block + 8, pt_len);
    428     poly1305_update(&ctx->mac, len_block, 16);
    429 
    430     // Compute tag
    431     poly1305_finish(&ctx->mac, tag);
    432 
    433     return 0;
    434 }
    435 
    436 // XChaCha20-Poly1305 decryption
    437 int xchacha20_poly1305_decrypt(xchacha20_poly1305_context *ctx,
    438                                 const uint8_t *aad, size_t aad_len,
    439                                 const uint8_t *ciphertext, size_t ct_len,
    440                                 const uint8_t *tag,
    441                                 uint8_t *plaintext) {
    442     if (!ctx || !tag) return -1;
    443     if (ct_len > 0 && (!ciphertext || !plaintext)) return -1;
    444     if (aad_len > 0 && !aad) return -1;
    445 
    446     // Authenticate AAD
    447     if (aad_len > 0) {
    448         poly1305_update(&ctx->mac, aad, aad_len);
    449         ctx->aad_len = aad_len;
    450     }
    451 
    452     // Pad AAD to 16 bytes
    453     if (aad_len % 16 != 0) {
    454         uint8_t padding[16] = {0};
    455         poly1305_update(&ctx->mac, padding, 16 - (aad_len % 16));
    456     }
    457 
    458     // Authenticate ciphertext
    459     if (ct_len > 0) {
    460         poly1305_update(&ctx->mac, ciphertext, ct_len);
    461         ctx->data_len = ct_len;
    462     }
    463 
    464     // Pad ciphertext to 16 bytes
    465     if (ct_len % 16 != 0) {
    466         uint8_t padding[16] = {0};
    467         poly1305_update(&ctx->mac, padding, 16 - (ct_len % 16));
    468     }
    469 
    470     // Authenticate lengths
    471     uint8_t len_block[16];
    472     write_le64(len_block + 0, aad_len);
    473     write_le64(len_block + 8, ct_len);
    474     poly1305_update(&ctx->mac, len_block, 16);
    475 
    476     // Compute and verify tag
    477     uint8_t computed_tag[16];
    478     poly1305_finish(&ctx->mac, computed_tag);
    479 
    480     // Constant-time comparison
    481     int mismatch = 0;
    482     for (int i = 0; i < 16; i++) {
    483         mismatch |= (tag[i] ^ computed_tag[i]);
    484     }
    485 
    486     if (mismatch != 0) {
    487         memset(computed_tag, 0, sizeof(computed_tag));
    488         return -1; // Authentication failed
    489     }
    490 
    491     memset(computed_tag, 0, sizeof(computed_tag));
    492 
    493     // Decrypt ciphertext
    494     for (size_t i = 0; i < ct_len; i++) {
    495         if (ctx->cipher.keystream_pos >= 64) {
    496             xchacha20_generate_block(&ctx->cipher);
    497         }
    498         plaintext[i] = ciphertext[i] ^ ctx->cipher.keystream[ctx->cipher.keystream_pos++];
    499     }
    500 
    501     return 0;
    502 }
    503 
    504 // Cleanup
    505 void xchacha20_poly1305_cleanup(xchacha20_poly1305_context *ctx) {
    506     if (!ctx) return;
    507     volatile uint8_t *p = (volatile uint8_t *)ctx;
    508     size_t n = sizeof(xchacha20_poly1305_context);
    509     while (n--) {
    510         *p++ = 0;
    511     }
    512 }