XChaCha20-Poly1305.c (15424B)
1 /* 2 * XChaCha20-Poly1305 AEAD Implementation 3 * Extended-nonce ChaCha20-Poly1305 4 * Based on draft-irtf-cfrg-xchacha 5 */ 6 7 #include "XChaCha20-Poly1305.h" 8 #include <stdio.h> 9 #include <string.h> 10 11 // Utility: Read 32-bit little-endian 12 static inline uint32_t read_le32(const uint8_t *p) { 13 return ((uint32_t)p[0]) | 14 ((uint32_t)p[1] << 8) | 15 ((uint32_t)p[2] << 16) | 16 ((uint32_t)p[3] << 24); 17 } 18 19 // Utility: Write 32-bit little-endian 20 static inline void write_le32(uint8_t *p, uint32_t v) { 21 p[0] = v & 0xff; 22 p[1] = (v >> 8) & 0xff; 23 p[2] = (v >> 16) & 0xff; 24 p[3] = (v >> 24) & 0xff; 25 } 26 27 // Utility: Write 64-bit little-endian 28 static inline void write_le64(uint8_t *p, uint64_t v) { 29 p[0] = v & 0xff; 30 p[1] = (v >> 8) & 0xff; 31 p[2] = (v >> 16) & 0xff; 32 p[3] = (v >> 24) & 0xff; 33 p[4] = (v >> 32) & 0xff; 34 p[5] = (v >> 40) & 0xff; 35 p[6] = (v >> 48) & 0xff; 36 p[7] = (v >> 56) & 0xff; 37 } 38 39 // ChaCha20 quarter round 40 #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) 41 42 #define QUARTERROUND(a, b, c, d) \ 43 a += b; d ^= a; d = ROTL32(d, 16); \ 44 c += d; b ^= c; b = ROTL32(b, 12); \ 45 a += b; d ^= a; d = ROTL32(d, 8); \ 46 c += d; b ^= c; b = ROTL32(b, 7) 47 48 // ChaCha20 block function 49 static void chacha20_block(uint32_t out[16], const uint32_t in[16]) { 50 int i; 51 uint32_t x[16]; 52 53 // Copy input to working state 54 for (i = 0; i < 16; i++) { 55 x[i] = in[i]; 56 } 57 58 // 20 rounds (10 double rounds) 59 for (i = 0; i < 10; i++) { 60 // Column rounds 61 QUARTERROUND(x[0], x[4], x[8], x[12]); 62 QUARTERROUND(x[1], x[5], x[9], x[13]); 63 QUARTERROUND(x[2], x[6], x[10], x[14]); 64 QUARTERROUND(x[3], x[7], x[11], x[15]); 65 66 // Diagonal rounds 67 QUARTERROUND(x[0], x[5], x[10], x[15]); 68 QUARTERROUND(x[1], x[6], x[11], x[12]); 69 QUARTERROUND(x[2], x[7], x[8], x[13]); 70 QUARTERROUND(x[3], x[4], x[9], x[14]); 71 } 72 73 // Add original state 74 for (i = 0; i < 16; i++) { 75 out[i] = x[i] + in[i]; 76 } 77 } 78 79 /** 80 * HChaCha20 - Key derivation for XChaCha20 81 * 82 * Takes a 256-bit key and 128-bit nonce, produces a 256-bit subkey. 83 * This is ChaCha20 without the final addition, outputting only 84 * the first and last 4 words. 85 */ 86 void hchacha20(const uint8_t key[32], const uint8_t nonce[16], uint8_t subkey[32]) { 87 uint32_t state[16]; 88 int i; 89 90 // Constants "expand 32-byte k" 91 state[0] = 0x61707865; 92 state[1] = 0x3320646e; 93 state[2] = 0x79622d32; 94 state[3] = 0x6b206574; 95 96 // Key (256 bits = 8 words) 97 state[4] = read_le32(key + 0); 98 state[5] = read_le32(key + 4); 99 state[6] = read_le32(key + 8); 100 state[7] = read_le32(key + 12); 101 state[8] = read_le32(key + 16); 102 state[9] = read_le32(key + 20); 103 state[10] = read_le32(key + 24); 104 state[11] = read_le32(key + 28); 105 106 // Nonce (128 bits = 4 words) 107 state[12] = read_le32(nonce + 0); 108 state[13] = read_le32(nonce + 4); 109 state[14] = read_le32(nonce + 8); 110 state[15] = read_le32(nonce + 12); 111 112 // 20 rounds (10 double rounds) - same as ChaCha20 113 for (i = 0; i < 10; i++) { 114 // Column rounds 115 QUARTERROUND(state[0], state[4], state[8], state[12]); 116 QUARTERROUND(state[1], state[5], state[9], state[13]); 117 QUARTERROUND(state[2], state[6], state[10], state[14]); 118 QUARTERROUND(state[3], state[7], state[11], state[15]); 119 120 // Diagonal rounds 121 QUARTERROUND(state[0], state[5], state[10], state[15]); 122 QUARTERROUND(state[1], state[6], state[11], state[12]); 123 QUARTERROUND(state[2], state[7], state[8], state[13]); 124 QUARTERROUND(state[3], state[4], state[9], state[14]); 125 } 126 127 // Output subkey: first 4 words + last 4 words (total 256 bits) 128 write_le32(subkey + 0, state[0]); 129 write_le32(subkey + 4, state[1]); 130 write_le32(subkey + 8, state[2]); 131 write_le32(subkey + 12, state[3]); 132 write_le32(subkey + 16, state[12]); 133 write_le32(subkey + 20, state[13]); 134 write_le32(subkey + 24, state[14]); 135 write_le32(subkey + 28, state[15]); 136 137 // Clear state 138 memset(state, 0, sizeof(state)); 139 } 140 141 // Initialize XChaCha20 state 142 static void xchacha20_init_state(xchacha20_context *ctx, 143 const uint8_t *key, 144 const uint8_t *nonce) { 145 // Derive subkey using HChaCha20 with first 16 bytes of nonce 146 uint8_t subkey[32]; 147 hchacha20(key, nonce, subkey); 148 149 // Constants "expand 32-byte k" 150 ctx->state[0] = 0x61707865; 151 ctx->state[1] = 0x3320646e; 152 ctx->state[2] = 0x79622d32; 153 ctx->state[3] = 0x6b206574; 154 155 // Subkey (256 bits = 8 words) 156 ctx->state[4] = read_le32(subkey + 0); 157 ctx->state[5] = read_le32(subkey + 4); 158 ctx->state[6] = read_le32(subkey + 8); 159 ctx->state[7] = read_le32(subkey + 12); 160 ctx->state[8] = read_le32(subkey + 16); 161 ctx->state[9] = read_le32(subkey + 20); 162 ctx->state[10] = read_le32(subkey + 24); 163 ctx->state[11] = read_le32(subkey + 28); 164 165 // Counter (32 bits) - starts at 0 166 ctx->state[12] = 0; 167 168 // Last 8 bytes of XChaCha nonce become ChaCha nonce (64 bits + 32 bits padding) 169 ctx->state[13] = read_le32(nonce + 16); 170 ctx->state[14] = read_le32(nonce + 20); 171 ctx->state[15] = 0; // Padding 172 173 ctx->keystream_pos = 64; // Force generation on first use 174 ctx->counter = 0; 175 176 // Clear subkey 177 memset(subkey, 0, sizeof(subkey)); 178 } 179 180 // Generate next XChaCha20 keystream block 181 static void xchacha20_generate_block(xchacha20_context *ctx) { 182 uint32_t block[16]; 183 184 chacha20_block(block, ctx->state); 185 186 // Convert to bytes 187 for (int i = 0; i < 16; i++) { 188 write_le32(ctx->keystream + i * 4, block[i]); 189 } 190 191 // Increment counter 192 ctx->state[12]++; 193 ctx->keystream_pos = 0; 194 } 195 196 // Poly1305 clamp function 197 static void poly1305_clamp(uint32_t r[5], const uint8_t key[16]) { 198 r[0] = (read_le32(key + 0)) & 0x0fffffff; 199 r[1] = (read_le32(key + 4)) & 0x0ffffffc; 200 r[2] = (read_le32(key + 8)) & 0x0ffffffc; 201 r[3] = (read_le32(key + 12)) & 0x0ffffffc; 202 r[4] = 0; 203 } 204 205 // Poly1305 initialization 206 static void poly1305_init(xchacha20_poly1305_context_mac *ctx, const uint8_t key[32]) { 207 // Clamp r 208 poly1305_clamp(ctx->r, key); 209 210 // Set pad 211 ctx->pad[0] = read_le32(key + 16); 212 ctx->pad[1] = read_le32(key + 20); 213 ctx->pad[2] = read_le32(key + 24); 214 ctx->pad[3] = read_le32(key + 28); 215 216 // Initialize accumulator 217 ctx->h[0] = ctx->h[1] = ctx->h[2] = ctx->h[3] = ctx->h[4] = 0; 218 ctx->buffer_len = 0; 219 ctx->total_len = 0; 220 } 221 222 // Poly1305 block processing 223 static void poly1305_block(xchacha20_poly1305_context_mac *ctx, const uint8_t block[16], int final) { 224 uint64_t h0 = ctx->h[0]; 225 uint64_t h1 = ctx->h[1]; 226 uint64_t h2 = ctx->h[2]; 227 uint64_t h3 = ctx->h[3]; 228 uint64_t h4 = ctx->h[4]; 229 230 // Add block (with high bit for non-final blocks) 231 h0 += (read_le32(block + 0)) & 0xffffffff; 232 h1 += (read_le32(block + 4)) & 0xffffffff; 233 h2 += (read_le32(block + 8)) & 0xffffffff; 234 h3 += (read_le32(block + 12)) & 0xffffffff; 235 h4 += final ? 0 : (1ULL << 24); 236 237 // Multiply by r 238 uint64_t r0 = ctx->r[0]; 239 uint64_t r1 = ctx->r[1]; 240 uint64_t r2 = ctx->r[2]; 241 uint64_t r3 = ctx->r[3]; 242 243 uint64_t s1 = r1 * 5; 244 uint64_t s2 = r2 * 5; 245 uint64_t s3 = r3 * 5; 246 247 uint64_t d0 = h0 * r0 + h1 * s3 + h2 * s2 + h3 * s1; 248 uint64_t d1 = h0 * r1 + h1 * r0 + h2 * s3 + h3 * s2 + h4 * s1; 249 uint64_t d2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * s3 + h4 * s2; 250 uint64_t d3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * s3; 251 uint64_t d4 = h4 * r0; 252 253 // Carry propagation 254 uint64_t c; 255 c = d0 >> 26; h0 = d0 & 0x3ffffff; 256 d1 += c; c = d1 >> 26; h1 = d1 & 0x3ffffff; 257 d2 += c; c = d2 >> 26; h2 = d2 & 0x3ffffff; 258 d3 += c; c = d3 >> 26; h3 = d3 & 0x3ffffff; 259 d4 += c; c = d4 >> 26; h4 = d4 & 0x3ffffff; 260 h0 += c * 5; c = h0 >> 26; h0 &= 0x3ffffff; 261 h1 += c; 262 263 ctx->h[0] = (uint32_t)h0; 264 ctx->h[1] = (uint32_t)h1; 265 ctx->h[2] = (uint32_t)h2; 266 ctx->h[3] = (uint32_t)h3; 267 ctx->h[4] = (uint32_t)h4; 268 } 269 270 // Poly1305 update 271 static void poly1305_update(xchacha20_poly1305_context_mac *ctx, const uint8_t *data, size_t len) { 272 ctx->total_len += len; 273 274 // Process buffered data 275 if (ctx->buffer_len > 0) { 276 size_t to_copy = 16 - ctx->buffer_len; 277 if (to_copy > len) to_copy = len; 278 memcpy(ctx->buffer + ctx->buffer_len, data, to_copy); 279 ctx->buffer_len += to_copy; 280 data += to_copy; 281 len -= to_copy; 282 283 if (ctx->buffer_len == 16) { 284 poly1305_block(ctx, ctx->buffer, 0); 285 ctx->buffer_len = 0; 286 } 287 } 288 289 // Process full blocks 290 while (len >= 16) { 291 poly1305_block(ctx, data, 0); 292 data += 16; 293 len -= 16; 294 } 295 296 // Buffer remaining data 297 if (len > 0) { 298 memcpy(ctx->buffer, data, len); 299 ctx->buffer_len = len; 300 } 301 } 302 303 // Poly1305 finalization 304 static void poly1305_finish(xchacha20_poly1305_context_mac *ctx, uint8_t tag[16]) { 305 // Process final block 306 if (ctx->buffer_len > 0) { 307 uint8_t final_block[16] = {0}; 308 memcpy(final_block, ctx->buffer, ctx->buffer_len); 309 final_block[ctx->buffer_len] = 1; 310 poly1305_block(ctx, final_block, 1); 311 } 312 313 // Final reduction 314 uint64_t h0 = ctx->h[0]; 315 uint64_t h1 = ctx->h[1]; 316 uint64_t h2 = ctx->h[2]; 317 uint64_t h3 = ctx->h[3]; 318 uint64_t h4 = ctx->h[4]; 319 320 uint64_t c; 321 c = h1 >> 26; h1 &= 0x3ffffff; 322 h2 += c; c = h2 >> 26; h2 &= 0x3ffffff; 323 h3 += c; c = h3 >> 26; h3 &= 0x3ffffff; 324 h4 += c; c = h4 >> 26; h4 &= 0x3ffffff; 325 h0 += c * 5; c = h0 >> 26; h0 &= 0x3ffffff; 326 h1 += c; 327 328 uint64_t g0 = h0 + 5; c = g0 >> 26; g0 &= 0x3ffffff; 329 uint64_t g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff; 330 uint64_t g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff; 331 uint64_t g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff; 332 uint64_t g4 = h4 + c - (1ULL << 26); 333 334 uint64_t mask = (g4 >> 63) - 1; 335 g0 &= mask; g1 &= mask; g2 &= mask; g3 &= mask; g4 &= mask; 336 mask = ~mask; 337 h0 = (h0 & mask) | g0; 338 h1 = (h1 & mask) | g1; 339 h2 = (h2 & mask) | g2; 340 h3 = (h3 & mask) | g3; 341 h4 = (h4 & mask) | g4; 342 343 // Add pad 344 uint64_t f0 = ((h0) | (h1 << 26)) + ctx->pad[0]; 345 uint64_t f1 = ((h1 >> 6) | (h2 << 20)) + ctx->pad[1] + (f0 >> 32); 346 uint64_t f2 = ((h2 >> 12) | (h3 << 14)) + ctx->pad[2] + (f1 >> 32); 347 uint64_t f3 = ((h3 >> 18) | (h4 << 8)) + ctx->pad[3] + (f2 >> 32); 348 349 write_le32(tag + 0, (uint32_t)f0); 350 write_le32(tag + 4, (uint32_t)f1); 351 write_le32(tag + 8, (uint32_t)f2); 352 write_le32(tag + 12, (uint32_t)f3); 353 } 354 355 // Initialize XChaCha20-Poly1305 356 int xchacha20_poly1305_init(xchacha20_poly1305_context *ctx, 357 const uint8_t *key, 358 const uint8_t *nonce) { 359 if (!ctx || !key || !nonce) return -1; 360 361 // Initialize XChaCha20 with extended nonce 362 xchacha20_init_state(&ctx->cipher, key, nonce); 363 364 // Generate Poly1305 key (first 32 bytes of keystream with counter=0) 365 uint8_t poly_key[32]; 366 for (int i = 0; i < 32; i++) { 367 if (ctx->cipher.keystream_pos >= 64) { 368 xchacha20_generate_block(&ctx->cipher); 369 } 370 poly_key[i] = ctx->cipher.keystream[ctx->cipher.keystream_pos++]; 371 } 372 373 // Initialize Poly1305 374 poly1305_init(&ctx->mac, poly_key); 375 memset(poly_key, 0, sizeof(poly_key)); 376 377 ctx->aad_len = 0; 378 ctx->data_len = 0; 379 380 return 0; 381 } 382 383 // XChaCha20-Poly1305 encryption 384 int xchacha20_poly1305_encrypt(xchacha20_poly1305_context *ctx, 385 const uint8_t *aad, size_t aad_len, 386 const uint8_t *plaintext, size_t pt_len, 387 uint8_t *ciphertext, 388 uint8_t *tag) { 389 if (!ctx || !tag) return -1; 390 if (pt_len > 0 && (!plaintext || !ciphertext)) return -1; 391 if (aad_len > 0 && !aad) return -1; 392 393 // Authenticate AAD 394 if (aad_len > 0) { 395 poly1305_update(&ctx->mac, aad, aad_len); 396 ctx->aad_len = aad_len; 397 } 398 399 // Pad AAD to 16 bytes 400 if (aad_len % 16 != 0) { 401 uint8_t padding[16] = {0}; 402 poly1305_update(&ctx->mac, padding, 16 - (aad_len % 16)); 403 } 404 405 // Encrypt and authenticate ciphertext 406 for (size_t i = 0; i < pt_len; i++) { 407 if (ctx->cipher.keystream_pos >= 64) { 408 xchacha20_generate_block(&ctx->cipher); 409 } 410 ciphertext[i] = plaintext[i] ^ ctx->cipher.keystream[ctx->cipher.keystream_pos++]; 411 } 412 413 if (pt_len > 0) { 414 poly1305_update(&ctx->mac, ciphertext, pt_len); 415 ctx->data_len = pt_len; 416 } 417 418 // Pad ciphertext to 16 bytes 419 if (pt_len % 16 != 0) { 420 uint8_t padding[16] = {0}; 421 poly1305_update(&ctx->mac, padding, 16 - (pt_len % 16)); 422 } 423 424 // Authenticate lengths 425 uint8_t len_block[16]; 426 write_le64(len_block + 0, aad_len); 427 write_le64(len_block + 8, pt_len); 428 poly1305_update(&ctx->mac, len_block, 16); 429 430 // Compute tag 431 poly1305_finish(&ctx->mac, tag); 432 433 return 0; 434 } 435 436 // XChaCha20-Poly1305 decryption 437 int xchacha20_poly1305_decrypt(xchacha20_poly1305_context *ctx, 438 const uint8_t *aad, size_t aad_len, 439 const uint8_t *ciphertext, size_t ct_len, 440 const uint8_t *tag, 441 uint8_t *plaintext) { 442 if (!ctx || !tag) return -1; 443 if (ct_len > 0 && (!ciphertext || !plaintext)) return -1; 444 if (aad_len > 0 && !aad) return -1; 445 446 // Authenticate AAD 447 if (aad_len > 0) { 448 poly1305_update(&ctx->mac, aad, aad_len); 449 ctx->aad_len = aad_len; 450 } 451 452 // Pad AAD to 16 bytes 453 if (aad_len % 16 != 0) { 454 uint8_t padding[16] = {0}; 455 poly1305_update(&ctx->mac, padding, 16 - (aad_len % 16)); 456 } 457 458 // Authenticate ciphertext 459 if (ct_len > 0) { 460 poly1305_update(&ctx->mac, ciphertext, ct_len); 461 ctx->data_len = ct_len; 462 } 463 464 // Pad ciphertext to 16 bytes 465 if (ct_len % 16 != 0) { 466 uint8_t padding[16] = {0}; 467 poly1305_update(&ctx->mac, padding, 16 - (ct_len % 16)); 468 } 469 470 // Authenticate lengths 471 uint8_t len_block[16]; 472 write_le64(len_block + 0, aad_len); 473 write_le64(len_block + 8, ct_len); 474 poly1305_update(&ctx->mac, len_block, 16); 475 476 // Compute and verify tag 477 uint8_t computed_tag[16]; 478 poly1305_finish(&ctx->mac, computed_tag); 479 480 // Constant-time comparison 481 int mismatch = 0; 482 for (int i = 0; i < 16; i++) { 483 mismatch |= (tag[i] ^ computed_tag[i]); 484 } 485 486 if (mismatch != 0) { 487 memset(computed_tag, 0, sizeof(computed_tag)); 488 return -1; // Authentication failed 489 } 490 491 memset(computed_tag, 0, sizeof(computed_tag)); 492 493 // Decrypt ciphertext 494 for (size_t i = 0; i < ct_len; i++) { 495 if (ctx->cipher.keystream_pos >= 64) { 496 xchacha20_generate_block(&ctx->cipher); 497 } 498 plaintext[i] = ciphertext[i] ^ ctx->cipher.keystream[ctx->cipher.keystream_pos++]; 499 } 500 501 return 0; 502 } 503 504 // Cleanup 505 void xchacha20_poly1305_cleanup(xchacha20_poly1305_context *ctx) { 506 if (!ctx) return; 507 volatile uint8_t *p = (volatile uint8_t *)ctx; 508 size_t n = sizeof(xchacha20_poly1305_context); 509 while (n--) { 510 *p++ = 0; 511 } 512 }