AES-128-GCM.c (13571B)
/*
 * AES-128-GCM Implementation with Hardware Acceleration
 * Supports AES-NI and PCLMULQDQ for high performance
 * Conforms to NIST SP 800-38D
 */

#include "AES-128-GCM.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <immintrin.h>
#include <wmmintrin.h>

// CPU feature detection
#ifdef __GNUC__
#include <cpuid.h>
#endif

// Feature flags (shared with AES-256-GCM)
static int has_aesni = 0;
static int has_pclmulqdq = 0;
static int features_detected = 0;

// Check CPU capabilities
static void detect_cpu_features(void) {
    if (features_detected) return;
    features_detected = 1;
#ifdef __GNUC__
    unsigned int eax, ebx, ecx, edx;
    if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
        has_aesni = (ecx & bit_AES) != 0;
        has_pclmulqdq = (ecx & bit_PCLMUL) != 0;
    }
#endif
}

// Utility: Reverse bytes in __m128i (for GCM)
static inline __m128i reverse_bytes(__m128i x) {
    const __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
    return _mm_shuffle_epi8(x, mask);
}

// AES-128 key expansion using AES-NI
static void aes128_key_expansion(const uint8_t *key, aes128_key_schedule *ks) {
    __m128i temp1, temp2;
    __m128i *round_key = ks->round_keys;

    ks->nr = 10;  // AES-128 has 10 rounds

    // Load the key (16 bytes for AES-128)
    round_key[0] = _mm_loadu_si128((__m128i*)key);

    // Helper macro for key expansion
#define AES_128_KEY_EXP(round_idx, temp1, temp2, rcon) do { \
        temp2 = _mm_aeskeygenassist_si128(temp1, rcon); \
        temp2 = _mm_shuffle_epi32(temp2, 0xff); \
        temp1 = _mm_xor_si128(temp1, _mm_slli_si128(temp1, 4)); \
        temp1 = _mm_xor_si128(temp1, _mm_slli_si128(temp1, 4)); \
        temp1 = _mm_xor_si128(temp1, _mm_slli_si128(temp1, 4)); \
        temp1 = _mm_xor_si128(temp1, temp2); \
        round_key[round_idx] = temp1; \
    } while(0)

    temp1 = round_key[0];
    AES_128_KEY_EXP(1, temp1, temp2, 0x01);
    AES_128_KEY_EXP(2, temp1, temp2, 0x02);
    AES_128_KEY_EXP(3, temp1, temp2, 0x04);
    AES_128_KEY_EXP(4, temp1, temp2, 0x08);
    AES_128_KEY_EXP(5, temp1, temp2, 0x10);
    AES_128_KEY_EXP(6, temp1, temp2, 0x20);
    AES_128_KEY_EXP(7, temp1, temp2, 0x40);
    AES_128_KEY_EXP(8, temp1, temp2, 0x80);
    AES_128_KEY_EXP(9, temp1, temp2, 0x1b);
    AES_128_KEY_EXP(10, temp1, temp2, 0x36);
}

// AES-128 single block encryption using AES-NI
static inline __m128i aes128_encrypt_block(__m128i plaintext, const aes128_key_schedule *ks) {
    __m128i tmp = _mm_xor_si128(plaintext, ks->round_keys[0]);

    for (int i = 1; i < 10; i++) {
        tmp = _mm_aesenc_si128(tmp, ks->round_keys[i]);
    }

    tmp = _mm_aesenclast_si128(tmp, ks->round_keys[10]);
    return tmp;
}

// GHASH multiplication using PCLMULQDQ (same as AES-256-GCM)
static inline __m128i gf_mul(__m128i a, __m128i b) {
    __m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;

    tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
    tmp6 = _mm_clmulepi64_si128(a, b, 0x11);

    tmp4 = _mm_shuffle_epi32(a, 78);
    tmp5 = _mm_shuffle_epi32(b, 78);
    tmp4 = _mm_xor_si128(tmp4, a);
    tmp5 = _mm_xor_si128(tmp5, b);

    tmp4 = _mm_clmulepi64_si128(tmp4, tmp5, 0x00);
    tmp4 = _mm_xor_si128(tmp4, tmp3);
    tmp4 = _mm_xor_si128(tmp4, tmp6);

    tmp5 = _mm_slli_si128(tmp4, 8);
    tmp4 = _mm_srli_si128(tmp4, 8);
    tmp3 = _mm_xor_si128(tmp3, tmp5);
    tmp6 = _mm_xor_si128(tmp6, tmp4);

    // Reduction
    tmp7 = _mm_srli_epi32(tmp3, 31);
    tmp8 = _mm_srli_epi32(tmp6, 31);
    tmp3 = _mm_slli_epi32(tmp3, 1);
    tmp6 = _mm_slli_epi32(tmp6, 1);

    tmp9 = _mm_srli_si128(tmp7, 12);
    tmp8 = _mm_slli_si128(tmp8, 4);
    tmp7 = _mm_slli_si128(tmp7, 4);
    tmp3 = _mm_or_si128(tmp3, tmp7);
    tmp6 = _mm_or_si128(tmp6, tmp8);
    tmp6 = _mm_or_si128(tmp6, tmp9);

    tmp7 = _mm_slli_epi32(tmp3, 31);
    tmp8 = _mm_slli_epi32(tmp3, 30);
    tmp9 = _mm_slli_epi32(tmp3, 25);

    tmp7 = _mm_xor_si128(tmp7, tmp8);
    tmp7 = _mm_xor_si128(tmp7, tmp9);
    tmp8 = _mm_srli_si128(tmp7, 4);
    tmp7 = _mm_slli_si128(tmp7, 12);
    tmp3 = _mm_xor_si128(tmp3, tmp7);

    tmp2 = _mm_srli_epi32(tmp3, 1);
    tmp4 = _mm_srli_epi32(tmp3, 2);
    tmp5 = _mm_srli_epi32(tmp3, 7);
    tmp2 = _mm_xor_si128(tmp2, tmp4);
    tmp2 = _mm_xor_si128(tmp2, tmp5);
    tmp2 = _mm_xor_si128(tmp2, tmp8);
    tmp3 = _mm_xor_si128(tmp3, tmp2);
    tmp6 = _mm_xor_si128(tmp6, tmp3);

    return tmp6;
}

// Initialize GCM context
int aes128_gcm_init(aes128_gcm_context *ctx, const uint8_t *key) {
    detect_cpu_features();

    if (!has_aesni || !has_pclmulqdq) {
        fprintf(stderr, "Error: CPU does not support AES-NI and PCLMULQDQ\n");
        return -1;
    }

    // Expand the key
    aes128_key_expansion(key, &ctx->key_schedule);

    // Compute H = E(K, 0^128)
    __m128i zero = _mm_setzero_si128();
    ctx->H = aes128_encrypt_block(zero, &ctx->key_schedule);
    ctx->H = reverse_bytes(ctx->H);

    // Precompute powers of H for faster GHASH (not used by the single-block ghash() below)
    ctx->H_powers[0] = ctx->H;
    for (int i = 1; i < 8; i++) {
        ctx->H_powers[i] = gf_mul(ctx->H_powers[i-1], ctx->H);
    }

    return 0;
}

// GHASH computation (same as AES-256-GCM)
static void ghash(const aes128_gcm_context *ctx, const uint8_t *aad, size_t aad_len,
                  const uint8_t *ciphertext, size_t ct_len, __m128i *tag) {
    __m128i hash = _mm_setzero_si128();
    size_t i;

    // Process AAD
    for (i = 0; i + 16 <= aad_len; i += 16) {
        __m128i block = _mm_loadu_si128((__m128i*)(aad + i));
        block = reverse_bytes(block);
        hash = _mm_xor_si128(hash, block);
        hash = gf_mul(hash, ctx->H);
    }

    // Handle remaining AAD bytes
    if (i < aad_len) {
        uint8_t temp[16] = {0};
        memcpy(temp, aad + i, aad_len - i);
        __m128i block = _mm_loadu_si128((__m128i*)temp);
        block = reverse_bytes(block);
        hash = _mm_xor_si128(hash, block);
        hash = gf_mul(hash, ctx->H);
    }

    // Process ciphertext
    for (i = 0; i + 16 <= ct_len; i += 16) {
        __m128i block = _mm_loadu_si128((__m128i*)(ciphertext + i));
        block = reverse_bytes(block);
        hash = _mm_xor_si128(hash, block);
        hash = gf_mul(hash, ctx->H);
    }

    // Handle remaining ciphertext bytes
    if (i < ct_len) {
        uint8_t temp[16] = {0};
        memcpy(temp, ciphertext + i, ct_len - i);
        __m128i block = _mm_loadu_si128((__m128i*)temp);
        block = reverse_bytes(block);
        hash = _mm_xor_si128(hash, block);
        hash = gf_mul(hash, ctx->H);
    }

    // Process length block: [len(AAD)]_64 || [len(C)]_64 in bits (big-endian per the spec).
    // In the byte-reflected domain used by gf_mul this is ct_bits in the low 64-bit lane
    // and aad_bits in the high lane, with no extra byte swap.
    uint64_t aad_bits = aad_len * 8;
    uint64_t ct_bits = ct_len * 8;
    __m128i len_block = _mm_set_epi64x(aad_bits, ct_bits);
    hash = _mm_xor_si128(hash, len_block);
    hash = gf_mul(hash, ctx->H);

    *tag = reverse_bytes(hash);
}

// Increment counter: big-endian 32-bit increment of the rightmost 32 bits (inc32 in SP 800-38D)
static inline __m128i
inc_counter(__m128i counter) {
    // The counter field occupies the last four bytes of the block (lane 3)
    // and is a big-endian 32-bit integer, so byte-swap around the increment.
    uint32_t ctr = (uint32_t)_mm_extract_epi32(counter, 3);
    ctr = __builtin_bswap32(__builtin_bswap32(ctr) + 1);
    counter = _mm_insert_epi32(counter, (int)ctr, 3);
    return counter;
}

// AES-128-GCM Encryption
int aes128_gcm_encrypt(aes128_gcm_context *ctx,
                       const uint8_t *iv, size_t iv_len,
                       const uint8_t *aad, size_t aad_len,
                       const uint8_t *plaintext, size_t pt_len,
                       uint8_t *ciphertext,
                       uint8_t *tag, size_t tag_len) {
    if (tag_len > 16) {
        return -1;
    }

    // Prepare initial counter block
    __m128i counter;
    if (iv_len == 12) {
        // Standard case: 96-bit IV, J0 = IV || 0^31 || 1
        uint8_t j0[16];
        memcpy(j0, iv, 12);
        j0[12] = 0; j0[13] = 0; j0[14] = 0; j0[15] = 1;
        counter = _mm_loadu_si128((__m128i*)j0);
    } else {
        // Non-standard IV length: derive J0 with GHASH
        __m128i hash = _mm_setzero_si128();
        size_t i;
        for (i = 0; i + 16 <= iv_len; i += 16) {
            __m128i block = _mm_loadu_si128((__m128i*)(iv + i));
            block = reverse_bytes(block);
            hash = _mm_xor_si128(hash, block);
            hash = gf_mul(hash, ctx->H);
        }
        if (i < iv_len) {
            uint8_t temp[16] = {0};
            memcpy(temp, iv + i, iv_len - i);
            __m128i block = _mm_loadu_si128((__m128i*)temp);
            block = reverse_bytes(block);
            hash = _mm_xor_si128(hash, block);
            hash = gf_mul(hash, ctx->H);
        }
        // Length block: 0^64 || [len(IV)]_64 in bits; in the byte-reflected domain
        // this is iv_bits in the low 64-bit lane.
        uint64_t iv_bits = iv_len * 8;
        __m128i len_block = _mm_set_epi64x(0, iv_bits);
        hash = _mm_xor_si128(hash, len_block);
        hash = gf_mul(hash, ctx->H);
        counter = reverse_bytes(hash);
    }

    // Save J0 for tag computation
    __m128i J0 = counter;

    // Encrypt plaintext
    size_t i;
    for (i = 0; i + 16 <= pt_len; i += 16) {
        counter = inc_counter(counter);
        __m128i keystream = aes128_encrypt_block(counter, &ctx->key_schedule);
        __m128i pt_block = _mm_loadu_si128((__m128i*)(plaintext + i));
        __m128i ct_block = _mm_xor_si128(pt_block, keystream);
        _mm_storeu_si128((__m128i*)(ciphertext + i), ct_block);
    }

    // Handle remaining bytes
    if (i < pt_len) {
        counter = inc_counter(counter);
        __m128i keystream = aes128_encrypt_block(counter, &ctx->key_schedule);
        uint8_t ks_bytes[16];
        _mm_storeu_si128((__m128i*)ks_bytes, keystream);
        for (size_t j = 0; j < pt_len - i; j++) {
            ciphertext[i + j] = plaintext[i + j] ^ ks_bytes[j];
        }
    }

    // Compute authentication tag
    __m128i auth_tag;
    ghash(ctx, aad, aad_len, ciphertext, pt_len, &auth_tag);

    // Encrypt the tag with J0
    __m128i J0_keystream = aes128_encrypt_block(J0, &ctx->key_schedule);
    auth_tag = _mm_xor_si128(auth_tag, J0_keystream);

    // Store only the requested number of tag bytes
    uint8_t full_tag[16];
    _mm_storeu_si128((__m128i*)full_tag, auth_tag);
    memcpy(tag, full_tag, tag_len);

    return 0;
}

// AES-128-GCM Decryption
int aes128_gcm_decrypt(aes128_gcm_context *ctx,
                       const uint8_t *iv, size_t iv_len,
                       const uint8_t *aad, size_t aad_len,
                       const uint8_t *ciphertext, size_t ct_len,
                       const uint8_t *tag, size_t tag_len,
                       uint8_t *plaintext) {
    if (tag_len > 16) {
        return -1;
    }

    // Prepare initial counter block (same as encryption)
    __m128i counter;
    if (iv_len == 12) {
        uint8_t j0[16];
        memcpy(j0, iv, 12);
        j0[12] = 0; j0[13] = 0; j0[14] = 0; j0[15] = 1;
        counter = _mm_loadu_si128((__m128i*)j0);
    } else {
        __m128i hash = _mm_setzero_si128();
        size_t i;
        for (i = 0; i + 16 <= iv_len; i += 16) {
            __m128i block = _mm_loadu_si128((__m128i*)(iv + i));
            block = reverse_bytes(block);
            hash = _mm_xor_si128(hash, block);
            hash = gf_mul(hash, ctx->H);
        }
        if (i < iv_len) {
            uint8_t temp[16] = {0};
            memcpy(temp, iv + i, iv_len - i);
            __m128i block = _mm_loadu_si128((__m128i*)temp);
            block = reverse_bytes(block);
            hash = _mm_xor_si128(hash, block);
            hash = gf_mul(hash, ctx->H);
        }
        // Length block: 0^64 || [len(IV)]_64 in bits (see aes128_gcm_encrypt)
        uint64_t iv_bits = iv_len * 8;
        __m128i len_block = _mm_set_epi64x(0, iv_bits);
        hash = _mm_xor_si128(hash, len_block);
        hash = gf_mul(hash, ctx->H);
        counter = reverse_bytes(hash);
    }

    __m128i J0 = counter;

    // Verify authentication tag first
    __m128i computed_tag;
    ghash(ctx, aad, aad_len, ciphertext, ct_len, &computed_tag);
    __m128i J0_keystream = aes128_encrypt_block(J0, &ctx->key_schedule);
    computed_tag = _mm_xor_si128(computed_tag, J0_keystream);

    // Compare tags (constant-time comparison: accumulate differences, no early exit)
    uint8_t computed_tag_bytes[16];
    _mm_storeu_si128((__m128i*)computed_tag_bytes, computed_tag);

    uint8_t diff = 0;
    for (size_t i = 0; i < tag_len; i++) {
        diff |= (uint8_t)(computed_tag_bytes[i] ^ tag[i]);
    }

    if (diff != 0) {
        return -1;  // Authentication failed
    }

    // Decrypt ciphertext
    size_t i;
    for (i = 0; i + 16 <= ct_len; i += 16) {
        counter = inc_counter(counter);
        __m128i keystream = aes128_encrypt_block(counter, &ctx->key_schedule);
        __m128i ct_block = _mm_loadu_si128((__m128i*)(ciphertext + i));
        __m128i pt_block = _mm_xor_si128(ct_block, keystream);
        _mm_storeu_si128((__m128i*)(plaintext + i), pt_block);
    }

    // Handle remaining bytes
    if (i < ct_len) {
        counter = inc_counter(counter);
        __m128i keystream = aes128_encrypt_block(counter, &ctx->key_schedule);
        uint8_t ks_bytes[16];
        _mm_storeu_si128((__m128i*)ks_bytes, keystream);
        for (size_t j = 0; j < ct_len - i; j++) {
            plaintext[i + j] = ciphertext[i + j] ^ ks_bytes[j];
        }
    }

    return 0;
}

/**
 * Clean up AES-128-GCM context
 * Zeros all sensitive key material
 */
void aes128_gcm_cleanup(aes128_gcm_context *ctx) {
    if (ctx == NULL) return;

    // Zero all sensitive data using volatile to prevent compiler optimization
    volatile uint8_t *p = (volatile uint8_t *)ctx;
    size_t n = sizeof(aes128_gcm_context);
    while (n--) {
        *p++ = 0;
    }
}
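
/*
 * Example usage (illustrative sketch only, not part of the library).
 * Assumes the aes128_gcm_* declarations above are exposed by AES-128-GCM.h.
 * The key, IV, AAD, and message below are arbitrary placeholders, not official
 * test vectors, and AES128_GCM_EXAMPLE is a guard macro introduced only for
 * this sketch. A possible build command (flags are an assumption):
 *   gcc -O2 -maes -mpclmul -msse4.1 -DAES128_GCM_EXAMPLE AES-128-GCM.c
 */
#ifdef AES128_GCM_EXAMPLE
int main(void) {
    uint8_t key[16] = {0};                      // placeholder 128-bit key
    uint8_t iv[12] = {0};                       // placeholder 96-bit IV
    const uint8_t aad[] = "header";             // authenticated-only data
    const uint8_t msg[] = "hello, AES-128-GCM"; // plaintext (NUL included)
    uint8_t ct[sizeof(msg)], pt[sizeof(msg)], tag[16];

    aes128_gcm_context ctx;
    if (aes128_gcm_init(&ctx, key) != 0) {
        return 1;  // CPU lacks AES-NI/PCLMULQDQ
    }

    // Encrypt and authenticate, producing ciphertext and a 16-byte tag
    aes128_gcm_encrypt(&ctx, iv, sizeof(iv), aad, sizeof(aad) - 1,
                       msg, sizeof(msg), ct, tag, sizeof(tag));

    // Decrypt; returns -1 if the tag does not verify
    int rc = aes128_gcm_decrypt(&ctx, iv, sizeof(iv), aad, sizeof(aad) - 1,
                                ct, sizeof(msg), tag, sizeof(tag), pt);
    printf("decrypt %s, roundtrip %s\n",
           rc == 0 ? "ok" : "failed",
           (rc == 0 && memcmp(pt, msg, sizeof(msg)) == 0) ? "ok" : "failed");

    aes128_gcm_cleanup(&ctx);
    return rc == 0 ? 0 : 1;
}
#endif /* AES128_GCM_EXAMPLE */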