SHA256.c (15241B)
/*
 * SHA-256 Implementation with Intel SHA Extensions
 * Hardware accelerated when available, falls back to software
 *
 * Public API: sha256_init / sha256_update / sha256_final for streaming use,
 * and sha256() for one-shot hashing. Define LIB_MODE to omit the self-test
 * main().
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

/*
 * The x86 intrinsic headers are only pulled in on x86 targets so the file
 * still builds (software path only) everywhere else.
 */
#if defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86)
#define SHA256_ARCH_X86 1
#include <immintrin.h>
#else
#define SHA256_ARCH_X86 0
#endif

/*
 * Decide whether the SHA-NI routine can be compiled.
 *
 * With a per-function target attribute the routine is built with SHA, SSSE3
 * and SSE4.1 enabled while the rest of the file keeps the baseline
 * instruction set, so the runtime CPUID check below is what selects it.
 * Compilers without that attribute only get the routine when the whole file
 * is built with SHA support (__SHA__ defined).
 */
#if SHA256_ARCH_X86 && defined(__GNUC__)
#include <cpuid.h>
#if defined(__clang__) || (__GNUC__ >= 5)
#define SHA256_HAVE_HW 1
#define SHA256_HW_TARGET __attribute__((target("sha,sse4.1,ssse3")))
#elif defined(__SHA__)
#define SHA256_HAVE_HW 1
#define SHA256_HW_TARGET
#else
#define SHA256_HAVE_HW 0
#endif
#else
#define SHA256_HAVE_HW 0
#endif

enum {
    SHA256_BLOCK_SIZE = 64,  // bytes consumed per compression call
    SHA256_DIGEST_SIZE = 32  // bytes produced by sha256_final
};

// SHA-256 context
typedef struct {
    uint32_t state[8];   // running hash value H0..H7
    uint64_t count;      // total number of message bytes absorbed so far
    uint8_t buffer[64];  // holds a partial block between update calls
} sha256_context;

// CPU feature detection
// Plain ints with no synchronization: has_sha_ext is non-zero only when the
// hardware routine is compiled in AND the CPU reports the needed features.
static int has_sha_ext = 0;
static int features_detected = 0;

#if SHA256_HAVE_HW
// CPUID feature bits (leaf 1 ECX, and leaf 7 sub-leaf 0 EBX).
enum {
    CPUID_1_ECX_SSSE3 = 1 << 9,
    CPUID_1_ECX_SSE41 = 1 << 19,
    CPUID_7_EBX_SHA = 1 << 29
};
#endif

/*
 * Probe the CPU once and cache the result in has_sha_ext.
 * Later calls return immediately. When the hardware routine is not compiled
 * in, has_sha_ext simply stays 0.
 */
static void detect_cpu_features(void) {
    if (features_detected) {
        return;
    }
#if SHA256_HAVE_HW
    unsigned int eax, ebx, ecx, edx;
    // The hardware routine also uses pshufb/palignr (SSSE3) and pblendw
    // (SSE4.1), so require those as well as the SHA bit.
    if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) &&
        (ecx & CPUID_1_ECX_SSSE3) != 0 &&
        (ecx & CPUID_1_ECX_SSE41) != 0 &&
        __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
        has_sha_ext = (ebx & CPUID_7_EBX_SHA) != 0;
    }
#endif
    features_detected = 1;
}

// SHA-256 round constants
static const uint32_t K[64] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

// Rotate a 32-bit word right by n bits; every caller passes 1 <= n <= 31.
static inline uint32_t rotr32(uint32_t x, unsigned n) {
    return (x >> n) | (x << (32 - n));
}

// SHA-256 logical functions
static inline uint32_t ch(uint32_t x, uint32_t y, uint32_t z) {
    return (x & y) ^ (~x & z);
}

static inline uint32_t maj(uint32_t x, uint32_t y, uint32_t z) {
    return (x & y) ^ (x & z) ^ (y & z);
}

// "Big sigma" functions, applied to the working variables a and e.
static inline uint32_t big_sigma0(uint32_t x) {
    return rotr32(x, 2) ^ rotr32(x, 13) ^ rotr32(x, 22);
}

static inline uint32_t big_sigma1(uint32_t x) {
    return rotr32(x, 6) ^ rotr32(x, 11) ^ rotr32(x, 25);
}

// "Small sigma" functions, used to expand the message schedule.
static inline uint32_t small_sigma0(uint32_t x) {
    return rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3);
}

static inline uint32_t small_sigma1(uint32_t x) {
    return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10);
}

/*
 * Software implementation: fold one 64-byte block into state.
 *
 * state  eight 32-bit chaining words, updated in place
 * block  64 message bytes, read as sixteen big-endian 32-bit words
 */
static void sha256_transform_software(uint32_t state[8], const uint8_t block[64]) {
    uint32_t W[64];
    uint32_t a, b, c, d, e, f, g, h;

    // Prepare message schedule: 16 words from the block, 48 derived words.
    for (int i = 0; i < 16; i++) {
        const uint8_t *p = block + i * 4;
        W[i] = ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
               ((uint32_t)p[2] << 8) | (uint32_t)p[3];
    }
    for (int i = 16; i < 64; i++) {
        W[i] = small_sigma1(W[i - 2]) + W[i - 7] +
               small_sigma0(W[i - 15]) + W[i - 16];
    }

    // Initialize working variables
    a = state[0]; b = state[1]; c = state[2]; d = state[3];
    e = state[4]; f = state[5]; g = state[6]; h = state[7];

    // Main loop
    for (int i = 0; i < 64; i++) {
        uint32_t t1 = h + big_sigma1(e) + ch(e, f, g) + K[i] + W[i];
        uint32_t t2 = big_sigma0(a) + maj(a, b, c);
        h = g; g = f; f = e; e = d + t1;
        d = c; c = b; b = a; a = t1 + t2;
    }

    // Update state
    state[0] += a; state[1] += b; state[2] += c; state[3] += d;
    state[4] += e; state[5] += f; state[6] += g; state[7] += h;
}

#if SHA256_HAVE_HW
/*
 * Hardware accelerated implementation (Intel SHA extensions): fold one
 * 64-byte block into state. Same contract as sha256_transform_software.
 *
 * Must only be called when detect_cpu_features() has set has_sha_ext.
 * Lane comments list 32-bit lanes from most to least significant.
 */
SHA256_HW_TARGET
static void sha256_transform_hardware(uint32_t state[8], const uint8_t block[64]) {
    // pshufb control that byte-swaps each 32-bit lane (big-endian load).
    const __m128i BSWAP_MASK =
        _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
    __m128i STATE0, STATE1;
    __m128i MSG, TMP;
    __m128i MSG0, MSG1, MSG2, MSG3;
    __m128i ABEF_SAVE, CDGH_SAVE;

    // Load initial state and repack it into the ABEF/CDGH layout that
    // sha256rnds2 operates on.
    TMP = _mm_loadu_si128((const __m128i *)&state[0]);
    STATE1 = _mm_loadu_si128((const __m128i *)&state[4]);

    TMP = _mm_shuffle_epi32(TMP, 0xB1);          // CDAB
    STATE1 = _mm_shuffle_epi32(STATE1, 0x1B);    // EFGH
    STATE0 = _mm_alignr_epi8(TMP, STATE1, 8);    // ABEF
    STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH

    // Save current state
    ABEF_SAVE = STATE0;
    CDGH_SAVE = STATE1;

    // Load message and reverse byte order within each word
    MSG0 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(block + 0)), BSWAP_MASK);
    MSG1 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(block + 16)), BSWAP_MASK);
    MSG2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(block + 32)), BSWAP_MASK);
    MSG3 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(block + 48)), BSWAP_MASK);

    // Each group below runs four rounds: MSG holds W+K for four rounds,
    // sha256rnds2 consumes the low two lanes, and the 0x0E shuffle moves the
    // high two lanes down for the second sha256rnds2.

    // Rounds 0-3
    MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    // Rounds 4-7
    MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);

    // Rounds 8-11
    MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);

    // Rounds 12-15
    MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
    MSG0 = _mm_add_epi32(MSG0, TMP);
    MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);

    // Rounds 16-19
    MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
    MSG1 = _mm_add_epi32(MSG1, TMP);
    MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);

    // Rounds 20-23
    MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
    MSG2 = _mm_add_epi32(MSG2, TMP);
    MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);

    // Rounds 24-27
    MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
    MSG3 = _mm_add_epi32(MSG3, TMP);
    MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);

    // Rounds 28-31
    MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
    MSG0 = _mm_add_epi32(MSG0, TMP);
    MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);

    // Rounds 32-35
    MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
    MSG1 = _mm_add_epi32(MSG1, TMP);
    MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);

    // Rounds 36-39
    MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
    MSG2 = _mm_add_epi32(MSG2, TMP);
    MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);

    // Rounds 40-43
    MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
    MSG3 = _mm_add_epi32(MSG3, TMP);
    MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);

    // Rounds 44-47
    MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
    MSG0 = _mm_add_epi32(MSG0, TMP);
    MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);

    // Rounds 48-51
    MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
    MSG1 = _mm_add_epi32(MSG1, TMP);
    MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);

    // Rounds 52-55
    MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
    MSG2 = _mm_add_epi32(MSG2, TMP);
    MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    // Rounds 56-59
    MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
    MSG3 = _mm_add_epi32(MSG3, TMP);
    MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    // Rounds 60-63
    MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    // Add to state
    STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
    STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);

    // Repack ABEF/CDGH back into memory order and save state
    TMP = _mm_shuffle_epi32(STATE0, 0x1B);       // FEBA
    STATE1 = _mm_shuffle_epi32(STATE1, 0xB1);    // DCHG
    STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
    STATE1 = _mm_alignr_epi8(STATE1, TMP, 8);    // HGFE

    _mm_storeu_si128((__m128i *)&state[0], STATE0);
    _mm_storeu_si128((__m128i *)&state[4], STATE1);
}
#endif

// Compress one block with the hardware routine when it is compiled in and
// the CPU check passed, otherwise with the software routine.
static void sha256_transform(uint32_t state[8], const uint8_t block[64]) {
#if SHA256_HAVE_HW
    if (has_sha_ext) {
        sha256_transform_hardware(state, block);
        return;
    }
#endif
    sha256_transform_software(state, block);
}

/*
 * Initialize SHA-256 context.
 * Resets the chaining state and byte count, and runs CPU feature detection
 * if it has not run yet. Must be called before sha256_update/sha256_final.
 */
void sha256_init(sha256_context *ctx) {
    static const uint32_t initial_state[8] = {
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
    };

    memcpy(ctx->state, initial_state, sizeof initial_state);
    ctx->count = 0;

    detect_cpu_features();
}

/*
 * Update SHA-256 with data.
 *
 * ctx   context previously set up by sha256_init
 * data  len bytes of message; may be NULL only when len is 0
 * len   number of bytes to absorb; 0 is a no-op
 *
 * May be called any number of times; whole blocks are compressed immediately
 * and any remainder is kept in ctx->buffer for the next call.
 */
void sha256_update(sha256_context *ctx, const uint8_t *data, size_t len) {
    // Nothing to do, and this keeps a NULL data pointer away from memcpy.
    if (len == 0) {
        return;
    }

    size_t index = (size_t)(ctx->count & (SHA256_BLOCK_SIZE - 1));
    ctx->count += len;

    // Top up a partially filled buffer first.
    if (index != 0) {
        size_t part_len = SHA256_BLOCK_SIZE - index;
        if (len < part_len) {
            memcpy(&ctx->buffer[index], data, len);
            return;
        }
        memcpy(&ctx->buffer[index], data, part_len);
        sha256_transform(ctx->state, ctx->buffer);
        data += part_len;
        len -= part_len;
    }

    // Compress whole blocks straight from the caller's memory.
    while (len >= SHA256_BLOCK_SIZE) {
        sha256_transform(ctx->state, data);
        data += SHA256_BLOCK_SIZE;
        len -= SHA256_BLOCK_SIZE;
    }

    // Stash the tail for the next update/final call.
    if (len > 0) {
        memcpy(ctx->buffer, data, len);
    }
}

/*
 * Finalize SHA-256 and produce digest.
 *
 * Appends the 0x80 marker, zero padding and the 64-bit big-endian message
 * length in bits, then writes the eight state words big-endian into digest.
 * The context has absorbed the padding afterwards; call sha256_init before
 * reusing it.
 */
void sha256_final(sha256_context *ctx, uint8_t digest[32]) {
    // 0x80 followed by zeros; pad_len below never exceeds 64 bytes.
    static const uint8_t padding[SHA256_BLOCK_SIZE] = { 0x80 };
    uint8_t bits[8];
    const uint64_t bit_count = ctx->count * 8;

    // Encode bit count (captured before the padding updates change count).
    for (int i = 0; i < 8; i++) {
        bits[i] = (uint8_t)(bit_count >> (56 - 8 * i));
    }

    // Pad so that exactly 8 bytes remain in the final block for the length.
    size_t index = (size_t)(ctx->count & (SHA256_BLOCK_SIZE - 1));
    size_t pad_len = (index < 56) ? (56 - index) : (120 - index);

    sha256_update(ctx, padding, pad_len);
    sha256_update(ctx, bits, sizeof bits);

    // Store digest
    for (int i = 0; i < 8; i++) {
        uint32_t word = ctx->state[i];
        digest[i * 4] = (uint8_t)(word >> 24);
        digest[i * 4 + 1] = (uint8_t)(word >> 16);
        digest[i * 4 + 2] = (uint8_t)(word >> 8);
        digest[i * 4 + 3] = (uint8_t)word;
    }
}

// One-shot SHA-256: hash len bytes at data into the 32-byte digest.
void sha256(const uint8_t *data, size_t len, uint8_t digest[32]) {
    sha256_context ctx;
    sha256_init(&ctx);
    sha256_update(&ctx, data, len);
    sha256_final(&ctx, digest);
}

// Test program
#ifndef LIB_MODE
// Return non-zero when digest, rendered as lowercase hex, equals expected_hex.
static int digest_matches_hex(const uint8_t digest[32], const char *expected_hex) {
    char hex[2 * SHA256_DIGEST_SIZE + 1];
    for (int i = 0; i < SHA256_DIGEST_SIZE; i++) {
        snprintf(hex + 2 * i, 3, "%02x", digest[i]);
    }
    return strcmp(hex, expected_hex) == 0;
}

/*
 * Print the digest of each test vector next to its expected value.
 * Returns EXIT_SUCCESS when every digest matches, EXIT_FAILURE otherwise.
 */
int main(void) {
    // Test vectors
    static const struct {
        const char *message;
        const char *expected;
    } vectors[] = {
        { "",
          "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" },
        { "abc",
          "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad" },
        { "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq",
          "248d6a61d20638b8e5c026930c3e6039a33ce45964ff2167f6ecedd419db06c1" }
    };
    int failures = 0;

    detect_cpu_features();

    printf("SHA-256 Implementation\n");
    printf("======================\n");
    printf("Hardware acceleration (SHA extensions): %s\n\n", has_sha_ext ? "Yes" : "No");

    for (size_t t = 0; t < sizeof vectors / sizeof vectors[0]; t++) {
        uint8_t digest[32];

        printf("Test %zu: \"%s\"\n", t + 1, vectors[t].message);
        sha256((const uint8_t *)vectors[t].message, strlen(vectors[t].message), digest);
        printf("SHA-256: ");
        for (int i = 0; i < 32; i++) {
            printf("%02x", digest[i]);
        }
        printf("\n");
        printf("Expected: %s\n\n", vectors[t].expected);

        if (!digest_matches_hex(digest, vectors[t].expected)) {
            failures++;
        }
    }

    return failures == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}
#endif