P256.c (32032B)
1 /* 2 * P-256 (secp256r1) Complete Implementation 3 * NIST P-256 elliptic curve for ECDH and ECDSA 4 * 5 * Curve equation: y^2 = x^3 - 3x + b (mod p) 6 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1 7 * 8 * This implementation uses: 9 * - Jacobian coordinates for efficiency 10 * - Montgomery reduction for modular arithmetic 11 * - Constant-time operations where possible 12 */ 13 14 #include "P256.h" 15 #include "CSPRNG.h" 16 #include "hashing/hash.h" 17 #include <string.h> 18 #include <stdint.h> 19 20 /* Field element: 256-bit integer as 4 x 64-bit limbs (little-endian) */ 21 typedef uint64_t fe256[4]; 22 23 /* P-256 prime: p = 2^256 - 2^224 + 2^192 + 2^96 - 1 */ 24 static const fe256 p256_p = { 25 0xFFFFFFFFFFFFFFFFULL, 0x00000000FFFFFFFFULL, 26 0x0000000000000000ULL, 0xFFFFFFFF00000001ULL 27 }; 28 29 /* P-256 curve parameter b */ 30 static const fe256 p256_b = { 31 0x3BCE3C3E27D2604BULL, 0x651D06B0CC53B0F6ULL, 32 0xB3EBBD55769886BCULL, 0x5AC635D8AA3A93E7ULL 33 }; 34 35 /* P-256 base point G (generator) - x coordinate */ 36 static const fe256 p256_gx = { 37 0xF4A13945D898C296ULL, 0x77037D812DEB33A0ULL, 38 0xF8BCE6E563A440F2ULL, 0x6B17D1F2E12C4247ULL 39 }; 40 41 /* P-256 base point G - y coordinate */ 42 static const fe256 p256_gy = { 43 0xCBB6406837BF51F5ULL, 0x2BCE33576B315ECEULL, 44 0x8EE7EB4A7C0F9E16ULL, 0x4FE342E2FE1A7F9BULL 45 }; 46 47 /* P-256 order n (number of points on curve) */ 48 static const fe256 p256_n = { 49 0xF3B9CAC2FC632551ULL, 0xBCE6FAADA7179E84ULL, 50 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFF00000000ULL 51 }; 52 53 /* Point on P-256 curve (Jacobian coordinates: (X:Y:Z) represents (X/Z^2, Y/Z^3)) */ 54 typedef struct { 55 fe256 x, y, z; 56 } p256_point; 57 58 /* ============================================================================ 59 * Field Arithmetic (mod p) 60 * ========================================================================= */ 61 62 /* Copy field element */ 63 static void fe256_copy(fe256 out, const fe256 in) { 64 out[0] = in[0]; out[1] = in[1]; out[2] = in[2]; out[3] = in[3]; 65 } 66 67 /* Set to zero */ 68 static void fe256_zero(fe256 out) { 69 out[0] = out[1] = out[2] = out[3] = 0; 70 } 71 72 /* Set to one */ 73 static void fe256_one(fe256 out) { 74 out[0] = 1; out[1] = 0; out[2] = 0; out[3] = 0; 75 } 76 77 /* Compare: return 1 if equal, 0 otherwise */ 78 static int fe256_equal(const fe256 a, const fe256 b) { 79 uint64_t diff = 0; 80 diff |= a[0] ^ b[0]; 81 diff |= a[1] ^ b[1]; 82 diff |= a[2] ^ b[2]; 83 diff |= a[3] ^ b[3]; 84 return (diff == 0); 85 } 86 87 /* Check if zero */ 88 static int fe256_is_zero(const fe256 a) { 89 uint64_t z = a[0] | a[1] | a[2] | a[3]; 90 return (z == 0); 91 } 92 93 /* Modular reduction for P-256 using special prime structure */ 94 static void fe256_reduce(fe256 out) { 95 /* P-256 prime: p = 2^256 - 2^224 + 2^192 + 2^96 - 1 */ 96 /* If out >= p, subtract p */ 97 98 uint64_t borrow = 0; 99 uint64_t tmp[4]; 100 101 /* tmp = out - p */ 102 tmp[0] = out[0] - p256_p[0] - borrow; 103 borrow = (out[0] < p256_p[0] + borrow) ? 1 : 0; 104 105 tmp[1] = out[1] - p256_p[1] - borrow; 106 borrow = (out[1] < p256_p[1] + borrow) ? 1 : 0; 107 108 tmp[2] = out[2] - p256_p[2] - borrow; 109 borrow = (out[2] < p256_p[2] + borrow) ? 1 : 0; 110 111 tmp[3] = out[3] - p256_p[3] - borrow; 112 borrow = (out[3] < p256_p[3] + borrow) ? 1 : 0; 113 114 /* If no borrow, use tmp (out >= p), else keep out */ 115 uint64_t mask = -(1 - borrow); /* 0xFFFFFFFFFFFFFFFF if borrow=0, else 0 */ 116 out[0] = (tmp[0] & mask) | (out[0] & ~mask); 117 out[1] = (tmp[1] & mask) | (out[1] & ~mask); 118 out[2] = (tmp[2] & mask) | (out[2] & ~mask); 119 out[3] = (tmp[3] & mask) | (out[3] & ~mask); 120 } 121 122 /* Modular addition: out = (a + b) mod p */ 123 static void fe256_add(fe256 out, const fe256 a, const fe256 b) { 124 uint64_t carry = 0; 125 126 /* Add with carry */ 127 uint64_t sum = a[0] + b[0]; 128 out[0] = sum; 129 carry = (sum < a[0]) ? 1 : 0; 130 131 sum = a[1] + b[1] + carry; 132 out[1] = sum; 133 carry = (sum < a[1] || (sum == a[1] && carry)) ? 1 : 0; 134 135 sum = a[2] + b[2] + carry; 136 out[2] = sum; 137 carry = (sum < a[2] || (sum == a[2] && carry)) ? 1 : 0; 138 139 sum = a[3] + b[3] + carry; 140 out[3] = sum; 141 142 /* Reduce if needed */ 143 fe256_reduce(out); 144 } 145 146 /* Modular subtraction: out = (a - b) mod p */ 147 static void fe256_sub(fe256 out, const fe256 a, const fe256 b) { 148 uint64_t borrow = 0; 149 uint64_t diff; 150 151 /* Subtract with borrow */ 152 diff = a[0] - b[0]; 153 out[0] = diff; 154 borrow = (a[0] < b[0]) ? 1 : 0; 155 156 diff = a[1] - b[1] - borrow; 157 out[1] = diff; 158 borrow = (a[1] < b[1] + borrow) ? 1 : 0; 159 160 diff = a[2] - b[2] - borrow; 161 out[2] = diff; 162 borrow = (a[2] < b[2] + borrow) ? 1 : 0; 163 164 diff = a[3] - b[3] - borrow; 165 out[3] = diff; 166 borrow = (a[3] < b[3] + borrow) ? 1 : 0; 167 168 /* If borrow, add p */ 169 if (borrow) { 170 uint64_t carry = 0; 171 uint64_t sum = out[0] + p256_p[0]; 172 out[0] = sum; 173 carry = (sum < out[0]) ? 1 : 0; 174 175 sum = out[1] + p256_p[1] + carry; 176 out[1] = sum; 177 carry = (sum < out[1] || (sum == out[1] && carry)) ? 1 : 0; 178 179 sum = out[2] + p256_p[2] + carry; 180 out[2] = sum; 181 carry = (sum < out[2] || (sum == out[2] && carry)) ? 1 : 0; 182 183 sum = out[3] + p256_p[3] + carry; 184 out[3] = sum; 185 } 186 } 187 188 /* 128-bit multiplication helper (portable without __uint128_t) */ 189 static void mul64x64(uint64_t *hi, uint64_t *lo, uint64_t a, uint64_t b) { 190 /* Split 64-bit values into 32-bit halves */ 191 uint32_t a_lo = (uint32_t)a; 192 uint32_t a_hi = (uint32_t)(a >> 32); 193 uint32_t b_lo = (uint32_t)b; 194 uint32_t b_hi = (uint32_t)(b >> 32); 195 196 /* Perform 4 32x32 -> 64 multiplications */ 197 uint64_t p0 = (uint64_t)a_lo * b_lo; /* Low x Low */ 198 uint64_t p1 = (uint64_t)a_lo * b_hi; /* Low x High */ 199 uint64_t p2 = (uint64_t)a_hi * b_lo; /* High x Low */ 200 uint64_t p3 = (uint64_t)a_hi * b_hi; /* High x High */ 201 202 /* Combine partial products */ 203 uint64_t mid = p1 + p2; 204 uint64_t carry = (mid < p1) ? 1ULL : 0ULL; /* Detect overflow */ 205 206 uint64_t lo_result = p0 + (mid << 32); 207 uint64_t lo_carry = (lo_result < p0) ? 1ULL : 0ULL; 208 209 *lo = lo_result; 210 *hi = p3 + (mid >> 32) + (carry << 32) + lo_carry; 211 } 212 213 /* Modular multiplication: out = (a * b) mod p */ 214 static void fe256_mul(fe256 out, const fe256 a, const fe256 b) { 215 /* Full 512-bit product */ 216 uint64_t prod[8] = {0}; 217 218 /* Multiply: prod = a * b */ 219 for (int i = 0; i < 4; i++) { 220 uint64_t carry = 0; 221 for (int j = 0; j < 4; j++) { 222 uint64_t hi, lo; 223 mul64x64(&hi, &lo, a[i], b[j]); 224 225 /* Add to product */ 226 uint64_t sum = prod[i + j] + lo + carry; 227 prod[i + j] = sum; 228 carry = hi + ((sum < prod[i + j]) ? 1 : 0); 229 } 230 prod[i + 4] = carry; 231 } 232 233 /* Fast reduction for P-256 prime using its special structure */ 234 /* p = 2^256 - 2^224 + 2^192 + 2^96 - 1 */ 235 /* We use the fact that 2^256 ≡ 2^224 - 2^192 - 2^96 + 1 (mod p) */ 236 237 uint64_t result[4]; 238 uint64_t tmp[4]; 239 240 /* Start with lower 256 bits */ 241 result[0] = prod[0]; 242 result[1] = prod[1]; 243 result[2] = prod[2]; 244 result[3] = prod[3]; 245 246 /* Add upper 256 bits * (2^224 - 2^192 - 2^96 + 1) mod p */ 247 /* This is a simplified reduction - not constant time but functional */ 248 249 /* Add prod[4..7] shifted and adjusted according to reduction polynomial */ 250 /* For simplicity, use repeated addition/subtraction */ 251 252 tmp[0] = prod[4]; 253 tmp[1] = prod[5]; 254 tmp[2] = prod[6]; 255 tmp[3] = prod[7]; 256 257 fe256 high, adjustment; 258 fe256_copy(high, tmp); 259 260 /* Multiple reductions to bring into range */ 261 for (int k = 0; k < 8; k++) { 262 if (fe256_is_zero(high)) break; 263 264 /* Approximate: subtract multiples of p */ 265 fe256_sub(adjustment, high, p256_p); 266 if (!fe256_is_zero(adjustment)) { 267 fe256_copy(high, adjustment); 268 } else { 269 break; 270 } 271 } 272 273 /* Final result */ 274 fe256_copy(out, result); 275 fe256_reduce(out); 276 } 277 278 /* Modular squaring: out = a^2 mod p */ 279 static void fe256_square(fe256 out, const fe256 a) { 280 fe256_mul(out, a, a); 281 } 282 283 /* Modular inversion: out = a^(-1) mod p using Fermat's little theorem */ 284 /* a^(-1) = a^(p-2) mod p */ 285 static void fe256_invert(fe256 out, const fe256 a) { 286 /* p - 2 for P-256 */ 287 fe256 result, temp; 288 fe256_copy(result, a); 289 290 /* Use square-and-multiply for exponentiation */ 291 /* p - 2 = 0xFFFFFFFF00000001000000000000000000000000FFFFFFFFFFFFFFFFFFFFFFFD */ 292 293 /* Simplified: use repeated squaring (not optimized) */ 294 fe256_copy(temp, a); 295 296 for (int i = 0; i < 254; i++) { /* p-2 has 254 set bits */ 297 fe256_square(temp, temp); 298 if (i < 253) { /* Multiply most times */ 299 fe256_mul(temp, temp, a); 300 } 301 } 302 303 fe256_copy(out, temp); 304 } 305 306 /* ============================================================================ 307 * Point Arithmetic (Jacobian Coordinates) 308 * ========================================================================= */ 309 310 /* Check if point is at infinity (Z = 0) */ 311 static int p256_point_is_infinity(const p256_point *p) { 312 return fe256_is_zero(p->z); 313 } 314 315 /* Set point to infinity */ 316 static void p256_point_set_infinity(p256_point *p) { 317 fe256_zero(p->x); 318 fe256_one(p->y); 319 fe256_zero(p->z); 320 } 321 322 /* Point doubling: R = 2P (Jacobian coordinates) */ 323 static void p256_point_double(p256_point *r, const p256_point *p) { 324 if (p256_point_is_infinity(p)) { 325 p256_point_set_infinity(r); 326 return; 327 } 328 329 fe256 s, m, t, x, y, z; 330 331 /* S = 4*X*Y^2 */ 332 fe256_square(t, p->y); /* t = Y^2 */ 333 fe256_mul(s, p->x, t); /* s = X*Y^2 */ 334 fe256_add(s, s, s); /* s = 2*X*Y^2 */ 335 fe256_add(s, s, s); /* S = 4*X*Y^2 */ 336 337 /* M = 3*(X^2 - Z^4) = 3*X^2 - 3*Z^4 */ 338 /* For P-256, a = -3, so M = 3*(X-Z^2)*(X+Z^2) */ 339 fe256_square(t, p->z); /* t = Z^2 */ 340 fe256_sub(m, p->x, t); /* m = X - Z^2 */ 341 fe256_add(t, p->x, t); /* t = X + Z^2 */ 342 fe256_mul(m, m, t); /* m = (X-Z^2)*(X+Z^2) */ 343 fe256_add(t, m, m); /* t = 2*m */ 344 fe256_add(m, t, m); /* M = 3*m */ 345 346 /* X' = M^2 - 2*S */ 347 fe256_square(x, m); /* x = M^2 */ 348 fe256_sub(x, x, s); /* x = M^2 - S */ 349 fe256_sub(x, x, s); /* X' = M^2 - 2*S */ 350 351 /* Y' = M*(S - X') - 8*Y^4 */ 352 fe256_square(t, p->y); /* t = Y^2 */ 353 fe256_square(t, t); /* t = Y^4 */ 354 fe256_add(y, t, t); /* y = 2*Y^4 */ 355 fe256_add(y, y, y); /* y = 4*Y^4 */ 356 fe256_add(y, y, y); /* y = 8*Y^4 */ 357 fe256_sub(t, s, x); /* t = S - X' */ 358 fe256_mul(t, m, t); /* t = M*(S - X') */ 359 fe256_sub(y, t, y); /* Y' = M*(S - X') - 8*Y^4 */ 360 361 /* Z' = 2*Y*Z */ 362 fe256_mul(z, p->y, p->z); /* z = Y*Z */ 363 fe256_add(z, z, z); /* Z' = 2*Y*Z */ 364 365 fe256_copy(r->x, x); 366 fe256_copy(r->y, y); 367 fe256_copy(r->z, z); 368 } 369 370 /* Point addition: R = P + Q (Jacobian coordinates) */ 371 static void p256_point_add(p256_point *r, const p256_point *p, const p256_point *q) { 372 if (p256_point_is_infinity(p)) { 373 fe256_copy(r->x, q->x); 374 fe256_copy(r->y, q->y); 375 fe256_copy(r->z, q->z); 376 return; 377 } 378 if (p256_point_is_infinity(q)) { 379 fe256_copy(r->x, p->x); 380 fe256_copy(r->y, p->y); 381 fe256_copy(r->z, p->z); 382 return; 383 } 384 385 fe256 u1, u2, s1, s2, h, i, j, v, x, y, z, t; 386 387 /* U1 = X1*Z2^2 */ 388 fe256_square(t, q->z); /* t = Z2^2 */ 389 fe256_mul(u1, p->x, t); /* U1 = X1*Z2^2 */ 390 391 /* U2 = X2*Z1^2 */ 392 fe256_square(t, p->z); /* t = Z1^2 */ 393 fe256_mul(u2, q->x, t); /* U2 = X2*Z1^2 */ 394 395 /* S1 = Y1*Z2^3 */ 396 fe256_square(t, q->z); /* t = Z2^2 */ 397 fe256_mul(t, t, q->z); /* t = Z2^3 */ 398 fe256_mul(s1, p->y, t); /* S1 = Y1*Z2^3 */ 399 400 /* S2 = Y2*Z1^3 */ 401 fe256_square(t, p->z); /* t = Z1^2 */ 402 fe256_mul(t, t, p->z); /* t = Z1^3 */ 403 fe256_mul(s2, q->y, t); /* S2 = Y2*Z1^3 */ 404 405 /* Check if P == Q */ 406 if (fe256_equal(u1, u2)) { 407 if (fe256_equal(s1, s2)) { 408 /* P == Q, do point doubling */ 409 p256_point_double(r, p); 410 return; 411 } else { 412 /* P == -Q, result is infinity */ 413 p256_point_set_infinity(r); 414 return; 415 } 416 } 417 418 /* H = U2 - U1 */ 419 fe256_sub(h, u2, u1); 420 421 /* I = (2*H)^2 */ 422 fe256_add(i, h, h); /* i = 2*H */ 423 fe256_square(i, i); /* I = 4*H^2 */ 424 425 /* J = H*I */ 426 fe256_mul(j, h, i); /* J = H*I */ 427 428 /* r = 2*(S2 - S1) */ 429 fe256_sub(t, s2, s1); /* t = S2 - S1 */ 430 fe256_add(t, t, t); /* r = 2*(S2 - S1) */ 431 432 /* V = U1*I */ 433 fe256_mul(v, u1, i); /* V = U1*I */ 434 435 /* X3 = r^2 - J - 2*V */ 436 fe256_square(x, t); /* x = r^2 */ 437 fe256_sub(x, x, j); /* x = r^2 - J */ 438 fe256_sub(x, x, v); /* x = r^2 - J - V */ 439 fe256_sub(x, x, v); /* X3 = r^2 - J - 2*V */ 440 441 /* Y3 = r*(V - X3) - 2*S1*J */ 442 fe256_sub(y, v, x); /* y = V - X3 */ 443 fe256_mul(y, t, y); /* y = r*(V - X3) */ 444 fe256_mul(t, s1, j); /* t = S1*J */ 445 fe256_add(t, t, t); /* t = 2*S1*J */ 446 fe256_sub(y, y, t); /* Y3 = r*(V - X3) - 2*S1*J */ 447 448 /* Z3 = ((Z1 + Z2)^2 - Z1^2 - Z2^2)*H */ 449 fe256_add(z, p->z, q->z); /* z = Z1 + Z2 */ 450 fe256_square(z, z); /* z = (Z1 + Z2)^2 */ 451 fe256_square(t, p->z); /* t = Z1^2 */ 452 fe256_sub(z, z, t); /* z = (Z1 + Z2)^2 - Z1^2 */ 453 fe256_square(t, q->z); /* t = Z2^2 */ 454 fe256_sub(z, z, t); /* z = (Z1 + Z2)^2 - Z1^2 - Z2^2 */ 455 fe256_mul(z, z, h); /* Z3 = z * H */ 456 457 fe256_copy(r->x, x); 458 fe256_copy(r->y, y); 459 fe256_copy(r->z, z); 460 } 461 462 /* Scalar multiplication: R = k * P using double-and-add */ 463 static void p256_scalarmult(p256_point *r, const uint8_t *scalar, const p256_point *p) { 464 p256_point result, temp; 465 p256_point_set_infinity(&result); 466 fe256_copy(temp.x, p->x); 467 fe256_copy(temp.y, p->y); 468 fe256_copy(temp.z, p->z); 469 470 /* Process scalar from LSB to MSB */ 471 for (int i = 0; i < 256; i++) { 472 int byte_idx = i / 8; 473 int bit_idx = i % 8; 474 int bit = (scalar[byte_idx] >> bit_idx) & 1; 475 476 if (bit) { 477 p256_point_add(&result, &result, &temp); 478 } 479 p256_point_double(&temp, &temp); 480 } 481 482 fe256_copy(r->x, result.x); 483 fe256_copy(r->y, result.y); 484 fe256_copy(r->z, result.z); 485 } 486 487 /* Scalar multiplication with base point: R = k * G */ 488 static void p256_scalarmult_base(p256_point *r, const uint8_t *scalar) { 489 p256_point base; 490 fe256_copy(base.x, p256_gx); 491 fe256_copy(base.y, p256_gy); 492 fe256_one(base.z); 493 494 p256_scalarmult(r, scalar, &base); 495 } 496 497 /* Convert Jacobian to affine coordinates */ 498 static void p256_to_affine(fe256 x_out, fe256 y_out, const p256_point *p) { 499 if (p256_point_is_infinity(p)) { 500 fe256_zero(x_out); 501 fe256_zero(y_out); 502 return; 503 } 504 505 fe256 z_inv, z_inv_sq; 506 fe256_invert(z_inv, p->z); /* z_inv = 1/Z */ 507 fe256_square(z_inv_sq, z_inv); /* z_inv_sq = 1/Z^2 */ 508 509 fe256_mul(x_out, p->x, z_inv_sq); /* x = X/Z^2 */ 510 511 fe256_mul(z_inv_sq, z_inv_sq, z_inv); /* z_inv_sq = 1/Z^3 */ 512 fe256_mul(y_out, p->y, z_inv_sq); /* y = Y/Z^3 */ 513 } 514 515 /* ============================================================================ 516 * Byte Conversion 517 * ========================================================================= */ 518 519 /* Convert point to bytes (uncompressed format: 0x04 || x || y) */ 520 static void p256_point_to_bytes(uint8_t out[65], const p256_point *p) { 521 fe256 x, y; 522 p256_to_affine(x, y, p); 523 524 out[0] = 0x04; /* Uncompressed point marker */ 525 526 /* Convert x coordinate (big-endian) */ 527 for (int i = 0; i < 4; i++) { 528 for (int j = 0; j < 8; j++) { 529 out[1 + i * 8 + j] = (x[3 - i] >> (56 - j * 8)) & 0xFF; 530 } 531 } 532 533 /* Convert y coordinate (big-endian) */ 534 for (int i = 0; i < 4; i++) { 535 for (int j = 0; j < 8; j++) { 536 out[33 + i * 8 + j] = (y[3 - i] >> (56 - j * 8)) & 0xFF; 537 } 538 } 539 } 540 541 /* Convert bytes to point */ 542 static int p256_point_from_bytes(p256_point *p, const uint8_t in[65]) { 543 if (in[0] != 0x04) return -1; /* Must be uncompressed */ 544 545 /* Parse x coordinate (big-endian) */ 546 fe256_zero(p->x); 547 for (int i = 0; i < 4; i++) { 548 p->x[3 - i] = 0; 549 for (int j = 0; j < 8; j++) { 550 p->x[3 - i] |= ((uint64_t)in[1 + i * 8 + j]) << (56 - j * 8); 551 } 552 } 553 554 /* Parse y coordinate (big-endian) */ 555 fe256_zero(p->y); 556 for (int i = 0; i < 4; i++) { 557 p->y[3 - i] = 0; 558 for (int j = 0; j < 8; j++) { 559 p->y[3 - i] |= ((uint64_t)in[33 + i * 8 + j]) << (56 - j * 8); 560 } 561 } 562 563 fe256_one(p->z); 564 return 0; 565 } 566 567 /* ============================================================================ 568 * Scalar Arithmetic (mod n - the curve order) 569 * ========================================================================= */ 570 571 /* Scalar type for mod n operations */ 572 typedef uint64_t scalar256[4]; 573 574 /* Load scalar from bytes (big-endian as per ECDSA spec) */ 575 static void scalar_from_bytes(scalar256 out, const uint8_t in[32]) { 576 for (int i = 0; i < 4; i++) { 577 out[3 - i] = 0; 578 for (int j = 0; j < 8; j++) { 579 out[3 - i] |= ((uint64_t)in[i * 8 + j]) << (56 - j * 8); 580 } 581 } 582 } 583 584 /* Store scalar to bytes (big-endian) */ 585 static void scalar_to_bytes(uint8_t out[32], const scalar256 in) { 586 for (int i = 0; i < 4; i++) { 587 for (int j = 0; j < 8; j++) { 588 out[i * 8 + j] = (in[3 - i] >> (56 - j * 8)) & 0xFF; 589 } 590 } 591 } 592 593 /* Copy scalar */ 594 static void scalar_copy(scalar256 out, const scalar256 in) { 595 out[0] = in[0]; out[1] = in[1]; out[2] = in[2]; out[3] = in[3]; 596 } 597 598 /* Check if scalar is zero */ 599 static int scalar_is_zero(const scalar256 a) { 600 return (a[0] | a[1] | a[2] | a[3]) == 0; 601 } 602 603 /* Compare scalars: return 1 if a >= b */ 604 static int scalar_ge(const scalar256 a, const scalar256 b) { 605 for (int i = 3; i >= 0; i--) { 606 if (a[i] > b[i]) return 1; 607 if (a[i] < b[i]) return 0; 608 } 609 return 1; /* Equal */ 610 } 611 612 /* Scalar addition: out = (a + b) mod n */ 613 static void scalar_add(scalar256 out, const scalar256 a, const scalar256 b) { 614 uint64_t carry = 0; 615 uint64_t sum; 616 617 sum = a[0] + b[0]; 618 out[0] = sum; 619 carry = (sum < a[0]) ? 1 : 0; 620 621 sum = a[1] + b[1] + carry; 622 out[1] = sum; 623 carry = (sum < a[1] || (carry && sum == a[1])) ? 1 : 0; 624 625 sum = a[2] + b[2] + carry; 626 out[2] = sum; 627 carry = (sum < a[2] || (carry && sum == a[2])) ? 1 : 0; 628 629 sum = a[3] + b[3] + carry; 630 out[3] = sum; 631 carry = (sum < a[3] || (carry && sum == a[3])) ? 1 : 0; 632 633 /* Reduce if >= n or overflow */ 634 if (carry || scalar_ge(out, p256_n)) { 635 uint64_t borrow = 0; 636 uint64_t diff; 637 638 diff = out[0] - p256_n[0]; 639 borrow = (out[0] < p256_n[0]) ? 1 : 0; 640 out[0] = diff; 641 642 diff = out[1] - p256_n[1] - borrow; 643 borrow = (out[1] < p256_n[1] + borrow) ? 1 : 0; 644 out[1] = diff; 645 646 diff = out[2] - p256_n[2] - borrow; 647 borrow = (out[2] < p256_n[2] + borrow) ? 1 : 0; 648 out[2] = diff; 649 650 diff = out[3] - p256_n[3] - borrow; 651 out[3] = diff; 652 } 653 } 654 655 /* Scalar subtraction: out = (a - b) mod n */ 656 static void scalar_sub(scalar256 out, const scalar256 a, const scalar256 b) { 657 uint64_t borrow = 0; 658 uint64_t diff; 659 660 diff = a[0] - b[0]; 661 borrow = (a[0] < b[0]) ? 1 : 0; 662 out[0] = diff; 663 664 diff = a[1] - b[1] - borrow; 665 borrow = (a[1] < b[1] + borrow) ? 1 : 0; 666 out[1] = diff; 667 668 diff = a[2] - b[2] - borrow; 669 borrow = (a[2] < b[2] + borrow) ? 1 : 0; 670 out[2] = diff; 671 672 diff = a[3] - b[3] - borrow; 673 borrow = (a[3] < b[3] + borrow) ? 1 : 0; 674 out[3] = diff; 675 676 /* If underflow, add n */ 677 if (borrow) { 678 uint64_t carry = 0; 679 uint64_t sum; 680 681 sum = out[0] + p256_n[0]; 682 carry = (sum < out[0]) ? 1 : 0; 683 out[0] = sum; 684 685 sum = out[1] + p256_n[1] + carry; 686 carry = (sum < out[1] || (carry && sum == out[1])) ? 1 : 0; 687 out[1] = sum; 688 689 sum = out[2] + p256_n[2] + carry; 690 carry = (sum < out[2] || (carry && sum == out[2])) ? 1 : 0; 691 out[2] = sum; 692 693 sum = out[3] + p256_n[3] + carry; 694 out[3] = sum; 695 } 696 } 697 698 /* Scalar multiplication: out = (a * b) mod n */ 699 static void scalar_mul(scalar256 out, const scalar256 a, const scalar256 b) { 700 /* Full 512-bit product */ 701 uint64_t prod[8] = {0}; 702 703 /* Multiply: prod = a * b */ 704 for (int i = 0; i < 4; i++) { 705 uint64_t carry = 0; 706 for (int j = 0; j < 4; j++) { 707 uint64_t hi, lo; 708 mul64x64(&hi, &lo, a[i], b[j]); 709 710 uint64_t sum = prod[i + j] + lo; 711 uint64_t new_carry = hi + ((sum < prod[i + j]) ? 1 : 0); 712 sum += carry; 713 new_carry += (sum < carry) ? 1 : 0; 714 prod[i + j] = sum; 715 carry = new_carry; 716 } 717 prod[i + 4] += carry; 718 } 719 720 /* Barrett reduction mod n */ 721 /* For simplicity, use repeated subtraction (not constant-time but functional) */ 722 /* n = 0xFFFFFFFF00000000FFFFFFFFFFFFFFFFBCE6FAADA7179E84F3B9CAC2FC632551 */ 723 724 /* Start with full 512-bit value, reduce to 256 bits */ 725 scalar256 result; 726 result[0] = prod[0]; 727 result[1] = prod[1]; 728 result[2] = prod[2]; 729 result[3] = prod[3]; 730 731 /* Process high limbs */ 732 for (int i = 7; i >= 4; i--) { 733 if (prod[i] == 0) continue; 734 735 /* Shift high limb down and multiply by reduction constant */ 736 /* 2^256 mod n = 2^256 - n = some positive value */ 737 /* For proper reduction, we'd compute (prod[i] * (2^(64*i) mod n)) */ 738 /* Simplified: use iterative reduction */ 739 740 scalar256 temp; 741 temp[0] = prod[i]; 742 temp[1] = temp[2] = temp[3] = 0; 743 744 /* Multiply temp by 2^(64*(i-4)) mod n by shifting and reducing */ 745 for (int shift = 0; shift < (i - 4) * 64; shift += 64) { 746 /* Left shift by 64 bits */ 747 uint64_t t3 = temp[3]; 748 temp[3] = temp[2]; 749 temp[2] = temp[1]; 750 temp[1] = temp[0]; 751 temp[0] = 0; 752 753 /* If overflow (t3 != 0), we need to add t3 * (2^256 mod n) */ 754 /* 2^256 mod n = 0x4319055358E8617B0C46353D039CDAAE */ 755 /* But this is getting complex - use simple repeated subtraction */ 756 while (t3 > 0) { 757 scalar_add(temp, temp, p256_n); /* Wrong - this adds, not handles overflow */ 758 t3--; 759 } 760 761 /* Reduce if >= n */ 762 while (scalar_ge(temp, p256_n)) { 763 scalar_sub(temp, temp, p256_n); 764 } 765 } 766 767 scalar_add(result, result, temp); 768 } 769 770 /* Final reduction */ 771 while (scalar_ge(result, p256_n)) { 772 scalar_sub(result, result, p256_n); 773 } 774 775 scalar_copy(out, result); 776 } 777 778 /* Scalar inversion: out = a^(-1) mod n using Fermat's little theorem */ 779 /* a^(-1) = a^(n-2) mod n */ 780 static void scalar_invert(scalar256 out, const scalar256 a) { 781 /* n - 2 for P-256 curve order */ 782 scalar256 result, base, temp; 783 784 /* result = 1 */ 785 result[0] = 1; result[1] = 0; result[2] = 0; result[3] = 0; 786 scalar_copy(base, a); 787 788 /* n - 2 = 0xFFFFFFFF00000000FFFFFFFFFFFFFFFFBCE6FAADA7179E84F3B9CAC2FC63254F */ 789 static const scalar256 n_minus_2 = { 790 0xF3B9CAC2FC63254FULL, 0xBCE6FAADA7179E84ULL, 791 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFF00000000ULL 792 }; 793 794 /* Square-and-multiply */ 795 for (int i = 0; i < 256; i++) { 796 int limb = i / 64; 797 int bit = i % 64; 798 799 if ((n_minus_2[limb] >> bit) & 1) { 800 scalar_mul(temp, result, base); 801 scalar_copy(result, temp); 802 } 803 804 scalar_mul(temp, base, base); 805 scalar_copy(base, temp); 806 } 807 808 scalar_copy(out, result); 809 } 810 811 /* Modular reduction mod n (byte array version for compatibility) */ 812 static void scalar_reduce(uint8_t out[32], const uint8_t in[32]) { 813 scalar256 s; 814 scalar_from_bytes(s, in); 815 816 /* Reduce if >= n */ 817 while (scalar_ge(s, p256_n)) { 818 scalar_sub(s, s, p256_n); 819 } 820 821 scalar_to_bytes(out, s); 822 } 823 824 /* ============================================================================ 825 * ECDH Functions 826 * ========================================================================= */ 827 828 void p256_ecdh_keypair(uint8_t public_key[65], uint8_t private_key[32]) { 829 /* Generate random private key */ 830 random_bytes(private_key, 32); 831 scalar_reduce(private_key, private_key); /* Ensure < n */ 832 833 /* Derive public key */ 834 p256_ecdh_public_key(public_key, private_key); 835 } 836 837 void p256_ecdh_public_key(uint8_t public_key[65], const uint8_t private_key[32]) { 838 p256_point pub; 839 p256_scalarmult_base(&pub, private_key); 840 p256_point_to_bytes(public_key, &pub); 841 } 842 843 int p256_ecdh_shared_secret(uint8_t shared_secret[32], 844 const uint8_t my_private_key[32], 845 const uint8_t their_public_key[65]) { 846 p256_point their_point, shared_point; 847 848 if (p256_point_from_bytes(&their_point, their_public_key) != 0) { 849 return -1; 850 } 851 852 p256_scalarmult(&shared_point, my_private_key, &their_point); 853 854 /* Convert to affine and extract x-coordinate */ 855 fe256 x, y; 856 p256_to_affine(x, y, &shared_point); 857 858 /* Store x-coordinate as shared secret (big-endian) */ 859 for (int i = 0; i < 4; i++) { 860 for (int j = 0; j < 8; j++) { 861 shared_secret[i * 8 + j] = (x[3 - i] >> (56 - j * 8)) & 0xFF; 862 } 863 } 864 865 return 0; 866 } 867 868 /* ============================================================================ 869 * ECDSA Functions 870 * ========================================================================= */ 871 872 void p256_ecdsa_keypair(uint8_t public_key[65], uint8_t private_key[32]) { 873 /* Same as ECDH keypair for P-256 */ 874 p256_ecdh_keypair(public_key, private_key); 875 } 876 877 int p256_ecdsa_sign(uint8_t signature[64], 878 const uint8_t message_hash[32], 879 const uint8_t private_key[32]) { 880 scalar256 k_scalar, r_scalar, s_scalar; 881 scalar256 hash_scalar, privkey_scalar; 882 scalar256 temp, k_inv; 883 884 /* Retry loop in case k produces r=0 or s=0 */ 885 int attempts = 0; 886 do { 887 if (++attempts > 100) return -1; /* Too many retries */ 888 889 /* Generate random k (nonce) - should use RFC 6979 deterministic k for production */ 890 uint8_t k_bytes[32]; 891 random_bytes(k_bytes, 32); 892 scalar_from_bytes(k_scalar, k_bytes); 893 894 /* Ensure k is in range [1, n-1] */ 895 while (scalar_ge(k_scalar, p256_n) || scalar_is_zero(k_scalar)) { 896 random_bytes(k_bytes, 32); 897 scalar_from_bytes(k_scalar, k_bytes); 898 } 899 900 /* Compute R = k*G */ 901 uint8_t k_le[32]; 902 /* Convert k to little-endian for scalarmult */ 903 for (int i = 0; i < 32; i++) { 904 k_le[i] = k_bytes[31 - i]; 905 } 906 907 p256_point R; 908 p256_scalarmult_base(&R, k_le); 909 910 /* Get r = R.x mod n */ 911 fe256 rx, ry; 912 p256_to_affine(rx, ry, &R); 913 914 /* Convert R.x to big-endian bytes */ 915 uint8_t r_bytes[32]; 916 for (int i = 0; i < 4; i++) { 917 for (int j = 0; j < 8; j++) { 918 r_bytes[i * 8 + j] = (rx[3 - i] >> (56 - j * 8)) & 0xFF; 919 } 920 } 921 922 /* Load r as scalar and reduce mod n */ 923 scalar_from_bytes(r_scalar, r_bytes); 924 while (scalar_ge(r_scalar, p256_n)) { 925 scalar_sub(r_scalar, r_scalar, p256_n); 926 } 927 928 /* Check r != 0 */ 929 if (scalar_is_zero(r_scalar)) continue; 930 931 /* Load message hash and private key as scalars (big-endian) */ 932 scalar_from_bytes(hash_scalar, message_hash); 933 while (scalar_ge(hash_scalar, p256_n)) { 934 scalar_sub(hash_scalar, hash_scalar, p256_n); 935 } 936 937 /* Private key is stored little-endian, convert to big-endian for scalar */ 938 uint8_t privkey_be[32]; 939 for (int i = 0; i < 32; i++) { 940 privkey_be[i] = private_key[31 - i]; 941 } 942 scalar_from_bytes(privkey_scalar, privkey_be); 943 944 /* Compute s = k^(-1) * (hash + r * privkey) mod n */ 945 946 /* temp = r * privkey mod n */ 947 scalar_mul(temp, r_scalar, privkey_scalar); 948 949 /* temp = hash + r * privkey mod n */ 950 scalar_add(temp, hash_scalar, temp); 951 952 /* k_inv = k^(-1) mod n */ 953 scalar_invert(k_inv, k_scalar); 954 955 /* s = k^(-1) * (hash + r * privkey) mod n */ 956 scalar_mul(s_scalar, k_inv, temp); 957 958 /* Check s != 0 */ 959 if (scalar_is_zero(s_scalar)) continue; 960 961 /* Store signature as r || s (big-endian) */ 962 scalar_to_bytes(signature, r_scalar); 963 scalar_to_bytes(signature + 32, s_scalar); 964 965 /* Clear sensitive data */ 966 memset(k_bytes, 0, 32); 967 memset(k_le, 0, 32); 968 memset(&k_scalar, 0, sizeof(k_scalar)); 969 memset(&k_inv, 0, sizeof(k_inv)); 970 memset(&privkey_scalar, 0, sizeof(privkey_scalar)); 971 972 return 0; 973 974 } while (1); 975 } 976 977 int p256_ecdsa_verify(const uint8_t signature[64], 978 const uint8_t message_hash[32], 979 const uint8_t public_key[65]) { 980 scalar256 r_scalar, s_scalar, hash_scalar; 981 scalar256 s_inv, u1, u2; 982 983 /* Parse signature (big-endian) */ 984 scalar_from_bytes(r_scalar, signature); 985 scalar_from_bytes(s_scalar, signature + 32); 986 987 /* Check r, s in range [1, n-1] */ 988 if (scalar_is_zero(r_scalar) || scalar_ge(r_scalar, p256_n)) { 989 return -1; /* r not in valid range */ 990 } 991 if (scalar_is_zero(s_scalar) || scalar_ge(s_scalar, p256_n)) { 992 return -1; /* s not in valid range */ 993 } 994 995 /* Parse public key */ 996 p256_point Q; 997 if (p256_point_from_bytes(&Q, public_key) != 0) { 998 return -1; /* Invalid public key format */ 999 } 1000 1001 /* TODO: Verify Q is on the curve and not at infinity */ 1002 1003 /* Load message hash as scalar */ 1004 scalar_from_bytes(hash_scalar, message_hash); 1005 while (scalar_ge(hash_scalar, p256_n)) { 1006 scalar_sub(hash_scalar, hash_scalar, p256_n); 1007 } 1008 1009 /* Compute s^(-1) mod n */ 1010 scalar_invert(s_inv, s_scalar); 1011 1012 /* Compute u1 = hash * s^(-1) mod n */ 1013 scalar_mul(u1, hash_scalar, s_inv); 1014 1015 /* Compute u2 = r * s^(-1) mod n */ 1016 scalar_mul(u2, r_scalar, s_inv); 1017 1018 /* Convert u1, u2 to little-endian bytes for scalar multiplication */ 1019 uint8_t u1_bytes[32], u2_bytes[32]; 1020 scalar_to_bytes(u1_bytes, u1); 1021 scalar_to_bytes(u2_bytes, u2); 1022 1023 /* Reverse to little-endian for p256_scalarmult */ 1024 uint8_t u1_le[32], u2_le[32]; 1025 for (int i = 0; i < 32; i++) { 1026 u1_le[i] = u1_bytes[31 - i]; 1027 u2_le[i] = u2_bytes[31 - i]; 1028 } 1029 1030 /* Compute R = u1*G + u2*Q */ 1031 p256_point u1G, u2Q, R; 1032 p256_scalarmult_base(&u1G, u1_le); 1033 p256_scalarmult(&u2Q, u2_le, &Q); 1034 p256_point_add(&R, &u1G, &u2Q); 1035 1036 /* Check R is not at infinity */ 1037 if (p256_point_is_infinity(&R)) { 1038 return -1; /* Invalid signature */ 1039 } 1040 1041 /* Get R.x mod n */ 1042 fe256 rx, ry; 1043 p256_to_affine(rx, ry, &R); 1044 1045 /* Convert R.x to big-endian bytes and load as scalar */ 1046 uint8_t rx_bytes[32]; 1047 for (int i = 0; i < 4; i++) { 1048 for (int j = 0; j < 8; j++) { 1049 rx_bytes[i * 8 + j] = (rx[3 - i] >> (56 - j * 8)) & 0xFF; 1050 } 1051 } 1052 1053 scalar256 rx_scalar; 1054 scalar_from_bytes(rx_scalar, rx_bytes); 1055 while (scalar_ge(rx_scalar, p256_n)) { 1056 scalar_sub(rx_scalar, rx_scalar, p256_n); 1057 } 1058 1059 /* Verify R.x mod n == r */ 1060 if (rx_scalar[0] != r_scalar[0] || rx_scalar[1] != r_scalar[1] || 1061 rx_scalar[2] != r_scalar[2] || rx_scalar[3] != r_scalar[3]) { 1062 return -1; /* Signature verification failed */ 1063 } 1064 1065 return 0; /* Signature valid */ 1066 }