luajitos

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

Ed25519.c (24140B)


      1 /*
      2  * Ed25519 Implementation
      3  * RFC 8032 - Edwards-curve Digital Signature Algorithm (EdDSA)
      4  *
      5  * Based on Curve25519: y^2 = x^3 + 486662x^2 + x (mod 2^255-19)
      6  * Using twisted Edwards curve: -x^2 + y^2 = 1 + dx^2y^2
      7  */
      8 
      9 #include "Ed25519.h"
     10 #include "hashing/SHA512.c"  /* We'll need SHA-512 */
     11 #include <string.h>
     12 
     13 /* Field element: value mod 2^255-19 represented as 10 x 32-bit limbs */
     14 typedef int32_t fe[10];
     15 
     16 /* Point on Edwards curve */
     17 typedef struct {
     18     fe X, Y, Z, T;  /* Extended coordinates */
     19 } ge_p3;
     20 
     21 typedef struct {
     22     fe X, Y, Z;  /* Projective coordinates */
     23 } ge_p2;
     24 
     25 typedef struct {
     26     fe YplusX, YminusX, Z, T2d;  /* Precomputed for doubling */
     27 } ge_precomp;
     28 
     29 typedef struct {
     30     fe YplusX, YminusX, Z;  /* Cached */
     31 } ge_cached;
     32 
     33 /* Constants */
     34 static const fe fe_zero = {0};
     35 static const fe fe_one = {1};
     36 static const fe fe_d = {
     37     -10913610, 13857413, -15372611, 6949391, 114729,
     38     -8787816, -6275908, -3247719, -18696448, -12055116
     39 };
     40 
     41 static const fe fe_sqrtm1 = {
     42     -32595792, -7943725, 9377950, 3500415, 12389472,
     43     -272473, -25146209, -2005654, 326686, 11406482
     44 };
     45 
     46 /* Load 32-bit little-endian */
     47 static uint32_t load_3(const uint8_t *in) {
     48     return (uint32_t)in[0] | ((uint32_t)in[1] << 8) | ((uint32_t)in[2] << 16);
     49 }
     50 
     51 static uint32_t load_4(const uint8_t *in) {
     52     return (uint32_t)in[0] | ((uint32_t)in[1] << 8) |
     53            ((uint32_t)in[2] << 16) | ((uint32_t)in[3] << 24);
     54 }
     55 
     56 static uint64_t load_4_64(const uint8_t *in) {
     57     return (uint64_t)load_4(in);
     58 }
     59 
     60 /* Field element operations */
     61 
     62 static void fe_copy(fe h, const fe f) {
     63     memcpy(h, f, sizeof(fe));
     64 }
     65 
     66 static void fe_0(fe h) {
     67     memset(h, 0, sizeof(fe));
     68 }
     69 
     70 static void fe_1(fe h) {
     71     h[0] = 1;
     72     h[1] = 0; h[2] = 0; h[3] = 0; h[4] = 0;
     73     h[5] = 0; h[6] = 0; h[7] = 0; h[8] = 0; h[9] = 0;
     74 }
     75 
     76 static void fe_add(fe h, const fe f, const fe g) {
     77     for (int i = 0; i < 10; i++) {
     78         h[i] = f[i] + g[i];
     79     }
     80 }
     81 
     82 static void fe_sub(fe h, const fe f, const fe g) {
     83     for (int i = 0; i < 10; i++) {
     84         h[i] = f[i] - g[i];
     85     }
     86 }
     87 
     88 static void fe_neg(fe h, const fe f) {
     89     for (int i = 0; i < 10; i++) {
     90         h[i] = -f[i];
     91     }
     92 }
     93 
     94 static void fe_mul(fe h, const fe f, const fe g) {
     95     int32_t f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4];
     96     int32_t f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
     97     int32_t g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4];
     98     int32_t g5 = g[5], g6 = g[6], g7 = g[7], g8 = g[8], g9 = g[9];
     99 
    100     int64_t h0 = (int64_t)f0 * g0 + (int64_t)f1 * (2*g9) + (int64_t)f2 * (2*g8) +
    101                  (int64_t)f3 * (2*g7) + (int64_t)f4 * (2*g6) + (int64_t)f5 * (2*g5) +
    102                  (int64_t)f6 * (2*g4) + (int64_t)f7 * (2*g3) + (int64_t)f8 * (2*g2) + (int64_t)f9 * (2*g1);
    103     int64_t h1 = (int64_t)f0 * g1 + (int64_t)f1 * g0 + (int64_t)f2 * (2*g9) +
    104                  (int64_t)f3 * (2*g8) + (int64_t)f4 * (2*g7) + (int64_t)f5 * (2*g6) +
    105                  (int64_t)f6 * (2*g5) + (int64_t)f7 * (2*g4) + (int64_t)f8 * (2*g3) + (int64_t)f9 * (2*g2);
    106     int64_t h2 = (int64_t)f0 * g2 + (int64_t)f1 * g1 + (int64_t)f2 * g0 +
    107                  (int64_t)f3 * (2*g9) + (int64_t)f4 * (2*g8) + (int64_t)f5 * (2*g7) +
    108                  (int64_t)f6 * (2*g6) + (int64_t)f7 * (2*g5) + (int64_t)f8 * (2*g4) + (int64_t)f9 * (2*g3);
    109     int64_t h3 = (int64_t)f0 * g3 + (int64_t)f1 * g2 + (int64_t)f2 * g1 + (int64_t)f3 * g0 +
    110                  (int64_t)f4 * (2*g9) + (int64_t)f5 * (2*g8) + (int64_t)f6 * (2*g7) +
    111                  (int64_t)f7 * (2*g6) + (int64_t)f8 * (2*g5) + (int64_t)f9 * (2*g4);
    112     int64_t h4 = (int64_t)f0 * g4 + (int64_t)f1 * g3 + (int64_t)f2 * g2 + (int64_t)f3 * g1 + (int64_t)f4 * g0 +
    113                  (int64_t)f5 * (2*g9) + (int64_t)f6 * (2*g8) + (int64_t)f7 * (2*g7) +
    114                  (int64_t)f8 * (2*g6) + (int64_t)f9 * (2*g5);
    115     int64_t h5 = (int64_t)f0 * g5 + (int64_t)f1 * g4 + (int64_t)f2 * g3 + (int64_t)f3 * g2 +
    116                  (int64_t)f4 * g1 + (int64_t)f5 * g0 + (int64_t)f6 * (2*g9) +
    117                  (int64_t)f7 * (2*g8) + (int64_t)f8 * (2*g7) + (int64_t)f9 * (2*g6);
    118     int64_t h6 = (int64_t)f0 * g6 + (int64_t)f1 * g5 + (int64_t)f2 * g4 + (int64_t)f3 * g3 +
    119                  (int64_t)f4 * g2 + (int64_t)f5 * g1 + (int64_t)f6 * g0 +
    120                  (int64_t)f7 * (2*g9) + (int64_t)f8 * (2*g8) + (int64_t)f9 * (2*g7);
    121     int64_t h7 = (int64_t)f0 * g7 + (int64_t)f1 * g6 + (int64_t)f2 * g5 + (int64_t)f3 * g4 +
    122                  (int64_t)f4 * g3 + (int64_t)f5 * g2 + (int64_t)f6 * g1 + (int64_t)f7 * g0 +
    123                  (int64_t)f8 * (2*g9) + (int64_t)f9 * (2*g8);
    124     int64_t h8 = (int64_t)f0 * g8 + (int64_t)f1 * g7 + (int64_t)f2 * g6 + (int64_t)f3 * g5 +
    125                  (int64_t)f4 * g4 + (int64_t)f5 * g3 + (int64_t)f6 * g2 + (int64_t)f7 * g1 +
    126                  (int64_t)f8 * g0 + (int64_t)f9 * (2*g9);
    127     int64_t h9 = (int64_t)f0 * g9 + (int64_t)f1 * g8 + (int64_t)f2 * g7 + (int64_t)f3 * g6 +
    128                  (int64_t)f4 * g5 + (int64_t)f5 * g4 + (int64_t)f6 * g3 + (int64_t)f7 * g2 +
    129                  (int64_t)f8 * g1 + (int64_t)f9 * g0;
    130 
    131     /* Reduce mod 2^255-19 */
    132     int64_t carry;
    133     carry = (h0 + (1<<25)) >> 26; h1 += carry; h0 -= carry << 26;
    134     carry = (h4 + (1<<25)) >> 26; h5 += carry; h4 -= carry << 26;
    135     carry = (h1 + (1<<24)) >> 25; h2 += carry; h1 -= carry << 25;
    136     carry = (h5 + (1<<24)) >> 25; h6 += carry; h5 -= carry << 25;
    137     carry = (h2 + (1<<25)) >> 26; h3 += carry; h2 -= carry << 26;
    138     carry = (h6 + (1<<25)) >> 26; h7 += carry; h6 -= carry << 26;
    139     carry = (h3 + (1<<24)) >> 25; h4 += carry; h3 -= carry << 25;
    140     carry = (h7 + (1<<24)) >> 25; h8 += carry; h7 -= carry << 25;
    141     carry = (h4 + (1<<25)) >> 26; h5 += carry; h4 -= carry << 26;
    142     carry = (h8 + (1<<25)) >> 26; h9 += carry; h8 -= carry << 26;
    143     carry = (h9 + (1<<24)) >> 25; h0 += carry * 19; h9 -= carry << 25;
    144     carry = (h0 + (1<<25)) >> 26; h1 += carry; h0 -= carry << 26;
    145 
    146     h[0] = (int32_t)h0; h[1] = (int32_t)h1; h[2] = (int32_t)h2; h[3] = (int32_t)h3; h[4] = (int32_t)h4;
    147     h[5] = (int32_t)h5; h[6] = (int32_t)h6; h[7] = (int32_t)h7; h[8] = (int32_t)h8; h[9] = (int32_t)h9;
    148 }
    149 
    150 static void fe_sq(fe h, const fe f) {
    151     fe_mul(h, f, f);
    152 }
    153 
    154 static void fe_sq2(fe h, const fe f) {
    155     fe_sq(h, f);
    156     fe_add(h, h, h);
    157 }
    158 
    159 static void fe_invert(fe out, const fe z) {
    160     fe t0, t1, t2, t3;
    161     fe_sq(t0, z);
    162     fe_sq(t1, t0);
    163     fe_sq(t1, t1);
    164     fe_mul(t1, z, t1);
    165     fe_mul(t0, t0, t1);
    166     fe_sq(t2, t0);
    167     fe_mul(t1, t1, t2);
    168     fe_sq(t2, t1);
    169     for (int i = 1; i < 5; i++) fe_sq(t2, t2);
    170     fe_mul(t1, t2, t1);
    171     fe_sq(t2, t1);
    172     for (int i = 1; i < 10; i++) fe_sq(t2, t2);
    173     fe_mul(t2, t2, t1);
    174     fe_sq(t3, t2);
    175     for (int i = 1; i < 20; i++) fe_sq(t3, t3);
    176     fe_mul(t2, t3, t2);
    177     fe_sq(t2, t2);
    178     for (int i = 1; i < 10; i++) fe_sq(t2, t2);
    179     fe_mul(t1, t2, t1);
    180     fe_sq(t2, t1);
    181     for (int i = 1; i < 50; i++) fe_sq(t2, t2);
    182     fe_mul(t2, t2, t1);
    183     fe_sq(t3, t2);
    184     for (int i = 1; i < 100; i++) fe_sq(t3, t3);
    185     fe_mul(t2, t3, t2);
    186     fe_sq(t2, t2);
    187     for (int i = 1; i < 50; i++) fe_sq(t2, t2);
    188     fe_mul(t1, t2, t1);
    189     fe_sq(t1, t1);
    190     for (int i = 1; i < 5; i++) fe_sq(t1, t1);
    191     fe_mul(out, t1, t0);
    192 }
    193 
    194 static void fe_tobytes(uint8_t *s, const fe h) {
    195     int32_t h0 = h[0], h1 = h[1], h2 = h[2], h3 = h[3], h4 = h[4];
    196     int32_t h5 = h[5], h6 = h[6], h7 = h[7], h8 = h[8], h9 = h[9];
    197     int32_t q = (19 * h9 + (1 << 24)) >> 25;
    198     q = (h0 + q) >> 26; q = (h1 + q) >> 25; q = (h2 + q) >> 26;
    199     q = (h3 + q) >> 25; q = (h4 + q) >> 26; q = (h5 + q) >> 25;
    200     q = (h6 + q) >> 26; q = (h7 + q) >> 25; q = (h8 + q) >> 26;
    201     q = (h9 + q) >> 25;
    202     h0 += 19 * q;
    203     int32_t carry = h0 >> 26; h1 += carry; h0 -= carry << 26;
    204     carry = h1 >> 25; h2 += carry; h1 -= carry << 25;
    205     carry = h2 >> 26; h3 += carry; h2 -= carry << 26;
    206     carry = h3 >> 25; h4 += carry; h3 -= carry << 25;
    207     carry = h4 >> 26; h5 += carry; h4 -= carry << 26;
    208     carry = h5 >> 25; h6 += carry; h5 -= carry << 25;
    209     carry = h6 >> 26; h7 += carry; h6 -= carry << 26;
    210     carry = h7 >> 25; h8 += carry; h7 -= carry << 25;
    211     carry = h8 >> 26; h9 += carry; h8 -= carry << 26;
    212     s[0] = h0; s[1] = h0 >> 8; s[2] = h0 >> 16; s[3] = (h0 >> 24) | (h1 << 2);
    213     s[4] = h1 >> 6; s[5] = h1 >> 14; s[6] = (h1 >> 22) | (h2 << 3);
    214     s[7] = h2 >> 5; s[8] = h2 >> 13; s[9] = (h2 >> 21) | (h3 << 5);
    215     s[10] = h3 >> 3; s[11] = h3 >> 11; s[12] = (h3 >> 19) | (h4 << 6);
    216     s[13] = h4 >> 2; s[14] = h4 >> 10; s[15] = h4 >> 18;
    217     s[16] = h5; s[17] = h5 >> 8; s[18] = h5 >> 16; s[19] = (h5 >> 24) | (h6 << 1);
    218     s[20] = h6 >> 7; s[21] = h6 >> 15; s[22] = (h6 >> 23) | (h7 << 3);
    219     s[23] = h7 >> 5; s[24] = h7 >> 13; s[25] = (h7 >> 21) | (h8 << 4);
    220     s[26] = h8 >> 4; s[27] = h8 >> 12; s[28] = (h8 >> 20) | (h9 << 6);
    221     s[29] = h9 >> 2; s[30] = h9 >> 10; s[31] = h9 >> 18;
    222 }
    223 
    224 static void fe_frombytes(fe h, const uint8_t *s) {
    225     int64_t h0 = load_4_64(s);
    226     int64_t h1 = load_3(s + 4) << 6;
    227     int64_t h2 = load_3(s + 7) << 5;
    228     int64_t h3 = load_3(s + 10) << 3;
    229     int64_t h4 = load_3(s + 13) << 2;
    230     int64_t h5 = load_4_64(s + 16);
    231     int64_t h6 = load_3(s + 20) << 7;
    232     int64_t h7 = load_3(s + 23) << 5;
    233     int64_t h8 = load_3(s + 26) << 4;
    234     int64_t h9 = (load_3(s + 29) & 8388607) << 2;
    235     int64_t carry;
    236     carry = (h9 + (1<<24)) >> 25; h0 += carry * 19; h9 -= carry << 25;
    237     carry = (h1 + (1<<24)) >> 25; h2 += carry; h1 -= carry << 25;
    238     carry = (h3 + (1<<24)) >> 25; h4 += carry; h3 -= carry << 25;
    239     carry = (h5 + (1<<24)) >> 25; h6 += carry; h5 -= carry << 25;
    240     carry = (h7 + (1<<24)) >> 25; h8 += carry; h7 -= carry << 25;
    241     carry = (h0 + (1<<25)) >> 26; h1 += carry; h0 -= carry << 26;
    242     carry = (h2 + (1<<25)) >> 26; h3 += carry; h2 -= carry << 26;
    243     carry = (h4 + (1<<25)) >> 26; h5 += carry; h4 -= carry << 26;
    244     carry = (h6 + (1<<25)) >> 26; h7 += carry; h6 -= carry << 26;
    245     carry = (h8 + (1<<25)) >> 26; h9 += carry; h8 -= carry << 26;
    246     h[0] = (int32_t)h0; h[1] = (int32_t)h1; h[2] = (int32_t)h2; h[3] = (int32_t)h3; h[4] = (int32_t)h4;
    247     h[5] = (int32_t)h5; h[6] = (int32_t)h6; h[7] = (int32_t)h7; h[8] = (int32_t)h8; h[9] = (int32_t)h9;
    248 }
    249 
    250 /* Check if field element is negative */
    251 static int fe_isnegative(const fe f) {
    252     uint8_t s[32];
    253     fe_tobytes(s, f);
    254     return s[0] & 1;
    255 }
    256 
    257 /* Check if field element is nonzero */
    258 static int fe_isnonzero(const fe f) {
    259     uint8_t s[32];
    260     fe_tobytes(s, f);
    261     uint8_t result = 0;
    262     for (int i = 0; i < 32; i++) {
    263         result |= s[i];
    264     }
    265     return result != 0;
    266 }
    267 
    268 /* Conditional move: if b == 1, set f = g; if b == 0, do nothing */
    269 static void fe_cmov(fe f, const fe g, int b) {
    270     int32_t mask = -b;  /* All 1s if b=1, all 0s if b=0 */
    271     for (int i = 0; i < 10; i++) {
    272         f[i] ^= mask & (f[i] ^ g[i]);
    273     }
    274 }
    275 
    276 /* Compute sqrt(u/v) = u * v^3 * (u * v^7)^((p-5)/8) */
    277 static int fe_sqrt_ratio_m1(fe r, const fe u, const fe v) {
    278     fe v3, vxx, check, x, vx2;
    279 
    280     fe_sq(v3, v);
    281     fe_mul(v3, v3, v);      /* v3 = v^3 */
    282     fe_sq(x, v3);
    283     fe_mul(x, x, v);
    284     fe_mul(x, x, u);        /* x = uv^7 */
    285 
    286     /* Compute x^((p-5)/8) using addition chain */
    287     fe t0, t1, t2;
    288     fe_sq(t0, x);
    289     fe_sq(t1, t0);
    290     fe_sq(t1, t1);
    291     fe_mul(t1, x, t1);
    292     fe_mul(t0, t0, t1);
    293     fe_sq(t2, t0);
    294     fe_mul(t1, t1, t2);
    295     fe_sq(t2, t1);
    296     for (int i = 1; i < 5; i++) fe_sq(t2, t2);
    297     fe_mul(t1, t2, t1);
    298     fe_sq(t2, t1);
    299     for (int i = 1; i < 10; i++) fe_sq(t2, t2);
    300     fe_mul(t2, t2, t1);
    301     fe_sq(r, t2);
    302     for (int i = 1; i < 20; i++) fe_sq(r, r);
    303     fe_mul(r, r, t2);
    304     fe_sq(r, r);
    305     for (int i = 1; i < 10; i++) fe_sq(r, r);
    306     fe_mul(r, r, t1);
    307     fe_sq(t2, r);
    308     for (int i = 1; i < 50; i++) fe_sq(t2, t2);
    309     fe_mul(t2, t2, r);
    310     fe_sq(r, t2);
    311     for (int i = 1; i < 100; i++) fe_sq(r, r);
    312     fe_mul(r, r, t2);
    313     fe_sq(r, r);
    314     for (int i = 1; i < 50; i++) fe_sq(r, r);
    315     fe_mul(r, r, t1);
    316     fe_sq(r, r);
    317     fe_sq(r, r);
    318     fe_mul(r, r, x);
    319 
    320     /* r = u * v^3 * (uv^7)^((p-5)/8) */
    321     fe_mul(vx2, r, r);
    322     fe_mul(vx2, vx2, v);
    323     fe_sub(check, vx2, u);
    324     if (fe_isnonzero(check)) {
    325         fe_add(check, vx2, u);
    326         if (fe_isnonzero(check)) {
    327             return -1;  /* Not a square */
    328         }
    329         fe_mul(r, r, fe_sqrtm1);
    330     }
    331 
    332     if (fe_isnegative(r)) {
    333         fe_neg(r, r);
    334     }
    335     return 0;
    336 }
    337 
    338 /* Group operations */
    339 static void ge_p3_0(ge_p3 *h) {
    340     fe_0(h->X);
    341     fe_1(h->Y);
    342     fe_1(h->Z);
    343     fe_0(h->T);
    344 }
    345 
    346 static void ge_p3_tobytes(uint8_t *s, const ge_p3 *h) {
    347     fe recip, x, y;
    348     fe_invert(recip, h->Z);
    349     fe_mul(x, h->X, recip);
    350     fe_mul(y, h->Y, recip);
    351     fe_tobytes(s, y);
    352     s[31] ^= (fe_isnegative(x) << 7);
    353 }
    354 
    355 /* Decode point from bytes (decompress) */
    356 static int ge_frombytes_negate_vartime(ge_p3 *h, const uint8_t *s) {
    357     fe u, v, v3, vxx, check;
    358 
    359     fe_frombytes(h->Y, s);
    360     fe_1(h->Z);
    361     fe_sq(u, h->Y);         /* u = y^2 */
    362     fe_mul(v, u, fe_d);     /* v = dy^2 */
    363     fe_sub(u, u, h->Z);     /* u = y^2-1 */
    364     fe_add(v, v, h->Z);     /* v = dy^2+1 */
    365 
    366     /* Compute x = sqrt(u/v) */
    367     if (fe_sqrt_ratio_m1(h->X, u, v) != 0) {
    368         return -1;  /* Not on curve */
    369     }
    370 
    371     if (fe_isnegative(h->X) != ((s[31] >> 7) & 1)) {
    372         fe_neg(h->X, h->X);
    373     }
    374 
    375     fe_mul(h->T, h->X, h->Y);
    376     return 0;
    377 }
    378 
    379 /* Point addition: r = p + q */
    380 static void ge_add(ge_p3 *r, const ge_p3 *p, const ge_cached *q) {
    381     fe t0, A, B, C, D, E, F, G, H;
    382 
    383     fe_sub(A, p->Y, p->X);
    384     fe_add(B, p->Y, p->X);
    385     fe_mul(A, A, q->YminusX);
    386     fe_mul(B, B, q->YplusX);
    387     fe_mul(C, q->Z, p->T);
    388     fe_add(C, C, C);
    389     fe_sub(D, B, A);
    390     fe_add(E, B, A);
    391     fe_add(F, p->Z, p->Z);
    392     fe_sub(F, F, C);
    393     fe_add(G, F, D);
    394     fe_sub(H, F, D);
    395 
    396     fe_mul(r->X, G, H);
    397     fe_mul(r->Y, E, F);
    398     fe_mul(r->Z, F, G);
    399     fe_mul(r->T, E, H);
    400 }
    401 
    402 /* Point doubling */
    403 static void ge_p3_dbl(ge_p2 *r, const ge_p3 *p) {
    404     fe A, B, C, D, E, G, F, H;
    405 
    406     fe_sq(A, p->X);
    407     fe_sq(B, p->Y);
    408     fe_sq2(C, p->Z);
    409     fe_add(D, A, B);
    410     fe_add(E, p->X, p->Y);
    411     fe_sq(E, E);
    412     fe_sub(E, E, D);
    413     fe_sub(G, B, A);
    414     fe_sub(F, C, G);
    415     fe_sub(H, B, A);
    416 
    417     fe_mul(r->X, E, F);
    418     fe_mul(r->Y, G, H);
    419     fe_mul(r->Z, F, G);
    420 }
    421 
    422 /* Convert p2 to p3 */
    423 static void ge_p2_to_p3(ge_p3 *r, const ge_p2 *p) {
    424     fe_copy(r->X, p->X);
    425     fe_copy(r->Y, p->Y);
    426     fe_copy(r->Z, p->Z);
    427     fe_mul(r->T, p->X, p->Y);
    428 }
    429 
    430 /* Convert p3 to cached */
    431 static void ge_p3_to_cached(ge_cached *r, const ge_p3 *p) {
    432     fe_add(r->YplusX, p->Y, p->X);
    433     fe_sub(r->YminusX, p->Y, p->X);
    434     fe_copy(r->Z, p->Z);
    435 }
    436 
    437 /* Ed25519 base point (generator) */
    438 static const fe ed25519_base_x = {
    439     0x1ad5258f, 0x1ceec73, 0x2c7f5e6, 0x35a6e8b, 0x21c2cbd,
    440     0x3406648, 0x3e7a32d, 0x1e6e4bd, 0x15fb5d5, 0x2484f2a
    441 };
    442 
    443 static const fe ed25519_base_y = {
    444     0x2666666, 0x1999999, 0x0, 0x1333333, 0x1999999,
    445     0x0, 0x1333333, 0x1999999, 0x0, 0x1333333
    446 };
    447 
    448 /* Scalar multiplication with base point using double-and-add */
    449 static void ge_scalarmult_base(ge_p3 *h, const uint8_t *a) {
    450     ge_p3 result, temp;
    451     ge_p2 doubled;
    452     ge_cached cached_base;
    453 
    454     /* Start with identity (neutral element) */
    455     ge_p3_0(&result);
    456 
    457     /* Initialize base point */
    458     ge_p3 base;
    459     fe_copy(base.X, ed25519_base_x);
    460     fe_copy(base.Y, ed25519_base_y);
    461     fe_1(base.Z);
    462     fe_mul(base.T, base.X, base.Y);
    463 
    464     /* Double-and-add algorithm */
    465     for (int i = 0; i < 256; i++) {
    466         /* Get bit i of scalar */
    467         int bit = (a[i / 8] >> (i % 8)) & 1;
    468 
    469         if (bit) {
    470             /* result = result + base */
    471             ge_p3_to_cached(&cached_base, &base);
    472             ge_add(&temp, &result, &cached_base);
    473             fe_copy(result.X, temp.X);
    474             fe_copy(result.Y, temp.Y);
    475             fe_copy(result.Z, temp.Z);
    476             fe_copy(result.T, temp.T);
    477         }
    478 
    479         /* base = base + base (double) */
    480         if (i < 255) {
    481             ge_p3_to_cached(&cached_base, &base);
    482             ge_add(&temp, &base, &cached_base);
    483             fe_copy(base.X, temp.X);
    484             fe_copy(base.Y, temp.Y);
    485             fe_copy(base.Z, temp.Z);
    486             fe_copy(base.T, temp.T);
    487         }
    488     }
    489 
    490     fe_copy(h->X, result.X);
    491     fe_copy(h->Y, result.Y);
    492     fe_copy(h->Z, result.Z);
    493     fe_copy(h->T, result.T);
    494 }
    495 
    496 /* Scalar multiplication: r = a * P (variable point) */
    497 static void ge_scalarmult(ge_p3 *r, const uint8_t *a, const ge_p3 *P) {
    498     ge_p3 result, temp;
    499     ge_cached cached_point;
    500     ge_p3 point;
    501 
    502     /* Start with identity */
    503     ge_p3_0(&result);
    504 
    505     /* Copy input point */
    506     fe_copy(point.X, P->X);
    507     fe_copy(point.Y, P->Y);
    508     fe_copy(point.Z, P->Z);
    509     fe_copy(point.T, P->T);
    510 
    511     /* Double-and-add */
    512     for (int i = 0; i < 256; i++) {
    513         int bit = (a[i / 8] >> (i % 8)) & 1;
    514 
    515         if (bit) {
    516             ge_p3_to_cached(&cached_point, &point);
    517             ge_add(&temp, &result, &cached_point);
    518             fe_copy(result.X, temp.X);
    519             fe_copy(result.Y, temp.Y);
    520             fe_copy(result.Z, temp.Z);
    521             fe_copy(result.T, temp.T);
    522         }
    523 
    524         if (i < 255) {
    525             ge_p3_to_cached(&cached_point, &point);
    526             ge_add(&temp, &point, &cached_point);
    527             fe_copy(point.X, temp.X);
    528             fe_copy(point.Y, temp.Y);
    529             fe_copy(point.Z, temp.Z);
    530             fe_copy(point.T, temp.T);
    531         }
    532     }
    533 
    534     fe_copy(r->X, result.X);
    535     fe_copy(r->Y, result.Y);
    536     fe_copy(r->Z, result.Z);
    537     fe_copy(r->T, result.T);
    538 }
    539 
    540 /* Scalar operations (mod L where L is the group order) */
    541 static const uint8_t L[32] = {
    542     0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58,
    543     0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14,
    544     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    545     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10
    546 };
    547 
    548 /* Reduce scalar mod L */
    549 static void sc_reduce(uint8_t *s) {
    550     int64_t carry;
    551     int64_t s0 = 0x1FFFFF & load_3(s);
    552     int64_t s1 = 0x1FFFFF & (load_4(s + 2) >> 5);
    553     int64_t s2 = 0x1FFFFF & (load_3(s + 5) >> 2);
    554     int64_t s3 = 0x1FFFFF & (load_4(s + 7) >> 7);
    555     int64_t s4 = 0x1FFFFF & (load_4(s + 10) >> 4);
    556     int64_t s5 = 0x1FFFFF & (load_3(s + 13) >> 1);
    557     int64_t s6 = 0x1FFFFF & (load_4(s + 15) >> 6);
    558     int64_t s7 = 0x1FFFFF & (load_3(s + 18) >> 3);
    559     int64_t s8 = 0x1FFFFF & load_3(s + 21);
    560     int64_t s9 = 0x1FFFFF & (load_4(s + 23) >> 5);
    561     int64_t s10 = 0x1FFFFF & (load_3(s + 26) >> 2);
    562     int64_t s11 = (load_4(s + 28) >> 7);
    563 
    564     /* Repeated reduction steps (simplified) */
    565     for (int i = 0; i < 3; i++) {
    566         carry = (s0 + (1<<20)) >> 21; s1 += carry; s0 -= carry << 21;
    567         carry = (s2 + (1<<20)) >> 21; s3 += carry; s2 -= carry << 21;
    568         carry = (s4 + (1<<20)) >> 21; s5 += carry; s4 -= carry << 21;
    569         carry = (s6 + (1<<20)) >> 21; s7 += carry; s6 -= carry << 21;
    570         carry = (s8 + (1<<20)) >> 21; s9 += carry; s8 -= carry << 21;
    571         carry = (s10 + (1<<20)) >> 21; s11 += carry; s10 -= carry << 21;
    572 
    573         carry = (s1 + (1<<20)) >> 21; s2 += carry; s1 -= carry << 21;
    574         carry = (s3 + (1<<20)) >> 21; s4 += carry; s3 -= carry << 21;
    575         carry = (s5 + (1<<20)) >> 21; s6 += carry; s5 -= carry << 21;
    576         carry = (s7 + (1<<20)) >> 21; s8 += carry; s7 -= carry << 21;
    577         carry = (s9 + (1<<20)) >> 21; s10 += carry; s9 -= carry << 21;
    578         carry = (s11 + (1<<20)) >> 21; s0 += carry * 666643; s11 -= carry << 21;
    579     }
    580 
    581     s[0] = s0 >> 0; s[1] = s0 >> 8; s[2] = (s0 >> 16) | (s1 << 5);
    582     s[3] = s1 >> 3; s[4] = s1 >> 11; s[5] = (s1 >> 19) | (s2 << 2);
    583     s[6] = s2 >> 6; s[7] = (s2 >> 14) | (s3 << 7);
    584     s[8] = s3 >> 1; s[9] = s3 >> 9; s[10] = (s3 >> 17) | (s4 << 4);
    585     s[11] = s4 >> 4; s[12] = s4 >> 12; s[13] = (s4 >> 20) | (s5 << 1);
    586     s[14] = s5 >> 7; s[15] = (s5 >> 15) | (s6 << 6);
    587     s[16] = s6 >> 2; s[17] = s6 >> 10; s[18] = (s6 >> 18) | (s7 << 3);
    588     s[19] = s7 >> 5; s[20] = s7 >> 13; s[21] = s8 >> 0;
    589     s[22] = s8 >> 8; s[23] = (s8 >> 16) | (s9 << 5);
    590     s[24] = s9 >> 3; s[25] = s9 >> 11; s[26] = (s9 >> 19) | (s10 << 2);
    591     s[27] = s10 >> 6; s[28] = (s10 >> 14) | (s11 << 7);
    592     s[29] = s11 >> 1; s[30] = s11 >> 9; s[31] = s11 >> 17;
    593 }
    594 
    595 /* Ed25519 public API implementations */
    596 
    597 void ed25519_create_keypair(uint8_t public_key[32], uint8_t secret_key[64],
    598                             const uint8_t seed[32]) {
    599     uint8_t hash[64];
    600     ge_p3 A;
    601 
    602     /* Hash the seed to get secret scalar */
    603     sha512(seed, 32, hash);
    604 
    605     /* Clamp the secret scalar */
    606     hash[0] &= 248;
    607     hash[31] &= 127;
    608     hash[31] |= 64;
    609 
    610     /* Compute public key A = a*B where B is the base point */
    611     ge_scalarmult_base(&A, hash);
    612     ge_p3_tobytes(public_key, &A);
    613 
    614     /* Store seed and public key in secret key */
    615     memcpy(secret_key, seed, 32);
    616     memcpy(secret_key + 32, public_key, 32);
    617 
    618     /* Zero sensitive data */
    619     memset(hash, 0, 64);
    620 }
    621 
    622 void ed25519_sign(uint8_t signature[64],
    623                   const uint8_t *message, size_t message_len,
    624                   const uint8_t secret_key[64]) {
    625     uint8_t hash[64], hram[64], r[64];
    626     ge_p3 R;
    627 
    628     /* Hash the secret key */
    629     sha512(secret_key, 32, hash);
    630     hash[0] &= 248;
    631     hash[31] &= 127;
    632     hash[31] |= 64;
    633 
    634     /* Compute r = SHA-512(hash[32..64] || message) */
    635     uint8_t *r_input = malloc(32 + message_len);
    636     memcpy(r_input, hash + 32, 32);
    637     memcpy(r_input + 32, message, message_len);
    638     sha512(r_input, 32 + message_len, r);
    639     free(r_input);
    640     sc_reduce(r);
    641 
    642     /* Compute R = r * B */
    643     ge_scalarmult_base(&R, r);
    644     ge_p3_tobytes(signature, &R);
    645 
    646     /* Compute hram = SHA-512(R || A || message) where A is public key */
    647     uint8_t *hram_input = malloc(64 + message_len);
    648     memcpy(hram_input, signature, 32);           /* R */
    649     memcpy(hram_input + 32, secret_key + 32, 32); /* A (public key) */
    650     memcpy(hram_input + 64, message, message_len);
    651     sha512(hram_input, 64 + message_len, hram);
    652     free(hram_input);
    653     sc_reduce(hram);
    654 
    655     /* Compute S = (r + hram * a) mod L */
    656     uint8_t S[32];
    657     memset(S, 0, 32);
    658     for (int i = 0; i < 32; i++) {
    659         int64_t carry = 0;
    660         for (int j = 0; j <= i; j++) {
    661             carry += (int64_t)r[j] + (int64_t)hram[i - j] * (int64_t)hash[j];
    662         }
    663         S[i] = carry & 0xFF;
    664     }
    665     sc_reduce(S);
    666     memcpy(signature + 32, S, 32);
    667 
    668     /* Zero sensitive data */
    669     memset(hash, 0, 64);
    670     memset(r, 0, 64);
    671     memset(hram, 0, 64);
    672     memset(S, 0, 32);
    673 }
    674 
    675 int ed25519_verify(const uint8_t signature[64],
    676                    const uint8_t *message, size_t message_len,
    677                    const uint8_t public_key[32]) {
    678     ge_p3 A, R;
    679     uint8_t hram[64];
    680     const uint8_t *sig_R = signature;
    681     const uint8_t *sig_S = signature + 32;
    682 
    683     /* Check S < L */
    684     for (int i = 31; i >= 0; i--) {
    685         if (sig_S[i] < L[i]) break;
    686         if (sig_S[i] > L[i]) return -1;
    687     }
    688 
    689     /* Decode public key A */
    690     if (ge_frombytes_negate_vartime(&A, public_key) != 0) {
    691         return -1;  /* Invalid public key */
    692     }
    693 
    694     /* Decode R from signature */
    695     if (ge_frombytes_negate_vartime(&R, sig_R) != 0) {
    696         return -1;  /* Invalid R */
    697     }
    698 
    699     /* Compute hram = SHA-512(R || A || message) */
    700     uint8_t *hram_input = malloc(64 + message_len);
    701     memcpy(hram_input, sig_R, 32);
    702     memcpy(hram_input + 32, public_key, 32);
    703     memcpy(hram_input + 64, message, message_len);
    704     sha512(hram_input, 64 + message_len, hram);
    705     free(hram_input);
    706     sc_reduce(hram);
    707 
    708     /* Verify: [S]B == R + [hram]A */
    709     /* Compute lhs = [S]B */
    710     ge_p3 lhs;
    711     ge_scalarmult_base(&lhs, sig_S);
    712 
    713     /* Compute rhs = R + [hram]A */
    714     ge_p3 hram_A;
    715     ge_scalarmult(&hram_A, hram, &A);
    716 
    717     ge_cached cached;
    718     ge_p3_to_cached(&cached, &hram_A);
    719     ge_p3 rhs;
    720     ge_add(&rhs, &R, &cached);
    721 
    722     /* Compare lhs and rhs */
    723     uint8_t lhs_bytes[32], rhs_bytes[32];
    724     ge_p3_tobytes(lhs_bytes, &lhs);
    725     ge_p3_tobytes(rhs_bytes, &rhs);
    726 
    727     int result = 0;
    728     for (int i = 0; i < 32; i++) {
    729         result |= (lhs_bytes[i] ^ rhs_bytes[i]);
    730     }
    731 
    732     memset(hram, 0, 64);
    733     return (result == 0) ? 0 : -1;
    734 }