luajitos

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

encoder_JPEG.c (23011B)


      1 /*
      2  * JPEG Encoder Implementation
      3  * Baseline DCT JPEG encoder (SOF0)
      4  *
      5  * Implements ITU-T T.81 / ISO/IEC 10918-1
      6  */
      7 
      8 #include "encoder_JPEG.h"
      9 #include <stdlib.h>
     10 #include <string.h>
     11 #include <math.h>
     12 #include <lua.h>
     13 #include <lauxlib.h>
     14 
     15 /* Zigzag scan order for an 8x8 block: maps zigzag position to raster position.
     16  * zigzag[i] is the raster index (row * 8 + column) of the coefficient at zigzag position i.
     17  * The scan starts (row,col): (0,0)->(0,1)->(1,0)->(2,0)->(1,1)->(0,2)->...
     18  */
     19 static const uint8_t zigzag[64] = {
     20      0,  1,  8, 16,  9,  2,  3, 10,   /* zigzag 0-7 */
     21     17, 24, 32, 25, 18, 11,  4,  5,   /* zigzag 8-15 */
     22     12, 19, 26, 33, 40, 48, 41, 34,   /* zigzag 16-23 */
     23     27, 20, 13,  6,  7, 14, 21, 28,   /* zigzag 24-31 */
     24     35, 42, 49, 56, 57, 50, 43, 36,   /* zigzag 32-39 */
     25     29, 22, 15, 23, 30, 37, 44, 51,   /* zigzag 40-47 */
     26     58, 59, 52, 45, 38, 31, 39, 46,   /* zigzag 48-55 */
     27     53, 60, 61, 54, 47, 55, 62, 63    /* zigzag 56-63 */
     28 };
     29 
     30 /* Standard luminance quantization table, stored in raster order; this is the base table that scale_quant_table() scales by quality */
     31 static const uint8_t std_lum_quant[64] = {
     32     16, 11, 10, 16,  24,  40,  51,  61,
     33     12, 12, 14, 19,  26,  58,  60,  55,
     34     14, 13, 16, 24,  40,  57,  69,  56,
     35     14, 17, 22, 29,  51,  87,  80,  62,
     36     18, 22, 37, 56,  68, 109, 103,  77,
     37     24, 35, 55, 64,  81, 104, 113,  92,
     38     49, 64, 78, 87, 103, 121, 120, 101,
     39     72, 92, 95, 98, 112, 100, 103,  99
     40 };
     41 
     42 /* Standard chrominance quantization table (ITU-T T.81 Annex K, Table K.2), stored in raster order; used for both Cb and Cr */
     43 static const uint8_t std_chr_quant[64] = {
     44     17, 18, 24, 47, 99, 99, 99, 99,
     45     18, 21, 26, 66, 99, 99, 99, 99,
     46     24, 26, 56, 99, 99, 99, 99, 99,
     47     47, 66, 99, 99, 99, 99, 99, 99,
     48     99, 99, 99, 99, 99, 99, 99, 99,
     49     99, 99, 99, 99, 99, 99, 99, 99,
     50     99, 99, 99, 99, 99, 99, 99, 99,
     51     99, 99, 99, 99, 99, 99, 99, 99
     52 };
     53 
     54 /* DC luminance Huffman table: bits[n] = number of codes of length n (n = 1..16, bits[0] unused); val = symbols (DC size categories) in code order */
     55 static const uint8_t dc_lum_bits[17] = {0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0};
     56 static const uint8_t dc_lum_val[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
     57 
     58 /* DC chrominance Huffman table (same layout as the luminance DC table: bits[1..16] are per-length code counts, val lists symbols in code order) */
     59 static const uint8_t dc_chr_bits[17] = {0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0};
     60 static const uint8_t dc_chr_val[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
     61 
     62 /* AC luminance Huffman table: bits[1..16] are per-length code counts; each val is a (zero run << 4) | size symbol, with 0x00 = EOB and 0xF0 = ZRL */
     63 static const uint8_t ac_lum_bits[17] = {0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 125};
     64 static const uint8_t ac_lum_val[162] = {
     65     0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
     66     0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08, 0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
     67     0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
     68     0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
     69     0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
     70     0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
     71     0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
     72     0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
     73     0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
     74     0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
     75     0xf9, 0xfa
     76 };
     77 
     78 /* AC chrominance Huffman table: bits[1..16] are per-length code counts; each val is a (zero run << 4) | size symbol, with 0x00 = EOB and 0xF0 = ZRL */
     79 static const uint8_t ac_chr_bits[17] = {0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 119};
     80 static const uint8_t ac_chr_val[162] = {
     81     0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21, 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
     82     0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91, 0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
     83     0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34, 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
     84     0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
     85     0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
     86     0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
     87     0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
     88     0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
     89     0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
     90     0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
     91     0xf9, 0xfa
     92 };
     93 
     94 /* Huffman code for one symbol: the code bits (right-aligned) and their count; entries left zeroed by build_huffman_codes() have length 0 */
     95 typedef struct {
     96     uint16_t code;
     97     uint8_t length;
     98 } huff_code_t;
     99 
    100 /* Encoder state: growable output buffer, bit accumulator, and the per-image tables */
    101 typedef struct {
    102     uint8_t* buffer;       /* output bytes; grown with realloc() by ensure_capacity() */
    103     size_t capacity;       /* allocated size of buffer, in bytes */
    104     size_t size;           /* number of bytes written to buffer so far */
    105     uint32_t bit_buffer;   /* entropy-coded bits not yet flushed; newest bits are lowest */
    106     int bit_count;         /* how many low bits of bit_buffer are still pending */
    107 
    108     /* Quantization tables (scaled by quality, raster order) */
    109     uint8_t lum_quant[64];
    110     uint8_t chr_quant[64];
    111 
    112     /* Huffman tables, indexed by symbol value */
    113     huff_code_t dc_lum_codes[12];
    114     huff_code_t dc_chr_codes[12];
    115     huff_code_t ac_lum_codes[256];
    116     huff_code_t ac_chr_codes[256];
    117 } jpeg_encoder_t;
    118 
    119 /* Ensure buffer has space */
    120 static int ensure_capacity(jpeg_encoder_t* enc, size_t needed) {
    121     if (enc->size + needed > enc->capacity) {
    122         size_t new_cap = enc->capacity * 2;
    123         if (new_cap < enc->size + needed) {
    124             new_cap = enc->size + needed + 4096;
    125         }
    126         uint8_t* new_buf = (uint8_t*)realloc(enc->buffer, new_cap);
    127         if (!new_buf) return 0;
    128         enc->buffer = new_buf;
    129         enc->capacity = new_cap;
    130     }
    131     return 1;
    132 }
    133 
    134 /* Write byte to output */
    135 static void write_byte(jpeg_encoder_t* enc, uint8_t b) {
    136     if (!ensure_capacity(enc, 1)) return;
    137     enc->buffer[enc->size++] = b;
    138 }
    139 
    140 /* Write 16-bit big-endian */
    141 static void write_word(jpeg_encoder_t* enc, uint16_t w) {
    142     write_byte(enc, (w >> 8) & 0xFF);
    143     write_byte(enc, w & 0xFF);
    144 }
    145 
    146 /* Write bits to output with byte stuffing */
    147 static void write_bits(jpeg_encoder_t* enc, uint16_t code, int length) {
    148     enc->bit_buffer = (enc->bit_buffer << length) | code;
    149     enc->bit_count += length;
    150 
    151     while (enc->bit_count >= 8) {
    152         enc->bit_count -= 8;
    153         uint8_t byte = (enc->bit_buffer >> enc->bit_count) & 0xFF;
    154         write_byte(enc, byte);
    155         if (byte == 0xFF) {
    156             write_byte(enc, 0x00);  /* Byte stuffing */
    157         }
    158     }
    159 }
    160 
    161 /* Flush remaining bits */
    162 static void flush_bits(jpeg_encoder_t* enc) {
    163     if (enc->bit_count > 0) {
    164         int pad = 8 - enc->bit_count;
    165         write_bits(enc, (1 << pad) - 1, pad);
    166     }
    167 }
    168 
    169 /* Build Huffman codes from bits/values */
    170 static void build_huffman_codes(huff_code_t* codes, int max_codes,
    171                                  const uint8_t* bits, const uint8_t* vals, int num_vals) {
    172     memset(codes, 0, max_codes * sizeof(huff_code_t));
    173 
    174     uint16_t code = 0;
    175     int val_idx = 0;
    176 
    177     for (int length = 1; length <= 16 && val_idx < num_vals; length++) {
    178         for (int i = 0; i < bits[length] && val_idx < num_vals; i++) {
    179             uint8_t val = vals[val_idx++];
    180             if (val < max_codes) {
    181                 codes[val].code = code;
    182                 codes[val].length = length;
    183             }
    184             code++;
    185         }
    186         code <<= 1;
    187     }
    188 }
    189 
    190 /* Scale quantization table by quality */
    191 static void scale_quant_table(uint8_t* dst, const uint8_t* src, int quality) {
    192     int scale;
    193     if (quality < 50) {
    194         scale = 5000 / quality;
    195     } else {
    196         scale = 200 - quality * 2;
    197     }
    198 
    199     for (int i = 0; i < 64; i++) {
    200         int val = (src[i] * scale + 50) / 100;
    201         if (val < 1) val = 1;
    202         if (val > 255) val = 255;
    203         dst[i] = val;
    204     }
    205 }
    206 
    207 /*
    208  * Forward DCT - Independent implementation based on AA&N algorithm
    209  * (Arai, Agui, and Nakajima) with correct fixed-point arithmetic
    210  *
    211  * This implementation uses the same constants and structure as libjpeg's jfdctint.c
    212  */
    213 
    214 #define FIX_0_382683433  ((int32_t)  98)   /* FIX(0.382683433) scaled by 256 */
    215 #define FIX_0_541196100  ((int32_t) 139)   /* FIX(0.541196100) scaled by 256 */
    216 #define FIX_0_707106781  ((int32_t) 181)   /* FIX(0.707106781) scaled by 256 */
    217 #define FIX_1_306562965  ((int32_t) 334)   /* FIX(1.306562965) scaled by 256 */
    218 
    219 #define CONST_BITS  8
    220 #define PASS1_BITS  2
    221 
    222 #define DESCALE(x, n)  (((x) + (1 << ((n)-1))) >> (n))
    223 #define MULTIPLY(var, const)  ((int32_t)(var) * (const))
    224 
    225 /* Forward DCT on 8x8 block */
    226 static void fdct_block(int16_t* block) {
    227     int32_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    228     int32_t tmp10, tmp11, tmp12, tmp13;
    229     int32_t z1, z2, z3, z4, z5, z11, z13;
    230     int32_t workspace[64];
    231     int16_t *dataptr;
    232     int32_t *wsptr;
    233     int ctr;
    234 
    235     /* Pass 1: process rows */
    236     dataptr = block;
    237     wsptr = workspace;
    238     for (ctr = 0; ctr < 8; ctr++) {
    239         tmp0 = dataptr[0] + dataptr[7];
    240         tmp7 = dataptr[0] - dataptr[7];
    241         tmp1 = dataptr[1] + dataptr[6];
    242         tmp6 = dataptr[1] - dataptr[6];
    243         tmp2 = dataptr[2] + dataptr[5];
    244         tmp5 = dataptr[2] - dataptr[5];
    245         tmp3 = dataptr[3] + dataptr[4];
    246         tmp4 = dataptr[3] - dataptr[4];
    247 
    248         /* Even part */
    249         tmp10 = tmp0 + tmp3;
    250         tmp13 = tmp0 - tmp3;
    251         tmp11 = tmp1 + tmp2;
    252         tmp12 = tmp1 - tmp2;
    253 
    254         wsptr[0] = (tmp10 + tmp11) << PASS1_BITS;
    255         wsptr[4] = (tmp10 - tmp11) << PASS1_BITS;
    256 
    257         z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781);
    258         wsptr[2] = DESCALE((tmp13 << CONST_BITS) + z1, CONST_BITS - PASS1_BITS);
    259         wsptr[6] = DESCALE((tmp13 << CONST_BITS) - z1, CONST_BITS - PASS1_BITS);
    260 
    261         /* Odd part */
    262         tmp10 = tmp4 + tmp5;
    263         tmp11 = tmp5 + tmp6;
    264         tmp12 = tmp6 + tmp7;
    265 
    266         z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433);
    267         z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5;
    268         z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5;
    269         z3 = MULTIPLY(tmp11, FIX_0_707106781);
    270 
    271         z11 = (tmp7 << CONST_BITS) + z3;
    272         z13 = (tmp7 << CONST_BITS) - z3;
    273 
    274         wsptr[5] = DESCALE(z13 + z2, CONST_BITS - PASS1_BITS);
    275         wsptr[3] = DESCALE(z13 - z2, CONST_BITS - PASS1_BITS);
    276         wsptr[1] = DESCALE(z11 + z4, CONST_BITS - PASS1_BITS);
    277         wsptr[7] = DESCALE(z11 - z4, CONST_BITS - PASS1_BITS);
    278 
    279         dataptr += 8;
    280         wsptr += 8;
    281     }
    282 
    283     /* Pass 2: process columns */
    284     wsptr = workspace;
    285     dataptr = block;
    286     for (ctr = 0; ctr < 8; ctr++) {
    287         tmp0 = wsptr[0*8] + wsptr[7*8];
    288         tmp7 = wsptr[0*8] - wsptr[7*8];
    289         tmp1 = wsptr[1*8] + wsptr[6*8];
    290         tmp6 = wsptr[1*8] - wsptr[6*8];
    291         tmp2 = wsptr[2*8] + wsptr[5*8];
    292         tmp5 = wsptr[2*8] - wsptr[5*8];
    293         tmp3 = wsptr[3*8] + wsptr[4*8];
    294         tmp4 = wsptr[3*8] - wsptr[4*8];
    295 
    296         /* Even part */
    297         tmp10 = tmp0 + tmp3;
    298         tmp13 = tmp0 - tmp3;
    299         tmp11 = tmp1 + tmp2;
    300         tmp12 = tmp1 - tmp2;
    301 
    302         dataptr[0*8] = (int16_t)DESCALE(tmp10 + tmp11, PASS1_BITS + 3);
    303         dataptr[4*8] = (int16_t)DESCALE(tmp10 - tmp11, PASS1_BITS + 3);
    304 
    305         z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781);
    306         dataptr[2*8] = (int16_t)DESCALE((tmp13 << CONST_BITS) + z1, CONST_BITS + PASS1_BITS + 3);
    307         dataptr[6*8] = (int16_t)DESCALE((tmp13 << CONST_BITS) - z1, CONST_BITS + PASS1_BITS + 3);
    308 
    309         /* Odd part */
    310         tmp10 = tmp4 + tmp5;
    311         tmp11 = tmp5 + tmp6;
    312         tmp12 = tmp6 + tmp7;
    313 
    314         z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433);
    315         z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5;
    316         z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5;
    317         z3 = MULTIPLY(tmp11, FIX_0_707106781);
    318 
    319         z11 = (tmp7 << CONST_BITS) + z3;
    320         z13 = (tmp7 << CONST_BITS) - z3;
    321 
    322         dataptr[5*8] = (int16_t)DESCALE(z13 + z2, CONST_BITS + PASS1_BITS + 3);
    323         dataptr[3*8] = (int16_t)DESCALE(z13 - z2, CONST_BITS + PASS1_BITS + 3);
    324         dataptr[1*8] = (int16_t)DESCALE(z11 + z4, CONST_BITS + PASS1_BITS + 3);
    325         dataptr[7*8] = (int16_t)DESCALE(z11 - z4, CONST_BITS + PASS1_BITS + 3);
    326 
    327         wsptr++;
    328         dataptr++;
    329     }
    330 }
    331 
    332 /* Get bit size for value */
    333 static int bit_size(int val) {
    334     if (val < 0) val = -val;
    335     int size = 0;
    336     while (val) {
    337         size++;
    338         val >>= 1;
    339     }
    340     return size;
    341 }
    342 
    343 /* Encode a single block */
    344 static int encode_block(jpeg_encoder_t* enc, int16_t* block,
    345                         const uint8_t* quant, int* prev_dc,
    346                         const huff_code_t* dc_codes, const huff_code_t* ac_codes) {
    347     /* Quantize in raster order, then reorder to zigzag for encoding */
    348     int16_t quant_block[64];
    349     for (int i = 0; i < 64; i++) {
    350         /* Quantize each DCT coefficient using the corresponding quant value */
    351         int val = block[i];
    352         int q = quant[i];
    353         int quantized = (val >= 0) ? (val + q/2) / q : (val - q/2) / q;
    354         quant_block[i] = (int16_t)quantized;
    355     }
    356 
    357     /* Now reorder to zigzag sequence for encoding */
    358     int16_t zigzag_block[64];
    359     for (int i = 0; i < 64; i++) {
    360         zigzag_block[i] = quant_block[zigzag[i]];
    361     }
    362 
    363     /* Encode DC coefficient */
    364     int dc = quant_block[0];
    365     int dc_diff = dc - *prev_dc;
    366     *prev_dc = dc;
    367 
    368     int dc_size = bit_size(dc_diff);
    369     if (dc_size > 11) dc_size = 11;
    370 
    371     write_bits(enc, dc_codes[dc_size].code, dc_codes[dc_size].length);
    372 
    373     if (dc_size > 0) {
    374         int dc_val = dc_diff;
    375         if (dc_diff < 0) {
    376             dc_val = dc_diff + (1 << dc_size) - 1;
    377         }
    378         write_bits(enc, dc_val & ((1 << dc_size) - 1), dc_size);
    379     }
    380 
    381     /* Encode AC coefficients (use zigzag ordered block) */
    382     int zero_count = 0;
    383     for (int i = 1; i < 64; i++) {
    384         int ac = zigzag_block[i];
    385         if (ac == 0) {
    386             zero_count++;
    387         } else {
    388             /* Write 16-zero runs (ZRL) */
    389             while (zero_count >= 16) {
    390                 write_bits(enc, ac_codes[0xF0].code, ac_codes[0xF0].length);
    391                 zero_count -= 16;
    392             }
    393 
    394             /* Write coefficient */
    395             int ac_size = bit_size(ac);
    396             if (ac_size > 10) ac_size = 10;
    397             int symbol = (zero_count << 4) | ac_size;
    398 
    399             write_bits(enc, ac_codes[symbol].code, ac_codes[symbol].length);
    400 
    401             int ac_val = ac;
    402             if (ac < 0) {
    403                 ac_val = ac + (1 << ac_size) - 1;
    404             }
    405             write_bits(enc, ac_val & ((1 << ac_size) - 1), ac_size);
    406 
    407             zero_count = 0;
    408         }
    409     }
    410 
    411     /* EOB if needed */
    412     if (zero_count > 0) {
    413         write_bits(enc, ac_codes[0x00].code, ac_codes[0x00].length);
    414     }
    415 
    416     return 1;
    417 }
    418 
    419 /* Write JFIF APP0 marker */
    420 static void write_app0(jpeg_encoder_t* enc) {
    421     write_word(enc, 0xFFE0);  /* APP0 marker */
    422     write_word(enc, 16);      /* Length */
    423     write_byte(enc, 'J');
    424     write_byte(enc, 'F');
    425     write_byte(enc, 'I');
    426     write_byte(enc, 'F');
    427     write_byte(enc, 0);
    428     write_byte(enc, 1);       /* Version major */
    429     write_byte(enc, 1);       /* Version minor */
    430     write_byte(enc, 0);       /* Aspect ratio units */
    431     write_word(enc, 1);       /* X density */
    432     write_word(enc, 1);       /* Y density */
    433     write_byte(enc, 0);       /* Thumbnail width */
    434     write_byte(enc, 0);       /* Thumbnail height */
    435 }
    436 
    437 /* Write DQT marker - table values are written in zigzag order per JPEG spec */
    438 static void write_dqt(jpeg_encoder_t* enc, int table_id, const uint8_t* quant) {
    439     write_word(enc, 0xFFDB);  /* DQT marker */
    440     write_word(enc, 67);      /* Length */
    441     write_byte(enc, table_id); /* Table ID, 8-bit precision */
    442 
    443     /* Write quantization values in zigzag order
    444      * For zigzag position i, we write the quant value for raster position zigzag[i]
    445      * This way decoder reads DQT[zz_pos] and applies it to coefficient at zz_pos */
    446     for (int i = 0; i < 64; i++) {
    447         write_byte(enc, quant[zigzag[i]]);
    448     }
    449 }
    450 
    451 /* Write SOF0 marker */
    452 static void write_sof0(jpeg_encoder_t* enc, int width, int height) {
    453     write_word(enc, 0xFFC0);  /* SOF0 marker */
    454     write_word(enc, 17);      /* Length */
    455     write_byte(enc, 8);       /* Precision */
    456     write_word(enc, height);
    457     write_word(enc, width);
    458     write_byte(enc, 3);       /* Number of components */
    459 
    460     /* Y component */
    461     write_byte(enc, 1);       /* ID */
    462     write_byte(enc, 0x11);    /* Sampling 1x1 */
    463     write_byte(enc, 0);       /* Quant table 0 */
    464 
    465     /* Cb component */
    466     write_byte(enc, 2);
    467     write_byte(enc, 0x11);
    468     write_byte(enc, 1);       /* Quant table 1 */
    469 
    470     /* Cr component */
    471     write_byte(enc, 3);
    472     write_byte(enc, 0x11);
    473     write_byte(enc, 1);
    474 }
    475 
    476 /* Write DHT marker */
    477 static void write_dht(jpeg_encoder_t* enc, int table_class, int table_id,
    478                       const uint8_t* bits, const uint8_t* vals, int num_vals) {
    479     int length = 19 + num_vals;
    480 
    481     write_word(enc, 0xFFC4);  /* DHT marker */
    482     write_word(enc, length);
    483     write_byte(enc, (table_class << 4) | table_id);
    484 
    485     for (int i = 1; i <= 16; i++) {
    486         write_byte(enc, bits[i]);
    487     }
    488 
    489     for (int i = 0; i < num_vals; i++) {
    490         write_byte(enc, vals[i]);
    491     }
    492 }
    493 
    494 /* Write SOS marker */
    495 static void write_sos(jpeg_encoder_t* enc) {
    496     write_word(enc, 0xFFDA);  /* SOS marker */
    497     write_word(enc, 12);      /* Length */
    498     write_byte(enc, 3);       /* Number of components */
    499 
    500     write_byte(enc, 1);       /* Y: DC table 0, AC table 0 */
    501     write_byte(enc, 0x00);
    502 
    503     write_byte(enc, 2);       /* Cb: DC table 1, AC table 1 */
    504     write_byte(enc, 0x11);
    505 
    506     write_byte(enc, 3);       /* Cr: DC table 1, AC table 1 */
    507     write_byte(enc, 0x11);
    508 
    509     write_byte(enc, 0);       /* Spectral selection start */
    510     write_byte(enc, 63);      /* Spectral selection end */
    511     write_byte(enc, 0);       /* Successive approximation */
    512 }
    513 
    514 /* Main encode function */
    515 uint8_t* jpeg_encode(const uint8_t* bgra_data, int width, int height, int quality, size_t* out_size) {
    516     if (!bgra_data || width <= 0 || height <= 0 || !out_size) {
    517         return NULL;
    518     }
    519 
    520     if (quality < 1) quality = 1;
    521     if (quality > 100) quality = 100;
    522 
    523     jpeg_encoder_t enc;
    524     memset(&enc, 0, sizeof(enc));
    525 
    526     /* Initial buffer allocation */
    527     enc.capacity = width * height + 4096;
    528     enc.buffer = (uint8_t*)malloc(enc.capacity);
    529     if (!enc.buffer) return NULL;
    530 
    531     /* Scale quantization tables */
    532     scale_quant_table(enc.lum_quant, std_lum_quant, quality);
    533     scale_quant_table(enc.chr_quant, std_chr_quant, quality);
    534 
    535     /* Build Huffman codes */
    536     build_huffman_codes(enc.dc_lum_codes, 12, dc_lum_bits, dc_lum_val, 12);
    537     build_huffman_codes(enc.dc_chr_codes, 12, dc_chr_bits, dc_chr_val, 12);
    538     build_huffman_codes(enc.ac_lum_codes, 256, ac_lum_bits, ac_lum_val, 162);
    539     build_huffman_codes(enc.ac_chr_codes, 256, ac_chr_bits, ac_chr_val, 162);
    540 
    541     /* Write headers */
    542     write_word(&enc, 0xFFD8);  /* SOI */
    543     write_app0(&enc);
    544     write_dqt(&enc, 0, enc.lum_quant);
    545     write_dqt(&enc, 1, enc.chr_quant);
    546     write_sof0(&enc, width, height);
    547     write_dht(&enc, 0, 0, dc_lum_bits, dc_lum_val, 12);
    548     write_dht(&enc, 1, 0, ac_lum_bits, ac_lum_val, 162);
    549     write_dht(&enc, 0, 1, dc_chr_bits, dc_chr_val, 12);
    550     write_dht(&enc, 1, 1, ac_chr_bits, ac_chr_val, 162);
    551     write_sos(&enc);
    552 
    553     /* Encode blocks */
    554     int prev_dc_y = 0, prev_dc_cb = 0, prev_dc_cr = 0;
    555     int block_width = (width + 7) / 8;
    556     int block_height = (height + 7) / 8;
    557 
    558     int16_t block_y[64], block_cb[64], block_cr[64];
    559 
    560     for (int by = 0; by < block_height; by++) {
    561         for (int bx = 0; bx < block_width; bx++) {
    562             /* Extract 8x8 block and convert BGRA to YCbCr */
    563             for (int py = 0; py < 8; py++) {
    564                 for (int px = 0; px < 8; px++) {
    565                     int x = bx * 8 + px;
    566                     int y = by * 8 + py;
    567 
    568                     int b, g, r;
    569                     if (x < width && y < height) {
    570                         int offset = (y * width + x) * 4;
    571                         b = bgra_data[offset];
    572                         g = bgra_data[offset + 1];
    573                         r = bgra_data[offset + 2];
    574                     } else {
    575                         /* Pad with edge pixels */
    576                         int ex = (x < width) ? x : width - 1;
    577                         int ey = (y < height) ? y : height - 1;
    578                         int offset = (ey * width + ex) * 4;
    579                         b = bgra_data[offset];
    580                         g = bgra_data[offset + 1];
    581                         r = bgra_data[offset + 2];
    582                     }
    583 
    584                     /* RGB to YCbCr conversion (level shifted by -128) */
    585                     int idx = py * 8 + px;
    586                     block_y[idx] = (int16_t)(0.299 * r + 0.587 * g + 0.114 * b - 128);
    587                     block_cb[idx] = (int16_t)(-0.168736 * r - 0.331264 * g + 0.5 * b);
    588                     block_cr[idx] = (int16_t)(0.5 * r - 0.418688 * g - 0.081312 * b);
    589                 }
    590             }
    591 
    592             /* Apply DCT */
    593             fdct_block(block_y);
    594             fdct_block(block_cb);
    595             fdct_block(block_cr);
    596 
    597             /* Encode blocks */
    598             encode_block(&enc, block_y, enc.lum_quant, &prev_dc_y,
    599                         enc.dc_lum_codes, enc.ac_lum_codes);
    600             encode_block(&enc, block_cb, enc.chr_quant, &prev_dc_cb,
    601                         enc.dc_chr_codes, enc.ac_chr_codes);
    602             encode_block(&enc, block_cr, enc.chr_quant, &prev_dc_cr,
    603                         enc.dc_chr_codes, enc.ac_chr_codes);
    604         }
    605     }
    606 
    607     /* Flush and write EOI */
    608     flush_bits(&enc);
    609     write_word(&enc, 0xFFD9);  /* EOI */
    610 
    611     *out_size = enc.size;
    612     return enc.buffer;
    613 }
    614 
    615 void jpeg_encode_free(uint8_t* data) {
    616     if (data) free(data);
    617 }
    618 
    619 /* ============================================================================
    620  * Lua Bindings
    621  * ========================================================================= */
    622 
    623 /**
    624  * JPEGEncode(bgra_string, width, height, quality) -> jpeg_string or nil, error
    625  */
    626 static int l_jpeg_encode(lua_State *L) {
    627     size_t data_len;
    628     const char* data = luaL_checklstring(L, 1, &data_len);
    629     int width = luaL_checkinteger(L, 2);
    630     int height = luaL_checkinteger(L, 3);
    631     int quality = luaL_optinteger(L, 4, 95);
    632 
    633     /* Validate input */
    634     size_t expected_size = (size_t)width * height * 4;
    635     if (data_len < expected_size) {
    636         lua_pushnil(L);
    637         lua_pushstring(L, "Buffer too small for specified dimensions");
    638         return 2;
    639     }
    640 
    641     if (width <= 0 || height <= 0 || width > 32768 || height > 32768) {
    642         lua_pushnil(L);
    643         lua_pushstring(L, "Invalid image dimensions");
    644         return 2;
    645     }
    646 
    647     size_t out_size;
    648     uint8_t* jpeg_data = jpeg_encode((const uint8_t*)data, width, height, quality, &out_size);
    649 
    650     if (!jpeg_data) {
    651         lua_pushnil(L);
    652         lua_pushstring(L, "JPEG encoding failed");
    653         return 2;
    654     }
    655 
    656     lua_pushlstring(L, (const char*)jpeg_data, out_size);
    657     jpeg_encode_free(jpeg_data);
    658 
    659     return 1;
    660 }
    661 
    662 int luaopen_jpegencoder(lua_State *L) {
    663     lua_pushcfunction(L, l_jpeg_encode);
    664     lua_setglobal(L, "JPEGEncode");
    665     return 0;
    666 }