deflate_impl.c (20603B)
1 #include "deflate_impl.h" 2 #include <stdlib.h> 3 #include <string.h> 4 5 /* Fixed Huffman codes from RFC 1951 */ 6 static const uint8_t fixed_literal_lengths[288] = { 7 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 10 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 11 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 12 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 13 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 14 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 15 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8 16 }; 17 18 /* Code length alphabet order for dynamic blocks */ 19 static const uint8_t code_length_order[19] = { 20 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 21 }; 22 23 /* Length extra bits and base values */ 24 static const uint8_t length_extra_bits[29] = { 25 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0 26 }; 27 static const uint16_t length_base[29] = { 28 3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258 29 }; 30 31 /* Distance extra bits and base values */ 32 static const uint8_t distance_extra_bits[30] = { 33 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 34 }; 35 static const uint16_t distance_base[30] = { 36 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577 37 }; 38 39 /* Bit buffer functions for compression */ 40 static void init_bit_buffer(bit_buffer_t* bb, uint32_t max_size) { 41 bb->buffer = (uint8_t*)malloc(max_size); 42 if (bb->buffer) { 43 memset(bb->buffer, 0, max_size); 44 } 45 bb->buffer_size = 0; 46 bb->byte_pos = 0; 47 bb->bit_pos = 0; 48 bb->max_size = max_size; 49 } 50 51 static void write_bits(bit_buffer_t* bb, uint32_t bits, uint32_t num_bits) { 52 while (num_bits > 0) { 53 if (bb->byte_pos >= bb->max_size) return; 54 55 uint32_t bits_available = 8 - bb->bit_pos; 56 uint32_t bits_to_write = (num_bits < bits_available) ? num_bits : bits_available; 57 58 bb->buffer[bb->byte_pos] |= (bits & ((1 << bits_to_write) - 1)) << bb->bit_pos; 59 60 bits >>= bits_to_write; 61 num_bits -= bits_to_write; 62 bb->bit_pos += bits_to_write; 63 64 if (bb->bit_pos >= 8) { 65 bb->bit_pos = 0; 66 bb->byte_pos++; 67 if (bb->byte_pos < bb->max_size) { 68 bb->buffer[bb->byte_pos] = 0; 69 } 70 } 71 } 72 73 if (bb->byte_pos + (bb->bit_pos > 0 ? 1 : 0) > bb->buffer_size) { 74 bb->buffer_size = bb->byte_pos + (bb->bit_pos > 0 ? 1 : 0); 75 } 76 } 77 78 static void align_bit_buffer(bit_buffer_t* bb) { 79 if (bb->bit_pos > 0) { 80 bb->byte_pos++; 81 bb->bit_pos = 0; 82 if (bb->byte_pos < bb->max_size) { 83 bb->buffer[bb->byte_pos] = 0; 84 } 85 bb->buffer_size = bb->byte_pos; 86 } 87 } 88 89 /* Build fixed Huffman codes for compression */ 90 static void build_fixed_huffman_codes(huffman_code_t* literal_codes, huffman_code_t* distance_codes) { 91 int code = 0; 92 93 /* 0-143: 8 bits, codes 00110000-10111111 */ 94 for (int i = 0; i <= 143; i++) { 95 literal_codes[i].length = 8; 96 literal_codes[i].code = code++; 97 } 98 99 /* 144-255: 9 bits, codes 110010000-111111111 */ 100 code = 0b110010000; 101 for (int i = 144; i <= 255; i++) { 102 literal_codes[i].length = 9; 103 literal_codes[i].code = code++; 104 } 105 106 /* 256-279: 7 bits, codes 0000000-0010111 */ 107 code = 0; 108 for (int i = 256; i <= 279; i++) { 109 literal_codes[i].length = 7; 110 literal_codes[i].code = code++; 111 } 112 113 /* 280-287: 8 bits, codes 11000000-11000111 */ 114 code = 0b11000000; 115 for (int i = 280; i <= 287; i++) { 116 literal_codes[i].length = 8; 117 literal_codes[i].code = code++; 118 } 119 120 /* Fixed distance codes: all 5 bits */ 121 for (int i = 0; i < 30; i++) { 122 distance_codes[i].length = 5; 123 distance_codes[i].code = i; 124 } 125 } 126 127 /* Reverse bits for Huffman codes */ 128 static uint16_t reverse_bits(uint16_t code, uint8_t length) { 129 uint16_t result = 0; 130 for (int i = 0; i < length; i++) { 131 result = (result << 1) | (code & 1); 132 code >>= 1; 133 } 134 return result; 135 } 136 137 /* Write Huffman coded literal/length */ 138 static void write_huffman_code(bit_buffer_t* bb, huffman_code_t* codes, uint16_t symbol) { 139 huffman_code_t code = codes[symbol]; 140 uint16_t reversed = reverse_bits(code.code, code.length); 141 write_bits(bb, reversed, code.length); 142 } 143 144 /* Find LZ77 match */ 145 static void find_best_match(const uint8_t* data, uint32_t pos, uint32_t size, 146 uint32_t* match_len, uint32_t* match_dist) { 147 *match_len = 0; 148 *match_dist = 0; 149 150 if (pos < 3 || size - pos < 3) return; 151 152 uint32_t window_start = (pos > 32768) ? (pos - 32768) : 0; 153 uint32_t max_len = (size - pos < 258) ? (size - pos) : 258; 154 155 for (uint32_t i = window_start; i < pos; i++) { 156 uint32_t len = 0; 157 while (len < max_len && data[i + len] == data[pos + len]) { 158 len++; 159 } 160 161 if (len >= 3 && len > *match_len) { 162 *match_len = len; 163 *match_dist = pos - i; 164 } 165 } 166 } 167 168 /* Get length code */ 169 static uint16_t get_length_code(uint32_t length) { 170 if (length < 3) return 0; 171 if (length > 258) length = 258; 172 173 for (int i = 0; i < 29; i++) { 174 if (i == 28 || length < length_base[i + 1]) { 175 return 257 + i; 176 } 177 } 178 return 285; 179 } 180 181 /* Get distance code */ 182 static uint16_t get_distance_code(uint32_t distance) { 183 if (distance < 1) return 0; 184 185 for (int i = 0; i < 30; i++) { 186 if (i == 29 || distance < distance_base[i + 1]) { 187 return i; 188 } 189 } 190 return 29; 191 } 192 193 /* Compress using fixed Huffman codes */ 194 int deflate_compress_full(const uint8_t* input, uint32_t input_size, 195 uint8_t** output, uint32_t* output_size, int level) { 196 uint32_t max_output = input_size + (input_size / 1000) + 12 + 5; 197 bit_buffer_t bb; 198 init_bit_buffer(&bb, max_output); 199 200 if (!bb.buffer) return -1; 201 202 huffman_code_t literal_codes[288]; 203 huffman_code_t distance_codes[30]; 204 build_fixed_huffman_codes(literal_codes, distance_codes); 205 206 /* Write block header: BFINAL=1, BTYPE=01 (fixed Huffman) */ 207 write_bits(&bb, 0b101, 3); 208 209 uint32_t pos = 0; 210 while (pos < input_size) { 211 uint32_t match_len, match_dist; 212 213 if (level > 1) { 214 find_best_match(input, pos, input_size, &match_len, &match_dist); 215 } else { 216 match_len = 0; 217 } 218 219 if (match_len >= 3) { 220 uint16_t length_code = get_length_code(match_len); 221 write_huffman_code(&bb, literal_codes, length_code); 222 223 uint8_t extra_len_bits = length_extra_bits[length_code - 257]; 224 if (extra_len_bits > 0) { 225 uint32_t extra_len = match_len - length_base[length_code - 257]; 226 write_bits(&bb, extra_len, extra_len_bits); 227 } 228 229 uint16_t dist_code = get_distance_code(match_dist); 230 write_huffman_code(&bb, distance_codes, dist_code); 231 232 uint8_t extra_dist_bits = distance_extra_bits[dist_code]; 233 if (extra_dist_bits > 0) { 234 uint32_t extra_dist = match_dist - distance_base[dist_code]; 235 write_bits(&bb, extra_dist, extra_dist_bits); 236 } 237 238 pos += match_len; 239 } else { 240 write_huffman_code(&bb, literal_codes, input[pos]); 241 pos++; 242 } 243 } 244 245 write_huffman_code(&bb, literal_codes, 256); 246 align_bit_buffer(&bb); 247 248 *output = bb.buffer; 249 *output_size = bb.buffer_size; 250 251 return 0; 252 } 253 254 /* ======== DECOMPRESSION ======== */ 255 256 /* Read bits from input */ 257 static uint32_t read_bits(inflate_state_t* state, uint32_t num_bits) { 258 uint32_t result = 0; 259 uint32_t bits_read = 0; 260 261 while (bits_read < num_bits && state->byte_pos < state->input_size) { 262 uint32_t bits_available = 8 - state->bit_pos; 263 uint32_t bits_to_read = (num_bits - bits_read < bits_available) ? 264 (num_bits - bits_read) : bits_available; 265 266 uint32_t bits = (state->input[state->byte_pos] >> state->bit_pos) & 267 ((1 << bits_to_read) - 1); 268 result |= bits << bits_read; 269 270 bits_read += bits_to_read; 271 state->bit_pos += bits_to_read; 272 273 if (state->bit_pos >= 8) { 274 state->bit_pos = 0; 275 state->byte_pos++; 276 } 277 } 278 279 return result; 280 } 281 282 /* Peek bits without advancing */ 283 static uint32_t peek_bits(inflate_state_t* state, uint32_t num_bits) { 284 uint32_t save_byte = state->byte_pos; 285 uint32_t save_bit = state->bit_pos; 286 uint32_t result = read_bits(state, num_bits); 287 state->byte_pos = save_byte; 288 state->bit_pos = save_bit; 289 return result; 290 } 291 292 /* Build Huffman decode table using canonical codes */ 293 static int build_huffman_table(uint16_t* table, const uint8_t* lengths, uint32_t num_codes, uint32_t max_bits) { 294 /* Count codes of each length */ 295 uint32_t bl_count[16] = {0}; 296 for (uint32_t i = 0; i < num_codes; i++) { 297 if (lengths[i] < 16) { 298 bl_count[lengths[i]]++; 299 } 300 } 301 bl_count[0] = 0; 302 303 /* Find first code for each length */ 304 uint32_t next_code[16]; 305 uint32_t code = 0; 306 for (int bits = 1; bits <= 15; bits++) { 307 code = (code + bl_count[bits - 1]) << 1; 308 next_code[bits] = code; 309 } 310 311 /* Build lookup table */ 312 memset(table, 0xFF, (1 << max_bits) * sizeof(uint16_t) * 2); 313 314 for (uint32_t i = 0; i < num_codes; i++) { 315 uint8_t len = lengths[i]; 316 if (len > 0) { 317 uint32_t code = next_code[len]++; 318 /* Reverse the code bits */ 319 uint32_t rev_code = 0; 320 for (int b = 0; b < len; b++) { 321 rev_code = (rev_code << 1) | (code & 1); 322 code >>= 1; 323 } 324 /* Store in table: [reversed_code] = (length << 16) | symbol */ 325 if (rev_code < (1U << max_bits)) { 326 table[rev_code * 2] = len; 327 table[rev_code * 2 + 1] = i; 328 } 329 } 330 } 331 332 return 0; 333 } 334 335 /* Decode Huffman symbol using lookup table */ 336 static int decode_huffman_symbol(inflate_state_t* state, uint16_t* table, uint32_t max_bits) { 337 /* Try to decode using table lookup */ 338 uint32_t code = 0; 339 340 for (uint32_t bits = 1; bits <= max_bits; bits++) { 341 code = (code << 1) | read_bits(state, 1); 342 343 /* Check all entries for this bit length */ 344 for (uint32_t i = 0; i < (1U << max_bits); i++) { 345 if (table[i * 2] == bits) { 346 /* Check if codes match (compare only 'bits' bits) */ 347 uint32_t table_code = i; 348 if ((code & ((1 << bits) - 1)) == (table_code & ((1 << bits) - 1))) { 349 return table[i * 2 + 1]; 350 } 351 } 352 } 353 } 354 355 return -1; /* Decoding error */ 356 } 357 358 /* Ensure output buffer has enough space */ 359 static int ensure_output_space(inflate_state_t* state, uint32_t needed) { 360 while (state->output_pos + needed > state->output_capacity) { 361 state->output_capacity *= 2; 362 uint8_t* new_output = (uint8_t*)realloc(state->output, state->output_capacity); 363 if (!new_output) { 364 free(state->output); 365 state->output = NULL; 366 return -1; 367 } 368 state->output = new_output; 369 } 370 return 0; 371 } 372 373 /* Decompress deflate data */ 374 int deflate_decompress_full(const uint8_t* input, uint32_t input_size, 375 uint8_t** output, uint32_t* output_size) { 376 inflate_state_t state; 377 memset(&state, 0, sizeof(state)); 378 379 state.input = input; 380 state.input_size = input_size; 381 state.output_capacity = input_size * 3; /* Initial guess */ 382 state.output = (uint8_t*)malloc(state.output_capacity); 383 384 if (!state.output) return -1; 385 386 /* Process blocks */ 387 int is_final = 0; 388 while (!is_final && state.byte_pos < input_size) { 389 is_final = read_bits(&state, 1); 390 uint32_t btype = read_bits(&state, 2); 391 392 if (btype == 0) { 393 /* Uncompressed block */ 394 /* Align to byte boundary */ 395 state.bit_pos = 0; 396 if (state.byte_pos < input_size) { 397 state.byte_pos++; 398 } 399 400 if (state.byte_pos + 4 > input_size) { 401 free(state.output); 402 return -1; 403 } 404 405 uint16_t len = input[state.byte_pos] | (input[state.byte_pos + 1] << 8); 406 state.byte_pos += 4; /* Skip LEN and NLEN */ 407 408 if (ensure_output_space(&state, len) < 0) return -1; 409 410 if (state.byte_pos + len > input_size) { 411 free(state.output); 412 return -1; 413 } 414 415 memcpy(state.output + state.output_pos, input + state.byte_pos, len); 416 state.output_pos += len; 417 state.byte_pos += len; 418 419 } else if (btype == 1) { 420 /* Fixed Huffman codes */ 421 uint8_t lit_lengths[288]; 422 uint8_t dist_lengths[32]; 423 424 for (int i = 0; i < 288; i++) { 425 lit_lengths[i] = fixed_literal_lengths[i]; 426 } 427 for (int i = 0; i < 32; i++) { 428 dist_lengths[i] = 5; 429 } 430 431 build_huffman_table(state.literal_tree, lit_lengths, 288, 9); 432 build_huffman_table(state.distance_tree, dist_lengths, 32, 5); 433 434 /* Decode literals and length/distance pairs */ 435 while (1) { 436 int symbol = decode_huffman_symbol(&state, state.literal_tree, 9); 437 438 if (symbol < 0) { 439 free(state.output); 440 return -1; 441 } 442 443 if (symbol < 256) { 444 /* Literal byte */ 445 if (ensure_output_space(&state, 1) < 0) return -1; 446 state.output[state.output_pos++] = symbol; 447 448 } else if (symbol == 256) { 449 /* End of block */ 450 break; 451 452 } else if (symbol <= 285) { 453 /* Length/distance pair */ 454 uint32_t length = length_base[symbol - 257]; 455 uint8_t extra_bits = length_extra_bits[symbol - 257]; 456 if (extra_bits > 0) { 457 length += read_bits(&state, extra_bits); 458 } 459 460 int dist_symbol = decode_huffman_symbol(&state, state.distance_tree, 5); 461 if (dist_symbol < 0 || dist_symbol >= 30) { 462 free(state.output); 463 return -1; 464 } 465 466 uint32_t distance = distance_base[dist_symbol]; 467 extra_bits = distance_extra_bits[dist_symbol]; 468 if (extra_bits > 0) { 469 distance += read_bits(&state, extra_bits); 470 } 471 472 if (distance > state.output_pos) { 473 free(state.output); 474 return -1; 475 } 476 477 if (ensure_output_space(&state, length) < 0) return -1; 478 479 /* Copy from history */ 480 for (uint32_t i = 0; i < length; i++) { 481 state.output[state.output_pos] = state.output[state.output_pos - distance]; 482 state.output_pos++; 483 } 484 } else { 485 /* Invalid symbol */ 486 free(state.output); 487 return -1; 488 } 489 } 490 491 } else if (btype == 2) { 492 /* Dynamic Huffman codes */ 493 uint32_t hlit = read_bits(&state, 5) + 257; 494 uint32_t hdist = read_bits(&state, 5) + 1; 495 uint32_t hclen = read_bits(&state, 4) + 4; 496 497 /* Read code length code lengths */ 498 uint8_t code_length_lengths[19] = {0}; 499 for (uint32_t i = 0; i < hclen; i++) { 500 code_length_lengths[code_length_order[i]] = read_bits(&state, 3); 501 } 502 503 /* Build code length Huffman table */ 504 uint16_t code_length_table[512]; 505 build_huffman_table(code_length_table, code_length_lengths, 19, 7); 506 507 /* Decode literal/length and distance code lengths */ 508 uint8_t lengths[288 + 32]; 509 uint32_t length_idx = 0; 510 uint32_t total_lengths = hlit + hdist; 511 512 while (length_idx < total_lengths) { 513 int symbol = decode_huffman_symbol(&state, code_length_table, 7); 514 if (symbol < 0) { 515 free(state.output); 516 return -1; 517 } 518 519 if (symbol < 16) { 520 lengths[length_idx++] = symbol; 521 } else if (symbol == 16) { 522 /* Repeat previous length 3-6 times */ 523 uint32_t repeat = 3 + read_bits(&state, 2); 524 if (length_idx == 0) { 525 free(state.output); 526 return -1; 527 } 528 uint8_t prev = lengths[length_idx - 1]; 529 for (uint32_t i = 0; i < repeat && length_idx < total_lengths; i++) { 530 lengths[length_idx++] = prev; 531 } 532 } else if (symbol == 17) { 533 /* Repeat 0 for 3-10 times */ 534 uint32_t repeat = 3 + read_bits(&state, 3); 535 for (uint32_t i = 0; i < repeat && length_idx < total_lengths; i++) { 536 lengths[length_idx++] = 0; 537 } 538 } else if (symbol == 18) { 539 /* Repeat 0 for 11-138 times */ 540 uint32_t repeat = 11 + read_bits(&state, 7); 541 for (uint32_t i = 0; i < repeat && length_idx < total_lengths; i++) { 542 lengths[length_idx++] = 0; 543 } 544 } 545 } 546 547 /* Build literal/length and distance tables */ 548 build_huffman_table(state.literal_tree, lengths, hlit, 15); 549 build_huffman_table(state.distance_tree, lengths + hlit, hdist, 15); 550 551 /* Decode data (same as fixed Huffman) */ 552 while (1) { 553 int symbol = decode_huffman_symbol(&state, state.literal_tree, 15); 554 555 if (symbol < 0) { 556 free(state.output); 557 return -1; 558 } 559 560 if (symbol < 256) { 561 if (ensure_output_space(&state, 1) < 0) return -1; 562 state.output[state.output_pos++] = symbol; 563 564 } else if (symbol == 256) { 565 break; 566 567 } else if (symbol <= 285) { 568 uint32_t length = length_base[symbol - 257]; 569 uint8_t extra_bits = length_extra_bits[symbol - 257]; 570 if (extra_bits > 0) { 571 length += read_bits(&state, extra_bits); 572 } 573 574 int dist_symbol = decode_huffman_symbol(&state, state.distance_tree, 15); 575 if (dist_symbol < 0 || dist_symbol >= 30) { 576 free(state.output); 577 return -1; 578 } 579 580 uint32_t distance = distance_base[dist_symbol]; 581 extra_bits = distance_extra_bits[dist_symbol]; 582 if (extra_bits > 0) { 583 distance += read_bits(&state, extra_bits); 584 } 585 586 if (distance > state.output_pos) { 587 free(state.output); 588 return -1; 589 } 590 591 if (ensure_output_space(&state, length) < 0) return -1; 592 593 for (uint32_t i = 0; i < length; i++) { 594 state.output[state.output_pos] = state.output[state.output_pos - distance]; 595 state.output_pos++; 596 } 597 } else { 598 free(state.output); 599 return -1; 600 } 601 } 602 603 } else { 604 /* Invalid block type (btype == 3) */ 605 free(state.output); 606 return -1; 607 } 608 } 609 610 *output = state.output; 611 *output_size = state.output_pos; 612 613 return 0; 614 }