Huffman tree compressing/decompressing in C

In a past course one of the assignments was to write a program that can compress files using Huffman Tree algorithm, and uncompress the files that the program generates.

My design is to count the byte occurrences first, then construct a HT based on the counted byte frequency.

My compressed file format is 256*4 bytes of “header” that stores the counted frequency, so it can be used to construct the tree when decompressing the file. Then there’s a 4-byte integer that indicates how many bits of the last byte is real data. The rest is the real (compressed) data.

Here is this specific version* of code that I want some feedback. Later versions introduced many messy changes (like GUI and buffered I/O) that is not necessary.

Specifically, I’m looking for feedback on my algorithm and data structure implementation, including but not limited to code style, best practices, potential flaws and defects (see below).

  • An exception is the last two functions print_help and main. They’re intended to be as simple as possible, so they contain the bare minimum amount of code to work in a reasonable way. Data validation and error checking etc. are omitted on purpose.

In order to simplify the idea, during designing and coding, I have assumed that

  • the program will not be told to uncompress an invalid file, so there’s no file validity check in the code
  • file availability is ensured by the environment. It will always be a regular file, with no chance of generating a read error mid-way
  • C library functions does not fail for environmental reasons (e.g. host is short of RAM for malloc(3), target disk out of space for fwrite(3) and consequently write(2), or fread(3) as said above)
  • reading/writing byte-by-byte is fine, because a later version of this code introduced chunk I/O and got a bit messier (I think). Suggestions on making the code run faster without implementing chunk I/O is welcome

so I’m also not looking for feedbacks regarding the above things that I have assumed / intentionally ignored.

I have ensured that the code is working properly, with no warnings when compiled with this command (taken from make output)

gcc -O3 -std=c11 -Wall -Wno-unused-result -o huffman huffman.c 

The last option is to suppress the warning about unused result from fread(3).

During my coding process, I run clang-format occasionally and diff the output and my written code to check for potentially bad indentation / styling issues. I am not confident if it can solve everything.

* The link points to my GitHub repo. The code on that page is identical to the code submitted below verbatim.

// File: huffman.c // Author: iBug  #include <stdio.h> #include <stdlib.h> #include <string.h> #include <stdint.h>  typedef unsigned char byte;  typedef struct _HuffNode {     unsigned data;     struct _HuffNode *left, *right, *parent; } HuffNode;  void count_frequency(FILE* fp, unsigned* freq) {     size_t orig_pos = ftell(fp);     int ch;     while (1) {         ch = fgetc(fp);         if (ch < 0)             break;         freq[ch]++;     }     fseek(fp, orig_pos, SEEK_SET); }  void construct_huffman(unsigned* freq_in, HuffNode* tree) {     int count = 256;     unsigned freq[256];     HuffNode *node[256];      // Initialize data     for (int i = 0; i < 256; i++) {         freq[i] = freq_in[i];         tree[i].data = i;         tree[i].left = tree[i].right = NULL;         node[i] = &tree[i];     }      // Sort by frequency, decreasing order     /* WARNING: Although this Quick Sort is an unstable sort,      * it should at least give the same result for the same input frequency table,      * therefore I'm leaving this code here      */     {         unsigned lower[256], upper[256], top = 1;         lower[0] = 0, upper[0] = 256;         while (top > 0) {             top--;             int left = lower[top], right = upper[top];             int i = left, j = right - 1, flag = 0;             if (i >= j) // Nothing to sort                 continue;             while (i < j) {                 if (freq[i] < freq[j]) {                     unsigned t = freq[i]; freq[i] = freq[j]; freq[j] = t;                     HuffNode *p = node[i]; node[i] = node[j]; node[j] = p;                     flag = !flag;                 }                 flag ? i++ : j--;             }             lower[top] = left, upper[top] = i;             lower[top + 1] = i + 1, upper[top + 1] = right;             top += 2;         }     }      // Construct tree     while (count > 1) {         int pos = 512 - count;         HuffNode *parent = &tree[pos];         // Select lowest 2 by freq         int i = count - 2, j = count - 1;         // Create tree, lower freq left         parent->left = node[j]; parent->right = node[i];         node[j]->parent = node[i]->parent = parent;         node[i] = parent;         freq[i] += freq[j];         // Insert         for (; i > 0 && freq[i] > freq[i - 1]; i--) {             unsigned t = freq[i]; freq[i] = freq[i - 1]; freq[i - 1] = t;             HuffNode *p = node[i]; node[i] = node[i - 1]; node[i - 1] = p;         }         count--;     }     // Now HEAD = node[0] = tree[511]     node[0]->parent = NULL; }  void encode_stream(FILE* fin, FILE* fout, HuffNode* tree, unsigned* padding) {     int n;     byte ch;     byte buf = 0, nbuf = 0;     HuffNode *p;     byte code[256];     while (1) {         n = fgetc(fin);         if (n < 0)             break;         ch = n;          // Encode         p = &tree[ch];         n = 0;         while (p->parent) {             if (p == p->parent->left) {                 // Left is 0                 code[n] = 0;             } else if (p == p->parent->right) {                 code[n] = 1;             }             p = p->parent;             n++;         }          // Write         for (int i = n - 1; i >= 0; i--) {             buf |= code[i] << nbuf;             nbuf++;             if (nbuf == 8) {                 fputc(buf, fout);                 nbuf = buf = 0;             }         }     }     fputc(buf, fout);     *padding = 8 - nbuf; }  void decode_stream(FILE* fin, FILE* fout, HuffNode* tree, unsigned padding) {     size_t startpos = ftell(fin); // should be 1028     fseek(fin, 0L, SEEK_END);     size_t endpos = ftell(fin); // last byte handling     fseek(fin, startpos, SEEK_SET);     int count = endpos - startpos;      byte buf = 0, nbuf = 0, bit;     HuffNode *p;     while (count > 0 || nbuf > 0) {         // Start from tree top         p = tree + 510;         while (p->left || p->right) {             // Prepare next bit if needed             if (nbuf == 0) {                 if (count <= 0)                     return;                  buf = fgetc(fin);                 if (count == 1) {                     // Last bit                     nbuf = 8 - padding;                     if (nbuf == 0) {                         return;                     }                 } else {                     nbuf = 8;                 }                 count--;             }             // p has child             bit = buf & 1;             buf >>= 1;             nbuf--;             if (bit == 0)                 p = p->left;             else                 p = p->right;         }         fputc(p->data, fout);     } }  void compress_file(const char* filename, const char* newname) {     FILE *fin = fopen(filename, "rb"),          *fout = fopen(newname, "wb");      unsigned freq[256], padding;     HuffNode tree[512];     size_t padding_pos;     count_frequency(fin, freq);     construct_huffman(freq, tree);     rewind(fin);     for (int i = 0; i < 256; i++)         fwrite(freq + i, 4, 1, fout);     // Write a placeholder for the padding     padding_pos = ftell(fout);     fwrite(&padding, 4, 1, fout);     encode_stream(fin, fout, tree, &padding);     // Write the padding to the placeholder     fseek(fout, padding_pos, SEEK_SET);     fwrite(&padding, 4, 1, fout);     fclose(fin);     fclose(fout); }  void uncompress_file(const char* filename, const char* newname) {     FILE *fin = fopen(filename, "rb"),          *fout = fopen(newname, "wb");      unsigned freq[256], padding;     HuffNode tree[512];     for (int i = 0; i < 256; i++) {         fread(&padding, 4, 1, fin);         freq[i] = padding;     }     fread(&padding, 4, 1, fin);     construct_huffman(freq, tree);     decode_stream(fin, fout, tree, padding);     fclose(fin);     fclose(fout); }  void print_help(void) {     puts("Usage: huffman (-c|-d) input output");     puts("  -c    Compress file from input to output");     puts("  -d    Uncompress file from input to output");     puts("\nCreated by iBug"); }  int main(int argc, char** argv) {     if (argc != 4) {         print_help();         return 1;     }     if (!strcmp(argv[1], "-c")) {         compress_file(argv[2], argv[3]);     } else if (!strcmp(argv[1], "-d")) {         uncompress_file(argv[2], argv[3]);     } else {         print_help();         return 1;     }     return 0; } 

In addition to the mandatory CC BY-SA 3.0 license by posting on Stack Exchange, the code itself also has a MIT license.

On a side note: Although the course has ended and this code is not maintained anymore, it’s still one of the programs that I have written with maximum attention and carefulness, so I believe that any feedback to this code is highly valuable and I will remember them in my future C-coding times.

Huffman Coding API in Swift

This question is a follow up question to: this post

I’d love some advice on my implementation and the API of the Huffman class.

I’m also not sure how to test if my implementation is actually resulting in less bytes than a string. It seems that encoded.count (as a Data) is larger than word.utf8.count (as a String). Maybe I’m just not testing on large enough strings?

Also any thoughts on HuffData storing the code ex. ["00","01"] and the frequencyTable instead of the tree.

Here’s an example of how the API is used:

let word = "MISSISSIPPI_RIVER!" let encoded = try? Huffman.encode(word) let decoded = try? Huffman.decode(encoded!) XCTAssertEqual(decoded, word) 

Here’s the code:

import Foundation  struct HuffData: Codable {     var code: [String]     var frequencyTable: [String: String] }  class Huffman {     static func decode(_ data: Data) throws -> String {         do {             let huff = try JSONDecoder().decode(HuffData.self, from: data)             let reverseTable = Dictionary(uniqueKeysWithValues: zip(huff.frequencyTable.values, huff.frequencyTable.keys))             return huff.code.compactMap({ reverseTable[$  0]}).joined()         }         catch let error {             throw error         }     }      static func encode(_ input: String) throws -> Data {         let frequencyTable = Huffman.buildFrequencyTable(for: input)         let code = input.compactMap({frequencyTable[String($  0)]})         let huff = HuffData(code: code, frequencyTable: frequencyTable)         do {             let data = try JSONEncoder().encode(huff)             return data         }         catch let error {             throw error         }     }      static private func buildFrequencyTable(for input: String) -> [String: String] {         // count letter frequency         let sortedFrequency = input.reduce(into: [String: Int](), { freq, char in             freq[String(char), default: 0] += 1         })         // create queue of initial Nodes         let queue = sortedFrequency.map{ Node(name: $  0.key, value: $  0.value)}         // generate key by traversing tree         return Huffman.generateKey(for: Huffman.createTree(with: queue), prefix: "")     }      static private func generateKey(for node: Node, prefix: String) -> [String: String] {         var key = [String: String]()         if let left = node.left, let right = node.right {             key.merge(generateKey(for: left, prefix: prefix + "0"), uniquingKeysWith: {current,_ in current})             key.merge(generateKey(for: right, prefix: prefix + "1"), uniquingKeysWith: {current,_ in current})         }else {             key[node.name] = prefix         }         return key     }      static private func createTree(with queue: [Node]) -> Node {         // initialize queue that sorts by decreasing count         var queue = PriorityQueue(queue: queue)         // until we have 1 root node, join subtrees of least frequency         while queue.count > 1 {             let node1 = queue.dequeue()             let node2 = queue.dequeue()             let rootNode = Huffman.createRoot(with: node1, and: node2)             queue.enqueue(node: rootNode)         }         return queue.queue[0]     }      static private func createRoot(with first: Node, and second: Node) -> Node {         return Node(name: "\(first.name)\(second.name)", value: first.value + second.value, left: first, right: second)     }  }  struct PriorityQueue {     var queue: [Node]     var count: Int {         return queue.count     }     mutating func enqueue(node: Node) {         queue.insert(node, at: queue.index(where: {$  0.value <= node.value}) ?? 0)     }     mutating func dequeue() -> Node {         return queue.removeLast()     }     init(queue: [Node]){         // assumes queue will always be sorted by decreasing count         self.queue = queue.sorted(by: {$  0.value > $  1.value})     } }  class Node: CustomStringConvertible {     var description: String {         return "\(name): \(value)"     }     let name: String     let value: Int     let left: Node?     let right: Node?      init(name: String, value: Int, left: Node? = nil, right: Node? = nil) {         self.name = name         self.value = value         self.left = left         self.right = right     } }