Add huffman decoding for C
parent 4b82b83e02
commit 11fd9f1f4a

6 changed files with 193 additions and 30 deletions
Makefile (6 lines changed)

@@ -1,6 +1,6 @@
CFLAGS += $(shell pkg-config --libs --cflags zlib)
CFLAGS += -Wall
CFLAGS += -Wall -g

ifndef CC
CC = gcc

@@ -11,8 +11,8 @@ TMP_FILES = $(wildcard *~)
all: $(APP)

%: %.c
	$(CC) $(CFLAGS) -o $@ $<
unpack: unpack.c huffman.c huffman.h
	$(CC) $(CFLAGS) -o $@ unpack.c huffman.c huffman.h

clean:
	rm -rf $(APP) $(TMP_FILES)
huffman.c (98 lines, new file)

@@ -0,0 +1,98 @@
#include "huffman.h"

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static int
find_smallest (struct huffman_node **nodes, int count, int different)
{
    int smallest;
    int i;

    for (i = 0; nodes[i]->weight == -1; i++);

    if (i == different) {
        for (i++; nodes[i]->weight == -1; i++);
    }
    smallest = i;

    for (i = smallest + 1; i < count; i++) {
        if (i == different || nodes[i]->weight == -1) {
            continue;
        }

        if (nodes[i]->weight < nodes[smallest]->weight) {
            smallest = i;
        }
    }

    return smallest;
}

struct huffman_node *
huffman_build_tree(void **values, int count)
{
    int i;
    struct huffman_node **nodes;

    nodes = malloc (sizeof (struct huffman_node *) * count);
    for (i = 0; i < count; i++) {
        struct huffman_node *node =
            malloc (sizeof (struct huffman_node));

        node->value = values[i];
        node->weight = i;
        node->left = NULL;
        node->right = NULL;

        nodes[i] = node;
    }

    int tree1;
    int tree2;
    for (i = 1; i < count; i++) {
        struct huffman_node *tmp;

        tree1 = find_smallest (nodes, count, -1);
        tree2 = find_smallest (nodes, count, tree1);

        tmp = nodes[tree1];

        nodes[tree1] = malloc (sizeof (struct huffman_node));
        nodes[tree1]->weight = tmp->weight + nodes[tree2]->weight;
        nodes[tree1]->value = NULL;
        nodes[tree1]->left = nodes[tree2];
        nodes[tree1]->right = tmp;

        nodes[tree2]->weight = -1;
    }

    return nodes[tree1];
}

void *
huffman_lookup (struct huffman_node *tree, unsigned char *bits, int *bits_read)
{
    struct huffman_node *node = tree;
    *bits_read = 0;

    while (true) {
        if (node->value != NULL) {
            return node->value;
        }

        if ((bits[0] << *bits_read % 8 & 0x80) == 0) {
            node = node->left;
        } else {
            node = node->right;
        }

        (*bits_read)++;
        if (*bits_read % 8 == 0) {
            bits++;
        }
    }
}
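How the two entry points fit together: huffman_build_tree() uses an entry's array index as its weight, so callers are expected to pass values ordered least frequent first, and huffman_lookup() walks the tree one bit at a time (0 goes left, 1 goes right) until it reaches a leaf. A minimal usage sketch, not part of the commit; the word list and the codeword byte are made up:

#include <stdio.h>
#include "huffman.h"

int main(void)
{
    /* hypothetical dictionary, ordered least to most frequent */
    char *words[] = { "rare", "uncommon", "common", "constant" };
    struct huffman_node *tree = huffman_build_tree ((void **) words, 4);

    int bits_read;
    unsigned char code = 0x80;  /* bits 10......, zero padded */
    char *value = huffman_lookup (tree, &code, &bits_read);

    /* with this tree, 10 decodes to "common" after two bits */
    printf ("decoded \"%s\" from %d bit(s)\n", value, bits_read);
    return 0;
}

Because later entries carry larger weights, the most frequent string ends up nearest the root and gets the shortest code.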
huffman.h (12 lines, new file)

@@ -0,0 +1,12 @@
struct huffman_node {
    int weight;
    void *value;
    struct huffman_node *left;
    struct huffman_node *right;
};

struct huffman_node *huffman_build_tree(void **values, int count);

void *huffman_lookup (struct huffman_node *tree, unsigned char *bits,
                      int *bits_read);
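A non-NULL value marks a leaf and internal nodes carry value == NULL, which is exactly what huffman_lookup() tests to stop walking. A hypothetical helper, not part of the commit, that prints every entry's codeword by recursing over the tree; the caller-supplied prefix buffer size is an assumption:

#include <stdio.h>
#include "huffman.h"

/* append '0' for a left edge and '1' for a right edge until a leaf is hit */
static void
print_codes (struct huffman_node *node, char *prefix, int depth)
{
    if (node->value != NULL) {
        prefix[depth] = '\0';
        printf ("%-10s %s\n", prefix, (char *) node->value);
        return;
    }

    prefix[depth] = '0';
    print_codes (node->left, prefix, depth + 1);
    prefix[depth] = '1';
    print_codes (node->right, prefix, depth + 1);
}

/* usage: char buf[64]; print_codes (tree, buf, 0); */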
huffman.rb (28 lines changed)

@@ -49,20 +49,34 @@ class NodeQueue
    generate_tree
  end

  def find_smallest(not_this)
    smallest = nil
    for i in 0..@nodes.size - 1
      if i == not_this
        next
      end
      if smallest.nil? or @nodes[i].weight < @nodes[smallest].weight
        smallest = i
      end
    end
    smallest
  end

  def generate_tree
    while @nodes.size > 1
      sorted = @nodes.sort { |a,b| a.weight <=> b.weight }
      to_merge = []
      2.times { to_merge << sorted.shift }
      sorted << merge_nodes(to_merge[0], to_merge[1])
      @nodes = sorted
      node1 = self.find_smallest(-1)
      node2 = self.find_smallest(node1)
      new = merge_nodes(@nodes[node1], @nodes[node2])
      @nodes[node1] = new
      @nodes.delete_at(node2)
    end
    @huffman_root = @nodes.first
  end

  def merge_nodes(node1, node2)
    left = node1.weight > node2.weight ? node2 : node1
    right = left == node1 ? node2 : node1
    right = node1
    left = node2
    node = HuffNode.new(:weight => left.weight + right.weight, :left => left, :right => right)
    left.parent = right.parent = node
    node
thing.rb (29 lines changed)

@@ -248,7 +248,8 @@ end
def write_strings(file, strings)
  string_io = StringIO.new()
  strings.each_key do |string|

  strings.each do |string|
    string_io.write(string)
    string_io.write("\0")
  end

@@ -264,13 +265,21 @@ def collect_strings(parent)
      strings[key] += 1
    end
  end
  strings

  list = []
  strings.sort { |l, r| l[1] <=> r[1] }.each do |string, weight|
    list << string
  end

  list
end

def build_huffman_for_strings(parent)
def build_huffman_for_strings(strings)
  paths = []
  parent.flatten.uniq.each do |node|
    node.children.each_key {|key| paths << key}
  i = 1
  strings.each do |string|
    i.times { paths << string }
    i += 1
  end
  HuffmanEncoding.new paths
end

@@ -349,13 +358,15 @@ if $0 == __FILE__
  de_dupe_driver(parent)
  # parent = compress_prefix(parent)

  puts "building huffman table for strings"
  string_huff = build_huffman_for_strings(parent)
  puts "building huffman table for nodes"
  node_huff = build_huffman_for_nodes(parent)

  puts "writing"

  strings = collect_strings(parent)

  puts "building huffman table for strings"
  string_huff = build_huffman_for_strings(strings)

  puts "writing"
  write_strings(file, strings)
  bit_file = BitWriter.new file
  binary_write(bit_file, parent, string_huff, node_huff)
unpack.c (50 lines changed)

@@ -3,6 +3,8 @@
#include <stdlib.h>
#include <zlib.h>

#include "huffman.h"

#define CHUNK 1024

struct node {

@@ -12,15 +14,15 @@ struct node {
};

static int
load_dictionary(FILE *source, unsigned char **dictionary) {
load_dictionary(FILE *source, char ***dictionary, int *dictionary_size)
{
    int ret;
    unsigned have;
    z_stream strm;
    unsigned char in[CHUNK];
    int read = 0;

    // XXX keep a ref to buf for free()
    unsigned char *buf = malloc(sizeof(char) * CHUNK);
    *dictionary = buf;

    printf("unpacking string dictionary\n");

@@ -67,7 +69,6 @@ load_dictionary(FILE *source, unsigned char **dictionary) {
            printf("MEMORY ERROR\n");
            return -1;
        }
        have = CHUNK - strm.avail_out;
        read += CHUNK - strm.avail_out;
    } while (strm.avail_out == 0);

@@ -75,17 +76,34 @@ load_dictionary(FILE *source, unsigned char **dictionary) {
        /* done when inflate() says it's done */
    } while (ret != Z_STREAM_END);

    printf("data is:\n");
    int offset_size = 64;
    int *dictionary_offsets = malloc (sizeof (int) * offset_size);
    *dictionary_size = 1;

    int i;
    for (i=0; i < read; i++) {
    int j = 0;
    dictionary_offsets[j++] = 0;
    for (i = 0; i < read; i++) {
        if (buf[i] == '\0') {
            putchar('\n');
        } else {
            putchar(buf[i]);
            if (i != read - 1) {
                dictionary_offsets[j++] = i + 1;
                (*dictionary_size)++;
                if (j == offset_size) {
                    offset_size = offset_size * 2;
                    dictionary_offsets =
                        realloc (dictionary_offsets,
                                 sizeof (int) *
                                 offset_size);
                }
            }
        }
    }

    *dictionary = malloc (sizeof (char *) * offset_size);
    for (i = 0; i < offset_size; i++) {
        (*dictionary)[i] = (char *) buf + dictionary_offsets[i];
    }

    // rewind back to unused zlib bytes
    if (fseek(source, (long) strm.avail_in * -1, SEEK_CUR)) {
        printf("Error seeking back in stream\n");
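The loop above indexes the inflated buffer: the data is a run of NUL-separated strings (the layout write_strings() in thing.rb emits), each recorded offset points at the byte after a terminator, and *dictionary becomes an array of pointers into that single buffer. A standalone sketch of the same indexing scheme, using made-up data:

#include <stdio.h>

int main(void)
{
    unsigned char buf[] = "alpha\0beta\0gamma";  /* stand-in for inflate() output */
    int offsets[] = { 0, 6, 11 };                /* 0, then the byte after each '\0' */
    char *dictionary[3];
    int i;

    for (i = 0; i < 3; i++) {
        dictionary[i] = (char *) buf + offsets[i];
        printf ("dictionary[%d] = \"%s\"\n", i, dictionary[i]);
    }
    return 0;
}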
@@ -95,6 +113,7 @@ load_dictionary(FILE *source, unsigned char **dictionary) {
    printf ("dictionary stats:\n");
    printf ("\tcompressed size: %zu\n", ftell(source));
    printf ("\tuncompressed size: %d\n", read);
    printf ("\tentries found: %d\n", *dictionary_size);
    inflateEnd(&strm);

    return ret == Z_STREAM_END ? 0 : -1;

@@ -117,7 +136,8 @@ load_node_list(FILE *stream, struct node **list) {
int
main(int argc, char **argv) {
    FILE *fp;
    unsigned char *dictionary;
    char **dictionary;
    int dictionary_size;
    struct node *list;

    if (argc != 2) {

@@ -131,11 +151,19 @@ main(int argc, char **argv) {
        return -1;
    }

    if (load_dictionary(fp, &dictionary)) {
    if (load_dictionary(fp, &dictionary, &dictionary_size)) {
        printf("dictionary inflation failed. exiting\n");
        return -1;
    }

    struct huffman_node *tree = huffman_build_tree ((void **) dictionary,
                                                    dictionary_size);

    int bits_read;
    short bits = 0xC0;

    printf("\n\n%s\n", huffman_lookup (tree, (unsigned char *) &bits, &bits_read));

    if (load_node_list(fp, &list)) {
        printf("node list parsing failed. exiting\n");
        return -1;
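The lookup added to main() is a smoke test: it stores 0xC0 in a short and passes that address, and since huffman_lookup() starts from the first byte it is handed, the walk begins with the two leading 1 bits only on a little-endian machine. A byte-sized variant of the same probe (a sketch, not part of the commit) avoids the endianness assumption:

#include <stdio.h>
#include "huffman.h"

static void
probe (struct huffman_node *tree)
{
    unsigned char bits = 0xC0;  /* 1100 0000: walk right, right, ... */
    int bits_read;
    char *value = huffman_lookup (tree, &bits, &bits_read);

    printf ("decoded \"%s\" after %d bit(s)\n", value, bits_read);
}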