Decoding working for C
commit 16345dbad2
parent 11fd9f1f4a

3 changed files with 187 additions and 68 deletions
@@ -43,7 +43,7 @@ huffman_build_tree(void **values, int count)
 			malloc (sizeof (struct huffman_node));
 
 		node->value = values[i];
-		node->weight = i;
+		node->weight = i + 1;
 		node->left = NULL;
 		node->right = NULL;
 
@@ -77,9 +77,11 @@ huffman_lookup (struct huffman_node *tree, unsigned char *bits, int *bits_read)
 {
 
 	struct huffman_node *node = tree;
-	*bits_read = 0;
 
 	while (true) {
+		if (node == NULL) {
+			return NULL;
+		}
 		if (node->value != NULL) {
 			return node->value;
 		}
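A note on the huffman_lookup hunk above: with "*bits_read = 0;" gone, the caller now owns the running bit offset. The decode loop added to unpack.c further down suggests the intended pattern: pass the leftover offset back in, then advance the byte pointer by bits_read / 8 and keep bits_read % 8 for the next call. A minimal sketch of that calling pattern, assuming the void-pointer return type implied by the casts elsewhere in this diff (decode_some is an invented helper, not part of the patch):

	#include <stdio.h>

	#include "huffman.h"

	/* Illustration only: decode up to "symbols" values from a buffer of
	 * packed codes, carrying the bit offset across huffman_lookup calls. */
	static void
	decode_some (struct huffman_node *tree, unsigned char *buf, int symbols)
	{
		int bits_read = 0;	/* running offset into the current byte */
		int i;

		for (i = 0; i < symbols; i++) {
			char *value = (char *) huffman_lookup (tree, buf, &bits_read);
			if (value == NULL) {	/* walked off the tree */
				break;
			}
			buf = buf + bits_read / 8;	/* drop fully consumed bytes */
			bits_read = bits_read % 8;	/* keep the leftover bits */
			printf ("%s\n", value);
		}
	}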
thing.rb (115 changes)

@@ -22,20 +22,22 @@ end
 require './huffman'
 
 $log = Logger.new(STDOUT)
-#$log.level = Logger::DEBUG
-$log.level = Logger::FATAL
+$log.level = Logger::DEBUG
+#$log.level = Logger::FATAL
 
+$sentinal = "SENTINAL"
+
 class BitWriter
 
   def initialize(stream)
     @stream = stream
     @byte = 0x00
-    @count = 8
+    @count = 7
   end
 
   def write(char)
     if char == '1'
-      @byte |= 1 << @count
+      @byte |= 0x01 << @count
     end
     @count -= 1
     if @count == -1
@@ -50,8 +52,8 @@ class BitWriter
   end
 
   def pad()
-    @count = 8
-    @stream.write(Array(@byte).pack('C'))
+    @stream.write(Array(@byte).pack('c'))
+    @count = 7
     @byte = 0x00
   end
 end
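For readers following the BitWriter change: @count now starts at 7 so the first bit written lands in the most significant bit of the byte, and pad() flushes whatever partial byte remains before resetting. The same MSB-first packing written out in C, purely as an illustration of the bit order (the struct and function names are made up, and the flush inside write is inferred since that part of the Ruby falls outside the hunks shown):

	#include <stdio.h>

	/* Rough C equivalent of BitWriter: bit 7 (MSB) is filled first. */
	struct bit_writer {
		FILE *stream;
		unsigned char byte;
		int count;		/* next bit position, 7 down to 0 */
	};

	static void
	bit_writer_write (struct bit_writer *w, char bit)
	{
		if (bit == '1') {
			w->byte |= 0x01 << w->count;
		}
		w->count--;
		if (w->count == -1) {	/* byte is full: flush and reset */
			fputc (w->byte, w->stream);
			w->count = 7;
			w->byte = 0x00;
		}
	}

	static void
	bit_writer_pad (struct bit_writer *w)
	{
		/* emit whatever partial byte is left, low bits zero-padded */
		fputc (w->byte, w->stream);
		w->count = 7;
		w->byte = 0x00;
	}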
@@ -199,33 +201,20 @@ def ran_char(val)
   return val
 end
 
-def binary_write(file, parent, string_huff, node_huff)
-  # file.write(parent.path)
-  # file.write("\0")
-  #offset to child node indicies
-  # not needed, can just go write to children indicies
-  #file.write(ran_char)
-  if parent.written
-    return
-  end
-
-  parent.children.each do |path, child|
-    # puts "PATH: " + child.path
-    # file.write(child.path)
-    # file.write("\0")
-    # index of path string
-    $log.debug('binary_write') { "path: " + path.inspect + "; encoded: " + string_huff.encode(path).inspect }
-    file.write_bits(string_huff.encode(path))
-    # offset to node
-    # index of node, that is.
-    file.write_bits(node_huff.encode(child))
-  end
-  # reserve null byte for end of node info
-  # 3 0s are reserved in our name huffman table to denote end of node
-  file.write_bits("000")
-  parent.children.each do |path, child|
-    binary_write(file, child, string_huff, node_huff)
-    child.written = true
-  end
+def binary_write(file, node_list, string_huff, node_huff)
+  node_list.each do |node|
+    $log.debug('binary_write') { "begin node: " + node_huff.encode(node) }
+    node.children.each do |path, child|
+      # index of path string
+      $log.debug('binary_write') { "\tpath: " + path.inspect + "; encoded: " + string_huff.encode(path).inspect }
+      file.write_bits(string_huff.encode(path))
+      # offset to node
+      # index of node, that is.
+      file.write_bits(node_huff.encode(child))
+      $log.debug('binary_write') { "\tnode encoded: " + node_huff.encode(child) }
+    end
+    # end of node is indicated by the special sentinal huffman coding of \0
+    file.write_bits(string_huff.encode($sentinal))
 end
 end
@@ -281,22 +270,38 @@ def build_huffman_for_strings(strings)
     i.times { paths << string }
     i += 1
   end
+  # add on sentinal string
+  i.times { paths << $sentinal }
+  puts paths
   HuffmanEncoding.new paths
 end
 
-def build_huffman_for_nodes(parent)
+def build_node_frequencies(parent)
   nodes = parent.flatten.uniq
   refs = {}
   nodes.each do |node|
-    node.children.each do |key, node|
-      refs[node] ||= 0
-      refs[node] += 1
+    node.children.each do |key, child|
+      refs[child] ||= 0
+      refs[child] += 1
     end
   end
-  refs[parent] = 1
+
+  list = []
+  refs.sort { |l, r| l[1] <=> r[1] }.each do |node, weight|
+    list << node
+  end
+
+  list
+end
+
+
+def build_huffman_for_nodes(list)
+  # parent doesn't have to go into the table
+  i = 1
   expanded = []
-  refs.each do |node, freq|
-    freq.times {expanded << node}
+  list.each do |node|
+    i.times {expanded << node}
+    i += 1
   end
   table = HuffmanEncoding.new expanded
 end
@@ -359,8 +364,13 @@ if $0 == __FILE__
   # parent = compress_prefix(parent)
 
   puts "building huffman table for nodes"
-  node_huff = build_huffman_for_nodes(parent)
+  node_list = build_node_frequencies(parent)
+  node_huff = build_huffman_for_nodes(node_list)
 
+  # XXX add sentinal value to strings to indicate end of node.
+  # should be most frequent one. the string itself doesn't have to
+  # be stored, since we just care about the bitstring.
+
   strings = collect_strings(parent)
 
   puts "building huffman table for strings"
@@ -368,8 +378,19 @@ if $0 == __FILE__
 
   puts "writing"
   write_strings(file, strings)
 
+  # write out the number of unique path nodes into 1 or more bytes. if <
+  # 128 nodes, write in a single byte. if > 128 nodes, the first byte will
+  # begin with a '1' to indicate as such. the following bits in the byte
+  # indicate how many bytes following the first byte are used to store the
+  # size.
+
+  node_count = node_list.count + 1
+  puts node_count
+  file.write([node_count].pack("c"))
+
   bit_file = BitWriter.new file
-  binary_write(bit_file, parent, string_huff, node_huff)
+  binary_write(bit_file, [parent] + node_list, string_huff, node_huff)
   bit_file.pad
 end
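The new comment above describes a variable-length node count, but as of this commit both sides still use a single byte: the Ruby writes pack("c") and unpack.c reads one unsigned char. A rough C sketch of the scheme the comment describes, should it be implemented later (write_node_count is hypothetical, and emitting the extra size bytes most-significant-first is an assumption the comment does not spell out):

	#include <stdio.h>

	/* Hypothetical, not in the patch: counts under 128 fit in one byte;
	 * otherwise the first byte has its high bit set and its low bits say
	 * how many of the following bytes hold the count. */
	static void
	write_node_count (FILE *stream, unsigned long count)
	{
		if (count < 128) {
			fputc ((int) count, stream);
			return;
		}

		unsigned char bytes[sizeof (count)];
		int used = 0;

		while (count > 0) {
			bytes[used++] = count & 0xff;
			count >>= 8;
		}

		fputc (0x80 | used, stream);		/* '1' flag plus byte count */
		while (used > 0) {
			fputc (bytes[--used], stream);	/* most significant byte first */
		}
	}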
unpack.c (134 changes)

@@ -1,6 +1,7 @@
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdbool.h>
 #include <zlib.h>
 
 #include "huffman.h"
@@ -8,9 +9,9 @@
 #define CHUNK 1024
 
 struct node {
-	struct node *next;
-	unsigned int path;
-	unsigned int children[];
+	int count;
+	char **paths;
+	struct node **children;
 };
 
 static int
@@ -99,11 +100,17 @@ load_dictionary(FILE *source, char ***dictionary, int *dictionary_size)
 		}
 	}
 
-	*dictionary = malloc (sizeof (char *) * offset_size);
-	for (i = 0; i < offset_size; i++) {
+	*dictionary = malloc (sizeof (char *) * (*dictionary_size + 1));
+	for (i = 0; i < *dictionary_size; i++) {
 		(*dictionary)[i] = (char *) buf + dictionary_offsets[i];
 	}
 
+	(*dictionary_size)++;
+	// Add in the end of node sentinal string
+	char *sentinal = malloc (sizeof (char));
+	sentinal[0] = 0x00;
+	(*dictionary)[i] = sentinal;
+
 	// rewind back to unused zlib bytes
 	if (fseek(source, (long) strm.avail_in * -1, SEEK_CUR)) {
 		printf("Error seeking back in stream\n");
@@ -120,25 +127,117 @@ load_dictionary(FILE *source, char ***dictionary, int *dictionary_size)
 }
 
 static int
-load_node_list(FILE *stream, struct node **list) {
+load_content_sets(FILE *stream, struct node **list,
+		  struct huffman_node *dictionary_tree) {
 
-	unsigned char buf[CHUNK];
+	unsigned char *buf = malloc (sizeof (char *) * CHUNK);
 	size_t read;
-	struct node *np = malloc(sizeof(struct node));
-	*list = np;
+	struct node **nodes;
+	int i;
 
-	read = fread(buf, 1, CHUNK, stream);
+	unsigned char count;
+	fread(&count, sizeof (unsigned char), 1, stream);
+	printf("number of nodes: %hd\n", count);
+
+
+	nodes = malloc (sizeof (struct node *) * (unsigned short) count);
+	for (i = 0; i < (unsigned short) count; i++) {
+		nodes[i] = malloc (sizeof (struct node));
+	}
+
+	read = fread (buf, sizeof (char), CHUNK, stream);
 	printf("Read %zu bytes\n", read);
 
+	/*
+	 * the parent node doesn't go in the huffman tree, as nothing else
+	 * references it.
+	 */
+	struct huffman_node *tree =
+		huffman_build_tree ((void **) nodes + 1,
+				    (unsigned short) count - 1);
+
+	int bits_read = 0;
+	for (i = 0; i < count; i++) {
+		struct node *node = nodes[i];
+		node->count = 0;
+
+		// XXX hard coded
+		node->paths = malloc (sizeof (char *) * 64);
+		node->children = malloc (sizeof (struct node *) * 64);
+
+		while (true) {
+			char *path = (char *) huffman_lookup (dictionary_tree,
+							      buf, &bits_read);
+			buf = buf + bits_read / 8;
+			bits_read = bits_read % 8;
+
+			if (path[0] == '\0') {
+				break;
+			}
+
+			struct node *child =
+				(struct node *) huffman_lookup (tree, buf,
+								&bits_read);
+			buf = buf + bits_read / 8;
+			bits_read = bits_read % 8;
+
+			node->paths[node->count] = path;
+			node->children[node->count] = child;
+			node->count++;
+		}
+	}
+
+	*list = nodes[0];
 	return 0;
 }
 
+struct stack {
+	struct stack *next;
+	struct stack *prev;
+	char *path;
+};
+
+static void
+dump_content_set (struct node *content_sets, struct stack *head,
+		  struct stack *tail)
+{
+	int i;
+	struct stack stack;
+	stack.prev = tail;
+	tail->next = &stack;
+
+	for (i = 0; i < content_sets->count; i++) {
+		stack.path = content_sets->paths[i];
+		dump_content_set(content_sets->children[i], head, &stack);
+	}
+
+	if (content_sets->count == 0) {
+		struct stack *cur = head;
+
+		for (cur = head->next; cur != &stack; cur = cur->next) {
+			printf("/%s", cur->path);
+		}
+		printf("\n");
+	}
+}
+
+static void
+dump_content_sets (struct node *content_sets)
+{
+	struct stack stack;
+	stack.next = NULL;
+	stack.prev = NULL;
+	stack.path = NULL;
+
+	dump_content_set (content_sets, &stack, &stack);
+}
+
 int
 main(int argc, char **argv) {
 	FILE *fp;
 	char **dictionary;
 	int dictionary_size;
-	struct node *list;
+	struct node *content_sets;
 
 	if (argc != 2) {
 		printf("usage: unpack <bin file>\n");
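In the dump_content_set code added above, every recursion level links one struct stack frame allocated on the C call stack, and a leaf prints the chain from the sentinel head up to, but not including, its own frame. A stripped-down standalone example of the same frame-chaining pattern (demo_node, frame, walk, and the sample data are all invented for illustration):

	#include <stdio.h>

	/* Toy tree shaped like unpack.c's struct node, for demonstration only. */
	struct demo_node {
		int count;
		const char **paths;
		struct demo_node **children;
	};

	struct frame {
		struct frame *next;
		const char *path;
	};

	static void
	walk (struct demo_node *node, struct frame *head, struct frame *tail)
	{
		struct frame frame;	/* this level's link, lives on the C stack */
		int i;

		tail->next = &frame;

		if (node->count == 0) {
			struct frame *cur;
			/* stop at our own, path-less frame, like dump_content_set */
			for (cur = head->next; cur != &frame; cur = cur->next) {
				printf ("/%s", cur->path);
			}
			printf ("\n");
		}

		for (i = 0; i < node->count; i++) {
			frame.path = node->paths[i];	/* label the edge to this child */
			walk (node->children[i], head, &frame);
		}
	}

	int
	main (void)
	{
		struct demo_node leaf = { 0, NULL, NULL };
		const char *paths[] = { "os", "arch" };
		struct demo_node *kids[] = { &leaf, &leaf };
		struct demo_node root = { 2, paths, kids };

		struct frame head = { NULL, NULL };
		walk (&root, &head, &head);	/* prints "/os" then "/arch" */
		return 0;
	}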
@@ -156,18 +255,15 @@ main(int argc, char **argv) {
 		return -1;
 	}
 
-	struct huffman_node *tree = huffman_build_tree ((void **) dictionary,
-							dictionary_size);
+	struct huffman_node *dictionary_tree =
+		huffman_build_tree ((void **) dictionary, dictionary_size);
 
-	int bits_read;
-	short bits = 0xC0;
-
-	printf("\n\n%s\n", huffman_lookup (tree, (unsigned char *) &bits, &bits_read));
-
-	if (load_node_list(fp, &list)) {
+	if (load_content_sets(fp, &content_sets, dictionary_tree)) {
 		printf("node list parsing failed. exiting\n");
 		return -1;
 	}
 
+	dump_content_sets (content_sets);
+
 	return 0;
 }