diff --git a/Makefile b/Makefile
index 0e93870..3c0c0f4 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 CFLAGS += $(shell pkg-config --libs --cflags zlib)
-CFLAGS += -Wall
+CFLAGS += -Wall -g
 
 ifndef CC
 CC = gcc
@@ -11,8 +11,8 @@ TMP_FILES = $(wildcard *~)
 
 all: $(APP)
 
-%: %.c
-	$(CC) $(CFLAGS) -o $@ $<
+unpack: unpack.c huffman.c huffman.h
+	$(CC) $(CFLAGS) -o $@ unpack.c huffman.c huffman.h
 
 clean:
 	rm -rf $(APP) $(TMP_FILES)
diff --git a/huffman.c b/huffman.c
new file mode 100644
index 0000000..6cfe797
--- /dev/null
+++ b/huffman.c
@@ -0,0 +1,98 @@
+#include "huffman.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+static int
+find_smallest (struct huffman_node **nodes, int count, int different)
+{
+	int smallest;
+	int i;
+
+	for (i = 0; nodes[i]->weight == -1; i++);
+
+	if (i == different) {
+		for (i++; nodes[i]->weight == -1; i++);
+	}
+	smallest = i;
+
+	for (i = smallest + 1; i < count; i++) {
+		if (i == different || nodes[i]->weight == -1) {
+			continue;
+		}
+
+		if (nodes[i]->weight < nodes[smallest]->weight) {
+			smallest = i;
+		}
+	}
+
+	return smallest;
+}
+
+struct huffman_node *
+huffman_build_tree (void **values, int count)
+{
+	int i;
+	struct huffman_node **nodes;
+
+	nodes = malloc (sizeof (struct huffman_node *) * count);
+	for (i = 0; i < count; i++) {
+		struct huffman_node *node =
+			malloc (sizeof (struct huffman_node));
+
+		node->value = values[i];
+		node->weight = i;
+		node->left = NULL;
+		node->right = NULL;
+
+		nodes[i] = node;
+	}
+
+	int tree1;
+	int tree2;
+	for (i = 1; i < count; i++) {
+		struct huffman_node *tmp;
+
+		tree1 = find_smallest (nodes, count, -1);
+		tree2 = find_smallest (nodes, count, tree1);
+
+		tmp = nodes[tree1];
+
+		nodes[tree1] = malloc (sizeof (struct huffman_node));
+		nodes[tree1]->weight = tmp->weight + nodes[tree2]->weight;
+		nodes[tree1]->value = NULL;
+		nodes[tree1]->left = nodes[tree2];
+		nodes[tree1]->right = tmp;
+
+		nodes[tree2]->weight = -1;
+	}
+
+	return nodes[tree1];
+}
+
+void *
+huffman_lookup (struct huffman_node *tree, unsigned char *bits, int *bits_read)
+{
+	struct huffman_node *node = tree;
+	*bits_read = 0;
+
+	while (true) {
+		if (node->value != NULL) {
+			return node->value;
+		}
+
+		if ((bits[0] << *bits_read % 8 & 0x80) == 0) {
+			node = node->left;
+		} else {
+			node = node->right;
+		}
+
+		(*bits_read)++;
+		if (*bits_read % 8 == 0) {
+			bits++;
+		}
+	}
+}
diff --git a/huffman.h b/huffman.h
new file mode 100644
index 0000000..c767ec3
--- /dev/null
+++ b/huffman.h
@@ -0,0 +1,12 @@
+
+struct huffman_node {
+	int weight;
+	void *value;
+	struct huffman_node *left;
+	struct huffman_node *right;
+};
+
+struct huffman_node *huffman_build_tree (void **values, int count);
+
+void *huffman_lookup (struct huffman_node *tree, unsigned char *bits,
+		int *bits_read);
diff --git a/huffman.rb b/huffman.rb
index ed8a170..2ddcd65 100644
--- a/huffman.rb
+++ b/huffman.rb
@@ -49,20 +49,34 @@ class NodeQueue
     generate_tree
   end
 
+  def find_smallest(not_this)
+    smallest = nil
+    for i in 0..@nodes.size - 1
+      if i == not_this
+        next
+      end
+      if smallest.nil? or @nodes[i].weight < @nodes[smallest].weight
+        smallest = i
+      end
+    end
+    smallest
+  end
+
   def generate_tree
     while @nodes.size > 1
-      sorted = @nodes.sort { |a,b| a.weight <=> b.weight }
-      to_merge = []
-      2.times { to_merge << sorted.shift }
-      sorted << merge_nodes(to_merge[0], to_merge[1])
-      @nodes = sorted
+      node1 = self.find_smallest(-1)
+      node2 = self.find_smallest(node1)
+      new = merge_nodes(@nodes[node1], @nodes[node2])
+      @nodes[node1] = new
+      @nodes.delete_at(node2)
     end
     @huffman_root = @nodes.first
   end
 
   def merge_nodes(node1, node2)
-    left = node1.weight > node2.weight ? node2 : node1
-    right = left == node1 ? node2 : node1
+    right = node1
+    left = node2
     node = HuffNode.new(:weight => left.weight + right.weight, :left => left, :right => right)
     left.parent = right.parent = node
     node
diff --git a/thing.rb b/thing.rb
index 0a5cfcc..6e21ab3 100755
--- a/thing.rb
+++ b/thing.rb
@@ -248,7 +248,8 @@ end
 
 def write_strings(file, strings)
   string_io = StringIO.new()
-  strings.each_key do |string|
+
+  strings.each do |string|
     string_io.write(string)
     string_io.write("\0")
   end
@@ -264,13 +265,21 @@ def collect_strings(parent)
       strings[key] += 1
     end
   end
-  strings
+
+  list = []
+  strings.sort { |l, r| l[1] <=> r[1] }.each do |string, weight|
+    list << string
+  end
+
+  list
 end
 
-def build_huffman_for_strings(parent)
+def build_huffman_for_strings(strings)
   paths = []
-  parent.flatten.uniq.each do |node|
-    node.children.each_key {|key| paths << key}
+  i = 1
+  strings.each do |string|
+    i.times { paths << string }
+    i += 1
   end
   HuffmanEncoding.new paths
 end
@@ -349,13 +358,15 @@ if $0 == __FILE__
   de_dupe_driver(parent)
   # parent = compress_prefix(parent)
 
-  puts "building huffman table for strings"
-  string_huff = build_huffman_for_strings(parent)
   puts "building huffman table for nodes"
   node_huff = build_huffman_for_nodes(parent)
-
-  puts "writing"
+  strings = collect_strings(parent)
+
+  puts "building huffman table for strings"
+  string_huff = build_huffman_for_strings(strings)
+
+  puts "writing"
   write_strings(file, strings)
   bit_file = BitWriter.new file
   binary_write(bit_file, parent, string_huff, node_huff)
diff --git a/unpack.c b/unpack.c
index 891b068..6a8400b 100644
--- a/unpack.c
+++ b/unpack.c
@@ -3,6 +3,8 @@
 #include <stdlib.h>
 #include <zlib.h>
 
+#include "huffman.h"
+
 #define CHUNK 1024
 
 struct node {
@@ -12,15 +14,15 @@
 };
 
 static int
-load_dictionary(FILE *source, unsigned char **dictionary) {
+load_dictionary(FILE *source, char ***dictionary, int *dictionary_size)
+{
 	int ret;
-	unsigned have;
	z_stream strm;
 	unsigned char in[CHUNK];
 	int read = 0;
 
+	// XXX keep a ref to buf for free()
 	unsigned char *buf = malloc(sizeof(char) * CHUNK);
-	*dictionary = buf;
 
 	printf("unpacking string dictionary\n");
 
@@ -67,7 +69,6 @@
 				printf("MEMORY ERROR\n");
 				return -1;
 			}
-			have = CHUNK - strm.avail_out;
 			read += CHUNK - strm.avail_out;
 		} while (strm.avail_out == 0);
 
@@ -75,17 +76,34 @@
 		/* done when inflate() says it's done */
 	} while (ret != Z_STREAM_END);
 
-	printf("data is:\n");
+	int offset_size = 64;
+	int *dictionary_offsets = malloc (sizeof (int) * offset_size);
+	*dictionary_size = 1;
 	int i;
-	for (i=0; i < read; i++) {
+	int j = 0;
+	dictionary_offsets[j++] = 0;
+	for (i = 0; i < read; i++) {
 		if (buf[i] == '\0') {
-			putchar('\n');
-		} else {
-			putchar(buf[i]);
+			if (i != read - 1) {
+				dictionary_offsets[j++] = i + 1;
+				(*dictionary_size)++;
+				if (j == offset_size) {
+					offset_size = offset_size * 2;
+					dictionary_offsets =
+						realloc (dictionary_offsets,
+							 sizeof (int) *
+							 offset_size);
+				}
+			}
 		}
 	}
 
+	*dictionary = malloc (sizeof (char *) * offset_size);
+	for (i = 0; i < offset_size; i++) {
+		(*dictionary)[i] = (char *) buf + dictionary_offsets[i];
+	}
+
 	// rewind back to unused zlib bytes
 	if (fseek(source, (long) strm.avail_in * -1, SEEK_CUR)) {
 		printf("Error seeking back in stream\n");
@@ -95,6 +113,7 @@
 	printf ("dictionary stats:\n");
 	printf ("\tcompressed size: %zu\n", ftell(source));
 	printf ("\tuncompressed size: %d\n", read);
+	printf ("\tentries found: %d\n", *dictionary_size);
 
 	inflateEnd(&strm);
 	return ret == Z_STREAM_END ? 0 : -1;
@@ -117,7 +136,8 @@ load_node_list(FILE *stream, struct node **list) {
 int
 main(int argc, char **argv) {
 	FILE *fp;
-	unsigned char *dictionary;
+	char **dictionary;
+	int dictionary_size;
 	struct node *list;
 
 	if (argc != 2) {
@@ -131,11 +151,19 @@ main(int argc, char **argv) {
 		return -1;
 	}
 
-	if (load_dictionary(fp, &dictionary)) {
+	if (load_dictionary(fp, &dictionary, &dictionary_size)) {
 		printf("dictionary inflation failed. exiting\n");
 		return -1;
 	}
 
+	struct huffman_node *tree = huffman_build_tree ((void **) dictionary,
+							dictionary_size);
+
+	int bits_read;
+	short bits = 0xC0;
+
+	printf("\n\n%s\n", huffman_lookup (tree, (unsigned char *) &bits, &bits_read));
+
 	if (load_node_list(fp, &list)) {
 		printf("node list parsing failed. exiting\n");
 		return -1;
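
For context, a minimal usage sketch of the new huffman.c API introduced by this patch. It is not part of the patch itself: the word list and file name are made up, and it assumes the values array is ordered least to most frequent, since huffman_build_tree() uses each entry's index as its weight (the same ordering collect_strings() now produces).

/* example.c -- hypothetical, build alongside huffman.c: gcc example.c huffman.c */
#include <stdio.h>

#include "huffman.h"

int
main (void)
{
	/* least frequent first; the index doubles as the weight */
	char *words[] = { "rarely", "sometimes", "often" };
	struct huffman_node *tree;
	unsigned char byte;
	int bits_read;

	tree = huffman_build_tree ((void **) words, 3);

	/* 0b10...... walks right then left: "sometimes", 2 bits consumed */
	byte = 0x80;
	printf ("%s (%d bits)\n",
		(char *) huffman_lookup (tree, &byte, &bits_read), bits_read);

	/* 0b0....... walks left once: "often", the most frequent word, 1 bit */
	byte = 0x00;
	printf ("%s (%d bits)\n",
		(char *) huffman_lookup (tree, &byte, &bits_read), bits_read);

	return 0;
}

Compiled against huffman.c, this should print "sometimes (2 bits)" and "often (1 bit)", mirroring the 0xC0 smoke test the patch adds to main() in unpack.c.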