diff --git a/huffman.c b/huffman.c index 6cfe797..6e49e97 100644 --- a/huffman.c +++ b/huffman.c @@ -43,7 +43,7 @@ huffman_build_tree(void **values, int count) malloc (sizeof (struct huffman_node)); node->value = values[i]; - node->weight = i; + node->weight = i + 1; node->left = NULL; node->right = NULL; @@ -77,9 +77,11 @@ huffman_lookup (struct huffman_node *tree, unsigned char *bits, int *bits_read) { struct huffman_node *node = tree; - *bits_read = 0; while (true) { + if (node == NULL) { + return NULL; + } if (node->value != NULL) { return node->value; } diff --git a/thing.rb b/thing.rb index 6e21ab3..7bb83e9 100755 --- a/thing.rb +++ b/thing.rb @@ -22,20 +22,22 @@ end require './huffman' $log = Logger.new(STDOUT) -#$log.level = Logger::DEBUG -$log.level = Logger::FATAL +$log.level = Logger::DEBUG +#$log.level = Logger::FATAL + +$sentinal = "SENTINAL" class BitWriter def initialize(stream) @stream = stream @byte = 0x00 - @count = 8 + @count = 7 end def write(char) if char == '1' - @byte |= 1 << @count + @byte |= 0x01 << @count end @count -= 1 if @count == -1 @@ -50,8 +52,8 @@ class BitWriter end def pad() - @count = 8 - @stream.write(Array(@byte).pack('C')) + @stream.write(Array(@byte).pack('c')) + @count = 7 @byte = 0x00 end end @@ -199,33 +201,20 @@ def ran_char(val) return val end -def binary_write(file, parent, string_huff, node_huff) -# file.write(parent.path) -# file.write("\0") - #offset to child node indicies - # not needed, can just go write to children indicies - #file.write(ran_char) - if parent.written - return - end - - parent.children.each do |path, child| -# puts "PATH: " + child.path -# file.write(child.path) -# file.write("\0") - # index of path string - $log.debug('binary_write') { "path: " + path.inspect + "; encoded: " + string_huff.encode(path).inspect } - file.write_bits(string_huff.encode(path)) - # offset to node - # index of node, that is. - file.write_bits(node_huff.encode(child)) - end - # reserve null byte for end of node info - # 3 0s are reserved in our name huffman table to denote end of node - file.write_bits("000") - parent.children.each do |path, child| - binary_write(file, child, string_huff, node_huff) - child.written = true +def binary_write(file, node_list, string_huff, node_huff) + node_list.each do |node| + $log.debug('binary_write') { "begin node: " + node_huff.encode(node) } + node.children.each do |path, child| + # index of path string + $log.debug('binary_write') { "\tpath: " + path.inspect + "; encoded: " + string_huff.encode(path).inspect } + file.write_bits(string_huff.encode(path)) + # offset to node + # index of node, that is. + file.write_bits(node_huff.encode(child)) + $log.debug('binary_write') { "\tnode encoded: " + node_huff.encode(child) } + end + # end of node is indicated by the special sentinal huffman coding of \0 + file.write_bits(string_huff.encode($sentinal)) end end @@ -281,22 +270,38 @@ def build_huffman_for_strings(strings) i.times { paths << string } i += 1 end + # add on sentinal string + i.times { paths << $sentinal } + puts paths HuffmanEncoding.new paths end -def build_huffman_for_nodes(parent) - nodes = parent.flatten.uniq - refs = {} - nodes.each do |node| - node.children.each do |key, node| - refs[node] ||= 0 - refs[node] += 1 - end +def build_node_frequencies(parent) + nodes = parent.flatten.uniq + refs = {} + nodes.each do |node| + node.children.each do |key, child| + refs[child] ||= 0 + refs[child] += 1 end - refs[parent] = 1 + end + + list = [] + refs.sort { |l, r| l[1] <=> r[1] }.each do |node, weight| + list << node + end + + list +end + + +def build_huffman_for_nodes(list) + # parent doesn't have to go into the table + i = 1 expanded = [] - refs.each do |node, freq| - freq.times {expanded << node} + list.each do |node| + i.times {expanded << node} + i += 1 end table = HuffmanEncoding.new expanded end @@ -359,8 +364,13 @@ if $0 == __FILE__ # parent = compress_prefix(parent) puts "building huffman table for nodes" - node_huff = build_huffman_for_nodes(parent) - + node_list = build_node_frequencies(parent) + node_huff = build_huffman_for_nodes(node_list) + + # XXX add sentinal value to strings to indicate end of node. + # should be most frequent one. the string itself doesn't have to + # be stored, since we just care about the bitstring. + strings = collect_strings(parent) puts "building huffman table for strings" @@ -368,8 +378,19 @@ if $0 == __FILE__ puts "writing" write_strings(file, strings) + + # write out the number of unique path nodes into 1 or more bytes. if < + # 128 nodes, write in a single byte. if > 128 nodes, the first byte will + # begin with a '1' to indicate as such. the following bits in the byte + # indicate how many bytes following the first byte are used to store the + # size. + + node_count = node_list.count + 1 + puts node_count + file.write([node_count].pack("c")) + bit_file = BitWriter.new file - binary_write(bit_file, parent, string_huff, node_huff) + binary_write(bit_file, [parent] + node_list, string_huff, node_huff) bit_file.pad end diff --git a/unpack.c b/unpack.c index 6a8400b..cb19054 100644 --- a/unpack.c +++ b/unpack.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include "huffman.h" @@ -8,9 +9,9 @@ #define CHUNK 1024 struct node { - struct node *next; - unsigned int path; - unsigned int children[]; + int count; + char **paths; + struct node **children; }; static int @@ -99,11 +100,17 @@ load_dictionary(FILE *source, char ***dictionary, int *dictionary_size) } } - *dictionary = malloc (sizeof (char *) * offset_size); - for (i = 0; i < offset_size; i++) { + *dictionary = malloc (sizeof (char *) * (*dictionary_size + 1)); + for (i = 0; i < *dictionary_size; i++) { (*dictionary)[i] = (char *) buf + dictionary_offsets[i]; } + (*dictionary_size)++; + // Add in the end of node sentinal string + char *sentinal = malloc (sizeof (char)); + sentinal[0] = 0x00; + (*dictionary)[i] = sentinal; + // rewind back to unused zlib bytes if (fseek(source, (long) strm.avail_in * -1, SEEK_CUR)) { printf("Error seeking back in stream\n"); @@ -120,25 +127,117 @@ load_dictionary(FILE *source, char ***dictionary, int *dictionary_size) } static int -load_node_list(FILE *stream, struct node **list) { +load_content_sets(FILE *stream, struct node **list, + struct huffman_node *dictionary_tree) { - unsigned char buf[CHUNK]; + unsigned char *buf = malloc (sizeof (char *) * CHUNK); size_t read; - struct node *np = malloc(sizeof(struct node)); - *list = np; + struct node **nodes; + int i; - read = fread(buf, 1, CHUNK, stream); + unsigned char count; + fread(&count, sizeof (unsigned char), 1, stream); + printf("number of nodes: %hd\n", count); + + + nodes = malloc (sizeof (struct node *) * (unsigned short) count); + for (i = 0; i < (unsigned short) count; i++) { + nodes[i] = malloc (sizeof (struct node)); + } + + read = fread (buf, sizeof (char), CHUNK, stream); printf("Read %zu bytes\n", read); + /* + * the parent node doesn't go in the huffman tree, as nothing else + * references it. + */ + struct huffman_node *tree = + huffman_build_tree ((void **) nodes + 1, + (unsigned short) count - 1); + + int bits_read = 0; + for (i = 0; i < count; i++) { + struct node *node = nodes[i]; + node->count = 0; + + // XXX hard coded + node->paths = malloc (sizeof (char *) * 64); + node->children = malloc (sizeof (struct node *) * 64); + + while (true) { + char *path = (char *) huffman_lookup (dictionary_tree, + buf, &bits_read); + buf = buf + bits_read / 8; + bits_read = bits_read % 8; + + if (path[0] == '\0') { + break; + } + + struct node *child = + (struct node *) huffman_lookup (tree, buf, + &bits_read); + buf = buf + bits_read / 8; + bits_read = bits_read % 8; + + node->paths[node->count] = path; + node->children[node->count] = child; + node->count++; + } + } + + *list = nodes[0]; return 0; } +struct stack { + struct stack *next; + struct stack *prev; + char *path; +}; + +static void +dump_content_set (struct node *content_sets, struct stack *head, + struct stack *tail) +{ + int i; + struct stack stack; + stack.prev = tail; + tail->next = &stack; + + for (i = 0; i < content_sets->count; i++) { + stack.path = content_sets->paths[i]; + dump_content_set(content_sets->children[i], head, &stack); + } + + if (content_sets->count == 0) { + struct stack *cur = head; + + for (cur = head->next; cur != &stack; cur = cur->next) { + printf("/%s", cur->path); + } + printf("\n"); + } +} + +static void +dump_content_sets (struct node *content_sets) +{ + struct stack stack; + stack.next = NULL; + stack.prev = NULL; + stack.path = NULL; + + dump_content_set (content_sets, &stack, &stack); +} + int main(int argc, char **argv) { FILE *fp; char **dictionary; int dictionary_size; - struct node *list; + struct node *content_sets; if (argc != 2) { printf("usage: unpack \n"); @@ -156,18 +255,15 @@ main(int argc, char **argv) { return -1; } - struct huffman_node *tree = huffman_build_tree ((void **) dictionary, - dictionary_size); + struct huffman_node *dictionary_tree = + huffman_build_tree ((void **) dictionary, dictionary_size); - int bits_read; - short bits = 0xC0; - - printf("\n\n%s\n", huffman_lookup (tree, (unsigned char *) &bits, &bits_read)); - - if (load_node_list(fp, &list)) { + if (load_content_sets(fp, &content_sets, dictionary_tree)) { printf("node list parsing failed. exiting\n"); return -1; } + dump_content_sets (content_sets); + return 0; }