Decoding working for C

This commit is contained in:
James Bowes 2012-08-10 10:49:13 -03:00
parent 11fd9f1f4a
commit 16345dbad2
3 changed files with 187 additions and 68 deletions

View file

@ -43,7 +43,7 @@ huffman_build_tree(void **values, int count)
malloc (sizeof (struct huffman_node)); malloc (sizeof (struct huffman_node));
node->value = values[i]; node->value = values[i];
node->weight = i; node->weight = i + 1;
node->left = NULL; node->left = NULL;
node->right = NULL; node->right = NULL;
@ -77,9 +77,11 @@ huffman_lookup (struct huffman_node *tree, unsigned char *bits, int *bits_read)
{ {
struct huffman_node *node = tree; struct huffman_node *node = tree;
*bits_read = 0;
while (true) { while (true) {
if (node == NULL) {
return NULL;
}
if (node->value != NULL) { if (node->value != NULL) {
return node->value; return node->value;
} }

View file

@ -22,20 +22,22 @@ end
require './huffman' require './huffman'
$log = Logger.new(STDOUT) $log = Logger.new(STDOUT)
#$log.level = Logger::DEBUG $log.level = Logger::DEBUG
$log.level = Logger::FATAL #$log.level = Logger::FATAL
$sentinal = "SENTINAL"
class BitWriter class BitWriter
def initialize(stream) def initialize(stream)
@stream = stream @stream = stream
@byte = 0x00 @byte = 0x00
@count = 8 @count = 7
end end
def write(char) def write(char)
if char == '1' if char == '1'
@byte |= 1 << @count @byte |= 0x01 << @count
end end
@count -= 1 @count -= 1
if @count == -1 if @count == -1
@ -50,8 +52,8 @@ class BitWriter
end end
def pad() def pad()
@count = 8 @stream.write(Array(@byte).pack('c'))
@stream.write(Array(@byte).pack('C')) @count = 7
@byte = 0x00 @byte = 0x00
end end
end end
@ -199,33 +201,20 @@ def ran_char(val)
return val return val
end end
def binary_write(file, parent, string_huff, node_huff) def binary_write(file, node_list, string_huff, node_huff)
# file.write(parent.path) node_list.each do |node|
# file.write("\0") $log.debug('binary_write') { "begin node: " + node_huff.encode(node) }
#offset to child node indicies node.children.each do |path, child|
# not needed, can just go write to children indicies
#file.write(ran_char)
if parent.written
return
end
parent.children.each do |path, child|
# puts "PATH: " + child.path
# file.write(child.path)
# file.write("\0")
# index of path string # index of path string
$log.debug('binary_write') { "path: " + path.inspect + "; encoded: " + string_huff.encode(path).inspect } $log.debug('binary_write') { "\tpath: " + path.inspect + "; encoded: " + string_huff.encode(path).inspect }
file.write_bits(string_huff.encode(path)) file.write_bits(string_huff.encode(path))
# offset to node # offset to node
# index of node, that is. # index of node, that is.
file.write_bits(node_huff.encode(child)) file.write_bits(node_huff.encode(child))
$log.debug('binary_write') { "\tnode encoded: " + node_huff.encode(child) }
end end
# reserve null byte for end of node info # end of node is indicated by the special sentinal huffman coding of \0
# 3 0s are reserved in our name huffman table to denote end of node file.write_bits(string_huff.encode($sentinal))
file.write_bits("000")
parent.children.each do |path, child|
binary_write(file, child, string_huff, node_huff)
child.written = true
end end
end end
@ -281,22 +270,38 @@ def build_huffman_for_strings(strings)
i.times { paths << string } i.times { paths << string }
i += 1 i += 1
end end
# add on sentinal string
i.times { paths << $sentinal }
puts paths
HuffmanEncoding.new paths HuffmanEncoding.new paths
end end
def build_huffman_for_nodes(parent) def build_node_frequencies(parent)
nodes = parent.flatten.uniq nodes = parent.flatten.uniq
refs = {} refs = {}
nodes.each do |node| nodes.each do |node|
node.children.each do |key, node| node.children.each do |key, child|
refs[node] ||= 0 refs[child] ||= 0
refs[node] += 1 refs[child] += 1
end end
end end
refs[parent] = 1
list = []
refs.sort { |l, r| l[1] <=> r[1] }.each do |node, weight|
list << node
end
list
end
def build_huffman_for_nodes(list)
# parent doesn't have to go into the table
i = 1
expanded = [] expanded = []
refs.each do |node, freq| list.each do |node|
freq.times {expanded << node} i.times {expanded << node}
i += 1
end end
table = HuffmanEncoding.new expanded table = HuffmanEncoding.new expanded
end end
@ -359,7 +364,12 @@ if $0 == __FILE__
# parent = compress_prefix(parent) # parent = compress_prefix(parent)
puts "building huffman table for nodes" puts "building huffman table for nodes"
node_huff = build_huffman_for_nodes(parent) node_list = build_node_frequencies(parent)
node_huff = build_huffman_for_nodes(node_list)
# XXX add sentinal value to strings to indicate end of node.
# should be most frequent one. the string itself doesn't have to
# be stored, since we just care about the bitstring.
strings = collect_strings(parent) strings = collect_strings(parent)
@ -368,8 +378,19 @@ if $0 == __FILE__
puts "writing" puts "writing"
write_strings(file, strings) write_strings(file, strings)
# write out the number of unique path nodes into 1 or more bytes. if <
# 128 nodes, write in a single byte. if > 128 nodes, the first byte will
# begin with a '1' to indicate as such. the following bits in the byte
# indicate how many bytes following the first byte are used to store the
# size.
node_count = node_list.count + 1
puts node_count
file.write([node_count].pack("c"))
bit_file = BitWriter.new file bit_file = BitWriter.new file
binary_write(bit_file, parent, string_huff, node_huff) binary_write(bit_file, [parent] + node_list, string_huff, node_huff)
bit_file.pad bit_file.pad
end end

134
unpack.c
View file

@ -1,6 +1,7 @@
#include <assert.h> #include <assert.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <stdbool.h>
#include <zlib.h> #include <zlib.h>
#include "huffman.h" #include "huffman.h"
@ -8,9 +9,9 @@
#define CHUNK 1024 #define CHUNK 1024
struct node { struct node {
struct node *next; int count;
unsigned int path; char **paths;
unsigned int children[]; struct node **children;
}; };
static int static int
@ -99,11 +100,17 @@ load_dictionary(FILE *source, char ***dictionary, int *dictionary_size)
} }
} }
*dictionary = malloc (sizeof (char *) * offset_size); *dictionary = malloc (sizeof (char *) * (*dictionary_size + 1));
for (i = 0; i < offset_size; i++) { for (i = 0; i < *dictionary_size; i++) {
(*dictionary)[i] = (char *) buf + dictionary_offsets[i]; (*dictionary)[i] = (char *) buf + dictionary_offsets[i];
} }
(*dictionary_size)++;
// Add in the end of node sentinal string
char *sentinal = malloc (sizeof (char));
sentinal[0] = 0x00;
(*dictionary)[i] = sentinal;
// rewind back to unused zlib bytes // rewind back to unused zlib bytes
if (fseek(source, (long) strm.avail_in * -1, SEEK_CUR)) { if (fseek(source, (long) strm.avail_in * -1, SEEK_CUR)) {
printf("Error seeking back in stream\n"); printf("Error seeking back in stream\n");
@ -120,25 +127,117 @@ load_dictionary(FILE *source, char ***dictionary, int *dictionary_size)
} }
static int static int
load_node_list(FILE *stream, struct node **list) { load_content_sets(FILE *stream, struct node **list,
struct huffman_node *dictionary_tree) {
unsigned char buf[CHUNK]; unsigned char *buf = malloc (sizeof (char *) * CHUNK);
size_t read; size_t read;
struct node *np = malloc(sizeof(struct node)); struct node **nodes;
*list = np; int i;
read = fread(buf, 1, CHUNK, stream); unsigned char count;
fread(&count, sizeof (unsigned char), 1, stream);
printf("number of nodes: %hd\n", count);
nodes = malloc (sizeof (struct node *) * (unsigned short) count);
for (i = 0; i < (unsigned short) count; i++) {
nodes[i] = malloc (sizeof (struct node));
}
read = fread (buf, sizeof (char), CHUNK, stream);
printf("Read %zu bytes\n", read); printf("Read %zu bytes\n", read);
/*
* the parent node doesn't go in the huffman tree, as nothing else
* references it.
*/
struct huffman_node *tree =
huffman_build_tree ((void **) nodes + 1,
(unsigned short) count - 1);
int bits_read = 0;
for (i = 0; i < count; i++) {
struct node *node = nodes[i];
node->count = 0;
// XXX hard coded
node->paths = malloc (sizeof (char *) * 64);
node->children = malloc (sizeof (struct node *) * 64);
while (true) {
char *path = (char *) huffman_lookup (dictionary_tree,
buf, &bits_read);
buf = buf + bits_read / 8;
bits_read = bits_read % 8;
if (path[0] == '\0') {
break;
}
struct node *child =
(struct node *) huffman_lookup (tree, buf,
&bits_read);
buf = buf + bits_read / 8;
bits_read = bits_read % 8;
node->paths[node->count] = path;
node->children[node->count] = child;
node->count++;
}
}
*list = nodes[0];
return 0; return 0;
} }
/*
 * One link in the chain of path segments that dump_content_set builds
 * while it recurses through the content set nodes. Each recursion level
 * declares one of these as a local variable and hooks it onto the link
 * passed in by its caller.
 */
struct stack {
/* following link; walked from the head when a full path is printed */
struct stack *next;
/* preceding link; assigned when a link is appended */
struct stack *prev;
/* path segment printed (prefixed with '/') for this link */
char *path;
};
/*
 * Recursively print every path that ends at a childless node below
 * content_sets, one path per line, each segment prefixed with '/'.
 *
 * content_sets: node to walk. A NULL node is skipped without printing
 *               anything; child pointers are produced by huffman_lookup,
 *               which returns NULL when it walks off the tree.
 * head:         first link of the chain. Its own path is never printed;
 *               printing starts at head->next.
 * tail:         last link of the chain so far. This call appends its own
 *               link after it for the duration of the call.
 *
 * The chain is made of local variables of the active recursion levels, so
 * it is only valid while those calls are still running.
 */
static void
dump_content_set (struct node *content_sets, struct stack *head,
                  struct stack *tail)
{
    struct stack stack;

    if (content_sets == NULL) {
        return;
    }

    /* Append this level's link; its path is filled in per child below. */
    stack.next = NULL;
    stack.prev = tail;
    stack.path = NULL;
    tail->next = &stack;

    for (int i = 0; i < content_sets->count; i++) {
        /* The segment leading to child i is stored in this level's link. */
        stack.path = content_sets->paths[i];
        dump_content_set (content_sets->children[i], head, &stack);
    }

    if (content_sets->count == 0) {
        /*
         * Childless node: print every segment from the first real link up
         * to, but not including, this level's own (path-less) link.
         */
        for (struct stack *cur = head->next; cur != &stack; cur = cur->next) {
            printf ("/%s", cur->path);
        }
        printf ("\n");
    }

    /* Unhook our link so the caller is not left pointing at a dead frame. */
    tail->next = NULL;
}
static void
dump_content_sets (struct node *content_sets)
{
struct stack stack;
stack.next = NULL;
stack.prev = NULL;
stack.path = NULL;
dump_content_set (content_sets, &stack, &stack);
}
int int
main(int argc, char **argv) { main(int argc, char **argv) {
FILE *fp; FILE *fp;
char **dictionary; char **dictionary;
int dictionary_size; int dictionary_size;
struct node *list; struct node *content_sets;
if (argc != 2) { if (argc != 2) {
printf("usage: unpack <bin file>\n"); printf("usage: unpack <bin file>\n");
@ -156,18 +255,15 @@ main(int argc, char **argv) {
return -1; return -1;
} }
struct huffman_node *tree = huffman_build_tree ((void **) dictionary, struct huffman_node *dictionary_tree =
dictionary_size); huffman_build_tree ((void **) dictionary, dictionary_size);
int bits_read; if (load_content_sets(fp, &content_sets, dictionary_tree)) {
short bits = 0xC0;
printf("\n\n%s\n", huffman_lookup (tree, (unsigned char *) &bits, &bits_read));
if (load_node_list(fp, &list)) {
printf("node list parsing failed. exiting\n"); printf("node list parsing failed. exiting\n");
return -1; return -1;
} }
dump_content_sets (content_sets);
return 0; return 0;
} }