Add huffman decoding for C

This commit is contained in:
James Bowes 2012-08-09 16:09:29 -03:00
parent 4b82b83e02
commit 11fd9f1f4a
6 changed files with 193 additions and 30 deletions

View file

@ -1,6 +1,6 @@
CFLAGS += $(shell pkg-config --libs --cflags zlib)
CFLAGS += -Wall
CFLAGS += -Wall -g
ifndef CC
CC = gcc
@ -11,8 +11,8 @@ TMP_FILES = $(wildcard *~)
all: $(APP)
%: %.c
$(CC) $(CFLAGS) -o $@ $<
unpack: unpack.c huffman.c huffman.h
$(CC) $(CFLAGS) -o $@ unpack.c huffman.c huffman.h
clean:
rm -rf $(APP) $(TMP_FILES)

98
huffman.c Normal file
View file

@ -0,0 +1,98 @@
#include "huffman.h"
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Return the index of the live node with the smallest weight.
 *
 * A slot is "live" when its weight is not -1; -1 marks a node that has
 * already been merged into another tree.  The slot at index `different`
 * is skipped, so a caller can ask for the second-smallest tree by
 * passing the index returned by a previous call (pass -1 to skip nothing).
 *
 * On ties the lowest index wins, because the comparison is strict.
 *
 * Returns -1 when no eligible slot exists, rather than scanning past the
 * end of the array.
 */
static int
find_smallest (struct huffman_node **nodes, int count, int different)
{
	int smallest = -1;
	int i;

	for (i = 0; i < count; i++) {
		if (i == different || nodes[i]->weight == -1) {
			continue;
		}

		if (smallest == -1
		    || nodes[i]->weight < nodes[smallest]->weight) {
			smallest = i;
		}
	}

	return smallest;
}
/* Release a (sub)tree; only used to unwind a build that ran out of memory. */
static void
free_partial_tree (struct huffman_node *node)
{
	if (node == NULL) {
		return;
	}

	free_partial_tree (node->left);
	free_partial_tree (node->right);
	free (node);
}

/*
 * Build a huffman tree over `count` values.
 *
 * Each value becomes a leaf whose weight is its index in `values`, so
 * later entries are treated as heavier.  The two lightest live trees are
 * then merged repeatedly until one root remains: the lighter-or-equal
 * pick (first call to find_smallest) becomes the right child, the other
 * becomes the left child.
 *
 * Internal nodes carry a NULL value; huffman_lookup() uses a non-NULL
 * value to recognise a leaf, so entries of `values` should be non-NULL.
 *
 * Returns the root of the tree (for count == 1, the single leaf), or
 * NULL when `values` is NULL, `count` is not positive, or an allocation
 * fails.  The caller owns the returned tree; the value pointers are
 * stored as-is and are not copied.
 *
 * NOTE(review): weights are summed in `int`; the root weight is
 * count * (count - 1) / 2, which overflows for very large counts.
 */
struct huffman_node *
huffman_build_tree(void **values, int count)
{
	struct huffman_node **nodes;
	struct huffman_node *root;
	int root_index = 0;
	int i;

	if (values == NULL || count <= 0) {
		return NULL;
	}

	/* calloc so that unfilled slots are NULL if we have to unwind. */
	nodes = calloc ((size_t) count, sizeof *nodes);
	if (nodes == NULL) {
		return NULL;
	}

	for (i = 0; i < count; i++) {
		struct huffman_node *leaf = malloc (sizeof *leaf);

		if (leaf == NULL) {
			goto fail;
		}

		leaf->value = values[i];
		leaf->weight = i;
		leaf->left = NULL;
		leaf->right = NULL;
		nodes[i] = leaf;
	}

	/* count leaves need exactly count - 1 merges. */
	for (i = 1; i < count; i++) {
		int tree1 = find_smallest (nodes, count, -1);
		int tree2 = find_smallest (nodes, count, tree1);
		struct huffman_node *parent = malloc (sizeof *parent);

		if (parent == NULL) {
			goto fail;
		}

		parent->weight = nodes[tree1]->weight + nodes[tree2]->weight;
		parent->value = NULL;
		parent->left = nodes[tree2];
		parent->right = nodes[tree1];

		/* tree2's slot is retired; the merged tree lives in tree1's. */
		nodes[tree2]->weight = -1;
		nodes[tree1] = parent;
		root_index = tree1;
	}

	root = nodes[root_index];
	free (nodes);

	return root;

fail:
	/*
	 * Slots whose node has weight -1 are already linked under some live
	 * root, so only live roots are released here to avoid double frees.
	 */
	for (i = 0; i < count; i++) {
		if (nodes[i] != NULL && nodes[i]->weight != -1) {
			free_partial_tree (nodes[i]);
		}
	}
	free (nodes);

	return NULL;
}
/*
 * Decode one value from a bit stream by walking `tree`.
 *
 * Bits are consumed most-significant-bit first within each byte: a 0 bit
 * descends to the left child, a 1 bit to the right child.  The walk stops
 * at the first node whose value is non-NULL and returns that value.
 *
 * `*bits_read` is reset to 0 on entry and holds the number of bits
 * consumed when the function returns.
 *
 * Returns NULL if `tree` is NULL or the walk steps onto a missing child
 * (for example a leaf that was stored with a NULL value), instead of
 * dereferencing a NULL node.
 *
 * The length of `bits` is not known here; the caller must supply enough
 * bytes to reach a leaf.
 */
void *
huffman_lookup (struct huffman_node *tree, unsigned char *bits, int *bits_read)
{
	struct huffman_node *node = tree;

	*bits_read = 0;

	while (node != NULL) {
		if (node->value != NULL) {
			return node->value;
		}

		/* Shift the current bit into the 0x80 position and test it. */
		if (((bits[0] << (*bits_read % 8)) & 0x80) == 0) {
			node = node->left;
		} else {
			node = node->right;
		}

		(*bits_read)++;
		/* Every eighth bit moves on to the next input byte. */
		if (*bits_read % 8 == 0) {
			bits++;
		}
	}

	return NULL;
}

12
huffman.h Normal file
View file

@ -0,0 +1,12 @@
#ifndef HUFFMAN_H
#define HUFFMAN_H

/*
 * One node of a binary huffman tree.
 *
 * Leaves carry a non-NULL `value`; internal nodes have `value == NULL`
 * and use `left` / `right` to reach their children.
 */
struct huffman_node {
	int weight;			/* sort key used while building the tree */
	void *value;			/* payload for a leaf, NULL otherwise */
	struct huffman_node *left;	/* child followed on a 0 bit */
	struct huffman_node *right;	/* child followed on a 1 bit */
};

/*
 * Build a tree over `count` entries of `values` and return its root.
 * The position of an entry in `values` is used as its weight.
 */
struct huffman_node *huffman_build_tree(void **values, int count);

/*
 * Walk `tree` using the bits in `bits` (most significant bit of each
 * byte first) and return the value of the leaf that is reached.
 * `*bits_read` receives the number of bits consumed.
 */
void *huffman_lookup (struct huffman_node *tree, unsigned char *bits,
		      int *bits_read);

#endif /* HUFFMAN_H */

View file

@ -49,20 +49,34 @@ class NodeQueue
generate_tree
end
# Returns the index of the lightest node in @nodes, ignoring the slot at
# index +not_this+ (pass -1 to ignore nothing). On equal weights the
# earliest index is kept. Returns nil when there is no candidate.
def find_smallest(not_this)
  lightest = nil
  @nodes.each_index do |idx|
    next if idx == not_this
    if lightest.nil? || @nodes[idx].weight < @nodes[lightest].weight
      lightest = idx
    end
  end
  lightest
end
def generate_tree
while @nodes.size > 1
sorted = @nodes.sort { |a,b| a.weight <=> b.weight }
to_merge = []
2.times { to_merge << sorted.shift }
sorted << merge_nodes(to_merge[0], to_merge[1])
@nodes = sorted
node1 = self.find_smallest(-1)
node2 = self.find_smallest(node1)
new = merge_nodes(@nodes[node1], @nodes[node2])
@nodes[node1] = new
@nodes.delete_at(node2)
end
@huffman_root = @nodes.first
end
def merge_nodes(node1, node2)
left = node1.weight > node2.weight ? node2 : node1
right = left == node1 ? node2 : node1
right = node1
left = node2
node = HuffNode.new(:weight => left.weight + right.weight, :left => left, :right => right)
left.parent = right.parent = node
node

View file

@ -248,7 +248,8 @@ end
def write_strings(file, strings)
string_io = StringIO.new()
strings.each_key do |string|
strings.each do |string|
string_io.write(string)
string_io.write("\0")
end
@ -264,13 +265,21 @@ def collect_strings(parent)
strings[key] += 1
end
end
strings
list = []
strings.sort { |l, r| l[1] <=> r[1] }.each do |string, weight|
list << string
end
list
end
def build_huffman_for_strings(parent)
def build_huffman_for_strings(strings)
paths = []
parent.flatten.uniq.each do |node|
node.children.each_key {|key| paths << key}
i = 1
strings.each do |string|
i.times { paths << string }
i += 1
end
HuffmanEncoding.new paths
end
@ -349,13 +358,15 @@ if $0 == __FILE__
de_dupe_driver(parent)
# parent = compress_prefix(parent)
puts "building huffman table for strings"
string_huff = build_huffman_for_strings(parent)
puts "building huffman table for nodes"
node_huff = build_huffman_for_nodes(parent)
puts "writing"
strings = collect_strings(parent)
puts "building huffman table for strings"
string_huff = build_huffman_for_strings(strings)
puts "writing"
write_strings(file, strings)
bit_file = BitWriter.new file
binary_write(bit_file, parent, string_huff, node_huff)

View file

@ -3,6 +3,8 @@
#include <stdlib.h>
#include <zlib.h>
#include "huffman.h"
#define CHUNK 1024
struct node {
@ -12,15 +14,15 @@ struct node {
};
static int
load_dictionary(FILE *source, unsigned char **dictionary) {
load_dictionary(FILE *source, char ***dictionary, int *dictionary_size)
{
int ret;
unsigned have;
z_stream strm;
unsigned char in[CHUNK];
int read = 0;
// XXX keep a ref to buf for free()
unsigned char *buf = malloc(sizeof(char) * CHUNK);
*dictionary = buf;
printf("unpacking string dictionary\n");
@ -67,7 +69,6 @@ load_dictionary(FILE *source, unsigned char **dictionary) {
printf("MEMORY ERROR\n");
return -1;
}
have = CHUNK - strm.avail_out;
read += CHUNK - strm.avail_out;
} while (strm.avail_out == 0);
@ -75,17 +76,34 @@ load_dictionary(FILE *source, unsigned char **dictionary) {
/* done when inflate() says it's done */
} while (ret != Z_STREAM_END);
printf("data is:\n");
int offset_size = 64;
int *dictionary_offsets = malloc (sizeof (int) * offset_size);
*dictionary_size = 1;
int i;
for (i=0; i < read; i++) {
int j = 0;
dictionary_offsets[j++] = 0;
for (i = 0; i < read; i++) {
if (buf[i] == '\0') {
putchar('\n');
} else {
putchar(buf[i]);
if (i != read - 1) {
dictionary_offsets[j++] = i + 1;
(*dictionary_size)++;
if (j == offset_size) {
offset_size = offset_size * 2;
dictionary_offsets =
realloc (dictionary_offsets,
sizeof (int) *
offset_size);
}
}
}
}
*dictionary = malloc (sizeof (char *) * offset_size);
for (i = 0; i < offset_size; i++) {
(*dictionary)[i] = (char *) buf + dictionary_offsets[i];
}
// rewind back to unused zlib bytes
if (fseek(source, (long) strm.avail_in * -1, SEEK_CUR)) {
printf("Error seeking back in stream\n");
@ -95,6 +113,7 @@ load_dictionary(FILE *source, unsigned char **dictionary) {
printf ("dictionary stats:\n");
printf ("\tcompressed size: %zu\n", ftell(source));
printf ("\tuncompressed size: %d\n", read);
printf ("\tentries found: %d\n", *dictionary_size);
inflateEnd(&strm);
return ret == Z_STREAM_END ? 0 : -1;
@ -117,7 +136,8 @@ load_node_list(FILE *stream, struct node **list) {
int
main(int argc, char **argv) {
FILE *fp;
unsigned char *dictionary;
char **dictionary;
int dictionary_size;
struct node *list;
if (argc != 2) {
@ -131,11 +151,19 @@ main(int argc, char **argv) {
return -1;
}
if (load_dictionary(fp, &dictionary)) {
if (load_dictionary(fp, &dictionary, &dictionary_size)) {
printf("dictionary inflation failed. exiting\n");
return -1;
}
struct huffman_node *tree = huffman_build_tree ((void **) dictionary,
dictionary_size);
int bits_read;
short bits = 0xC0;
printf("\n\n%s\n", huffman_lookup (tree, (unsigned char *) &bits, &bits_read));
if (load_node_list(fp, &list)) {
printf("node list parsing failed. exiting\n");
return -1;