Add Huffman decoding in C
This commit is contained in:
parent
4b82b83e02
commit
11fd9f1f4a
6 changed files with 193 additions and 30 deletions
6
Makefile
6
Makefile
|
@ -1,6 +1,6 @@
|
|||
|
||||
CFLAGS += $(shell pkg-config --libs --cflags zlib)
|
||||
CFLAGS += -Wall
|
||||
CFLAGS += -Wall -g
|
||||
|
||||
ifndef CC
|
||||
CC = gcc
|
||||
|
@ -11,8 +11,8 @@ TMP_FILES = $(wildcard *~)
|
|||
|
||||
all: $(APP)
|
||||
|
||||
%: %.c
|
||||
$(CC) $(CFLAGS) -o $@ $<
|
||||
unpack: unpack.c huffman.c huffman.h
|
||||
$(CC) $(CFLAGS) -o $@ unpack.c huffman.c huffman.h
|
||||
|
||||
clean:
|
||||
rm -rf $(APP) $(TMP_FILES)
|
||||
|
|
98
huffman.c
Normal file
98
huffman.c
Normal file
|
@ -0,0 +1,98 @@
|
|||
#include "huffman.h"
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
static int
|
||||
find_smallest (struct huffman_node **nodes, int count, int different)
|
||||
{
|
||||
int smallest;
|
||||
int i;
|
||||
|
||||
for (i = 0; nodes[i]->weight == -1; i++);
|
||||
|
||||
if (i == different) {
|
||||
for (i++; nodes[i]->weight == -1; i++);
|
||||
}
|
||||
smallest = i;
|
||||
|
||||
for (i = smallest + 1; i < count; i++) {
|
||||
if (i == different || nodes[i]->weight == -1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (nodes[i]->weight < nodes[smallest]->weight) {
|
||||
smallest = i;
|
||||
}
|
||||
}
|
||||
|
||||
return smallest;
|
||||
}
|
||||
|
||||
struct huffman_node *
|
||||
huffman_build_tree(void **values, int count)
|
||||
{
|
||||
int i;
|
||||
struct huffman_node **nodes;
|
||||
|
||||
|
||||
nodes = malloc (sizeof (struct huffman_node *) * count);
|
||||
for (i = 0; i < count; i++) {
|
||||
struct huffman_node *node =
|
||||
malloc (sizeof (struct huffman_node));
|
||||
|
||||
node->value = values[i];
|
||||
node->weight = i;
|
||||
node->left = NULL;
|
||||
node->right = NULL;
|
||||
|
||||
nodes[i] = node;
|
||||
}
|
||||
|
||||
int tree1;
|
||||
int tree2;
|
||||
for (i = 1; i < count; i++) {
|
||||
struct huffman_node *tmp;
|
||||
|
||||
tree1 = find_smallest (nodes, count, -1);
|
||||
tree2 = find_smallest (nodes, count, tree1);
|
||||
|
||||
tmp = nodes[tree1];
|
||||
|
||||
nodes[tree1] = malloc (sizeof (struct huffman_node));
|
||||
nodes[tree1]->weight = tmp->weight + nodes[tree2]->weight;
|
||||
nodes[tree1]->value = NULL;
|
||||
nodes[tree1]->left = nodes[tree2];
|
||||
nodes[tree1]->right = tmp;
|
||||
|
||||
nodes[tree2]->weight = -1;
|
||||
}
|
||||
|
||||
return nodes[tree1];
|
||||
}
|
||||
|
||||
void *
|
||||
huffman_lookup (struct huffman_node *tree, unsigned char *bits, int *bits_read)
|
||||
{
|
||||
|
||||
struct huffman_node *node = tree;
|
||||
*bits_read = 0;
|
||||
|
||||
while (true) {
|
||||
if (node->value != NULL) {
|
||||
return node->value;
|
||||
}
|
||||
|
||||
if ((bits[0] << *bits_read % 8 & 0x80) == 0) {
|
||||
node = node->left;
|
||||
} else {
|
||||
node = node->right;
|
||||
}
|
||||
|
||||
(*bits_read)++;
|
||||
if (*bits_read % 8 == 0) {
|
||||
bits++;
|
||||
}
|
||||
}
|
||||
}
|
12
huffman.h
Normal file
12
huffman.h
Normal file
|
@ -0,0 +1,12 @@
|
|||
|
||||
#ifndef HUFFMAN_H
#define HUFFMAN_H

/*
 * A node in a Huffman code tree.  Interior nodes carry value == NULL;
 * leaves carry the decoded symbol in value (which therefore must never
 * be NULL for a real symbol).
 */
struct huffman_node {
	int weight;		/* frequency rank; -1 marks a merged slot during build */
	void *value;		/* symbol at a leaf; NULL on interior nodes */
	struct huffman_node *left;
	struct huffman_node *right;
};

/*
 * Build a Huffman tree from `count` symbol pointers; callers are expected
 * to pass `values` sorted by ascending frequency.  Returns NULL on
 * allocation failure or count <= 0.
 */
struct huffman_node *huffman_build_tree(void **values, int count);

/*
 * Decode one symbol from `bits` (MSB-first); stores the number of bits
 * consumed in *bits_read and returns the symbol, or NULL on a malformed
 * tree or stream.
 */
void *huffman_lookup (struct huffman_node *tree, unsigned char *bits,
		      int *bits_read);

#endif /* HUFFMAN_H */
|
28
huffman.rb
28
huffman.rb
|
@ -49,20 +49,34 @@ class NodeQueue
|
|||
generate_tree
|
||||
end
|
||||
|
||||
def find_smallest(not_this)
|
||||
smallest = nil
|
||||
for i in 0..@nodes.size - 1
|
||||
if i == not_this
|
||||
next
|
||||
end
|
||||
if smallest.nil? or @nodes[i].weight < @nodes[smallest].weight
|
||||
smallest = i
|
||||
end
|
||||
end
|
||||
smallest
|
||||
end
|
||||
|
||||
|
||||
def generate_tree
|
||||
while @nodes.size > 1
|
||||
sorted = @nodes.sort { |a,b| a.weight <=> b.weight }
|
||||
to_merge = []
|
||||
2.times { to_merge << sorted.shift }
|
||||
sorted << merge_nodes(to_merge[0], to_merge[1])
|
||||
@nodes = sorted
|
||||
node1 = self.find_smallest(-1)
|
||||
node2 = self.find_smallest(node1)
|
||||
new = merge_nodes(@nodes[node1], @nodes[node2])
|
||||
@nodes[node1] = new
|
||||
@nodes.delete_at(node2)
|
||||
end
|
||||
@huffman_root = @nodes.first
|
||||
end
|
||||
|
||||
def merge_nodes(node1, node2)
|
||||
left = node1.weight > node2.weight ? node2 : node1
|
||||
right = left == node1 ? node2 : node1
|
||||
right = node1
|
||||
left = node2
|
||||
node = HuffNode.new(:weight => left.weight + right.weight, :left => left, :right => right)
|
||||
left.parent = right.parent = node
|
||||
node
|
||||
|
|
27
thing.rb
27
thing.rb
|
@ -248,7 +248,8 @@ end
|
|||
|
||||
def write_strings(file, strings)
|
||||
string_io = StringIO.new()
|
||||
strings.each_key do |string|
|
||||
|
||||
strings.each do |string|
|
||||
string_io.write(string)
|
||||
string_io.write("\0")
|
||||
end
|
||||
|
@ -264,13 +265,21 @@ def collect_strings(parent)
|
|||
strings[key] += 1
|
||||
end
|
||||
end
|
||||
strings
|
||||
|
||||
list = []
|
||||
strings.sort { |l, r| l[1] <=> r[1] }.each do |string, weight|
|
||||
list << string
|
||||
end
|
||||
|
||||
def build_huffman_for_strings(parent)
|
||||
list
|
||||
end
|
||||
|
||||
def build_huffman_for_strings(strings)
|
||||
paths = []
|
||||
parent.flatten.uniq.each do |node|
|
||||
node.children.each_key {|key| paths << key}
|
||||
i = 1
|
||||
strings.each do |string|
|
||||
i.times { paths << string }
|
||||
i += 1
|
||||
end
|
||||
HuffmanEncoding.new paths
|
||||
end
|
||||
|
@ -349,13 +358,15 @@ if $0 == __FILE__
|
|||
de_dupe_driver(parent)
|
||||
# parent = compress_prefix(parent)
|
||||
|
||||
puts "building huffman table for strings"
|
||||
string_huff = build_huffman_for_strings(parent)
|
||||
puts "building huffman table for nodes"
|
||||
node_huff = build_huffman_for_nodes(parent)
|
||||
|
||||
puts "writing"
|
||||
strings = collect_strings(parent)
|
||||
|
||||
puts "building huffman table for strings"
|
||||
string_huff = build_huffman_for_strings(strings)
|
||||
|
||||
puts "writing"
|
||||
write_strings(file, strings)
|
||||
bit_file = BitWriter.new file
|
||||
binary_write(bit_file, parent, string_huff, node_huff)
|
||||
|
|
48
unpack.c
48
unpack.c
|
@ -3,6 +3,8 @@
|
|||
#include <stdlib.h>
|
||||
#include <zlib.h>
|
||||
|
||||
#include "huffman.h"
|
||||
|
||||
#define CHUNK 1024
|
||||
|
||||
struct node {
|
||||
|
@ -12,15 +14,15 @@ struct node {
|
|||
};
|
||||
|
||||
static int
|
||||
load_dictionary(FILE *source, unsigned char **dictionary) {
|
||||
load_dictionary(FILE *source, char ***dictionary, int *dictionary_size)
|
||||
{
|
||||
int ret;
|
||||
unsigned have;
|
||||
z_stream strm;
|
||||
unsigned char in[CHUNK];
|
||||
int read = 0;
|
||||
|
||||
// XXX keep a ref to buf for free()
|
||||
unsigned char *buf = malloc(sizeof(char) * CHUNK);
|
||||
*dictionary = buf;
|
||||
|
||||
printf("unpacking string dictionary\n");
|
||||
|
||||
|
@ -67,7 +69,6 @@ load_dictionary(FILE *source, unsigned char **dictionary) {
|
|||
printf("MEMORY ERROR\n");
|
||||
return -1;
|
||||
}
|
||||
have = CHUNK - strm.avail_out;
|
||||
read += CHUNK - strm.avail_out;
|
||||
} while (strm.avail_out == 0);
|
||||
|
||||
|
@ -75,16 +76,33 @@ load_dictionary(FILE *source, unsigned char **dictionary) {
|
|||
/* done when inflate() says it's done */
|
||||
} while (ret != Z_STREAM_END);
|
||||
|
||||
printf("data is:\n");
|
||||
int offset_size = 64;
|
||||
int *dictionary_offsets = malloc (sizeof (int) * offset_size);
|
||||
*dictionary_size = 1;
|
||||
|
||||
int i;
|
||||
int j = 0;
|
||||
dictionary_offsets[j++] = 0;
|
||||
for (i = 0; i < read; i++) {
|
||||
if (buf[i] == '\0') {
|
||||
putchar('\n');
|
||||
} else {
|
||||
putchar(buf[i]);
|
||||
if (i != read - 1) {
|
||||
dictionary_offsets[j++] = i + 1;
|
||||
(*dictionary_size)++;
|
||||
if (j == offset_size) {
|
||||
offset_size = offset_size * 2;
|
||||
dictionary_offsets =
|
||||
realloc (dictionary_offsets,
|
||||
sizeof (int) *
|
||||
offset_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*dictionary = malloc (sizeof (char *) * offset_size);
|
||||
for (i = 0; i < offset_size; i++) {
|
||||
(*dictionary)[i] = (char *) buf + dictionary_offsets[i];
|
||||
}
|
||||
|
||||
// rewind back to unused zlib bytes
|
||||
if (fseek(source, (long) strm.avail_in * -1, SEEK_CUR)) {
|
||||
|
@ -95,6 +113,7 @@ load_dictionary(FILE *source, unsigned char **dictionary) {
|
|||
printf ("dictionary stats:\n");
|
||||
printf ("\tcompressed size: %zu\n", ftell(source));
|
||||
printf ("\tuncompressed size: %d\n", read);
|
||||
printf ("\tentries found: %d\n", *dictionary_size);
|
||||
inflateEnd(&strm);
|
||||
|
||||
return ret == Z_STREAM_END ? 0 : -1;
|
||||
|
@ -117,7 +136,8 @@ load_node_list(FILE *stream, struct node **list) {
|
|||
int
|
||||
main(int argc, char **argv) {
|
||||
FILE *fp;
|
||||
unsigned char *dictionary;
|
||||
char **dictionary;
|
||||
int dictionary_size;
|
||||
struct node *list;
|
||||
|
||||
if (argc != 2) {
|
||||
|
@ -131,11 +151,19 @@ main(int argc, char **argv) {
|
|||
return -1;
|
||||
}
|
||||
|
||||
if (load_dictionary(fp, &dictionary)) {
|
||||
if (load_dictionary(fp, &dictionary, &dictionary_size)) {
|
||||
printf("dictionary inflation failed. exiting\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
struct huffman_node *tree = huffman_build_tree ((void **) dictionary,
|
||||
dictionary_size);
|
||||
|
||||
int bits_read;
|
||||
short bits = 0xC0;
|
||||
|
||||
printf("\n\n%s\n", huffman_lookup (tree, (unsigned char *) &bits, &bits_read));
|
||||
|
||||
if (load_node_list(fp, &list)) {
|
||||
printf("node list parsing failed. exiting\n");
|
||||
return -1;
|
||||
|
|
Loading…
Reference in a new issue