Add huffman decoding for C
This commit is contained in:
parent
4b82b83e02
commit
11fd9f1f4a
6 changed files with 193 additions and 30 deletions
6
Makefile
6
Makefile
|
@ -1,6 +1,6 @@
|
||||||
|
|
||||||
CFLAGS += $(shell pkg-config --libs --cflags zlib)
|
CFLAGS += $(shell pkg-config --libs --cflags zlib)
|
||||||
CFLAGS += -Wall
|
CFLAGS += -Wall -g
|
||||||
|
|
||||||
ifndef CC
|
ifndef CC
|
||||||
CC = gcc
|
CC = gcc
|
||||||
|
@ -11,8 +11,8 @@ TMP_FILES = $(wildcard *~)
|
||||||
|
|
||||||
all: $(APP)
|
all: $(APP)
|
||||||
|
|
||||||
%: %.c
|
unpack: unpack.c huffman.c huffman.h
|
||||||
$(CC) $(CFLAGS) -o $@ $<
|
$(CC) $(CFLAGS) -o $@ unpack.c huffman.c huffman.h
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -rf $(APP) $(TMP_FILES)
|
rm -rf $(APP) $(TMP_FILES)
|
||||||
|
|
98
huffman.c
Normal file
98
huffman.c
Normal file
|
@ -0,0 +1,98 @@
|
||||||
|
#include "huffman.h"
|
||||||
|
|
||||||
|
#include <stdbool.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
static int
|
||||||
|
find_smallest (struct huffman_node **nodes, int count, int different)
|
||||||
|
{
|
||||||
|
int smallest;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; nodes[i]->weight == -1; i++);
|
||||||
|
|
||||||
|
if (i == different) {
|
||||||
|
for (i++; nodes[i]->weight == -1; i++);
|
||||||
|
}
|
||||||
|
smallest = i;
|
||||||
|
|
||||||
|
for (i = smallest + 1; i < count; i++) {
|
||||||
|
if (i == different || nodes[i]->weight == -1) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (nodes[i]->weight < nodes[smallest]->weight) {
|
||||||
|
smallest = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return smallest;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct huffman_node *
|
||||||
|
huffman_build_tree(void **values, int count)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
struct huffman_node **nodes;
|
||||||
|
|
||||||
|
|
||||||
|
nodes = malloc (sizeof (struct huffman_node *) * count);
|
||||||
|
for (i = 0; i < count; i++) {
|
||||||
|
struct huffman_node *node =
|
||||||
|
malloc (sizeof (struct huffman_node));
|
||||||
|
|
||||||
|
node->value = values[i];
|
||||||
|
node->weight = i;
|
||||||
|
node->left = NULL;
|
||||||
|
node->right = NULL;
|
||||||
|
|
||||||
|
nodes[i] = node;
|
||||||
|
}
|
||||||
|
|
||||||
|
int tree1;
|
||||||
|
int tree2;
|
||||||
|
for (i = 1; i < count; i++) {
|
||||||
|
struct huffman_node *tmp;
|
||||||
|
|
||||||
|
tree1 = find_smallest (nodes, count, -1);
|
||||||
|
tree2 = find_smallest (nodes, count, tree1);
|
||||||
|
|
||||||
|
tmp = nodes[tree1];
|
||||||
|
|
||||||
|
nodes[tree1] = malloc (sizeof (struct huffman_node));
|
||||||
|
nodes[tree1]->weight = tmp->weight + nodes[tree2]->weight;
|
||||||
|
nodes[tree1]->value = NULL;
|
||||||
|
nodes[tree1]->left = nodes[tree2];
|
||||||
|
nodes[tree1]->right = tmp;
|
||||||
|
|
||||||
|
nodes[tree2]->weight = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return nodes[tree1];
|
||||||
|
}
|
||||||
|
|
||||||
|
void *
|
||||||
|
huffman_lookup (struct huffman_node *tree, unsigned char *bits, int *bits_read)
|
||||||
|
{
|
||||||
|
|
||||||
|
struct huffman_node *node = tree;
|
||||||
|
*bits_read = 0;
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
if (node->value != NULL) {
|
||||||
|
return node->value;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((bits[0] << *bits_read % 8 & 0x80) == 0) {
|
||||||
|
node = node->left;
|
||||||
|
} else {
|
||||||
|
node = node->right;
|
||||||
|
}
|
||||||
|
|
||||||
|
(*bits_read)++;
|
||||||
|
if (*bits_read % 8 == 0) {
|
||||||
|
bits++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
12
huffman.h
Normal file
12
huffman.h
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
|
||||||
|
#ifndef HUFFMAN_H
#define HUFFMAN_H

/*
 * One node of a Huffman tree.
 *
 * Leaves carry a non-NULL `value` and have no children; internal nodes
 * have a NULL `value` and point at their two subtrees.
 */
struct huffman_node {
    int weight;                  /* ordering key used while building the tree */
    void *value;                 /* caller-supplied payload; NULL on internal nodes */
    struct huffman_node *left;   /* followed when the next input bit is 0 */
    struct huffman_node *right;  /* followed when the next input bit is 1 */
};

/*
 * Build a Huffman tree over `count` entries of `values`; values[i] becomes
 * a leaf with weight i.  Returns the root of the tree.
 */
struct huffman_node *huffman_build_tree(void **values, int count);

/*
 * Walk `tree` using the bits in `bits` (most-significant bit first) and
 * return the value of the first node reached whose value is non-NULL.
 * The number of bits consumed is stored in *bits_read.
 */
void *huffman_lookup (struct huffman_node *tree, unsigned char *bits,
                      int *bits_read);

#endif /* HUFFMAN_H */
|
28
huffman.rb
28
huffman.rb
|
@ -49,20 +49,34 @@ class NodeQueue
|
||||||
generate_tree
|
generate_tree
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Returns the index of the lowest-weight entry in @nodes, ignoring the
# entry at index +not_this+ (pass -1 to ignore nothing).  The earliest
# index wins on ties; nil is returned when no entry qualifies.
def find_smallest(not_this)
  best = nil
  @nodes.each_with_index do |candidate, idx|
    next if idx == not_this
    best = idx if best.nil? || candidate.weight < @nodes[best].weight
  end
  best
end
|
||||||
|
|
||||||
|
|
||||||
def generate_tree
|
def generate_tree
|
||||||
while @nodes.size > 1
|
while @nodes.size > 1
|
||||||
sorted = @nodes.sort { |a,b| a.weight <=> b.weight }
|
node1 = self.find_smallest(-1)
|
||||||
to_merge = []
|
node2 = self.find_smallest(node1)
|
||||||
2.times { to_merge << sorted.shift }
|
new = merge_nodes(@nodes[node1], @nodes[node2])
|
||||||
sorted << merge_nodes(to_merge[0], to_merge[1])
|
@nodes[node1] = new
|
||||||
@nodes = sorted
|
@nodes.delete_at(node2)
|
||||||
end
|
end
|
||||||
@huffman_root = @nodes.first
|
@huffman_root = @nodes.first
|
||||||
end
|
end
|
||||||
|
|
||||||
def merge_nodes(node1, node2)
|
def merge_nodes(node1, node2)
|
||||||
left = node1.weight > node2.weight ? node2 : node1
|
right = node1
|
||||||
right = left == node1 ? node2 : node1
|
left = node2
|
||||||
node = HuffNode.new(:weight => left.weight + right.weight, :left => left, :right => right)
|
node = HuffNode.new(:weight => left.weight + right.weight, :left => left, :right => right)
|
||||||
left.parent = right.parent = node
|
left.parent = right.parent = node
|
||||||
node
|
node
|
||||||
|
|
27
thing.rb
27
thing.rb
|
@ -248,7 +248,8 @@ end
|
||||||
|
|
||||||
def write_strings(file, strings)
|
def write_strings(file, strings)
|
||||||
string_io = StringIO.new()
|
string_io = StringIO.new()
|
||||||
strings.each_key do |string|
|
|
||||||
|
strings.each do |string|
|
||||||
string_io.write(string)
|
string_io.write(string)
|
||||||
string_io.write("\0")
|
string_io.write("\0")
|
||||||
end
|
end
|
||||||
|
@ -264,13 +265,21 @@ def collect_strings(parent)
|
||||||
strings[key] += 1
|
strings[key] += 1
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
strings
|
|
||||||
|
list = []
|
||||||
|
strings.sort { |l, r| l[1] <=> r[1] }.each do |string, weight|
|
||||||
|
list << string
|
||||||
|
end
|
||||||
|
|
||||||
|
list
|
||||||
end
|
end
|
||||||
|
|
||||||
def build_huffman_for_strings(parent)
|
def build_huffman_for_strings(strings)
|
||||||
paths = []
|
paths = []
|
||||||
parent.flatten.uniq.each do |node|
|
i = 1
|
||||||
node.children.each_key {|key| paths << key}
|
strings.each do |string|
|
||||||
|
i.times { paths << string }
|
||||||
|
i += 1
|
||||||
end
|
end
|
||||||
HuffmanEncoding.new paths
|
HuffmanEncoding.new paths
|
||||||
end
|
end
|
||||||
|
@ -349,13 +358,15 @@ if $0 == __FILE__
|
||||||
de_dupe_driver(parent)
|
de_dupe_driver(parent)
|
||||||
# parent = compress_prefix(parent)
|
# parent = compress_prefix(parent)
|
||||||
|
|
||||||
puts "building huffman table for strings"
|
|
||||||
string_huff = build_huffman_for_strings(parent)
|
|
||||||
puts "building huffman table for nodes"
|
puts "building huffman table for nodes"
|
||||||
node_huff = build_huffman_for_nodes(parent)
|
node_huff = build_huffman_for_nodes(parent)
|
||||||
|
|
||||||
puts "writing"
|
|
||||||
strings = collect_strings(parent)
|
strings = collect_strings(parent)
|
||||||
|
|
||||||
|
puts "building huffman table for strings"
|
||||||
|
string_huff = build_huffman_for_strings(strings)
|
||||||
|
|
||||||
|
puts "writing"
|
||||||
write_strings(file, strings)
|
write_strings(file, strings)
|
||||||
bit_file = BitWriter.new file
|
bit_file = BitWriter.new file
|
||||||
binary_write(bit_file, parent, string_huff, node_huff)
|
binary_write(bit_file, parent, string_huff, node_huff)
|
||||||
|
|
50
unpack.c
50
unpack.c
|
@ -3,6 +3,8 @@
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <zlib.h>
|
#include <zlib.h>
|
||||||
|
|
||||||
|
#include "huffman.h"
|
||||||
|
|
||||||
#define CHUNK 1024
|
#define CHUNK 1024
|
||||||
|
|
||||||
struct node {
|
struct node {
|
||||||
|
@ -12,15 +14,15 @@ struct node {
|
||||||
};
|
};
|
||||||
|
|
||||||
static int
|
static int
|
||||||
load_dictionary(FILE *source, unsigned char **dictionary) {
|
load_dictionary(FILE *source, char ***dictionary, int *dictionary_size)
|
||||||
|
{
|
||||||
int ret;
|
int ret;
|
||||||
unsigned have;
|
|
||||||
z_stream strm;
|
z_stream strm;
|
||||||
unsigned char in[CHUNK];
|
unsigned char in[CHUNK];
|
||||||
int read = 0;
|
int read = 0;
|
||||||
|
|
||||||
|
// XXX keep a ref to buf for free()
|
||||||
unsigned char *buf = malloc(sizeof(char) * CHUNK);
|
unsigned char *buf = malloc(sizeof(char) * CHUNK);
|
||||||
*dictionary = buf;
|
|
||||||
|
|
||||||
printf("unpacking string dictionary\n");
|
printf("unpacking string dictionary\n");
|
||||||
|
|
||||||
|
@ -67,7 +69,6 @@ load_dictionary(FILE *source, unsigned char **dictionary) {
|
||||||
printf("MEMORY ERROR\n");
|
printf("MEMORY ERROR\n");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
have = CHUNK - strm.avail_out;
|
|
||||||
read += CHUNK - strm.avail_out;
|
read += CHUNK - strm.avail_out;
|
||||||
} while (strm.avail_out == 0);
|
} while (strm.avail_out == 0);
|
||||||
|
|
||||||
|
@ -75,16 +76,33 @@ load_dictionary(FILE *source, unsigned char **dictionary) {
|
||||||
/* done when inflate() says it's done */
|
/* done when inflate() says it's done */
|
||||||
} while (ret != Z_STREAM_END);
|
} while (ret != Z_STREAM_END);
|
||||||
|
|
||||||
printf("data is:\n");
|
int offset_size = 64;
|
||||||
|
int *dictionary_offsets = malloc (sizeof (int) * offset_size);
|
||||||
|
*dictionary_size = 1;
|
||||||
|
|
||||||
int i;
|
int i;
|
||||||
for (i=0; i < read; i++) {
|
int j = 0;
|
||||||
|
dictionary_offsets[j++] = 0;
|
||||||
|
for (i = 0; i < read; i++) {
|
||||||
if (buf[i] == '\0') {
|
if (buf[i] == '\0') {
|
||||||
putchar('\n');
|
if (i != read - 1) {
|
||||||
} else {
|
dictionary_offsets[j++] = i + 1;
|
||||||
putchar(buf[i]);
|
(*dictionary_size)++;
|
||||||
|
if (j == offset_size) {
|
||||||
|
offset_size = offset_size * 2;
|
||||||
|
dictionary_offsets =
|
||||||
|
realloc (dictionary_offsets,
|
||||||
|
sizeof (int) *
|
||||||
|
offset_size);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
*dictionary = malloc (sizeof (char *) * offset_size);
|
||||||
|
for (i = 0; i < offset_size; i++) {
|
||||||
|
(*dictionary)[i] = (char *) buf + dictionary_offsets[i];
|
||||||
|
}
|
||||||
|
|
||||||
// rewind back to unused zlib bytes
|
// rewind back to unused zlib bytes
|
||||||
if (fseek(source, (long) strm.avail_in * -1, SEEK_CUR)) {
|
if (fseek(source, (long) strm.avail_in * -1, SEEK_CUR)) {
|
||||||
|
@ -95,6 +113,7 @@ load_dictionary(FILE *source, unsigned char **dictionary) {
|
||||||
printf ("dictionary stats:\n");
|
printf ("dictionary stats:\n");
|
||||||
printf ("\tcompressed size: %zu\n", ftell(source));
|
printf ("\tcompressed size: %zu\n", ftell(source));
|
||||||
printf ("\tuncompressed size: %d\n", read);
|
printf ("\tuncompressed size: %d\n", read);
|
||||||
|
printf ("\tentries found: %d\n", *dictionary_size);
|
||||||
inflateEnd(&strm);
|
inflateEnd(&strm);
|
||||||
|
|
||||||
return ret == Z_STREAM_END ? 0 : -1;
|
return ret == Z_STREAM_END ? 0 : -1;
|
||||||
|
@ -117,7 +136,8 @@ load_node_list(FILE *stream, struct node **list) {
|
||||||
int
|
int
|
||||||
main(int argc, char **argv) {
|
main(int argc, char **argv) {
|
||||||
FILE *fp;
|
FILE *fp;
|
||||||
unsigned char *dictionary;
|
char **dictionary;
|
||||||
|
int dictionary_size;
|
||||||
struct node *list;
|
struct node *list;
|
||||||
|
|
||||||
if (argc != 2) {
|
if (argc != 2) {
|
||||||
|
@ -131,11 +151,19 @@ main(int argc, char **argv) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (load_dictionary(fp, &dictionary)) {
|
if (load_dictionary(fp, &dictionary, &dictionary_size)) {
|
||||||
printf("dictionary inflation failed. exiting\n");
|
printf("dictionary inflation failed. exiting\n");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct huffman_node *tree = huffman_build_tree ((void **) dictionary,
|
||||||
|
dictionary_size);
|
||||||
|
|
||||||
|
int bits_read;
|
||||||
|
short bits = 0xC0;
|
||||||
|
|
||||||
|
printf("\n\n%s\n", huffman_lookup (tree, (unsigned char *) &bits, &bits_read));
|
||||||
|
|
||||||
if (load_node_list(fp, &list)) {
|
if (load_node_list(fp, &list)) {
|
||||||
printf("node list parsing failed. exiting\n");
|
printf("node list parsing failed. exiting\n");
|
||||||
return -1;
|
return -1;
|
||||||
|
|
Loading…
Reference in a new issue