checkpoint
This commit is contained in:
parent
ddf7d89408
commit
abfdbebe28
1 changed files with 133 additions and 30 deletions
161
thing.rb
161
thing.rb
|
@ -10,14 +10,42 @@ rescue
|
||||||
abort('ERROR: plz2run #> gem install json')
|
abort('ERROR: plz2run #> gem install json')
|
||||||
end
|
end
|
||||||
|
|
||||||
|
require './huffman'
|
||||||
|
|
||||||
# usage: ./content_from_pem.rb 5286016419950084643.pem
|
# usage: ./content_from_pem.rb 5286016419950084643.pem
|
||||||
|
|
||||||
|
# Packs a sequence of '0'/'1' characters into bytes and writes them to a
# stream. Bits are filled most-significant first; a byte is emitted as soon
# as eight bits have been collected.
class BitWriter
  # stream - any object responding to #write(String), e.g. a File or StringIO.
  def initialize(stream)
    @stream = stream
    @byte = 0   # integer accumulator for the byte being assembled
    @count = 7  # position of the next bit to fill (7 = MSB, 0 = LSB)
  end

  # Appends a single bit. The character '1' sets the bit; any other value
  # leaves it cleared. Flushes the byte automatically once it is full.
  def write(char)
    @byte |= 1 << @count if char == '1'
    @count -= 1
    pad if @count < 0
  end

  # Writes the byte under construction to the stream (unfilled low bits stay
  # zero) and starts a fresh byte. A byte is written unconditionally, so
  # calling this with no pending bits emits a single zero byte.
  def pad
    @stream.write(@byte.chr)
    @byte = 0
    @count = 7
  end
end
|
||||||
|
|
||||||
class Children
|
class Children
|
||||||
|
|
||||||
attr_accessor :children
|
attr_accessor :children, :written
|
||||||
|
|
||||||
def initialize()
|
def initialize()
|
||||||
@children = []
|
@children = []
|
||||||
|
@written = false
|
||||||
end
|
end
|
||||||
|
|
||||||
def each()
|
def each()
|
||||||
|
@ -50,17 +78,24 @@ class Children
|
||||||
def join(str)
|
def join(str)
|
||||||
@children.join(str)
|
@children.join(str)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Builds the signature string for this collection: the children are first
# sorted in place by path, then each child's path and its own signature are
# concatenated, the pieces joined with "|" and wrapped in square brackets.
def signature
  @children.sort! { |left, right| left.path <=> right.path }
  parts = @children.map { |kid| kid.path + kid.signature }
  "[#{parts.join('|')}]"
end
|
||||||
end
|
end
|
||||||
|
|
||||||
class Node
|
class Node
|
||||||
attr_accessor :path, :children, :de_duped, :written
|
attr_accessor :path, :children, :de_duped, :offset
|
||||||
|
|
||||||
def initialize(path)
|
def initialize(path)
|
||||||
@path = path
|
@path = path
|
||||||
@children = Children.new
|
@children = Children.new
|
||||||
@sig = nil
|
@sig = nil
|
||||||
@de_duped = false
|
@de_duped = false
|
||||||
@written = false
|
@offset = ran_char(2)
|
||||||
end
|
end
|
||||||
|
|
||||||
def has_key?(key)
|
def has_key?(key)
|
||||||
|
@ -81,12 +116,16 @@ class Node
|
||||||
return nil
|
return nil
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Sets the de-duplication flag on this node and pushes the same value down
# to every child (each child's own setter is used, so the flag reaches the
# whole subtree when the children implement this same setter).
#
# val - the flag value to store; previously children were always marked
#       true regardless of the value passed in.
def de_duped=(val)
  @de_duped = val
  @children.each do |child|
    child.de_duped = val
  end
end
|
||||||
|
|
||||||
def signature()
|
def signature()
|
||||||
@sig = @path + "[" +
|
|
||||||
@children.collect { |x| x.signature }.join("|") + "]"
|
|
||||||
if @sig.nil?
|
if @sig.nil?
|
||||||
@sig = @path + "[" +
|
@sig = @children.signature
|
||||||
@children.collect { |x| x.signature }.join("|") + "]"
|
|
||||||
end
|
end
|
||||||
@sig
|
@sig
|
||||||
end
|
end
|
||||||
|
@ -147,39 +186,97 @@ end
|
||||||
# given a tree of nodes, try and find branches that match the children of node.
|
# given a tree of nodes, try and find branches that match the children of node.
|
||||||
# if found, replace those branches with node's children
|
# if found, replace those branches with node's children
|
||||||
def de_dupe(tree, node)
|
def de_dupe(tree, node)
|
||||||
for i in 0..tree.children.length - 1
|
tree.flatten.each do |sub_tree|
|
||||||
if tree.children[i] == node
|
if sub_tree.children == node.children
|
||||||
# nothing
|
# nothing
|
||||||
elsif node.signature == tree.children[i].signature
|
elsif node.signature == sub_tree.signature
|
||||||
tree.children[i].de_duped = true
|
sub_tree.de_duped = true
|
||||||
tree.children[i] = node
|
sub_tree.children = node.children
|
||||||
puts "Found dupe! " + node.signature
|
puts "Found dupe! " + node.signature unless node.signature == "[]"
|
||||||
else
|
|
||||||
de_dupe(tree.children[i], node)
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def de_dupe_driver(tree, nodes)
|
def de_dupe_driver(tree)
|
||||||
nodes.each do |node|
|
tree.flatten.each do |node|
|
||||||
de_dupe(tree, node) unless node.de_duped
|
de_dupe(tree, node) unless node.de_duped
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def binary_write(file, parent)
|
# simulate random file offsets
|
||||||
file.write(parent.path)
|
def ran_char(val)
|
||||||
file.write("\0\0\0\0")
|
val = (0..val - 1).map {rand(256).chr}.join
|
||||||
|
return val
|
||||||
|
end
|
||||||
|
|
||||||
|
def binary_write(file, parent, strings)
|
||||||
|
# file.write(parent.path)
|
||||||
|
# file.write("\0")
|
||||||
|
# offset to child node indices
|
||||||
|
# not needed, can just go write to children indices
|
||||||
|
#file.write(ran_char)
|
||||||
|
if parent.children.written
|
||||||
|
puts "not writing children of #{parent.path}"
|
||||||
|
return
|
||||||
|
end
|
||||||
|
|
||||||
|
# number of paths
|
||||||
|
length = parent.children.length.to_s
|
||||||
|
# path_count = (3 - length.length).times.collect { |i| "0" }.join + length
|
||||||
|
# file.write(path_count)
|
||||||
|
# puts "CHILD COUNT: " + parent.children.length.to_s
|
||||||
parent.children.each do |child|
|
parent.children.each do |child|
|
||||||
|
# puts "PATH: " + child.path
|
||||||
# file.write(child.path)
|
# file.write(child.path)
|
||||||
file.write("\0\0\0")
|
# file.write("\0")
|
||||||
|
# index of path string
|
||||||
|
file.write(strings[child.path][1])
|
||||||
|
# offset to node
|
||||||
|
# index of node, that is.
|
||||||
|
file.write(child.offset)
|
||||||
end
|
end
|
||||||
|
# reserve null byte for end of node info
|
||||||
|
file.write("\0")
|
||||||
parent.children.each do |child|
|
parent.children.each do |child|
|
||||||
unless child.written
|
binary_write(file, child, strings)
|
||||||
binary_write(file, child)
|
child.children.written = true
|
||||||
child.written = true
|
|
||||||
else
|
|
||||||
puts "not writing #{child.path}"
|
|
||||||
end
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Writes the string table to +file+ as a single zlib-deflated blob.
#
# file    - an object responding to #write(String).
# strings - a Hash whose keys are the strings to store; values are ignored
#           here. Keys are emitted in the hash's iteration order, each one
#           followed by a NUL terminator.
#
# Returns whatever file.write returns.
def write_strings(file, strings)
  table = strings.each_key.map { |string| "#{string}\0" }.join
  zlib = Zlib::Deflate.new(Zlib::BEST_COMPRESSION, 15, Zlib::MAX_MEM_LEVEL)
  begin
    # Compress the table contents themselves; StringIO#to_s would only yield
    # the object's inspect string, not the buffered data.
    file.write(zlib.deflate(table, Zlib::FINISH))
  ensure
    zlib.close # release the native zlib stream
  end
end
|
||||||
|
|
||||||
|
# Tallies the path strings found in a tree.
#
# parent - an object whose #flatten returns the nodes to scan; each node
#          must respond to #path.
#
# Returns a Hash mapping each distinct path to a two-element Array:
# [occurrence count, the value of ran_char(1) taken when the path was
# first seen].
def collect_strings(parent)
  parent.flatten.each_with_object({}) do |node, table|
    key = node.path
    entry = (table[key] ||= [0, ran_char(1)])
    entry[0] += 1
  end
end
|
||||||
|
|
||||||
|
def build_huffman_for_strings(parent, strings)
|
||||||
|
nodes = parent.flatten.uniq
|
||||||
|
paths = nodes.collect {|node| node.path}
|
||||||
|
table = HuffmanEncoding.new paths
|
||||||
|
|
||||||
|
|
||||||
|
paths.uniq.each do |string|
|
||||||
|
puts table.encode(string).to_s + " " + string
|
||||||
|
end
|
||||||
|
|
||||||
|
nodes = parent.flatten
|
||||||
|
table = HuffmanEncoding.new nodes
|
||||||
|
|
||||||
|
parent.flatten.uniq do |node|
|
||||||
|
puts table.encode(node).to_s
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -222,11 +319,17 @@ if $0 == __FILE__
|
||||||
parent = mk_hash(chunks, parent)
|
parent = mk_hash(chunks, parent)
|
||||||
end
|
end
|
||||||
# prime the signatures
|
# prime the signatures
|
||||||
de_dupe_driver(parent, parent.flatten)
|
parent.signature
|
||||||
|
de_dupe_driver(parent)
|
||||||
parent = compress_prefix(parent)
|
parent = compress_prefix(parent)
|
||||||
de_dupe_driver(parent, parent.flatten)
|
|
||||||
binary_write(binary, parent)
|
strings = collect_strings(parent)
|
||||||
|
build_huffman(parent, strings)
|
||||||
|
|
||||||
|
write_strings(binary, strings)
|
||||||
|
binary_write(binary, parent, strings)
|
||||||
file.write(parent.to_json)
|
file.write(parent.to_json)
|
||||||
|
|
||||||
end
|
end
|
||||||
puts "Wrote:\n [%d] %s\n [%d] %s" % [File.size(txt_name), txt_name, File.size(json_name), json_name]
|
puts "Wrote:\n [%d] %s\n [%d] %s" % [File.size(txt_name), txt_name, File.size(json_name), json_name]
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue