Fix bug in duplicate detection.
Each node is written to disk as a list of (path, node pointer) pairs. The duplicate detection code was considering both the node's children and the node's own name. If we compare only the children, we can find many more duplicates: the previous duplicate detection went from 424 nodes to 127, while the new duplicate detection reduces them to 48 nodes. With this better duplicate detection, the prefix compression doesn't appear to be useful anymore, so comment it out. This trims an extra 40 bytes off my sample data.
This commit is contained in:
parent
a8a7fd57f6
commit
227e8de979
2 changed files with 71 additions and 106 deletions
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
unpack
|
||||||
|
*.sw?
|
175
thing.rb
175
thing.rb
|
@ -45,109 +45,47 @@ class BitWriter
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
class Children
|
|
||||||
|
|
||||||
attr_accessor :children, :written
|
|
||||||
|
|
||||||
def initialize()
|
|
||||||
@children = []
|
|
||||||
@written = false
|
|
||||||
end
|
|
||||||
|
|
||||||
def each()
|
|
||||||
@children.each do |child|
|
|
||||||
yield child
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
def collect()
|
|
||||||
@children.each do |child|
|
|
||||||
yield child
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
def length()
|
|
||||||
@children.length
|
|
||||||
end
|
|
||||||
def [](i)
|
|
||||||
@children[i]
|
|
||||||
end
|
|
||||||
|
|
||||||
def []=(i, val)
|
|
||||||
@children[i] = val
|
|
||||||
end
|
|
||||||
|
|
||||||
def <<(other)
|
|
||||||
@children << other
|
|
||||||
end
|
|
||||||
|
|
||||||
def join(str)
|
|
||||||
@children.join(str)
|
|
||||||
end
|
|
||||||
|
|
||||||
def signature
|
|
||||||
@children.sort! do |a, b|
|
|
||||||
a.path <=> b.path
|
|
||||||
end
|
|
||||||
"[" + @children.collect { |x| x.path + x.signature }.join("|") + "]"
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
class Node
|
class Node
|
||||||
attr_accessor :path, :children, :de_duped, :offset
|
attr_accessor :children, :de_duped, :offset, :written
|
||||||
|
|
||||||
def initialize(path)
|
def initialize(path)
|
||||||
@path = path
|
@children = {}
|
||||||
@children = Children.new
|
|
||||||
@sig = nil
|
|
||||||
@de_duped = false
|
@de_duped = false
|
||||||
@offset = ran_char(2)
|
@offset = ran_char(2)
|
||||||
end
|
end
|
||||||
|
|
||||||
def has_key?(key)
|
def has_key?(key)
|
||||||
@children.each do |child|
|
@children.has_key? key
|
||||||
if child.path == key
|
|
||||||
return true
|
|
||||||
end
|
|
||||||
end
|
|
||||||
return false
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def get_child(name)
|
def get_child(name)
|
||||||
@children.each do |child|
|
@children[name]
|
||||||
if child.path == name
|
|
||||||
return child
|
|
||||||
end
|
|
||||||
end
|
|
||||||
return nil
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def de_duped=(val)
|
def de_duped=(val)
|
||||||
@de_duped = val
|
@de_duped = val
|
||||||
@children.each do |child|
|
@children.each do |key, child|
|
||||||
child.de_duped = true
|
child.de_duped = true
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def signature()
|
def signature
|
||||||
if @sig.nil?
|
sorted = @children.keys.sort do |a, b|
|
||||||
@sig = @children.signature
|
a <=> b
|
||||||
end
|
end
|
||||||
@sig
|
"[" + sorted.collect { |key| key + @children[key].signature }.join("|") + "]"
|
||||||
end
|
end
|
||||||
|
|
||||||
def flatten()
|
def flatten()
|
||||||
flat = [self]
|
flat = [self]
|
||||||
@children.each do |child|
|
@children.each do |key, child|
|
||||||
flat += child.flatten
|
flat += child.flatten
|
||||||
end
|
end
|
||||||
flat
|
flat
|
||||||
end
|
end
|
||||||
|
|
||||||
def to_json(*a)
|
def to_json(*a)
|
||||||
{
|
@children.to_json(*a)
|
||||||
@path => @children
|
|
||||||
}.to_json(*a)
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -168,7 +106,7 @@ def mk_hash(sgmts, parent)
|
||||||
segment = sgmts.shift
|
segment = sgmts.shift
|
||||||
return parent if segment.nil?
|
return parent if segment.nil?
|
||||||
unless parent.has_key?(segment)
|
unless parent.has_key?(segment)
|
||||||
parent.children << mk_hash(sgmts, Node.new(segment))
|
parent.children[segment] = mk_hash(sgmts, Node.new(segment))
|
||||||
else
|
else
|
||||||
mk_hash(sgmts, parent.get_child(segment))
|
mk_hash(sgmts, parent.get_child(segment))
|
||||||
# else
|
# else
|
||||||
|
@ -178,35 +116,51 @@ def mk_hash(sgmts, parent)
|
||||||
end
|
end
|
||||||
|
|
||||||
def compress_prefix(parent)
|
def compress_prefix(parent)
|
||||||
parent.children.each do |child|
|
parent.children.keys.each do |key|
|
||||||
|
child = parent.children[key]
|
||||||
compress_prefix(child)
|
compress_prefix(child)
|
||||||
end
|
if child.children.length == 1
|
||||||
if parent.children.length == 1
|
puts "compressing #{key} and #{child.children.keys[0]}"
|
||||||
puts "compressing #{parent.path} and #{parent.children[0].path}"
|
new_key = key + "/" + child.children.keys[0]
|
||||||
parent.path += "/" + parent.children[0].path
|
parent.children[new_key] = child
|
||||||
parent.children = parent.children[0].children
|
child.children = child.children.values[0].children
|
||||||
|
parent.children.delete(key)
|
||||||
|
end
|
||||||
end
|
end
|
||||||
return parent
|
return parent
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def replace(tree, old, new)
|
||||||
|
tree.flatten.uniq.each do |node|
|
||||||
|
node.children.keys.each do |key|
|
||||||
|
if node.children[key] == old
|
||||||
|
node.children[key] = new
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
# given a tree of nodes, try and find branches that match the children of node.
|
# given a tree of nodes, try and find branches that match the children of node.
|
||||||
# if found, replace those branches with node's children
|
# if found, replace those branches with node's children
|
||||||
def de_dupe(tree, node)
|
def de_dupe(tree, node)
|
||||||
tree.flatten.each do |sub_tree|
|
tree.flatten.uniq.each do |sub_tree|
|
||||||
if sub_tree.children == node.children
|
if sub_tree == node
|
||||||
# nothing
|
# nothing
|
||||||
elsif node.signature == sub_tree.signature
|
elsif node.signature == sub_tree.signature
|
||||||
sub_tree.de_duped = true
|
sub_tree.de_duped = true
|
||||||
sub_tree.children = node.children
|
replace(tree, sub_tree, node)
|
||||||
puts "Found dupe! " + node.signature unless node.signature == "[]"
|
puts "Found dupe! " + node.signature unless node.signature == "[]"
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def de_dupe_driver(tree)
|
def de_dupe_driver(tree)
|
||||||
|
before = tree.flatten.length
|
||||||
tree.flatten.each do |node|
|
tree.flatten.each do |node|
|
||||||
de_dupe(tree, node) unless node.de_duped
|
de_dupe(tree, node) unless node.de_duped
|
||||||
end
|
end
|
||||||
|
|
||||||
|
puts "Total nodes Before: #{before} After: #{tree.flatten.uniq.length}"
|
||||||
end
|
end
|
||||||
|
|
||||||
# simulate random file offsets
|
# simulate random file offsets
|
||||||
|
@ -221,22 +175,16 @@ def binary_write(file, parent, string_huff, node_huff)
|
||||||
#offset to child node indicies
|
#offset to child node indicies
|
||||||
# not needed, can just go write to children indicies
|
# not needed, can just go write to children indicies
|
||||||
#file.write(ran_char)
|
#file.write(ran_char)
|
||||||
if parent.children.written
|
if parent.written
|
||||||
puts "not writing children of #{parent.path}"
|
|
||||||
return
|
return
|
||||||
end
|
end
|
||||||
|
|
||||||
# number of paths
|
parent.children.each do |path, child|
|
||||||
length = parent.children.length.to_s
|
|
||||||
# path_count = (3 - length.length).times.collect { |i| "0" }.join + length
|
|
||||||
# file.write(path_count)
|
|
||||||
# puts "CHILD COUNT: " + parent.children.length.to_s
|
|
||||||
parent.children.each do |child|
|
|
||||||
# puts "PATH: " + child.path
|
# puts "PATH: " + child.path
|
||||||
# file.write(child.path)
|
# file.write(child.path)
|
||||||
# file.write("\0")
|
# file.write("\0")
|
||||||
# index of path string
|
# index of path string
|
||||||
file.write_bits(string_huff.encode(child.path))
|
file.write_bits(string_huff.encode(path))
|
||||||
# offset to node
|
# offset to node
|
||||||
# index of node, that is.
|
# index of node, that is.
|
||||||
file.write_bits(node_huff.encode(child))
|
file.write_bits(node_huff.encode(child))
|
||||||
|
@ -244,16 +192,15 @@ def binary_write(file, parent, string_huff, node_huff)
|
||||||
# reserve null byte for end of node info
|
# reserve null byte for end of node info
|
||||||
# 3 0s are reserved in our name huffman table to denote end of node
|
# 3 0s are reserved in our name huffman table to denote end of node
|
||||||
file.write_bits("000")
|
file.write_bits("000")
|
||||||
parent.children.each do |child|
|
parent.children.each do |path, child|
|
||||||
binary_write(file, child, string_huff, node_huff)
|
binary_write(file, child, string_huff, node_huff)
|
||||||
child.children.written = true
|
child.written = true
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def write_strings(file, strings)
|
def write_strings(file, strings)
|
||||||
string_io = StringIO.new()
|
string_io = StringIO.new()
|
||||||
strings.each_key do |string|
|
strings.each_key do |string|
|
||||||
puts "STRING: " + string
|
|
||||||
string_io.write(string)
|
string_io.write(string)
|
||||||
string_io.write("\0")
|
string_io.write("\0")
|
||||||
end
|
end
|
||||||
|
@ -263,22 +210,38 @@ end
|
||||||
|
|
||||||
def collect_strings(parent)
|
def collect_strings(parent)
|
||||||
strings = {}
|
strings = {}
|
||||||
parent.flatten.each do |node|
|
parent.flatten.uniq.each do |node|
|
||||||
strings[node.path] = [0, ran_char(1)] unless strings.has_key? node.path
|
node.children.each_key do |key|
|
||||||
strings[node.path][0] += 1
|
strings[key] ||= 0
|
||||||
|
strings[key] += 1
|
||||||
|
end
|
||||||
end
|
end
|
||||||
strings
|
strings
|
||||||
end
|
end
|
||||||
|
|
||||||
def build_huffman_for_strings(parent)
|
def build_huffman_for_strings(parent)
|
||||||
nodes = parent.flatten.uniq
|
paths = []
|
||||||
paths = nodes.collect {|node| node.path}
|
parent.flatten.uniq.each do |node|
|
||||||
table = HuffmanEncoding.new paths
|
node.children.each_key {|key| paths << key}
|
||||||
|
end
|
||||||
|
HuffmanEncoding.new paths
|
||||||
end
|
end
|
||||||
|
|
||||||
def build_huffman_for_nodes(parent)
|
def build_huffman_for_nodes(parent)
|
||||||
nodes = parent.flatten
|
nodes = parent.flatten.uniq
|
||||||
table = HuffmanEncoding.new nodes
|
refs = {}
|
||||||
|
nodes.each do |node|
|
||||||
|
node.children.each do |key, node|
|
||||||
|
refs[node] ||= 0
|
||||||
|
refs[node] += 1
|
||||||
|
end
|
||||||
|
end
|
||||||
|
refs[parent] = 1
|
||||||
|
expanded = []
|
||||||
|
refs.each do |node, freq|
|
||||||
|
freq.times {expanded << node}
|
||||||
|
end
|
||||||
|
table = HuffmanEncoding.new expanded
|
||||||
end
|
end
|
||||||
|
|
||||||
if $0 == __FILE__
|
if $0 == __FILE__
|
||||||
|
@ -322,7 +285,7 @@ if $0 == __FILE__
|
||||||
# prime the signatures
|
# prime the signatures
|
||||||
parent.signature
|
parent.signature
|
||||||
de_dupe_driver(parent)
|
de_dupe_driver(parent)
|
||||||
parent = compress_prefix(parent)
|
# parent = compress_prefix(parent)
|
||||||
|
|
||||||
string_huff = build_huffman_for_strings(parent)
|
string_huff = build_huffman_for_strings(parent)
|
||||||
node_huff = build_huffman_for_nodes(parent)
|
node_huff = build_huffman_for_nodes(parent)
|
||||||
|
|
Loading…
Reference in a new issue