Fix bug in duplicate detection.

Each node is written to disk as a list of (path, node pointer) pairs.
The duplicate detection code was considering the node's children and the
node's name. If we only look at the children, we can find many more
duplicates.

The previous duplicate detection reduced 424 nodes to 127. The new
duplicate detection reduces them to 48 nodes.

With this better duplicate detection, the prefix compression doesn't
appear to be useful anymore, so comment it out.

Trims an extra 40 bytes off my sample data.
This commit is contained in:
James Bowes 2012-07-28 12:46:03 -03:00
parent a8a7fd57f6
commit 227e8de979
2 changed files with 71 additions and 106 deletions

2
.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
unpack
*.sw?

169
thing.rb
View file

@ -45,109 +45,47 @@ class BitWriter
end end
end end
class Children
attr_accessor :children, :written
def initialize()
@children = []
@written = false
end
def each()
@children.each do |child|
yield child
end
end
def collect()
@children.each do |child|
yield child
end
end
def length()
@children.length
end
def [](i)
@children[i]
end
def []=(i, val)
@children[i] = val
end
def <<(other)
@children << other
end
def join(str)
@children.join(str)
end
def signature
@children.sort! do |a, b|
a.path <=> b.path
end
"[" + @children.collect { |x| x.path + x.signature }.join("|") + "]"
end
end
class Node class Node
attr_accessor :path, :children, :de_duped, :offset attr_accessor :children, :de_duped, :offset, :written
def initialize(path) def initialize(path)
@path = path @children = {}
@children = Children.new
@sig = nil
@de_duped = false @de_duped = false
@offset = ran_char(2) @offset = ran_char(2)
end end
def has_key?(key) def has_key?(key)
@children.each do |child| @children.has_key? key
if child.path == key
return true
end
end
return false
end end
def get_child(name) def get_child(name)
@children.each do |child| @children[name]
if child.path == name
return child
end
end
return nil
end end
def de_duped=(val) def de_duped=(val)
@de_duped = val @de_duped = val
@children.each do |child| @children.each do |key, child|
child.de_duped = true child.de_duped = true
end end
end end
def signature() def signature
if @sig.nil? sorted = @children.keys.sort do |a, b|
@sig = @children.signature a <=> b
end end
@sig "[" + sorted.collect { |key| key + @children[key].signature }.join("|") + "]"
end end
def flatten() def flatten()
flat = [self] flat = [self]
@children.each do |child| @children.each do |key, child|
flat += child.flatten flat += child.flatten
end end
flat flat
end end
def to_json(*a) def to_json(*a)
{ @children.to_json(*a)
@path => @children
}.to_json(*a)
end end
end end
@ -168,7 +106,7 @@ def mk_hash(sgmts, parent)
segment = sgmts.shift segment = sgmts.shift
return parent if segment.nil? return parent if segment.nil?
unless parent.has_key?(segment) unless parent.has_key?(segment)
parent.children << mk_hash(sgmts, Node.new(segment)) parent.children[segment] = mk_hash(sgmts, Node.new(segment))
else else
mk_hash(sgmts, parent.get_child(segment)) mk_hash(sgmts, parent.get_child(segment))
# else # else
@ -178,35 +116,51 @@ def mk_hash(sgmts, parent)
end end
def compress_prefix(parent) def compress_prefix(parent)
parent.children.each do |child| parent.children.keys.each do |key|
child = parent.children[key]
compress_prefix(child) compress_prefix(child)
if child.children.length == 1
puts "compressing #{key} and #{child.children.keys[0]}"
new_key = key + "/" + child.children.keys[0]
parent.children[new_key] = child
child.children = child.children.values[0].children
parent.children.delete(key)
end end
if parent.children.length == 1
puts "compressing #{parent.path} and #{parent.children[0].path}"
parent.path += "/" + parent.children[0].path
parent.children = parent.children[0].children
end end
return parent return parent
end end
def replace(tree, old, new)
tree.flatten.uniq.each do |node|
node.children.keys.each do |key|
if node.children[key] == old
node.children[key] = new
end
end
end
end
# given a tree of nodes, try and find branches that match the children of node. # given a tree of nodes, try and find branches that match the children of node.
# if found, replace those branches with node's children # if found, replace those branches with node's children
def de_dupe(tree, node) def de_dupe(tree, node)
tree.flatten.each do |sub_tree| tree.flatten.uniq.each do |sub_tree|
if sub_tree.children == node.children if sub_tree == node
# nothing # nothing
elsif node.signature == sub_tree.signature elsif node.signature == sub_tree.signature
sub_tree.de_duped = true sub_tree.de_duped = true
sub_tree.children = node.children replace(tree, sub_tree, node)
puts "Found dupe! " + node.signature unless node.signature == "[]" puts "Found dupe! " + node.signature unless node.signature == "[]"
end end
end end
end end
def de_dupe_driver(tree) def de_dupe_driver(tree)
before = tree.flatten.length
tree.flatten.each do |node| tree.flatten.each do |node|
de_dupe(tree, node) unless node.de_duped de_dupe(tree, node) unless node.de_duped
end end
puts "Total nodes Before: #{before} After: #{tree.flatten.uniq.length}"
end end
# simulate random file offsets # simulate random file offsets
@ -221,22 +175,16 @@ def binary_write(file, parent, string_huff, node_huff)
#offset to child node indicies #offset to child node indicies
# not needed, can just go write to children indicies # not needed, can just go write to children indicies
#file.write(ran_char) #file.write(ran_char)
if parent.children.written if parent.written
puts "not writing children of #{parent.path}"
return return
end end
# number of paths parent.children.each do |path, child|
length = parent.children.length.to_s
# path_count = (3 - length.length).times.collect { |i| "0" }.join + length
# file.write(path_count)
# puts "CHILD COUNT: " + parent.children.length.to_s
parent.children.each do |child|
# puts "PATH: " + child.path # puts "PATH: " + child.path
# file.write(child.path) # file.write(child.path)
# file.write("\0") # file.write("\0")
# index of path string # index of path string
file.write_bits(string_huff.encode(child.path)) file.write_bits(string_huff.encode(path))
# offset to node # offset to node
# index of node, that is. # index of node, that is.
file.write_bits(node_huff.encode(child)) file.write_bits(node_huff.encode(child))
@ -244,16 +192,15 @@ def binary_write(file, parent, string_huff, node_huff)
# reserve null byte for end of node info # reserve null byte for end of node info
# 3 0s are reserved in our name huffman table to denote end of node # 3 0s are reserved in our name huffman table to denote end of node
file.write_bits("000") file.write_bits("000")
parent.children.each do |child| parent.children.each do |path, child|
binary_write(file, child, string_huff, node_huff) binary_write(file, child, string_huff, node_huff)
child.children.written = true child.written = true
end end
end end
def write_strings(file, strings) def write_strings(file, strings)
string_io = StringIO.new() string_io = StringIO.new()
strings.each_key do |string| strings.each_key do |string|
puts "STRING: " + string
string_io.write(string) string_io.write(string)
string_io.write("\0") string_io.write("\0")
end end
@ -263,22 +210,38 @@ end
def collect_strings(parent) def collect_strings(parent)
strings = {} strings = {}
parent.flatten.each do |node| parent.flatten.uniq.each do |node|
strings[node.path] = [0, ran_char(1)] unless strings.has_key? node.path node.children.each_key do |key|
strings[node.path][0] += 1 strings[key] ||= 0
strings[key] += 1
end
end end
strings strings
end end
def build_huffman_for_strings(parent) def build_huffman_for_strings(parent)
nodes = parent.flatten.uniq paths = []
paths = nodes.collect {|node| node.path} parent.flatten.uniq.each do |node|
table = HuffmanEncoding.new paths node.children.each_key {|key| paths << key}
end
HuffmanEncoding.new paths
end end
def build_huffman_for_nodes(parent) def build_huffman_for_nodes(parent)
nodes = parent.flatten nodes = parent.flatten.uniq
table = HuffmanEncoding.new nodes refs = {}
nodes.each do |node|
node.children.each do |key, node|
refs[node] ||= 0
refs[node] += 1
end
end
refs[parent] = 1
expanded = []
refs.each do |node, freq|
freq.times {expanded << node}
end
table = HuffmanEncoding.new expanded
end end
if $0 == __FILE__ if $0 == __FILE__
@ -322,7 +285,7 @@ if $0 == __FILE__
# prime the signatures # prime the signatures
parent.signature parent.signature
de_dupe_driver(parent) de_dupe_driver(parent)
parent = compress_prefix(parent) # parent = compress_prefix(parent)
string_huff = build_huffman_for_strings(parent) string_huff = build_huffman_for_strings(parent)
node_huff = build_huffman_for_nodes(parent) node_huff = build_huffman_for_nodes(parent)