From 227e8de97944b4a36558b63607fd1505e40831d5 Mon Sep 17 00:00:00 2001 From: James Bowes Date: Sat, 28 Jul 2012 12:46:03 -0300 Subject: [PATCH] Fix bug in duplicate detection. Each node is written to disk as a list of (path, node pointer) pairs. The duplicate detection code was considering the node's children and the node's name. If we only look at the children, we can find many more duplicates. Previous duplicate detection went from 424 nodes to 127. New duplicate detection reduces to 48 nodes. With this better duplicate detection, the prefix compression doesn't appear to be useful anymore. Comment it out. Trims an extra 40 bytes off my sample data. --- .gitignore | 2 + thing.rb | 175 +++++++++++++++++++++-------------------------------- 2 files changed, 71 insertions(+), 106 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a454759 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +unpack +*.sw? diff --git a/thing.rb b/thing.rb index 10e9e84..f6f8a70 100644 --- a/thing.rb +++ b/thing.rb @@ -45,109 +45,47 @@ class BitWriter end end -class Children - - attr_accessor :children, :written - - def initialize() - @children = [] - @written = false - end - - def each() - @children.each do |child| - yield child - end - end - - def collect() - @children.each do |child| - yield child - end - end - - def length() - @children.length - end - def [](i) - @children[i] - end - - def []=(i, val) - @children[i] = val - end - - def <<(other) - @children << other - end - - def join(str) - @children.join(str) - end - - def signature - @children.sort! 
do |a, b| - a.path <=> b.path - end - "[" + @children.collect { |x| x.path + x.signature }.join("|") + "]" - end -end - class Node - attr_accessor :path, :children, :de_duped, :offset + attr_accessor :children, :de_duped, :offset, :written def initialize(path) - @path = path - @children = Children.new - @sig = nil + @children = {} @de_duped = false @offset = ran_char(2) end def has_key?(key) - @children.each do |child| - if child.path == key - return true - end - end - return false + @children.has_key? key end def get_child(name) - @children.each do |child| - if child.path == name - return child - end - end - return nil + @children[name] end def de_duped=(val) @de_duped = val - @children.each do |child| + @children.each do |key, child| child.de_duped = true end end - def signature() - if @sig.nil? - @sig = @children.signature - end - @sig - end + def signature + sorted = @children.keys.sort do |a, b| + a <=> b + end + "[" + sorted.collect { |key| key + @children[key].signature }.join("|") + "]" + end def flatten() flat = [self] - @children.each do |child| + @children.each do |key, child| flat += child.flatten end flat end def to_json(*a) - { - @path => @children - }.to_json(*a) + @children.to_json(*a) end end @@ -168,7 +106,7 @@ def mk_hash(sgmts, parent) segment = sgmts.shift return parent if segment.nil? 
unless parent.has_key?(segment) - parent.children << mk_hash(sgmts, Node.new(segment)) + parent.children[segment] = mk_hash(sgmts, Node.new(segment)) else mk_hash(sgmts, parent.get_child(segment)) # else @@ -178,35 +116,51 @@ def mk_hash(sgmts, parent) end def compress_prefix(parent) - parent.children.each do |child| + parent.children.keys.each do |key| + child = parent.children[key] compress_prefix(child) - end - if parent.children.length == 1 - puts "compressing #{parent.path} and #{parent.children[0].path}" - parent.path += "/" + parent.children[0].path - parent.children = parent.children[0].children + if child.children.length == 1 + puts "compressing #{key} and #{child.children.keys[0]}" + new_key = key + "/" + child.children.keys[0] + parent.children[new_key] = child + child.children = child.children.values[0].children + parent.children.delete(key) + end end return parent end +def replace(tree, old, new) + tree.flatten.uniq.each do |node| + node.children.keys.each do |key| + if node.children[key] == old + node.children[key] = new + end + end + end +end + # given a tree of nodes, try and find branches that match the children of node. # if found, replace those branches with node's children def de_dupe(tree, node) - tree.flatten.each do |sub_tree| - if sub_tree.children == node.children + tree.flatten.uniq.each do |sub_tree| + if sub_tree == node # nothing elsif node.signature == sub_tree.signature sub_tree.de_duped = true - sub_tree.children = node.children + replace(tree, sub_tree, node) puts "Found dupe! 
" + node.signature unless node.signature == "[]" end end end def de_dupe_driver(tree) + before = tree.flatten.length tree.flatten.each do |node| de_dupe(tree, node) unless node.de_duped end + + puts "Total nodes Before: #{before} After: #{tree.flatten.uniq.length}" end # simulate random file offsets @@ -221,22 +175,16 @@ def binary_write(file, parent, string_huff, node_huff) #offset to child node indicies # not needed, can just go write to children indicies #file.write(ran_char) - if parent.children.written - puts "not writing children of #{parent.path}" + if parent.written return end - # number of paths - length = parent.children.length.to_s -# path_count = (3 - length.length).times.collect { |i| "0" }.join + length -# file.write(path_count) -# puts "CHILD COUNT: " + parent.children.length.to_s - parent.children.each do |child| + parent.children.each do |path, child| # puts "PATH: " + child.path # file.write(child.path) # file.write("\0") # index of path string - file.write_bits(string_huff.encode(child.path)) + file.write_bits(string_huff.encode(path)) # offset to node # index of node, that is. file.write_bits(node_huff.encode(child)) @@ -244,16 +192,15 @@ def binary_write(file, parent, string_huff, node_huff) # reserve null byte for end of node info # 3 0s are reserved in our name huffman table to denote end of node file.write_bits("000") - parent.children.each do |child| + parent.children.each do |path, child| binary_write(file, child, string_huff, node_huff) - child.children.written = true + child.written = true end end def write_strings(file, strings) string_io = StringIO.new() strings.each_key do |string| - puts "STRING: " + string string_io.write(string) string_io.write("\0") end @@ -263,22 +210,38 @@ end def collect_strings(parent) strings = {} - parent.flatten.each do |node| - strings[node.path] = [0, ran_char(1)] unless strings.has_key? 
node.path - strings[node.path][0] += 1 + parent.flatten.uniq.each do |node| + node.children.each_key do |key| + strings[key] ||= 0 + strings[key] += 1 + end end strings end def build_huffman_for_strings(parent) - nodes = parent.flatten.uniq - paths = nodes.collect {|node| node.path} - table = HuffmanEncoding.new paths + paths = [] + parent.flatten.uniq.each do |node| + node.children.each_key {|key| paths << key} + end + HuffmanEncoding.new paths end def build_huffman_for_nodes(parent) - nodes = parent.flatten - table = HuffmanEncoding.new nodes + nodes = parent.flatten.uniq + refs = {} + nodes.each do |node| + node.children.each do |key, node| + refs[node] ||= 0 + refs[node] += 1 + end + end + refs[parent] = 1 + expanded = [] + refs.each do |node, freq| + freq.times {expanded << node} + end + table = HuffmanEncoding.new expanded end if $0 == __FILE__ @@ -322,7 +285,7 @@ if $0 == __FILE__ # prime the signatures parent.signature de_dupe_driver(parent) - parent = compress_prefix(parent) +# parent = compress_prefix(parent) string_huff = build_huffman_for_strings(parent) node_huff = build_huffman_for_nodes(parent)