This commit is contained in:
Vincent Batts 2012-08-07 06:50:48 -07:00
commit d8b65106ab
4 changed files with 147 additions and 11 deletions

38
README.md Normal file
View file

@ -0,0 +1,38 @@
== Overview ==
Proof of concept (POC) that compiles a data set into a modified radix tree
and applies Huffman encoding to it.
== Usage ==
Take in a v1 X.509 certificate, extract the
content sets, and write them out as a newline-delimited list:
`$> ruby ./thing.rb d this-cert.pem`
This would produce a file named 'this-cert.txt'
To see this text list in tree format, run:
`$> ruby ./thing.rb p this-cert.txt | less`
Process this list to generate the compiled dictionary output:
`$> ruby ./thing.rb c this-cert.txt`
This would produce a file named 'this-cert.bin'
Then, unpack the binary with:
`$> ./unpack this-cert.bin`
or
`$> ruby ./unpack.rb this-cert.bin`
The 'thing.rb' script also supports a "-v" flag for verbose (debug-level) logging.
== Code compiles ==
To compile the 'unpack' command, just run `make`.
(This requires make, gcc, and zlib-devel.)

View file

@ -89,11 +89,11 @@ class HuffmanEncoding
end
# Look up the code for +entry+ in the inverted lookup table.
#
# Returns the key of +lookup+ whose value is +entry+, or an empty string
# when +entry+ is not present, so callers always receive a String.
def encode(entry)
  # Invert once; the earlier duplicate expression built the inverted hash
  # and threw the result away.
  lookup.invert[entry] || ""
end
# Look up the entry stored under +code+ in the lookup table.
#
# Returns the value of +lookup+ for +code+, or an empty string when the
# code is unknown, so callers always receive a non-nil result.
def decode(code)
  # A single lookup is enough; the earlier duplicate expression's value
  # was discarded.
  lookup[code] || ""
end
def encode_list(list)

View file

@ -1,8 +1,16 @@
#!/usr/bin/env ruby
=begin
usage: ruby ./thing.rb <dpc> 5286016419950084643.{pem,txt}
=end
# stdlib
require 'openssl'
require 'zlib'
require 'stringio'
require 'logger'
require 'pp'
# gems
require 'rubygems'
begin
require 'json'
@ -10,9 +18,12 @@ rescue
abort('ERROR: plz2run #> gem install json')
end
# local
require './huffman'
# usage: ./content_from_pem.rb 5286016419950084643.pem
$log = Logger.new(STDOUT)
#$log.level = Logger::DEBUG
$log.level = Logger::FATAL
class BitWriter
@ -59,7 +70,7 @@ class Node
@children.has_key? key
end
def get_child(name)
def [](name)
@children[name]
end
@ -90,6 +101,10 @@ class Node
# Serialize this node as JSON by delegating to its children collection.
# Any arguments are forwarded to the children's +to_json+ unchanged.
def to_json(*args)
  @children.to_json(*args)
end
# Build a plain nested Hash from this node's children.
#
# Each child key maps to the result of calling +to_h+ on the child value,
# so the conversion recurses through the whole subtree.
def to_h
  result = {}
  @children.each do |key, child|
    result[key] = child.to_h
  end
  result
end
end
def akamai_hex_to_content_set(akamai_hex)
@ -111,7 +126,7 @@ def mk_hash(sgmts, parent)
unless parent.has_key?(segment)
parent.children[segment] = mk_hash(sgmts, Node.new(segment))
else
mk_hash(sgmts, parent.get_child(segment))
mk_hash(sgmts, parent[segment])
# else
# hash[segment].update(mk_hash(sgmts, hash[segment]))
end
@ -159,7 +174,7 @@ def de_dupe(list, node)
if sub_tree.children[key].signature == node.signature
sub_tree.children[key].de_duped = true
sub_tree.children[key] = node
puts "Found dupe! " + node.signature unless node.signature == "[]"
$log.info("Found dupe!" ) { node.signature unless node.signature == "[]" }
end
end
end
@ -170,7 +185,7 @@ def de_dupe_driver(tree)
before = list.length
i = 1
list.each do |node|
puts "de dupe #{i} / #{before}"
$log.info('de_dupe_driver') { "de dupe #{i} / #{before}" }
i += 1
de_dupe(list, node) unless node.de_duped
end
@ -199,6 +214,7 @@ def binary_write(file, parent, string_huff, node_huff)
# file.write(child.path)
# file.write("\0")
# index of path string
$log.debug('binary_write') { "path: " + path.inspect + "; encoded: " + string_huff.encode(path).inspect }
file.write_bits(string_huff.encode(path))
# offset to node
# index of node, that is.
@ -213,6 +229,23 @@ def binary_write(file, parent, string_huff, node_huff)
end
end
# Read the file at +path+ and return its contents split on newlines.
#
# @param path [String] file to read
# @return [Array<String>] one element per "\n"-separated line
def list_from_file(path)
  File.read(path).split("\n")
end
# Fold a list of "/"-separated paths into a tree rooted at a fresh,
# empty-named Node.
#
# Each path has a single leading "/" removed (if present), is split on "/",
# and is handed to mk_hash together with the current root; whatever mk_hash
# returns becomes the root for the next path.
#
# @param sets [Array<String>] paths such as "/content/beta/rhel/server/6"
# @return the root produced by the final mk_hash call (the fresh Node when
#   +sets+ is empty)
def tree_from_list(sets)
  sets.inject(Node.new("")) do |root, set|
    # "/content/beta/rhel" => ["content", "beta", "rhel"]
    chunks = set.sub(%r{\A/}, "").split("/")
    mk_hash(chunks, root)
  end
end
def write_strings(file, strings)
string_io = StringIO.new()
strings.each_key do |string|
@ -260,15 +293,21 @@ def build_huffman_for_nodes(parent)
end
if $0 == __FILE__
if ARGV.include?("-v")
$log.level = Logger::DEBUG
ARGV.delete("-v")
end
if ARGV.length != 2
puts "usage: thing.rb <d|c> <file>"
puts "please specify one of d or c"
puts "d - dump an x509 cert into a newline delimited output"
puts "p - pretty print the newline delimited list, as a tree"
puts "c - compress the newline delimited input list of paths"
exit()
end
if ARGV[0] == 'd'
case ARGV[0]
when 'd'
cert_data = File.read(ARGV[1])
cert = OpenSSL::X509::Certificate.new(cert_data)
@ -282,8 +321,13 @@ if $0 == __FILE__
file.write("\n")
end
exit()
end
when 'p'
sets = list_from_file(ARGV[1])
parent = tree_from_list(sets)
de_dupe_driver(parent)
pp parent.to_h
when 'c'
paths = File.read(ARGV[1])
sets = paths.split("\n")
@ -316,5 +360,7 @@ if $0 == __FILE__
bit_file = BitWriter.new file
binary_write(bit_file, parent, string_huff, node_huff)
bit_file.pad
end
end
end # esac
end

52
unpack.rb Executable file
View file

@ -0,0 +1,52 @@
#!/usr/bin/env ruby
# stdlib
require 'stringio'
require 'zlib'
# Decompress a zlib stream in a single call.
#
# @param data [String] zlib-compressed bytes
# @return [String] the decompressed bytes
def inflate(data)
  Zlib.inflate(data)
end
# Compress +data+ into a zlib stream using the default compression level.
#
# @param data [String] bytes to compress
# @return [String] the zlib-compressed bytes
def deflate(data)
  Zlib.deflate(data)
end
# Streaming variant of #inflate, kept for comparison: for this tool's input
# the author found no difference between the two.
#
# Uses an explicit Zlib::Inflate stream with window bits 15 and always
# closes it, even when decompression raises.
#
# @param data [String] zlib-compressed bytes
# @return [String] the decompressed bytes
# @raise [Zlib::Error] when +data+ is not a valid or complete zlib stream
def inflate2(data)
  zlib = Zlib::Inflate.new(15)
  begin
    buff = zlib.inflate(data)
    zlib.finish
    buff
  ensure
    # Release the native zlib stream on both success and failure.
    zlib.close
  end
end
# Split the dictionary blob into its individual entries.
#
# Entries are separated by NUL bytes.
#
# @param data [String] dictionary bytes
# @return [Array<String>] the entries, in order
def load_dictionary(data)
  separator = "\x00"
  data.split(separator)
end
# Script entry point: for every file named on the command line, inflate the
# leading zlib stream, print the NUL-separated dictionary entries and some
# size statistics, then report how many bytes follow the dictionary.
if $0 == __FILE__
  abort("usage: %s <bin_file> ..." % __FILE__) if ARGV.empty?

  ARGV.each do |arg|
    # Read in binary mode and close the handle right away (block form).
    raw = File.open(arg, 'rb') { |file| file.read }
    z_data_io = StringIO.new(raw)

    data = inflate(raw)

    # The end of the compressed dictionary is located by re-deflating it and
    # measuring the result.
    # NOTE(review): this assumes the writer used the same zlib settings, so
    # the re-deflated length equals the stored stream length -- confirm.
    compressed_size = deflate(data).bytesize
    z_data_io.seek(compressed_size)

    puts "data is:"
    puts load_dictionary(data).map { |x| "\t#{x}" }

    puts "dictionary stats:"
    puts "\tcompressed size: %d" % compressed_size
    puts "\tuncompressed size: %d" % data.bytesize

    # Whatever follows the dictionary stream in the file.
    buf = z_data_io.read
    puts "Read %d bytes\n" % buf.bytesize
  end
end