path-packer-c/thing.rb

425 lines
10 KiB
Ruby
Raw Permalink Normal View History

2012-07-26 16:18:58 +00:00
#!/usr/bin/env ruby
=begin
usage: ruby ./thing.rb <dpc> 5286016419950084643.{pem,txt}
=end
2012-07-26 16:18:58 +00:00
# stdlib
2012-07-26 16:18:58 +00:00
require 'openssl'
require 'zlib'
require 'stringio'
require 'logger'
require 'pp'
# gems
2012-07-26 16:18:58 +00:00
require 'rubygems'
begin
require 'json'
rescue
abort('ERROR: plz2run #> gem install json')
end
# local
2012-07-27 17:47:20 +00:00
require './huffman'
2012-08-06 21:04:30 +00:00
$log = Logger.new(STDOUT)
2012-08-12 12:35:26 +00:00
#$log.level = Logger::DEBUG
$log.level = Logger::FATAL
2012-08-10 13:49:13 +00:00
$sentinal = "SENTINAL"
2012-07-26 16:18:58 +00:00
2012-07-27 17:47:20 +00:00
class BitWriter
def initialize(stream)
@stream = stream
2012-07-27 19:41:44 +00:00
@byte = 0x00
2012-08-10 13:49:13 +00:00
@count = 7
2012-07-27 17:47:20 +00:00
end
def write(char)
if char == '1'
2012-08-10 13:49:13 +00:00
@byte |= 0x01 << @count
2012-07-27 17:47:20 +00:00
end
@count -= 1
if @count == -1
self.pad
end
end
2012-07-27 19:41:44 +00:00
def write_bits(string)
string.each_char do |c|
self.write(c)
end
end
2012-07-27 17:47:20 +00:00
def pad()
2012-08-10 13:49:13 +00:00
@stream.write(Array(@byte).pack('c'))
@count = 7
2012-07-27 19:41:44 +00:00
@byte = 0x00
2012-07-27 17:47:20 +00:00
end
end
2012-07-26 17:21:16 +00:00
class Node
attr_accessor :children, :de_duped, :offset, :written
2012-07-26 17:21:16 +00:00
def initialize(path)
@children = {}
2012-07-26 19:38:10 +00:00
@de_duped = false
2012-07-27 17:47:20 +00:00
@offset = ran_char(2)
2012-08-01 09:48:24 +00:00
@sig = nil
2012-07-26 17:21:16 +00:00
end
def has_key?(key)
@children.has_key? key
2012-07-26 17:21:16 +00:00
end
2012-08-06 20:47:54 +00:00
def [](name)
@children[name]
2012-07-26 17:21:16 +00:00
end
2012-07-27 17:47:20 +00:00
def de_duped=(val)
@de_duped = val
@children.each do |key, child|
2012-07-27 17:47:20 +00:00
child.de_duped = true
end
end
def signature
2012-08-01 09:48:24 +00:00
return @sig unless @sig.nil?
sorted = @children.keys.sort do |a, b|
a <=> b
end
2012-08-01 09:48:24 +00:00
@sig = "[" + sorted.collect { |key| key + @children[key].signature }.join("|") + "]"
return @sig
end
2012-07-26 19:38:10 +00:00
def flatten()
flat = [self]
@children.each do |key, child|
2012-07-26 19:38:10 +00:00
flat += child.flatten
end
flat
end
2012-07-26 17:21:16 +00:00
def to_json(*a)
@children.to_json(*a)
2012-07-26 17:21:16 +00:00
end
def to_h
Hash[@children.map {|k, v| [k, v.to_h] }]
end
2012-07-26 17:21:16 +00:00
end
2012-07-26 16:18:58 +00:00
def akamai_hex_to_content_set(akamai_hex)
gzipped_hex = akamai_hex.gsub(":","").chomp("00")
gzipped_data = [gzipped_hex].pack("H*")
gzipped_data_io = StringIO.new(gzipped_data)
gz = Zlib::GzipReader.new(gzipped_data_io)
content_sets = gz.read.split("|")
begin
gz.close
rescue Zlib::GzipFile::NoFooter
end
return content_sets
end
2012-07-26 17:21:16 +00:00
def mk_hash(sgmts, parent)
2012-07-26 16:18:58 +00:00
segment = sgmts.shift
2012-07-26 17:21:16 +00:00
return parent if segment.nil?
unless parent.has_key?(segment)
parent.children[segment] = mk_hash(sgmts, Node.new(segment))
2012-07-26 16:18:58 +00:00
else
2012-08-06 20:47:54 +00:00
mk_hash(sgmts, parent[segment])
2012-07-26 17:21:16 +00:00
# else
# hash[segment].update(mk_hash(sgmts, hash[segment]))
2012-07-26 16:18:58 +00:00
end
2012-07-26 17:21:16 +00:00
return parent
2012-07-26 16:18:58 +00:00
end
2012-07-26 17:21:16 +00:00
def compress_prefix(parent)
parent.children.keys.each do |key|
child = parent.children[key]
2012-07-26 17:21:16 +00:00
compress_prefix(child)
if child.children.length == 1
puts "compressing #{key} and #{child.children.keys[0]}"
new_key = key + "/" + child.children.keys[0]
parent.children[new_key] = child
child.children = child.children.values[0].children
parent.children.delete(key)
end
2012-07-26 16:18:58 +00:00
end
2012-07-26 17:21:16 +00:00
return parent
2012-07-26 16:18:58 +00:00
end
2012-08-01 09:48:24 +00:00
def replace(list, old, new)
puts "replace"
length = list.length
list.each do |node|
node.children.keys.each do |key|
if node.children[key] == old
node.children[key] = new
end
end
end
end
2012-08-01 09:48:24 +00:00
# given a list of nodes, try and find branches that match the children of node.
2012-07-26 19:38:10 +00:00
# if found, replace those branches with node's children
2012-08-01 09:48:24 +00:00
def de_dupe(list, node)
list.each do |sub_tree|
if sub_tree == node or sub_tree.de_duped
next
end
2012-07-26 19:38:10 +00:00
# nothing
2012-08-01 09:48:24 +00:00
sub_tree.children.keys.each do |key|
next if sub_tree.children[key] == node
next if sub_tree.children[key].de_duped
if sub_tree.children[key].signature == node.signature
sub_tree.children[key].de_duped = true
sub_tree.children[key] = node
2012-08-06 21:04:30 +00:00
$log.info("Found dupe!" ) { node.signature unless node.signature == "[]" }
2012-08-01 09:48:24 +00:00
end
2012-07-26 19:38:10 +00:00
end
end
end
2012-07-27 17:47:20 +00:00
def de_dupe_driver(tree)
2012-08-01 09:48:24 +00:00
list = tree.flatten
before = list.length
i = 1
list.each do |node|
2012-08-06 21:04:30 +00:00
$log.info('de_dupe_driver') { "de dupe #{i} / #{before}" }
2012-08-01 09:48:24 +00:00
i += 1
de_dupe(list, node) unless node.de_duped
2012-07-26 19:38:10 +00:00
end
puts "Total nodes Before: #{before} After: #{tree.flatten.uniq.length}"
2012-07-26 19:38:10 +00:00
end
2012-07-27 17:47:20 +00:00
# simulate random file offsets
def ran_char(val)
val = (0..val - 1).map {rand(256).chr}.join
return val
end
2012-08-10 13:49:13 +00:00
def binary_write(file, node_list, string_huff, node_huff)
node_list.each do |node|
$log.debug('binary_write') { "begin node: " + node_huff.encode(node) }
node.children.each do |path, child|
# index of path string
$log.debug('binary_write') { "\tpath: " + path.inspect + "; encoded: " + string_huff.encode(path).inspect }
file.write_bits(string_huff.encode(path))
# offset to node
# index of node, that is.
file.write_bits(node_huff.encode(child))
$log.debug('binary_write') { "\tnode encoded: " + node_huff.encode(child) }
end
# end of node is indicated by the special sentinal huffman coding of \0
file.write_bits(string_huff.encode($sentinal))
2012-07-27 17:47:20 +00:00
end
end
def list_from_file(path)
paths = File.read(path)
paths.split("\n")
end
def tree_from_list(sets)
parent = Node.new("")
sets.each do |set|
line = set.start_with?("/") ? set[1..-1] : set
# => ["content", "beta", "rhel", "server", "6", "$releasever", "$basearch", "scalablefilesystem", "debug"]
chunks = line.split("/")
parent = mk_hash(chunks, parent)
end
parent
end
2012-07-27 17:47:20 +00:00
def write_strings(file, strings)
string_io = StringIO.new()
2012-08-09 19:09:29 +00:00
strings.each do |string|
2012-07-27 17:47:20 +00:00
string_io.write(string)
string_io.write("\0")
2012-07-26 19:38:10 +00:00
end
2012-07-27 17:47:20 +00:00
zlib = Zlib::Deflate.new(Zlib::BEST_COMPRESSION, 15, Zlib::MAX_MEM_LEVEL)
2012-07-27 19:41:44 +00:00
file.write zlib.deflate(string_io.string, Zlib::FINISH)
2012-07-27 17:47:20 +00:00
end
def collect_strings(parent)
strings = {}
parent.flatten.uniq.each do |node|
node.children.each_key do |key|
strings[key] ||= 0
strings[key] += 1
end
2012-07-27 17:47:20 +00:00
end
2012-08-09 19:09:29 +00:00
list = []
strings.sort { |l, r| l[1] <=> r[1] }.each do |string, weight|
list << string
end
list
2012-07-27 17:47:20 +00:00
end
2012-08-09 19:09:29 +00:00
def build_huffman_for_strings(strings)
paths = []
2012-08-09 19:09:29 +00:00
i = 1
strings.each do |string|
i.times { paths << string }
i += 1
end
2012-08-10 13:49:13 +00:00
# add on sentinal string
i.times { paths << $sentinal }
HuffmanEncoding.new paths
2012-07-27 19:41:44 +00:00
end
2012-07-27 17:47:20 +00:00
2012-08-10 13:49:13 +00:00
def build_node_frequencies(parent)
nodes = parent.flatten.uniq
refs = {}
nodes.each do |node|
node.children.each do |key, child|
refs[child] ||= 0
refs[child] += 1
end
2012-08-10 13:49:13 +00:00
end
list = []
refs.sort { |l, r| l[1] <=> r[1] }.each do |node, weight|
list << node
end
list
end
def build_huffman_for_nodes(list)
# parent doesn't have to go into the table
i = 1
expanded = []
2012-08-10 13:49:13 +00:00
list.each do |node|
i.times {expanded << node}
i += 1
end
table = HuffmanEncoding.new expanded
2012-07-26 16:56:45 +00:00
end
2012-07-26 16:18:58 +00:00
if $0 == __FILE__
2012-08-06 21:04:30 +00:00
if ARGV.include?("-v")
$log.level = Logger::DEBUG
ARGV.delete("-v")
end
2012-08-01 09:48:24 +00:00
if ARGV.length != 2
puts "usage: thing.rb <d|c> <file>"
puts "please specify one of d or c"
puts "d - dump an x509 cert into a newline delimited output"
puts "p - pretty print the newline delimited list, as a tree"
2012-08-01 09:48:24 +00:00
puts "c - compress the newline delimited input list of paths"
exit()
2012-07-26 16:18:58 +00:00
end
2012-08-01 09:48:24 +00:00
case ARGV[0]
when 'd'
2012-08-01 09:48:24 +00:00
cert_data = File.read(ARGV[1])
2012-07-26 16:18:58 +00:00
cert = OpenSSL::X509::Certificate.new(cert_data)
content_hex = cert.extensions.detect {|ext| ext.oid == 'subjectKeyIdentifier' }
abort('ERROR: no X509v3 extension for subjectKeyIdentifier') unless content_hex
2012-08-01 09:48:24 +00:00
ext = File.extname(ARGV[1])
txt_name = File.basename(ARGV[1], ext) + ".txt"
2012-07-26 16:18:58 +00:00
File.open(txt_name, "w+") do |file|
2012-08-01 09:48:24 +00:00
file.write(akamai_hex_to_content_set(content_hex.value).join("\n"))
file.write("\n")
2012-07-26 16:18:58 +00:00
end
2012-08-01 09:48:24 +00:00
when 'p'
sets = list_from_file(ARGV[1])
parent = tree_from_list(sets)
de_dupe_driver(parent)
pp parent.to_h
when 'c'
2012-08-01 09:48:24 +00:00
paths = File.read(ARGV[1])
sets = paths.split("\n")
ext = File.extname(ARGV[1])
binary = File.basename(ARGV[1], ext) + ".bin"
File.open(binary, "w+") do |file|
2012-07-26 17:21:16 +00:00
parent = Node.new("")
2012-07-26 16:18:58 +00:00
sets.each do |set|
line = set.start_with?("/") ? set[1..-1] : set
# => ["content", "beta", "rhel", "server", "6", "$releasever", "$basearch", "scalablefilesystem", "debug"]
chunks = line.split("/")
2012-07-26 17:21:16 +00:00
parent = mk_hash(chunks, parent)
2012-07-26 16:18:58 +00:00
end
2012-08-01 09:48:24 +00:00
puts "priming node signatures"
2012-07-27 17:47:20 +00:00
parent.signature
2012-08-01 09:48:24 +00:00
puts "removing duplicates"
2012-07-27 17:47:20 +00:00
de_dupe_driver(parent)
2012-08-01 09:48:24 +00:00
# parent = compress_prefix(parent)
2012-07-27 17:47:20 +00:00
2012-08-01 09:48:24 +00:00
puts "building huffman table for nodes"
2012-08-10 13:49:13 +00:00
node_list = build_node_frequencies(parent)
node_huff = build_huffman_for_nodes(node_list)
# XXX add sentinal value to strings to indicate end of node.
# should be most frequent one. the string itself doesn't have to
# be stored, since we just care about the bitstring.
2012-07-27 19:41:44 +00:00
strings = collect_strings(parent)
2012-08-09 19:09:29 +00:00
puts "building huffman table for strings"
string_huff = build_huffman_for_strings(strings)
puts "writing"
2012-08-01 09:48:24 +00:00
write_strings(file, strings)
2012-08-10 13:49:13 +00:00
# write out the number of unique path nodes into 1 or more bytes. if <
# 128 nodes, write in a single byte. if > 128 nodes, the first byte will
# begin with a '1' to indicate as such. the following bits in the byte
# indicate how many bytes following the first byte are used to store the
# size.
node_count = node_list.count + 1
if node_count < 128
file.write([node_count].pack("C"))
else
bits = Math.log(node_count, 2).ceil
2012-10-25 14:51:37 +00:00
bytes = (bits / 8.0).ceil
if bytes == 3
# must write this as a 32-bit int
bytes = 4
end
file.write([128 + bytes].pack("C"))
# get the correct integer directive for pack()
case bytes
when 1
# 8-bit unsigned
directive = "C"
when 2
# 16-bit unsigned big-endian
directive = "n"
when 4
# 32-bit unsigned big-endian
directive = "N"
else
# Give up. This is an impractical number of nodes.
puts("Too many nodes.")
exit(false)
end
file.write([node_count].pack(directive))
end
2012-08-10 13:49:13 +00:00
2012-08-01 09:48:24 +00:00
bit_file = BitWriter.new file
2012-08-10 13:49:13 +00:00
binary_write(bit_file, [parent] + node_list, string_huff, node_huff)
2012-07-27 19:41:44 +00:00
bit_file.pad
end
end # esac
2012-07-26 16:18:58 +00:00
end