"""Crawler-style snapshotter: renders pages with phantomjs and saves static HTML.

Starting from the site root, each crawled page is rendered headlessly,
stripped of <script> tags, scanned for site-relative links to crawl next,
and written under OUTPUT_PATH mirroring the URL path.
"""
import subprocess
import urllib
import os
import logging
import codecs

from bs4 import BeautifulSoup
from Queue import Queue


logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG)


# Root of the running app to snapshot, and directory to write snapshots into.
BASE_URL = 'http://localhost:5000'
OUTPUT_PATH = 'snapshots/'

# URLs already queued or crawled, so each page is visited at most once.
aware_of = set()
# FIFO of site-relative URLs still waiting to be snapshotted.
crawl_queue = Queue()


def crawl_url(url):
    """Snapshot a single site-relative *url*.

    Renders BASE_URL + url with phantomjs, removes <script> tags from the
    rendered DOM, queues any not-yet-seen site-relative links found on the
    page, and writes the cleaned HTML to OUTPUT_PATH + url + 'index.html'.

    Raises subprocess.CalledProcessError if phantomjs exits non-zero.
    """
    final_url = BASE_URL + url
    to_write = OUTPUT_PATH + url + 'index.html'

    logger.info('Snapshotting url: %s -> %s', final_url, to_write)

    # Render with a headless browser so client-side JS runs before capture.
    out_html = subprocess.check_output(['phantomjs', '--ignore-ssl-errors=yes',
                                        'phantomjs-runner.js', final_url])

    # Remove script tags: the snapshot must be static HTML.
    soup = BeautifulSoup(out_html)
    for item in soup.findAll('script'):
        item.extract()

    # Find all links and add unseen, site-relative ones to the crawl queue.
    for link in soup.findAll('a'):
        to_add = link.get('href')

        # BUG FIX: <a> tags without an href yield None from .get();
        # check truthiness first so startswith() never sees None.
        if to_add and to_add.startswith('/') and to_add not in aware_of:
            logger.info('Adding link to be crawled: %s', to_add)
            crawl_queue.put(to_add)
            aware_of.add(to_add)

    # Create the mirror directory for this URL path on demand.
    to_write_dir = os.path.dirname(to_write)

    if not os.path.exists(to_write_dir):
        os.makedirs(to_write_dir)

    with codecs.open(to_write, 'w', 'utf-8') as output_file:
        output_file.write(soup.prettify())


# Seed the crawler with the site root.
crawl_queue.put('/')
aware_of.add('/')

# Breadth-first crawl until no new links are discovered.
while not crawl_queue.empty():
    to_crawl = crawl_queue.get()
    crawl_url(to_crawl)