Add another version of the snapshotter that is a crawler, unfinished.

2013-10-11 01:16:51 -04:00 · 2013-10-11 01:16:51 -04:00 · c4f3ab31d0
commit c4f3ab31d0
parent 2d9c3f2c38
1 changed files with 60 additions and 0 deletions
--- a/seo-snapshots/crawl.py
+++ b/seo-snapshots/crawl.py
@@ -0,0 +1,60 @@
+import subprocess
+import urllib
+import os
+import logging
+import codecs
+
+from bs4 import BeautifulSoup
+from Queue import Queue
+
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.DEBUG)
+
+# Prefix of every URL that is rendered, and prefix of every snapshot path.
+BASE_URL = 'http://localhost:5000'
+OUTPUT_PATH = 'snapshots/'
+# hrefs that have already been enqueued, and the hrefs still waiting to be crawled.
+aware_of = set()
+crawl_queue = Queue()
+
+def crawl_url(url):
+  """Snapshot one site-relative url and queue the links found on it.
+
+  Renders BASE_URL + url with phantomjs, strips <script> tags, writes the
+  page to OUTPUT_PATH/<url>/index.html, and adds each unseen href starting
+  with '/' to crawl_queue and aware_of. subprocess.check_output raises
+  CalledProcessError if phantomjs exits with a non-zero status.
+  """
+  final_url = BASE_URL + url
+  to_write = os.path.join(OUTPUT_PATH, url.strip('/'), 'index.html')
+  logger.info('Snapshotting url: %s -> %s' % (final_url, to_write))
+
+  out_html = subprocess.check_output(['phantomjs', '--ignore-ssl-errors=yes',
+                                      'phantomjs-runner.js', final_url])
+  soup = BeautifulSoup(out_html)
+  for script in soup.findAll('script'):
+    script.extract()
+
+  for link in soup.findAll('a'):
+    to_add = link.get('href')
+    # Anchors without an href yield None; skip them instead of crashing.
+    if to_add and to_add.startswith('/') and to_add not in aware_of:
+      logger.info('Adding link to be crawled: %s' % to_add)
+      crawl_queue.put(to_add)
+      aware_of.add(to_add)
+
+  to_write_dir = os.path.dirname(to_write)
+  if not os.path.exists(to_write_dir):
+    os.makedirs(to_write_dir)
+  with codecs.open(to_write, 'w', 'utf-8') as output_file:
+    output_file.write(soup.prettify())
+
+# Start from the site root, marking it as known so it is not queued again.
+aware_of.add('/')
+crawl_queue.put('/')
+
+# Keep crawling until the queue runs dry; crawl_url() puts every newly
+# discovered link onto crawl_queue as it processes a page.
+while not crawl_queue.empty():
+  crawl_url(crawl_queue.get())