"""Crawler-style snapshotter: renders pages with phantomjs and saves static HTML.

Starting from the site root, each crawled page is rendered headlessly,
stripped of <script> tags, scanned for site-relative links to crawl next,
and written under OUTPUT_PATH mirroring the URL path.
"""
import subprocess
import urllib
import os
import logging
import codecs

from bs4 import BeautifulSoup
from Queue import Queue


logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG)


# Root of the running app to snapshot, and directory to write snapshots into.
BASE_URL = 'http://localhost:5000'
OUTPUT_PATH = 'snapshots/'

# URLs already queued or crawled, so each page is visited at most once.
aware_of = set()
# FIFO of site-relative URLs still waiting to be snapshotted.
crawl_queue = Queue()


def crawl_url(url):
    """Snapshot a single site-relative *url*.

    Renders BASE_URL + url with phantomjs, removes <script> tags from the
    rendered DOM, queues any not-yet-seen site-relative links found on the
    page, and writes the cleaned HTML to OUTPUT_PATH + url + 'index.html'.

    Raises subprocess.CalledProcessError if phantomjs exits non-zero.
    """
    final_url = BASE_URL + url
    to_write = OUTPUT_PATH + url + 'index.html'

    logger.info('Snapshotting url: %s -> %s', final_url, to_write)

    # Render with a headless browser so client-side JS runs before capture.
    out_html = subprocess.check_output(['phantomjs', '--ignore-ssl-errors=yes',
                                        'phantomjs-runner.js', final_url])

    # Remove script tags: the snapshot must be static HTML.
    soup = BeautifulSoup(out_html)
    for item in soup.findAll('script'):
        item.extract()

    # Find all links and add unseen, site-relative ones to the crawl queue.
    for link in soup.findAll('a'):
        to_add = link.get('href')

        # BUG FIX: <a> tags without an href yield None from .get();
        # check truthiness first so startswith() never sees None.
        if to_add and to_add.startswith('/') and to_add not in aware_of:
            logger.info('Adding link to be crawled: %s', to_add)
            crawl_queue.put(to_add)
            aware_of.add(to_add)

    # Create the mirror directory for this URL path on demand.
    to_write_dir = os.path.dirname(to_write)

    if not os.path.exists(to_write_dir):
        os.makedirs(to_write_dir)

    with codecs.open(to_write, 'w', 'utf-8') as output_file:
        output_file.write(soup.prettify())


# Seed the crawler with the site root.
crawl_queue.put('/')
aware_of.add('/')

# Breadth-first crawl until no new links are discovered.
while not crawl_queue.empty():
    to_crawl = crawl_queue.get()
    crawl_url(to_crawl)