import subprocess
import os
import logging
import codecs

from bs4 import BeautifulSoup
from Queue import Queue

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG)

BASE_URL = 'http://localhost:5000'
OUTPUT_PATH = 'snapshots/'

# URLs already queued for crawling, and the queue of URLs left to crawl.
aware_of = set()
crawl_queue = Queue()


def crawl_url(url):
    final_url = BASE_URL + url
    # Assumes crawled paths end in '/', so each page maps to <path>/index.html.
    to_write = OUTPUT_PATH + url + 'index.html'
    logger.info('Snapshotting url: %s -> %s' % (final_url, to_write))

    # Render the page with PhantomJS and capture the resulting HTML from stdout.
    out_html = subprocess.check_output(
        ['phantomjs', '--ignore-ssl-errors=yes', 'phantomjs-runner.js', final_url])

    # Remove script tags
    soup = BeautifulSoup(out_html)
    to_extract = soup.findAll('script')
    for item in to_extract:
        item.extract()

    # Find all links and add them to the crawl queue
    for link in soup.findAll('a'):
        to_add = link.get('href')
        if to_add and to_add.startswith('/') and to_add not in aware_of:
            logger.info('Adding link to be crawled: %s' % to_add)
            crawl_queue.put(to_add)
            aware_of.add(to_add)

    # Write the snapshot, creating the output directory if necessary.
    to_write_dir = os.path.dirname(to_write)
    if not os.path.exists(to_write_dir):
        os.makedirs(to_write_dir)
    with codecs.open(to_write, 'w', 'utf-8') as output_file:
        output_file.write(soup.prettify())


# Seed the crawler
crawl_queue.put('/')
aware_of.add('/')

# Crawl
while not crawl_queue.empty():
    to_crawl = crawl_queue.get()
    crawl_url(to_crawl)
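The phantomjs-runner.js helper that the script invokes is not shown here. As a rough sketch only, a minimal runner built on the standard PhantomJS webpage and system modules might look like the following; the fixed 500 ms render delay and the bare-bones error handling are illustrative assumptions, not part of the original setup:

// phantomjs-runner.js (sketch): load the given URL, give client-side
// JavaScript a moment to render, then print the rendered HTML to stdout
// so the Python crawler can capture it with subprocess.check_output.
var system = require('system');
var page = require('webpage').create();

if (system.args.length < 2) {
    system.stderr.writeLine('Usage: phantomjs phantomjs-runner.js <url>');
    phantom.exit(1);
}

page.open(system.args[1], function (status) {
    if (status !== 'success') {
        system.stderr.writeLine('Failed to load ' + system.args[1]);
        phantom.exit(1);
    }
    // Fixed delay to let the client-side app finish rendering (an assumption;
    // a real runner might poll for a readiness flag instead).
    setTimeout(function () {
        console.log(page.content);
        phantom.exit();
    }, 500);
});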