"""Write static HTML snapshots of a fixed list of site pages.

For every entry in ``URLS`` the script runs ``phantomjs`` with
``phantomjs-runner.js`` against ``BASE_URL + entry``, takes whatever the
process prints on stdout as the page HTML, removes all ``<script>`` tags and
writes the prettified result to ``OUTPUT_PATH/<entry>/index.html``.

The work happens at module level, so it runs as soon as the file is executed
(or imported).
"""
import codecs
import errno
import logging
import os
import subprocess
import urllib

from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG)

BASE_URL = 'https://localhost/'
OUTPUT_PATH = '../static/snapshots/'

# Paths relative to BASE_URL; each one yields <OUTPUT_PATH>/<path>/index.html.
URLS = [
    '',
    'guide/',
    'plans/',
    'repository/',
    'signin/',
]


def _render_page(page_url):
    """Return the stdout of the phantomjs runner for *page_url*.

    Raises ``subprocess.CalledProcessError`` if phantomjs exits non-zero.
    """
    # NOTE(review): presumably phantomjs-runner.js prints the rendered DOM;
    # that script is not part of this file -- confirm.
    return subprocess.check_output([
        'phantomjs',
        '--ignore-ssl-errors=yes',
        'phantomjs-runner.js',
        page_url,
    ])


def _strip_scripts(html):
    """Parse *html* and return the soup with every ``<script>`` tag removed."""
    # NOTE(review): no parser is named, so bs4 picks the "best" one installed
    # and the output can differ between machines. Pinning one (for example
    # 'html.parser') would make snapshots reproducible but may change the
    # generated markup, so it is left as-is here.
    soup = BeautifulSoup(out_html) if False else BeautifulSoup(html)
    for script_tag in soup.find_all('script'):
        script_tag.extract()
    return soup


def _ensure_directory(directory):
    """Create *directory* (and parents) unless it already exists."""
    # Try-then-handle instead of exists()-then-create: the latter can fail if
    # the directory appears between the check and the makedirs call.
    try:
        os.makedirs(directory)
    except OSError as exc:
        if exc.errno != errno.EEXIST or not os.path.isdir(directory):
            raise


for url in URLS:
    final_url = BASE_URL + url
    to_write = os.path.join(OUTPUT_PATH, url, 'index.html')

    # Lazy %-args: the message is only formatted if the record is emitted.
    logger.info('Snapshotting url: %s -> %s', final_url, to_write)

    out_html = _render_page(final_url)

    # Remove script tags
    soup = _strip_scripts(out_html)

    _ensure_directory(os.path.dirname(to_write))

    with codecs.open(to_write, 'w', 'utf-8') as output_file:
        output_file.write(soup.prettify())