"""Render static HTML snapshots of pages via PhantomJS, with scripts removed."""

import logging
import subprocess

try:
    # Python 3 location of the URL helpers.
    from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit
except ImportError:
    # Python 2 fallback (the original imports of this module).
    from urllib import urlencode
    from urlparse import parse_qs, urlsplit, urlunsplit

from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


def set_query_parameter(url, param_name, param_value):
    """Return ``url`` with query parameter ``param_name`` set to ``param_value``.

    Any existing value(s) for ``param_name`` are replaced by the single new
    value; all other query parameters, including blank-valued ones, and the
    URL fragment are preserved.

    Adapted from:
    http://stackoverflow.com/questions/4293460/how-to-add-custom-parameters-to-an-url-query-string-with-python

    Args:
        url: The URL to modify.
        param_name: Name of the query parameter to set.
        param_value: New value; it is stringified by ``urlencode``.

    Returns:
        The rebuilt URL string.
    """
    scheme, netloc, path, query_string, fragment = urlsplit(url)
    # keep_blank_values=True: without it parse_qs silently drops parameters
    # such as "?debug=", so setting one parameter would delete others.
    query_params = parse_qs(query_string, keep_blank_values=True)
    query_params[param_name] = [param_value]
    new_query_string = urlencode(query_params, doseq=True)
    return urlunsplit((scheme, netloc, path, new_query_string, fragment))


def render_snapshot(url):
    """Render ``url`` with PhantomJS and return its HTML without script tags.

    The URL is requested with ``use_cdn=False`` added to its query string.

    Args:
        url: The URL of the page to snapshot.

    Returns:
        ``None`` if PhantomJS produced no output or its output is exactly
        ``Not Found`` (ignoring surrounding whitespace). Otherwise the
        rendered document as a string with all ``<script>`` elements removed.
        If the HTML cannot be decoded or parsed, the raw PhantomJS output is
        returned unmodified (scripts included).

    Raises:
        subprocess.CalledProcessError: If PhantomJS exits with a non-zero
            status.
    """
    logger.info('Snapshotting url: %s', url)

    url = set_query_parameter(url, 'use_cdn', False)
    out_html = subprocess.check_output([
        'phantomjs',
        '--ignore-ssl-errors=yes',
        '--disk-cache=yes',
        '--ssl-protocol=tlsv1',
        'util/phantomjs-runner.js',
        url,
    ])

    # check_output returns bytes; compare against a bytes literal so the
    # sentinel check also works on Python 3 (on Python 2 b'...' is str).
    if not out_html or out_html.strip() == b'Not Found':
        return None

    # Remove script tags
    logger.info('Removing script tags: %s', url)

    try:
        soup = BeautifulSoup(out_html.decode('utf8'), 'html.parser')
        for script_tag in soup.find_all('script'):
            script_tag.extract()
    except Exception:
        # Best effort: fall back to the unprocessed output rather than fail,
        # but do not swallow KeyboardInterrupt/SystemExit as a bare except did.
        logger.exception('Exception when trying to parse served HTML')
        return out_html

    return str(soup)