42 lines
		
	
	
	
		
			1.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			42 lines
		
	
	
	
		
			1.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import subprocess
 | |
| import logging
 | |
| 
 | |
| from urllib import urlencode
 | |
| from urlparse import parse_qs, urlsplit, urlunsplit
 | |
| from bs4 import BeautifulSoup
 | |
| 
 | |
| logger = logging.getLogger(__name__)
 | |
| 
 | |
def set_query_parameter(url, param_name, param_value):
  """Return `url` with the query parameter `param_name` set to `param_value`.

  Any values already present for `param_name` are replaced by the single new
  value. All other query parameters (including ones with empty values) and
  the fragment are carried over unchanged.

  Args:
    url: The URL to modify.
    param_name: Name of the query parameter to set.
    param_value: New value for the parameter; `urlencode` converts it to its
      string form (e.g. `False` becomes `False`).

  Returns:
    The rebuilt URL as a string.
  """
  # From: http://stackoverflow.com/questions/4293460/how-to-add-custom-parameters-to-an-url-query-string-with-python
  scheme, netloc, path, query_string, fragment = urlsplit(url)

  # keep_blank_values=True: by default parse_qs silently discards parameters
  # with an empty value (`?flag=`), which would strip them from the result.
  query_params = parse_qs(query_string, keep_blank_values=True)
  query_params[param_name] = [param_value]

  # doseq=True expands each list of values into repeated `name=value` pairs.
  new_query_string = urlencode(query_params, doseq=True)

  return urlunsplit((scheme, netloc, path, new_query_string, fragment))
 | |
| 
 | |
| 
 | |
def render_snapshot(url):
  """Render `url` with PhantomJS and return its HTML with script tags removed.

  The URL is first rewritten to carry `use_cdn=False`, then passed to
  `util/phantomjs-runner.js`, whose standard output is taken as the page HTML.

  Args:
    url: Address of the page to snapshot.

  Returns:
    The serialized HTML with every `<script>` element removed, or None when
    the runner printed nothing or only `Not Found`.

  Raises:
    subprocess.CalledProcessError: If the phantomjs process exits with a
      non-zero status.
  """
  logger.info('Snapshotting url: %s', url)

  url = set_query_parameter(url, 'use_cdn', False)
  out_html = subprocess.check_output(['phantomjs', '--ignore-ssl-errors=yes',
                                      '--disk-cache=yes', '--ssl-protocol=tlsv1',
                                      'util/phantomjs-runner.js', url])

  # check_output returns raw bytes, so compare against a bytes literal: a text
  # literal never equals bytes on Python 3 (on Python 2 the two are the same).
  if not out_html or out_html.strip() == b'Not Found':
    return None

  # Remove script tags
  logger.info('Removing script tags: %s', url)

  # NOTE(review): no parser is named here, so bs4 chooses one based on what is
  # installed -- confirm whether a specific parser should be pinned.
  soup = BeautifulSoup(out_html.decode('utf8'))
  for script_tag in soup.find_all('script'):
    script_tag.extract()

  logger.info('Snapshotted url: %s', url)

  return str(soup)
 |