Remove escaped_fragment snapshot rendering.
This commit is contained in:
parent
fea47bdaed
commit
746728ba24
5 changed files with 0 additions and 64 deletions
44
util/seo.py
44
util/seo.py
|
@ -1,44 +0,0 @@
|
|||
import subprocess
|
||||
import logging
|
||||
|
||||
from urllib import urlencode
|
||||
from urlparse import parse_qs, urlsplit, urlunsplit
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def set_query_parameter(url, param_name, param_value):
    """Return ``url`` with query parameter ``param_name`` set to ``param_value``.

    Any existing values for ``param_name`` are replaced by the single new
    value. All other query parameters (including ones with blank values) and
    the fragment are carried over to the rebuilt URL.

    Args:
        url: The URL to modify.
        param_name: Name of the query parameter to set.
        param_value: Value to assign; it is stringified by ``urlencode``.

    Returns:
        The rebuilt URL string.
    """
    # From: http://stackoverflow.com/questions/4293460/how-to-add-custom-parameters-to-an-url-query-string-with-python
    scheme, netloc, path, query_string, fragment = urlsplit(url)

    # keep_blank_values=True so that parameters such as ``?a=`` are not
    # silently dropped when the query string is rebuilt.
    query_params = parse_qs(query_string, keep_blank_values=True)
    query_params[param_name] = [param_value]
    new_query_string = urlencode(query_params, doseq=True)

    return urlunsplit((scheme, netloc, path, new_query_string, fragment))
|
||||
|
||||
|
||||
def render_snapshot(url):
    """Render ``url`` with PhantomJS and return its HTML with scripts removed.

    The URL is first rewritten so that ``use_cdn`` is set to ``False``, then
    passed to ``util/phantomjs-runner.js`` via the ``phantomjs`` executable.

    Args:
        url: The URL of the page to snapshot.

    Returns:
        ``None`` if PhantomJS produced no output or the output was
        ``Not Found``; the HTML with all ``<script>`` tags removed on success;
        or the raw PhantomJS output if parsing the HTML failed.

    Raises:
        subprocess.CalledProcessError: If ``phantomjs`` exits with a non-zero
            status.
    """
    logger.info('Snapshotting url: %s', url)

    url = set_query_parameter(url, 'use_cdn', False)
    out_html = subprocess.check_output([
        'phantomjs', '--ignore-ssl-errors=yes',
        '--disk-cache=yes', '--ssl-protocol=tlsv1',
        'util/phantomjs-runner.js', url,
    ])

    if not out_html or out_html.strip() == 'Not Found':
        return None

    # Remove script tags
    logger.info('Removing script tags: %s', url)

    try:
        soup = BeautifulSoup(out_html.decode('utf8'), 'html.parser')
        for item in soup.findAll('script'):
            item.extract()
    except Exception:
        # Best effort: fall back to the unmodified output rather than failing
        # the request. ``Exception`` (not a bare ``except``) so that
        # KeyboardInterrupt/SystemExit still propagate.
        logger.exception('Exception when trying to parse served HTML')
        return out_html

    return str(soup)
|
Reference in a new issue