Merge pull request #475 from coreos-inc/seofix

Use a proper HTML parser with BeautifulSoup and catch exceptions raised while parsing
This commit is contained in:
josephschorr 2015-09-14 15:56:03 -04:00
commit 57329b6c78

View file

@ -32,11 +32,13 @@ def render_snapshot(url):
# Remove script tags
logger.info('Removing script tags: %s' % url)
soup = BeautifulSoup(out_html.decode('utf8'))
to_extract = soup.findAll('script')
for item in to_extract:
item.extract()
logger.info('Snapshotted url: %s' % url)
try:
soup = BeautifulSoup(out_html.decode('utf8'), 'html.parser')
to_extract = soup.findAll('script')
for item in to_extract:
item.extract()
except:
logger.exception('Exception when trying to parse served HTML')
return out_html
return str(soup)