Use a proper HTML parser with BeautifulSoup and catch exceptions

Fixes #473
2015-09-10 16:14:29 -04:00 · 2015-09-10 16:14:29 -04:00 · 6ca33ca108
commit 6ca33ca108
parent 14107893a6
1 changed files with 8 additions and 6 deletions
--- a/util/seo.py
+++ b/util/seo.py
@@ -32,11 +32,13 @@ def render_snapshot(url):
  # Remove script tags
  logger.info('Removing script tags: %s' % url)
-  soup = BeautifulSoup(out_html.decode('utf8'))
+  try:
    soup = BeautifulSoup(out_html.decode('utf8'), 'html.parser')
    to_extract = soup.findAll('script')
    for item in to_extract:
      item.extract()
-
+  except:
-  logger.info('Snapshotted url: %s' % url)
+    logger.exception('Exception when trying to parse served HTML')
    return out_html
  return str(soup)