Merge pull request #475 from coreos-inc/seofix

Use a proper HTML parser with BeautifulSoup (BS) and catch parsing exceptions
2015-09-14 15:56:03 -04:00 · 2015-09-14 15:56:03 -04:00 · 57329b6c78
commit 57329b6c78
parent 6d8752bdb5 6ca33ca108
1 changed file with 8 additions and 6 deletions
--- a/util/seo.py
+++ b/util/seo.py
@@ -32,11 +32,13 @@ def render_snapshot(url):
  # Remove script tags
  logger.info('Removing script tags: %s' % url)

-  soup = BeautifulSoup(out_html.decode('utf8'))
-  to_extract = soup.findAll('script')
-  for item in to_extract:
-    item.extract()
-
-  logger.info('Snapshotted url: %s' % url)
+  try:
+    soup = BeautifulSoup(out_html.decode('utf8'), 'html.parser')
+    to_extract = soup.findAll('script')
+    for item in to_extract:
+      item.extract()
+  except:
+    logger.exception('Exception when trying to parse served HTML')
+    return out_html

  return str(soup)