diff --git a/util/phantomjs-runner.js b/util/phantomjs-runner.js index 30b0439fa..fae6496e0 100644 --- a/util/phantomjs-runner.js +++ b/util/phantomjs-runner.js @@ -1,37 +1,55 @@ var system = require('system'); var url = system.args[1] || ''; +var count = 0; + if(url.length > 0) { var page = require('webpage').create(); page.open(url, function (status) { - if (status == 'success') { - var delay, checker = (function() { - var html = page.evaluate(function () { - var found = document.getElementsByTagName('html')[0].outerHTML || ''; - if (window.__isLoading && !window.__isLoading()) { - return found; - } - if (found.indexOf('404 Not Found') > 0) { - return found; - } - return null; - }); + try { + if (status == 'success') { + var delay; + var checker = (function() { + count++; - if (html) { - if (html.indexOf('404 Not Found') > 0) { + if (count > 100) { console.log('Not Found'); phantom.exit(); - return; + return null; } - clearTimeout(delay); - console.log(html); - phantom.exit(); - } - }); - delay = setInterval(checker, 100); - } else { + var html = page.evaluate(function () { + var found = document.getElementsByTagName('html')[0].outerHTML || ''; + if (window.__isLoading && !window.__isLoading()) { + return found; + } + if (found.indexOf('404 Not Found') > 0) { + return found; + } + return null; + }); + + if (html) { + if (html.indexOf('404 Not Found') > 0) { + console.log('Not Found'); + phantom.exit(); + return; + } + + clearTimeout(delay); + console.log(html); + phantom.exit(); + } + }); + delay = setInterval(checker, 100); + } else { + console.log('Not Found'); + phantom.exit(); + } + } catch (e) { console.log('Not Found'); phantom.exit(); } }); +} else { + phantom.exit(); } \ No newline at end of file diff --git a/util/seo.py b/util/seo.py index 42af53502..8a88b0e05 100644 --- a/util/seo.py +++ b/util/seo.py @@ -3,12 +3,12 @@ import logging from bs4 import BeautifulSoup - logger = logging.getLogger(__name__) def render_snapshot(url): logger.info('Snapshotting url: %s' % url) + out_html = subprocess.check_output(['phantomjs', '--ignore-ssl-errors=yes', 'util/phantomjs-runner.js', url]) @@ -16,9 +16,13 @@ def render_snapshot(url): return None # Remove script tags + logger.info('Removing script tags: %s' % url) + soup = BeautifulSoup(out_html.decode('utf8')) to_extract = soup.findAll('script') for item in to_extract: item.extract() + logger.info('Snapshotted url: %s' % url) + return str(soup)