Add better logging to the snapshot generator for timing purposes and make sure the PhantomJS script always exits after a maximum of 10 seconds.

This commit is contained in:
Joseph Schorr 2014-05-19 13:11:07 -04:00
parent 29dc7fd079
commit 1c0c551d00
2 changed files with 45 additions and 23 deletions

View file

@ -1,37 +1,55 @@
var system = require('system'); var system = require('system');
var url = system.args[1] || ''; var url = system.args[1] || '';
var count = 0;
if(url.length > 0) { if(url.length > 0) {
var page = require('webpage').create(); var page = require('webpage').create();
page.open(url, function (status) { page.open(url, function (status) {
if (status == 'success') { try {
var delay, checker = (function() { if (status == 'success') {
var html = page.evaluate(function () { var delay;
var found = document.getElementsByTagName('html')[0].outerHTML || ''; var checker = (function() {
if (window.__isLoading && !window.__isLoading()) { count++;
return found;
}
if (found.indexOf('404 Not Found') > 0) {
return found;
}
return null;
});
if (html) { if (count > 100) {
if (html.indexOf('404 Not Found') > 0) {
console.log('Not Found'); console.log('Not Found');
phantom.exit(); phantom.exit();
return; return null;
} }
clearTimeout(delay); var html = page.evaluate(function () {
console.log(html); var found = document.getElementsByTagName('html')[0].outerHTML || '';
phantom.exit(); if (window.__isLoading && !window.__isLoading()) {
} return found;
}); }
delay = setInterval(checker, 100); if (found.indexOf('404 Not Found') > 0) {
} else { return found;
}
return null;
});
if (html) {
if (html.indexOf('404 Not Found') > 0) {
console.log('Not Found');
phantom.exit();
return;
}
clearTimeout(delay);
console.log(html);
phantom.exit();
}
});
delay = setInterval(checker, 100);
} else {
console.log('Not Found');
phantom.exit();
}
} catch (e) {
console.log('Not Found'); console.log('Not Found');
phantom.exit(); phantom.exit();
} }
}); });
} else {
phantom.exit();
} }

View file

@ -3,12 +3,12 @@ import logging
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def render_snapshot(url): def render_snapshot(url):
logger.info('Snapshotting url: %s' % url) logger.info('Snapshotting url: %s' % url)
out_html = subprocess.check_output(['phantomjs', '--ignore-ssl-errors=yes', out_html = subprocess.check_output(['phantomjs', '--ignore-ssl-errors=yes',
'util/phantomjs-runner.js', url]) 'util/phantomjs-runner.js', url])
@ -16,9 +16,13 @@ def render_snapshot(url):
return None return None
# Remove script tags # Remove script tags
logger.info('Removing script tags: %s' % url)
soup = BeautifulSoup(out_html.decode('utf8')) soup = BeautifulSoup(out_html.decode('utf8'))
to_extract = soup.findAll('script') to_extract = soup.findAll('script')
for item in to_extract: for item in to_extract:
item.extract() item.extract()
logger.info('Snapshotted url: %s' % url)
return str(soup) return str(soup)