Add better logging to the snapshot generator for timing purposes and make sure the PhantomJS script always exits after a maximum of 10 seconds.

2014-05-19 13:11:07 -04:00 · 2014-05-19 13:11:07 -04:00 · 1c0c551d00
commit 1c0c551d00
parent 29dc7fd079
2 changed files with 45 additions and 23 deletions
--- a/util/phantomjs-runner.js
+++ b/util/phantomjs-runner.js
@ -1,37 +1,55 @@
 var system = require('system');
 var url = system.args[1] || '';
+var count = 0;
+
 if(url.length > 0) {
  var page = require('webpage').create();  
  page.open(url, function (status) {
-    if (status == 'success') {     
-      var delay, checker = (function() {
-        var html = page.evaluate(function () {
-          var found = document.getElementsByTagName('html')[0].outerHTML || '';
-          if (window.__isLoading && !window.__isLoading()) {
-            return found;
-          }
-          if (found.indexOf('404 Not Found') > 0) {
-            return found;
-          }
-          return null;
-        });
+    try {
+      if (status == 'success') {     
+        var delay;
+        var checker = (function() {
+          count++;

-        if (html) {
-          if (html.indexOf('404 Not Found') > 0) {
+          if (count > 100) {
            console.log('Not Found');
            phantom.exit();
-            return;
+            return null;
          }

-          clearTimeout(delay);
-          console.log(html);
-          phantom.exit();
-        }
-      });
-      delay = setInterval(checker, 100);
-    } else {
+          var html = page.evaluate(function () {
+            var found = document.getElementsByTagName('html')[0].outerHTML || '';
+            if (window.__isLoading && !window.__isLoading()) {
+              return found;
+            }
+            if (found.indexOf('404 Not Found') > 0) {
+              return found;
+            }
+            return null;
+          });
+
+          if (html) {
+            if (html.indexOf('404 Not Found') > 0) {
+              console.log('Not Found');
+              phantom.exit();
+              return;
+            }
+
+            clearTimeout(delay);
+            console.log(html);
+            phantom.exit();
+          }
+        });
+        delay = setInterval(checker, 100);
+      } else {
+        console.log('Not Found');
+        phantom.exit();
+      }
+    } catch (e) {
      console.log('Not Found');
      phantom.exit();
    }
  });
+} else {
+  phantom.exit();
 }
--- a/util/seo.py
+++ b/util/seo.py
@ -3,12 +3,12 @@ import logging

 from bs4 import BeautifulSoup

-
 logger = logging.getLogger(__name__)


 def render_snapshot(url):
  logger.info('Snapshotting url: %s' % url)
+
  out_html = subprocess.check_output(['phantomjs', '--ignore-ssl-errors=yes',
                                      'util/phantomjs-runner.js', url])

@ -16,9 +16,13 @@ def render_snapshot(url):
    return None

  # Remove script tags
+  logger.info('Removing script tags: %s' % url)
+
  soup = BeautifulSoup(out_html.decode('utf8'))
  to_extract = soup.findAll('script')
  for item in to_extract:
    item.extract()

+  logger.info('Snapshotted url: %s' % url)
+
  return str(soup)