Add the snapshot endpoint to web.py and have the phantomjs runner only load the page's HTML once there are no further pending XHR requests

This commit is contained in:
Joseph Schorr 2013-11-18 17:11:06 -05:00
parent bde0a29296
commit 738973cf39
4 changed files with 87 additions and 1 deletions

27
util/seo.py Normal file
View file

@ -0,0 +1,27 @@
import subprocess
import urllib
import os
import logging
import codecs
from bs4 import BeautifulSoup
# Configure the root logger at DEBUG level as soon as this module is imported.
logging.basicConfig(level=logging.DEBUG)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def renderSnapshot(path):
    """Render a page of the local web server with PhantomJS and return its HTML.

    The URL ``http://localhost:5000/<path>`` is handed to the
    ``util/phantomjs-runner.js`` script (a relative path, so it is resolved
    against the current working directory of this process). Whatever the
    script prints is parsed, every ``<script>`` element is removed, and the
    remaining document is returned pretty-printed.

    Args:
        path: Path portion of the page to snapshot, appended verbatim to
            ``http://localhost:5000/``.

    Returns:
        The prettified HTML without script tags, or ``None`` when the runner
        printed nothing or printed ``Not Found``.

    Raises:
        subprocess.CalledProcessError: If ``phantomjs`` exits with a non-zero
            status.
    """
    final_url = 'http://localhost:5000/' + path
    # Pass the values as logging arguments so formatting only happens when the
    # record is actually emitted.
    logger.info('Snapshotting url: %s -> %s', path, final_url)

    out_html = subprocess.check_output(['phantomjs', '--ignore-ssl-errors=yes',
                                        'util/phantomjs-runner.js', final_url])

    # check_output returns bytes; compare against a bytes literal so the
    # "Not Found" sentinel is also recognised on Python 3, where bytes never
    # compare equal to str. On Python 2 the b'' prefix is a no-op.
    if not out_html or out_html.strip() == b'Not Found':
        return None

    # Remove script tags
    # NOTE(review): no parser is named here, so bs4 picks whichever one is
    # installed; consider pinning one so output is stable across machines.
    soup = BeautifulSoup(out_html)
    for script_tag in soup.find_all('script'):
        script_tag.extract()

    return soup.prettify()