Add the snapshot endpoint to web.py and have the phantomjs runner load the page's HTML only once there are no further pending XHR requests
This commit is contained in:
parent
bde0a29296
commit
738973cf39
4 changed files with 87 additions and 1 deletions
27
util/seo.py
Normal file
27
util/seo.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
import subprocess
|
||||
import urllib
|
||||
import os
|
||||
import logging
|
||||
import codecs
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
def renderSnapshot(path):
    """Render a static, script-free HTML snapshot of a locally served page.

    Runs ``util/phantomjs-runner.js`` under PhantomJS against
    ``http://localhost:5000/<path>`` and post-processes whatever the runner
    prints to stdout. The runner script path is relative, so it is resolved
    against the current working directory of the calling process.

    Args:
        path: Path portion appended verbatim to the local base URL.

    Returns:
        The prettified HTML with every ``<script>`` element removed, or
        ``None`` when the runner printed nothing or printed ``Not Found``.

    Raises:
        subprocess.CalledProcessError: If phantomjs exits with a non-zero
            status.
        OSError: If the phantomjs executable cannot be launched.
    """
    final_url = 'http://localhost:5000/' + path
    # Lazy logging arguments: the message is only formatted when the record
    # is actually emitted; the resulting text is unchanged.
    logger.info('Snapshotting url: %s -> %s', path, final_url)

    out_html = subprocess.check_output(['phantomjs', '--ignore-ssl-errors=yes',
                                        'util/phantomjs-runner.js', final_url])

    # check_output returns bytes on Python 3, so the sentinel must be a bytes
    # literal; on Python 2 b'Not Found' is the same object type as 'Not Found'.
    if not out_html or out_html.strip() == b'Not Found':
        return None

    # Remove script tags so the snapshot is inert markup.
    soup = BeautifulSoup(out_html)
    for script_tag in soup.findAll('script'):
        script_tag.extract()

    return soup.prettify()
|
Reference in a new issue