First stab at trying to pre-render content for search crawlers.

This commit is contained in:
yackob03 2013-10-10 20:53:14 -04:00
parent ce81431cd3
commit 785995b473
6 changed files with 60 additions and 4 deletions

View file

@ -0,0 +1,29 @@
import subprocess
import urllib
from BeautifulSoup import BeautifulSoup
# Origin of the running application whose pages are snapshotted; each entry
# in URLS is appended to this to form the address handed to phantomjs.
BASE_URL = 'http://localhost:5000'

# Prefix for every snapshot file. It is joined to the file name by plain
# string concatenation, so the trailing slash is significant.
OUTPUT_PATH = 'snapshots/'

# (path appended to BASE_URL, snapshot file name under OUTPUT_PATH) pairs.
URLS = [
    ('/', 'index.html'),
]
# Snapshot every configured page: run it through phantomjs, drop the
# <script> elements from the returned markup, and save what is left under
# OUTPUT_PATH.
for url, output in URLS:
    # check_output captures the command's stdout and raises
    # CalledProcessError if phantomjs exits with a non-zero status.
    # NOTE(review): presumably phantomjs-runner.js prints the rendered page
    # HTML to stdout -- confirm against that script.
    phantom_cmd = ['phantomjs', 'phantomjs-runner.js', BASE_URL + url]
    page_html = subprocess.check_output(phantom_cmd)

    # Detach every <script> element from the parsed tree.
    soup = BeautifulSoup(page_html)
    for script_tag in soup.findAll('script'):
        script_tag.extract()

    # The destination is OUTPUT_PATH and the file name concatenated as-is;
    # the file is opened (and truncated) before the markup is serialized.
    with open(OUTPUT_PATH + output, 'w') as snapshot_file:
        snapshot_file.write(soup.prettify())