Remove the old SEO snapshots stuff.
parent 6355b4a217
commit 84b9fdd007

5 changed files with 0 additions and 145 deletions
@@ -24,9 +24,6 @@ sudo nginx -c `pwd`/nginx.conf
 STACK=prod gunicorn -D --workers 4 -b unix:/tmp/gunicorn.sock --worker-class gevent -t 2000 application:application
 ```
 
-set up the snapshot script:
-(instructions in the seo-snapshots directory)[seo-snapshots/README.md]
-
 start the workers:
 
 ```

@@ -1,13 +0,0 @@
-Follow the instructions to set up a host of the whole project before attempting to run.
-
-to run once:
-
-```
-python make_snapshot.py
-```
-
-cron line to update every 30 minutes:
-
-```
-0,30 * * * * cd /home/ubuntu/quay/seo-snapshots && ../venv/bin/python make_snapshot.py
-```

@@ -1,60 +0,0 @@
-import subprocess
-import urllib
-import os
-import logging
-import codecs
-
-from bs4 import BeautifulSoup
-from Queue import Queue
-
-
-logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.DEBUG)
-
-
-BASE_URL = 'http://localhost:5000'
-OUTPUT_PATH = 'snapshots/'
-
-aware_of = set()
-crawl_queue = Queue()
-
-def crawl_url(url):
-  final_url = BASE_URL + url
-  to_write = OUTPUT_PATH + url + 'index.html'
-
-  logger.info('Snapshotting url: %s -> %s' % (final_url, to_write))
-
-  out_html = subprocess.check_output(['phantomjs', '--ignore-ssl-errors=yes',
-                                      'phantomjs-runner.js', final_url])
-
-  # Remove script tags
-  soup = BeautifulSoup(out_html)
-  to_extract = soup.findAll('script')
-  for item in to_extract:
-    item.extract()
-
-  # Find all links and add them to the crawl queue
-  for link in soup.findAll('a'):
-    to_add = link.get('href')
-
-    if to_add not in aware_of and to_add.startswith('/'):
-      logger.info('Adding link to be crawled: %s' % to_add)
-      crawl_queue.put(to_add)
-      aware_of.add(to_add)
-
-  to_write_dir = os.path.dirname(to_write)
-
-  if not os.path.exists(to_write_dir):
-    os.makedirs(to_write_dir)
-
-  with codecs.open(to_write, 'w', 'utf-8') as output_file:
-    output_file.write(soup.prettify())
-
-# Seed the crawler
-crawl_queue.put('/')
-aware_of.add('/')
-
-# Crawl
-while not crawl_queue.empty():
-  to_crawl = crawl_queue.get()
-  crawl_url(to_crawl)

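The crawler removed above targets Python 2 (`Queue`, `urllib`, `findAll`, `BeautifulSoup` without an explicit parser). For readers who only want the gist of what is going away, here is a minimal, illustrative Python 3 sketch of the same breadth-first crawl-and-snapshot loop. It is not part of this commit; the constants and the phantomjs invocation are copied from the deleted script, everything else is an assumption.

```python
# Illustrative sketch only -- not part of this commit.
import os
import subprocess
from collections import deque

from bs4 import BeautifulSoup

# Copied from the deleted script.
BASE_URL = 'http://localhost:5000'
OUTPUT_PATH = 'snapshots/'

seen = {'/'}
queue = deque(['/'])

while queue:
    url = queue.popleft()

    # Render the page with the same PhantomJS runner the deleted scripts used.
    html = subprocess.check_output(
        ['phantomjs', '--ignore-ssl-errors=yes', 'phantomjs-runner.js', BASE_URL + url])
    soup = BeautifulSoup(html, 'html.parser')

    # Drop <script> tags so the stored snapshot is static HTML.
    for script in soup.find_all('script'):
        script.extract()

    # Enqueue internal links we have not seen yet.
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and href.startswith('/') and href not in seen:
            seen.add(href)
            queue.append(href)

    # Mirror the URL path under OUTPUT_PATH, one index.html per page.
    target = os.path.join(OUTPUT_PATH, url.lstrip('/'), 'index.html')
    os.makedirs(os.path.dirname(target), exist_ok=True)
    with open(target, 'w', encoding='utf-8') as out:
        out.write(soup.prettify())
```

A `deque` plus a `seen` set stands in for the original's `Queue`/`aware_of` pair; the traversal order and the output layout are otherwise the same as the deleted code.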
@@ -1,46 +0,0 @@
-import subprocess
-import urllib
-import os
-import logging
-import codecs
-
-from bs4 import BeautifulSoup
-
-
-logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.DEBUG)
-
-
-BASE_URL = 'https://localhost/'
-OUTPUT_PATH = '../static/snapshots/'
-
-URLS = [
-  '',
-  'guide/',
-  'plans/',
-  'repository/',
-  'signin/',
-]
-
-for url in URLS:
-  final_url = BASE_URL + url
-  to_write = OUTPUT_PATH + url + 'index.html'
-
-  logger.info('Snapshotting url: %s -> %s' % (final_url, to_write))
-
-  out_html = subprocess.check_output(['phantomjs', '--ignore-ssl-errors=yes',
-                                      'phantomjs-runner.js', final_url])
-
-  # Remove script tags
-  soup = BeautifulSoup(out_html)
-  to_extract = soup.findAll('script')
-  for item in to_extract:
-    item.extract()
-
-  to_write_dir = os.path.dirname(to_write)
-
-  if not os.path.exists(to_write_dir):
-    os.makedirs(to_write_dir)
-
-  with codecs.open(to_write, 'w', 'utf-8') as output_file:
-    output_file.write(soup.prettify())

@@ -1,23 +0,0 @@
-var system = require('system');
-var url = system.args[1] || '';
-if(url.length > 0) {
-  var page = require('webpage').create();
-  page.open(url, function (status) {
-    if (status == 'success') {
-      var delay, checker = (function() {
-        var html = page.evaluate(function () {
-          var ready = document.getElementsByClassName('ready-indicator')[0];
-          if(ready.getAttribute('data-status') == 'ready') {
-            return document.getElementsByTagName('html')[0].outerHTML;
-          }
-        });
-        if(html) {
-          clearTimeout(delay);
-          console.log(html);
-          phantom.exit();
-        }
-      });
-      delay = setInterval(checker, 100);
-    }
-  });
-}
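One quirk of the deleted runner above: if `page.open` does not report `success`, or the `ready-indicator` element never reaches `data-status="ready"`, it never calls `phantom.exit()` and the process hangs. A hedged sketch of how a caller could guard against that on the Python side, assuming Python 3.3+ for the `timeout` argument; `render_snapshot` is a made-up helper name and nothing here is part of the commit:

```python
# Illustrative sketch only -- not part of this commit.
import subprocess

def render_snapshot(url, timeout_seconds=30):
    # check_output accepts a `timeout` argument on Python 3.3+ and raises
    # subprocess.TimeoutExpired (after killing phantomjs) when it fires.
    # The phantomjs arguments are the ones used by the deleted scripts.
    return subprocess.check_output(
        ['phantomjs', '--ignore-ssl-errors=yes', 'phantomjs-runner.js', url],
        timeout=timeout_seconds)
```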