Remove the old SEO snapshots stuff.

yackob03 2013-11-18 18:43:50 -05:00
parent 6355b4a217
commit 84b9fdd007
5 changed files with 0 additions and 145 deletions

View file

@@ -24,9 +24,6 @@ sudo nginx -c `pwd`/nginx.conf
STACK=prod gunicorn -D --workers 4 -b unix:/tmp/gunicorn.sock --worker-class gevent -t 2000 application:application
```
set up the snapshot script:
[instructions in the seo-snapshots directory](seo-snapshots/README.md)
start the workers:
```

View file

@@ -1,13 +0,0 @@
Follow the instructions to set up a host running the whole project before attempting to run.

To run the snapshot script once:

```
python make_snapshot.py
```

Cron line to update the snapshots every 30 minutes:

```
0,30 * * * * cd /home/ubuntu/quay/seo-snapshots && ../venv/bin/python make_snapshot.py
```

View file

@@ -1,60 +0,0 @@
import subprocess
import urllib
import os
import logging
import codecs

from bs4 import BeautifulSoup
from Queue import Queue

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG)

BASE_URL = 'http://localhost:5000'
OUTPUT_PATH = 'snapshots/'

aware_of = set()
crawl_queue = Queue()

def crawl_url(url):
  final_url = BASE_URL + url
  to_write = OUTPUT_PATH + url + 'index.html'

  logger.info('Snapshotting url: %s -> %s' % (final_url, to_write))

  out_html = subprocess.check_output(['phantomjs', '--ignore-ssl-errors=yes',
                                      'phantomjs-runner.js', final_url])

  # Remove script tags
  soup = BeautifulSoup(out_html)
  to_extract = soup.findAll('script')
  for item in to_extract:
    item.extract()

  # Find all links and add them to the crawl queue
  for link in soup.findAll('a'):
    to_add = link.get('href')

    if to_add not in aware_of and to_add.startswith('/'):
      logger.info('Adding link to be crawled: %s' % to_add)
      crawl_queue.put(to_add)
      aware_of.add(to_add)

  to_write_dir = os.path.dirname(to_write)

  if not os.path.exists(to_write_dir):
    os.makedirs(to_write_dir)

  with codecs.open(to_write, 'w', 'utf-8') as output_file:
    output_file.write(soup.prettify())

# Seed the crawler
crawl_queue.put('/')
aware_of.add('/')

# Crawl
while not crawl_queue.empty():
  to_crawl = crawl_queue.get()
  crawl_url(to_crawl)

View file

@@ -1,46 +0,0 @@
import subprocess
import urllib
import os
import logging
import codecs

from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG)

BASE_URL = 'https://localhost/'
OUTPUT_PATH = '../static/snapshots/'

URLS = [
  '',
  'guide/',
  'plans/',
  'repository/',
  'signin/',
]

for url in URLS:
  final_url = BASE_URL + url
  to_write = OUTPUT_PATH + url + 'index.html'

  logger.info('Snapshotting url: %s -> %s' % (final_url, to_write))

  out_html = subprocess.check_output(['phantomjs', '--ignore-ssl-errors=yes',
                                      'phantomjs-runner.js', final_url])

  # Remove script tags
  soup = BeautifulSoup(out_html)
  to_extract = soup.findAll('script')
  for item in to_extract:
    item.extract()

  to_write_dir = os.path.dirname(to_write)

  if not os.path.exists(to_write_dir):
    os.makedirs(to_write_dir)

  with codecs.open(to_write, 'w', 'utf-8') as output_file:
    output_file.write(soup.prettify())
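
For reference (an illustrative listing, not part of the deleted file): with OUTPUT_PATH set to ../static/snapshots/ and the URLS list as defined above, each iteration writes its prettified HTML to OUTPUT_PATH + url + 'index.html', creating directories on demand, so a successful run would leave roughly this layout:

```
../static/snapshots/index.html
../static/snapshots/guide/index.html
../static/snapshots/plans/index.html
../static/snapshots/repository/index.html
../static/snapshots/signin/index.html
```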

View file

@@ -1,23 +0,0 @@
// Renders a single URL in PhantomJS and prints the page's HTML to stdout
// once the app marks itself ready.
var system = require('system');
var url = system.args[1] || '';

if(url.length > 0) {
  var page = require('webpage').create();

  page.open(url, function (status) {
    if (status == 'success') {
      // Poll the page until the ready-indicator element reports
      // data-status="ready", then capture the rendered HTML.
      var delay, checker = (function() {
        var html = page.evaluate(function () {
          var ready = document.getElementsByClassName('ready-indicator')[0];

          if(ready.getAttribute('data-status') == 'ready') {
            return document.getElementsByTagName('html')[0].outerHTML;
          }
        });

        if(html) {
          clearInterval(delay);
          console.log(html);
          phantom.exit();
        }
      });

      delay = setInterval(checker, 100);
    }
  });
}
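
For context (an illustrative invocation, not part of the commit): both Python scripts shell out to this runner, and it can be exercised by hand the same way, assuming phantomjs is on the PATH, the app is serving at http://localhost:5000, and the snapshots/ directory already exists:

```
phantomjs --ignore-ssl-errors=yes phantomjs-runner.js http://localhost:5000/ > snapshots/index.html
```

Because the checker polls every 100 ms with no timeout, a page that never sets data-status="ready" on its ready-indicator element will leave the command hanging.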