From 785995b4730c3e9bf556fd4e9d2f772d457d918c Mon Sep 17 00:00:00 2001 From: yackob03 Date: Thu, 10 Oct 2013 20:53:14 -0400 Subject: [PATCH 1/6] First stab at trying to pre-render content for search crawlers. --- requirements-nover.txt | 3 ++- seo-snapshots/make_snapshot.py | 29 +++++++++++++++++++++++++++++ seo-snapshots/phantomjs-runner.js | 23 +++++++++++++++++++++++ static/js/controllers.js | 5 ++++- static/partials/guide.html | 2 +- static/partials/landing.html | 2 +- 6 files changed, 60 insertions(+), 4 deletions(-) create mode 100644 seo-snapshots/make_snapshot.py create mode 100644 seo-snapshots/phantomjs-runner.js diff --git a/requirements-nover.txt b/requirements-nover.txt index 69d73fb1b..cd844d0cb 100644 --- a/requirements-nover.txt +++ b/requirements-nover.txt @@ -10,4 +10,5 @@ pymysql stripe gunicorn eventlet -mixpanel-py \ No newline at end of file +mixpanel-py +beautifulsoup4 \ No newline at end of file diff --git a/seo-snapshots/make_snapshot.py b/seo-snapshots/make_snapshot.py new file mode 100644 index 000000000..75ab0be9d --- /dev/null +++ b/seo-snapshots/make_snapshot.py @@ -0,0 +1,29 @@ +import subprocess +import urllib + +from BeautifulSoup import BeautifulSoup + + +BASE_URL = 'http://localhost:5000' +OUTPUT_PATH = 'snapshots/' + +URLS = [ + ('/', 'index.html') +] + +for url, output in URLS: + final_url = BASE_URL + url + + out_html = subprocess.check_output(['phantomjs', 'phantomjs-runner.js', + final_url]) + + # Remove script tags + soup = BeautifulSoup(out_html) + to_extract = soup.findAll('script') + for item in to_extract: + item.extract() + + to_write = OUTPUT_PATH + output + + with open(to_write, 'w') as output_file: + output_file.write(soup.prettify()) \ No newline at end of file diff --git a/seo-snapshots/phantomjs-runner.js b/seo-snapshots/phantomjs-runner.js new file mode 100644 index 000000000..e001b73c8 --- /dev/null +++ b/seo-snapshots/phantomjs-runner.js @@ -0,0 +1,23 @@ +var system = require('system'); +var url = system.args[1] || ''; +if(url.length > 0) { + var page = require('webpage').create(); + page.open(url, function (status) { + if (status == 'success') { + var delay, checker = (function() { + var html = page.evaluate(function () { + var ready = document.getElementsByClassName('ready-indicator')[0]; + if(ready.getAttribute('data-status') == 'ready') { + return document.getElementsByTagName('html')[0].outerHTML; + } + }); + if(html) { + clearTimeout(delay); + console.log(html); + phantom.exit(); + } + }); + delay = setInterval(checker, 100); + } + }); +} diff --git a/static/js/controllers.js b/static/js/controllers.js index 57eacb213..f297c9282 100644 --- a/static/js/controllers.js +++ b/static/js/controllers.js @@ -93,7 +93,8 @@ function PlansCtrl($scope, UserService, PlanService) { }; } -function GuideCtrl($scope, Restangular) { +function GuideCtrl($scope) { + $scope.status = 'ready'; } function RepoListCtrl($scope, Restangular, UserService) { @@ -194,6 +195,8 @@ function LandingCtrl($scope, $timeout, Restangular, UserService, KeyService) { $scope.loadingmyrepos = false; }); }; + + $scope.status = 'ready'; } function RepoCtrl($scope, Restangular, $routeParams, $rootScope) { diff --git a/static/partials/guide.html b/static/partials/guide.html index 4020cb287..5ee657c1d 100644 --- a/static/partials/guide.html +++ b/static/partials/guide.html @@ -1,4 +1,4 @@ -
+
Warning: Quay requires docker version 0.6.2 or higher to work

Getting started guide

diff --git a/static/partials/landing.html b/static/partials/landing.html index b7e2d6248..2baf6baec 100644 --- a/static/partials/landing.html +++ b/static/partials/landing.html @@ -1,4 +1,4 @@ -
+
From da29da5c66a57b48e8baa864ecfc8cc8773a0707 Mon Sep 17 00:00:00 2001 From: yackob03 Date: Thu, 10 Oct 2013 23:42:03 -0400 Subject: [PATCH 2/6] More updates to allow for static snapshotting. --- .gitignore | 1 + seo-snapshots/make_snapshot.py | 25 ++++++++++++++++++++----- static/js/controllers.js | 2 ++ static/partials/plans.html | 2 +- static/partials/repo-list.html | 2 +- 5 files changed, 25 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index beed9c400..2befc02c4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.pyc venv .elasticbeanstalk/ +static/snapshots/ \ No newline at end of file diff --git a/seo-snapshots/make_snapshot.py b/seo-snapshots/make_snapshot.py index 75ab0be9d..21ec3f7ab 100644 --- a/seo-snapshots/make_snapshot.py +++ b/seo-snapshots/make_snapshot.py @@ -1,18 +1,30 @@ import subprocess import urllib +import os +import logging from BeautifulSoup import BeautifulSoup -BASE_URL = 'http://localhost:5000' -OUTPUT_PATH = 'snapshots/' +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.DEBUG) + + +BASE_URL = 'http://localhost:5000/' +OUTPUT_PATH = '../static/snapshots/' URLS = [ - ('/', 'index.html') + '', + 'guide/', + 'plans/', + 'repository/', ] -for url, output in URLS: +for url in URLS: final_url = BASE_URL + url + to_write = OUTPUT_PATH + url + 'index.html' + + logger.info('Snapshotting url: %s -> %s' % (final_url, to_write)) out_html = subprocess.check_output(['phantomjs', 'phantomjs-runner.js', final_url]) @@ -23,7 +35,10 @@ for url, output in URLS: for item in to_extract: item.extract() - to_write = OUTPUT_PATH + output + to_write_dir = os.path.dirname(to_write) + + if not os.path.exists(to_write_dir): + os.makedirs(to_write_dir) with open(to_write, 'w') as output_file: output_file.write(soup.prettify()) \ No newline at end of file diff --git a/static/js/controllers.js b/static/js/controllers.js index f297c9282..f84d283be 100644 --- a/static/js/controllers.js +++ b/static/js/controllers.js @@ -91,6 +91,8 @@ function PlansCtrl($scope, UserService, PlanService) { $('#signinModal').modal({}); } }; + + $scope.status = 'ready'; } function GuideCtrl($scope) { diff --git a/static/partials/plans.html b/static/partials/plans.html index b1c8c0236..5c92ad3c3 100644 --- a/static/partials/plans.html +++ b/static/partials/plans.html @@ -1,4 +1,4 @@ -
+
Plans & Pricing
diff --git a/static/partials/repo-list.html b/static/partials/repo-list.html index 7350559c0..9bb340d81 100644 --- a/static/partials/repo-list.html +++ b/static/partials/repo-list.html @@ -2,7 +2,7 @@
-
+

Your Repositories

From 3c3cca11409b4a04ae031f1524c2260bef68c46f Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 11 Oct 2013 03:56:27 +0000 Subject: [PATCH 3/6] Some fixes for generating snapshots against the local host. --- requirements.txt | 1 + seo-snapshots/make_snapshot.py | 13 +++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index de2ef9669..e637b3718 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,7 @@ MarkupSafe==0.18 PyMySQL==0.5 Werkzeug==0.9.4 argparse==1.2.1 +beautifulsoup4==4.3.2 blinker==1.3 boto==2.13.3 distribute==0.6.34 diff --git a/seo-snapshots/make_snapshot.py b/seo-snapshots/make_snapshot.py index 21ec3f7ab..6ae5de64d 100644 --- a/seo-snapshots/make_snapshot.py +++ b/seo-snapshots/make_snapshot.py @@ -2,15 +2,16 @@ import subprocess import urllib import os import logging +import codecs -from BeautifulSoup import BeautifulSoup +from bs4 import BeautifulSoup logger = logging.getLogger(__name__) logging.basicConfig(level=logging.DEBUG) -BASE_URL = 'http://localhost:5000/' +BASE_URL = 'https://localhost/' OUTPUT_PATH = '../static/snapshots/' URLS = [ @@ -26,8 +27,8 @@ for url in URLS: logger.info('Snapshotting url: %s -> %s' % (final_url, to_write)) - out_html = subprocess.check_output(['phantomjs', 'phantomjs-runner.js', - final_url]) + out_html = subprocess.check_output(['phantomjs', '--ignore-ssl-errors=yes', + 'phantomjs-runner.js', final_url]) # Remove script tags soup = BeautifulSoup(out_html) @@ -40,5 +41,5 @@ for url in URLS: if not os.path.exists(to_write_dir): os.makedirs(to_write_dir) - with open(to_write, 'w') as output_file: - output_file.write(soup.prettify()) \ No newline at end of file + with codecs.open(to_write, 'w', 'utf-8') as output_file: + output_file.write(soup.prettify()) From 02c651ed36c8f0ef242c5e4984ccf2283cf5570d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 11 Oct 2013 04:41:21 +0000 Subject: [PATCH 4/6] Add rules to the nginx.conf to allow it to serve files out of the snapshot directory. --- nginx.conf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nginx.conf b/nginx.conf index c7209270e..93e5a1ce1 100644 --- a/nginx.conf +++ b/nginx.conf @@ -43,6 +43,10 @@ http { ssl_ciphers ALL:!ADH:!EXPORT56:RC4+RSA:+HIGH:+MEDIUM:+LOW:+SSLv3:+EXP; ssl_prefer_server_ciphers on; + if ($args ~ "_escaped_fragment_") { + rewrite ^ /static/snapshots$uri/index.html; + } + location /static/ { # checks for static file, if not found proxy to app alias /home/ubuntu/quay/static/; From 2d9c3f2c3888eb3371ef7be9e11375d2f4e358ee Mon Sep 17 00:00:00 2001 From: yackob03 Date: Fri, 11 Oct 2013 00:57:27 -0400 Subject: [PATCH 5/6] Add some readmes about how to run a new server as well as how to schedule the pre-render script. --- README.md | 20 ++++++++++++++++++++ seo-snapshots/README.md | 13 +++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 seo-snapshots/README.md diff --git a/README.md b/README.md index c3a52a523..02d4cbb2c 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,24 @@ +to prepare a new host: + +``` +sudo apt-add-repository -y ppa:nginx/stable +sudo apt-get update +sudo apt-get install -y git python-virtualenv python-dev phantomjs +sudo apt-get install -y nginx-full +``` + +check out the code: + +``` +git clone https://bitbucket.org/yackob03/quay.git +virtualenv --distribute venv +source venv/bin/activate +pip install -r requirements.txt +``` + running: +``` sudo nginx -c `pwd`/nginx.conf STACK=prod gunicorn -D --workers 4 -b unix:/tmp/gunicorn.sock --worker-class eventlet -t 500 application:application +``` \ No newline at end of file diff --git a/seo-snapshots/README.md b/seo-snapshots/README.md new file mode 100644 index 000000000..1340fc7fe --- /dev/null +++ b/seo-snapshots/README.md @@ -0,0 +1,13 @@ +Follow the instructions to set up a host of the whole project before attempting to run. + +to run once: + +``` +python make_snapshot.py +``` + +cron line to update every 30 minutes: + +``` +0,30 * * * * cd /home/ubuntu/quay/seo-snapshots && ../venv/bin/python make_snapshot.py +``` \ No newline at end of file From c4f3ab31d02adc9a5a20712c4116decdc4ce5f09 Mon Sep 17 00:00:00 2001 From: yackob03 Date: Fri, 11 Oct 2013 01:16:51 -0400 Subject: [PATCH 6/6] Add another version of the snapshotter that is a crawler, unfinished. --- seo-snapshots/crawl.py | 60 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 seo-snapshots/crawl.py diff --git a/seo-snapshots/crawl.py b/seo-snapshots/crawl.py new file mode 100644 index 000000000..79adcef57 --- /dev/null +++ b/seo-snapshots/crawl.py @@ -0,0 +1,60 @@ +import subprocess +import urllib +import os +import logging +import codecs + +from bs4 import BeautifulSoup +from Queue import Queue + + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.DEBUG) + + +BASE_URL = 'http://localhost:5000' +OUTPUT_PATH = 'snapshots/' + +aware_of = set() +crawl_queue = Queue() + +def crawl_url(url): + final_url = BASE_URL + url + to_write = OUTPUT_PATH + url + 'index.html' + + logger.info('Snapshotting url: %s -> %s' % (final_url, to_write)) + + out_html = subprocess.check_output(['phantomjs', '--ignore-ssl-errors=yes', + 'phantomjs-runner.js', final_url]) + + # Remove script tags + soup = BeautifulSoup(out_html) + to_extract = soup.findAll('script') + for item in to_extract: + item.extract() + + # Find all links and add them to the crawl queue + for link in soup.findAll('a'): + to_add = link.get('href') + + if to_add not in aware_of and to_add.startswith('/'): + logger.info('Adding link to be crawled: %s' % to_add) + crawl_queue.put(to_add) + aware_of.add(to_add) + + to_write_dir = os.path.dirname(to_write) + + if not os.path.exists(to_write_dir): + os.makedirs(to_write_dir) + + with codecs.open(to_write, 'w', 'utf-8') as output_file: + output_file.write(soup.prettify()) + +# Seed the crawler +crawl_queue.put('/') +aware_of.add('/') + +# Crawl +while not crawl_queue.empty(): + to_crawl = crawl_queue.get() + crawl_url(to_crawl)