import subprocess
import urllib
import os
import logging
import codecs

from bs4 import BeautifulSoup
try:
    from Queue import Queue  # Python 2
except ImportError:
    from queue import Queue  # Python 3
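
# Note: besides beautifulsoup4, this script needs the PhantomJS binary on
# PATH and a phantomjs-runner.js helper script next to this file.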

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG)

BASE_URL = 'http://localhost:5000'
OUTPUT_PATH = 'snapshots/'

# URLs already queued (so each page is snapshotted once) and the work queue.
aware_of = set()
crawl_queue = Queue()


def crawl_url(url):
    final_url = BASE_URL + url
    # Note: this concatenation assumes url starts and ends with '/'; a path
    # like '/about' would produce 'snapshots//aboutindex.html'.
    to_write = OUTPUT_PATH + url + 'index.html'

    logger.info('Snapshotting url: %s -> %s' % (final_url, to_write))

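    # phantomjs-runner.js is an external helper (not shown here); judging by
    # how it is invoked, it loads final_url in PhantomJS, lets the page's
    # JavaScript render, and prints the resulting HTML on stdout.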
    out_html = subprocess.check_output(['phantomjs', '--ignore-ssl-errors=yes',
                                        'phantomjs-runner.js', final_url])

    # Remove script tags
    soup = BeautifulSoup(out_html, 'html.parser')
    for item in soup.findAll('script'):
        item.extract()

    # Find all links and add them to the crawl queue
    for link in soup.findAll('a'):
        to_add = link.get('href')

        # Only queue unseen site-relative links; link.get() returns None for
        # anchors without an href, so check to_add before calling startswith.
        if to_add and to_add.startswith('/') and to_add not in aware_of:
            logger.info('Adding link to be crawled: %s' % to_add)
            crawl_queue.put(to_add)
            aware_of.add(to_add)

    # Mirror the URL path under OUTPUT_PATH, creating directories as needed
    to_write_dir = os.path.dirname(to_write)

    if not os.path.exists(to_write_dir):
        os.makedirs(to_write_dir)

    with codecs.open(to_write, 'w', 'utf-8') as output_file:
        output_file.write(soup.prettify())


# Seed the crawler
crawl_queue.put('/')
aware_of.add('/')

# Crawl until the queue is empty, i.e. no new same-site links remain
while not crawl_queue.empty():
    to_crawl = crawl_queue.get()
    crawl_url(to_crawl)
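
# For illustration: if the site exposes '/' and '/about/', this run would
# leave behind snapshots/index.html and snapshots/about/index.html.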