import io
import os
import unittest
import urllib.robotparser
from test import support
from http.server import BaseHTTPRequestHandler, HTTPServer
try:
    import _thread
    import threading
except ImportError:
    threading = None


class BaseRobotTest:
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))


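# For context: the fixtures above feed robots.txt text straight to parse().
# Outside of tests, the parser is usually pointed at a live file instead; a
# minimal sketch (the URL below is illustrative, not part of this suite):
#
#     rp = urllib.robotparser.RobotFileParser()
#     rp.set_url('http://www.example.com/robots.txt')
#     rp.read()   # fetch and parse the file over the network
#     rp.can_fetch('test_robotparser', 'http://www.example.com/tmp/')

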
class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
    """
    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
    bad = ['/cyberworld/map/index.html']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


class BaseRequestRateTest(BaseRobotTest):

    def test_request_rate(self):
        for url in self.good + self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                if self.crawl_delay:
                    self.assertEqual(
                        self.parser.crawl_delay(agent), self.crawl_delay
                    )
                if self.request_rate:
                    self.assertIsInstance(
                        self.parser.request_rate(agent),
                        urllib.robotparser.RequestRate
                    )
                    self.assertEqual(
                        self.parser.request_rate(agent).requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        self.parser.request_rate(agent).seconds,
                        self.request_rate.seconds
                    )


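# For reference, RequestRate is a named tuple with ``requests`` and
# ``seconds`` fields, so a directive such as "Request-rate: 3/15" should
# parse to RequestRate(requests=3, seconds=15).  A minimal sketch (the
# robots.txt lines here are illustrative):
#
#     rp = urllib.robotparser.RobotFileParser()
#     rp.parse(['User-agent: *', 'Request-rate: 3/15'])
#     rp.request_rate('*')   # -> RequestRate(requests=3, seconds=15)

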
class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
    """
    agent = 'figtree'
    request_rate = urllib.robotparser.RequestRate(9, 30)
    crawl_delay = 3
    good = [('figtree', '/foo.html')]
    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
           '/a%2fb.html', '/~joe/index.html']


class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    agent = 'FigTree Robot libwww-perl/5.04'
    # crawl_delay and request_rate are not tested for this agent; they are
    # reset to None so the inherited test_request_rate() skips those checks
    request_rate = None
    crawl_delay = None


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
    """
    good = ['/tmp']
    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
           '/%7Ejoe/index.html']
    crawl_delay = 3


class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
    # From bug report #523041
    robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
    """
    good = ['/foo.html']
    # bug report says "/" should be denied, but that is not in the RFC
    bad = []


class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    # also test that Allow and Disallow work well with each other
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
    """
    agent = 'Googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # User-agent entries must be matched in file order.  Note that this
    # robots.txt is tricky because "Googlebot" is a substring of
    # "Googlebot-Mobile"
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the order wrong.  You need to specify the URLs from
    # more specific to more general
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    request_rate = urllib.robotparser.RequestRate(3, 15)
    crawl_delay = 1
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


class StringFormattingTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow: /some/path
    """

    expected_output = """\
User-agent: cybermapper
Disallow: /some/path

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/

"""

    def test_string_formatting(self):
        self.assertEqual(str(self.parser), self.expected_output)


class RobotHandler(BaseHTTPRequestHandler):

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass  # suppress per-request logging noise


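# RobotHandler answers every GET, including the robots.txt fetch itself,
# with HTTP 403, which lets a test check how RobotFileParser treats a
# password-protected site.  A minimal sketch of how it could be driven,
# assuming a threaded-server pattern (class and method names illustrative):
#
#     @unittest.skipUnless(threading, 'threading required for this test')
#     class PasswordProtectedSiteTestCase(unittest.TestCase):
#         def setUp(self):
#             self.server = HTTPServer((support.HOST, 0), RobotHandler)
#             self.thread = threading.Thread(
#                 target=self.server.serve_forever,
#                 kwargs={'poll_interval': 0.01})
#             self.thread.daemon = True
#             self.thread.start()
#
#         def tearDown(self):
#             self.server.shutdown()
#             self.thread.join()
#             self.server.server_close()
#
#         def test_password_protected_site(self):
#             addr = self.server.server_address
#             url = 'http://' + support.HOST + ':' + str(addr[1])
#             parser = urllib.robotparser.RobotFileParser()
#             parser.set_url(url + '/robots.txt')
#             parser.read()
#             # a 403 on robots.txt is treated as "disallow all"
#             self.assertFalse(parser.can_fetch('*', url + '/'))

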
if __name__ == '__main__':
    unittest.main()