cosmopolitan/third_party/python/Lib/test/test_robotparser.py

import io
import os
import unittest
import urllib.robotparser
from test import support
from http.server import BaseHTTPRequestHandler, HTTPServer
try:
    import _thread
    import threading
except ImportError:
    threading = None


class BaseRobotTest:
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))


class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
    """
    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
    bad = ['/cyberworld/map/index.html']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


class BaseRequestRateTest(BaseRobotTest):

    def test_request_rate(self):
        for url in self.good + self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                if self.crawl_delay:
                    self.assertEqual(
                        self.parser.crawl_delay(agent), self.crawl_delay
                    )
                if self.request_rate:
                    self.assertIsInstance(
                        self.parser.request_rate(agent),
                        urllib.robotparser.RequestRate
                    )
                    self.assertEqual(
                        self.parser.request_rate(agent).requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        self.parser.request_rate(agent).seconds,
                        self.request_rate.seconds
                    )


class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
    """
    agent = 'figtree'
    request_rate = urllib.robotparser.RequestRate(9, 30)
    crawl_delay = 3
    good = [('figtree', '/foo.html')]
    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
           '/a%2fb.html', '/~joe/index.html']


class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    agent = 'FigTree Robot libwww-perl/5.04'
    # these are not actually tested, but we still need to parse it
    # in order to accommodate the input parameters
    request_rate = None
    crawl_delay = None


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
    """
    good = ['/tmp']
    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
           '/%7Ejoe/index.html']
    crawl_delay = 3


class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
    # From bug report #523041
    robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
    """
    good = ['/foo.html']
    # bug report says "/" should be denied, but that is not in the RFC
    bad = []


class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    # also test that Allow and Diasallow works well with each other
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
    """
    agent = 'Googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # the order of User-agent should be correct. note
    # that this file is incorrect because "Googlebot" is a
    # substring of "Googlebot-Mobile"
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the order wrong. You need
    # to specify the URLs from more specific to more general
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    request_rate = urllib.robotparser.RequestRate(3, 15)
    crawl_delay = 1
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


class StringFormattingTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow: /some/path
    """

    expected_output = """\
User-agent: cybermapper
Disallow: /some/path

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/

"""

    def test_string_formatting(self):
        self.assertEqual(str(self.parser), self.expected_output)


class RobotHandler(BaseHTTPRequestHandler):

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass

if __name__=='__main__':
    unittest.main()
python-3.6.zip added from Github README.cosmo contains the necessary links. 2021-08-08 09:38:33 +05:30			`import io`
			`import os`
			`import unittest`
			`import urllib.robotparser`
			`from test import support`
			`from http.server import BaseHTTPRequestHandler, HTTPServer`
			`try:`
Source changes for compilation These are the commits from https://github.com/ahgamut/cpython/tree/cosmo_py36 squashed for simplicity. Also included is the pyconfig.h used for compilation. The pyconfig.h has to be changed manually in case Cosmopolitan gets new features. 2021-08-08 19:22:49 +05:30			`import _thread`
python-3.6.zip added from Github README.cosmo contains the necessary links. 2021-08-08 09:38:33 +05:30			`import threading`
			`except ImportError:`
			`threading = None`


			`class BaseRobotTest:`
			`robots_txt = ''`
			`agent = 'test_robotparser'`
			`good = []`
			`bad = []`

			`def setUp(self):`
			`lines = io.StringIO(self.robots_txt).readlines()`
			`self.parser = urllib.robotparser.RobotFileParser()`
			`self.parser.parse(lines)`

			`def get_agent_and_url(self, url):`
			`if isinstance(url, tuple):`
			`agent, url = url`
			`return agent, url`
			`return self.agent, url`

			`def test_good_urls(self):`
			`for url in self.good:`
			`agent, url = self.get_agent_and_url(url)`
			`with self.subTest(url=url, agent=agent):`
			`self.assertTrue(self.parser.can_fetch(agent, url))`

			`def test_bad_urls(self):`
			`for url in self.bad:`
			`agent, url = self.get_agent_and_url(url)`
			`with self.subTest(url=url, agent=agent):`
			`self.assertFalse(self.parser.can_fetch(agent, url))`


			`class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):`
			`robots_txt = """\`
			`User-agent: *`
			`Disallow: /cyberworld/map/ # This is an infinite virtual URL space`
			`Disallow: /tmp/ # these will soon disappear`
			`Disallow: /foo.html`
			`"""`
			`good = ['/', '/test.html']`
			`bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']`


			`class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):`
			`robots_txt = """\`
			`# robots.txt for http://www.example.com/`

			`User-agent: *`
			`Crawl-delay: 1`
			`Request-rate: 3/15`
			`Disallow: /cyberworld/map/ # This is an infinite virtual URL space`

			`# Cybermapper knows where to go.`
			`User-agent: cybermapper`
			`Disallow:`
			`"""`
			`good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]`
			`bad = ['/cyberworld/map/index.html']`


			`class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):`
			`robots_txt = """\`
			`# go away`
			`User-agent: *`
			`Disallow: /`
			`"""`
			`good = []`
			`bad = ['/cyberworld/map/index.html', '/', '/tmp/']`


			`class BaseRequestRateTest(BaseRobotTest):`

			`def test_request_rate(self):`
			`for url in self.good + self.bad:`
			`agent, url = self.get_agent_and_url(url)`
			`with self.subTest(url=url, agent=agent):`
			`if self.crawl_delay:`
			`self.assertEqual(`
			`self.parser.crawl_delay(agent), self.crawl_delay`
			`)`
			`if self.request_rate:`
			`self.assertIsInstance(`
			`self.parser.request_rate(agent),`
			`urllib.robotparser.RequestRate`
			`)`
			`self.assertEqual(`
			`self.parser.request_rate(agent).requests,`
			`self.request_rate.requests`
			`)`
			`self.assertEqual(`
			`self.parser.request_rate(agent).seconds,`
			`self.request_rate.seconds`
			`)`


			`class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):`
			`robots_txt = """\`
			`User-agent: figtree`
			`Crawl-delay: 3`
			`Request-rate: 9/30`
			`Disallow: /tmp`
			`Disallow: /a%3cd.html`
			`Disallow: /a%2fb.html`
			`Disallow: /%7ejoe/index.html`
			`"""`
			`agent = 'figtree'`
			`request_rate = urllib.robotparser.RequestRate(9, 30)`
			`crawl_delay = 3`
			`good = [('figtree', '/foo.html')]`
			`bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',`
			`'/a%2fb.html', '/~joe/index.html']`


			`class DifferentAgentTest(CrawlDelayAndRequestRateTest):`
			`agent = 'FigTree Robot libwww-perl/5.04'`
			`# these are not actually tested, but we still need to parse it`
			`# in order to accommodate the input parameters`
			`request_rate = None`
			`crawl_delay = None`


			`class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):`
			`robots_txt = """\`
			`User-agent: *`
			`Disallow: /tmp/`
			`Disallow: /a%3Cd.html`
			`Disallow: /a/b.html`
			`Disallow: /%7ejoe/index.html`
			`Crawl-delay: 3`
			`Request-rate: 9/banana`
			`"""`
			`good = ['/tmp']`
			`bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',`
			`'/%7Ejoe/index.html']`
			`crawl_delay = 3`


			`class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):`
			`# From bug report #523041`
			`robots_txt = """\`
			`User-Agent: *`
			`Disallow: /.`
			`Crawl-delay: pears`
			`"""`
			`good = ['/foo.html']`
			`# bug report says "/" should be denied, but that is not in the RFC`
			`bad = []`


			`class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):`
			`# also test that Allow and Diasallow works well with each other`
			`robots_txt = """\`
			`User-agent: Googlebot`
			`Allow: /folder1/myfile.html`
			`Disallow: /folder1/`
			`Request-rate: whale/banana`
			`"""`
			`agent = 'Googlebot'`
			`good = ['/folder1/myfile.html']`
			`bad = ['/folder1/anotherfile.html']`


			`class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):`
			`# the order of User-agent should be correct. note`
			`# that this file is incorrect because "Googlebot" is a`
			`# substring of "Googlebot-Mobile"`
			`robots_txt = """\`
			`User-agent: Googlebot`
			`Disallow: /`

			`User-agent: Googlebot-Mobile`
			`Allow: /`
			`"""`
			`agent = 'Googlebot'`
			`bad = ['/something.jpg']`


			`class UserAgentGoogleMobileTest(UserAgentOrderingTest):`
			`agent = 'Googlebot-Mobile'`


			`class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):`
			`# Google also got the order wrong. You need`
			`# to specify the URLs from more specific to more general`
			`robots_txt = """\`
			`User-agent: Googlebot`
			`Allow: /folder1/myfile.html`
			`Disallow: /folder1/`
			`"""`
			`agent = 'googlebot'`
			`good = ['/folder1/myfile.html']`
			`bad = ['/folder1/anotherfile.html']`


			`class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):`
			`# see issue #6325 for details`
			`robots_txt = """\`
			`User-agent: *`
			`Disallow: /some/path?name=value`
			`"""`
			`good = ['/some/path']`
			`bad = ['/some/path?name=value']`


			`class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):`
			`# obey first * entry (#4108)`
			`robots_txt = """\`
			`User-agent: *`
			`Disallow: /some/path`

			`User-agent: *`
			`Disallow: /another/path`
			`"""`
			`good = ['/another/path']`
			`bad = ['/some/path']`


			`class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):`
			`# normalize the URL first (#17403)`
			`robots_txt = """\`
			`User-agent: *`
			`Allow: /some/path?`
			`Disallow: /another/path?`
			`"""`
			`good = ['/some/path?']`
			`bad = ['/another/path?']`


			`class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):`
			`robots_txt = """\`
			`User-agent: *`
			`Crawl-delay: 1`
			`Request-rate: 3/15`
			`Disallow: /cyberworld/map/`
			`"""`
			`request_rate = urllib.robotparser.RequestRate(3, 15)`
			`crawl_delay = 1`
			`good = ['/', '/test.html']`
			`bad = ['/cyberworld/map/index.html']`


			`class StringFormattingTest(BaseRobotTest, unittest.TestCase):`
			`robots_txt = """\`
			`User-agent: *`
			`Crawl-delay: 1`
			`Request-rate: 3/15`
			`Disallow: /cyberworld/map/ # This is an infinite virtual URL space`

			`# Cybermapper knows where to go.`
			`User-agent: cybermapper`
			`Disallow: /some/path`
			`"""`

			`expected_output = """\`
			`User-agent: cybermapper`
			`Disallow: /some/path`

			`User-agent: *`
			`Crawl-delay: 1`
			`Request-rate: 3/15`
			`Disallow: /cyberworld/map/`

			`"""`

			`def test_string_formatting(self):`
			`self.assertEqual(str(self.parser), self.expected_output)`


			`class RobotHandler(BaseHTTPRequestHandler):`

			`def do_GET(self):`
			`self.send_error(403, "Forbidden access")`

			`def log_message(self, format, *args):`
			`pass`

			`if __name__=='__main__':`
			`unittest.main()`