💾 Archived View for gmn.clttr.info › sources › geminispace.info.git › tree › tests › gus › lib › tes… captured on 2023-09-08 at 16:34:00.
⬅️ Previous capture (2023-06-14)
-=-=-=-=-=-=-
"""Tests for ``gus.lib.gemini``: ``GeminiResource`` and ``GeminiRobotFileParser``."""

import pytest

from gus.lib.gemini import GeminiResource, GeminiRobotFileParser


class TestGeminiResource:
    """Exercise link extraction and URL handling of ``GeminiResource``."""

    def test_extract_contained_resources(self):
        """Check which lines of a document are reported as contained links."""
        url = "gemini://host"

        # no content
        resources = GeminiResource(url).extract_contained_resources("")
        assert resources == []

        # not a link: leading whitespace before "=>"
        resources = GeminiResource(url).extract_contained_resources(" => link")
        assert resources == []

        # not a link: "=>" inside a preformatted block
        resources = GeminiResource(url).extract_contained_resources(
            "```\n=> preformatted\n```"
        )
        assert resources == []

        # some links
        resources = GeminiResource(url).extract_contained_resources(
            "=> link\ntext\n=> other"
        )
        assert len(resources) == 2
        assert resources[0].raw_url == "link"
        assert resources[1].raw_url == "other"

        # Links mixed with a heading and plain text.  The document is spelled
        # with explicit "\n" so every link line provably starts at column 0.
        resources = GeminiResource(url).extract_contained_resources(
            "\n"
            "# title\n"
            "text\n"
            "=> link\n"
            "text\n"
            "=> other\n"
        )
        assert len(resources) == 2
        assert resources[0].raw_url == "link"
        assert resources[1].raw_url == "other"

    # Each case is ([url, parent_url, host], [is_valid, normalized_host,
    # fetchable_url, normalized_host_like]); the first list is passed
    # positionally to GeminiResource.
    @pytest.mark.parametrize(
        "test_input,expected_result",
        [
            (
                ["gemini://gus.guru", None, None],
                [True, "gus.guru", "gemini://gus.guru/", "gus.guru"],
            ),
            (
                ["gemini://gus.guru/search?text", None, None],
                [True, "gus.guru", "gemini://gus.guru/search?text", "gus.guru"],
            ),
            (
                ["/bar", "gemini://gus.guru/foo", None],
                [False, None, None, None],
            ),
            (
                ["/bar", "gemini://gus.guru/foo/", None],
                [False, None, None, None],
            ),
            (
                ["/bar", "gemini://gus.guru/foo", "gus.guru"],
                [True, "gus.guru", "gemini://gus.guru/bar", "gus.guru"],
            ),
            (
                ["/bar", "gemini://gus.guru/foo/", "gus.guru"],
                [True, "gus.guru", "gemini://gus.guru/bar", "gus.guru"],
            ),
            (
                ["/bar?test", "gemini://gus.guru/foo/", "gus.guru"],
                [True, "gus.guru", "gemini://gus.guru/bar?test", "gus.guru"],
            ),
            (
                ["bar", "gemini://gus.guru/foo", "gus.guru"],
                [True, "gus.guru", "gemini://gus.guru/bar", "gus.guru"],
            ),
            (
                ["bar/", "gemini://gus.guru/foo/", "gus.guru"],
                [True, "gus.guru", "gemini://gus.guru/foo/bar/", "gus.guru"],
            ),
            (
                ["//foo.com", None, None],
                [True, "foo.com", "gemini://foo.com/", "foo.com"],
            ),
            (
                ["gemini://gem.Splatt9990.com/index.gmi", None, None],
                [
                    True,
                    "gem.splatt9990.com",
                    "gemini://gem.splatt9990.com/index.gmi",
                    "gem.splatt9990.com",
                ],
            ),
            (
                ["gemini://gem.Splatt9990.com:1965/index.gmi", None, None],
                [
                    True,
                    "gem.splatt9990.com",
                    "gemini://gem.splatt9990.com/index.gmi",
                    "gem.splatt9990.com",
                ],
            ),
            (
                ["gemini://gem.splatt9990.com:1966/index.gmi", None, None],
                [
                    True,
                    "gem.splatt9990.com",
                    "gemini://gem.splatt9990.com:1966/index.gmi",
                    "gem.splatt9990.com",
                ],
            ),
            (
                ["gemini://MichaelNordmeyer.com", None, None],
                [
                    True,
                    "michaelnordmeyer.com",
                    "gemini://michaelnordmeyer.com/",
                    "michaelnordmeyer.com",
                ],
            ),
            (
                ["log.gmi", "gemini://MichaelNordmeyer.com:1965/", None],
                [
                    True,
                    "michaelnordmeyer.com",
                    "gemini://michaelnordmeyer.com/log.gmi",
                    "michaelnordmeyer.com",
                ],
            ),
            (
                ["Log.gmi", "gemini://MichaelNordmeyer.com/", None],
                [
                    True,
                    "michaelnordmeyer.com",
                    "gemini://michaelnordmeyer.com/Log.gmi",
                    "michaelnordmeyer.com",
                ],
            ),
            (
                ["gemini://tilde.pink/~emily/log/productivity.gmi", None, None],
                [
                    True,
                    "tilde.pink",
                    "gemini://tilde.pink/~emily/log/productivity.gmi",
                    "tilde.pink/~emily",
                ],
            ),
            (
                ["gemini://tilde.pink/users/emily/index.gmi", None, None],
                [
                    True,
                    "tilde.pink",
                    "gemini://tilde.pink/users/emily/index.gmi",
                    "tilde.pink/users/emily",
                ],
            ),
        ],
    )
    def test_url_parsing(self, test_input, expected_result):
        """Compare validity and normalised URL attributes against each case."""
        gr = GeminiResource(test_input[0], test_input[1], test_input[2])
        assert gr.is_valid == expected_result[0]
        assert gr.normalized_host == expected_result[1]
        assert gr.fetchable_url == expected_result[2]
        assert gr.normalized_host_like == expected_result[3]

    @pytest.mark.parametrize(
        "test_url,expected_result",
        [
            ("gemini://gus.guru", True),
            ("gemini://gus.guru/", True),
            ("gemini://gus.guru/franz", False),
            ("gemini://gus.guru/~franz", True),
            ("gemini://gus.guru/~franz/foo", False),
        ],
    )
    def test_is_root_like(self, test_url, expected_result):
        """Compare ``is_root_like`` against the expected flag for each URL."""
        gr = GeminiResource(test_url)
        assert gr.is_root_like == expected_result


class TestGeminiRobotFileParser:
    """Exercise ``GeminiRobotFileParser.can_fetch_prioritized`` on robots.txt samples.

    robots.txt is line-oriented, so every fixture spells out its line breaks
    with explicit ``\\n`` separators.
    """

    def _get_parser(self, content):
        """Return a parser for a dummy robots.txt URL, fed with ``content``."""
        dummy_url = "gemini://dummy/robots.txt"
        rp = GeminiRobotFileParser(dummy_url)
        rp.read_from_string(content)
        return rp

    def _assert_fetchable(self, rp, url="/", fetchable=True):
        """Assert the prioritized lookup for ``url`` yields ``fetchable``.

        The user agents are always queried in the order gus, indexer, ``*``.
        """
        useragents = ["gus", "indexer", "*"]
        assert rp.can_fetch_prioritized(useragents, url) == fetchable

    def test_empty_robots(self):
        """An empty robots.txt leaves the root fetchable."""
        rp = self._get_parser("")
        self._assert_fetchable(rp)

    def test_disallow_star(self):
        """A blanket disallow for ``*`` blocks the root."""
        rp = self._get_parser(
            "User-agent: *\n"
            "Disallow: /"
        )
        self._assert_fetchable(rp, "/", False)

    def test_allow_indexer(self):
        """An allow for ``indexer`` wins over a disallow for ``*``."""
        rp = self._get_parser(
            "User-agent: *\n"
            "Disallow: /\n"
            "User-agent: indexer\n"
            "Allow: /"
        )
        self._assert_fetchable(rp, "/test", True)

    def test_allow_all_but_disallow_indexer(self):
        """A disallow for ``indexer`` wins over an allow for ``*``."""
        rp = self._get_parser(
            "User-agent: *\n"
            "Allow: /\n"
            "User-agent: indexer\n"
            "Disallow: /"
        )
        self._assert_fetchable(rp, "/", False)

    def test_allow_star_but_disallow_genericbot(self):
        """Same fixture and expectation as ``test_allow_all_but_disallow_indexer``."""
        # NOTE(review): the test name mentions "genericbot" but the fixture
        # targets "indexer", which makes this an exact duplicate of the test
        # above.  Presumably a "User-agent: genericbot" section was intended;
        # confirm the intended expectation before changing the fixture.
        rp = self._get_parser(
            "User-agent: *\n"
            "Allow: /\n"
            "User-agent: indexer\n"
            "Disallow: /"
        )
        self._assert_fetchable(rp, "/", False)

    def test_allow_only_gus(self):
        """An allow for ``gus`` wins over disallows for ``*`` and ``genericbot``."""
        rp = self._get_parser(
            "User-agent: *\n"
            "Disallow: /\n"
            "User-agent: genericbot\n"
            "Disallow: /\n"
            "User-agent: gus\n"
            "Allow: /"
        )
        self._assert_fetchable(rp)

    def test_disallow_gemidev_waffle(self):
        """A path below a disallowed CGI prefix is not fetchable."""
        rp = self._get_parser(
            "user-agent: *\n"
            "Disallow: /cgi-bin/wp.cgi/view\n"
            "Disallow: /cgi-bin/wp.cgi/media\n"
            "Disallow: /cgi-bin/wp.cgi/search\n"
            "Disallow: /cgi-bin/waffle.cgi/article\n"
            "Disallow: /cgi-bin/waffle.cgi/feed\n"
            "Disallow: /cgi-bin/waffle.cgi/links\n"
            "Disallow: /cgi-bin/waffle.cgi/view\n"
            "Disallow: /cgi-bin/witw.cgi/play\n"
        )
        self._assert_fetchable(rp, "/cgi-bin/waffle.cgi/feed/link", False)

    def test_disallow_infinite_maze(self):
        """Comment lines between rules do not stop ``/maze/`` from being blocked."""
        rp = self._get_parser(
            "User-agent: *\n"
            "# We don't accept automated donations\n"
            "Disallow: /donate\n"
            "# Robots are not allowed to vote\n"
            "Disallow: /vote\n"
            "Disallow: /vote/\n"
            "Disallow: /voteru\n"
            "Disallow: /voteru/\n"
            "# Robots are forbidden to enter the infinite maze\n"
            "Disallow: /maze\n"
            "Disallow: /maze/\n"
        )
        self._assert_fetchable(rp, "/maze/l/", False)

    def test_disallow_gemski_git(self):
        """Rules shared by several ``User-agent`` lines apply to ``indexer``."""
        rp = self._get_parser(
            "# disallowing because kineto at least doesn't have its own robots.txt to\n"
            "# prevent web crawling by proxy\n"
            "User-agent: webproxy\n"
            "Disallow: /\n"
            "User-agent: archiver\n"
            "User-agent: indexer\n"
            "User-agent: researcher\n"
            "Disallow: /gredig/\n"
            "# I'd like to just do this, but it seems at least some crawlers don't match by\n"
            "# prefix.\n"
            "#Disallow: /gemski/play\n"
            "Disallow: /ski/\n"
            "Disallow: /gemski/\n"
            "# This doesn't exist, but GUS seems to get confused with the tuner server\n"
            "# running on another port.\n"
            "Disallow: /stations/\n"
        )
        self._assert_fetchable(rp, "/ski/sds", False)

    def test_disallow_unlimitedpizza_git(self):
        """A disallowed sub-path stays blocked although its parent is allowed."""
        rp = self._get_parser(
            "User-agent: *\n"
            "Allow: /git\n"
            "Disallow: /git/dotfiles/\n"
        )
        self._assert_fetchable(
            rp,
            "/git/dotfiles/tree/0b0de929fa98457d22cbbcee65013ec261b660e2"
            "/atom/packages/ex-mode/node_modules/space-pen/node_modules/grim"
            "/node_modules/emissary/node_modules/es6-weak-map/node_modules"
            "/es5-ext/math/atanh/implement.js",
            False,
        )