# 💾 Archived View for gmn.clttr.info › sources › geminispace.info.git › tree › tests › gus › lib › tes… captured on 2023-09-08 at 16:34:00.

# View Raw

# More Information

# ⬅️ Previous capture (2023-06-14)

# ➡️ Next capture (2023-12-28)

# 🚧 View Differences

# -=-=-=-=-=-=-

import pytest

from gus.lib.gemini import GeminiResource, GeminiRobotFileParser

class TestGeminiResource:
    """Tests for ``GeminiResource``: link extraction, URL parsing and root detection."""

    def test_extract_contained_resources(self):
        """Only ``=>`` lines at the start of a line, outside preformatted blocks, yield resources."""
        url = "gemini://host"

        # no content
        resources = GeminiResource(url).extract_contained_resources("")
        assert resources == []

        # not a link: the arrow is preceded by a space
        resources = GeminiResource(url).extract_contained_resources(" => link")
        assert resources == []
        # not a link: the arrow sits inside a ``` preformatted block
        resources = GeminiResource(url).extract_contained_resources(
            "```\n=> preformatted\n```"
        )
        assert resources == []

        # some links, returned in document order
        resources = GeminiResource(url).extract_contained_resources(
            "=> link\ntext\n=> other"
        )
        assert len(resources) == 2
        assert resources[0].raw_url == "link"
        assert resources[1].raw_url == "other"

        # links mixed with a heading, plain text and blank lines; the bare
        # word "link" on its own line must not be counted
        resources = GeminiResource(url).extract_contained_resources(
            """
# title
text
=> link
text

link

=> other
            """
        )
        assert len(resources) == 2
        assert resources[0].raw_url == "link"
        assert resources[1].raw_url == "other"

    # Each case is (constructor arguments, expected attributes):
    #   test_input      -> the three positional arguments given to GeminiResource
    #                      (presumably url, parent URL, parent host -- confirm
    #                      against GeminiResource.__init__)
    #   expected_result -> [is_valid, normalized_host, fetchable_url, normalized_host_like]
    @pytest.mark.parametrize("test_input,expected_result", [
        # absolute URLs
        (["gemini://gus.guru", None, None], [True, "gus.guru", "gemini://gus.guru/", "gus.guru"]),
        (["gemini://gus.guru/search?text", None, None], [True, "gus.guru", "gemini://gus.guru/search?text", "gus.guru"]),
        # relative URLs are expected to be invalid unless the third argument is supplied
        (["/bar", "gemini://gus.guru/foo", None], [False, None, None, None]),
        (["/bar", "gemini://gus.guru/foo/", None], [False, None, None, None]),
        (["/bar", "gemini://gus.guru/foo", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/bar", "gus.guru"]),
        (["/bar", "gemini://gus.guru/foo/", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/bar", "gus.guru"]),
        (["/bar?test", "gemini://gus.guru/foo/", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/bar?test", "gus.guru"]),
        (["bar", "gemini://gus.guru/foo", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/bar", "gus.guru"]),
        (["bar/", "gemini://gus.guru/foo/", "gus.guru"], [True, "gus.guru", "gemini://gus.guru/foo/bar/", "gus.guru"]),
        # scheme-relative URL
        (["//foo.com", None, None], [True, "foo.com", "gemini://foo.com/", "foo.com"]),
        # expected: host lower-cased, port 1965 dropped, other ports kept
        (["gemini://gem.Splatt9990.com/index.gmi", None, None], [True, "gem.splatt9990.com", "gemini://gem.splatt9990.com/index.gmi", "gem.splatt9990.com"]),
        (["gemini://gem.Splatt9990.com:1965/index.gmi", None, None], [True, "gem.splatt9990.com", "gemini://gem.splatt9990.com/index.gmi", "gem.splatt9990.com"]),
        (["gemini://gem.splatt9990.com:1966/index.gmi", None, None], [True, "gem.splatt9990.com", "gemini://gem.splatt9990.com:1966/index.gmi", "gem.splatt9990.com"]),
        (["gemini://MichaelNordmeyer.com", None, None], [True, "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/", "michaelnordmeyer.com"]),
        (["log.gmi", "gemini://MichaelNordmeyer.com:1965/", None], [True, "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/log.gmi", "michaelnordmeyer.com"]),
        # expected: the path keeps its case even though the host is lower-cased
        # (this case used to be listed twice; the exact duplicate was removed)
        (["Log.gmi", "gemini://MichaelNordmeyer.com/", None], [True, "michaelnordmeyer.com", "gemini://michaelnordmeyer.com/Log.gmi", "michaelnordmeyer.com"]),
        # expected: user directories become part of normalized_host_like
        (["gemini://tilde.pink/~emily/log/productivity.gmi", None, None], [True, "tilde.pink", "gemini://tilde.pink/~emily/log/productivity.gmi", "tilde.pink/~emily"]),
        (["gemini://tilde.pink/users/emily/index.gmi", None, None], [True, "tilde.pink", "gemini://tilde.pink/users/emily/index.gmi", "tilde.pink/users/emily"]),
    ])
    def test_url_parsing(self, test_input, expected_result):
        """Build a resource from ``test_input`` and compare its parsed attributes."""
        is_valid, normalized_host, fetchable_url, normalized_host_like = expected_result
        gr = GeminiResource(*test_input)
        assert gr.is_valid == is_valid
        assert gr.normalized_host == normalized_host
        assert gr.fetchable_url == fetchable_url
        assert gr.normalized_host_like == normalized_host_like

    @pytest.mark.parametrize("test_url,expected_result", [
        ("gemini://gus.guru", True),
        ("gemini://gus.guru/", True),
        ("gemini://gus.guru/franz", False),
        ("gemini://gus.guru/~franz", True),
        ("gemini://gus.guru/~franz/foo", False),
    ])
    def test_is_root_like(self, test_url, expected_result):
        """The bare host, the host plus ``/`` and a lone ``~user`` segment are expected to be root-like."""
        gr = GeminiResource(test_url)
        assert gr.is_root_like == expected_result


class TestGeminiRobotFileParser:
    """Checks ``GeminiRobotFileParser.can_fetch_prioritized`` against sample robots.txt bodies."""

    def _get_parser(self, content):
        """Return a parser created for a dummy robots.txt URL and fed ``content``."""
        parser = GeminiRobotFileParser("gemini://dummy/robots.txt")
        parser.read_from_string(content)
        return parser

    def _assert_fetchable(self, rp, url="/", fetchable=True):
        """Assert the verdict of ``rp`` for ``url`` given the agents gus, indexer and ``*``."""
        verdict = rp.can_fetch_prioritized(["gus", "indexer", "*"], url)
        assert verdict == fetchable

    def test_empty_robots(self):
        """An empty robots.txt leaves the default URL fetchable."""
        self._assert_fetchable(self._get_parser(""))

    def test_disallow_star(self):
        """A blanket disallow for ``*`` blocks the root."""
        robots_txt = """User-agent: *
Disallow: /"""
        self._assert_fetchable(self._get_parser(robots_txt), url="/", fetchable=False)

    def test_allow_indexer(self):
        """An explicit allow for ``indexer`` wins over a disallow for ``*``."""
        robots_txt = """User-agent: *
Disallow: /

User-agent: indexer
Allow: /"""
        self._assert_fetchable(self._get_parser(robots_txt), url="/test", fetchable=True)

    def test_allow_all_but_disallow_indexer(self):
        """An explicit disallow for ``indexer`` wins over an allow for ``*``."""
        robots_txt = """User-agent: *
Allow: /

User-agent: indexer
Disallow: /"""
        self._assert_fetchable(self._get_parser(robots_txt), url="/", fetchable=False)

    def test_allow_star_but_disallow_genericbot(self):
        """Same robots.txt and expectation as ``test_allow_all_but_disallow_indexer``."""
        # NOTE(review): despite the method name, the rules below name
        # ``indexer`` rather than ``genericbot`` -- confirm whether a distinct
        # scenario was intended here.
        robots_txt = """User-agent: *
Allow: /

User-agent: indexer
Disallow: /"""
        self._assert_fetchable(self._get_parser(robots_txt), url="/", fetchable=False)

    def test_allow_only_gus(self):
        """An allow for ``gus`` makes the root fetchable although ``*`` and ``genericbot`` are disallowed."""
        robots_txt = """User-agent: *
Disallow: /

User-agent: genericbot
Disallow: /

User-agent: gus
Allow: /"""
        self._assert_fetchable(self._get_parser(robots_txt))

    def test_disallow_gemidev_waffle(self):
        """A path below a disallowed prefix is blocked (lower-case ``user-agent`` field)."""
        robots_txt = """user-agent: *
Disallow: /cgi-bin/wp.cgi/view
Disallow: /cgi-bin/wp.cgi/media
Disallow: /cgi-bin/wp.cgi/search
Disallow: /cgi-bin/waffle.cgi/article
Disallow: /cgi-bin/waffle.cgi/feed
Disallow: /cgi-bin/waffle.cgi/links
Disallow: /cgi-bin/waffle.cgi/view
Disallow: /cgi-bin/witw.cgi/play
"""
        self._assert_fetchable(
            self._get_parser(robots_txt),
            url="/cgi-bin/waffle.cgi/feed/link",
            fetchable=False,
        )

    def test_disallow_infinite_maze(self):
        """Disallow rules interleaved with comment lines still block ``/maze/l/``."""
        robots_txt = """User-agent: *
# We don't accept automated donations
Disallow: /donate
# Robots are not allowed to vote
Disallow: /vote
Disallow: /vote/
Disallow: /voteru
Disallow: /voteru/
# Robots are forbidden to enter the infinite maze
Disallow: /maze
Disallow: /maze/
"""
        self._assert_fetchable(self._get_parser(robots_txt), url="/maze/l/", fetchable=False)

    def test_disallow_gemski_git(self):
        """A group listing several user agents, with comments between rules, blocks ``/ski/sds``."""
        robots_txt = """# disallowing because kineto at least doesn't have its own robots.txt to
# prevent web crawling by proxy
User-agent: webproxy
Disallow: /

User-agent: archiver
User-agent: indexer
User-agent: researcher
Disallow: /gredig/
# I'd like to just do this, but it seems at least some crawlers don't match by
# prefix.
#Disallow: /gemski/play
Disallow: /ski/
Disallow: /gemski/
# This doesn't exist, but GUS seems to get confused with the tuner server
# running on another port.
Disallow: /stations/

"""
        self._assert_fetchable(self._get_parser(robots_txt), url="/ski/sds", fetchable=False)

    def test_disallow_unlimitedpizza_git(self):
        """A deep path under ``/git/dotfiles/`` is blocked even though ``/git`` is allowed first."""
        robots_txt = """User-agent: *
Allow: /git
Disallow: /git/dotfiles/
"""
        deep_path = "/git/dotfiles/tree/0b0de929fa98457d22cbbcee65013ec261b660e2/atom/packages/ex-mode/node_modules/space-pen/node_modules/grim/node_modules/emissary/node_modules/es6-weak-map/node_modules/es5-ext/math/atanh/implement.js"
        self._assert_fetchable(self._get_parser(robots_txt), url=deep_path, fetchable=False)