diff --git a/README.md b/README.md

index ee5d72d02aeaceb018b14e1ccc4349d6e20f42f7..5c227ed8500cdfa9838601b9617ef788c1ea998b 100644

--- a/README.md

+++ b/README.md

@@ -34,7 +34,7 @@

## Running test suite

-Run: "poetry run python -m pytest"

+Run: "poetry run pytest"

## Roadmap / TODOs

diff --git a/gus/build_index.py b/gus/build_index.py

index 30eaf61f266da6e48d75f3b0e80729588f28ce89..acb97bce60e06597081b34aead1c22bc11049d70 100644

--- a/gus/build_index.py

+++ b/gus/build_index.py

@@ -7,6 +7,7 @@

from . import constants

-from gus.crawl import EXCLUDED_URL_PREFIXES
+from gus.crawl import EXCLUDED_URL_PREFIXES, should_skip

from gus.lib.db_model import init_db, Page

+from gus.lib.gemini import GeminiResource

from gus.lib.index_statistics import (

compute_index_statistics,

persist_statistics,

@@ -23,14 +24,9 @@ uses_netloc.append("gemini")

def index_page(index, page, indexed_urls):

- should_skip = False

- for excluded_prefix in EXCLUDED_URL_PREFIXES:

- if page.normalized_url.startswith(excluded_prefix):

- should_skip = True

- break

- if should_skip:

+ if should_skip(GeminiResource(page.url)):

logging.debug(

- "URL prefix matches exclusion list, skipping: %s",

+ "URL is excluded, skipping: %s",

strip_control_chars(page.url),

)

return False

diff --git a/gus/crawl.py b/gus/crawl.py

index 34e1e6555c77cd4db756daeecb9a51b81b998d76..d6445c1b3798f4097c68e190a4006814959449f7 100644

--- a/gus/crawl.py

+++ b/gus/crawl.py

@@ -1,5 +1,6 @@

import argparse

import logging

+import re

from datetime import datetime, timedelta

import os

@@ -20,6 +21,11 @@ # hack: the built-in methods in urllib need to know the

# Gemini protocol exists

uses_relative.append("gemini")

uses_netloc.append("gemini")

+

+EXCLUDED_URL_PATTERN = re.compile(  # URLs matching this regex are skipped by should_skip()

+ r"^gemini://(\d{6}\.ch|almp\d{4}\.app|.*/_(revert|history)/).*",  # host starting with six digits + ".ch", host starting with "almp" + four digits + ".app", or any "/_revert/" or "/_history/" path segment

+ flags=re.IGNORECASE  # match regardless of letter case

+)

# These are checked against normalized_url, so they should be

# prepended with the gemini:// protocol, be all lowercased, and

@@ -331,17 +337,28 @@ page.save()

return page, is_different

+def should_skip(resource):
+    """Return True when `resource` matches any URL exclusion rule.
+
+    A resource is excluded if its normalized URL starts with an entry of
+    EXCLUDED_URL_PREFIXES, if its lowercased path ends with an entry of
+    EXCLUDED_URL_PATHS, or if its normalized URL matches EXCLUDED_URL_PATTERN.
+    """
+    # str.startswith/endswith accept a tuple, so no explicit loop is needed.
+    if resource.normalized_url.startswith(tuple(EXCLUDED_URL_PREFIXES)):
+        return True
+    if resource.urlsplit.path.lower().endswith(tuple(EXCLUDED_URL_PATHS)):
+        return True
+    # Regex rule covers exclusions that are neither a fixed prefix nor suffix.
+    return EXCLUDED_URL_PATTERN.match(resource.normalized_url) is not None

+

+

def index_links(from_resource, contained_resources):

from_page, created = Page.get_or_create(url=from_resource.indexable_url)

Link.delete().where(Link.from_page == from_page).execute()

data = []

for cr in contained_resources:

- should_skip = False

- for excluded_prefix in EXCLUDED_URL_PREFIXES:

- if cr.normalized_url.startswith(excluded_prefix):

- should_skip = True

- break

- if should_skip:

+ if should_skip(cr):

continue

to_page = Page.get_or_none(url=cr.indexable_url)

if not to_page:

@@ -392,21 +409,12 @@ "Not a valid gemini resource, skipping: %s",

gus.lib.logging.strip_control_chars(url),

)

return

- for excluded_prefix in EXCLUDED_URL_PREFIXES:

- if gr.normalized_url.startswith(excluded_prefix):

- logging.info(

- "URL prefix matches exclusion list, skipping: %s",

- gus.lib.logging.strip_control_chars(url),

- )

- return

- for excluded_path in EXCLUDED_URL_PATHS:

- if gr.urlsplit.path.lower().endswith(excluded_path):

- logging.info(

- "URL on exclusion list, skipping: %s",

- gus.lib.logging.strip_control_chars(url),

- )

- return

-

+ if should_skip(gr):

+ logging.info(

+ "URL is excluded, skipping: %s",

+ gus.lib.logging.strip_control_chars(url),

+ )

+ return

if should_check_if_expired:

existing_page = Page.get_or_none(url=gr.indexable_url)

if existing_page and existing_page.change_frequency is not None:

diff --git a/tests/gus/lib/test_gemini.py b/tests/gus/lib/test_gemini.py

index 61fd1a4b7a7745a7629ec1d5909f9c53588d287d..1463814b4786e2d07868d6237188113900252912 100644

--- a/tests/gus/lib/test_gemini.py

+++ b/tests/gus/lib/test_gemini.py

@@ -1,30 +1,28 @@

-import unittest

from gus.lib.gemini import GeminiResource

-

-class TestGeminiResource(unittest.TestCase):

+class TestGeminiResource:

def test_extract_contained_resources(self):

url = "gemini://host"

# no content

resources = GeminiResource(url).extract_contained_resources("")

- self.assertEqual(resources, [])

+ assert resources == []

# not a link

resources = GeminiResource(url).extract_contained_resources(" => link")

- self.assertEqual(resources, [])

+ assert resources == []

resources = GeminiResource(url).extract_contained_resources(

"```\n=> preformatted\n```"

)

- self.assertEqual(resources, [])

+ assert resources == []

# some links

resources = GeminiResource(url).extract_contained_resources(

"=> link\ntext\n=> other"

)

- self.assertEqual(len(resources), 2)

- self.assertEqual(resources[0].raw_url, "link")

- self.assertEqual(resources[1].raw_url, "other")

+ assert len(resources) == 2

+ assert resources[0].raw_url == "link"

+ assert resources[1].raw_url == "other"

resources = GeminiResource(url).extract_contained_resources(

"""

@@ -36,8 +34,8 @@ ``` preformatted

=> no link

```

=> other

- """

+ """

)

- self.assertEqual(len(resources), 2)

- self.assertEqual(resources[0].raw_url, "link")

- self.assertEqual(resources[1].raw_url, "other")

+ assert len(resources) == 2

+ assert resources[0].raw_url == "link"

+ assert resources[1].raw_url == "other"

diff --git a/tests/gus/test_crawl.py b/tests/gus/test_crawl.py

new file mode 100644

index 0000000000000000000000000000000000000000..658632c92b9cca202b49b7e836327cd1e7b9c4a1

--- /dev/null

+++ b/tests/gus/test_crawl.py

@@ -0,0 +1,39 @@

+import pytest

+

+from gus.crawl import should_skip

+from gus.lib.gemini import GeminiResource

+

+class TestUrlExclusion:
+    """Checks should_skip() against sample URLs, grouped by exclusion rule."""
+
+    @pytest.mark.parametrize("test_url,expected_result", [
+        ("gemini://gemini.circumlunar.space/favicon.ico", True),
+        ("gemini://gemini.circumlunar.space/rss.txt", True),
+    ])
+    def test_excluded_url_paths(self, test_url, expected_result):
+        """Sample URLs for the path-based exclusion rule."""
+        assert should_skip(GeminiResource(test_url)) == expected_result
+
+    @pytest.mark.parametrize("test_url,expected_result", [
+        ("gemini://hannuhartikainen.fi/twinwiki/_revert/1594367314474", True),
+        ("gemini://hannuhartikainen.fi/twinwiki/1594367314474", False),
+        ("gemini://hannuhartikainen.fi/twinwiki/Sandbox/_history/1594037613712", True),
+        ("gemini://hannuhartikainen.fi/twinwiki", False),
+        ("gemini://123456.ch", True),
+        ("gemini://123456.ch/fnord", True),
+        ("gemini://almp1234.app", True),
+        ("gemini://almp1234.app/fnord", True),
+    ])
+    def test_excluded_url_pattern(self, test_url, expected_result):
+        """Sample URLs for the regex-based exclusion rule."""
+        assert should_skip(GeminiResource(test_url)) == expected_result
+
+    @pytest.mark.parametrize("test_url,expected_result", [
+        ("gemini://localhost", True),
+        ("gemini://example.org", True),
+        ("gus.guru", False),
+        ("gus.guru/search?turkey", True),
+    ])
+    def test_excluded_url_prefixes(self, test_url, expected_result):
+        """Sample URLs for the prefix-based exclusion rule."""
+        assert should_skip(GeminiResource(test_url)) == expected_result