diff --git a/README.md b/README.md
index ee5d72d02aeaceb018b14e1ccc4349d6e20f42f7..5c227ed8500cdfa9838601b9617ef788c1ea998b 100644
--- a/README.md
+++ b/README.md
@@ -34,7 +34,7 @@
## Running test suite
-Run: "poetry run python -m pytest"
+Run: "poetry run pytest"
## Roadmap / TODOs
diff --git a/gus/build_index.py b/gus/build_index.py
index 30eaf61f266da6e48d75f3b0e80729588f28ce89..acb97bce60e06597081b34aead1c22bc11049d70 100644
--- a/gus/build_index.py
+++ b/gus/build_index.py
@@ -7,6 +7,7 @@
from . import constants
-from gus.crawl import EXCLUDED_URL_PREFIXES
+from gus.crawl import EXCLUDED_URL_PREFIXES, should_skip
from gus.lib.db_model import init_db, Page
+from gus.lib.gemini import GeminiResource
from gus.lib.index_statistics import (
compute_index_statistics,
persist_statistics,
@@ -23,14 +24,9 @@ uses_netloc.append("gemini")
def index_page(index, page, indexed_urls):
- should_skip = False
- for excluded_prefix in EXCLUDED_URL_PREFIXES:
- if page.normalized_url.startswith(excluded_prefix):
- should_skip = True
- break
- if should_skip:
+ if should_skip(GeminiResource(page.url)):
logging.debug(
- "URL prefix matches exclusion list, skipping: %s",
+ "URL is excluded, skipping: %s",
strip_control_chars(page.url),
)
return False
diff --git a/gus/crawl.py b/gus/crawl.py
index 34e1e6555c77cd4db756daeecb9a51b81b998d76..d6445c1b3798f4097c68e190a4006814959449f7 100644
--- a/gus/crawl.py
+++ b/gus/crawl.py
@@ -1,5 +1,6 @@
import argparse
import logging
+import re
from datetime import datetime, timedelta
import os
@@ -20,6 +21,11 @@ # hack: the built-in methods in urllib need to know the
# Gemini protocol exists
uses_relative.append("gemini")
uses_netloc.append("gemini")
+
+EXCLUDED_URL_PATTERN = re.compile(  # tested with .match() against normalized URLs in should_skip
+ r"^gemini://(\d{6}\.ch|almp\d{4}\.app|.*/_(revert|history)/).*",
+ flags=re.IGNORECASE  # match regardless of letter case
+)
# These are checked against normalized_url, so they should be
# prepended with the gemini:// protocol, be all lowercased, and
@@ -331,17 +337,28 @@ page.save()
return page, is_different
+def should_skip(resource):
+    """Return True if *resource* matches any URL exclusion rule.
+
+    EXCLUDED_URL_PREFIXES and EXCLUDED_URL_PATTERN are checked against the
+    normalized URL; EXCLUDED_URL_PATHS against the end of the lowercased path.
+    """
+    normalized_url = resource.normalized_url
+    if any(normalized_url.startswith(p) for p in EXCLUDED_URL_PREFIXES):
+        return True
+    path = resource.urlsplit.path.lower()
+    if any(path.endswith(p) for p in EXCLUDED_URL_PATHS):
+        return True
+    # Finally, fall back to the regex-based exclusion pattern.
+    return EXCLUDED_URL_PATTERN.match(normalized_url) is not None
+
+
def index_links(from_resource, contained_resources):
from_page, created = Page.get_or_create(url=from_resource.indexable_url)
Link.delete().where(Link.from_page == from_page).execute()
data = []
for cr in contained_resources:
- should_skip = False
- for excluded_prefix in EXCLUDED_URL_PREFIXES:
- if cr.normalized_url.startswith(excluded_prefix):
- should_skip = True
- break
- if should_skip:
+ if should_skip(cr):
continue
to_page = Page.get_or_none(url=cr.indexable_url)
if not to_page:
@@ -392,21 +409,12 @@ "Not a valid gemini resource, skipping: %s",
gus.lib.logging.strip_control_chars(url),
)
return
- for excluded_prefix in EXCLUDED_URL_PREFIXES:
- if gr.normalized_url.startswith(excluded_prefix):
- logging.info(
- "URL prefix matches exclusion list, skipping: %s",
- gus.lib.logging.strip_control_chars(url),
- )
- return
- for excluded_path in EXCLUDED_URL_PATHS:
- if gr.urlsplit.path.lower().endswith(excluded_path):
- logging.info(
- "URL on exclusion list, skipping: %s",
- gus.lib.logging.strip_control_chars(url),
- )
- return
-
+ if should_skip(gr):
+ logging.info(
+ "URL is excluded, skipping: %s",
+ gus.lib.logging.strip_control_chars(url),
+ )
+ return
if should_check_if_expired:
existing_page = Page.get_or_none(url=gr.indexable_url)
if existing_page and existing_page.change_frequency is not None:
diff --git a/tests/gus/lib/test_gemini.py b/tests/gus/lib/test_gemini.py
index 61fd1a4b7a7745a7629ec1d5909f9c53588d287d..1463814b4786e2d07868d6237188113900252912 100644
--- a/tests/gus/lib/test_gemini.py
+++ b/tests/gus/lib/test_gemini.py
@@ -1,30 +1,28 @@
-import unittest
from gus.lib.gemini import GeminiResource
-
-class TestGeminiResource(unittest.TestCase):
+class TestGeminiResource:
def test_extract_contained_resources(self):
url = "gemini://host"
# no content
resources = GeminiResource(url).extract_contained_resources("")
- self.assertEqual(resources, [])
+ assert resources == []
# not a link
resources = GeminiResource(url).extract_contained_resources(" => link")
- self.assertEqual(resources, [])
+ assert resources == []
resources = GeminiResource(url).extract_contained_resources(
"```\n=> preformatted\n```"
)
- self.assertEqual(resources, [])
+ assert resources == []
# some links
resources = GeminiResource(url).extract_contained_resources(
"=> link\ntext\n=> other"
)
- self.assertEqual(len(resources), 2)
- self.assertEqual(resources[0].raw_url, "link")
- self.assertEqual(resources[1].raw_url, "other")
+ assert len(resources) == 2
+ assert resources[0].raw_url == "link"
+ assert resources[1].raw_url == "other"
resources = GeminiResource(url).extract_contained_resources(
"""
@@ -36,8 +34,8 @@ ``` preformatted
=> no link
```
=> other
- """
+ """
)
- self.assertEqual(len(resources), 2)
- self.assertEqual(resources[0].raw_url, "link")
- self.assertEqual(resources[1].raw_url, "other")
+ assert len(resources) == 2
+ assert resources[0].raw_url == "link"
+ assert resources[1].raw_url == "other"
diff --git a/tests/gus/test_crawl.py b/tests/gus/test_crawl.py
new file mode 100644
index 0000000000000000000000000000000000000000..658632c92b9cca202b49b7e836327cd1e7b9c4a1
--- /dev/null
+++ b/tests/gus/test_crawl.py
@@ -0,0 +1,39 @@
+import pytest
+
+from gus.crawl import should_skip
+from gus.lib.gemini import GeminiResource
+
+class TestUrlExclusion:  # exercises gus.crawl.should_skip, one test per kind of exclusion rule
+ @pytest.mark.parametrize("test_url,expected_result", [
+ ("gemini://gemini.circumlunar.space/favicon.ico", True),  # presumably listed in EXCLUDED_URL_PATHS — confirm
+ ("gemini://gemini.circumlunar.space/rss.txt", True),  # presumably listed in EXCLUDED_URL_PATHS — confirm
+ ])
+ def test_excluded_url_paths(self, test_url, expected_result):
+ resource = GeminiResource(test_url)
+ assert should_skip(resource) == expected_result
+
+
+ @pytest.mark.parametrize("test_url,expected_result", [
+ ("gemini://hannuhartikainen.fi/twinwiki/_revert/1594367314474", True),  # contains a "/_revert/" segment
+ ("gemini://hannuhartikainen.fi/twinwiki/1594367314474", False),  # no "/_revert/" or "/_history/" segment
+ ("gemini://hannuhartikainen.fi/twinwiki/Sandbox/_history/1594037613712", True),  # contains a "/_history/" segment
+ ("gemini://hannuhartikainen.fi/twinwiki", False),
+ ("gemini://123456.ch", True),  # six-digit ".ch" host
+ ("gemini://123456.ch/fnord", True),  # same host, with a path
+ ("gemini://almp1234.app", True),  # "almp" + four digits + ".app" host
+ ("gemini://almp1234.app/fnord", True),  # same host, with a path
+ ])
+ def test_excluded_url_pattern(self, test_url, expected_result):
+ resource = GeminiResource(test_url)
+ assert should_skip(resource) == expected_result
+
+
+ @pytest.mark.parametrize("test_url,expected_result", [  # NOTE(review): expectations depend on EXCLUDED_URL_PREFIXES, not visible here
+ ("gemini://localhost", True),
+ ("gemini://example.org", True),
+ ("gus.guru", False),  # scheme-less input; presumably normalized by GeminiResource — confirm
+ ("gus.guru/search?turkey", True),
+ ])
+ def test_excluded_url_prefixes(self, test_url, expected_result):
+ resource = GeminiResource(test_url)
+ assert should_skip(resource) == expected_result