💾 Archived View for gmn.clttr.info › sources › geminispace.info.git › tree › gus › lib › whoosh_exte… captured on 2024-02-05 at 10:09:05.

View Raw

More Information

⬅️ Previous capture (2023-01-29)

-=-=-=-=-=-=-

import math
import re

from urllib.parse import urlparse

from whoosh import highlight
from whoosh.analysis import IntraWordFilter, LowercaseFilter, RegexTokenizer, StemFilter


def UrlAnalyzer():
    """Build the analyzer used to tokenize Gemini URLs.

    The pipeline is a gap-mode ``RegexTokenizer`` followed by an
    ``IntraWordFilter``, a ``LowercaseFilter`` and a ``StemFilter``.
    The tokenizer treats the following as separators, so they never
    appear inside a token:

    * the literal text ``:1965``
    * a leading ``gemini://``
    * the characters ``/``, ``.`` and ``?``

    >>> ana = UrlAnalyzer()
    >>> [token.text for token in ana("gemini://foo.bar.baz/hum/drum?har=floom")]
    ["foo", "bar", "baz", "hum", "drum"]

    NOTE(review): the expected output above is carried over from the
    original docstring and has not been re-verified against the filters
    that run after the tokenizer -- confirm before relying on it.

    :returns: the composed analyzer object.
    """

    # Raw string: the pattern previously relied on "\." and "\?" being
    # passed through as invalid string escapes. Inside a character class
    # "." and "?" are already literal, so no escaping is needed.
    return (
        RegexTokenizer(expression=r":1965|^gemini://|[/.?]", gaps=True)
        | IntraWordFilter()
        | LowercaseFilter()
        | StemFilter()
    )


class GeminiFormatter(highlight.Formatter):
    """Render each highlight fragment as a one-line ``* ...text...`` snippet.

    Every fragment is emitted as a single line that starts with ``* ...``
    and ends with ``...``; all runs of whitespace (line breaks included)
    are collapsed to a single space. Matched tokens are emitted as plain
    text, without any extra markup around them.
    """

    # Separator string made available to the base formatter.
    # NOTE(review): presumably placed between formatted fragments so the
    # caller can split on it -- confirm against whoosh's Formatter.format.
    between = "<<HL-SPLIT>>"

    def format_token(self, text, token, replace=False):
        """Return the text to show for a single matched token.

        :param text: the text the token was found in.
        :param token: the matched token object.
        :param replace: forwarded to ``highlight.get_text``.
        """
        matched = highlight.get_text(text, token, replace)
        # No decoration is applied: the token text is returned as-is.
        return str(matched)

    def format_fragment(self, fragment, replace=False):
        """Return the given fragment formatted as one bulleted line.

        :param fragment: a :class:`Fragment` object representing a list of
            matches in the text.
        :param replace: passed through to :meth:`format_token` for every
            match in the fragment.
        """
        source = fragment.text
        cursor = fragment.startchar
        pieces = ["* ..."]

        for match in fragment.matches:
            start = match.startchar
            # Ignore matches without a position and matches that begin
            # inside text that has already been emitted.
            if start is None or start < cursor:
                continue
            # Emit the unmatched text between the previous match and this one.
            if start > cursor:
                pieces.append(self._text(source[cursor:start]))
            pieces.append(self.format_token(source, match, replace))
            cursor = match.endchar

        # Trailing text after the last match, then the closing ellipsis.
        pieces.append(self._text(source[cursor : fragment.endchar]))
        pieces.append("...")

        # str.split() with no argument splits on any whitespace, "\n" and
        # "\r" included, so this single step flattens the snippet onto one
        # line and squeezes repeated blanks.
        return " ".join("".join(pieces).split())


# Matches one "special" character: anything that is not a word character,
# whitespace, or one of the punctuation marks , . ; ? ! ' -
# The hyphen is written last in the class so that it is a literal. When it
# sat between ";" and "?" it formed a range (";" through "?"), which let
# "<", "=" and ">" pass as ordinary text while the hyphen itself was still
# counted as special.
special_char_pattern = re.compile(r"[^\w\s,.;?!'-]")
# Matches the substrings counted as link markers: "://" and "=>".
link_pattern = re.compile("://|=>")


class GeminiScorer(highlight.FragmentScorer):
    """Score a fragment by its matches, penalizing special characters and links."""

    def __call__(self, f):
        """Return a non-negative score for fragment ``f``.

        :param f: the fragment to score; its ``matches``, ``matched_terms``,
            ``text``, ``startchar`` and ``endchar`` attributes are read.
        :returns: the computed score, never lower than 0.
        """
        # Start from the combined boost of every matched term in the passage.
        total = sum(match.boost for match in f.matches)

        # Favor diversity: scale by the number of distinct terms matched
        # (times 100), leaving the score untouched when there are none.
        distinct_terms = len(f.matched_terms)
        multiplier = distinct_terms * 100 if distinct_terms else 1
        total *= multiplier

        passage = f.text[f.startchar : f.endchar]

        # Lower the score substantially for any special characters we find,
        # where special characters are non-word characters that are also not
        # typically found in textual content. This should penalize things
        # like ascii art, as well as source code (which will make snippets
        # lower quality for actual searches for source code, but that is a
        # very small minority of searches in the current state of things).
        special_count = len(special_char_pattern.findall(passage))
        total -= 4 * special_count + math.pow(special_count, 1.5)

        # Each link marker found in the passage costs a flat 30 points.
        link_count = len(link_pattern.findall(passage))
        total -= 30 * link_count

        # Clamp so that heavily penalized passages never score below zero.
        return max(0, total)