import math
import re
from urllib.parse import urlparse

from whoosh import highlight
from whoosh.analysis import (
    IntraWordFilter,
    LowercaseFilter,
    RegexTokenizer,
    StemFilter,
)


def UrlAnalyzer():
    """Tokenizes a URL into its hostname and path components.

    Composes a RegexTokenizer (which strips the scheme and port and
    splits on the ".", "/", and "?" delimiters) with an IntraWordFilter,
    a LowercaseFilter, and a StemFilter. Note that query-string
    parameters are tokenized along with the rest of the URL.

    >>> ana = UrlAnalyzer()
    >>> [token.text for token in ana("gemini://foo.bar.baz/hum/drum?har=floom")]
    ['foo', 'bar', 'baz', 'hum', 'drum', 'har', 'floom']
    """
    return (
        RegexTokenizer(expression=r":1965|^gemini://|[/\.\?]", gaps=True)
        | IntraWordFilter()
        | LowercaseFilter()
        | StemFilter()
    )


class GeminiFormatter(highlight.Formatter):
    """Formats each fragment as a single gemtext bullet line, with "..."
    marking where the fragment was cut from the surrounding text and all
    whitespace collapsed to single spaces. Fragments are joined with
    "<>" so callers can split the result back into separate lines.
    """

    between = "<>"

    def format_token(self, text, token, replace=False):
        # Return the text corresponding to the token unchanged: gemtext
        # has no inline markup with which to highlight the match.
        return highlight.get_text(text, token, replace)

    def format_fragment(self, fragment, replace=False):
        """Returns a formatted version of the given text, using the
        "token" objects in the given :class:`Fragment`.

        :param fragment: a :class:`Fragment` object representing a list
            of matches in the text.
        :param replace: if True, the original text corresponding to each
            match will be replaced with the value of the token object's
            ``text`` attribute.
        """
        output = ["* ..."]
        index = fragment.startchar
        text = fragment.text

        for t in fragment.matches:
            if t.startchar is None:
                continue
            if t.startchar < index:
                continue
            if t.startchar > index:
                output.append(self._text(text[index : t.startchar]))
            output.append(self.format_token(text, t, replace))
            index = t.endchar
        output.append(self._text(text[index : fragment.endchar]))
        output.append("...")

        # Collapse the fragment onto a single line with single spaces so
        # it renders cleanly as one gemtext list item.
        out_string = "".join(output)
        out_string = out_string.replace("\n", " ").replace("\r", " ")
        out_string = " ".join(out_string.split())
        return out_string


# Non-word characters that are also not typically found in textual
# content. The hyphen is placed last in the class so it is a literal
# character rather than a range.
special_char_pattern = re.compile(r"[^\w\s,.;?!'-]")
link_pattern = re.compile("://|=>")


class GeminiScorer(highlight.FragmentScorer):
    def __call__(self, f):
        # Add up the boosts for the matched terms in this passage.
        score = sum(t.boost for t in f.matches)

        # Favor diversity: multiply the score by the number of separate
        # terms matched.
        score *= (len(f.matched_terms) * 100) or 1

        # Lower the score substantially for any special characters we
        # find, where special characters are non-word characters that
        # are also not typically found in textual content. This should
        # penalize things like ASCII art, as well as source code (which,
        # I suppose, will make snippets lower quality for actual
        # searches for source code, but that is a very small minority of
        # searches in the current state of things).
        num_special_chars = len(
            special_char_pattern.findall(f.text[f.startchar : f.endchar])
        )
        score -= 4 * num_special_chars + math.pow(num_special_chars, 1.5)

        # Penalize fragments full of URLs and gemtext link lines, which
        # make for poor snippets.
        num_links = len(link_pattern.findall(f.text[f.startchar : f.endchar]))
        score -= 30 * num_links

        return max(0, score)
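

# --- Usage sketch ---
# A minimal, illustrative example of how the pieces above plug into a
# Whoosh index and search. The schema, field names, and sample document
# below are assumptions for demonstration only; they are not part of
# this module or defined anywhere else in it.
if __name__ == "__main__":
    from whoosh.fields import ID, TEXT, Schema
    from whoosh.filedb.filestore import RamStorage
    from whoosh.qparser import QueryParser

    # Hypothetical schema: index the URL with UrlAnalyzer so a query for
    # "foo" or "drum" can match "gemini://foo.bar.baz/hum/drum".
    schema = Schema(
        url=ID(stored=True, unique=True),
        url_text=TEXT(analyzer=UrlAnalyzer()),
        content=TEXT(stored=True),
    )
    ix = RamStorage().create_index(schema)
    with ix.writer() as writer:
        writer.add_document(
            url="gemini://foo.bar.baz/hum/drum",
            url_text="gemini://foo.bar.baz/hum/drum",
            content="A page about drums. Drums are percussion instruments.",
        )

    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse("drums")
        results = searcher.search(query)
        # Plug the custom scorer and formatter into the highlighter used
        # for snippet extraction; the fragmenter settings are arbitrary.
        results.highlighter = highlight.Highlighter(
            fragmenter=highlight.ContextFragmenter(maxchars=160, surround=40),
            scorer=GeminiScorer(),
            formatter=GeminiFormatter(),
        )
        for hit in results:
            print(hit["url"])
            # Each "<>"-separated piece is one "* ..." bullet line.
            for line in hit.highlights("content").split("<>"):
                print(line)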