💾 Archived View for gmn.clttr.info › sources › geminispace.info.git › tree › scripts › search_index.… captured on 2024-02-05 at 10:06:48.

View Raw

More Information

⬅️ Previous capture (2023-01-29)

-=-=-=-=-=-=-

import math
import re
import statistics
import sys

from whoosh.index import open_dir
from whoosh.query import Every
from whoosh.qparser import MultifieldParser
from whoosh import highlight

from gus.lib.whoosh_extensions import GeminiFormatter, GeminiScorer
from gus.lib.misc import bytes2human

# Module-level highlighter shared by get_highlight() to build result excerpts.
# It combines the project's GeminiFormatter and GeminiScorer with Whoosh's
# ContextFragmenter; fragments are ordered by score (highlight.SCORE).
# NOTE(review): per the Whoosh docs, maxchars caps the fragment length and
# surround is the amount of context kept around matched terms -- confirm
# against the installed Whoosh version.
gemini_highlighter = highlight.Highlighter(
    formatter=GeminiFormatter(),
    fragmenter=highlight.ContextFragmenter(maxchars=160, surround=80),
    scorer=GeminiScorer(),
    order=highlight.SCORE,
)

def get_highlight(result):
    """Return a highlighted excerpt of a hit's content, or "" if unavailable.

    An empty string is returned when the hit has no "content" entry or when
    its "content_type" is not one of the textual types handled here.
    Otherwise the single best fragment is produced by the module-level
    ``gemini_highlighter`` and every occurrence of
    ``GeminiFormatter.between`` in it is replaced with a newline.

    Args:
        result: A search hit supporting ``in`` and item access for the
            "content" and "content_type" keys.

    Returns:
        The excerpt text, or "" when nothing can be highlighted.
    """
    has_content = "content" in result
    if not has_content:
        return ""

    highlightable_types = ("text/plain", "text/gemini", "text/markdown")
    if result["content_type"] in highlightable_types:
        # Only the top-scoring fragment is requested.
        excerpt = gemini_highlighter.highlight_hit(result, "content", top=1)
        return excerpt.replace(GeminiFormatter.between, "\n")
    return ""


def main():
    """Print size statistics per content type for the index in "index".

    Every document matched by ``Every("size")`` is grouped by its
    "content_type"; for each content type with at least 16 documents the
    mean, median and maximum of the "size" values are printed in
    human-readable form via ``bytes2human``.

    The commented-out sections are the interactive search mode that this
    function previously offered; they are kept for reference.
    """
    index = open_dir("index")
    # ix.optimize()
    # if len(sys.argv) < 2:
    #     print("Please provide a search query...")
    #     return

    human_format = "%(value).1f %(symbol)s"

    def humanize(num_bytes):
        # Render a byte count with one decimal place and a unit symbol.
        return bytes2human(num_bytes, format=human_format)

    with index.searcher() as searcher:
        hits = searcher.search(Every("size"), limit=9999999)

        # Collect the sizes of all hits, bucketed by content type.
        sizes_by_type = {}
        for hit in hits:
            sizes_by_type.setdefault(hit["content_type"], []).append(hit["size"])

        for content_type, sizes in sizes_by_type.items():
            # Content types with fewer than 16 documents are not reported.
            if len(sizes) >= 16:
                print("\n# {} ({})".format(content_type, len(sizes)))
                rows = [
                    ("Mean   : {:>8}", humanize(statistics.mean(sizes))),
                    ("Median : {:>8}", humanize(statistics.median(sizes))),
                    ("Max    : {:>8}", humanize(max(sizes))),
                ]
                for template, text in rows:
                    print(template.format(text))

    # print("Searching index for: \"%s\"" % sys.argv[1])
    # ix = open_dir("index")
    # with ix.searcher() as searcher:
    #     query = MultifieldParser(["content", "url"], ix.schema).parse(sys.argv[1])

    #     results = searcher.search(query)
    #     render_results(
    #         sys.argv[1],
    #         len(results),
    #         [(
    #             result["indexed_at"],
    #             result.score,
    #             result["url"],
    #             get_highlight(result),
    #         ) for result in results]
    #     )


def render_results(query, num_results, results):
    """Print a plain-text results page for a search to standard output.

    The page consists of a fixed banner, the query and hit count, one entry
    per result (a "=> URL" link line followed by the excerpt when there is
    one, with a blank line between entries) and a paging footer.

    Args:
        query: The search string, echoed in the header.
        num_results: Total number of hits; shown in the header and used to
            compute the page total at 10 results per page.
        results: Iterable of indexable records where index 2 is the URL and
            index 3 is the excerpt text (empty or None when unavailable).
    """
    separator = "=========================="

    print("          GUS")
    print(" Gemini Universal Search")
    print(separator)
    # str.format is used instead of "%" so that a tuple-valued argument is
    # printed as-is rather than being unpacked as format arguments.
    print("| You searched for: \"{}\"".format(query))
    print("| Number of hits: {}".format(num_results))
    print(separator)

    for position, result in enumerate(results):
        if position > 0:
            print()
        print("=> {}".format(result[2]))
        excerpt = result[3]
        if excerpt:
            print("{}".format(excerpt))

    print(separator)
    # There is always at least one page, even when nothing matched;
    # otherwise the footer would read "Page 1 of 0".
    total_pages = max(1, math.ceil(num_results / 10))
    print("Page 1 of {} (paging coming later)".format(total_pages))


# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()