💾 Archived View for gmn.clttr.info › sources › geminispace.git › tree › scripts › search_index.py.tx… captured on 2022-06-11 at 23:40:49.
⬅️ Previous capture (2021-12-03)
-=-=-=-=-=-=-
"""Print per-content-type size statistics for the documents in the index.

The script opens the Whoosh index stored in the ``index`` directory, groups
the stored ``size`` of every document by its ``content_type`` and prints the
mean, median and maximum size of each sufficiently large group.

An older interactive search mode is kept, commented out, inside ``main`` and
``render_results`` is the renderer that mode used.
"""

import math
import re
import statistics
import sys
from collections import defaultdict

from whoosh import highlight
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser
from whoosh.query import Every

from gus.lib.misc import bytes2human
from gus.lib.whoosh_extensions import GeminiFormatter, GeminiScorer

# Content types for which a textual excerpt is produced.
HIGHLIGHTABLE_CONTENT_TYPES = ("text/plain", "text/gemini", "text/markdown")

# Format string handed to bytes2human for every printed size.
SIZE_FORMAT = "%(value).1f %(symbol)s"

# Content types with fewer indexed documents than this are not reported.
MIN_SAMPLE_COUNT = 16

# Page size assumed when computing the page count in render_results.
RESULTS_PER_PAGE = 10

gemini_highlighter = highlight.Highlighter(
    formatter=GeminiFormatter(),
    fragmenter=highlight.ContextFragmenter(maxchars=160, surround=80),
    scorer=GeminiScorer(),
    order=highlight.SCORE,
)


def get_highlight(result):
    """Return the best highlighted excerpt of a hit's content.

    Args:
        result: a search hit supporting ``in`` and ``[]`` access to its
            ``content`` and ``content_type`` fields.

    Returns:
        The top-scoring fragment produced by ``gemini_highlighter`` with
        every ``GeminiFormatter.between`` separator replaced by a newline,
        or an empty string when the hit has no ``content`` field or its
        content type is not one of ``HIGHLIGHTABLE_CONTENT_TYPES``.
    """
    if "content" not in result:
        return ""
    if result["content_type"] not in HIGHLIGHTABLE_CONTENT_TYPES:
        return ""
    excerpt = gemini_highlighter.highlight_hit(result, "content", top=1)
    return excerpt.replace(GeminiFormatter.between, "\n")


def _collect_sizes_by_content_type(searcher):
    """Group the ``size`` of every hit matching ``Every("size")`` by type."""
    sizes_by_type = defaultdict(list)
    # limit=None asks Whoosh for all matching documents instead of relying
    # on an arbitrarily large hard-coded cap.
    for hit in searcher.search(Every("size"), limit=None):
        sizes_by_type[hit["content_type"]].append(hit["size"])
    return sizes_by_type


def _print_size_statistics(content_type, sizes):
    """Print the mean, median and maximum of ``sizes`` for one content type."""
    print("\n# {} ({})".format(content_type, len(sizes)))
    mean = bytes2human(statistics.mean(sizes), format=SIZE_FORMAT)
    median = bytes2human(statistics.median(sizes), format=SIZE_FORMAT)
    maximum = bytes2human(max(sizes), format=SIZE_FORMAT)
    print("Mean : {:>8}".format(mean))
    print("Median : {:>8}".format(median))
    print("Max : {:>8}".format(maximum))


def main():
    """Open the ``index`` directory and print size statistics per type."""
    ix = open_dir("index")
    # ix.optimize()
    # if len(sys.argv) < 2:
    #     print("Please provide a search query...")
    #     return

    # Stored fields are read while the searcher is still open; after that
    # only plain Python values are needed for the report.
    with ix.searcher() as searcher:
        sizes_by_type = _collect_sizes_by_content_type(searcher)

    for content_type, sizes in sizes_by_type.items():
        if len(sizes) < MIN_SAMPLE_COUNT:
            continue
        _print_size_statistics(content_type, sizes)

    # print("Searching index for: \"%s\"" % sys.argv[1])
    # ix = open_dir("index")
    # with ix.searcher() as searcher:
    #     query = MultifieldParser(["content", "url"], ix.schema).parse(sys.argv[1])
    #     results = searcher.search(query)
    #     render_results(
    #         sys.argv[1],
    #         len(results),
    #         [(
    #             result["indexed_at"],
    #             result.score,
    #             result["url"],
    #             get_highlight(result),
    #         ) for result in results]
    #     )


def render_results(query, num_results, results):
    """Print a plain-text results page to standard output.

    Args:
        query: the query string shown in the page header.
        num_results: total number of hits; used for the header and for the
            page count.
        results: iterable of indexable records where index 2 is the URL
            and index 3 is the excerpt (an empty excerpt is not printed).
    """
    print(" GUS")
    print(" Gemini Universal Search")
    print("==========================")
    print("| You searched for: \"%s\"" % query)
    print("| Number of hits: %s" % num_results)
    print("==========================")
    for position, entry in enumerate(results):
        if position > 0:
            # Blank line between consecutive entries.
            print()
        print("=> %s" % entry[2])
        if entry[3]:
            print("%s" % entry[3])
    print("==========================")
    # Clamp to one page so an empty result set does not read "Page 1 of 0".
    total_pages = max(1, math.ceil(num_results / RESULTS_PER_PAGE))
    print("Page 1 of %s (paging coming later)" % total_pages)


if __name__ == "__main__":
    main()