"""Index migration: add a ``domain`` field and backfill it on every document.

Opens the Whoosh index in the ``index`` directory, adds a ``domain`` text
field to its schema, and rewrites each stored document so that ``domain``
holds ``GeminiResource(url).normalized_host`` for that document's URL.
"""
from whoosh.fields import TEXT
from whoosh.index import open_dir
from whoosh.query import Every

from gus.lib.gemini import GeminiResource
from gus.lib.whoosh_extensions import UrlAnalyzer


def main():
    """Add the ``domain`` field to the index and repopulate every document.

    The schema change is committed first. Then all documents are replaced
    inside a single writer session: each hit is deleted by document number
    and re-added with the same stored values plus the new ``domain`` value.
    The domain of every processed document is printed as progress output.

    If anything raises during the backfill, the writer's context manager
    cancels the session, so no partial rewrite is committed (the schema
    change from the first step remains).
    """
    ix = open_dir("index")

    # Commit the schema change on its own so the searcher and the writer
    # used for the backfill both start from an index that has the field.
    with ix.writer() as writer:
        writer.add_field("domain", TEXT(analyzer=UrlAnalyzer()))

    # One writer for the whole backfill. Document numbers reported by the
    # searcher are only meaningful relative to the index state it was opened
    # on; committing after every document (as a per-document writer would)
    # can merge segments and renumber documents, making later
    # delete_document(docnum) calls target the wrong entry. A single session
    # also means a single commit instead of one per document.
    with ix.searcher() as searcher, ix.writer() as writer:
        results = searcher.search(Every(), limit=None)
        for result in results:
            url = result["url"]
            domain = GeminiResource(url).normalized_host
            print(domain)

            # Replace the old document with a copy carrying the new field.
            writer.delete_document(result.docnum)
            writer.add_document(
                url=url,
                domain=domain,
                content_type=result["content_type"],
                # "content" and "prompt" are not present on every hit.
                content=result["content"] if "content" in result else None,
                prompt=result["prompt"] if "prompt" in result else None,
                indexed_at=result["indexed_at"],
            )


if __name__ == "__main__":
    main()