💾 Archived View for gmn.clttr.info › sources › geminispace.git › tree › scripts › add_domains.py.txt captured on 2022-06-11 at 23:40:37.

View Raw

More Information

⬅️ Previous capture (2021-12-03)

-=-=-=-=-=-=-

from whoosh.fields import TEXT
from whoosh.index import open_dir
from whoosh.query import Every

from gus.lib.gemini import GeminiResource
from gus.lib.whoosh_extensions import UrlAnalyzer

def main():
    """Backfill a ``domain`` field on every document in the ``index`` directory.

    First adds a ``domain`` TEXT field (analyzed with ``UrlAnalyzer``) to the
    index schema. Then every existing document is deleted and re-added with
    the same ``url``, ``content_type``, ``content``, ``prompt`` and
    ``indexed_at`` values plus ``domain``, taken from the ``normalized_host``
    of ``GeminiResource(url)``. Each domain is printed as it is processed.

    All document rewrites happen in one writer, so they are committed
    together; if an exception escapes the loop, the writer's context manager
    cancels instead of committing.
    """
    ix = open_dir("index")

    # The schema change is committed on its own before any reader is opened.
    with ix.writer() as writer:
        writer.add_field("domain", TEXT(analyzer=UrlAnalyzer()))

    # One writer for the whole pass. Document numbers are only meaningful
    # relative to the reader that produced them; committing per document
    # can merge segments and renumber documents, which would make later
    # delete_document(result.docnum) calls hit the wrong document.
    # The searcher is opened first so the writer commits before it closes.
    with ix.searcher() as searcher, ix.writer() as writer:
        results = searcher.search(Every(), limit=None)
        for result in results:
            domain = GeminiResource(result["url"]).normalized_host
            print(domain)
            writer.delete_document(result.docnum)
            writer.add_document(
                url=result["url"],
                domain=domain,
                content_type=result["content_type"],
                # content/prompt are optional stored fields.
                content=result["content"] if "content" in result else None,
                prompt=result["prompt"] if "prompt" in result else None,
                indexed_at=result["indexed_at"],
            )


# Run the migration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()