💾 Archived View for gmn.clttr.info › sources › geminispace.git › tree › scripts › add_none_charset.p… captured on 2022-06-11 at 23:40:41.

View Raw

More Information

⬅️ Previous capture (2021-12-03)

-=-=-=-=-=-=-

from whoosh.fields import ID
from whoosh.index import open_dir
from whoosh.query import Every

from gus.lib.gemini import GeminiResource
from gus.lib.whoosh_extensions import UrlAnalyzer

def main() -> None:
    """Re-index every document that has no ``charset`` value, setting it to "none".

    Opens the Whoosh index in the ``index`` directory (relative to the current
    working directory), walks every document, and for each one where
    ``"charset" in result`` is false, deletes it and adds a replacement with
    ``charset="none"``. The ``domain`` field is recomputed from the document's
    URL; ``content``, ``regex`` and ``prompt`` are copied when present and set
    to ``None`` otherwise. Prints the number of documents rewritten.

    All changes are made through a single writer and committed once, so the
    migration is applied as a whole or (if an exception is raised) not at all.
    """
    ix = open_dir("index")

    with ix.searcher() as searcher:
        results = searcher.search(Every(), limit=None)
        count = 0

        # One writer for the whole pass: the docnums on the hits belong to the
        # snapshot this searcher was opened on. Committing between deletions
        # may merge segments and renumber documents, which would make later
        # `delete_document(docnum)` calls target the wrong document. A single
        # commit also avoids creating one segment per rewritten document.
        with ix.writer() as writer:
            for result in results:
                if "charset" in result:
                    continue

                def value_or_none(fieldname, hit=result):
                    # Optional fields: copy the value when available.
                    return hit[fieldname] if fieldname in hit else None

                count += 1
                url = result["url"]
                writer.delete_document(result.docnum)
                writer.add_document(
                    url=url,
                    fetchable_url=result["fetchable_url"],
                    domain=GeminiResource(url).normalized_host,
                    content_type=result["content_type"],
                    charset="none",
                    content=value_or_none("content"),
                    regex=value_or_none("regex"),
                    prompt=value_or_none("prompt"),
                    indexed_at=result["indexed_at"],
                )

        # Reached only after the writer block exited cleanly (i.e. committed).
        print("{} documents updated.".format(count))


# Run the migration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()