# gus/build_index.py (geminispace.info.git, archive capture 2023-06-14)

import argparse
import logging

from datetime import datetime, timedelta
from urllib.parse import uses_relative, uses_netloc
from peewee import fn

from . import constants
from gus.crawl import should_skip
from gus.excludes import EXCLUDED_URL_PREFIXES
from gus.lib.db_model import init_db, Page, PageContent
from gus.lib.gemini import GeminiResource
from gus.lib.index_statistics import (
    compute_index_statistics,
    persist_statistics,
    log_index_statistics,
)
import gus.lib.logging
from gus.lib.logging import strip_control_chars
import gus.lib.search as search

# hack: the built-in methods in urllib need to know the
# Gemini protocol exists
uses_relative.append("gemini")
uses_netloc.append("gemini")


def index_page(index, page):
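    """
    Add a single page to the search index.

    Returns True when the page was indexed, False when it is excluded
    or indexing failed.
    """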
    if should_skip(GeminiResource(page.url)):
        logging.debug(
            "URL is excluded, skipping: %s",
            strip_control_chars(page.url),
        )
        return False

    logging.info("Indexing page: %s", strip_control_chars(page.url))

    u = page.url.rstrip("/")
    external_backlinks = Page.raw(
        """SELECT p_from.url
FROM page AS p_from
JOIN link as l ON l.from_page_id == p_from.id
JOIN page as p_to ON p_to.id == l.to_page_id
WHERE p_to.url == ?
AND l.is_cross_host_like == 1""",
        u
    )

    logging.debug("Calculating backlinks for %s", u)
    backlink_urls = [b.url for b in external_backlinks.execute()]
    backlink_count = len(backlink_urls)

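    # assemble the document that is stored in the search index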
    document = {
        "url_id": page.url,
        "url": page.url,
        "domain": page.domain,
        "port": page.port,
        "content_type": page.content_type,
        "charset": page.charset or "none",
        "lang": page.lang,
        "size": page.size,
        "indexed_at": datetime.utcnow(),
        "backlink_count": backlink_count,
    }

    pagecontent = PageContent.get_or_none(page_id=page.id)
    if pagecontent is not None:
        document["prompt"] = pagecontent.prompt
        document["content"] = pagecontent.content

    try:
        logging.debug("Adding document to index: %s", page.url)
        index.add_document(document)
        logging.debug("Document done")

        return True
    except Exception as e:
        logging.exception(
            "Failed to index page: %s: %s",
            strip_control_chars(page.url),
            e
        )
        return False


def build_index(should_run_destructive=False):
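    """
    Build or update the search index from the page database.

    With should_run_destructive a fresh index is created in INDEX_DIR_NEW,
    otherwise the existing index in INDEX_DIR is updated incrementally.
    Stale pages and domains are purged from both the database and the index
    before eligible pages are (re)indexed.
    """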
    index_dir = constants.INDEX_DIR_NEW if should_run_destructive else constants.INDEX_DIR

    db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
    index = search.Index(index_dir, should_run_destructive)

    # delete pages that were never crawled successfully
    count = 0
    q = Page.select().where(
        Page.last_crawl_success_at.is_null(True) & Page.last_crawl_at.is_null(False)
    )
    for page in q.iterator():
        try:
            index.delete_by_term("url_id", page.url)
            count += page.delete_instance()
        except Exception as e:
            logging.error("Failed to delete row %s without successful crawl: %s", page.url, e)
    logging.warning("Deleted %d rows without successful crawl", count)

    # delete pages whose last successful crawl is older than 30 days and which have been recrawled since then;
    # this avoids deleting files that have a change_frequency longer than our timeout
    count = 0
    q = Page.select().where(
        (Page.last_crawl_at > Page.last_crawl_success_at)
        & (Page.last_crawl_success_at < (datetime.now() - timedelta(days=30)))
    )
    for page in q.iterator():
        try:
            index.delete_by_term("url_id", page.url)
            count += page.delete_instance()
        except Exception as e:
            logging.error("Failed to delete row %s with outdated successful crawl: %s", page.id, e)
    logging.warning("Deleted %d rows with outdated successful crawl", count)

    # delete entire domains that have no page with a recent successful crawl
    last_valid_timestamp = datetime.now() - timedelta(days=30)
    outdated_domains_query = (
        Page.select(
            Page.domain,
            fn.MAX(Page.last_crawl_at).alias("last_crawl_at"),
            fn.MAX(Page.last_crawl_success_at).alias("last_crawl_success_at"),
        )
        .where(Page.last_crawl_at.is_null(False) & Page.last_crawl_success_at.is_null(False))
        .group_by(Page.domain)
    )
    domains = outdated_domains_query.execute()
    for del_domain in domains:
        try:
            if (del_domain.last_crawl_success_at < last_valid_timestamp
                    and del_domain.last_crawl_at > del_domain.last_crawl_success_at):
                logging.warning("Deleting pages for domain: %s, last crawl: %s, last crawl success: %s",
                                del_domain.domain, del_domain.last_crawl_at, del_domain.last_crawl_success_at)
                outdated_pages_query = Page.select(Page.url).where(Page.domain == del_domain.domain)
                for outdated_page in outdated_pages_query.iterator():
                    # delete pages one by one: "delete_by_term" does not work on fields
                    # that are run through a stemmer, as the "domain" text field is
                    index.delete_by_term("url_id", outdated_page.url)
                    outdated_page.delete_instance()
        except Exception as e:
            logging.error("Failed to delete domain %s (last crawl: %s, last crawl success: %s) with outdated successful crawl: %s",
                          del_domain.domain, del_domain.last_crawl_at, del_domain.last_crawl_success_at, e)

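    # select pages to (re)index: last crawl returned status 20 and the content is
    # either non-text or text small enough to index; in incremental mode only pages
    # not indexed since their last successful crawl are picked up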
    if should_run_destructive:
        pages = Page.raw(
            """SELECT p.* FROM page AS p
WHERE p.last_success_status == 20
AND (p.content_type NOT LIKE 'text/%'
OR (p.content_type LIKE 'text/%' AND p.size <= ?))""",
            constants.MAXIMUM_TEXT_PAGE_SIZE,
        )
    else:
        pages = Page.raw(
            """SELECT p.* FROM page AS p
WHERE p.last_success_status == 20
AND (p.indexed_at IS NULL OR
p.indexed_at < p.last_crawl_success_at)
AND (p.content_type NOT LIKE 'text/%'
OR (p.content_type LIKE 'text/%' AND p.size <= ?))""",
            constants.MAXIMUM_TEXT_PAGE_SIZE,
        )

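    # index each selected page and record when it was indexed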
    for page in pages:
        index_page(index, page)
        page.indexed_at = datetime.utcnow()
        page.save()

    try:
        logging.info("Committing search index...")
        index.close()
    except Exception as e:
        logging.error("Closing of index failed: %s", e)

    logging.debug("Updating statistics...")
    index_statistics = compute_index_statistics(db)
    log_index_statistics(index_statistics)
    persist_statistics(index_statistics, None, should_run_destructive, "statistics.csv")

    logging.info("Finished!")


def main():
    args = parse_args()
    gus.lib.logging.handle_arguments(args)
    build_index(args.should_run_destructive)


def parse_args():
    parser = argparse.ArgumentParser(description="Build the Geminispace search index.")
    parser.add_argument(
        "--destructive",
        "-d",
        dest="should_run_destructive",
        action="store_true",
        default=False,
        help="create a fresh index",
    )
    gus.lib.logging.add_arguments(parser)
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    main()