import argparse
import logging
from datetime import datetime, timedelta
from urllib.parse import uses_relative, uses_netloc

from peewee import fn

from . import constants
from gus.crawl import should_skip
from gus.excludes import EXCLUDED_URL_PREFIXES
from gus.lib.db_model import init_db, Page, PageContent
from gus.lib.gemini import GeminiResource
from gus.lib.index_statistics import (
    compute_index_statistics,
    persist_statistics,
    log_index_statistics,
)
import gus.lib.logging
from gus.lib.logging import strip_control_chars
import gus.lib.search as search

# hack: the built-in methods in urllib need to know the
# Gemini protocol exists
uses_relative.append("gemini")
uses_netloc.append("gemini")


def index_page(index, page):
    if should_skip(GeminiResource(page.url)):
        logging.debug(
            "URL is excluded, skipping: %s",
            strip_control_chars(page.url),
        )
        return False

    logging.info("Indexing page: %s", strip_control_chars(page.url))

    # count backlinks from other hosts so they can be stored with the document
    u = page.url.rstrip("/")
    external_backlinks = Page.raw(
        """SELECT p_from.url
FROM page AS p_from
JOIN link AS l ON l.from_page_id == p_from.id
JOIN page AS p_to ON p_to.id == l.to_page_id
WHERE p_to.url == ?
AND l.is_cross_host_like == 1""",
        u,
    )
    logging.debug("Calculating backlinks for %s", u)
    backlink_urls = [b.url for b in external_backlinks.execute()]
    backlink_count = len(backlink_urls)

    document = {
        "url_id": page.url,
        "url": page.url,
        "domain": page.domain,
        "port": page.port,
        "content_type": page.content_type,
        "charset": page.charset or "none",
        "lang": page.lang,
        "size": page.size,
        "indexed_at": datetime.utcnow(),
        "backlink_count": backlink_count,
    }

    pagecontent = PageContent.get_or_none(page_id=page.id)
    if pagecontent is not None:
        document["prompt"] = pagecontent.prompt
        document["content"] = pagecontent.content

    try:
        logging.debug("Adding document to index: %s", page.url)
        index.add_document(document)
        logging.debug("Document done")
        return True
    except Exception as e:
        logging.exception(
            "Failed to index page: %s: %s",
            strip_control_chars(page.url),
            e,
        )
        return False


def build_index(should_run_destructive=False):
    index_dir = (
        constants.INDEX_DIR_NEW if should_run_destructive else constants.INDEX_DIR
    )

    db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
    index = search.Index(index_dir, should_run_destructive)

    # delete pages that were never successfully crawled
    count = 0
    q = Page.select().where(
        Page.last_crawl_success_at.is_null(True) & Page.last_crawl_at.is_null(False)
    )
    for page in q.iterator():
        try:
            index.delete_by_term("url_id", page.url)
            count += page.delete_instance()
        except Exception as e:
            logging.error(
                "Failed to delete row %s without successful crawl: %s", page.url, e
            )
    logging.warning("Deleted %d rows without successful crawl", count)

    # delete pages whose last successful crawl is older than 30 days and which
    # have been recrawled since then; this avoids deleting files that have a
    # change_frequency longer than our timeout
    count = 0
    q = Page.select().where(
        (Page.last_crawl_at > Page.last_crawl_success_at)
        & (Page.last_crawl_success_at < (datetime.now() - timedelta(days=30)))
    )
    for page in q.iterator():
        try:
            index.delete_by_term("url_id", page.url)
            count += page.delete_instance()
        except Exception as e:
            logging.error(
                "Failed to delete row %s with outdated successful crawl: %s", page.id, e
            )
    logging.warning("Deleted %d rows with outdated successful crawl", count)

    # delete entire domains that have no page with a recent successful crawl
    last_valid_timestamp = datetime.now() - timedelta(days=30)
    outdated_domains_query = (
        Page.select(
            Page.domain,
            fn.MAX(Page.last_crawl_at).alias("last_crawl_at"),
            fn.MAX(Page.last_crawl_success_at).alias("last_crawl_success_at"),
        )
        .where(
            Page.last_crawl_at.is_null(False)
            & Page.last_crawl_success_at.is_null(False)
        )
        .group_by(Page.domain)
    )
    domains = outdated_domains_query.execute()
    for del_domain in domains:
        try:
            if (
                del_domain.last_crawl_success_at < last_valid_timestamp
                and del_domain.last_crawl_at > del_domain.last_crawl_success_at
            ):
                logging.warning(
                    "Deleting pages for domain: %s, last crawl: %s, last crawl success: %s",
                    del_domain.domain,
                    del_domain.last_crawl_at,
                    del_domain.last_crawl_success_at,
                )
                outdated_pages_query = Page.select(Page.url).where(
                    Page.domain == del_domain.domain
                )
                for outdated_page in outdated_pages_query.iterator():
                    # we need to delete every single page, as "delete_by_term" does
                    # not work on fields that are run through a stemmer, which is
                    # the case for "domain" as a text field
                    index.delete_by_term("url_id", outdated_page.url)
                    outdated_page.delete_instance()
        except Exception as e:
            logging.error(
                "Failed to delete domain %s (last crawl: %s, last crawl success: %s) with outdated successful crawl: %s",
                del_domain.domain,
                del_domain.last_crawl_at,
                del_domain.last_crawl_success_at,
                e,
            )

    # select pages to (re)index: on a destructive run, every successfully crawled
    # page; otherwise only pages changed since they were last indexed
    if should_run_destructive:
        pages = Page.raw(
            """SELECT p.* FROM page AS p
WHERE p.last_success_status == 20
AND (p.content_type NOT LIKE 'text/%'
OR (p.content_type LIKE 'text/%' AND p.size <= ?))""",
            constants.MAXIMUM_TEXT_PAGE_SIZE,
        )
    else:
        pages = Page.raw(
            """SELECT p.* FROM page AS p
WHERE p.last_success_status == 20
AND (p.indexed_at IS NULL OR
p.indexed_at < p.last_crawl_success_at)
AND (p.content_type NOT LIKE 'text/%'
OR (p.content_type LIKE 'text/%' AND p.size <= ?))""",
            constants.MAXIMUM_TEXT_PAGE_SIZE,
        )

    for page in pages:
        index_page(index, page)
        page.indexed_at = datetime.utcnow()
        page.save()

    try:
        logging.info("Committing search index...")
        index.close()
    except Exception as e:
        logging.error("Closing of index failed: %s", e)

    logging.debug("Updating statistics...")
    index_statistics = compute_index_statistics(db)
    log_index_statistics(index_statistics)
    persist_statistics(index_statistics, None, should_run_destructive, "statistics.csv")

    logging.info("Finished!")


def main():
    args = parse_args()
    gus.lib.logging.handle_arguments(args)
    build_index(args.should_run_destructive)


def parse_args():
    parser = argparse.ArgumentParser(description="Crawl Geminispace.")
    parser.add_argument(
        "--destructive",
        "-d",
        dest="should_run_destructive",
        action="store_true",
        default=False,
        help="create a fresh index",
    )
    gus.lib.logging.add_arguments(parser)
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    main()