💾 Archived View for gmn.clttr.info › sources › geminispace.info.git › tree › gus › lib › index_stati… captured on 2023-06-14 at 14:31:46.


import logging
from datetime import datetime

from peewee import fn, SQL

from gus.excludes import EXCLUDED_URL_PREFIXES
from gus.lib.db_model import Page


def compute_index_statistics(db):
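    """Compute summary statistics for the search index.

    Counts distinct successfully crawled pages (status 20) and the
    distinct domains hosting them (domains matching excluded URL
    prefixes are skipped), and gathers content-type and charset
    frequency breakdowns plus the timestamp of the most recent crawl.
    """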
    page_count = len(Page.raw("""SELECT DISTINCT p.id
FROM page AS p
WHERE last_crawl_success_at IS NOT NULL AND last_success_status == 20""").dicts())

    domains_query = Page.raw("""SELECT DISTINCT p.domain, p.port 
FROM page AS p 
WHERE last_crawl_success_at IS NOT NULL AND last_success_status == 20""")
    domains = []
    for d in domains_query.execute():
        s = d.domain
        if d.port != 1965:
            s = f"{d.domain}:{d.port}"
        should_skip = False
        for excluded_prefix in EXCLUDED_URL_PREFIXES:
            if f"gemini://{s}".startswith(excluded_prefix):
                should_skip = True
                break
        if should_skip:
            continue
        domains.append(s)
    domain_count = len(domains)

    content_type_frequencies = (Page.raw("""SELECT p.content_type, count(p.content_type) as 'count'
FROM  page AS p
WHERE last_crawl_success_at IS NOT NULL AND last_success_status == 20
GROUP BY p.content_type
ORDER BY 2 desc""").dicts())
    # Alias upper(p.charset) so each result row exposes a "charset" key,
    # which log_index_statistics expects.
    charset_frequencies = (Page.raw("""SELECT upper(p.charset) as charset, count(p.id) as 'count'
FROM page AS p
WHERE last_crawl_success_at IS NOT NULL AND last_success_status == 20 AND p.charset IS NOT NULL
GROUP BY upper(p.charset)
ORDER BY 2 desc""").dicts())
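    # The most recent crawl timestamp doubles as the index modification time.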
    index_modification_time = Page.select(fn.Max(Page.last_crawl_at)).scalar()

    return {
        "index_modification_time": index_modification_time,
        "page_count": page_count,
        "domain_count": domain_count,
        "content_type_frequencies": content_type_frequencies,
        "charset_frequencies": charset_frequencies,
        "domains": "",
    }


def log_index_statistics(index_statistics, crawl_statistics=None):
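    """Log the computed index statistics and, if given, crawl statistics."""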
    logging.info('Index generated on: %s',
                  '{:%Y-%m-%d}'.format(index_statistics['index_modification_time']))
    logging.info('Number of pages indexed: %d',
                 index_statistics['page_count'])
    logging.info('Number of domains indexed: %d',
                 index_statistics['domain_count'])

    if crawl_statistics:
        logging.info('Number of redirects crawled: %d',
                     crawl_statistics['redirect_count'])
        logging.info('Number of redirects crawled (nontrivial): %d',
                     crawl_statistics['redirect_nontrivial_count'])
        logging.info('Number of broken URLs encountered while crawling: %d',
                     crawl_statistics['broken_url_count'])

    for entry in index_statistics['content_type_frequencies']:
        logging.info('Number of type "%s" resources indexed: %s',
                     entry['content_type'], entry['count'])

    for entry in index_statistics['charset_frequencies']:
        logging.info('Number of type "%s" charsets indexed: %s',
                     entry['charset'], entry['count'])


def persist_statistics(index_statistics, crawl_statistics, was_destructive, filename):
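    """Append a single serialized statistics line to the given file."""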
    with open(filename, "a") as f:
        f.write(serialize_statistics_line(index_statistics, crawl_statistics, was_destructive))


def serialize_statistics_line(index_statistics, crawl_statistics, was_destructive):
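    """Serialize statistics into one comma-separated line.

    Fields: date, was_destructive, page count, domain count, redirect
    count, nontrivial redirect count, broken URL count, an empty
    placeholder field, then "|"-separated content_type:count and
    charset:count frequency lists.
    """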
    return "{:%Y-%m-%d},{},{},{},{},{},{},{},{},{}\n".format(
        index_statistics["index_modification_time"],
        was_destructive,
        index_statistics["page_count"],
        index_statistics["domain_count"],
        crawl_statistics["redirect_count"] if crawl_statistics else 0,
        crawl_statistics["redirect_nontrivial_count"] if crawl_statistics else 0,
        crawl_statistics["broken_url_count"] if crawl_statistics else 0,
        "", 
        "|".join("{}:{}".format(entry["content_type"], entry["count"]) for entry in index_statistics["content_type_frequencies"]),
        "|".join("{}:{}".format(entry["charset"], entry["count"]) for entry in index_statistics["charset_frequencies"]),
    )


def load_last_statistics_from_file(filename):
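    """Read the statistics file and return its most recent (last) entry."""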
    with open(filename) as f:
        data = f.readlines()
    lastline = data[-1].strip()
    statistics = deserialize_statistics_line(lastline)
    return statistics


def load_all_statistics_from_file(filename):
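    """Read the date, page count, and domain count from every line after the first."""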
    with open(filename) as f:
        data = f.readlines()
    return [{
        "date": datetime.strptime(line.split(",")[0], "%Y-%m-%d"),
        "page_count": line.split(",")[2],
        "domain_count": line.split(",")[3],
    } for line in data[1:]]


def deserialize_statistics_line(line):
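    """Parse a single serialized statistics line back into a statistics dict."""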
    line_parts = line.split(",")
    index_modification_time = datetime.strptime(line_parts[0], "%Y-%m-%d")
    # discard line_parts[1], which is `was_destructive`
    page_count = line_parts[2]
    domain_count = line_parts[3]
    redirect_count = line_parts[4]
    redirect_nontrivial_count = line_parts[5]
    broken_url_count = line_parts[6]
    content_type_frequencies = [(entry.split(":")[0], entry.split(":")[1]) for entry in line_parts[8].split("|")]
    charset_frequencies = [(entry.split(":")[0], entry.split(":")[1]) for entry in line_parts[9].split("|")]

    return {
        "index_modification_time": index_modification_time,
        "page_count": page_count,
        "domain_count": domain_count,
        "redirect_count": redirect_count,
        "redirect_nontrivial_count": redirect_nontrivial_count,
        "broken_url_count": broken_url_count,
        "domains": "",
        "content_type_frequencies": content_type_frequencies,
        "charset_frequencies": charset_frequencies,
    }