import logging
from datetime import datetime

from peewee import fn

from gus.excludes import EXCLUDED_URL_PREFIXES
from gus.lib.db_model import Page


def compute_index_statistics(db):
    # The db argument is unused here; all queries run against the database
    # the Page model is bound to. Only pages with at least one successful
    # crawl and a Gemini success status (20) count toward the index.
    page_count = len(
        Page.raw(
            """SELECT DISTINCT p.id
               FROM page AS p
               WHERE last_crawl_success_at IS NOT NULL
               AND last_success_status = 20"""
        ).dicts()
    )

    domains_query = Page.raw(
        """SELECT DISTINCT p.domain, p.port
           FROM page AS p
           WHERE last_crawl_success_at IS NOT NULL
           AND last_success_status = 20"""
    )
    domains = []
    for d in domains_query.execute():
        # Omit the port when it is the Gemini default (1965).
        s = d.domain if d.port == 1965 else f"{d.domain}:{d.port}"
        # Skip any domain that falls under an excluded URL prefix.
        if any(
            f"gemini://{s}".startswith(excluded_prefix)
            for excluded_prefix in EXCLUDED_URL_PREFIXES
        ):
            continue
        domains.append(s)
    domain_count = len(domains)

    content_type_frequencies = Page.raw(
        """SELECT p.content_type, count(p.content_type) AS 'count'
           FROM page AS p
           WHERE last_crawl_success_at IS NOT NULL
           AND last_success_status = 20
           GROUP BY p.content_type
           ORDER BY 2 DESC"""
    ).dicts()

    # Alias the upper(...) expression so each row exposes a "charset" key;
    # without the alias the dict key would be the literal expression text,
    # and log_index_statistics would fail on entry['charset'].
    charset_frequencies = Page.raw(
        """SELECT upper(p.charset) AS charset, count(p.id) AS 'count'
           FROM page AS p
           WHERE last_crawl_success_at IS NOT NULL
           AND last_success_status = 20
           AND p.charset IS NOT NULL
           GROUP BY upper(p.charset)
           ORDER BY 2 DESC"""
    ).dicts()

    index_modification_time = Page.select(fn.Max(Page.last_crawl_at)).scalar()

    return {
        "index_modification_time": index_modification_time,
        "page_count": page_count,
        "domain_count": domain_count,
        "content_type_frequencies": content_type_frequencies,
        "charset_frequencies": charset_frequencies,
        "domains": "",
    }


def log_index_statistics(index_statistics, crawl_statistics=None):
    logging.info('Index generated on: %s',
                 '{:%Y-%m-%d}'.format(index_statistics['index_modification_time']))
    logging.info('Number of pages indexed: %d', index_statistics['page_count'])
    logging.info('Number of domains indexed: %d', index_statistics['domain_count'])
    if crawl_statistics:
        logging.info('Number of redirects crawled: %d',
                     crawl_statistics['redirect_count'])
        logging.info('Number of redirects crawled (nontrivial): %d',
                     crawl_statistics['redirect_nontrivial_count'])
        logging.info('Number of broken URLs encountered while crawling: %d',
                     crawl_statistics['broken_url_count'])
    for entry in index_statistics['content_type_frequencies']:
        logging.info('Number of type "%s" resources indexed: %s',
                     entry['content_type'], entry['count'])
    for entry in index_statistics['charset_frequencies']:
        logging.info('Number of charset "%s" resources indexed: %s',
                     entry['charset'], entry['count'])


def persist_statistics(index_statistics, crawl_statistics, was_destructive, filename):
    with open(filename, "a") as f:
        f.write(serialize_statistics_line(index_statistics, crawl_statistics,
                                          was_destructive))


def serialize_statistics_line(index_statistics, crawl_statistics, was_destructive):
    # One CSV line per crawl. The empty eighth field is the "domains"
    # placeholder, mirroring the empty "domains" entry in the statistics dict.
    return "{:%Y-%m-%d},{},{},{},{},{},{},{},{},{}\n".format(
        index_statistics["index_modification_time"],
        was_destructive,
        index_statistics["page_count"],
        index_statistics["domain_count"],
        crawl_statistics["redirect_count"] if crawl_statistics else 0,
        crawl_statistics["redirect_nontrivial_count"] if crawl_statistics else 0,
        crawl_statistics["broken_url_count"] if crawl_statistics else 0,
        "",
        "|".join("{}:{}".format(entry["content_type"], entry["count"])
                 for entry in index_statistics["content_type_frequencies"]),
        "|".join("{}:{}".format(entry["charset"], entry["count"])
                 for entry in index_statistics["charset_frequencies"]),
    )
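
# For illustration only, a hypothetical serialized line (the counts below are
# made up; the field order follows serialize_statistics_line above):
#
#   date,was_destructive,page_count,domain_count,redirect_count,
#   redirect_nontrivial_count,broken_url_count,domains,content_types,charsets
#
#   2021-03-14,False,123456,789,42,17,5,,text/gemini:120000|text/plain:3456,UTF-8:100000|US-ASCII:23456
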
def load_last_statistics_from_file(filename):
    with open(filename) as f:
        data = f.readlines()
    lastline = data[-1].strip()
    statistics = deserialize_statistics_line(lastline)
    return statistics


def load_all_statistics_from_file(filename):
    with open(filename) as f:
        data = f.readlines()
    # Skip the first line, then keep only date, page count, and domain count
    # from each record. Counts are cast to int so callers can do arithmetic.
    return [{
        "date": datetime.strptime(line.split(",")[0], "%Y-%m-%d"),
        "page_count": int(line.split(",")[2]),
        "domain_count": int(line.split(",")[3]),
    } for line in data[1:]]


def deserialize_statistics_line(line):
    line_parts = line.split(",")
    index_modification_time = datetime.strptime(line_parts[0], "%Y-%m-%d")
    # discard line_parts[1], which is `was_destructive`
    # Cast counts to int so the result works with log_index_statistics,
    # whose format strings expect numbers (%d).
    page_count = int(line_parts[2])
    domain_count = int(line_parts[3])
    redirect_count = int(line_parts[4])
    redirect_nontrivial_count = int(line_parts[5])
    broken_url_count = int(line_parts[6])
    # line_parts[7] is the empty "domains" placeholder field.
    # Note: frequencies are deserialized as (value, count) tuples here,
    # unlike the dict rows produced by compute_index_statistics.
    content_type_frequencies = [
        (entry.split(":")[0], entry.split(":")[1])
        for entry in line_parts[8].split("|")
    ]
    charset_frequencies = [
        (entry.split(":")[0], entry.split(":")[1])
        for entry in line_parts[9].split("|")
    ]
    return {
        "index_modification_time": index_modification_time,
        "page_count": page_count,
        "domain_count": domain_count,
        "redirect_count": redirect_count,
        "redirect_nontrivial_count": redirect_nontrivial_count,
        "broken_url_count": broken_url_count,
        "domains": "",
        "content_type_frequencies": content_type_frequencies,
        "charset_frequencies": charset_frequencies,
    }
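

# A minimal usage sketch, not part of the original module. It assumes the
# peewee database behind the Page model has already been initialized by the
# caller (as the crawler does before computing statistics); the path
# "statistics.csv" is a hypothetical example.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    statistics = compute_index_statistics(None)  # db argument is unused above
    log_index_statistics(statistics)
    persist_statistics(statistics, None, False, "statistics.csv")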