going-flying.com gemini git repository
aef063ca3c44abfa18092093dd0b93b9f64a4509 - Matthew Ernisse - 1619123142
put this here...
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
'''gemini-stats.py (c) 2020-2021 Matthew J Ernisse <matt@going-flying.com>
All Rights Reserved.

Redistribution and use in source and binary forms,
with or without modification, are permitted provided
that the following conditions are met:

    * Redistributions of source code must retain the
      above copyright notice, this list of conditions
      and the following disclaimer.
    * Redistributions in binary form must reproduce
      the above copyright notice, this list of conditions
      and the following disclaimer in the documentation
      and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
'''
import datetime
import pytz
import re
import sys
import os
from collections import Counter
from urllib.parse import urlparse

# Total width, in characters, of the rules drawn above and below a
# histogram; also the budget the bar length is derived from.
MAXWIDTH = 60

# Longest URL label kept by MollyStats.elide() before truncation.
MAXLABEL = 30


class MollyStats(object):
    ''' Look at a Molly Brown logfile and generate some statistics
    from it.

    Produces:
        Hits per page
        Hits per day
        Errors (url, code tuple)

    Attributes:
        byDay:  Counter of successful hits keyed by day of month (int).
        errors: Counter of failures keyed by the string 'url:code'.
        files:  Counter of successful hits keyed by (elided) URL path.
        start:  Earliest timestamp seen, timezone-aware UTC.
        end:    Latest timestamp seen, timezone-aware UTC.
    '''
    logre = re.compile(
        r'^.*:\s+(?P<date>\d{4}-\d{2}-\d{2}T(\d{2}:){2}\d{2}Z)\s*(?P<addr>\[[a-f0-9:]+\]|[0-9\.]+)\s+(?P<code>\d{2})\s+(?P<url>.+)$'
    )

    def __init__(self, logfile):
        ''' Load logfile and populate the internal data structures.

        logfile is the path of the log to read.  Lines that do not
        match logre are ignored.  Raises OSError if the file cannot
        be opened.
        '''
        self.byDay = Counter()
        self.errors = Counter()
        self.files = Counter()

        # Sentinels that the first matching line will replace: start
        # begins 2**32 seconds after the epoch and end begins at the
        # epoch.  Built with timedelta arithmetic so they do not depend
        # on the width of the platform's time_t.
        epoch = datetime.datetime(1970, 1, 1, tzinfo=pytz.utc)
        self.start = epoch + datetime.timedelta(seconds=2**32)
        self.end = epoch

        # errors='replace' keeps one undecodable byte in a logged
        # request from aborting the whole report.
        with open(logfile, errors='replace') as fd:
            for line in fd:
                matches = self.logre.match(line)
                if not matches:
                    continue

                code = int(matches.group('code'))
                date = datetime.datetime.strptime(
                    matches.group('date'),
                    '%Y-%m-%dT%H:%M:%SZ'
                ).replace(tzinfo=pytz.utc)

                raw_url = matches.group('url')
                try:
                    path = urlparse(raw_url).path
                except ValueError:
                    # urlparse rejects some malformed requests (for
                    # example a broken IPv6 literal); count those
                    # under their raw text instead of crashing.
                    path = raw_url

                url = self.elide(path)

                if date < self.start:
                    self.start = date

                if date > self.end:
                    self.end = date

                # Status codes above 39 are tallied as errors and are
                # excluded from the per-file and per-day hit counts.
                if code > 39:
                    self.errors[f'{url}:{code}'] += 1
                    continue

                self.files[url] += 1

                # NOTE(review): keyed by day-of-month only, so a log
                # covering more than one month merges e.g. the 5th of
                # each month into one bucket -- confirm logs are
                # rotated at least monthly.
                self.byDay[date.day] += 1

    def elide(self, url):
        ''' Shorten url so it fits in a histogram label.

        Anything under /git/cgi/gemini.git/ collapses to a single
        label; any other path longer than MAXLABEL is cut to MAXLABEL
        characters ending in '...'.  Returns the (possibly shortened)
        string.
        '''
        if url.startswith('/git/cgi/gemini.git/'):
            return '/git/cgi/gemini.git...'

        if len(url) > MAXLABEL:
            return url[:MAXLABEL-3] + '...'

        return url


def histogram(data, axis_label, sort_keys=True):
    ''' Pretty print a histogram.

    data is a sequence of (label, count) pairs where label is a str.
    axis_label is the heading printed above the label column.  When
    sort_keys is true the rows are printed by descending count,
    otherwise in the order given.  The caller's sequence is not
    modified.  Output goes to stdout wrapped in ``` fences; nothing
    is returned.
    '''
    if sort_keys:
        rows = sorted(data, key=lambda k: k[1], reverse=True)
    else:
        rows = list(data)

    counts = [row[1] for row in rows]

    # default=0 lets an empty data set print an empty chart rather
    # than raising.
    largest = max(counts, default=0)
    smallest = min(counts, default=0)
    total = sum(counts)

    axis_sz = 0
    for row in rows:
        if len(row[0]) > axis_sz:
            axis_sz = len(row[0]) + 1

    if len(axis_label) > axis_sz:
        axis_sz = len(axis_label) + 1

    # Calculate maximum bar length based on the axis label size.
    # Magic number 7 is the number of spaces and decorative characters
    # in the line that are not accounted for otherwise.  The width of
    # the trailing '(count)' is not reserved.  Clamp to one so very
    # long labels cannot produce a zero or negative bar length.
    bar_sz = max(1, MAXWIDTH - (axis_sz + 7))

    # Smallest whole number of hits per glyph that lets the largest
    # bar fit in bar_sz (integer ceiling division).
    scale = max(1, int(-(-largest // bar_sz)))

    print('```')

    # Center-ish the axis label
    pad = ' ' * ((axis_sz - len(axis_label)) // 2)

    print(f' {pad}{axis_label}')
    print('=' * MAXWIDTH)

    # Finally print the data bars
    for row in rows:
        label, count = row[0], row[1]
        pad = ' ' * (axis_sz - len(label))
        bar = '∎' * round(count / scale)
        print(f'{pad}{label} | {bar} ({count})')

    # and the footer
    print('=' * MAXWIDTH)
    print(f'Min: {smallest}, Max: {largest}, Total: {total}')
    print(f'( ∎ = {scale} )')
    print('```')


def print_stats(stats):
    ''' Print the full report for a MollyStats instance to stdout.

    Emits a heading with the covered time range followed by the
    hits-per-file, hits-per-day and error histograms, then a
    generation timestamp.  All times are shown in US/Eastern.
    '''
    tz = pytz.timezone('US/Eastern')
    fmt = '%m/%d/%Y %H:%M %Z'
    start = stats.start.astimezone(tz).strftime(fmt)
    end = stats.end.astimezone(tz).strftime(fmt)
    now = datetime.datetime.now(tz).strftime(fmt)

    print(f'# Gemini Log Stats for {start} to {end}.')
    print('## Hits:')
    print()

    if stats.files:
        histogram(list(stats.files.items()), 'File')
    else:
        print('No Hits Recorded.')

    print()
    print()
    print('## Hits Per Day:')
    print()

    if stats.byDay:
        # Not sorted by count: keep the days in the order they were
        # first seen in the log.
        histogram(
            [(str(k), v) for k, v in stats.byDay.items()],
            'Day',
            False
        )
    else:
        print('No Hits Recorded.')

    print()
    print()
    print('## Errors:')
    print()

    if stats.errors:
        histogram(list(stats.errors.items()), 'Error Code')
    else:
        print('No Errors Recorded.')

    print()
    print()
    print(f'Generated at {now}.')


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} logfile')

    stats = MollyStats(sys.argv[1])
    print_stats(stats)