gemini.git

going-flying.com gemini git repository

summary

tree

log

refs

aef063ca3c44abfa18092093dd0b93b9f64a4509 - Matthew Ernisse - 1619123142

put this here...

view tree

view raw

diff --git a/files/gemini-stats.py b/files/gemini-stats.py
new file mode 100644
index 0000000..ed5b37f
--- /dev/null
+++ b/files/gemini-stats.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+'''gemini-stats.py (c) 2020-2021 Matthew J Ernisse <matt@going-flying.com>
+All Rights Reserved.
+
+Redistribution and use in source and binary forms,
+with or without modification, are permitted provided
+that the following conditions are met:
+
+    * Redistributions of source code must retain the
+      above copyright notice, this list of conditions
+      and the following disclaimer.
+    * Redistributions in binary form must reproduce
+      the above copyright notice, this list of conditions
+      and the following disclaimer in the documentation
+      and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+import datetime
+import pytz
+import re
+import sys
+import os
+from urllib.parse import urlparse
+
+# Width, in characters, of the rule lines histogram() prints and the
+# budget from which it derives the maximum bar length.
+MAXWIDTH = 60
+# Paths longer than this are truncated by MollyStats.elide().
+MAXLABEL = 30
+
+class MollyStats(object):
+	''' Look at a Molly Brown logfile and generate some statistics
+	from it.
+
+	Produces:
+		Hits per page (self.files, keyed by elided URL path)
+		Hits per day (self.byDay, keyed by day of month)
+		Errors (self.errors, keyed by 'url:code')
+
+	self.start and self.end hold the earliest and latest timestamps
+	seen, as timezone-aware UTC datetimes.
+	'''
+	# One log entry: any prefix ending in a colon, a UTC timestamp of
+	# the form YYYY-MM-DDTHH:MM:SSZ, the client address (bracketed
+	# hex-and-colon form or dotted digits), a two digit status code and
+	# finally the requested URL.
+	logre = re.compile(
+		r'^.*:\s+(?P<date>\d{4}-\d{2}-\d{2}T(\d{2}:){2}\d{2}Z)\s*(?P<addr>\[[a-f0-9:]+\]|[0-9\.]+)\s+(?P<code>\d{2})\s+(?P<url>.+)$'
+	)
+
+	def __init__(self, logfile):
+		''' Load logfile and populate the internal data structures.
+
+		logfile is the path of the log to read.  Lines that do not
+		match logre are skipped.
+		'''
+		self.byDay = {}
+		self.errors = {}
+		self.files = {}
+
+		# Sentinels so the first parsed entry always replaces both
+		# bounds: start begins 2**32 seconds after the epoch, end
+		# begins at the epoch itself.
+		self.start = datetime.datetime.fromtimestamp(2**32, tz=pytz.utc)
+		self.end = datetime.datetime.fromtimestamp(0, tz=pytz.utc)
+
+		# errors='replace' keeps a stray undecodable byte in a logged
+		# request from aborting the whole report.
+		with open(logfile, errors='replace') as fd:
+			for line in fd:
+				matches = self.logre.match(line)
+				if not matches:
+					continue
+
+				code = int(matches.group('code'))
+				date = datetime.datetime.strptime(
+					matches.group('date'),
+					'%Y-%m-%dT%H:%M:%SZ'
+				).replace(tzinfo=pytz.utc)
+
+				# The URL comes from the client, so it may be
+				# malformed; urlparse raises ValueError for some
+				# inputs.  Fall back to the raw text in that case.
+				raw_url = matches.group('url')
+				try:
+					url = urlparse(raw_url).path
+				except ValueError:
+					url = raw_url
+
+				url = self.elide(url)
+
+				if date < self.start:
+					self.start = date
+
+				if date > self.end:
+					self.end = date
+
+				# Status codes of 40 and above are counted as errors
+				# and kept out of the hit counts.
+				if code > 39:
+					key = f'{url}:{code}'
+					self.errors[key] = self.errors.get(key, 0) + 1
+					continue
+
+				self.files[url] = self.files.get(url, 0) + 1
+
+				# NOTE(review): keyed by day of month only, so a log
+				# spanning more than one month merges same-numbered
+				# days -- confirm logs are rotated at least monthly.
+				self.byDay[date.day] = self.byDay.get(date.day, 0) + 1
+
+	def elide(self, url):
+		''' Shorten url for use as a histogram label.
+
+		Everything beneath /git/cgi/gemini.git/ collapses to a single
+		label; any other path longer than MAXLABEL is cut down to
+		MAXLABEL characters ending in '...'.  Returns the label.
+		'''
+		if url.startswith('/git/cgi/gemini.git/'):
+			return '/git/cgi/gemini.git...'
+
+		if len(url) > MAXLABEL:
+			return url[:MAXLABEL - 3] + '...'
+
+		return url
+
+
+def histogram(data, axis_label, sort_keys=True):
+	''' Pretty print a histogram.
+
+	data is a list of (label, count) tuples and axis_label is the
+	heading printed above the label column.  When sort_keys is true
+	the rows are printed from the largest count to the smallest,
+	otherwise in the order given.  The chart is written to stdout
+	between ``` fence lines.  An empty data list prints an empty
+	chart with zero Min, Max and Total.
+	'''
+	if sort_keys:
+		# Sort a copy so the caller's list is left untouched.
+		data = sorted(data, key=lambda k: k[1], reverse=True)
+
+	counts = [v[1] for v in data]
+	largest = max(counts, default=0)
+	smallest = min(counts, default=0)
+	total = sum(counts)
+
+	# Label column: one wider than the longest label, and widened
+	# further if the axis label would not otherwise fit.
+	axis_sz = max((len(v[0]) for v in data), default=0) + 1
+	if len(axis_label) > axis_sz:
+		axis_sz = len(axis_label) + 1
+
+	# Calculate maximum bar length based on the axis label size.
+	# Magic number 7 is the number of spaces and decorative
+	# characters in the line.  The width of the trailing '(count)'
+	# text is not reserved, so rows can run past MAXWIDTH by roughly
+	# that many characters.  Never let the bar budget drop below 1 or
+	# no scale could make a non-zero bar fit.
+	bar_sz = max(1, MAXWIDTH - (axis_sz + 7))
+
+	# Smallest whole number of counts per bar character that lets
+	# the largest bar fit in bar_sz (ceiling division).
+	scale = max(1, -int(-largest // bar_sz))
+
+	print('```')
+
+	# Center-ish the axis label
+	pad = ' ' * ((axis_sz - len(axis_label)) // 2)
+
+	print(f' {pad}{axis_label}')
+	print('=' * MAXWIDTH)
+
+	# Finally print the data bars
+	for v in data:
+		label, count = v[0], v[1]
+		pad = ' ' * (axis_sz - len(label))
+		bar = '∎' * round(count / scale)
+		print(f'{pad}{label} | {bar} ({count})')
+
+	# and the footer
+	print('=' * MAXWIDTH)
+	print(f'Min: {smallest}, Max: {largest}, Total: {total}')
+	print(f'( ∎ = {scale} )')
+	print('```')
+
+def print_stats(stats):
+	''' Print the report for a MollyStats instance to stdout.
+
+	Emits a heading with the covered time range followed by three
+	sections: hits per file, hits per day and errors.  Each section
+	is a histogram, or a short notice when there is nothing to
+	show.  All times are rendered in US/Eastern.
+	'''
+	fmt = '%m/%d/%Y %H:%M %Z'
+	tz = pytz.timezone('US/Eastern')
+	start = stats.start.astimezone(tz).strftime(fmt)
+	end = stats.end.astimezone(tz).strftime(fmt)
+	# Ask for the current time directly in the target zone instead
+	# of converting a naive local time.
+	now = datetime.datetime.now(tz).strftime(fmt)
+
+	print(f'# Gemini Log Stats for {start} to {end}.')
+	print('## Hits:')
+	print()
+
+	if stats.files:
+		histogram(list(stats.files.items()), 'File')
+	else:
+		print('No Hits Recorded.')
+
+	print()
+	print()
+	print('## Hits Per Day:')
+	print()
+
+	# Guarded like the other sections so a log without hits does
+	# not produce a meaningless chart.  Rows keep the order in which
+	# the days were first seen.
+	if stats.byDay:
+		histogram(
+			[(str(k), v) for k, v in stats.byDay.items()],
+			'Day',
+			False
+		)
+	else:
+		print('No Hits Recorded.')
+
+	print()
+	print()
+	print('## Errors:')
+	print()
+
+	if stats.errors:
+		histogram(list(stats.errors.items()), 'Error Code')
+	else:
+		print('No Errors Recorded.')
+
+	print()
+	print()
+	print(f'Generated at {now}.')
+
+if __name__ == '__main__':
+	# The log file path is the first argument; fail with a usage
+	# message instead of an IndexError traceback when it is missing.
+	if len(sys.argv) < 2:
+		sys.exit(f'Usage: {sys.argv[0]} <logfile>')
+
+	stats = MollyStats(sys.argv[1])
+	print_stats(stats)