💾 Archived View for thrig.me › blog › 2024 › 05 › 19 › gemfeed2atom.c captured on 2024-08-31 at 14:07:41.

View Raw

More Information

⬅️ Previous capture (2024-05-26)

-=-=-=-=-=-=-

// gemfeed2atom - convert some number of gemfeed entries to Atom format,
// making various simplifying assumptions. There may be things to
// TWEAK. Usage:
//
//   gemfeed2atom -b gemini://example.org/ < index.gmi > atom.xml
//
// The -M flag limits how many entries are converted. One could use
// mktemp(1) and instead write to a temporary file to avoid clobbering
// atom.xml when something goes awry.

#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <limits.h>
#include <locale.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sysexits.h>
#include <time.h>
#include <unistd.h>

// Forward declarations so that main() can come first in the file.

// Parses a bounded integer from a command line argument.
// NOTE(review): the value is parsed as a long long but returned as a
// long, so it is narrowed on platforms where long is 32 bits.
static long argtoll(const char *arg, const long long min, const long long max);
// Writes the XML prologue and the feed-level elements to stdout.
static bool atom_header(const char *title, const char *link,
                        const struct tm *now);
// Writes a single <entry> element to stdout.
static bool atom_entry(const char *base_url, const char *path, const char *date,
                       const char *title);
// Writes the closing </feed> tag to stdout.
static void atom_footer(void);
// Prints the usage message to stderr and exits.
static void emit_help(void);
// Both quote functions return a newly allocated string that the caller
// must free, or NULL on failure.
static char *xml_attribute_quote(const char *s, size_t len);
static char *xml_element_quote(const char *s, size_t len);

// Read a gemfeed from the named file (or standard input when no file or
// "-" is given) and write the corresponding Atom document to standard
// output.
//
// Options:
//   -M n    emit at most n entries (1..INT_MAX)
//   -b url  base URL prepended to every entry path and used as the
//           feed link
//
// The first level-one heading ("# ...") supplies the feed title; lines
// before it are ignored. Each later "=> path YYYY-MM-DD title" line
// becomes an entry. Nothing is written when there are no entries. Exits
// with a non-zero status on an invalid date or on a read, write, or
// allocation failure.
int
main(int argc, char *argv[])
{
	char *base_url = strdup(""); // -b flag, XML attribute quoted
	if (!base_url) err(1, "strdup");
	char *feed_title = NULL; // from the document, XML element quoted
	const char *input_file;  // the filename or - by default
	size_t max_posts = SIZE_MAX;

	// PORTABILITY "C" is the default on OpenBSD, but the following
	// helps mandate that we aren't in a wacky locale that causes
	// problems for strftime or perhaps other calls.
	if (!setlocale(LC_ALL, "C")) err(1, "setlocale");

	int ch;
	while ((ch = getopt(argc, argv, "M:b:")) != -1) {
		switch (ch) {
		case 'M':
			max_posts =
			  (size_t) argtoll(optarg, 1, (long long) INT_MAX);
			break;
		case 'b':
			free(base_url);
			base_url = xml_attribute_quote(optarg, strlen(optarg));
			if (!base_url) err(1, "xml_attribute_quote");
			break;
		default:
			emit_help();
		}
	}
	argc -= optind;
	argv += optind;

	// operate as a filter or read from the given file
	FILE *gemf;
	if (argc > 0 && strcmp(*argv, "-") != 0) {
		gemf = fopen(*argv, "r");
		if (!gemf) err(1, "read '%s'", *argv);
		input_file = *argv;
	} else {
		input_file = "-";
		gemf       = stdin;
	}

#ifdef __OpenBSD__
	if (pledge("stdio", NULL) == -1) err(1, "pledge failed");
#endif

	// timestamp for the header of the feed
	time_t epoch;
	if (time(&epoch) == (time_t) -1) err(1, "time");
	struct tm *now = gmtime(&epoch);
	if (!now) err(1, "gmtime");

	// read relevant entries from the gemfeed file
	char *line      = NULL;
	size_t linesize = 0;
	ssize_t linelen;
	char *copy         = NULL;  // for strtok to chew on
	bool header        = false; // done XML header call?
	bool title         = false; // got the gemfeed title?
	size_t entry_count = 1;
	size_t line_number = 0;
#define GEMTEXT_LINK_LEN 2
	while ((linelen = getline(&line, &linesize, gemf)) != -1) {
		++line_number;

		if (!title) {
			// skip everything up to the first "# title" line
			if (linelen < 3 || line[0] != '#' || line[1] == '#')
				continue;
			// POSIX files are supposed to end with a '\n'
			// but folks may forget to do that, but if the
			// title is the last line of the file then there
			// probably aren't any entries that follow it
			if (line[linelen - 1] == '\n')
				line[linelen - 1] = '\0';
			char *tp = line + 1;
			while (*tp == ' ' || *tp == '\t')
				++tp;
			// the title lands inside <title>...</title> so
			// any & < > in the heading must be quoted
			feed_title = xml_element_quote(tp, strlen(tp));
			if (!feed_title) err(1, "xml_element_quote");
			title = true;
		}

		// NOTE the spec indicates that there must be a
		// "non-zero number of consecutive spaces or tabs"
		// following the "=>" link but this parser may allow for
		// no whitespace.
		// gemini://geminiprotocol.net/docs/gemtext-specification.gmi
		if (strncmp(line, "=>", GEMTEXT_LINK_LEN) != 0) continue;

		free(copy);
		copy = strdup(line + GEMTEXT_LINK_LEN);
		if (!copy) err(1, "strdup");

		// PORTABILITY strtok might have issues with leading
		// blanks or all-blank lines?
		char *entry_path = strtok(copy, " \t\n");
		if (!entry_path) continue;

		char *entry_date = strtok(NULL, " \t\n");
		if (!entry_date) continue;

		char *entry_title = strtok(NULL, "\n"); // to end-of-line
		if (!entry_title) continue;

		// Is the date at least somewhat valid? (I haven't
		// tested how this behaves for e.g. 2025-02-29.)
		// TWEAK one could instead 'continue' here if there
		// are not-gemfeed links on the same page that fail
		// this check.
		struct tm when = {0};
		if (strlen(entry_date) != 10 ||
		    !strptime(entry_date, "%Y-%m-%d", &when))
			errx(1, "invalid date '%s' at %s:%zu", entry_date,
			     input_file, line_number);

		if (!header) {
			if (!atom_header(feed_title, base_url, now))
				errx(1, "atom_header");
			header = true;
		}
		if (!atom_entry(base_url, entry_path, entry_date, entry_title))
			errx(1, "atom_entry");
		if (++entry_count > max_posts) break;
	}
	// getline returns -1 both at end-of-file and on error; keep its
	// errno as the output calls below may overwrite it
	const int read_errno = errno;
	if (header) atom_footer();
	// copy, line, base_url, and feed_title are deliberately not freed
	// as the process is about to exit
	if (ferror(gemf)) {
		errno = read_errno;
		err(1, "getline");
	}
	// a full disk or a closed pipe must not be reported as success
	if (fflush(stdout) == EOF) err(1, "write");
	if (ferror(stdout)) errx(1, "write failed");
	exit(EXIT_SUCCESS);
}

// Parse arg as an integer and require min <= value <= max. The base is
// detected by strtoll (base 0), so decimal, 0x-prefixed hexadecimal,
// and 0-prefixed octal are accepted. A min of LLONG_MIN or a max of
// LLONG_MAX disables that bound. Trailing garbage and empty strings
// are rejected.
//
// Does not return on failure: a diagnostic is printed and the process
// exits with status 1.
inline static long
argtoll(const char *arg, const long long min, const long long max)
{
	char *ep;
	errno = 0;
	const long long val = strtoll(arg, &ep, 0);
	// a syntax error leaves errno untouched, so err() would append a
	// meaningless strerror(0) to the message; use errx() instead
	if (arg[0] == '\0' || *ep != '\0')
		errx(1, "not an integer '%s'", arg);
	if (errno == ERANGE && (val == LLONG_MIN || val == LLONG_MAX))
		err(1, "strtoll failed");
	if (min != LLONG_MIN && val < min)
		errx(1, "value is below minimum %lld", min);
	if (max != LLONG_MAX && val > max)
		errx(1, "value is above maximum %lld", max);
	// the return type is narrower than long long on some platforms
	if (val < LONG_MIN || val > LONG_MAX)
		errx(1, "value does not fit in a long '%s'", arg);
	return (long) val;
}

// Write the XML declaration, the opening <feed> tag, and the feed-level
// <title>, <updated>, and <link> elements to standard output.
//
// TWEAK title and link are printed verbatim: title lands in element
// content and link inside a ""-quoted attribute, and no escaping
// happens here. Moving either string between those two contexts means
// revisiting how it was xml_*_quote'd. For anything more complicated,
// you would probably want an XML library to handle such details;
// https://www.msweet.org/mxml/ is a minimal XML library, but I've
// never tried it.
//
// now is formatted as a "Z" suffixed timestamp, so it is expected to
// come from gmtime. Returns false when strftime produces no output,
// true otherwise.
inline static bool
atom_header(const char *title, const char *link, const struct tm *now)
{
	// ISO 8601, e.g. 2024-05-17T13:25:53Z
#define ATOM_TS_LEN 23
	char stamp[ATOM_TS_LEN + 1];
	const size_t produced =
	  strftime(stamp, ATOM_TS_LEN, "%Y-%m-%dT%H:%M:%SZ", now);
	if (produced == 0) return false;

	fputs("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
	      "<feed xmlns=\"http://www.w3.org/2005/Atom\">\n",
	      stdout);
	printf("<title>%s</title>\n", title);
	printf("<updated>%s</updated>\n", stamp);
	printf("<link rel=\"alternate\" href=\"%s\"/>\n", link);
	return true;
}

// Write one <entry> element to standard output. The title is quoted
// for element content and the path for attribute content here;
// base_url and date are printed verbatim.
//
// TWEAK the links are assumed to be relative to the base_url. If not,
// you'll need suitable URI code to merge relative or passthrough fully
// qualified.
//
// Returns false if either string could not be quoted, true otherwise.
inline static bool
atom_entry(const char *base_url, const char *path, const char *date,
           const char *title)
{
	bool ok         = false;
	char *enc_title = xml_element_quote(title, strlen(title));
	char *enc_path  = xml_attribute_quote(path, strlen(path));
	if (enc_title && enc_path) {
		// The date is assumed to be "noon UTC on the day
		// indicated" per
		// gemini://geminiprotocol.net/docs/companion/subscription.gmi
		printf("<entry><title>%s</title>"
		       "<updated>%sT12:00:00Z</updated>"
		       "<link rel=\"alternate\" href=\"%s%s\"/>"
		       "</entry>\n",
		       enc_title, date, base_url, enc_path);
		ok = true;
	}
	// released on every path so a failure to quote one string does
	// not leak the other
	free(enc_path);
	free(enc_title);
	return ok;
}

// Close the <feed> element opened by the header.
inline static void
atom_footer(void)
{
	fputs("</feed>\n", stdout);
}

// Print the usage summary to standard error and exit with EX_USAGE;
// does not return.
inline static void
emit_help(void)
{
	fputs("Usage: gemfeed2atom [-M max_posts] -b base_url "
	      "[gemfeed-file|-]\n",
	      stderr);
	exit(EX_USAGE);
}

// Text within an attribute that is assumed to use "" quotes - always
// quote " with &quot;. A literal & or < is not allowed inside an XML
// attribute value either, so those become &amp; and &lt;. As with
// element text this assumes the input holds no already-quoted
// entities; &amp; in the input would become &amp;amp;.
//
// At most len bytes of s are read, stopping early at a NUL.
//
// Returns NULL with errno set on overflow or allocation failure. The
// caller must free the returned string.
static char *
xml_attribute_quote(const char *s, size_t len)
{
	if (len >= SIZE_MAX / 6) {
		errno = EOVERFLOW;
		return NULL;
	}
	// worst case every input byte expands to the six bytes of &quot;
	char *news = malloc(len * 6 + 1);
	if (!news) return NULL;
	char *np = news;
	for (size_t i = 0; i < len && s[i] != '\0'; ++i) {
		const char *entity;
		switch (s[i]) {
		case '"':
			entity = "&quot;";
			break;
		case '&':
			entity = "&amp;";
			break;
		case '<':
			entity = "&lt;";
			break;
		default:
			entity = NULL;
		}
		if (entity) {
			const size_t elen = strlen(entity);
			memcpy(np, entity, elen);
			np += elen;
		} else {
			*np++ = s[i];
		}
	}
	*np++ = '\0';
	// most of the time the string will be downsized; should the
	// shrink fail the larger buffer is still a valid result
	char *trim = realloc(news, (size_t) (np - news));
	return trim ? trim : news;
}

// Text within an element - always quote < and also cover the ]]> edge
// case by always quoting >. Always quote & on the (possibly bad)
// assumption that the text does not already contain &-quoted entities,
// e.g. &gt; in the input string will be mangled to &amp;gt; and to
// avoid this any strings infested with entities must be decoded first.
//
// At most len bytes of s are read, stopping early at a NUL.
//
// Returns NULL with errno set on overflow or allocation failure. The
// caller must free the returned string.
static char *
xml_element_quote(const char *s, size_t len)
{
	if (len >= SIZE_MAX / 5) {
		errno = EOVERFLOW;
		return NULL;
	}
	// worst case every input byte expands to the five bytes of &amp;
	char *news = malloc(len * 5 + 1);
	if (!news) return NULL;
	char *np = news;
	for (size_t i = 0; i < len && s[i] != '\0'; ++i) {
		const char *entity;
		switch (s[i]) {
		case '<':
			entity = "&lt;";
			break;
		case '>':
			entity = "&gt;";
			break;
		case '&':
			entity = "&amp;";
			break;
		default:
			entity = NULL;
		}
		if (entity) {
			const size_t elen = strlen(entity);
			memcpy(np, entity, elen);
			np += elen;
		} else {
			*np++ = s[i];
		}
	}
	*np++ = '\0';
	// most of the time the string will be downsized; should the
	// shrink fail the larger buffer is still a valid result
	char *trim = realloc(news, (size_t) (np - news));
	return trim ? trim : news;
}