💾 Archived View for thrig.me › blog › 2024 › 05 › 19 › gemfeed2atom.c captured on 2024-05-26 at 14:59:01.
-=-=-=-=-=-=-
// gemfeed2atom - convert some number of gemfeed entries to Atom format
// making various simplifying assumptions. There may be things to
// TWEAK. Usage:
//
//   gemfeed2atom -b gemini://example.org/ < index.gmi > atom.xml
//
// though one could use mktemp(1) and instead write to a temporary file
// to avoid clobbering atom.xml when something goes awry.
//
// PORTABILITY written for OpenBSD; on glibc strptime(3) is only
// declared when _XOPEN_SOURCE (or _GNU_SOURCE) is defined, so build
// there with e.g. -D_GNU_SOURCE.

#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <limits.h>
#include <locale.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sysexits.h>
#include <time.h>
#include <unistd.h>

static long long argtoll(const char *arg, const long long min,
    const long long max);
static bool atom_header(const char *title, const char *link,
    const struct tm *now);
static bool atom_entry(const char *base_url, const char *path,
    const char *date, const char *title);
static void atom_footer(void);
static void emit_help(void);
static char *xml_attribute_quote(const char *s, size_t len);
static char *xml_element_quote(const char *s, size_t len);

// main - parse the options, then read a gemfeed from the named file (or
// standard input when no file or "-" is given) and print an Atom feed
// on standard output.
//
// The first level-one heading ("# ...") becomes the feed title; lines
// before it are ignored. After that every "=> path YYYY-MM-DD title"
// link line becomes an entry, up to the -M limit. Any error is fatal.
int
main(int argc, char *argv[])
{
	char *base_url = strdup("");     // -b flag
	char *feed_title = NULL;         // parsed from document
	const char *input_file;          // the filename or - by default
	size_t max_posts = SIZE_MAX;

	// a NULL base_url would otherwise reach printf("%s") later on
	if (!base_url)
		err(1, "strdup");

	// PORTABILITY "C" is the default on OpenBSD, but the following
	// helps mandate that we aren't in a wacky locale that causes
	// problems for strftime or perhaps other calls.
	if (!setlocale(LC_ALL, "C"))
		err(1, "setlocale");

	int ch;
	while ((ch = getopt(argc, argv, "M:b:")) != -1) {
		switch (ch) {
		case 'M':
			max_posts =
			    (size_t) argtoll(optarg, 1, (long long) INT_MAX);
			break;
		case 'b':
			free(base_url);
			base_url = xml_attribute_quote(optarg, strlen(optarg));
			if (!base_url)
				err(1, "xml_attribute_quote");
			break;
		default:
			emit_help();
		}
	}
	argc -= optind;
	argv += optind;

	// operate as a filter or read from the given file
	FILE *gemf;
	if (argc > 0 && strcmp(*argv, "-") != 0) {
		gemf = fopen(*argv, "r");
		if (!gemf)
			err(1, "read '%s'", *argv);
		input_file = *argv;
	} else {
		input_file = "-";
		gemf = stdin;
	}

#ifdef __OpenBSD__
	if (pledge("stdio", NULL) == -1)
		err(1, "pledge failed");
#endif

	// timestamp for the header of the feed
	time_t epoch;
	if (time(&epoch) == (time_t) -1)
		err(1, "time");
	struct tm *now = gmtime(&epoch);
	if (!now)
		err(1, "gmtime");

	// read relevant entries from the gemfeed file
	char *line = NULL;
	size_t linesize = 0;
	ssize_t linelen;
	char *copy = NULL;      // for strtok_r to chew on
	bool header = false;    // done XML header call?
	bool title = false;     // got the gemfeed title?
	size_t entry_count = 1;
	size_t line_number = 0;

#define GEMTEXT_LINK_LEN 2

	while ((linelen = getline(&line, &linesize, gemf)) != -1) {
		++line_number;

		if (!title) {
			if (linelen >= 3 && line[0] == '#' && line[1] != '#') {
				// POSIX files are supposed to end with
				// a '\n' but folks may forget to do
				// that, but if the title is the last
				// line of the file then there probably
				// aren't any entries that follow it
				if (line[linelen - 1] == '\n')
					line[linelen - 1] = '\0';
				char *tp = line + 1;
				while (*tp == ' ' || *tp == '\t')
					++tp;
				feed_title = strdup(tp);
				if (!feed_title)
					err(1, "strdup");
				title = true;
			}
			// either not yet at the title, or this is the
			// title line which cannot also be a link line
			continue;
		}

		// NOTE the spec indicates that there must be a
		// "non-zero number of consecutive spaces or tabs"
		// following the "=>" link but this parser may allow for
		// no whitespace.
		// gemini://geminiprotocol.net/docs/gemtext-specification.gmi
		if (strncmp(line, "=>", GEMTEXT_LINK_LEN) != 0)
			continue;

		free(copy);
		copy = strdup(line + GEMTEXT_LINK_LEN);
		if (!copy)
			err(1, "strdup");

		// strtok_r skips any leading delimiters, so leading
		// blanks are ignored and an all-blank remainder yields
		// NULL for the path.
		char *save = NULL;
		char *entry_path = strtok_r(copy, " \t\n", &save);
		if (!entry_path)
			continue;
		char *entry_date = strtok_r(NULL, " \t\n", &save);
		if (!entry_date)
			continue;
		char *entry_title = strtok_r(NULL, "\n", &save); // to end-of-line
		if (!entry_title)
			continue;

		// Is the date at least somewhat valid? (I haven't
		// tested how this behaves for e.g. 2025-02-29.) The
		// whole field must be consumed by strptime so that
		// trailing junk such as "2024-5-1<>" is rejected.
		// TWEAK one could instead 'continue' here if there
		// are not-gemfeed links on the same page that fail
		// this check.
		struct tm when = { 0 };
		const char *date_end = strptime(entry_date, "%Y-%m-%d", &when);
		if (strlen(entry_date) != 10 || !date_end || *date_end != '\0')
			errx(1, "invalid date '%s' at %s:%zu", entry_date,
			    input_file, line_number);

		if (!header) {
			if (!atom_header(feed_title, base_url, now))
				errx(1, "atom_header");
			header = true;
		}

		if (!atom_entry(base_url, entry_path, entry_date, entry_title))
			errx(1, "atom_entry");

		if (++entry_count > max_posts)
			break;
	}

	// getline returns -1 for both end-of-file and error; record which
	// one it was before more stdio calls can disturb errno
	const int read_errno = errno;
	const bool read_failed = ferror(gemf) != 0;

	if (header)
		atom_footer();

	// copy, line, base_url and feed_title are deliberately not freed
	// as the process is about to exit

	if (read_failed) {
		errno = read_errno;
		err(1, "getline");
	}

	// a short write (full disk, closed pipe) must not exit 0 and leave
	// a truncated feed behind unnoticed
	if (fflush(stdout) == EOF || ferror(stdout))
		err(1, "write");

	exit(EXIT_SUCCESS);
}

// argtoll - parse arg as an integer (base auto-detected by strtoll) and
// return it, exiting with a diagnostic when arg is empty, has trailing
// junk, overflows, or falls outside of min..max. Pass LLONG_MIN or
// LLONG_MAX to disable the respective bound.
static long long
argtoll(const char *arg, const long long min, const long long max)
{
	char *ep;

	errno = 0;
	const long long val = strtoll(arg, &ep, 0);

	// not an errno condition, so errx rather than err
	if (arg[0] == '\0' || *ep != '\0')
		errx(1, "invalid number '%s'", arg);
	if (errno == ERANGE && (val == LLONG_MIN || val == LLONG_MAX))
		err(1, "strtoll failed");
	if (min != LLONG_MIN && val < min)
		errx(1, "value is below minimum %lld", min);
	if (max != LLONG_MAX && val > max)
		errx(1, "value is above maximum %lld", max);
	return val;
}

// TWEAK the link and title are escaped for XML in particular ways, so
// do not change where those strings are used from element
// to attribute
// or the reverse without revisiting how the strings are xml_*_quote'd.
// For anything more complicated, you would probably want an XML library
// to handle such details; https://www.msweet.org/mxml/ is a minimal XML
// library, but I've never tried it.

// atom_header - print the XML declaration, the opening <feed> tag and
// the feed-level title, updated and link elements.
//
//   title  raw feed title; escaped here for use as element text
//   link   feed URL, already quoted by the caller for use inside a
//          "" delimited attribute
//   now    broken-down UTC time used for <updated>
//
// Returns false if an argument is NULL, or if formatting, allocation or
// the write fails.
static inline bool
atom_header(const char *title, const char *link, const struct tm *now)
{
	// ISO 8601, assuming gmtime, e.g. 2024-05-17T13:25:53Z
#define ATOM_TS_LEN 23
	char timebuf[ATOM_TS_LEN + 1];

	if (!title || !link || !now)
		return false;

	if (strftime(timebuf, sizeof timebuf, "%Y-%m-%dT%H:%M:%SZ", now) < 1)
		return false;

	// the title comes straight from the document and may well contain
	// < or & characters
	char *enc_title = xml_element_quote(title, strlen(title));
	if (!enc_title)
		return false;

	const int rc = printf("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
	    "<feed xmlns=\"http://www.w3.org/2005/Atom\">\n"
	    "<title>%s</title>\n"
	    "<updated>%s</updated>\n"
	    "<link rel=\"alternate\" href=\"%s\"/>\n",
	    enc_title, timebuf, link);

	free(enc_title);
	return rc >= 0;
}

// TWEAK the links are assumed to be relative to the base_url. If not,
// you'll need suitable URI code to merge relative or passthrough fully
// qualified.
//
// atom_entry - print one <entry> element on a line of its own.
//
//   base_url  prefix for the link, already attribute-quoted by the
//             caller
//   path      raw link path; attribute-quoted here
//   date      YYYY-MM-DD; escaped here in case the caller's validation
//             let something odd through
//   title     raw entry title; escaped here for use as element text
//
// Returns false if an allocation or the write fails.
static inline bool
atom_entry(const char *base_url, const char *path, const char *date,
    const char *title)
{
	bool ok = false;

	char *enc_title = xml_element_quote(title, strlen(title));
	char *enc_date = xml_element_quote(date, strlen(date));
	char *enc_path = xml_attribute_quote(path, strlen(path));

	if (enc_title && enc_date && enc_path) {
		// The date is assumed to be "noon UTC on the day
		// indicated" per
		// gemini://geminiprotocol.net/docs/companion/subscription.gmi
		ok = printf("<entry><title>%s</title>"
		    "<updated>%sT12:00:00Z</updated>"
		    "<link rel=\"alternate\" href=\"%s%s\"/>"
		    "</entry>\n",
		    enc_title, enc_date, base_url, enc_path) >= 0;
	}

	// free(NULL) is a no-op, so this covers the partial failures too
	free(enc_title);
	free(enc_date);
	free(enc_path);
	return ok;
}

// atom_footer - print the closing </feed> tag.
static inline void
atom_footer(void)
{
	printf("</feed>\n");
}

// emit_help - print the usage summary to stderr and exit with EX_USAGE;
// does not return.
static inline void
emit_help(void)
{
	fputs("Usage: gemfeed2atom [-M max-posts] -b base_url "
	    "[gemfeed-file|-]\n", stderr);
	exit(EX_USAGE);
}

// Text within an attribute that is assumed to use "" quotes -
// always quote " with &quot;.
//
// The caller must free the returned string.
// xml_quote - shared worker for the two public quoting functions.
//
// Copies at most len bytes of s (stopping early at a NUL) into a newly
// allocated string, replacing & < > with &amp; &lt; &gt; and, when
// in_attribute is set, " with &quot; as well.
//
// Returns NULL with errno set if the worst-case size would overflow
// (EOVERFLOW) or if malloc fails. The caller must free the result.
static char *
xml_quote(const char *s, size_t len, bool in_attribute)
{
	// longest replacement: "&quot;" is 6 bytes, "&amp;" is 5
	const size_t worst = in_attribute ? 6 : 5;

	if (len >= SIZE_MAX / worst) {
		errno = EOVERFLOW;
		return NULL;
	}
	char *news = malloc(len * worst + 1);
	if (!news)
		return NULL;

	char *np = news;
	// bounded by len so that the worst-case allocation above can
	// never be overrun, even if s is longer than the caller claimed
	for (size_t i = 0; i < len && s[i] != '\0'; ++i) {
		const char *entity = NULL;
		switch (s[i]) {
		case '&':
			entity = "&amp;";
			break;
		case '<':
			entity = "&lt;";
			break;
		case '>':
			entity = "&gt;";
			break;
		case '"':
			if (in_attribute)
				entity = "&quot;";
			break;
		default:
			break;
		}
		if (entity) {
			const size_t elen = strlen(entity);
			memcpy(np, entity, elen);
			np += elen;
		} else {
			*np++ = s[i];
		}
	}
	*np++ = '\0';

	// most of the time the string will be downsized; should the
	// shrink somehow fail the oversized buffer is still a valid result
	char *trim = realloc(news, (size_t) (np - news));
	return trim ? trim : news;
}

// Text within an attribute that is assumed to use "" quotes - always
// quote " with &quot; and, as raw & or < are not allowed in attribute
// values, quote those (and >) too. As with xml_element_quote any
// entities already present in the input will be double-quoted.
//
// The caller must free the returned string.
static char *
xml_attribute_quote(const char *s, size_t len)
{
	return xml_quote(s, len, true);
}

// Text within an element - always quote < and also cover the ]]> edge
// case by always quoting >. Always quote & on the (possibly bad)
// assumption that the text does not already contain &-quoted entities,
// e.g. &gt; in the input string will be mangled to &amp;gt; and to
// avoid this any strings infested with entities must be decoded first.
//
// The caller must free the returned string.
static char *
xml_element_quote(const char *s, size_t len)
{
	return xml_quote(s, len, false);
}