💾 Archived View for rosenzweig.io › gemnify.py captured on 2021-12-03 at 14:04:38.
⬅️ Previous capture (2020-11-07)
-=-=-=-=-=-=-
"""
Markdown to Gemtext converter

Copyright (C) 2020 Alyssa Rosenzweig

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice (including the next
paragraph) shall be included in all copies or substantial portions of the
Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

import sys
import re

# The Markdown source is the file named by the first command-line argument.
# Decode it as UTF-8 explicitly so the result does not depend on the locale.
with open(sys.argv[1], "r", encoding="utf-8") as f:
    md = f.read()

# Unwrap text to match Gemini conventions. Special cased for quoted text (to
# avoid duplicating quotes) and code blocks (which should be skipped).
# c.f.
# https://superuser.com/questions/610903/how-to-unwrap-80-character-text

# define a state machine for parsing
STATE_NONE = 0
STATE_PARAGRAPH = 1
STATE_QUOTE = 2
STATE_CODE = 3
STATE_CODE_INDENTED = 4


def flush_unwrap(state, buffer):
    """Convert the lines buffered for one block into output lines.

    Args:
        state: The STATE_* constant of the block being closed.
        buffer: The lines collected for that block (quote and indentation
            markers already removed by the caller).

    Returns:
        A list of output lines. Fenced code is returned as the same list
        object that was passed in; every other state builds a new list.
    """
    if state == STATE_CODE:
        # Fenced code already contains its own ``` lines; pass it through.
        return buffer
    if state == STATE_CODE_INDENTED:
        # Gemtext has no indented code, so wrap the block in a fence.
        return ['```'] + buffer + ['```']
    if state == STATE_QUOTE:
        # One quote marker for the whole unwrapped quote.
        return ['> ' + ' '.join(buffer)]
    # Paragraphs (and the idle state) collapse to a single long line.
    return [' '.join(buffer)]


def _begin_block(line, out, buffer):
    """Handle *line* while no block is open and return the resulting state.

    Lines that are complete on their own (list items, hard-break lines and
    blank lines) are appended to *out*. Lines that open a multi-line block
    are appended to *buffer* and the matching STATE_* constant is returned.
    """
    stripped = line.strip()

    # TODO: transition
    if line.startswith('```'):
        buffer.append(line)
        return STATE_CODE
    if stripped and line.startswith(('    ', '\t')):
        # Indented code: drop exactly one level of indentation (four spaces
        # or one tab), matching what is stripped from the following lines.
        buffer.append(line[4:] if line.startswith('    ') else line[1:])
        return STATE_CODE_INDENTED
    if line.startswith('> '):
        buffer.append(line[2:])
        return STATE_QUOTE
    if line.startswith('* '):
        # List items are never unwrapped.
        out.append(line)
        return STATE_NONE
    if stripped.endswith('\\'):
        # Trailing backslash is a hard line break: emit the line on its own.
        out.append(stripped[:-1])
        return STATE_NONE
    if not stripped:
        out.append("")
        return STATE_NONE

    buffer.append(line)
    return STATE_PARAGRAPH


def unwrap_text(lines):
    """Unwrap hard-wrapped Markdown lines into one line per block.

    Paragraphs and block quotes are joined into a single line each, fenced
    code blocks are passed through unchanged, and indented code blocks are
    rewritten as fenced blocks. An empty line is emitted after every block
    that is closed inside the input.

    A non-blank line that terminates a quote or an indented code block is
    treated as the first line of the next block rather than being lost. A
    bare ``>`` line only separates two quotes and produces no output of its
    own.

    Args:
        lines: Iterable of input lines. The closing-fence check compares a
            line against '```' exactly, so lines are assumed to carry no
            trailing newline.

    Returns:
        The list of output lines. The final block (possibly empty) is always
        flushed, so the result is never an empty list.
    """
    state = STATE_NONE
    out = []
    buffer = []

    for line in lines:
        if state == STATE_NONE:
            state = _begin_block(line, out, buffer)
            continue

        finished = False   # the current block ends at this line
        reprocess = False  # this line also starts the next block

        if state == STATE_PARAGRAPH:
            if line.strip():
                buffer.append(line)
            else:
                # The blank terminator is not part of the paragraph text;
                # buffering it would leave a trailing space after the join.
                finished = True
        elif state == STATE_CODE:
            buffer.append(line)
            # End code with matching ```
            finished = line == '```'
        elif state == STATE_CODE_INDENTED:
            if line.startswith('    '):
                buffer.append(line[4:])
            elif line.startswith('\t'):
                buffer.append(line[1:])
            else:
                finished = True
                reprocess = bool(line.strip())
        elif state == STATE_QUOTE:
            if line.startswith('> ') and len(line.strip()) > 1:
                buffer.append(line[2:])
            else:
                finished = True
                # An empty quote line only splits the quote in two.
                reprocess = line.strip() not in ('', '>')
        else:
            # Impossible state
            raise AssertionError('unknown parser state: %r' % (state,))

        if finished:
            out += flush_unwrap(state, buffer) + [""]
            buffer = []
            if reprocess:
                state = _begin_block(line, out, buffer)
            else:
                state = STATE_NONE

    out += flush_unwrap(state, buffer)
    return out


# Unwrap links and images on their own line (Gemtext style links), and drop
# inline links
and images. The latter is controversial, I suppose. UNWRAP_BLOCK_REGEX = re.compile('^!?\[([^]]*)\]\((.*)\)[\s\\\]*