# gemini - kennedy.gemi.dev

# 💾 Archived View for rosenzweig.io › gemnify.py captured on 2021-12-03 at 14:04:38.
# -=-=-=-=-=-=-
"""
Markdown to Gemtext converter
Copyright (C) 2020 Alyssa Rosenzweig

Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice (including the next
paragraph) shall be included in all copies or substantial portions of the
Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

import sys
import re

# The Markdown file to convert is the first command-line argument.
if len(sys.argv) < 2:
    sys.exit('usage: %s FILE.md' % sys.argv[0])

# Decode explicitly as UTF-8 instead of relying on the platform's locale
# encoding, so the same input converts identically everywhere.
with open(sys.argv[1], "r", encoding="utf-8") as f:
    md = f.read()

# Unwrap text to match Gemini conventions. Special cased for quoted text (to
# avoid duplicating quotes) and code blocks (which should be skipped).
# c.f. https://superuser.com/questions/610903/how-to-unwrap-80-character-text

# define a state machine for parsing
STATE_NONE = 0            # not inside any block
STATE_PARAGRAPH = 1       # accumulating wrapped paragraph text
STATE_QUOTE = 2           # accumulating '> ' quoted text
STATE_CODE = 3            # inside a ``` fenced code block
STATE_CODE_INDENTED = 4   # inside a code block indented by 4 spaces or a tab

def flush_unwrap(state, buffer):
    """Render the lines buffered for one block as a list of output lines.

    Fenced code is returned untouched, indented code is wrapped in a pair of
    ``` fences, quoted text is joined onto a single '> ' line, and anything
    else is joined with spaces onto a single line.
    """
    if state == STATE_CODE:
        return buffer
    elif state == STATE_CODE_INDENTED:
        return ['```'] + buffer + ['```']
    elif state == STATE_QUOTE:
        return ['> ' + ' '.join(buffer)]
    else:
        return [' '.join(buffer)]

def _start_block(line, out, buffer):
    """Interpret `line` while no block is open and return the next state.

    Lines that stand on their own (list items, lines ending in a backslash,
    blank lines) are appended to `out` directly; lines that open a block are
    appended to `buffer` instead.
    """
    stripped = line.strip()

    if line.startswith('```'):
        buffer.append(line)
        return STATE_CODE
    if line.startswith(('    ', '\t')) and stripped:
        buffer.append(line[4:] if line.startswith('    ') else line[1:])
        return STATE_CODE_INDENTED
    if line.startswith('> '):
        buffer.append(line[2:])
        return STATE_QUOTE
    if line.startswith('* '):
        out.append(line)
        return STATE_NONE
    if stripped.endswith('\\'):
        # A trailing backslash keeps the line on its own; drop the backslash.
        out.append(stripped[:-1])
        return STATE_NONE
    if not stripped:
        out.append("")
        return STATE_NONE

    buffer.append(line)
    return STATE_PARAGRAPH

def unwrap_text(lines):
    """Join hard-wrapped Markdown lines into one output line per block.

    Paragraphs and '> ' quotes are each collapsed onto a single line, fenced
    code blocks are copied verbatim, and indented code blocks are re-emitted
    between ``` fences.  Every block closed by a following line is followed
    by one empty output line.

    A non-blank line that ends a quote or an indented code block is not
    discarded: it is read again as the first line of the next block.  A blank
    line, or a bare '>' inside a quote, only acts as a separator.

    Takes a list of lines without trailing newlines and returns a new list.
    """
    state = STATE_NONE

    out = []
    buffer = []

    for line in lines:
        if state == STATE_NONE:
            state = _start_block(line, out, buffer)
            continue

        done = False     # this line closes the block being accumulated
        reread = False   # ...and must itself be read as the start of a block

        if state == STATE_PARAGRAPH:
            # The blank terminator is not buffered, so the joined paragraph
            # carries no trailing space.
            done = not line.strip()
            if not done:
                buffer.append(line)
        elif state == STATE_CODE:
            buffer.append(line)

            # End code with matching ```
            done = (line == '```')
        elif state == STATE_CODE_INDENTED:
            if line.startswith('    '):
                buffer.append(line[4:])
            elif line.startswith('\t'):
                buffer.append(line[1:])
            else:
                done = True
                reread = bool(line.strip())
        elif state == STATE_QUOTE:
            if line.strip() in ('', '>'):
                done = True
            elif line.startswith('> '):
                buffer.append(line[2:])
            else:
                done = True
                reread = True
        else:
            # Impossible state
            raise AssertionError('unknown unwrap state: %r' % (state,))

        if done:
            out += flush_unwrap(state, buffer) + [""]
            buffer = []
            state = _start_block(line, out, buffer) if reread else STATE_NONE

    # Only flush a block that is still open; with nothing buffered a flush
    # would append a stray empty line.
    if buffer:
        out += flush_unwrap(state, buffer)
    return out

# Unwrap links and images on their own line (Gemtext style links), and drop
# inline links and images. The latter is controversial, I suppose.

# Raw strings keep the backslashes intact for the regex engine.  The two
# anchored patterns allow trailing whitespace or backslashes after the link.
# NOTE(review): the greedy (.*) target in UNWRAP_BLOCK_REGEX also matches a
# line that merely starts with one link and ends with another.

# A link or image that is the only thing on its line: [text](target)
UNWRAP_BLOCK_REGEX = re.compile(r'^!?\[([^]]*)\]\((.*)\)[\s\\]*$')
# A link or image anywhere in a line
UNWRAP_INLINE_REGEX = re.compile(r'!?\[([^]]*)\]\([^)]*\)')
# An image wrapped in a link, alone on its line: [![alt](image)](target)
UNWRAP_BLOCK_NEST_REGEX = re.compile(r'^\[!\[([^]]*)\]\([^)]*\)\]\(([^)]*)\)[\s\\]*$')

def unwrap_block_link_line(line):
    """Convert the Markdown links and images in one line to Gemtext.

    A link or image that occupies the whole line becomes '=> target text'
    (for an image nested in a link, the outer target and the image's alt
    text are used).  Any remaining inline link or image is replaced by its
    text alone, dropping the target.
    """
    line = UNWRAP_BLOCK_NEST_REGEX.sub(r'=> \2 \1', line)
    line = UNWRAP_BLOCK_REGEX.sub(r'=> \2 \1', line)
    line = UNWRAP_INLINE_REGEX.sub(r'\1', line)
    return line

# Unwrap footnotes to parentheticals
UNWRAP_FOOTNOTES = re.compile(r'\^(\[[^]]*\])')

def unwrap_footnotes(line):
    """Replace each caret-prefixed [note] in `line` with ' [note]'.

    The caret is dropped and a space is put in its place; the brackets and
    the note text are kept as they are.
    """
    return UNWRAP_FOOTNOTES.sub(r' \1', line)

# After unwrapping everything else, unwrap backslash escapes since Gemini
# doesn't need the escapes
UNWRAP_ESCAPES = re.compile(r'\\([]<">\[])')

def unwrap_escapes(line):
    """Drop the backslash in front of an escaped bracket, angle or quote.

    Only a backslash directly followed by one of ] [ < > or a double quote
    is removed; every other backslash in `line` is left in place.
    """
    return UNWRAP_ESCAPES.sub(r'\1', line)

# Extension: pandoc-style YAML header blocks
# Snarf out the title but strip the rest

def unwrap_pandoc_yaml(lines):
    """Strip a leading pandoc YAML header, keeping its title as a heading.

    The header must start with '---' on the first line and is closed by the
    first later line that is exactly '---' or '...'.  If there is no such
    header, `lines` is returned unchanged.  Otherwise the header is removed
    and, when it has a 'title: ' entry, '# <title>' is put in its place.

    Raises ValueError if the header contains more than one 'title: ' entry.
    """
    if not lines or lines[0] != '---':
        return lines

    end = next((i for i, entry in enumerate(lines[1:], 1)
                if entry in ('---', '...')), None)
    if end is None:
        return lines

    rest = lines[end + 1:]

    # Look for the title inside the header only, so a body line that happens
    # to start with 'title: ' is never mistaken for metadata.
    titles = [entry[len('title: '):] for entry in lines[1:end]
              if entry.startswith('title: ')]

    if len(titles) > 1:
        raise ValueError('YAML header has more than one title entry')

    if titles:
        return ['# ' + titles[0]] + rest

    return rest

# Extension: pandoc simple header blocks

def unwrap_pandoc_simple_header(lines):
    """Turn a leading '% Title' line into a '# Title' heading.

    Only the first line is examined; every other line is passed through
    unchanged.  An empty list is returned as is.
    """
    if lines and lines[0].startswith('% '):
        return ['# ' + lines[0][2:]] + lines[1:]
    return lines

# Conversion passes, listed in the order they are applied.  Each entry takes
# the list of lines and returns a new list; the single-line helpers are
# mapped over every line.
UNWRAPS = [
        unwrap_pandoc_yaml,
        unwrap_pandoc_simple_header,
        unwrap_text,
        lambda lines: list(map(unwrap_block_link_line, lines)),
        lambda lines: list(map(unwrap_footnotes, lines)),
        lambda lines: list(map(unwrap_escapes, lines)),
]

# Feed the document, split into lines, through every pass in turn and write
# the result to stdout, one line per output line.
lines = md.splitlines()

for unwrap in UNWRAPS:
    lines = unwrap(lines)

print(*lines, sep='\n')