💾 Archived View for mozz.us › markdown › extract_ast.py captured on 2023-11-04 at 11:48:52.

View Raw

More Information

⬅️ Previous capture (2020-09-24)

-=-=-=-=-=-=-

#!/usr/bin/env python3
"""
Convert a gemini markdown document into a JSON-encoded AST.
"""


def parse_ast(markdown):
    """
    Parse a gemini markdown document into a flat list of AST nodes.

    Every node is a two-item list ``[name, value]``. The shape of ``value``
    depends on the node name:

    - ``Title``, ``Heading``, ``Sub-Heading``: the line text with the marker
      removed and surrounding whitespace stripped.
    - ``Link``: ``[url, text]``; ``text`` falls back to ``url`` when the
      line carries no label.
    - ``Horizontal Rule``: ``None``.
    - ``Ordered List``, ``Unordered List``: a list of item strings, one per
      consecutive list line.
    - ``Paragraph``: consecutive text lines joined with single spaces.
    - ``Preformatted``: the lines between a pair of three-backtick fence
      lines, kept verbatim and joined with newlines.

    Whitespace-only lines and fence lines end the current group and never
    appear in the result. A fence that is never closed leaves every
    remaining line in the preformatted block.
    """
    # (node name, line prefix, whether consecutive lines are grouped).
    # Order matters: the first matching prefix wins, and the empty prefix
    # makes 'Paragraph' the catch-all for every other non-blank line.
    line_types = (
        ('Link', '=>', False),
        ('Title', '# ', False),
        ('Heading', '## ', False),
        ('Sub-Heading', '### ', False),
        ('Horizontal Rule', '---', False),
        ('Ordered List', '+ ', True),
        ('Unordered List', '* ', True),
        ('Paragraph', '', True),
    )

    # Internal marker that separates groups; filtered out before returning.
    boundary = 'Empty'

    tree = []

    def add_node(name, value, multi_line=False):
        # Grouped node types extend the previous node when it has the same
        # name; everything else starts a new node.
        if multi_line and tree and tree[-1][0] == name:
            tree[-1][1].append(value)
        else:
            tree.append([name, [value] if multi_line else value])

    preformatted_mode = False
    for line in markdown.splitlines():
        if line == '```':
            preformatted_mode = not preformatted_mode
            # A fence is a hard boundary: without this marker two
            # back-to-back fenced blocks (or two paragraphs separated only
            # by an empty fenced block) would be merged into one node.
            add_node(boundary, None)
        elif preformatted_mode:
            add_node('Preformatted', line, multi_line=True)
        elif not line.strip():
            add_node(boundary, None)
        else:
            for name, token, multi_line in line_types:
                if line.startswith(token):
                    add_node(name, line[len(token):].strip(), multi_line)
                    break

    # Second pass: drop the boundary markers and convert the collected raw
    # values into their final shapes.
    result = []
    for name, value in tree:
        if name == boundary:
            continue
        if name == 'Paragraph':
            value = ' '.join(value)
        elif name == 'Preformatted':
            value = '\n'.join(value)
        elif name == 'Link':
            link_parts = value.split(maxsplit=1)
            link_url = link_parts[0] if link_parts else ''
            link_text = link_parts[1] if len(link_parts) > 1 else link_url
            value = [link_url, link_text]
        elif name == 'Horizontal Rule':
            # Any text trailing the rule marker is discarded.
            value = None
        result.append([name, value])

    return result


def main():
    """
    Command-line entry point: print the AST of a gemini markdown file.

    Reads the document from the ``infile`` argument, or from standard input
    when the argument is omitted, and writes the AST to standard output as
    JSON indented with four spaces.
    """
    import argparse
    import sys
    import json

    parser = argparse.ArgumentParser(description=__doc__)
    # nargs='?' makes the positional optional. Without it argparse always
    # requires the argument, so the stdin default could never take effect.
    parser.add_argument(
        'infile',
        nargs='?',
        type=argparse.FileType('r'),
        default=sys.stdin,
    )
    args = parser.parse_args()

    markdown = args.infile.read()
    ast = parse_ast(markdown)
    json.dump(ast, sys.stdout, indent='    ')


# Run the command-line interface only when executed as a script, so the
# module can be imported without triggering argument parsing.
if __name__ == '__main__':
    main()