💾 Archived View for mozz.us › markdown › extract_ast.py captured on 2024-02-05 at 09:52:21.
⬅️ Previous capture (2020-09-24)
-=-=-=-=-=-=-
#!/usr/bin/env python3
"""
Convert a gemini markdown document into a JSON-encoded AST.
"""
import argparse
import json
import sys

# Line types as (node name, line prefix, multi_line). Order matters: the first
# matching prefix wins, and the empty prefix of 'Paragraph' is the catch-all,
# so it must stay last. When multi_line is true, consecutive lines of the same
# type are collected into a single node.
_LINE_TYPES = (
    ('Link', '=>', False),
    ('Title', '# ', False),
    ('Heading', '## ', False),
    ('Sub-Heading', '### ', False),
    ('Horizontal Rule', '---', False),
    ('Ordered List', '+ ', True),
    ('Unordered List', '* ', True),
    ('Paragraph', '', True),
)


def parse_ast(markdown: str) -> list:
    """
    Parse a markdown document into a flat list of ``[name, value]`` nodes.

    The shape of ``value`` depends on the node name:

    - 'Title', 'Heading', 'Sub-Heading': the text after the prefix, stripped.
    - 'Paragraph': consecutive paragraph lines joined with a single space.
    - 'Preformatted': the lines between two '```' fences joined with newlines.
    - 'Ordered List', 'Unordered List': a list of item strings.
    - 'Link': a two-item list ``[url, text]``; text falls back to the url.
    - 'Horizontal Rule': None.

    Blank lines outside of preformatted blocks separate multi-line nodes and
    do not appear in the result.
    """
    tree = []

    def add_node(name, value, multi_line=False):
        if not multi_line:
            tree.append([name, value])
        elif tree and tree[-1][0] == name:
            # Same type as the previous node: extend it instead of starting
            # a new one.
            tree[-1][1].append(value)
        else:
            tree.append([name, [value]])

    preformatted_mode = False
    for line in markdown.splitlines(keepends=False):
        if line == '```':
            preformatted_mode = not preformatted_mode
        elif preformatted_mode:
            # Inside a fence every line is kept verbatim, blank ones included.
            add_node('Preformatted', line, multi_line=True)
        elif line.strip():
            for name, token, multi_line in _LINE_TYPES:
                if line.startswith(token):
                    add_node(name, line[len(token):].strip(), multi_line)
                    break
        else:
            # Placeholder that stops the blocks on either side of a blank
            # line from being merged together; removed again below.
            add_node('Empty', None)

    # Drop any empty nodes
    tree = [node for node in tree if node[0] != 'Empty']

    # Do a second pass to merge paragraph lines, parse links, etc.
    for i, (name, value) in enumerate(tree):
        if name == 'Paragraph':
            tree[i] = [name, ' '.join(value)]
        elif name == 'Preformatted':
            tree[i] = [name, '\n'.join(value)]
        elif name == 'Link':
            link_parts = value.split(maxsplit=1)
            link_url = link_parts[0] if link_parts else ''
            link_text = link_parts[1] if len(link_parts) > 1 else link_url
            tree[i] = [name, [link_url, link_text]]
        elif name == 'Horizontal Rule':
            # Anything following the '---' marker is discarded.
            tree[i] = [name, None]
    return tree


def main():
    """
    Read markdown from the given file (or stdin) and write the AST to stdout.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    # nargs='?' makes the positional optional; without it argparse always
    # requires the argument and the stdin default can never take effect.
    parser.add_argument(
        'infile',
        nargs='?',
        type=argparse.FileType('r'),
        default=sys.stdin,
    )
    args = parser.parse_args()

    try:
        markdown = args.infile.read()
    finally:
        # Close files opened by argparse, but leave the process's stdin alone.
        if args.infile is not sys.stdin:
            args.infile.close()

    ast = parse_ast(markdown)
    json.dump(ast, sys.stdout, indent=' ')


if __name__ == '__main__':
    main()