gemini - kennedy.gemi.dev

💾 Archived View for gemini.zachdecook.com › usfm2gmi › usfm2gmi.py captured on 2023-09-28 at 16:10:44.
-=-=-=-=-=-=-
#!/usr/bin/env python3
"""Convert usfm line-by-line into gemtext"""
__author__ = "Zach DeCook"
__email__ = "zachdecook@librem.one"
__copyright__ = "Copyright (C) 2021 Zach DeCook"
__license__ = "AGPL"
__version__ = "3"
import fileinput

def printf(string):
  """Print string to standard output without appending a newline."""
  no_newline = ''
  print(string, end=no_newline)

def smallcaps(word):
  """Return word with every ASCII lowercase letter replaced by a small-capital glyph.

  Characters outside 'a'..'z' (uppercase, digits, punctuation, non-ASCII) are
  copied through unchanged.
  """
  # One glyph per letter, in alphabetical order; indexed by distance from 'a'.
  glyphs = 'ᴀʙᴄᴅᴇғɢʜɪᴊᴋʟᴍɴᴏᴘǫʀsᴛᴜᴠᴡxʏᴢ'
  base = ord('a')
  return ''.join(
    glyphs[ord(ch) - base] if 'a' <= ch <= 'z' else ch
    for ch in word
  )

def superscript(word):
  """Return word with characters in the range '0'..'W' mapped through a superscript table.

  Digits become superscript digits and most uppercase letters A-W become
  raised letter forms. The table holds the plain character for ':' through
  '@' and for C, F, Q and S, so those come out unchanged, as does every
  character outside the '0'..'W' range.
  """
  #TODO: also superscript lowercase letters
  # Indexed by distance from '0'; covers the 40 code points '0' through 'W'.
  table = '⁰¹²³⁴⁵⁶⁷⁸⁹:;<=>?@ᴬᴮCᴰᴱFᴳᴴᶦᴶᴷᴸᴹᴺᴼᴾQᴿSᵀᵁⱽᵂ'

  def raised(ch):
    if '0' <= ch <= 'W':
      return table[ord(ch) - ord('0')]
    return ch

  return ''.join(map(raised, word))

def convert(line):
  """Convert one line of USFM into a gemtext fragment.

  A space is inserted in front of every backslash (and after a handful of
  closing markers) so that markers can be separated from the surrounding text
  by a plain whitespace split. Each token is then translated into gemtext,
  skipped, or copied to the output followed by a single space.

  line: one line of USFM text.

  Returns the gemtext as a string. The result is empty for blank lines and
  for remark lines. Heading, paragraph and poetry markers emit a leading
  newline; no trailing newline is ever appended.
  """
  # TODO: preserve the lack of whitespace before a backslash.
  tokens = (
    line.replace('\\', ' \\')
    .replace('\\nd*', '\\nd* ')
    .replace('\\+nd*', '\\+nd* ')
    .replace('\\f*', '\\f* ')
    .replace('\\wj*', '\\wj* ')
    .replace('\\w*', ' \\w* ')
    .replace('\\+w*', '\\+w* ')
    .split()
  )
  if not tokens:
    return ''

  # Line-level markers: the first token decides how the whole line is rendered.
  first = tokens[0]
  if first in ['\\mt1', '\\mt', '\\ms', '\\h']:
    return '\n# ' + convert(' '.join(tokens[1:]))
  # TODO: parse as word for title tags in title line
  if first in ['\\mt2', '\\s', '\\s1']:
    return '\n## ' + convert(' '.join(tokens[1:]))
  if first in ['\\mt3', '\\d', '\\sp']:
    return '\n### ' + convert(' '.join(tokens[1:]))
  if first == '\\b':
    return '\n'
  if first == '\\rem':
    return ''

  out = ''
  nd = False  # True while inside a \nd ... \nd* span (rendered in small caps)
  skip = 0    # number of upcoming tokens to drop without looking at them
  for word in tokens:
    if skip > 0:
      skip = skip - 1
    elif word in ['\\id', '\\ide']:
      skip = 1
    elif word in ['\\v', '\\c']:
      skip = 1  # drop the verse/chapter number that follows
    elif word in ['\\p', '\\m']:
      out += '\n'
    elif word in ['\\pi', '\\pi1', '\\mi']:
      out += '\n\t'
    elif word in ['\\li1']:
      out += '\n* '
    elif word in ['\\q', '\\q1']:
      out += '\n> '
    elif word in ['\\q2', '\\q22']: # \q22 is bad input
      out += '\n>\t'
    elif word in ['\\q3']:
      out += '\n>\t\t'
    elif word in ['\\qs']:
      out += '\t'
    elif word in ['\\qs*']:
      continue
    elif word in ['\\wj', '\\wj*']:
      continue
    elif word in ['\\em', '\\it']:
      out += '*'
    elif word in ['\\em*', '\\it*']:
      # Close the emphasis directly against the last word.
      out = out.rstrip() + '*'
    elif word in ['\\nd', '\\+nd']:
      nd = True
    elif word in ['\\nd*', '\\+nd*']:
      nd = False
    # Footnotes (https://ubsicap.github.io/usfm/notes_basic/fnotes.html)
    elif word == '\\f':
      out += '['
      skip = 1 # the next character is the footnote caller
    elif word == '\\fr':
      skip = 1 # verse reference not necessary for inline fn
    elif word == '\\f*':
      out += ']'
    # Cross-references (https://ubsicap.github.io/usfm/notes_basic/xrefs.html)
    elif word == '\\x':
      out += '('
      skip = 1 # next character is xref caller
    elif word == '\\xo':
      skip = 1 # verse reference not necessary for inline xref
    elif word in ['\\xt']:
      continue
    elif word == '\\x*':
      out += ')'
    # TODO: support Endnotes (\fe and \fe*)
    elif word in ['\\ft']:
      continue # TODO: fancy formatting of more types
    # Words which appear in the glossary.
    elif word in ['\\w', '\\w*', '\\+w', '\\+w*']:
      continue
    elif word in ['\\nb']:
      continue
    elif '|strong="' in word:
      # Keep only the text before the first '|'; the attribute part (the
      # Strong's number) is currently dropped rather than superscripted.
      # NOTE(review): this branch does not apply small caps even when nd is
      # set -- confirm whether that is intended.
      out += word.split('|')[0] + ' '
    # Remove those extra spaces that sneak in.
    elif word in [',', '.', ';', '”', ',”', '.”', '?”', ')', ':', '!', '?', '.’', '.’”', '?’”', '?’', ';”', '!”', ');', '),']:
      # endswith() instead of out[-1]: out may still be empty when the first
      # emitted token is punctuation, and indexing would raise IndexError.
      if out.endswith(' '):
        out = out[:-1] + word + ' '
      else:
        out += word + ' '
    elif word in ['“', '(', '‘']:
      # Opening punctuation attaches to the following word: no trailing space.
      out += word
    elif nd:
      out += smallcaps(word) + ' '
    else:
      out += word + ' '
  return out

def main():
  """Convert USFM to gemtext, one input line at a time, writing to stdout.

  Input comes from fileinput.input(), i.e. the files named on the command
  line, or stdin when none are given:
     ./usfm2gmi <in.usfm >out.md
  """
  for usfm_line in fileinput.input():
    printf(convert(usfm_line))

# Run the converter only when this file is executed as a script, not on import.
if __name__ == '__main__':
  main()