💾 Archived View for gemini.conman.org › extensions › port70 › handlers › blog › format.lua captured on 2023-12-28 at 18:39:19.

View Raw

More Information

⬅️ Previous capture (2023-11-04)

➡️ Next capture (2024-03-22)

🚧 View Differences

-=-=-=-=-=-=-

-- ***************************************************************
--
-- Format a parsed HTML document into text.
-- Copyright 2020 by Sean Conner.  All Rights Reserved.
--
-- This library is free software; you can redistribute it and/or modify it
-- under the terms of the GNU Lesser General Public License as published by
-- the Free Software Foundation; either version 3 of the License, or (at your
-- option) any later version.
--
-- This library is distributed in the hope that it will be useful, but
-- WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-- or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
-- License for more details.
--
-- You should have received a copy of the GNU Lesser General Public License
-- along with this library; if not, see <http://www.gnu.org/licenses/>.
--
-- Comments, questions and criticisms can be sent to: sean@conman.org
--
-- ********************************************************************
-- luacheck: ignore 611

local ENTITIES = require "org.conman.const.entity"
local wrapt    = require "org.conman.string".wrapt
local lpeg     = require "lpeg"

local formats = {}

-- *************************************************************************

-- Wrap TEXT into lines and prefix them for the current nesting level.
--
-- The first line is prefixed with the top of the state.initial stack;
-- every following line is prefixed with the top of the state.nest stack.
-- The wrap margin is 77 columns less the width of the nest prefix.  Each
-- line of the result is terminated with a newline.
--
-- @param text   string to wrap
-- @param state  formatter state (reads state.nest and state.initial)
-- @return the wrapped, prefixed text as a single string

local function wrap_text(text,state)
  local indent  = state.nest[#state.nest]
  local wrapped = wrapt(text,77 - #indent)
  local out     = { state.initial[#state.initial] .. wrapped[1] .. '\n' }
  
  for i = 2 , #wrapped do
    out[#out + 1] = indent .. wrapped[i] .. '\n'
  end
  
  return table.concat(out)
end

-- *************************************************************************

-- Render the children of NODE as inline content.
--
-- String children are copied through unchanged, element children are
-- dispatched to the handler registered in `formats` under their tag, and
-- comment nodes are skipped.
--
-- @param node   parsed element whose array part holds its children
-- @param state  formatter state, passed through to the handlers
-- @return concatenation of the rendered children

local function run_inline(node,state)
  local out = ""
  
  for _,child in ipairs(node) do
    if type(child) ~= 'table' then
      out = out .. child
    elseif not child.comment then
      local handler = formats[child.tag]
      out = out .. handler(child,state)
    end
  end
  
  return out
end

-- *************************************************************************

-- Render the children of NODE as block content.
--
-- Children are handled exactly as in inline rendering (strings copied,
-- elements dispatched via `formats`, comments skipped).  Afterwards a
-- separator line consisting of the current nest prefix plus a newline is
-- appended, unless NODE is a <thead>, <tfoot>, <tbody> or <blockquote>.
--
-- @param node   parsed element whose array part holds its children
-- @param state  formatter state (reads state.nest)
-- @return the rendered block as a string

local function run_block(node,state)
  local out = ""
  
  for _,child in ipairs(node) do
    if type(child) ~= 'table' then
      out = out .. child
    elseif not child.comment then
      local handler = formats[child.tag]
      out = out .. handler(child,state)
    end
  end
  
  local tag = node.tag
  if not (tag == 'thead'
       or tag == 'tfoot'
       or tag == 'tbody'
       or tag == 'blockquote') then
    out = out .. state.nest[#state.nest] .. '\n'
  end
  
  return out
end

-- *************************************************************************

-- Render the children of NODE as flow content (a mix of inline and block
-- items).
--
-- Consecutive strings and inline elements are gathered into one run of
-- text.  When a block element is met, any pending run is wrapped and
-- emitted first, then the block itself is rendered and emitted.  Every
-- block except <li>, <table> and <dl> is followed by a separator line
-- (current nest prefix plus newline).  Comment nodes contribute nothing.
-- A table child that is neither inline, comment nor block trips the
-- assert.
--
-- @param node   parsed element whose array part holds its children
-- @param state  formatter state (reads state.nest)
-- @return the rendered content as a string

local function run_flow(node,state)
  local chunks  = {}
  local pending = ""
  
  for _,item in ipairs(node) do
    if type(item) ~= 'table' then
      pending = pending .. item
    elseif item.inline then
      pending = pending .. formats[item.tag](item,state)
    elseif not item.comment then
      assert(item.block)
      
      -- flush the inline run collected so far before emitting the block
      if pending ~= "" then
        table.insert(chunks,wrap_text(pending,state))
      end
      
      local block = formats[item.tag](item,state)
      local tag   = item.tag
      if not (tag == 'li' or tag == 'table' or tag == 'dl') then
        block = block .. state.nest[#state.nest] .. '\n'
      end
      
      table.insert(chunks,block)
      pending = ""
    end
  end
  
  if pending ~= "" then
    table.insert(chunks,wrap_text(pending,state))
  end
  
  return table.concat(chunks)
end

-- *************************************************************************
-- INLINE
-- *************************************************************************

-- <TT>: render the inline content surrounded by backticks.

function formats.tt(node,state)
  local body = run_inline(node,state)
  return "`" .. body .. "`"
end

-- <I>: render the inline content surrounded by underscores.

function formats.i(node,state)
  local body = run_inline(node,state)
  return "_" .. body .. "_"
end

-- <B>: render the inline content surrounded by double asterisks.

function formats.b(node,state)
  local body = run_inline(node,state)
  return "**" .. body .. "**"
end

-- Inline elements that add no markup of their own: content only.
for _,tag in ipairs { 'big' , 'small' , 'em' , 'dfn' } do
  formats[tag] = run_inline
end

-- <STRONG> is rendered the same way as <B>.
formats.strong = formats.b

-- Code-like inline elements are rendered the same way as <TT>.
for _,tag in ipairs { 'code' , 'samp' , 'kbd' , 'var' } do
  formats[tag] = formats.tt
end

-- <CITE>: render the inline content between the ldquo and rdquo entities.

function formats.cite(node,state)
  local open  = ENTITIES.ldquo
  local body  = run_inline(node,state)
  local close = ENTITIES.rdquo
  return open .. body .. close
end

-- <ABBR>/<ACRONYM>: render the inline content and, the first time a given
-- abbreviation text is expanded in this document, append its TITLE
-- attribute in parentheses.
--
-- state.abbr records which abbreviation texts have already been expanded.
-- An element without a TITLE is rendered as-is and is NOT recorded, so
-- that a later occurrence that does carry a TITLE still gets expanded.
-- (Previously a missing TITLE was handed to string.format("%s"), which
-- yields "(nil)" on Lua 5.2+ and raises an error on Lua 5.1.)
--
-- @param node   the <abbr> element (reads node.attributes.title)
-- @param state  formatter state (reads and writes state.abbr)
-- @return the rendered text

function formats.abbr(node,state)
  local res   = run_inline(node,state)
  local title = node.attributes.title
  
  if title and not state.abbr[res] then
    state.abbr[res] = true
    res = res .. string.format(" (%s)",title)
  end
  
  return res
end

formats.acronym = formats.abbr

-- <A>: render the link text, followed by the TITLE attribute in
-- parentheses (when present) and a bracketed reference number (when HREF
-- is present).  The HREF is appended to state.links; the reference number
-- is its index in that list.
--
-- @param node   the <a> element
-- @param state  formatter state (appends to state.links)
-- @return the rendered text

function formats.a(node,state)
  local text = run_inline(node,state)
  local attr = node.attributes
  
  if attr.title then
    text = string.format("%s (%s)",text,attr.title)
  end
  
  if attr.href then
    local links = state.links
    table.insert(links,attr.href)
    text = string.format("%s [%d]",text,#links)
  end
  
  return text
end

-- <IMG>: render a textual stand-in for an image.
--
-- The text is chosen as follows:
--   * no ALT attribute      -> TITLE, or nothing
--   * ALT is a non-empty    -> ALT
--   * ALT is empty          -> "[TITLE]" if TITLE is non-empty, else nothing
--
-- When SRC is present it is appended to state.links and a bracketed
-- reference number follows the text.  While state.div_pf is set, the
-- bytes "\194\133" are appended as well (the same bytes formats.br emits).
--
-- @param node   the <img> element
-- @param state  formatter state (appends to state.links, reads div_pf)
-- @return the rendered text

function formats.img(node,state)
  local attr = node.attributes
  
  local ref = ""
  if attr.src then
    table.insert(state.links,attr.src)
    ref = string.format(" [%d]",#state.links)
  end
  
  local alt   = attr.alt
  local title = attr.title
  local verbiage
  
  if not alt then
    verbiage = title or ""
  elseif alt ~= "" then
    verbiage = alt
  elseif title and title ~= "" then
    verbiage = "[" .. title .. "]"
  else
    verbiage = ""
  end
  
  if state.div_pf then
    return verbiage .. ref .. "\194\133" -- <BR>
  end
  
  return verbiage .. ref
end

-- <BR>: emit the two bytes C2 85, the UTF-8 encoding of U+0085 (NEXT
-- LINE).  Presumably the text wrapper treats this as a forced line break;
-- confirm against org.conman.string.wrapt.

formats.br = function()
  return "\194\133"
end

-- <SCRIPT>: contributes nothing to the text output.

formats.script = function()
  return ""
end

-- <Q>: render the inline content between the ldquo and rdquo entities.

function formats.q(node,state)
  local open  = ENTITIES.ldquo
  local body  = run_inline(node,state)
  local close = ENTITIES.rdquo
  return open .. body .. close
end

-- <SUB>: contributes nothing to the text output.  Note that the
-- element's content is discarded, not just its markup.

formats.sub = function()
  return ""
end

-- <SUP>: render the inline content preceded by a caret.

function formats.sup(node,state)
  local body = run_inline(node,state)
  return "^" .. body
end

-- <SPAN>: render the inline content.  The content is surrounded by
-- underscores when the span carries a LANG attribute or when its CLASS
-- attribute contains "booktitle".

function formats.span(node,state)
  local body = run_inline(node,state)
  local attr = node.attributes
  
  local emphasize = attr.lang
                 or (attr.class and attr.class:match"booktitle")
  
  if emphasize then
    return "_" .. body .. "_"
  end
  
  return body
end

-- Inline elements that are ignored entirely: each of these handlers
-- returns the empty string regardless of the element's content.

do
  local function ignored()
    return ""
  end
  
  for _,tag in ipairs {
    'bdo'      ,
    'map'      ,
    'area'     ,
    'object'   ,
    'param'    ,
    'input'    ,
    'select'   ,
    'textarea' ,
    'label'    ,
    'button'   ,
    'optgroup' ,
    'option'   ,
  } do
    formats[tag] = ignored
  end
end

-- <U>: render the inline content surrounded by underscores.

function formats.u(node,state)
  local body = run_inline(node,state)
  return "_" .. body .. "_"
end

-- <FONT>: no markup of its own; render the inline content only.

function formats.font(node,state)
  local body = run_inline(node,state)
  return body
end

-- *************************************************************************
-- BLOCK
-- *************************************************************************

-- <P>: render the inline content and wrap it as a paragraph.
--
-- Inside a screenplay definition list (state.dl_screenplay set), a
-- paragraph of class "setting" is rendered as "[Setting: ...]" and one of
-- class "direction" as "[...]".
--
-- @param node   the <p> element
-- @param state  formatter state
-- @return the wrapped paragraph text

function formats.p(node,state)
  local text  = run_inline(node,state)
  local class = node.attributes.class
  
  if class and state.dl_screenplay then
    if class == 'setting' then
      text = '[Setting: ' .. text .. ']'
    elseif class == 'direction' then
      text = '[' .. text .. ']'
    end
  end
  
  return wrap_text(text,state)
end

-- LPeg substitution pattern used for preformatted text.  The extra
-- argument passed to match() is inserted at the very start of the subject
-- and again after every newline that is followed by at least one more
-- character (so a trailing newline gets no prefix after it).

local fixline do
  local P,Carg,Cs = lpeg.P,lpeg.Carg,lpeg.Cs
  
  local any     = P(1)
  local prefix  = Carg(1)
  local newline = (P"\n" * #any) / '\n' * prefix
  
  fixline = Cs(prefix * (newline + any)^0)
end

-- <PRE>: render preformatted text.  The content is rendered inline (no
-- wrapping) and every line is then prefixed with the current nest prefix
-- via fixline.
--
-- The children are rendered with run_inline so that comment nodes are
-- skipped, consistent with every other handler.  (Previously a comment
-- inside <pre> was dispatched through formats[item.tag] like an element.)
--
-- @param node   the <pre> element
-- @param state  formatter state (reads state.nest)
-- @return the prefixed preformatted text

function formats.pre(node,state)
  local res = run_inline(node,state)
  return fixline:match(res,1,state.nest[#state.nest])
end

-- <BLOCKQUOTE>: render the flow content with "> " appended to both the
-- first-line prefix (state.initial) and the continuation prefix
-- (state.nest) for the duration of the element.

function formats.blockquote(node,state)
  local marker  = "> "
  local initial = state.initial
  local nest    = state.nest
  
  table.insert(initial,initial[#initial] .. marker)
  table.insert(nest,nest[#nest] .. marker)
  
  local body = run_flow(node,state)
  
  table.remove(nest)
  table.remove(initial)
  
  return body
end

-- <HR>: a row of five spaced asterisks, prefixed with the current nest
-- prefix.

function formats.hr(_,state)
  local indent = state.nest[#state.nest]
  return indent .. "* * * * *\n"
end

-- <ADDRESS>: contributes nothing to the text output.

formats.address = function()
  return ""
end

-- <H1>: contributes nothing to the text output.

formats.h1 = function()
  return ""
end

-- <H2>..<H6>: render the inline content and wrap it like a paragraph; no
-- heading decoration is added.

function formats.h2(node,state)
  return wrap_text(run_inline(node,state),state)
end

for _,tag in ipairs { 'h3' , 'h4' , 'h5' , 'h6' } do
  formats[tag] = formats.h2
end

-- <DIV>: render the flow content.
--
-- While the content of a <div class="pf"> is being rendered,
-- state.div_pf is set (formats.img reads it to append a line break after
-- each image).  The previous value of the flag is saved and restored on
-- exit so that a nested <div> no longer clears the flag for the remainder
-- of an enclosing "pf" div.
--
-- @param node   the <div> element (reads node.attributes.class)
-- @param state  formatter state (temporarily sets state.div_pf)
-- @return the rendered content

function formats.div(node,state)
  local previous = state.div_pf
  
  state.div_pf = node.attributes.class == 'pf'
  local res = run_flow(node,state)
  state.div_pf = previous
  
  return res
end

        -- ***************
        -- DICTIONARY LIST
        -- ***************
        
-- <DL>: render a definition list.
--
-- A CLASS attribute containing "header" and/or "screenplay" selects the
-- corresponding rendering mode (state.dl_header, state.dl_screenplay),
-- which formats.dt and formats.p consult.  A list without a CLASS leaves
-- the modes as they were on entry.
--
-- The modes in effect on entry are saved and restored on exit, so a
-- nested <dl> no longer switches off the mode of the list enclosing it.
--
-- @param node   the <dl> element (reads node.attributes.class)
-- @param state  formatter state (temporarily sets dl_header/dl_screenplay)
-- @return the rendered list

function formats.dl(node,state)
  local prev_header     = state.dl_header
  local prev_screenplay = state.dl_screenplay
  
  local class = node.attributes.class
  if class then
    state.dl_header     = class:match "header"
    state.dl_screenplay = class:match "screenplay"
  end
  
  local res = run_block(node,state)
  
  state.dl_header     = prev_header
  state.dl_screenplay = prev_screenplay
  
  return res
end

-- <DT>: render a definition term and push the prefixes to be used by the
-- definition that follows; state.dt is set to tell formats.dd that there
-- are prefixes to pop.
--
--   * header list:     nothing is emitted; "term:" plus the nbsp entity is
--                      appended to the first-line prefix and eight spaces
--                      to the continuation prefix.
--   * screenplay list: nothing is emitted; the current nest prefix plus
--                      "term: " is appended to the first-line prefix, and
--                      as many spaces as that string is long to the
--                      continuation prefix.
--   * otherwise:       the term is emitted on its own line followed by a
--                      blank line, and both prefixes gain eight spaces.
--
-- @param node   the <dt> element
-- @param state  formatter state (pushes onto state.initial/state.nest)
-- @return text to emit for the term (possibly empty)

function formats.dt(node,state)
  local term    = run_inline(node,state)
  local initial = state.initial
  local nest    = state.nest
  
  state.dt = true
  
  if state.dl_header then
    local label = term .. ":" .. ENTITIES.nbsp
    table.insert(initial,initial[#initial] .. label)
    table.insert(nest,nest[#nest] .. "        ")
    return ""
  end
  
  if state.dl_screenplay then
    local label = nest[#nest] .. term .. ": "
    table.insert(initial,initial[#initial] .. label)
    table.insert(nest,nest[#nest] .. string.rep(" ",#label))
    return ""
  end
  
  local line = nest[#nest] .. term .. '\n\n'
  table.insert(initial,initial[#initial] .. '        ')
  table.insert(nest,nest[#nest] .. '        ')
  return line
end

-- <DD>: render a definition as flow content, then pop the prefixes that
-- the preceding <DT> pushed.
--
-- The state.dt flag is consumed here: it is read and cleared before the
-- content is rendered, and the pop happens only if it was set.  This
-- keeps a second <DD> following the same <DT> from popping prefix-stack
-- entries it does not own (which could empty state.nest), while a
-- definition list nested inside this <DD> still pairs its own <DT>/<DD>
-- correctly.
--
-- @param node   the <dd> element
-- @param state  formatter state (pops state.nest/state.initial)
-- @return the rendered definition

function formats.dd(node,state)
  local owns_prefix = state.dt
  state.dt = false
  
  local res = run_flow(node,state)
  
  if owns_prefix then
    table.remove(state.nest)
    table.remove(state.initial)
  end
  
  return res
end

        -- ********
        -- LISTS
        -- ********
        
-- <UL>: render an unordered list.  A list record of type 'u' is pushed on
-- state.list for the duration so that formats.li knows how to label its
-- items.

function formats.ul(node,state)
  local lists = state.list
  
  table.insert(lists,{ type = 'u' })
  local body = run_flow(node,state)
  table.remove(lists)
  
  return body
end

-- <OL>: render an ordered list.  A list record of type 'o' with a running
-- index starting at 1 is pushed on state.list for the duration.
--
-- state.ol_footnote / state.ol_outline reflect whether this list has
-- class "footnote" / "outline" (formats.li reads ol_footnote to pick the
-- item label style).  The values in effect on entry are restored on exit
-- so that a nested <ol> no longer changes the label style of the items
-- that follow it in the enclosing list.
--
-- @param node   the <ol> element (reads node.attributes.class)
-- @param state  formatter state
-- @return the rendered list

function formats.ol(node,state)
  local prev_footnote = state.ol_footnote
  local prev_outline  = state.ol_outline
  local class         = node.attributes.class
  
  table.insert(state.list, { type = 'o' , idx = 1 })
  state.ol_footnote = class == 'footnote'
  state.ol_outline  = class == 'outline'
  
  local res = run_flow(node,state)
  
  table.remove(state.list)
  state.ol_footnote = prev_footnote
  state.ol_outline  = prev_outline
  
  return res
end

-- <LI>: render a list item as flow content.
--
-- The label depends on the innermost list on state.list:
--   * unordered:            "* "
--   * ordered:              "%3d. "   (running index)
--   * ordered, footnote:    "[%3d] "  (running index)
--
-- The label is appended to the first-line prefix, and a run of spaces of
-- the same width to the continuation prefix, so wrapped lines align with
-- the item text whatever the label width.  (Previously the footnote
-- label, six columns wide, was paired with a five-space pad.)
--
-- An <li> met outside any list is rendered as an unordered item instead
-- of failing on a missing list record.
--
-- @param node   the <li> element
-- @param state  formatter state (reads state.list, state.ol_footnote)
-- @return the rendered item

function formats.li(node,state)
  local info = state.list[#state.list] or { type = 'u' }
  local hdr
  
  if info.type == 'u' then
    hdr = "* "
  else
    local i = info.idx
    info.idx = i + 1
    
    if state.ol_footnote then
      hdr = string.format("[%3d] ",i)
    else
      hdr = string.format("%3d. ",i)
    end
  end
  
  -- continuation lines are indented by exactly the width of the label
  local pad = string.rep(" ",#hdr)
  
  table.insert(state.initial,state.initial[#state.initial] .. hdr)
  table.insert(state.nest,state.nest[#state.nest] .. pad)
  local res = run_flow(node,state)
  table.remove(state.nest)
  table.remove(state.initial)
  return res
end

-- Block elements that are ignored entirely: each of these handlers
-- returns the empty string regardless of the element's content.

do
  local function ignored()
    return ""
  end
  
  for _,tag in ipairs { 'noscript' , 'form' , 'fieldset' , 'legend' } do
    formats[tag] = ignored
  end
end

        -- **********
        -- TABLE
        -- **********
        
-- <TABLE>: render the table's children as a block, then append the footer
-- text that formats.tfoot stashed in state.tfoot (if any), so the footer
-- comes last wherever <tfoot> appeared in the source.  The stash is
-- cleared afterwards.

function formats.table(node,state)
  local body   = run_block(node,state)
  local footer = state.tfoot or ""
  
  state.tfoot = nil
  return body .. footer
end

-- <CAPTION>: a single line reading "Table: <caption text>", prefixed with
-- the current nest prefix.

function formats.caption(node,state)
  local indent = state.nest[#state.nest]
  local title  = run_inline(node,state)
  return indent .. "Table: " .. title .. '\n'
end

-- <COL>: contributes nothing to the text output.

formats.col = function()
  return ""
end

-- <COLGROUP>: contributes nothing to the text output.

formats.colgroup = function()
  return ""
end

-- <THEAD>: render the header rows, followed by a rule of thirty dashes
-- prefixed with the current nest prefix.

function formats.thead(node,state)
  local rows   = run_block(node,state)
  local indent = state.nest[#state.nest]
  return rows .. indent .. "------------------------------\n"
end

-- <TFOOT>: render the footer rows preceded by a rule of thirty dashes,
-- but stash the result in state.tfoot instead of emitting it here;
-- formats.table appends it after the rest of the table.

function formats.tfoot(node,state)
  local indent = state.nest[#state.nest]
  local rule   = indent .. "------------------------------\n"
  
  state.tfoot = rule .. run_block(node,state)
  return ""
end

-- <TBODY>: render the body rows as a block, with nothing added.

function formats.tbody(node,state)
  local rows = run_block(node,state)
  return rows
end

-- <TR>: render one table row, prefixed with the current nest prefix.
-- The cell separator (state.tr_sep) is reset to empty first so the first
-- cell of the row is not preceded by a separator.

function formats.tr(node,state)
  state.tr_sep = ""
  
  local indent = state.nest[#state.nest]
  local cells  = run_block(node,state)
  return indent .. cells
end

        -- -------------------------------------------------------------------------
        -- <TH> and <TD> are flow types, but I use tables to present data
        -- rather than to format text, so I'm using the inline type here.
        -- -------------------------------------------------------------------------
        
-- <TH> and <TD>: render the cell content inline, preceded by the current
-- cell separator (state.tr_sep).  After any cell the separator becomes a
-- tab, so all cells in a row but the first are tab-separated.

do
  local function cell(node,state)
    local sep  = state.tr_sep
    local body = run_inline(node,state)
    
    state.tr_sep = "\t"
    return sep .. body
  end
  
  formats.th = cell
  formats.td = cell
end

-- *************************************************************************
-- INS/DEL
-- *************************************************************************

-- <INS>: inserted text gets no decoration; the content is rendered inline
-- or as a block depending on how the node itself is flagged.

function formats.ins(node,state)
  local render = node.inline and run_inline or run_block
  return render(node,state)
end

-- <DEL>: deleted text is kept but bracketed with "[DELETED-" and
-- "-DELETED]".  The content is rendered inline or as a block depending on
-- how the node itself is flagged; the block form ends with a newline.

function formats.del(node,state)
  local open  = "[DELETED-"
  local close = "-DELETED]"
  
  if not node.inline then
    return open .. run_block(node,state) .. close .. "\n"
  end
  
  return open .. run_inline(node,state) .. close
end

-- *************************************************************************

-- Module entry point: format a parsed HTML document as text.
--
-- A fresh formatter state is created for every call; the prefix stacks
-- (nest, initial) each start with a single empty prefix.
--
-- @param doc  parsed document; its array part holds the top-level nodes
-- @return the formatted text
-- @return array of link targets, in the order they were referenced (the
--         bracketed numbers in the text are indexes into this array)

return function(doc)
  local links = {}
  local state =
  {
    links   = links,
    abbr    = {},
    list    = {},
    nest    = { "" },
    initial = { "" },
    pre     = false,
  }
  
  return run_flow(doc,state),links
end

-- *************************************************************************