-- ***************************************************************
--
-- Format a parsed HTML document into text.
-- Copyright 2020 by Sean Conner.  All Rights Reserved.
--
-- This library is free software; you can redistribute it and/or modify it
-- under the terms of the GNU Lesser General Public License as published by
-- the Free Software Foundation; either version 3 of the License, or (at your
-- option) any later version.
--
-- This library is distributed in the hope that it will be useful, but
-- WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-- or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
-- License for more details.
--
-- You should have received a copy of the GNU Lesser General Public License
-- along with this library; if not, see <http://www.gnu.org/licenses/>.
--
-- Comments, questions and criticisms can be sent to: sean@conman.org
--
-- ********************************************************************
-- luacheck: ignore 611

local ENTITIES = require "org.conman.const.entity"
local wrapt    = require "org.conman.string".wrapt
local lpeg     = require "lpeg"

-- Dispatch table: HTML tag name -> function(node,state) returning text.
local formats = {}

-- Column budget handed to wrapt() before the current indent is subtracted.
local WRAP_WIDTH = 77

-- *************************************************************************
-- The formatter keeps two parallel stacks in the state table:
--
--      state.initial   prefix for the FIRST line of the next wrapped block
--      state.nest      prefix for every following line
--
-- push_indent()/pop_indent() keep the two stacks in step.
-- *************************************************************************

local function push_indent(state,first,rest)
  table.insert(state.initial,state.initial[#state.initial] .. first)
  table.insert(state.nest,state.nest[#state.nest] .. rest)
end

local function pop_indent(state)
  table.remove(state.nest)
  table.remove(state.initial)
end

-- *************************************************************************
-- Wrap text to the current indent.  The first line gets the top of
-- state.initial as its prefix, the rest get the top of state.nest, and
-- every line is terminated with a newline.
-- *************************************************************************

local function wrap_text(text,state)
  local nest  = state.nest[#state.nest]
  local lines = wrapt(text,WRAP_WIDTH - #nest)

  -- NOTE(review): wrapt() is defined elsewhere; guard against it handing
  -- back an empty list (e.g. for empty text) instead of concatenating nil.
  
  local acc = { state.initial[#state.initial] , lines[1] or "" , '\n' }
  
  for i = 2 , #lines do
    acc[#acc + 1] = nest
    acc[#acc + 1] = lines[i]
    acc[#acc + 1] = '\n'
  end
  
  return table.concat(acc)
end

-- *************************************************************************
-- Concatenate the rendering of every child of node.  String children are
-- copied through, comment nodes are skipped, and any other table child is
-- dispatched through formats[] by its tag.
-- *************************************************************************

local function run_inline(node,state)
  local acc = {}
  
  for _,item in ipairs(node) do
    if type(item) ~= 'table' then
      acc[#acc + 1] = item
    elseif not item.comment then
      acc[#acc + 1] = formats[item.tag](item,state)
    end
  end
  
  return table.concat(acc)
end

-- *************************************************************************
-- Like run_inline(), but the result is followed by an indent-only line,
-- except for the table sections and <blockquote>.
-- *************************************************************************

local function run_block(node,state)
  local res = run_inline(node,state)
  local tag = node.tag
  
  if  tag ~= 'thead'
  and tag ~= 'tfoot'
  and tag ~= 'tbody'
  and tag ~= 'blockquote' then
    res = res .. state.nest[#state.nest] .. '\n'
  end
  
  return res
end

-- *************************************************************************
-- Render mixed content.  Runs of text and inline children are gathered and
-- wrapped as a unit; each block child flushes the pending run and is
-- rendered on its own, followed by an indent-only line unless it is an
-- <li>, <table> or <dl>.
-- *************************************************************************

local function run_flow(node,state)
  local acc  = {}
  local text = ""
  
  for _,item in ipairs(node) do
    if type(item) ~= 'table' then
      text = text .. item
    elseif item.inline then
      text = text .. formats[item.tag](item,state)
    elseif not item.comment then
      assert(item.block)
      
      if text ~= "" then
        acc[#acc + 1] = wrap_text(text,state)
        text          = ""
      end
      
      local block = formats[item.tag](item,state)
      
      if item.tag ~= 'li' and item.tag ~= 'table' and item.tag ~= 'dl' then
        block = block .. state.nest[#state.nest] .. '\n'
      end
      
      acc[#acc + 1] = block
    end
  end
  
  if text ~= "" then
    acc[#acc + 1] = wrap_text(text,state)
  end
  
  return table.concat(acc)
end

-- *************************************************************************
-- INLINE
-- *************************************************************************

function formats.tt(node,state)
  return "`" .. run_inline(node,state) .. "`"
end

function formats.i(node,state)
  return "_" .. run_inline(node,state) .. "_"
end

function formats.b(node,state)
  return "**" .. run_inline(node,state) .. "**"
end

formats.big   = run_inline
formats.small = run_inline

function formats.em(node,state)
  return "*" .. run_inline(node,state) .. "*"
end

formats.strong = formats.b
formats.dfn    = run_inline
formats.code   = formats.tt
formats.samp   = formats.tt
formats.kbd    = formats.tt
formats.var    = formats.tt

function formats.cite(node,state)
  return ENTITIES.ldquo .. run_inline(node,state) .. ENTITIES.rdquo
end

-- -------------------------------------------------------------------------
-- The first time an abbreviation is seen its title is appended in
-- parentheses.  An <abbr> without a title is passed through untouched and
-- is not marked as seen, so a later titled occurrence still gets expanded.
-- -------------------------------------------------------------------------

function formats.abbr(node,state)
  local res   = run_inline(node,state)
  local title = node.attributes.title
  
  if title and not state.abbr[res] then
    state.abbr[res] = true
    res = res .. string.format(" (%s)",title)
  end
  
  return res
end

formats.acronym = formats.abbr

-- -------------------------------------------------------------------------
-- Links are collected in state.links; the text gets a "[n]" reference to
-- the link's position in that list.
-- -------------------------------------------------------------------------

function formats.a(node,state)
  local res  = run_inline(node,state)
  local attr = node.attributes
  
  if attr.title then
    res = string.format("%s (%s)",res,attr.title)
  end
  
  if attr.href then
    table.insert(state.links,attr.href)
    res = string.format("%s [%d]",res,#state.links)
  end
  
  return res
end

-- -------------------------------------------------------------------------
-- An image is rendered as its ALT text (or its TITLE when there is no ALT;
-- a bracketed TITLE when ALT is present but empty), followed by a link
-- reference to its source.
-- -------------------------------------------------------------------------

function formats.img(node,state)
  local alt   = node.attributes.alt
  local title = node.attributes.title
  local ref   = ""
  
  if node.attributes.src then
    table.insert(state.links,node.attributes.src)
    ref = string.format(" [%d]",#state.links)
  end
  
  local verbiage
  
  if alt == nil then
    verbiage = title or ""
  elseif alt ~= "" then
    verbiage = alt
  elseif title and title ~= "" then
    verbiage = "[" .. title .. "]"
  else
    verbiage = ""
  end
  
  local res = verbiage .. ref
  
  if state.div_pf then
    res = res .. "\194\133" -- U+0085 in UTF-8, the same marker formats.br() emits
  end
  
  return res
end

function formats.br()
  return "\194\133"
end

function formats.script()
  return ""
end

formats.q = formats.cite

function formats.sub()
  return ""
end

function formats.sup(node,state)
  return "^" .. run_inline(node,state)
end

function formats.span(node,state)
  local res   = run_inline(node,state)
  local class = node.attributes.class
  
  if node.attributes.lang or (class and class:match "booktitle") then
    res = "_" .. res .. "_"
  end
  
  return res
end

function formats.bdo()      return "" end
function formats.map()      return "" end
function formats.area()     return "" end
function formats.object()   return "" end
function formats.param()    return "" end
function formats.input()    return "" end
function formats.select()   return "" end
function formats.textarea() return "" end
function formats.label()    return "" end
function formats.button()   return "" end
function formats.optgroup() return "" end
function formats.option()   return "" end

formats.u = formats.i

function formats.font(node,state)
  return run_inline(node,state)
end

-- *************************************************************************
-- BLOCK
-- *************************************************************************

function formats.p(node,state)
  local res   = run_inline(node,state)
  local class = node.attributes.class
  
  if state.dl_screenplay then
    if class == 'setting' then
      res = '[Setting: ' .. res .. ']'
    elseif class == 'direction' then
      res = '[' .. res .. ']'
    end
  end
  
  return wrap_text(res,state)
end

-- -------------------------------------------------------------------------
-- fixline emits the extra argument (the current indent) at the start of
-- the text and again after every newline that is followed by at least one
-- more character.
-- -------------------------------------------------------------------------

local fixline do
  local char = lpeg.P"\n" * #lpeg.P(1) / '\n' * lpeg.Carg(1)
             + lpeg.P(1)
  fixline    = lpeg.Cs(lpeg.Carg(1) * char^0)
end

-- Preformatted text is not wrapped, only indented.  Comment nodes are
-- skipped, as in the other child walkers.

function formats.pre(node,state)
  return fixline:match(run_inline(node,state),1,state.nest[#state.nest])
end

function formats.blockquote(node,state)
  push_indent(state,"| ","| ")
  local res = run_flow(node,state)
  pop_indent(state)
  return res
end

function formats.hr(_,state)
  return state.nest[#state.nest] .. "* * * * *\n"
end

function formats.address() return "" end
function formats.h1()      return "" end
function formats.h2()      return "" end
function formats.h3()      return "" end

function formats.h4(node,state)
  return wrap_text(run_inline(node,state),state)
end

function formats.h5() return "" end
function formats.h6() return "" end

-- -------------------------------------------------------------------------
-- state.div_pf is true while inside <div class="pf"> (formats.img reads
-- it).  The previous value is restored on exit so that a nested <div> does
-- not switch the mode off for the rest of the enclosing one.
-- -------------------------------------------------------------------------

function formats.div(node,state)
  local saved   = state.div_pf
  state.div_pf  = node.attributes.class == 'pf'
  local res     = run_flow(node,state)
  state.div_pf  = saved
  return res
end

-- ***************
-- DICTIONARY LIST
-- ***************

-- -------------------------------------------------------------------------
-- state.dt counts the indents pushed by <dt> that no <dd> has popped yet.
-- It is tracked per <dl>: the enclosing list's count and mode flags are
-- saved on entry and restored on exit, and any indent still pending when
-- the list ends (a <dt> with no <dd>) is popped here.
-- -------------------------------------------------------------------------

function formats.dl(node,state)
  local saved_header     = state.dl_header
  local saved_screenplay = state.dl_screenplay
  local saved_dt         = state.dt
  local class            = node.attributes.class
  
  if class then
    state.dl_header     = class:match "header"
    state.dl_screenplay = class:match "screenplay"
  end
  
  state.dt  = 0
  local res = run_block(node,state)
  
  for _ = 1 , state.dt do
    pop_indent(state)
  end
  
  state.dt            = saved_dt
  state.dl_header     = saved_header     or false
  state.dl_screenplay = saved_screenplay or false
  return res
end

-- -------------------------------------------------------------------------
-- In header and screenplay lists the term is not returned; it becomes the
-- first-line prefix of the definition that follows.  Otherwise the term is
-- returned on a line of its own and the definition is indented under it.
-- -------------------------------------------------------------------------

function formats.dt(node,state)
  local res  = run_inline(node,state)
  local nest = state.nest[#state.nest]
  
  state.dt = (state.dt or 0) + 1
  
  if state.dl_header then
    push_indent(state,nest .. res .. ":" .. ENTITIES.nbsp," ")
    return ""
  end
  
  if state.dl_screenplay then
    local label = nest .. res .. ": "
    push_indent(state,label,string.rep(" ",#label))
    return ""
  end
  
  push_indent(state,' ',' ')
  return nest .. res .. '\n'
end

-- A definition pops every indent pushed by the term(s) preceding it, so a
-- second <dd> for the same term does not unbalance the stacks.

function formats.dd(node,state)
  local res = run_flow(node,state)
  
  for _ = 1 , state.dt or 0 do
    pop_indent(state)
  end
  
  state.dt = 0
  return res
end

-- ********
-- LISTS
-- ********

function formats.ul(node,state)
  table.insert(state.list, { type = 'u' })
  local res = run_flow(node,state)
  table.remove(state.list)
  return res
end

-- The footnote style is recorded on the list's own entry, so a nested <ol>
-- cannot change the numbering style of the list that contains it.

function formats.ol(node,state)
  table.insert(state.list, {
        type     = 'o',
        idx      = 1,
        footnote = node.attributes.class == 'footnote',
  })
  local res = run_flow(node,state)
  table.remove(state.list)
  return res
end

function formats.li(node,state)
  -- a stray <li> outside of any list is treated as an unordered item
  local info = state.list[#state.list] or { type = 'u' }
  local hdr
  
  if info.type == 'u' then
    hdr = "* "
  else
    local i  = info.idx
    info.idx = i + 1
    
    if info.footnote then
      hdr = string.format("[%3d] ",i)
    else
      hdr = string.format("%3d. ",i)
    end
  end
  
  -- continuation lines are indented by the width of the marker so they
  -- line up under the first line's text
  
  push_indent(state,hdr,string.rep(" ",#hdr))
  local res = run_flow(node,state)
  pop_indent(state)
  return res
end

function formats.noscript() return "" end
function formats.form()     return "" end
function formats.fieldset() return "" end
function formats.legend()   return "" end

-- **********
-- TABLE
-- **********

-- -------------------------------------------------------------------------
-- formats.tfoot() stashes its text in state.tfoot so it can be emitted
-- after the body.  The slot is cleared for each table (and the enclosing
-- table's value restored afterwards) so a footer never leaks into another
-- table.
-- -------------------------------------------------------------------------

function formats.table(node,state)
  local saved = state.tfoot
  state.tfoot = nil
  local res   = run_block(node,state) .. (state.tfoot or "")
  state.tfoot = saved
  return res
end

function formats.caption(node,state)
  return state.nest[#state.nest] .. "Table: " .. run_inline(node,state) .. '\n'
end

function formats.col()      return "" end
function formats.colgroup() return "" end

function formats.thead(node,state)
  return run_block(node,state)
      .. state.nest[#state.nest]
      .. "------------------------------\n"
end

function formats.tfoot(node,state)
  state.tfoot = state.nest[#state.nest]
             .. "------------------------------\n"
             .. run_block(node,state)
  return ""
end

function formats.tbody(node,state)
  return run_block(node,state)
end

function formats.tr(node,state)
  state.tr_sep = ""
  return state.nest[#state.nest] .. run_block(node,state)
end

-- -------------------------------------------------------------------------
-- <TH> and <TD> are flow types, but I don't use tables to format
-- text, but to present data, so I'm using inline type here.  Cells of a
-- row are separated by a tab.
-- -------------------------------------------------------------------------

function formats.th(node,state)
  local sep    = state.tr_sep or ""
  local text   = sep .. run_inline(node,state)
  state.tr_sep = "\t"
  return text
end

formats.td = formats.th

-- *************************************************************************
-- INS/DEL
-- *************************************************************************

formats.ins = run_flow

function formats.del(node,state)
  local body
  
  if node.inline then
    body = run_inline(node,state)
  else
    body = run_block(node,state)
  end
  
  return "[DELETED-" .. body .. "-DELETED]"
end

-- *************************************************************************
-- Module entry point: format a parsed document.  Returns the text and the
-- list of link targets, in the order referenced by the "[n]" markers.
-- *************************************************************************

return function(doc)
  local state =
  {
    links   = {},       -- URLs collected by formats.a() and formats.img()
    abbr    = {},       -- abbreviations that have already been expanded
    list    = {},       -- stack of open <ul>/<ol> records
    nest    = { "" },   -- stack of continuation-line prefixes
    initial = { "" },   -- stack of first-line prefixes
    dt      = 0,        -- indents pushed by <dt> and not yet popped
  }
  
  local res = run_flow(doc,state)
  return res,state.links
end

-- *************************************************************************