💾 Archived View for gmi.noulin.net › gitRepositories › md4c › file › test › normalize.py.gmi captured on 2024-09-29 at 01:14:47. Gemini links have been rewritten to link to archived content

View Raw

More Information

⬅️ Previous capture (2023-01-29)

-=-=-=-=-=-=-

md4c

Log

Files

Refs

README

LICENSE

normalize.py (6506B)

     1 # -*- coding: utf-8 -*-
     2 from html.parser import HTMLParser
     3 import urllib
     4 
     5 try:
     6     from html.parser import HTMLParseError
     7 except ImportError:
     8     # HTMLParseError was removed in Python 3.5. It could never be
     9     # thrown, so we define a placeholder instead.
    10     class HTMLParseError(Exception):
    11         pass
    12 
    13 from html.entities import name2codepoint
    14 import sys
    15 import re
    16 import cgi
    17 
    18 # Normalization code, adapted from
    19 # https://github.com/karlcow/markdown-testsuite/
    20 significant_attrs = ["alt", "href", "src", "title"]
    21 whitespace_re = re.compile('\s+')
    22 class MyHTMLParser(HTMLParser):
    23     def __init__(self):
    24         HTMLParser.__init__(self)
    25         self.convert_charrefs = False
    26         self.last = "starttag"
    27         self.in_pre = False
    28         self.output = ""
    29         self.last_tag = ""
    30     def handle_data(self, data):
    31         after_tag = self.last == "endtag" or self.last == "starttag"
    32         after_block_tag = after_tag and self.is_block_tag(self.last_tag)
    33         if after_tag and self.last_tag == "br":
    34             data = data.lstrip('\n')
    35         if not self.in_pre:
    36             data = whitespace_re.sub(' ', data)
    37         if after_block_tag and not self.in_pre:
    38             if self.last == "starttag":
    39                 data = data.lstrip()
    40             elif self.last == "endtag":
    41                 data = data.strip()
    42         self.output += data
    43         self.last = "data"
    44     def handle_endtag(self, tag):
    45         if tag == "pre":
    46             self.in_pre = False
    47         elif self.is_block_tag(tag):
    48             self.output = self.output.rstrip()
    49         self.output += "</" + tag + ">"
    50         self.last_tag = tag
    51         self.last = "endtag"
    52     def handle_starttag(self, tag, attrs):
    53         if tag == "pre":
    54             self.in_pre = True
    55         if self.is_block_tag(tag):
    56             self.output = self.output.rstrip()
    57         self.output += "<" + tag
    58         # For now we don't strip out 'extra' attributes, because of
    59         # raw HTML test cases.
    60         # attrs = filter(lambda attr: attr[0] in significant_attrs, attrs)
    61         if attrs:
    62             attrs.sort()
    63             for (k,v) in attrs:
    64                 self.output += " " + k
    65                 if v in ['href','src']:
    66                     self.output += ("=" + '"' +
    67                             urllib.quote(urllib.unquote(v), safe='/') + '"')
    68                 elif v != None:
    69                     self.output += ("=" + '"' + cgi.escape(v,quote=True) + '"')
    70         self.output += ">"
    71         self.last_tag = tag
    72         self.last = "starttag"
    73     def handle_startendtag(self, tag, attrs):
    74         """Ignore closing tag for self-closing """
    75         self.handle_starttag(tag, attrs)
    76         self.last_tag = tag
    77         self.last = "endtag"
    78     def handle_comment(self, data):
    79         self.output += '<!--' + data + '-->'
    80         self.last = "comment"
    81     def handle_decl(self, data):
    82         self.output += '<!' + data + '>'
    83         self.last = "decl"
    84     def unknown_decl(self, data):
    85         self.output += '<!' + data + '>'
    86         self.last = "decl"
    87     def handle_pi(self,data):
    88         self.output += '<?' + data + '>'
    89         self.last = "pi"
    90     def handle_entityref(self, name):
    91         try:
    92             c = chr(name2codepoint[name])
    93         except KeyError:
    94             c = None
    95         self.output_char(c, '&' + name + ';')
    96         self.last = "ref"
    97     def handle_charref(self, name):
    98         try:
    99             if name.startswith("x"):
   100                 c = chr(int(name[1:], 16))
   101             else:
   102                 c = chr(int(name))
   103         except ValueError:
   104                 c = None
   105         self.output_char(c, '&' + name + ';')
   106         self.last = "ref"
   107     # Helpers.
   108     def output_char(self, c, fallback):
   109         if c == '<':
   110             self.output += "&lt;"
   111         elif c == '>':
   112             self.output += "&gt;"
   113         elif c == '&':
   114             self.output += "&amp;"
   115         elif c == '"':
   116             self.output += "&quot;"
   117         elif c == None:
   118             self.output += fallback
   119         else:
   120             self.output += c
   121 
   122     def is_block_tag(self,tag):
   123         return (tag in ['article', 'header', 'aside', 'hgroup', 'blockquote',
   124             'hr', 'iframe', 'body', 'li', 'map', 'button', 'object', 'canvas',
   125             'ol', 'caption', 'output', 'col', 'p', 'colgroup', 'pre', 'dd',
   126             'progress', 'div', 'section', 'dl', 'table', 'td', 'dt',
   127             'tbody', 'embed', 'textarea', 'fieldset', 'tfoot', 'figcaption',
   128             'th', 'figure', 'thead', 'footer', 'tr', 'form', 'ul',
   129             'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'video', 'script', 'style'])
   130 
   131 def normalize_html(html):
   132     r"""
   133     Return normalized form of HTML which ignores insignificant output
   134     differences:
   135 
   136     Multiple inner whitespaces are collapsed to a single space (except
   137     in pre tags):
   138 
   139         >>> normalize_html("<p>a  \t b</p>")
   140         '<p>a b</p>'
   141 
   142         >>> normalize_html("<p>a  \t\nb</p>")
   143         '<p>a b</p>'
   144 
   145     * Whitespace surrounding block-level tags is removed.
   146 
   147         >>> normalize_html("<p>a  b</p>")
   148         '<p>a b</p>'
   149 
   150         >>> normalize_html(" <p>a  b</p>")
   151         '<p>a b</p>'
   152 
   153         >>> normalize_html("<p>a  b</p> ")
   154         '<p>a b</p>'
   155 
   156         >>> normalize_html("\n\t<p>\n\t\ta  b\t\t</p>\n\t")
   157         '<p>a b</p>'
   158 
   159         >>> normalize_html("<i>a  b</i> ")
   160         '<i>a b</i> '
   161 
   162     * Self-closing tags are converted to open tags.
   163 
   164         >>> normalize_html("<br />")
   165         '<br>'
   166 
   167     * Attributes are sorted and lowercased.
   168 
   169         >>> normalize_html('<a title="bar" HREF="foo">x</a>')
   170         '<a href="foo" title="bar">x</a>'
   171 
   172     * References are converted to unicode, except that '<', '>', '&', and
   173       '"' are rendered using entities.
   174 
   175         >>> normalize_html("&forall;&amp;&gt;&lt;&quot;")
   176         '\u2200&amp;&gt;&lt;&quot;'
   177 
   178     """
   179     html_chunk_re = re.compile("(\<!\[CDATA\[.*?\]\]\>|\<[^>]*\>|[^<]+)")
   180     try:
   181         parser = MyHTMLParser()
   182         # We work around HTMLParser's limitations parsing CDATA
   183         # by breaking the input into chunks and passing CDATA chunks
   184         # through verbatim.
   185         for chunk in re.finditer(html_chunk_re, html):
   186             if chunk.group(0)[:8] == "<![CDATA":
   187                 parser.output += chunk.group(0)
   188             else:
   189                 parser.feed(chunk.group(0))
   190         parser.close()
   191         return parser.output
   192     except HTMLParseError as e:
   193         sys.stderr.write("Normalization error: " + e.msg + "\n")
   194         return html  # on error, return unnormalized HTML