💾 Archived View for gmi.noulin.net › gitRepositories › md4c › file › test › normalize.py.gmi captured on 2024-09-29 at 01:14:47. Gemini links have been rewritten to link to archived content
⬅️ Previous capture (2023-01-29)
-=-=-=-=-=-=-
normalize.py (6506B)
# -*- coding: utf-8 -*-
"""Normalize HTML so that insignificant differences do not affect comparisons.

The public entry point is :func:`normalize_html`; :class:`MyHTMLParser` does
the actual re-serialization.
"""
from html import escape as html_escape
from html.entities import name2codepoint
from html.parser import HTMLParser
import re
import sys
import urllib  # no longer used here; retained so the module namespace is unchanged

# NOTE: `cgi` is intentionally not imported any more. `cgi.escape` was removed
# in Python 3.8 and the whole module in Python 3.13; `html.escape` replaces it.

try:
    from html.parser import HTMLParseError
except ImportError:
    # HTMLParseError was removed in Python 3.5. It could never be
    # thrown, so we define a placeholder instead.
    class HTMLParseError(Exception):
        pass

# Normalization code, adapted from
# https://github.com/karlcow/markdown-testsuite/
significant_attrs = ["alt", "href", "src", "title"]
whitespace_re = re.compile(r'\s+')

# Splits the input into CDATA sections, tags, and runs of text.
_html_chunk_re = re.compile(r"(<!\[CDATA\[.*?\]\]>|<[^>]*>|[^<]+)")

# Tags around which surrounding whitespace is insignificant.
_BLOCK_TAGS = frozenset([
    'article', 'header', 'aside', 'hgroup', 'blockquote',
    'hr', 'iframe', 'body', 'li', 'map', 'button', 'object', 'canvas',
    'ol', 'caption', 'output', 'col', 'p', 'colgroup', 'pre', 'dd',
    'progress', 'div', 'section', 'dl', 'table', 'td', 'dt',
    'tbody', 'embed', 'textarea', 'fieldset', 'tfoot', 'figcaption',
    'th', 'figure', 'thead', 'footer', 'tr', 'form', 'ul',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'video', 'script', 'style',
])

# Characters that must stay entity-encoded after references are resolved.
_ESCAPED_CHARS = {
    '<': "&lt;",
    '>': "&gt;",
    '&': "&amp;",
    '"': "&quot;",
}


def _attr_sort_key(attr):
    """Order attributes by name, then valueless ones first, then by value.

    Plain tuple sorting raises TypeError when a repeated attribute name has
    both a ``None`` and a string value; this key yields the same order for
    every input that used to sort successfully.
    """
    name, value = attr
    return (name, value is not None, value or "")


class MyHTMLParser(HTMLParser):
    """Parser that re-serializes HTML into a canonical form.

    The normalized markup accumulates in ``self.output``.  ``self.last``
    records the kind of the previously handled token, ``self.last_tag`` the
    name of the previously handled tag, and ``self.in_pre`` whether the
    parser is currently inside a ``<pre>`` element.
    """

    def __init__(self):
        # Character references are handled explicitly by handle_entityref /
        # handle_charref, so automatic conversion is turned off.
        HTMLParser.__init__(self, convert_charrefs=False)
        self.last = "starttag"
        self.in_pre = False
        self.output = ""
        self.last_tag = ""

    def handle_data(self, data):
        """Append text, collapsing and trimming whitespace outside <pre>."""
        after_tag = self.last == "endtag" or self.last == "starttag"
        after_block_tag = after_tag and self.is_block_tag(self.last_tag)
        if after_tag and self.last_tag == "br":
            data = data.lstrip('\n')
        if not self.in_pre:
            data = whitespace_re.sub(' ', data)
        if after_block_tag and not self.in_pre:
            if self.last == "starttag":
                data = data.lstrip()
            elif self.last == "endtag":
                data = data.strip()
        self.output += data
        self.last = "data"

    def handle_endtag(self, tag):
        """Append a closing tag, trimming whitespace before block tags."""
        if tag == "pre":
            self.in_pre = False
        elif self.is_block_tag(tag):
            self.output = self.output.rstrip()
        self.output += "</" + tag + ">"
        self.last_tag = tag
        self.last = "endtag"

    def handle_starttag(self, tag, attrs):
        """Append an opening tag with its attributes sorted and escaped."""
        if tag == "pre":
            self.in_pre = True
        if self.is_block_tag(tag):
            self.output = self.output.rstrip()
        self.output += "<" + tag
        # For now we don't strip out 'extra' attributes, because of
        # raw HTML test cases.
        # attrs = filter(lambda attr: attr[0] in significant_attrs, attrs)
        #
        # NOTE(review): the original code had a branch guarded by
        # `v in ['href', 'src']` (a test on the attribute *value*) that called
        # the Python 2 only urllib.quote/unquote.  It could only run for a
        # value literally equal to "href" or "src", where quoting is the
        # identity, and on Python 3 it raised AttributeError.  It is dropped
        # here; URL values are escaped like every other attribute.
        for (k, v) in sorted(attrs, key=_attr_sort_key):
            self.output += " " + k
            if v is not None:
                # html.escape also encodes "'" (as &#x27;), which the removed
                # cgi.escape did not.
                self.output += ("=" + '"' + html_escape(v, quote=True) + '"')
        self.output += ">"
        self.last_tag = tag
        self.last = "starttag"

    def handle_startendtag(self, tag, attrs):
        """Ignore closing tag for self-closing """
        self.handle_starttag(tag, attrs)
        self.last_tag = tag
        self.last = "endtag"

    def handle_comment(self, data):
        """Append a comment verbatim."""
        self.output += '<!--' + data + '-->'
        self.last = "comment"

    def handle_decl(self, data):
        """Append a declaration verbatim."""
        self.output += '<!' + data + '>'
        self.last = "decl"

    def unknown_decl(self, data):
        """Append an unrecognized declaration verbatim."""
        self.output += '<!' + data + '>'
        self.last = "decl"

    def handle_pi(self, data):
        """Append a processing instruction verbatim."""
        self.output += '<?' + data + '>'
        self.last = "pi"

    def handle_entityref(self, name):
        """Resolve a named reference; unknown names are emitted unchanged."""
        try:
            c = chr(name2codepoint[name])
        except KeyError:
            c = None
        self.output_char(c, '&' + name + ';')
        self.last = "ref"

    def handle_charref(self, name):
        """Resolve a numeric reference; invalid ones are emitted unchanged.

        ``name`` is the text between ``&#`` and ``;``, e.g. ``"65"`` or
        ``"x41"``.
        """
        try:
            # The parser accepts both "x" and "X" as the hexadecimal marker.
            if name[:1] in ("x", "X"):
                c = chr(int(name[1:], 16))
            else:
                c = chr(int(name))
        except (ValueError, OverflowError):
            # Not a number, or outside the range chr() accepts.
            c = None
        self.output_char(c, '&#' + name + ';')
        self.last = "ref"

    # Helpers.
    def output_char(self, c, fallback):
        """Append ``c``, entity-encoding markup characters.

        ``fallback`` is appended instead when ``c`` is None.
        """
        if c is None:
            self.output += fallback
        else:
            self.output += _ESCAPED_CHARS.get(c, c)

    def is_block_tag(self, tag):
        """Return True if ``tag`` is treated as a block-level tag."""
        return tag in _BLOCK_TAGS


def normalize_html(html):
    r"""
    Return normalized form of HTML which ignores insignificant output
    differences:

    * Multiple inner whitespaces are collapsed to a single space (except
      in pre tags):

        >>> normalize_html("<p>a  \t b</p>")
        '<p>a b</p>'

        >>> normalize_html("<p>a  \t\nb</p>")
        '<p>a b</p>'

    * Whitespace surrounding block-level tags is removed.

        >>> normalize_html("<p>a  b</p>")
        '<p>a b</p>'

        >>> normalize_html(" <p>a  b</p>")
        '<p>a b</p>'

        >>> normalize_html("<p>a  b</p> ")
        '<p>a b</p>'

        >>> normalize_html("\n\t<p>\n\t\ta  b\t\t</p>\n\t")
        '<p>a b</p>'

        >>> normalize_html("<i>a  b</i> ")
        '<i>a b</i> '

    * Self-closing tags are converted to open tags.

        >>> normalize_html("<br />")
        '<br>'

    * Attributes are sorted and lowercased.

        >>> normalize_html('<a title="bar" HREF="foo">x</a>')
        '<a href="foo" title="bar">x</a>'

    * References are converted to unicode, except that '<', '>', '&', and
      '"' are rendered using entities.

        >>> normalize_html("&forall;&amp;&gt;&lt;&quot;") == '\u2200&amp;&gt;&lt;&quot;'
        True

    If the parser raises HTMLParseError, a message is written to stderr and
    the input is returned unnormalized.
    """
    try:
        parser = MyHTMLParser()
        # We work around HTMLParser's limitations parsing CDATA
        # by breaking the input into chunks and passing CDATA chunks
        # through verbatim.
        for chunk in _html_chunk_re.finditer(html):
            text = chunk.group(0)
            if text[:8] == "<![CDATA":
                parser.output += text
            else:
                parser.feed(text)
        parser.close()
        return parser.output
    except HTMLParseError as e:
        # The placeholder exception class has no `msg` attribute.
        sys.stderr.write("Normalization error: " + getattr(e, "msg", str(e)) + "\n")
        return html  # on error, return unnormalized HTML