md4c.c (234850B)
1 /* commit e9ff661ff818ee94a4a231958d9b6768dc6882c9 - mity/md4c repo 2 * MD4C: Markdown parser for C 3 * (http://github.com/mity/md4c) 4 * 5 * Copyright (c) 2016-2020 Martin Mitas 6 * 7 * Permission is hereby granted, free of charge, to any person obtaining a 8 * copy of this software and associated documentation files (the "Software"), 9 * to deal in the Software without restriction, including without limitation 10 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 11 * and/or sell copies of the Software, and to permit persons to whom the 12 * Software is furnished to do so, subject to the following conditions: 13 * 14 * The above copyright notice and this permission notice shall be included in 15 * all copies or substantial portions of the Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 23 * IN THE SOFTWARE. 24 */ 25 26 #include "md4c.h" 27 28 #include <limits.h> 29 #include <stdio.h> 30 #include <stdlib.h> 31 #include <string.h> 32 33 34 /***************************** 35 *** Miscellaneous Stuff *** 36 *****************************/ 37 38 #if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199409L 39 /* C89/90 or old compilers in general may not understand "inline". */ 40 #if defined __GNUC__ 41 #define inline __inline__ 42 #elif defined _MSC_VER 43 #define inline __inline 44 #else 45 #define inline 46 #endif 47 #endif 48 49 /* Make the UTF-8 support the default. 
*/ 50 #if !defined MD4C_USE_ASCII && !defined MD4C_USE_UTF8 && !defined MD4C_USE_UTF16 51 #define MD4C_USE_UTF8 52 #endif 53 54 /* Magic for making wide literals with MD4C_USE_UTF16. */ 55 #ifdef _T 56 #undef _T 57 #endif 58 #if defined MD4C_USE_UTF16 59 #define _T(x) L##x 60 #else 61 #define _T(x) x 62 #endif 63 64 /* Misc. macros. */ 65 #define SIZEOF_ARRAY(a) (sizeof(a) / sizeof(a[0])) 66 67 #define STRINGIZE_(x) #x 68 #define STRINGIZE(x) STRINGIZE_(x) 69 70 #ifndef TRUE 71 #define TRUE 1 72 #define FALSE 0 73 #endif 74 75 #define MD_LOG(msg) \ 76 do { \ 77 if(ctx->parser.debug_log != NULL) \ 78 ctx->parser.debug_log((msg), ctx->userdata); \ 79 } while(0) 80 81 #ifdef DEBUG 82 #define MD_ASSERT(cond) \ 83 do { \ 84 if(!(cond)) { \ 85 MD_LOG(__FILE__ ":" STRINGIZE(__LINE__) ": " \ 86 "Assertion '" STRINGIZE(cond) "' failed."); \ 87 exit(1); \ 88 } \ 89 } while(0) 90 91 #define MD_UNREACHABLE() MD_ASSERT(1 == 0) 92 #else 93 #ifdef __GNUC__ 94 #define MD_ASSERT(cond) do { if(!(cond)) __builtin_unreachable(); } while(0) 95 #define MD_UNREACHABLE() do { __builtin_unreachable(); } while(0) 96 #elif defined _MSC_VER && _MSC_VER > 120 97 #define MD_ASSERT(cond) do { __assume(cond); } while(0) 98 #define MD_UNREACHABLE() do { __assume(0); } while(0) 99 #else 100 #define MD_ASSERT(cond) do {} while(0) 101 #define MD_UNREACHABLE() do {} while(0) 102 #endif 103 #endif 104 105 /* For falling through case labels in switch statements. */ 106 #if defined __clang__ && __clang_major__ >= 12 107 #define MD_FALLTHROUGH() __attribute__((fallthrough)) 108 #elif defined __GNUC__ && __GNUC__ >= 7 109 #define MD_FALLTHROUGH() __attribute__((fallthrough)) 110 #else 111 #define MD_FALLTHROUGH() ((void)0) 112 #endif 113 114 /* Suppress "unused parameter" warnings. */ 115 #define MD_UNUSED(x) ((void)x) 116 117 118 /************************ 119 *** Internal Types *** 120 ************************/ 121 122 /* These are omnipresent so lets save some typing. 
 */
#define CHAR    MD_CHAR
#define SZ      MD_SIZE
#define OFF     MD_OFFSET

/* Forward declarations; the structures themselves are defined elsewhere in
 * this file. */
typedef struct MD_MARK_tag MD_MARK;
typedef struct MD_BLOCK_tag MD_BLOCK;
typedef struct MD_CONTAINER_tag MD_CONTAINER;
typedef struct MD_REF_DEF_tag MD_REF_DEF;


/* During the analysis of inline marks, we need to manage some "mark chains"
 * of (yet unresolved) openers. This structure holds start/end of the chain.
 * The chain internals are then realized through MD_MARK::prev and ::next.
 */
typedef struct MD_MARKCHAIN_tag MD_MARKCHAIN;
struct MD_MARKCHAIN_tag {
    int head;   /* Index of first mark in the chain, or -1 if empty. */
    int tail;   /* Index of last mark in the chain, or -1 if empty. */
};

/* Context propagated through all the parsing. */
typedef struct MD_CTX_tag MD_CTX;
struct MD_CTX_tag {
    /* Immutable stuff (parameters of md_parse()). */
    const CHAR* text;
    SZ size;
    MD_PARSER parser;
    void* userdata;

    /* When this is true, it allows some optimizations. */
    int doc_ends_with_newline;

    /* Helper temporary growing buffer (grown by the MD_TEMP_BUFFER() macro;
     * alloc_buffer tracks the size last passed to realloc()). */
    CHAR* buffer;
    unsigned alloc_buffer;

    /* Reference definitions. */
    MD_REF_DEF* ref_defs;
    int n_ref_defs;
    int alloc_ref_defs;
    void** ref_def_hashtable;
    int ref_def_hashtable_size;

    /* Stack of inline/span markers.
     * This is only used for parsing a single block contents but by storing it
     * here we may reuse the stack for subsequent blocks; i.e. we have fewer
     * (re)allocations. */
    MD_MARK* marks;
    int n_marks;
    int alloc_marks;

    /* One slot per code unit value below 128 (UTF-16 build) or per possible
     * byte value (other builds). */
#if defined MD4C_USE_UTF16
    char mark_char_map[128];
#else
    char mark_char_map[256];
#endif

    /* For resolving of inline spans.
     * The macros below give symbolic names to the individual slots of
     * mark_chains[]; the array size (17) must cover the highest index used
     * (16). All of them expect a variable named 'ctx' to be in scope. */
    MD_MARKCHAIN mark_chains[17];
#define PTR_CHAIN                               (ctx->mark_chains[0])
#define TABLECELLBOUNDARIES                     (ctx->mark_chains[1])
#define ASTERISK_OPENERS_extraword_mod3_0       (ctx->mark_chains[2])
#define ASTERISK_OPENERS_extraword_mod3_1       (ctx->mark_chains[3])
#define ASTERISK_OPENERS_extraword_mod3_2       (ctx->mark_chains[4])
#define ASTERISK_OPENERS_intraword_mod3_0       (ctx->mark_chains[5])
#define ASTERISK_OPENERS_intraword_mod3_1       (ctx->mark_chains[6])
#define ASTERISK_OPENERS_intraword_mod3_2       (ctx->mark_chains[7])
#define UNDERSCORE_OPENERS                      (ctx->mark_chains[8])
#define TILDE_OPENERS_1                         (ctx->mark_chains[9])
#define TILDE_OPENERS_2                         (ctx->mark_chains[10])
#define BRACKET_OPENERS                         (ctx->mark_chains[11])
#define DOLLAR_OPENERS                          (ctx->mark_chains[12])
#define FAINT_OPENERS                           (ctx->mark_chains[13])
#define INVERSE_OPENERS                         (ctx->mark_chains[14])
#define CONCEAL_OPENERS                         (ctx->mark_chains[15])
#define BLINK_OPENERS                           (ctx->mark_chains[16])
/* Index range of the opener chains within mark_chains[]; slot 0 (PTR_CHAIN)
 * lies outside of it. */
#define OPENERS_CHAIN_FIRST                     1
#define OPENERS_CHAIN_LAST                      16

    int n_table_cell_boundaries;

    /* For resolving links. */
    int unresolved_link_head;
    int unresolved_link_tail;

    /* For resolving raw HTML. */
    OFF html_comment_horizon;
    OFF html_proc_instr_horizon;
    OFF html_decl_horizon;
    OFF html_cdata_horizon;

    /* For block analysis.
     * Notes:
     *   -- It holds MD_BLOCK as well as MD_LINE structures. After each
     *      MD_BLOCK, its (multiple) MD_LINE(s) follow.
     *   -- For MD_BLOCK_HTML and MD_BLOCK_CODE, MD_VERBATIMLINE(s) are used
     *      instead of MD_LINE(s).
     */
    void* block_bytes;
    MD_BLOCK* current_block;
    int n_block_bytes;
    int alloc_block_bytes;

    /* For container block analysis. */
    MD_CONTAINER* containers;
    int n_containers;
    int alloc_containers;

    /* Minimal indentation to call the block "indented code block". */
    unsigned code_indent_offset;

    /* Contextual info for line analysis. */
    SZ code_fence_length;   /* For checking closing fence length. */
    int html_block_type;    /* For checking closing raw HTML condition. */
    int last_line_has_list_loosening_effect;
    int last_list_item_starts_with_two_blank_lines;
};

/* Classification of a single input line. */
enum MD_LINETYPE_tag {
    MD_LINE_BLANK,
    MD_LINE_HR,
    MD_LINE_ATXHEADER,
    MD_LINE_SETEXTHEADER,
    MD_LINE_SETEXTUNDERLINE,
    MD_LINE_INDENTEDCODE,
    MD_LINE_FENCEDCODE,
    MD_LINE_HTML,
    MD_LINE_TEXT,
    MD_LINE_TABLE,
    MD_LINE_TABLEUNDERLINE
};
typedef enum MD_LINETYPE_tag MD_LINETYPE;

/* Result of analyzing one line. 'type' and 'data' are packed into 16-bit
 * bit-fields.
 * NOTE(review): the meaning of 'data' is not visible in this part of the
 * file; presumably it depends on 'type' -- confirm against the line
 * analysis code. */
typedef struct MD_LINE_ANALYSIS_tag MD_LINE_ANALYSIS;
struct MD_LINE_ANALYSIS_tag {
    MD_LINETYPE type    : 16;
    unsigned data       : 16;
    OFF beg;
    OFF end;
    unsigned indent;        /* Indentation level. */
};

/* Extent of one line, as a pair of offsets into the document text. */
typedef struct MD_LINE_tag MD_LINE;
struct MD_LINE_tag {
    OFF beg;
    OFF end;
};

/* Like MD_LINE, with an additional 'indent' member; used instead of MD_LINE
 * for MD_BLOCK_HTML and MD_BLOCK_CODE (see the MD_CTX::block_bytes notes). */
typedef struct MD_VERBATIMLINE_tag MD_VERBATIMLINE;
struct MD_VERBATIMLINE_tag {
    OFF beg;
    OFF end;
    OFF indent;
};


/*****************
 ***  Helpers  ***
 *****************/

/* Character accessors.
 * Both expect a variable named 'ctx' to be in scope: CH() yields the
 * character at the given document offset, STR() a pointer to it. */
#define CH(off)                 (ctx->text[(off)])
#define STR(off)                (ctx->text + (off))

/* Character classification.
 * Note we assume ASCII compatibility of code points < 128 here.
*/ 289 #define ISIN_(ch, ch_min, ch_max) ((ch_min) <= (unsigned)(ch) && (unsigned)(ch) <= (ch_max)) 290 #define ISANYOF_(ch, palette) ((ch) != _T('\0') && md_strchr((palette), (ch)) != NULL) 291 #define ISANYOF2_(ch, ch1, ch2) ((ch) == (ch1) || (ch) == (ch2)) 292 #define ISANYOF3_(ch, ch1, ch2, ch3) ((ch) == (ch1) || (ch) == (ch2) || (ch) == (ch3)) 293 #define ISASCII_(ch) ((unsigned)(ch) <= 127) 294 #define ISBLANK_(ch) (ISANYOF2_((ch), _T(' '), _T('\t'))) 295 #define ISNEWLINE_(ch) (ISANYOF2_((ch), _T('\r'), _T('\n'))) 296 #define ISWHITESPACE_(ch) (ISBLANK_(ch) || ISANYOF2_((ch), _T('\v'), _T('\f'))) 297 #define ISCNTRL_(ch) ((unsigned)(ch) <= 31 || (unsigned)(ch) == 127) 298 #define ISPUNCT_(ch) (ISIN_(ch, 33, 47) || ISIN_(ch, 58, 64) || ISIN_(ch, 91, 96) || ISIN_(ch, 123, 126)) 299 #define ISUPPER_(ch) (ISIN_(ch, _T('A'), _T('Z'))) 300 #define ISLOWER_(ch) (ISIN_(ch, _T('a'), _T('z'))) 301 #define ISALPHA_(ch) (ISUPPER_(ch) || ISLOWER_(ch)) 302 #define ISDIGIT_(ch) (ISIN_(ch, _T('0'), _T('9'))) 303 #define ISXDIGIT_(ch) (ISDIGIT_(ch) || ISIN_(ch, _T('A'), _T('F')) || ISIN_(ch, _T('a'), _T('f'))) 304 #define ISALNUM_(ch) (ISALPHA_(ch) || ISDIGIT_(ch)) 305 306 #define ISANYOF(off, palette) ISANYOF_(CH(off), (palette)) 307 #define ISANYOF2(off, ch1, ch2) ISANYOF2_(CH(off), (ch1), (ch2)) 308 #define ISANYOF3(off, ch1, ch2, ch3) ISANYOF3_(CH(off), (ch1), (ch2), (ch3)) 309 #define ISASCII(off) ISASCII_(CH(off)) 310 #define ISBLANK(off) ISBLANK_(CH(off)) 311 #define ISNEWLINE(off) ISNEWLINE_(CH(off)) 312 #define ISWHITESPACE(off) ISWHITESPACE_(CH(off)) 313 #define ISCNTRL(off) ISCNTRL_(CH(off)) 314 #define ISPUNCT(off) ISPUNCT_(CH(off)) 315 #define ISUPPER(off) ISUPPER_(CH(off)) 316 #define ISLOWER(off) ISLOWER_(CH(off)) 317 #define ISALPHA(off) ISALPHA_(CH(off)) 318 #define ISDIGIT(off) ISDIGIT_(CH(off)) 319 #define ISXDIGIT(off) ISXDIGIT_(CH(off)) 320 #define ISALNUM(off) ISALNUM_(CH(off)) 321 322 323 #if defined MD4C_USE_UTF16 324 #define md_strchr wcschr 325 
#else 326 #define md_strchr strchr 327 #endif 328 329 330 /* Case insensitive check of string equality. */ 331 static inline int 332 md_ascii_case_eq(const CHAR* s1, const CHAR* s2, SZ n) 333 { 334 OFF i; 335 for(i = 0; i < n; i++) { 336 CHAR ch1 = s1[i]; 337 CHAR ch2 = s2[i]; 338 339 if(ISLOWER_(ch1)) 340 ch1 += ('A'-'a'); 341 if(ISLOWER_(ch2)) 342 ch2 += ('A'-'a'); 343 if(ch1 != ch2) 344 return FALSE; 345 } 346 return TRUE; 347 } 348 349 static inline int 350 md_ascii_eq(const CHAR* s1, const CHAR* s2, SZ n) 351 { 352 return memcmp(s1, s2, n * sizeof(CHAR)) == 0; 353 } 354 355 static int 356 md_text_with_null_replacement(MD_CTX* ctx, MD_TEXTTYPE type, const CHAR* str, SZ size) 357 { 358 OFF off = 0; 359 int ret = 0; 360 361 while(1) { 362 while(off < size && str[off] != _T('\0')) 363 off++; 364 365 if(off > 0) { 366 ret = ctx->parser.text(type, str, off, ctx->userdata); 367 if(ret != 0) 368 return ret; 369 370 str += off; 371 size -= off; 372 off = 0; 373 } 374 375 if(off >= size) 376 return 0; 377 378 ret = ctx->parser.text(MD_TEXT_NULLCHAR, _T(""), 1, ctx->userdata); 379 if(ret != 0) 380 return ret; 381 off++; 382 } 383 } 384 385 386 #define MD_CHECK(func) \ 387 do { \ 388 ret = (func); \ 389 if(ret < 0) \ 390 goto abort; \ 391 } while(0) 392 393 394 #define MD_TEMP_BUFFER(sz) \ 395 do { \ 396 if(sz > ctx->alloc_buffer) { \ 397 CHAR* new_buffer; \ 398 SZ new_size = ((sz) + (sz) / 2 + 128) & ~127; \ 399 \ 400 new_buffer = realloc(ctx->buffer, new_size); \ 401 if(new_buffer == NULL) { \ 402 MD_LOG("realloc() failed."); \ 403 ret = -1; \ 404 goto abort; \ 405 } \ 406 \ 407 ctx->buffer = new_buffer; \ 408 ctx->alloc_buffer = new_size; \ 409 } \ 410 } while(0) 411 412 413 #define MD_ENTER_BLOCK(type, arg) \ 414 do { \ 415 ret = ctx->parser.enter_block((type), (arg), ctx->userdata); \ 416 if(ret != 0) { \ 417 MD_LOG("Aborted from enter_block() callback."); \ 418 goto abort; \ 419 } \ 420 } while(0) 421 422 #define MD_LEAVE_BLOCK(type, arg) \ 423 do { \ 424 ret = 
ctx->parser.leave_block((type), (arg), ctx->userdata); \ 425 if(ret != 0) { \ 426 MD_LOG("Aborted from leave_block() callback."); \ 427 goto abort; \ 428 } \ 429 } while(0) 430 431 #define MD_ENTER_SPAN(type, arg) \ 432 do { \ 433 ret = ctx->parser.enter_span((type), (arg), ctx->userdata); \ 434 if(ret != 0) { \ 435 MD_LOG("Aborted from enter_span() callback."); \ 436 goto abort; \ 437 } \ 438 } while(0) 439 440 #define MD_LEAVE_SPAN(type, arg) \ 441 do { \ 442 ret = ctx->parser.leave_span((type), (arg), ctx->userdata); \ 443 if(ret != 0) { \ 444 MD_LOG("Aborted from leave_span() callback."); \ 445 goto abort; \ 446 } \ 447 } while(0) 448 449 #define MD_TEXT(type, str, size) \ 450 do { \ 451 if(size > 0) { \ 452 ret = ctx->parser.text((type), (str), (size), ctx->userdata); \ 453 if(ret != 0) { \ 454 MD_LOG("Aborted from text() callback."); \ 455 goto abort; \ 456 } \ 457 } \ 458 } while(0) 459 460 #define MD_TEXT_INSECURE(type, str, size) \ 461 do { \ 462 if(size > 0) { \ 463 ret = md_text_with_null_replacement(ctx, type, str, size); \ 464 if(ret != 0) { \ 465 MD_LOG("Aborted from text() callback."); \ 466 goto abort; \ 467 } \ 468 } \ 469 } while(0) 470 471 472 /* If the offset falls into a gap between line, we return the following 473 * line. 
*/ 474 static const MD_LINE* 475 md_lookup_line(OFF off, const MD_LINE* lines, int n_lines) 476 { 477 int lo, hi; 478 int pivot; 479 const MD_LINE* line; 480 481 lo = 0; 482 hi = n_lines - 1; 483 while(lo <= hi) { 484 pivot = (lo + hi) / 2; 485 line = &lines[pivot]; 486 487 if(off < line->beg) { 488 hi = pivot - 1; 489 if(hi < 0 || lines[hi].end <= off) 490 return line; 491 } else if(off > line->end) { 492 lo = pivot + 1; 493 } else { 494 return line; 495 } 496 } 497 498 return NULL; 499 } 500 501 502 /************************* 503 *** Unicode Support *** 504 *************************/ 505 506 typedef struct MD_UNICODE_FOLD_INFO_tag MD_UNICODE_FOLD_INFO; 507 struct MD_UNICODE_FOLD_INFO_tag { 508 unsigned codepoints[3]; 509 unsigned n_codepoints; 510 }; 511 512 513 #if defined MD4C_USE_UTF16 || defined MD4C_USE_UTF8 514 /* Binary search over sorted "map" of codepoints. Consecutive sequences 515 * of codepoints may be encoded in the map by just using the 516 * (MIN_CODEPOINT | 0x40000000) and (MAX_CODEPOINT | 0x80000000). 517 * 518 * Returns index of the found record in the map (in the case of ranges, 519 * the minimal value is used); or -1 on failure. */ 520 static int 521 md_unicode_bsearch__(unsigned codepoint, const unsigned* map, size_t map_size) 522 { 523 int beg, end; 524 int pivot_beg, pivot_end; 525 526 beg = 0; 527 end = (int) map_size-1; 528 while(beg <= end) { 529 /* Pivot may be a range, not just a single value. 
*/ 530 pivot_beg = pivot_end = (beg + end) / 2; 531 if(map[pivot_end] & 0x40000000) 532 pivot_end++; 533 if(map[pivot_beg] & 0x80000000) 534 pivot_beg--; 535 536 if(codepoint < (map[pivot_beg] & 0x00ffffff)) 537 end = pivot_beg - 1; 538 else if(codepoint > (map[pivot_end] & 0x00ffffff)) 539 beg = pivot_end + 1; 540 else 541 return pivot_beg; 542 } 543 544 return -1; 545 } 546 547 static int 548 md_is_unicode_whitespace__(unsigned codepoint) 549 { 550 #define R(cp_min, cp_max) ((cp_min) | 0x40000000), ((cp_max) | 0x80000000) 551 #define S(cp) (cp) 552 /* Unicode "Zs" category. 553 * (generated by scripts/build_whitespace_map.py) */ 554 static const unsigned WHITESPACE_MAP[] = { 555 S(0x0020), S(0x00a0), S(0x1680), R(0x2000,0x200a), S(0x202f), S(0x205f), S(0x3000) 556 }; 557 #undef R 558 #undef S 559 560 /* The ASCII ones are the most frequently used ones, also CommonMark 561 * specification requests few more in this range. */ 562 if(codepoint <= 0x7f) 563 return ISWHITESPACE_(codepoint); 564 565 return (md_unicode_bsearch__(codepoint, WHITESPACE_MAP, SIZEOF_ARRAY(WHITESPACE_MAP)) >= 0); 566 } 567 568 static int 569 md_is_unicode_punct__(unsigned codepoint) 570 { 571 #define R(cp_min, cp_max) ((cp_min) | 0x40000000), ((cp_max) | 0x80000000) 572 #define S(cp) (cp) 573 /* Unicode "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" categories. 
574 * (generated by scripts/build_punct_map.py) */ 575 static const unsigned PUNCT_MAP[] = { 576 R(0x0021,0x0023), R(0x0025,0x002a), R(0x002c,0x002f), R(0x003a,0x003b), R(0x003f,0x0040), 577 R(0x005b,0x005d), S(0x005f), S(0x007b), S(0x007d), S(0x00a1), S(0x00a7), S(0x00ab), R(0x00b6,0x00b7), 578 S(0x00bb), S(0x00bf), S(0x037e), S(0x0387), R(0x055a,0x055f), R(0x0589,0x058a), S(0x05be), S(0x05c0), 579 S(0x05c3), S(0x05c6), R(0x05f3,0x05f4), R(0x0609,0x060a), R(0x060c,0x060d), S(0x061b), R(0x061e,0x061f), 580 R(0x066a,0x066d), S(0x06d4), R(0x0700,0x070d), R(0x07f7,0x07f9), R(0x0830,0x083e), S(0x085e), 581 R(0x0964,0x0965), S(0x0970), S(0x09fd), S(0x0a76), S(0x0af0), S(0x0c77), S(0x0c84), S(0x0df4), S(0x0e4f), 582 R(0x0e5a,0x0e5b), R(0x0f04,0x0f12), S(0x0f14), R(0x0f3a,0x0f3d), S(0x0f85), R(0x0fd0,0x0fd4), 583 R(0x0fd9,0x0fda), R(0x104a,0x104f), S(0x10fb), R(0x1360,0x1368), S(0x1400), S(0x166e), R(0x169b,0x169c), 584 R(0x16eb,0x16ed), R(0x1735,0x1736), R(0x17d4,0x17d6), R(0x17d8,0x17da), R(0x1800,0x180a), 585 R(0x1944,0x1945), R(0x1a1e,0x1a1f), R(0x1aa0,0x1aa6), R(0x1aa8,0x1aad), R(0x1b5a,0x1b60), 586 R(0x1bfc,0x1bff), R(0x1c3b,0x1c3f), R(0x1c7e,0x1c7f), R(0x1cc0,0x1cc7), S(0x1cd3), R(0x2010,0x2027), 587 R(0x2030,0x2043), R(0x2045,0x2051), R(0x2053,0x205e), R(0x207d,0x207e), R(0x208d,0x208e), 588 R(0x2308,0x230b), R(0x2329,0x232a), R(0x2768,0x2775), R(0x27c5,0x27c6), R(0x27e6,0x27ef), 589 R(0x2983,0x2998), R(0x29d8,0x29db), R(0x29fc,0x29fd), R(0x2cf9,0x2cfc), R(0x2cfe,0x2cff), S(0x2d70), 590 R(0x2e00,0x2e2e), R(0x2e30,0x2e4f), S(0x2e52), R(0x3001,0x3003), R(0x3008,0x3011), R(0x3014,0x301f), 591 S(0x3030), S(0x303d), S(0x30a0), S(0x30fb), R(0xa4fe,0xa4ff), R(0xa60d,0xa60f), S(0xa673), S(0xa67e), 592 R(0xa6f2,0xa6f7), R(0xa874,0xa877), R(0xa8ce,0xa8cf), R(0xa8f8,0xa8fa), S(0xa8fc), R(0xa92e,0xa92f), 593 S(0xa95f), R(0xa9c1,0xa9cd), R(0xa9de,0xa9df), R(0xaa5c,0xaa5f), R(0xaade,0xaadf), R(0xaaf0,0xaaf1), 594 S(0xabeb), R(0xfd3e,0xfd3f), R(0xfe10,0xfe19), R(0xfe30,0xfe52), 
R(0xfe54,0xfe61), S(0xfe63), S(0xfe68), 595 R(0xfe6a,0xfe6b), R(0xff01,0xff03), R(0xff05,0xff0a), R(0xff0c,0xff0f), R(0xff1a,0xff1b), 596 R(0xff1f,0xff20), R(0xff3b,0xff3d), S(0xff3f), S(0xff5b), S(0xff5d), R(0xff5f,0xff65), R(0x10100,0x10102), 597 S(0x1039f), S(0x103d0), S(0x1056f), S(0x10857), S(0x1091f), S(0x1093f), R(0x10a50,0x10a58), S(0x10a7f), 598 R(0x10af0,0x10af6), R(0x10b39,0x10b3f), R(0x10b99,0x10b9c), S(0x10ead), R(0x10f55,0x10f59), 599 R(0x11047,0x1104d), R(0x110bb,0x110bc), R(0x110be,0x110c1), R(0x11140,0x11143), R(0x11174,0x11175), 600 R(0x111c5,0x111c8), S(0x111cd), S(0x111db), R(0x111dd,0x111df), R(0x11238,0x1123d), S(0x112a9), 601 R(0x1144b,0x1144f), R(0x1145a,0x1145b), S(0x1145d), S(0x114c6), R(0x115c1,0x115d7), R(0x11641,0x11643), 602 R(0x11660,0x1166c), R(0x1173c,0x1173e), S(0x1183b), R(0x11944,0x11946), S(0x119e2), R(0x11a3f,0x11a46), 603 R(0x11a9a,0x11a9c), R(0x11a9e,0x11aa2), R(0x11c41,0x11c45), R(0x11c70,0x11c71), R(0x11ef7,0x11ef8), 604 S(0x11fff), R(0x12470,0x12474), R(0x16a6e,0x16a6f), S(0x16af5), R(0x16b37,0x16b3b), S(0x16b44), 605 R(0x16e97,0x16e9a), S(0x16fe2), S(0x1bc9f), R(0x1da87,0x1da8b), R(0x1e95e,0x1e95f) 606 }; 607 #undef R 608 #undef S 609 610 /* The ASCII ones are the most frequently used ones, also CommonMark 611 * specification requests few more in this range. */ 612 if(codepoint <= 0x7f) 613 return ISPUNCT_(codepoint); 614 615 return (md_unicode_bsearch__(codepoint, PUNCT_MAP, SIZEOF_ARRAY(PUNCT_MAP)) >= 0); 616 } 617 618 static void 619 md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info) 620 { 621 #define R(cp_min, cp_max) ((cp_min) | 0x40000000), ((cp_max) | 0x80000000) 622 #define S(cp) (cp) 623 /* Unicode "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" categories. 
624 * (generated by scripts/build_folding_map.py) */ 625 static const unsigned FOLD_MAP_1[] = { 626 R(0x0041,0x005a), S(0x00b5), R(0x00c0,0x00d6), R(0x00d8,0x00de), R(0x0100,0x012e), R(0x0132,0x0136), 627 R(0x0139,0x0147), R(0x014a,0x0176), S(0x0178), R(0x0179,0x017d), S(0x017f), S(0x0181), S(0x0182), 628 S(0x0184), S(0x0186), S(0x0187), S(0x0189), S(0x018a), S(0x018b), S(0x018e), S(0x018f), S(0x0190), 629 S(0x0191), S(0x0193), S(0x0194), S(0x0196), S(0x0197), S(0x0198), S(0x019c), S(0x019d), S(0x019f), 630 R(0x01a0,0x01a4), S(0x01a6), S(0x01a7), S(0x01a9), S(0x01ac), S(0x01ae), S(0x01af), S(0x01b1), S(0x01b2), 631 S(0x01b3), S(0x01b5), S(0x01b7), S(0x01b8), S(0x01bc), S(0x01c4), S(0x01c5), S(0x01c7), S(0x01c8), 632 S(0x01ca), R(0x01cb,0x01db), R(0x01de,0x01ee), S(0x01f1), S(0x01f2), S(0x01f4), S(0x01f6), S(0x01f7), 633 R(0x01f8,0x021e), S(0x0220), R(0x0222,0x0232), S(0x023a), S(0x023b), S(0x023d), S(0x023e), S(0x0241), 634 S(0x0243), S(0x0244), S(0x0245), R(0x0246,0x024e), S(0x0345), S(0x0370), S(0x0372), S(0x0376), S(0x037f), 635 S(0x0386), R(0x0388,0x038a), S(0x038c), S(0x038e), S(0x038f), R(0x0391,0x03a1), R(0x03a3,0x03ab), 636 S(0x03c2), S(0x03cf), S(0x03d0), S(0x03d1), S(0x03d5), S(0x03d6), R(0x03d8,0x03ee), S(0x03f0), S(0x03f1), 637 S(0x03f4), S(0x03f5), S(0x03f7), S(0x03f9), S(0x03fa), R(0x03fd,0x03ff), R(0x0400,0x040f), 638 R(0x0410,0x042f), R(0x0460,0x0480), R(0x048a,0x04be), S(0x04c0), R(0x04c1,0x04cd), R(0x04d0,0x052e), 639 R(0x0531,0x0556), R(0x10a0,0x10c5), S(0x10c7), S(0x10cd), R(0x13f8,0x13fd), S(0x1c80), S(0x1c81), 640 S(0x1c82), S(0x1c83), S(0x1c84), S(0x1c85), S(0x1c86), S(0x1c87), S(0x1c88), R(0x1c90,0x1cba), 641 R(0x1cbd,0x1cbf), R(0x1e00,0x1e94), S(0x1e9b), R(0x1ea0,0x1efe), R(0x1f08,0x1f0f), R(0x1f18,0x1f1d), 642 R(0x1f28,0x1f2f), R(0x1f38,0x1f3f), R(0x1f48,0x1f4d), S(0x1f59), S(0x1f5b), S(0x1f5d), S(0x1f5f), 643 R(0x1f68,0x1f6f), S(0x1fb8), S(0x1fb9), S(0x1fba), S(0x1fbb), S(0x1fbe), R(0x1fc8,0x1fcb), S(0x1fd8), 644 S(0x1fd9), S(0x1fda), 
S(0x1fdb), S(0x1fe8), S(0x1fe9), S(0x1fea), S(0x1feb), S(0x1fec), S(0x1ff8), 645 S(0x1ff9), S(0x1ffa), S(0x1ffb), S(0x2126), S(0x212a), S(0x212b), S(0x2132), R(0x2160,0x216f), S(0x2183), 646 R(0x24b6,0x24cf), R(0x2c00,0x2c2e), S(0x2c60), S(0x2c62), S(0x2c63), S(0x2c64), R(0x2c67,0x2c6b), 647 S(0x2c6d), S(0x2c6e), S(0x2c6f), S(0x2c70), S(0x2c72), S(0x2c75), S(0x2c7e), S(0x2c7f), R(0x2c80,0x2ce2), 648 S(0x2ceb), S(0x2ced), S(0x2cf2), R(0xa640,0xa66c), R(0xa680,0xa69a), R(0xa722,0xa72e), R(0xa732,0xa76e), 649 S(0xa779), S(0xa77b), S(0xa77d), R(0xa77e,0xa786), S(0xa78b), S(0xa78d), S(0xa790), S(0xa792), 650 R(0xa796,0xa7a8), S(0xa7aa), S(0xa7ab), S(0xa7ac), S(0xa7ad), S(0xa7ae), S(0xa7b0), S(0xa7b1), S(0xa7b2), 651 S(0xa7b3), R(0xa7b4,0xa7be), S(0xa7c2), S(0xa7c4), S(0xa7c5), S(0xa7c6), S(0xa7c7), S(0xa7c9), S(0xa7f5), 652 R(0xab70,0xabbf), R(0xff21,0xff3a), R(0x10400,0x10427), R(0x104b0,0x104d3), R(0x10c80,0x10cb2), 653 R(0x118a0,0x118bf), R(0x16e40,0x16e5f), R(0x1e900,0x1e921) 654 }; 655 static const unsigned FOLD_MAP_1_DATA[] = { 656 0x0061, 0x007a, 0x03bc, 0x00e0, 0x00f6, 0x00f8, 0x00fe, 0x0101, 0x012f, 0x0133, 0x0137, 0x013a, 0x0148, 657 0x014b, 0x0177, 0x00ff, 0x017a, 0x017e, 0x0073, 0x0253, 0x0183, 0x0185, 0x0254, 0x0188, 0x0256, 0x0257, 658 0x018c, 0x01dd, 0x0259, 0x025b, 0x0192, 0x0260, 0x0263, 0x0269, 0x0268, 0x0199, 0x026f, 0x0272, 0x0275, 659 0x01a1, 0x01a5, 0x0280, 0x01a8, 0x0283, 0x01ad, 0x0288, 0x01b0, 0x028a, 0x028b, 0x01b4, 0x01b6, 0x0292, 660 0x01b9, 0x01bd, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01cc, 0x01cc, 0x01dc, 0x01df, 0x01ef, 0x01f3, 0x01f3, 661 0x01f5, 0x0195, 0x01bf, 0x01f9, 0x021f, 0x019e, 0x0223, 0x0233, 0x2c65, 0x023c, 0x019a, 0x2c66, 0x0242, 662 0x0180, 0x0289, 0x028c, 0x0247, 0x024f, 0x03b9, 0x0371, 0x0373, 0x0377, 0x03f3, 0x03ac, 0x03ad, 0x03af, 663 0x03cc, 0x03cd, 0x03ce, 0x03b1, 0x03c1, 0x03c3, 0x03cb, 0x03c3, 0x03d7, 0x03b2, 0x03b8, 0x03c6, 0x03c0, 664 0x03d9, 0x03ef, 0x03ba, 0x03c1, 0x03b8, 0x03b5, 0x03f8, 0x03f2, 0x03fb, 0x037b, 
0x037d, 0x0450, 0x045f, 665 0x0430, 0x044f, 0x0461, 0x0481, 0x048b, 0x04bf, 0x04cf, 0x04c2, 0x04ce, 0x04d1, 0x052f, 0x0561, 0x0586, 666 0x2d00, 0x2d25, 0x2d27, 0x2d2d, 0x13f0, 0x13f5, 0x0432, 0x0434, 0x043e, 0x0441, 0x0442, 0x0442, 0x044a, 667 0x0463, 0xa64b, 0x10d0, 0x10fa, 0x10fd, 0x10ff, 0x1e01, 0x1e95, 0x1e61, 0x1ea1, 0x1eff, 0x1f00, 0x1f07, 668 0x1f10, 0x1f15, 0x1f20, 0x1f27, 0x1f30, 0x1f37, 0x1f40, 0x1f45, 0x1f51, 0x1f53, 0x1f55, 0x1f57, 0x1f60, 669 0x1f67, 0x1fb0, 0x1fb1, 0x1f70, 0x1f71, 0x03b9, 0x1f72, 0x1f75, 0x1fd0, 0x1fd1, 0x1f76, 0x1f77, 0x1fe0, 670 0x1fe1, 0x1f7a, 0x1f7b, 0x1fe5, 0x1f78, 0x1f79, 0x1f7c, 0x1f7d, 0x03c9, 0x006b, 0x00e5, 0x214e, 0x2170, 671 0x217f, 0x2184, 0x24d0, 0x24e9, 0x2c30, 0x2c5e, 0x2c61, 0x026b, 0x1d7d, 0x027d, 0x2c68, 0x2c6c, 0x0251, 672 0x0271, 0x0250, 0x0252, 0x2c73, 0x2c76, 0x023f, 0x0240, 0x2c81, 0x2ce3, 0x2cec, 0x2cee, 0x2cf3, 0xa641, 673 0xa66d, 0xa681, 0xa69b, 0xa723, 0xa72f, 0xa733, 0xa76f, 0xa77a, 0xa77c, 0x1d79, 0xa77f, 0xa787, 0xa78c, 674 0x0265, 0xa791, 0xa793, 0xa797, 0xa7a9, 0x0266, 0x025c, 0x0261, 0x026c, 0x026a, 0x029e, 0x0287, 0x029d, 675 0xab53, 0xa7b5, 0xa7bf, 0xa7c3, 0xa794, 0x0282, 0x1d8e, 0xa7c8, 0xa7ca, 0xa7f6, 0x13a0, 0x13ef, 0xff41, 676 0xff5a, 0x10428, 0x1044f, 0x104d8, 0x104fb, 0x10cc0, 0x10cf2, 0x118c0, 0x118df, 0x16e60, 0x16e7f, 0x1e922, 677 0x1e943 678 }; 679 static const unsigned FOLD_MAP_2[] = { 680 S(0x00df), S(0x0130), S(0x0149), S(0x01f0), S(0x0587), S(0x1e96), S(0x1e97), S(0x1e98), S(0x1e99), 681 S(0x1e9a), S(0x1e9e), S(0x1f50), R(0x1f80,0x1f87), R(0x1f88,0x1f8f), R(0x1f90,0x1f97), R(0x1f98,0x1f9f), 682 R(0x1fa0,0x1fa7), R(0x1fa8,0x1faf), S(0x1fb2), S(0x1fb3), S(0x1fb4), S(0x1fb6), S(0x1fbc), S(0x1fc2), 683 S(0x1fc3), S(0x1fc4), S(0x1fc6), S(0x1fcc), S(0x1fd6), S(0x1fe4), S(0x1fe6), S(0x1ff2), S(0x1ff3), 684 S(0x1ff4), S(0x1ff6), S(0x1ffc), S(0xfb00), S(0xfb01), S(0xfb02), S(0xfb05), S(0xfb06), S(0xfb13), 685 S(0xfb14), S(0xfb15), S(0xfb16), S(0xfb17) 686 }; 687 static const unsigned 
FOLD_MAP_2_DATA[] = { 688 0x0073,0x0073, 0x0069,0x0307, 0x02bc,0x006e, 0x006a,0x030c, 0x0565,0x0582, 0x0068,0x0331, 0x0074,0x0308, 689 0x0077,0x030a, 0x0079,0x030a, 0x0061,0x02be, 0x0073,0x0073, 0x03c5,0x0313, 0x1f00,0x03b9, 0x1f07,0x03b9, 690 0x1f00,0x03b9, 0x1f07,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f60,0x03b9, 691 0x1f67,0x03b9, 0x1f60,0x03b9, 0x1f67,0x03b9, 0x1f70,0x03b9, 0x03b1,0x03b9, 0x03ac,0x03b9, 0x03b1,0x0342, 692 0x03b1,0x03b9, 0x1f74,0x03b9, 0x03b7,0x03b9, 0x03ae,0x03b9, 0x03b7,0x0342, 0x03b7,0x03b9, 0x03b9,0x0342, 693 0x03c1,0x0313, 0x03c5,0x0342, 0x1f7c,0x03b9, 0x03c9,0x03b9, 0x03ce,0x03b9, 0x03c9,0x0342, 0x03c9,0x03b9, 694 0x0066,0x0066, 0x0066,0x0069, 0x0066,0x006c, 0x0073,0x0074, 0x0073,0x0074, 0x0574,0x0576, 0x0574,0x0565, 695 0x0574,0x056b, 0x057e,0x0576, 0x0574,0x056d 696 }; 697 static const unsigned FOLD_MAP_3[] = { 698 S(0x0390), S(0x03b0), S(0x1f52), S(0x1f54), S(0x1f56), S(0x1fb7), S(0x1fc7), S(0x1fd2), S(0x1fd3), 699 S(0x1fd7), S(0x1fe2), S(0x1fe3), S(0x1fe7), S(0x1ff7), S(0xfb03), S(0xfb04) 700 }; 701 static const unsigned FOLD_MAP_3_DATA[] = { 702 0x03b9,0x0308,0x0301, 0x03c5,0x0308,0x0301, 0x03c5,0x0313,0x0300, 0x03c5,0x0313,0x0301, 703 0x03c5,0x0313,0x0342, 0x03b1,0x0342,0x03b9, 0x03b7,0x0342,0x03b9, 0x03b9,0x0308,0x0300, 704 0x03b9,0x0308,0x0301, 0x03b9,0x0308,0x0342, 0x03c5,0x0308,0x0300, 0x03c5,0x0308,0x0301, 705 0x03c5,0x0308,0x0342, 0x03c9,0x0342,0x03b9, 0x0066,0x0066,0x0069, 0x0066,0x0066,0x006c 706 }; 707 #undef R 708 #undef S 709 static const struct { 710 const unsigned* map; 711 const unsigned* data; 712 size_t map_size; 713 unsigned n_codepoints; 714 } FOLD_MAP_LIST[] = { 715 { FOLD_MAP_1, FOLD_MAP_1_DATA, SIZEOF_ARRAY(FOLD_MAP_1), 1 }, 716 { FOLD_MAP_2, FOLD_MAP_2_DATA, SIZEOF_ARRAY(FOLD_MAP_2), 2 }, 717 { FOLD_MAP_3, FOLD_MAP_3_DATA, SIZEOF_ARRAY(FOLD_MAP_3), 3 } 718 }; 719 720 int i; 721 722 /* Fast path for ASCII characters. 
*/ 723 if(codepoint <= 0x7f) { 724 info->codepoints[0] = codepoint; 725 if(ISUPPER_(codepoint)) 726 info->codepoints[0] += 'a' - 'A'; 727 info->n_codepoints = 1; 728 return; 729 } 730 731 /* Try to locate the codepoint in any of the maps. */ 732 for(i = 0; i < (int) SIZEOF_ARRAY(FOLD_MAP_LIST); i++) { 733 int index; 734 735 index = md_unicode_bsearch__(codepoint, FOLD_MAP_LIST[i].map, FOLD_MAP_LIST[i].map_size); 736 if(index >= 0) { 737 /* Found the mapping. */ 738 unsigned n_codepoints = FOLD_MAP_LIST[i].n_codepoints; 739 const unsigned* map = FOLD_MAP_LIST[i].map; 740 const unsigned* codepoints = FOLD_MAP_LIST[i].data + (index * n_codepoints); 741 742 memcpy(info->codepoints, codepoints, sizeof(unsigned) * n_codepoints); 743 info->n_codepoints = n_codepoints; 744 745 if(FOLD_MAP_LIST[i].map[index] != codepoint) { 746 /* The found mapping maps whole range of codepoints, 747 * i.e. we have to offset info->codepoints[0] accordingly. */ 748 if((map[index] & 0x00ffffff)+1 == codepoints[0]) { 749 /* Alternating type of the range. */ 750 info->codepoints[0] = codepoint + ((codepoint & 0x1) == (map[index] & 0x1) ? 1 : 0); 751 } else { 752 /* Range to range kind of mapping. */ 753 info->codepoints[0] += (codepoint - (map[index] & 0x00ffffff)); 754 } 755 } 756 757 return; 758 } 759 } 760 761 /* No mapping found. Map the codepoint to itself. 
*/ 762 info->codepoints[0] = codepoint; 763 info->n_codepoints = 1; 764 } 765 #endif 766 767 768 #if defined MD4C_USE_UTF16 769 #define IS_UTF16_SURROGATE_HI(word) (((WORD)(word) & 0xfc00) == 0xd800) 770 #define IS_UTF16_SURROGATE_LO(word) (((WORD)(word) & 0xfc00) == 0xdc00) 771 #define UTF16_DECODE_SURROGATE(hi, lo) (0x10000 + ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0))) 772 773 static unsigned 774 md_decode_utf16le__(const CHAR* str, SZ str_size, SZ* p_size) 775 { 776 if(IS_UTF16_SURROGATE_HI(str[0])) { 777 if(1 < str_size && IS_UTF16_SURROGATE_LO(str[1])) { 778 if(p_size != NULL) 779 *p_size = 2; 780 return UTF16_DECODE_SURROGATE(str[0], str[1]); 781 } 782 } 783 784 if(p_size != NULL) 785 *p_size = 1; 786 return str[0]; 787 } 788 789 static unsigned 790 md_decode_utf16le_before__(MD_CTX* ctx, OFF off) 791 { 792 if(off > 2 && IS_UTF16_SURROGATE_HI(CH(off-2)) && IS_UTF16_SURROGATE_LO(CH(off-1))) 793 return UTF16_DECODE_SURROGATE(CH(off-2), CH(off-1)); 794 795 return CH(off); 796 } 797 798 /* No whitespace uses surrogates, so no decoding needed here. 
*/ 799 #define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint) 800 #define ISUNICODEWHITESPACE(off) md_is_unicode_whitespace__(CH(off)) 801 #define ISUNICODEWHITESPACEBEFORE(off) md_is_unicode_whitespace__(CH((off)-1)) 802 803 #define ISUNICODEPUNCT(off) md_is_unicode_punct__(md_decode_utf16le__(STR(off), ctx->size - (off), NULL)) 804 #define ISUNICODEPUNCTBEFORE(off) md_is_unicode_punct__(md_decode_utf16le_before__(ctx, off)) 805 806 static inline int 807 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size) 808 { 809 return md_decode_utf16le__(str+off, str_size-off, p_char_size); 810 } 811 #elif defined MD4C_USE_UTF8 812 #define IS_UTF8_LEAD1(byte) ((unsigned char)(byte) <= 0x7f) 813 #define IS_UTF8_LEAD2(byte) (((unsigned char)(byte) & 0xe0) == 0xc0) 814 #define IS_UTF8_LEAD3(byte) (((unsigned char)(byte) & 0xf0) == 0xe0) 815 #define IS_UTF8_LEAD4(byte) (((unsigned char)(byte) & 0xf8) == 0xf0) 816 #define IS_UTF8_TAIL(byte) (((unsigned char)(byte) & 0xc0) == 0x80) 817 818 static unsigned 819 md_decode_utf8__(const CHAR* str, SZ str_size, SZ* p_size) 820 { 821 if(!IS_UTF8_LEAD1(str[0])) { 822 if(IS_UTF8_LEAD2(str[0])) { 823 if(1 < str_size && IS_UTF8_TAIL(str[1])) { 824 if(p_size != NULL) 825 *p_size = 2; 826 827 return (((unsigned int)str[0] & 0x1f) << 6) | 828 (((unsigned int)str[1] & 0x3f) << 0); 829 } 830 } else if(IS_UTF8_LEAD3(str[0])) { 831 if(2 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2])) { 832 if(p_size != NULL) 833 *p_size = 3; 834 835 return (((unsigned int)str[0] & 0x0f) << 12) | 836 (((unsigned int)str[1] & 0x3f) << 6) | 837 (((unsigned int)str[2] & 0x3f) << 0); 838 } 839 } else if(IS_UTF8_LEAD4(str[0])) { 840 if(3 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2]) && IS_UTF8_TAIL(str[3])) { 841 if(p_size != NULL) 842 *p_size = 4; 843 844 return (((unsigned int)str[0] & 0x07) << 18) | 845 (((unsigned int)str[1] & 0x3f) << 12) | 846 (((unsigned int)str[2] & 0x3f) << 6) | 847 (((unsigned 
int)str[3] & 0x3f) << 0); 848 } 849 } 850 } 851 852 if(p_size != NULL) 853 *p_size = 1; 854 return (unsigned) str[0]; 855 } 856 857 static unsigned 858 md_decode_utf8_before__(MD_CTX* ctx, OFF off) 859 { 860 if(!IS_UTF8_LEAD1(CH(off-1))) { 861 if(off > 1 && IS_UTF8_LEAD2(CH(off-2)) && IS_UTF8_TAIL(CH(off-1))) 862 return (((unsigned int)CH(off-2) & 0x1f) << 6) | 863 (((unsigned int)CH(off-1) & 0x3f) << 0); 864 865 if(off > 2 && IS_UTF8_LEAD3(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1))) 866 return (((unsigned int)CH(off-3) & 0x0f) << 12) | 867 (((unsigned int)CH(off-2) & 0x3f) << 6) | 868 (((unsigned int)CH(off-1) & 0x3f) << 0); 869 870 if(off > 3 && IS_UTF8_LEAD4(CH(off-4)) && IS_UTF8_TAIL(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1))) 871 return (((unsigned int)CH(off-4) & 0x07) << 18) | 872 (((unsigned int)CH(off-3) & 0x3f) << 12) | 873 (((unsigned int)CH(off-2) & 0x3f) << 6) | 874 (((unsigned int)CH(off-1) & 0x3f) << 0); 875 } 876 877 return (unsigned) CH(off-1); 878 } 879 880 #define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint) 881 #define ISUNICODEWHITESPACE(off) md_is_unicode_whitespace__(md_decode_utf8__(STR(off), ctx->size - (off), NULL)) 882 #define ISUNICODEWHITESPACEBEFORE(off) md_is_unicode_whitespace__(md_decode_utf8_before__(ctx, off)) 883 884 #define ISUNICODEPUNCT(off) md_is_unicode_punct__(md_decode_utf8__(STR(off), ctx->size - (off), NULL)) 885 #define ISUNICODEPUNCTBEFORE(off) md_is_unicode_punct__(md_decode_utf8_before__(ctx, off)) 886 887 static inline unsigned 888 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size) 889 { 890 return md_decode_utf8__(str+off, str_size-off, p_char_size); 891 } 892 #else 893 #define ISUNICODEWHITESPACE_(codepoint) ISWHITESPACE_(codepoint) 894 #define ISUNICODEWHITESPACE(off) ISWHITESPACE(off) 895 #define ISUNICODEWHITESPACEBEFORE(off) ISWHITESPACE((off)-1) 896 897 #define ISUNICODEPUNCT(off) ISPUNCT(off) 898 #define 
ISUNICODEPUNCTBEFORE(off) ISPUNCT((off)-1) 899 900 static inline void 901 md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info) 902 { 903 info->codepoints[0] = codepoint; 904 if(ISUPPER_(codepoint)) 905 info->codepoints[0] += 'a' - 'A'; 906 info->n_codepoints = 1; 907 } 908 909 static inline unsigned 910 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_size) 911 { 912 *p_size = 1; 913 return (unsigned) str[off]; 914 } 915 #endif 916 917 918 /************************************* 919 *** Helper string manipulations *** 920 *************************************/ 921 922 /* Fill buffer with copy of the string between 'beg' and 'end' but replace any 923 * line breaks with given replacement character. 924 * 925 * NOTE: Caller is responsible to make sure the buffer is large enough. 926 * (Given the output is always shorter then input, (end - beg) is good idea 927 * what the caller should allocate.) 928 */ 929 static void 930 md_merge_lines(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines, 931 CHAR line_break_replacement_char, CHAR* buffer, SZ* p_size) 932 { 933 CHAR* ptr = buffer; 934 int line_index = 0; 935 OFF off = beg; 936 937 MD_UNUSED(n_lines); 938 939 while(1) { 940 const MD_LINE* line = &lines[line_index]; 941 OFF line_end = line->end; 942 if(end < line_end) 943 line_end = end; 944 945 while(off < line_end) { 946 *ptr = CH(off); 947 ptr++; 948 off++; 949 } 950 951 if(off >= end) { 952 *p_size = (MD_SIZE)(ptr - buffer); 953 return; 954 } 955 956 *ptr = line_break_replacement_char; 957 ptr++; 958 959 line_index++; 960 off = lines[line_index].beg; 961 } 962 } 963 964 /* Wrapper of md_merge_lines() which allocates new buffer for the output string. 
/* md_is_html_tag() may be called when processing inlines (inline raw HTML)
 * or when breaking document to blocks (checking for start of HTML block type 7).
 *
 * When breaking document to blocks, we do not yet know line boundaries, but
 * in that case the whole tag has to live on a single line. We distinguish this
 * by n_lines == 0.
 *
 * The caller has to guarantee that CH(beg) is '<'. The function recognizes
 * an opening tag (with optional attributes and optional final "/>") or
 * a closing tag ("</name ... >").
 *
 * Returns TRUE and stores the offset just after the final '>' to *p_end if
 * the tag is recognized and its '>' lies before max_end. Returns FALSE
 * otherwise (*p_end is then left untouched).
 */
static int
md_is_html_tag(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
{
    int attr_state;
    OFF off = beg;
    /* Without line info (n_lines == 0), only the end of document limits us. */
    OFF line_end = (n_lines > 0) ? lines[0].end : ctx->size;
    int i = 0;      /* Index of the line we are currently on. */

    MD_ASSERT(CH(beg) == _T('<'));

    /* There must be at least one more character after the '<'. */
    if(off + 1 >= line_end)
        return FALSE;
    off++;

    /* For parsing attributes, we need a little state automaton below.
     * State -1: no attributes are allowed.
     * State 0: attribute could follow after some whitespace.
     * State 1: after a whitespace (attribute name may follow).
     * State 2: after attribute name ('=' MAY follow).
     * State 3: after '=' (value specification MUST follow).
     * State 41: in middle of unquoted attribute value.
     * State 42: in middle of single-quoted attribute value.
     * State 43: in middle of double-quoted attribute value.
     */
    attr_state = 0;

    if(CH(off) == _T('/')) {
        /* Closer tag "</ ... >". No attributes may be present. */
        attr_state = -1;
        off++;
    }

    /* Tag name */
    if(off >= line_end  ||  !ISALPHA(off))
        return FALSE;
    off++;
    while(off < line_end  &&  (ISALNUM(off)  ||  CH(off) == _T('-')))
        off++;

    /* (Optional) attributes (if not closer), (optional) '/' (if not closer)
     * and final '>'. */
    while(1) {
        /* Consume the rest of the current line. */
        while(off < line_end  &&  !ISNEWLINE(off)) {
            if(attr_state > 40) {
                /* Inside of an attribute value: Look only for its end. */
                if(attr_state == 41  &&  (ISBLANK(off) || ISANYOF(off, _T("\"'=<>`")))) {
                    attr_state = 0;
                    off--;  /* Put the char back for re-inspection in the new state. */
                } else if(attr_state == 42  &&  CH(off) == _T('\'')) {
                    attr_state = 0;
                } else if(attr_state == 43  &&  CH(off) == _T('"')) {
                    attr_state = 0;
                }
                off++;
            } else if(ISWHITESPACE(off)) {
                /* Only state 0 is changed by a whitespace; the other states
                 * (including -1) just skip it. */
                if(attr_state == 0)
                    attr_state = 1;
                off++;
            } else if(attr_state <= 2  &&  CH(off) == _T('>')) {
                /* End. */
                goto done;
            } else if(attr_state <= 2  &&  CH(off) == _T('/')  &&  off+1 < line_end  &&  CH(off+1) == _T('>')) {
                /* End with digraph '/>' */
                /* NOTE(review): attr_state == -1 (closer tag) satisfies
                 * attr_state <= 2 too, so "</name/>" seems to be accepted
                 * here -- confirm whether that is intended. */
                off++;
                goto done;
            } else if((attr_state == 1 || attr_state == 2)  &&  (ISALPHA(off) || CH(off) == _T('_') || CH(off) == _T(':'))) {
                off++;
                /* Attribute name */
                while(off < line_end  &&  (ISALNUM(off) || ISANYOF(off, _T("_.:-"))))
                    off++;
                attr_state = 2;
            } else if(attr_state == 2  &&  CH(off) == _T('=')) {
                /* Attribute assignment sign */
                off++;
                attr_state = 3;
            } else if(attr_state == 3) {
                /* Expecting start of attribute value. */
                if(CH(off) == _T('"'))
                    attr_state = 43;
                else if(CH(off) == _T('\''))
                    attr_state = 42;
                else if(!ISANYOF(off, _T("\"'=<>`"))  &&  !ISNEWLINE(off))
                    attr_state = 41;
                else
                    return FALSE;
                off++;
            } else {
                /* Anything unexpected. */
                return FALSE;
            }
        }

        /* We have to be on a single line. See definition of start condition
         * of HTML block, type 7. */
        if(n_lines == 0)
            return FALSE;

        /* Move to the next line, if there is any. */
        i++;
        if(i >= n_lines)
            return FALSE;

        off = lines[i].beg;
        line_end = lines[i].end;

        /* The line break acts as a whitespace: It allows a new attribute
         * to follow and it also terminates any unquoted attribute value.
         * (Quoted values, states 42 and 43, continue on the new line.) */
        if(attr_state == 0  ||  attr_state == 41)
            attr_state = 1;

        if(off >= max_end)
            return FALSE;
    }

done:
    /* Here, off points to the final '>'. It has to lie before max_end. */
    if(off >= max_end)
        return FALSE;

    *p_end = off+1;
    return TRUE;
}
*/ 1159 *p_scan_horizon = off; 1160 return FALSE; 1161 } 1162 1163 off = lines[i].beg; 1164 } 1165 } 1166 1167 static int 1168 md_is_html_comment(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end) 1169 { 1170 OFF off = beg; 1171 1172 MD_ASSERT(CH(beg) == _T('<')); 1173 1174 if(off + 4 >= lines[0].end) 1175 return FALSE; 1176 if(CH(off+1) != _T('!') || CH(off+2) != _T('-') || CH(off+3) != _T('-')) 1177 return FALSE; 1178 off += 4; 1179 1180 /* ">" and "->" must not follow the opening. */ 1181 if(off < lines[0].end && CH(off) == _T('>')) 1182 return FALSE; 1183 if(off+1 < lines[0].end && CH(off) == _T('-') && CH(off+1) == _T('>')) 1184 return FALSE; 1185 1186 /* HTML comment must not contain "--", so we scan just for "--" instead 1187 * of "-->" and verify manually that '>' follows. */ 1188 if(md_scan_for_html_closer(ctx, _T("--"), 2, 1189 lines, n_lines, off, max_end, p_end, &ctx->html_comment_horizon)) 1190 { 1191 if(*p_end < max_end && CH(*p_end) == _T('>')) { 1192 *p_end = *p_end + 1; 1193 return TRUE; 1194 } 1195 } 1196 1197 return FALSE; 1198 } 1199 1200 static int 1201 md_is_html_processing_instruction(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end) 1202 { 1203 OFF off = beg; 1204 1205 if(off + 2 >= lines[0].end) 1206 return FALSE; 1207 if(CH(off+1) != _T('?')) 1208 return FALSE; 1209 off += 2; 1210 1211 return md_scan_for_html_closer(ctx, _T("?>"), 2, 1212 lines, n_lines, off, max_end, p_end, &ctx->html_proc_instr_horizon); 1213 } 1214 1215 static int 1216 md_is_html_declaration(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end) 1217 { 1218 OFF off = beg; 1219 1220 if(off + 2 >= lines[0].end) 1221 return FALSE; 1222 if(CH(off+1) != _T('!')) 1223 return FALSE; 1224 off += 2; 1225 1226 /* Declaration name. 
*/ 1227 if(off >= lines[0].end || !ISALPHA(off)) 1228 return FALSE; 1229 off++; 1230 while(off < lines[0].end && ISALPHA(off)) 1231 off++; 1232 if(off < lines[0].end && !ISWHITESPACE(off)) 1233 return FALSE; 1234 1235 return md_scan_for_html_closer(ctx, _T(">"), 1, 1236 lines, n_lines, off, max_end, p_end, &ctx->html_decl_horizon); 1237 } 1238 1239 static int 1240 md_is_html_cdata(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end) 1241 { 1242 static const CHAR open_str[] = _T("<![CDATA["); 1243 static const SZ open_size = SIZEOF_ARRAY(open_str) - 1; 1244 1245 OFF off = beg; 1246 1247 if(off + open_size >= lines[0].end) 1248 return FALSE; 1249 if(memcmp(STR(off), open_str, open_size) != 0) 1250 return FALSE; 1251 off += open_size; 1252 1253 if(lines[n_lines-1].end < max_end) 1254 max_end = lines[n_lines-1].end - 2; 1255 1256 return md_scan_for_html_closer(ctx, _T("]]>"), 3, 1257 lines, n_lines, off, max_end, p_end, &ctx->html_cdata_horizon); 1258 } 1259 1260 static int 1261 md_is_html_any(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end) 1262 { 1263 MD_ASSERT(CH(beg) == _T('<')); 1264 return (md_is_html_tag(ctx, lines, n_lines, beg, max_end, p_end) || 1265 md_is_html_comment(ctx, lines, n_lines, beg, max_end, p_end) || 1266 md_is_html_processing_instruction(ctx, lines, n_lines, beg, max_end, p_end) || 1267 md_is_html_declaration(ctx, lines, n_lines, beg, max_end, p_end) || 1268 md_is_html_cdata(ctx, lines, n_lines, beg, max_end, p_end)); 1269 } 1270 1271 1272 /**************************** 1273 *** Recognizing Entity *** 1274 ****************************/ 1275 1276 static int 1277 md_is_hex_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end) 1278 { 1279 OFF off = beg; 1280 MD_UNUSED(ctx); 1281 1282 while(off < max_end && ISXDIGIT_(text[off]) && off - beg <= 8) 1283 off++; 1284 1285 if(1 <= off - beg && off - beg <= 6) { 1286 *p_end = off; 1287 return TRUE; 1288 } else { 1289 
return FALSE; 1290 } 1291 } 1292 1293 static int 1294 md_is_dec_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end) 1295 { 1296 OFF off = beg; 1297 MD_UNUSED(ctx); 1298 1299 while(off < max_end && ISDIGIT_(text[off]) && off - beg <= 8) 1300 off++; 1301 1302 if(1 <= off - beg && off - beg <= 7) { 1303 *p_end = off; 1304 return TRUE; 1305 } else { 1306 return FALSE; 1307 } 1308 } 1309 1310 static int 1311 md_is_named_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end) 1312 { 1313 OFF off = beg; 1314 MD_UNUSED(ctx); 1315 1316 if(off < max_end && ISALPHA_(text[off])) 1317 off++; 1318 else 1319 return FALSE; 1320 1321 while(off < max_end && ISALNUM_(text[off]) && off - beg <= 48) 1322 off++; 1323 1324 if(2 <= off - beg && off - beg <= 48) { 1325 *p_end = off; 1326 return TRUE; 1327 } else { 1328 return FALSE; 1329 } 1330 } 1331 1332 static int 1333 md_is_entity_str(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end) 1334 { 1335 int is_contents; 1336 OFF off = beg; 1337 1338 MD_ASSERT(text[off] == _T('&')); 1339 off++; 1340 1341 if(off+2 < max_end && text[off] == _T('#') && (text[off+1] == _T('x') || text[off+1] == _T('X'))) 1342 is_contents = md_is_hex_entity_contents(ctx, text, off+2, max_end, &off); 1343 else if(off+1 < max_end && text[off] == _T('#')) 1344 is_contents = md_is_dec_entity_contents(ctx, text, off+1, max_end, &off); 1345 else 1346 is_contents = md_is_named_entity_contents(ctx, text, off, max_end, &off); 1347 1348 if(is_contents && off < max_end && text[off] == _T(';')) { 1349 *p_end = off+1; 1350 return TRUE; 1351 } else { 1352 return FALSE; 1353 } 1354 } 1355 1356 static inline int 1357 md_is_entity(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end) 1358 { 1359 return md_is_entity_str(ctx, ctx->text, beg, max_end, p_end); 1360 } 1361 1362 1363 /****************************** 1364 *** Attribute Management *** 1365 ******************************/ 1366 1367 typedef struct 
MD_ATTRIBUTE_BUILD_tag MD_ATTRIBUTE_BUILD; 1368 struct MD_ATTRIBUTE_BUILD_tag { 1369 CHAR* text; 1370 MD_TEXTTYPE* substr_types; 1371 OFF* substr_offsets; 1372 int substr_count; 1373 int substr_alloc; 1374 MD_TEXTTYPE trivial_types[1]; 1375 OFF trivial_offsets[2]; 1376 }; 1377 1378 1379 #define MD_BUILD_ATTR_NO_ESCAPES 0x0001 1380 1381 static int 1382 md_build_attr_append_substr(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build, 1383 MD_TEXTTYPE type, OFF off) 1384 { 1385 if(build->substr_count >= build->substr_alloc) { 1386 MD_TEXTTYPE* new_substr_types; 1387 OFF* new_substr_offsets; 1388 1389 build->substr_alloc = (build->substr_alloc > 0 1390 ? build->substr_alloc + build->substr_alloc / 2 1391 : 8); 1392 new_substr_types = (MD_TEXTTYPE*) realloc(build->substr_types, 1393 build->substr_alloc * sizeof(MD_TEXTTYPE)); 1394 if(new_substr_types == NULL) { 1395 MD_LOG("realloc() failed."); 1396 return -1; 1397 } 1398 /* Note +1 to reserve space for final offset (== raw_size). */ 1399 new_substr_offsets = (OFF*) realloc(build->substr_offsets, 1400 (build->substr_alloc+1) * sizeof(OFF)); 1401 if(new_substr_offsets == NULL) { 1402 MD_LOG("realloc() failed."); 1403 free(new_substr_types); 1404 return -1; 1405 } 1406 1407 build->substr_types = new_substr_types; 1408 build->substr_offsets = new_substr_offsets; 1409 } 1410 1411 build->substr_types[build->substr_count] = type; 1412 build->substr_offsets[build->substr_count] = off; 1413 build->substr_count++; 1414 return 0; 1415 } 1416 1417 static void 1418 md_free_attribute(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build) 1419 { 1420 MD_UNUSED(ctx); 1421 1422 if(build->substr_alloc > 0) { 1423 free(build->text); 1424 free(build->substr_types); 1425 free(build->substr_offsets); 1426 } 1427 } 1428 1429 static int 1430 md_build_attribute(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size, 1431 unsigned flags, MD_ATTRIBUTE* attr, MD_ATTRIBUTE_BUILD* build) 1432 { 1433 OFF raw_off, off; 1434 int is_trivial; 1435 int ret = 0; 1436 1437 memset(build, 0, 
sizeof(MD_ATTRIBUTE_BUILD)); 1438 1439 /* If there is no backslash and no ampersand, build trivial attribute 1440 * without any malloc(). */ 1441 is_trivial = TRUE; 1442 for(raw_off = 0; raw_off < raw_size; raw_off++) { 1443 if(ISANYOF3_(raw_text[raw_off], _T('\\'), _T('&'), _T('\0'))) { 1444 is_trivial = FALSE; 1445 break; 1446 } 1447 } 1448 1449 if(is_trivial) { 1450 build->text = (CHAR*) (raw_size ? raw_text : NULL); 1451 build->substr_types = build->trivial_types; 1452 build->substr_offsets = build->trivial_offsets; 1453 build->substr_count = 1; 1454 build->substr_alloc = 0; 1455 build->trivial_types[0] = MD_TEXT_NORMAL; 1456 build->trivial_offsets[0] = 0; 1457 build->trivial_offsets[1] = raw_size; 1458 off = raw_size; 1459 } else { 1460 build->text = (CHAR*) malloc(raw_size * sizeof(CHAR)); 1461 if(build->text == NULL) { 1462 MD_LOG("malloc() failed."); 1463 goto abort; 1464 } 1465 1466 raw_off = 0; 1467 off = 0; 1468 1469 while(raw_off < raw_size) { 1470 if(raw_text[raw_off] == _T('\0')) { 1471 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NULLCHAR, off)); 1472 memcpy(build->text + off, raw_text + raw_off, 1); 1473 off++; 1474 raw_off++; 1475 continue; 1476 } 1477 1478 if(raw_text[raw_off] == _T('&')) { 1479 OFF ent_end; 1480 1481 if(md_is_entity_str(ctx, raw_text, raw_off, raw_size, &ent_end)) { 1482 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_ENTITY, off)); 1483 memcpy(build->text + off, raw_text + raw_off, ent_end - raw_off); 1484 off += ent_end - raw_off; 1485 raw_off = ent_end; 1486 continue; 1487 } 1488 } 1489 1490 if(build->substr_count == 0 || build->substr_types[build->substr_count-1] != MD_TEXT_NORMAL) 1491 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NORMAL, off)); 1492 1493 if(!(flags & MD_BUILD_ATTR_NO_ESCAPES) && 1494 raw_text[raw_off] == _T('\\') && raw_off+1 < raw_size && 1495 (ISPUNCT_(raw_text[raw_off+1]) || ISNEWLINE_(raw_text[raw_off+1]))) 1496 raw_off++; 1497 1498 build->text[off++] = 
raw_text[raw_off++]; 1499 } 1500 build->substr_offsets[build->substr_count] = off; 1501 } 1502 1503 attr->text = build->text; 1504 attr->size = off; 1505 attr->substr_offsets = build->substr_offsets; 1506 attr->substr_types = build->substr_types; 1507 return 0; 1508 1509 abort: 1510 md_free_attribute(ctx, build); 1511 return -1; 1512 } 1513 1514 1515 /********************************************* 1516 *** Dictionary of Reference Definitions *** 1517 *********************************************/ 1518 1519 #define MD_FNV1A_BASE 2166136261U 1520 #define MD_FNV1A_PRIME 16777619U 1521 1522 static inline unsigned 1523 md_fnv1a(unsigned base, const void* data, size_t n) 1524 { 1525 const unsigned char* buf = (const unsigned char*) data; 1526 unsigned hash = base; 1527 size_t i; 1528 1529 for(i = 0; i < n; i++) { 1530 hash ^= buf[i]; 1531 hash *= MD_FNV1A_PRIME; 1532 } 1533 1534 return hash; 1535 } 1536 1537 1538 struct MD_REF_DEF_tag { 1539 CHAR* label; 1540 CHAR* title; 1541 unsigned hash; 1542 SZ label_size; 1543 SZ title_size; 1544 OFF dest_beg; 1545 OFF dest_end; 1546 unsigned char label_needs_free : 1; 1547 unsigned char title_needs_free : 1; 1548 }; 1549 1550 /* Label equivalence is quite complicated with regards to whitespace and case 1551 * folding. This complicates computing a hash of it as well as direct comparison 1552 * of two labels. 
*/ 1553 1554 static unsigned 1555 md_link_label_hash(const CHAR* label, SZ size) 1556 { 1557 unsigned hash = MD_FNV1A_BASE; 1558 OFF off; 1559 unsigned codepoint; 1560 int is_whitespace = FALSE; 1561 1562 off = md_skip_unicode_whitespace(label, 0, size); 1563 while(off < size) { 1564 SZ char_size; 1565 1566 codepoint = md_decode_unicode(label, off, size, &char_size); 1567 is_whitespace = ISUNICODEWHITESPACE_(codepoint) || ISNEWLINE_(label[off]); 1568 1569 if(is_whitespace) { 1570 codepoint = ' '; 1571 hash = md_fnv1a(hash, &codepoint, sizeof(unsigned)); 1572 off = md_skip_unicode_whitespace(label, off, size); 1573 } else { 1574 MD_UNICODE_FOLD_INFO fold_info; 1575 1576 md_get_unicode_fold_info(codepoint, &fold_info); 1577 hash = md_fnv1a(hash, fold_info.codepoints, fold_info.n_codepoints * sizeof(unsigned)); 1578 off += char_size; 1579 } 1580 } 1581 1582 return hash; 1583 } 1584 1585 static OFF 1586 md_link_label_cmp_load_fold_info(const CHAR* label, OFF off, SZ size, 1587 MD_UNICODE_FOLD_INFO* fold_info) 1588 { 1589 unsigned codepoint; 1590 SZ char_size; 1591 1592 if(off >= size) { 1593 /* Treat end of a link label as a whitespace. */ 1594 goto whitespace; 1595 } 1596 1597 codepoint = md_decode_unicode(label, off, size, &char_size); 1598 off += char_size; 1599 if(ISUNICODEWHITESPACE_(codepoint)) { 1600 /* Treat all whitespace as equivalent */ 1601 goto whitespace; 1602 } 1603 1604 /* Get real folding info. 
*/ 1605 md_get_unicode_fold_info(codepoint, fold_info); 1606 return off; 1607 1608 whitespace: 1609 fold_info->codepoints[0] = _T(' '); 1610 fold_info->n_codepoints = 1; 1611 return md_skip_unicode_whitespace(label, off, size); 1612 } 1613 1614 static int 1615 md_link_label_cmp(const CHAR* a_label, SZ a_size, const CHAR* b_label, SZ b_size) 1616 { 1617 OFF a_off; 1618 OFF b_off; 1619 MD_UNICODE_FOLD_INFO a_fi = { { 0 }, 0 }; 1620 MD_UNICODE_FOLD_INFO b_fi = { { 0 }, 0 }; 1621 OFF a_fi_off = 0; 1622 OFF b_fi_off = 0; 1623 int cmp; 1624 1625 a_off = md_skip_unicode_whitespace(a_label, 0, a_size); 1626 b_off = md_skip_unicode_whitespace(b_label, 0, b_size); 1627 while(a_off < a_size || a_fi_off < a_fi.n_codepoints || 1628 b_off < b_size || b_fi_off < b_fi.n_codepoints) 1629 { 1630 /* If needed, load fold info for next char. */ 1631 if(a_fi_off >= a_fi.n_codepoints) { 1632 a_fi_off = 0; 1633 a_off = md_link_label_cmp_load_fold_info(a_label, a_off, a_size, &a_fi); 1634 } 1635 if(b_fi_off >= b_fi.n_codepoints) { 1636 b_fi_off = 0; 1637 b_off = md_link_label_cmp_load_fold_info(b_label, b_off, b_size, &b_fi); 1638 } 1639 1640 cmp = b_fi.codepoints[b_fi_off] - a_fi.codepoints[a_fi_off]; 1641 if(cmp != 0) 1642 return cmp; 1643 1644 a_fi_off++; 1645 b_fi_off++; 1646 } 1647 1648 return 0; 1649 } 1650 1651 typedef struct MD_REF_DEF_LIST_tag MD_REF_DEF_LIST; 1652 struct MD_REF_DEF_LIST_tag { 1653 int n_ref_defs; 1654 int alloc_ref_defs; 1655 MD_REF_DEF* ref_defs[]; /* Valid items always point into ctx->ref_defs[] */ 1656 }; 1657 1658 static int 1659 md_ref_def_cmp(const void* a, const void* b) 1660 { 1661 const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a; 1662 const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b; 1663 1664 if(a_ref->hash < b_ref->hash) 1665 return -1; 1666 else if(a_ref->hash > b_ref->hash) 1667 return +1; 1668 else 1669 return md_link_label_cmp(a_ref->label, a_ref->label_size, b_ref->label, b_ref->label_size); 1670 } 1671 1672 static int 1673 
md_ref_def_cmp_for_sort(const void* a, const void* b) 1674 { 1675 int cmp; 1676 1677 cmp = md_ref_def_cmp(a, b); 1678 1679 /* Ensure stability of the sorting. */ 1680 if(cmp == 0) { 1681 const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a; 1682 const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b; 1683 1684 if(a_ref < b_ref) 1685 cmp = -1; 1686 else if(a_ref > b_ref) 1687 cmp = +1; 1688 else 1689 cmp = 0; 1690 } 1691 1692 return cmp; 1693 } 1694 1695 static int 1696 md_build_ref_def_hashtable(MD_CTX* ctx) 1697 { 1698 int i, j; 1699 1700 if(ctx->n_ref_defs == 0) 1701 return 0; 1702 1703 ctx->ref_def_hashtable_size = (ctx->n_ref_defs * 5) / 4; 1704 ctx->ref_def_hashtable = malloc(ctx->ref_def_hashtable_size * sizeof(void*)); 1705 if(ctx->ref_def_hashtable == NULL) { 1706 MD_LOG("malloc() failed."); 1707 goto abort; 1708 } 1709 memset(ctx->ref_def_hashtable, 0, ctx->ref_def_hashtable_size * sizeof(void*)); 1710 1711 /* Each member of ctx->ref_def_hashtable[] can be: 1712 * -- NULL, 1713 * -- pointer to the MD_REF_DEF in ctx->ref_defs[], or 1714 * -- pointer to a MD_REF_DEF_LIST, which holds multiple pointers to 1715 * such MD_REF_DEFs. 1716 */ 1717 for(i = 0; i < ctx->n_ref_defs; i++) { 1718 MD_REF_DEF* def = &ctx->ref_defs[i]; 1719 void* bucket; 1720 MD_REF_DEF_LIST* list; 1721 1722 def->hash = md_link_label_hash(def->label, def->label_size); 1723 bucket = ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size]; 1724 1725 if(bucket == NULL) { 1726 /* The bucket is empty. Make it just point to the def. */ 1727 ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = def; 1728 continue; 1729 } 1730 1731 if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) { 1732 /* The bucket already contains one ref. def. Lets see whether it 1733 * is the same label (ref. def. duplicate) or different one 1734 * (hash conflict). 
*/ 1735 MD_REF_DEF* old_def = (MD_REF_DEF*) bucket; 1736 1737 if(md_link_label_cmp(def->label, def->label_size, old_def->label, old_def->label_size) == 0) { 1738 /* Duplicate label: Ignore this ref. def. */ 1739 continue; 1740 } 1741 1742 /* Make the bucket complex, i.e. able to hold more ref. defs. */ 1743 list = (MD_REF_DEF_LIST*) malloc(sizeof(MD_REF_DEF_LIST) + 2 * sizeof(MD_REF_DEF*)); 1744 if(list == NULL) { 1745 MD_LOG("malloc() failed."); 1746 goto abort; 1747 } 1748 list->ref_defs[0] = old_def; 1749 list->ref_defs[1] = def; 1750 list->n_ref_defs = 2; 1751 list->alloc_ref_defs = 2; 1752 ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list; 1753 continue; 1754 } 1755 1756 /* Append the def to the complex bucket list. 1757 * 1758 * Note in this case we ignore potential duplicates to avoid expensive 1759 * iterating over the complex bucket. Below, we revisit all the complex 1760 * buckets and handle it more cheaply after the complex bucket contents 1761 * is sorted. */ 1762 list = (MD_REF_DEF_LIST*) bucket; 1763 if(list->n_ref_defs >= list->alloc_ref_defs) { 1764 int alloc_ref_defs = list->alloc_ref_defs + list->alloc_ref_defs / 2; 1765 MD_REF_DEF_LIST* list_tmp = (MD_REF_DEF_LIST*) realloc(list, 1766 sizeof(MD_REF_DEF_LIST) + alloc_ref_defs * sizeof(MD_REF_DEF*)); 1767 if(list_tmp == NULL) { 1768 MD_LOG("realloc() failed."); 1769 goto abort; 1770 } 1771 list = list_tmp; 1772 list->alloc_ref_defs = alloc_ref_defs; 1773 ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list; 1774 } 1775 1776 list->ref_defs[list->n_ref_defs] = def; 1777 list->n_ref_defs++; 1778 } 1779 1780 /* Sort the complex buckets so we can use bsearch() with them. 
*/ 1781 for(i = 0; i < ctx->ref_def_hashtable_size; i++) { 1782 void* bucket = ctx->ref_def_hashtable[i]; 1783 MD_REF_DEF_LIST* list; 1784 1785 if(bucket == NULL) 1786 continue; 1787 if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) 1788 continue; 1789 1790 list = (MD_REF_DEF_LIST*) bucket; 1791 qsort(list->ref_defs, list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp_for_sort); 1792 1793 /* Disable all duplicates in the complex bucket by forcing all such 1794 * records to point to the 1st such ref. def. I.e. no matter which 1795 * record is found during the lookup, it will always point to the right 1796 * ref. def. in ctx->ref_defs[]. */ 1797 for(j = 1; j < list->n_ref_defs; j++) { 1798 if(md_ref_def_cmp(&list->ref_defs[j-1], &list->ref_defs[j]) == 0) 1799 list->ref_defs[j] = list->ref_defs[j-1]; 1800 } 1801 } 1802 1803 return 0; 1804 1805 abort: 1806 return -1; 1807 } 1808 1809 static void 1810 md_free_ref_def_hashtable(MD_CTX* ctx) 1811 { 1812 if(ctx->ref_def_hashtable != NULL) { 1813 int i; 1814 1815 for(i = 0; i < ctx->ref_def_hashtable_size; i++) { 1816 void* bucket = ctx->ref_def_hashtable[i]; 1817 if(bucket == NULL) 1818 continue; 1819 if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) 1820 continue; 1821 free(bucket); 1822 } 1823 1824 free(ctx->ref_def_hashtable); 1825 } 1826 } 1827 1828 static const MD_REF_DEF* 1829 md_lookup_ref_def(MD_CTX* ctx, const CHAR* label, SZ label_size) 1830 { 1831 unsigned hash; 1832 void* bucket; 1833 1834 if(ctx->ref_def_hashtable_size == 0) 1835 return NULL; 1836 1837 hash = md_link_label_hash(label, label_size); 1838 bucket = ctx->ref_def_hashtable[hash % ctx->ref_def_hashtable_size]; 1839 1840 if(bucket == NULL) { 1841 return NULL; 1842 } else if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) { 1843 const MD_REF_DEF* def = (MD_REF_DEF*) bucket; 1844 1845 
if(md_link_label_cmp(def->label, def->label_size, label, label_size) == 0) 1846 return def; 1847 else 1848 return NULL; 1849 } else { 1850 MD_REF_DEF_LIST* list = (MD_REF_DEF_LIST*) bucket; 1851 MD_REF_DEF key_buf; 1852 const MD_REF_DEF* key = &key_buf; 1853 const MD_REF_DEF** ret; 1854 1855 key_buf.label = (CHAR*) label; 1856 key_buf.label_size = label_size; 1857 key_buf.hash = md_link_label_hash(key_buf.label, key_buf.label_size); 1858 1859 ret = (const MD_REF_DEF**) bsearch(&key, list->ref_defs, 1860 list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp); 1861 if(ret != NULL) 1862 return *ret; 1863 else 1864 return NULL; 1865 } 1866 } 1867 1868 1869 /*************************** 1870 *** Recognizing Links *** 1871 ***************************/ 1872 1873 /* Note this code is partially shared between processing inlines and blocks 1874 * as reference definitions and links share some helper parser functions. 1875 */ 1876 1877 typedef struct MD_LINK_ATTR_tag MD_LINK_ATTR; 1878 struct MD_LINK_ATTR_tag { 1879 OFF dest_beg; 1880 OFF dest_end; 1881 1882 CHAR* title; 1883 SZ title_size; 1884 int title_needs_free; 1885 }; 1886 1887 1888 static int 1889 md_is_link_label(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, 1890 OFF* p_end, int* p_beg_line_index, int* p_end_line_index, 1891 OFF* p_contents_beg, OFF* p_contents_end) 1892 { 1893 OFF off = beg; 1894 OFF contents_beg = 0; 1895 OFF contents_end = 0; 1896 int line_index = 0; 1897 int len = 0; 1898 1899 if(CH(off) != _T('[')) 1900 return FALSE; 1901 off++; 1902 1903 while(1) { 1904 OFF line_end = lines[line_index].end; 1905 1906 while(off < line_end) { 1907 if(CH(off) == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) { 1908 if(contents_end == 0) { 1909 contents_beg = off; 1910 *p_beg_line_index = line_index; 1911 } 1912 contents_end = off + 2; 1913 off += 2; 1914 } else if(CH(off) == _T('[')) { 1915 return FALSE; 1916 } else if(CH(off) == _T(']')) { 1917 if(contents_beg < contents_end) { 
1918 /* Success. */ 1919 *p_contents_beg = contents_beg; 1920 *p_contents_end = contents_end; 1921 *p_end = off+1; 1922 *p_end_line_index = line_index; 1923 return TRUE; 1924 } else { 1925 /* Link label must have some non-whitespace contents. */ 1926 return FALSE; 1927 } 1928 } else { 1929 unsigned codepoint; 1930 SZ char_size; 1931 1932 codepoint = md_decode_unicode(ctx->text, off, ctx->size, &char_size); 1933 if(!ISUNICODEWHITESPACE_(codepoint)) { 1934 if(contents_end == 0) { 1935 contents_beg = off; 1936 *p_beg_line_index = line_index; 1937 } 1938 contents_end = off + char_size; 1939 } 1940 1941 off += char_size; 1942 } 1943 1944 len++; 1945 if(len > 999) 1946 return FALSE; 1947 } 1948 1949 line_index++; 1950 len++; 1951 if(line_index < n_lines) 1952 off = lines[line_index].beg; 1953 else 1954 break; 1955 } 1956 1957 return FALSE; 1958 } 1959 1960 static int 1961 md_is_link_destination_A(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, 1962 OFF* p_contents_beg, OFF* p_contents_end) 1963 { 1964 OFF off = beg; 1965 1966 if(off >= max_end || CH(off) != _T('<')) 1967 return FALSE; 1968 off++; 1969 1970 while(off < max_end) { 1971 if(CH(off) == _T('\\') && off+1 < max_end && ISPUNCT(off+1)) { 1972 off += 2; 1973 continue; 1974 } 1975 1976 if(ISNEWLINE(off) || CH(off) == _T('<')) 1977 return FALSE; 1978 1979 if(CH(off) == _T('>')) { 1980 /* Success. 
*/ 1981 *p_contents_beg = beg+1; 1982 *p_contents_end = off; 1983 *p_end = off+1; 1984 return TRUE; 1985 } 1986 1987 off++; 1988 } 1989 1990 return FALSE; 1991 } 1992 1993 static int 1994 md_is_link_destination_B(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, 1995 OFF* p_contents_beg, OFF* p_contents_end) 1996 { 1997 OFF off = beg; 1998 int parenthesis_level = 0; 1999 2000 while(off < max_end) { 2001 if(CH(off) == _T('\\') && off+1 < max_end && ISPUNCT(off+1)) { 2002 off += 2; 2003 continue; 2004 } 2005 2006 if(ISWHITESPACE(off) || ISCNTRL(off)) 2007 break; 2008 2009 /* Link destination may include balanced pairs of unescaped '(' ')'. 2010 * Note we limit the maximal nesting level by 32 to protect us from 2011 * https://github.com/jgm/cmark/issues/214 */ 2012 if(CH(off) == _T('(')) { 2013 parenthesis_level++; 2014 if(parenthesis_level > 32) 2015 return FALSE; 2016 } else if(CH(off) == _T(')')) { 2017 if(parenthesis_level == 0) 2018 break; 2019 parenthesis_level--; 2020 } 2021 2022 off++; 2023 } 2024 2025 if(parenthesis_level != 0 || off == beg) 2026 return FALSE; 2027 2028 /* Success. */ 2029 *p_contents_beg = beg; 2030 *p_contents_end = off; 2031 *p_end = off; 2032 return TRUE; 2033 } 2034 2035 static inline int 2036 md_is_link_destination(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, 2037 OFF* p_contents_beg, OFF* p_contents_end) 2038 { 2039 if(CH(beg) == _T('<')) 2040 return md_is_link_destination_A(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end); 2041 else 2042 return md_is_link_destination_B(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end); 2043 } 2044 2045 static int 2046 md_is_link_title(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, 2047 OFF* p_end, int* p_beg_line_index, int* p_end_line_index, 2048 OFF* p_contents_beg, OFF* p_contents_end) 2049 { 2050 OFF off = beg; 2051 CHAR closer_char; 2052 int line_index = 0; 2053 2054 /* White space with up to one line break. 
*/ 2055 while(off < lines[line_index].end && ISWHITESPACE(off)) 2056 off++; 2057 if(off >= lines[line_index].end) { 2058 line_index++; 2059 if(line_index >= n_lines) 2060 return FALSE; 2061 off = lines[line_index].beg; 2062 } 2063 if(off == beg) 2064 return FALSE; 2065 2066 *p_beg_line_index = line_index; 2067 2068 /* First char determines how to detect end of it. */ 2069 switch(CH(off)) { 2070 case _T('"'): closer_char = _T('"'); break; 2071 case _T('\''): closer_char = _T('\''); break; 2072 case _T('('): closer_char = _T(')'); break; 2073 default: return FALSE; 2074 } 2075 off++; 2076 2077 *p_contents_beg = off; 2078 2079 while(line_index < n_lines) { 2080 OFF line_end = lines[line_index].end; 2081 2082 while(off < line_end) { 2083 if(CH(off) == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) { 2084 off++; 2085 } else if(CH(off) == closer_char) { 2086 /* Success. */ 2087 *p_contents_end = off; 2088 *p_end = off+1; 2089 *p_end_line_index = line_index; 2090 return TRUE; 2091 } else if(closer_char == _T(')') && CH(off) == _T('(')) { 2092 /* ()-style title cannot contain (unescaped '(')) */ 2093 return FALSE; 2094 } 2095 2096 off++; 2097 } 2098 2099 line_index++; 2100 } 2101 2102 return FALSE; 2103 } 2104 2105 /* Returns 0 if it is not a reference definition. 2106 * 2107 * Returns N > 0 if it is a reference definition. N then corresponds to the 2108 * number of lines forming it). In this case the definition is stored for 2109 * resolving any links referring to it. 2110 * 2111 * Returns -1 in case of an error (out of memory). 
2112 */ 2113 static int 2114 md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines) 2115 { 2116 OFF label_contents_beg; 2117 OFF label_contents_end; 2118 int label_contents_line_index = -1; 2119 int label_is_multiline = FALSE; 2120 OFF dest_contents_beg; 2121 OFF dest_contents_end; 2122 OFF title_contents_beg; 2123 OFF title_contents_end; 2124 int title_contents_line_index; 2125 int title_is_multiline = FALSE; 2126 OFF off; 2127 int line_index = 0; 2128 int tmp_line_index; 2129 MD_REF_DEF* def = NULL; 2130 int ret = 0; 2131 2132 /* Link label. */ 2133 if(!md_is_link_label(ctx, lines, n_lines, lines[0].beg, 2134 &off, &label_contents_line_index, &line_index, 2135 &label_contents_beg, &label_contents_end)) 2136 return FALSE; 2137 label_is_multiline = (label_contents_line_index != line_index); 2138 2139 /* Colon. */ 2140 if(off >= lines[line_index].end || CH(off) != _T(':')) 2141 return FALSE; 2142 off++; 2143 2144 /* Optional white space with up to one line break. */ 2145 while(off < lines[line_index].end && ISWHITESPACE(off)) 2146 off++; 2147 if(off >= lines[line_index].end) { 2148 line_index++; 2149 if(line_index >= n_lines) 2150 return FALSE; 2151 off = lines[line_index].beg; 2152 } 2153 2154 /* Link destination. */ 2155 if(!md_is_link_destination(ctx, off, lines[line_index].end, 2156 &off, &dest_contents_beg, &dest_contents_end)) 2157 return FALSE; 2158 2159 /* (Optional) title. Note we interpret it as an title only if nothing 2160 * more follows on its last line. */ 2161 if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off, 2162 &off, &title_contents_line_index, &tmp_line_index, 2163 &title_contents_beg, &title_contents_end) 2164 && off >= lines[line_index + tmp_line_index].end) 2165 { 2166 title_is_multiline = (tmp_line_index != title_contents_line_index); 2167 title_contents_line_index += line_index; 2168 line_index += tmp_line_index; 2169 } else { 2170 /* Not a title. 
*/ 2171 title_is_multiline = FALSE; 2172 title_contents_beg = off; 2173 title_contents_end = off; 2174 title_contents_line_index = 0; 2175 } 2176 2177 /* Nothing more can follow on the last line. */ 2178 if(off < lines[line_index].end) 2179 return FALSE; 2180 2181 /* So, it _is_ a reference definition. Remember it. */ 2182 if(ctx->n_ref_defs >= ctx->alloc_ref_defs) { 2183 MD_REF_DEF* new_defs; 2184 2185 ctx->alloc_ref_defs = (ctx->alloc_ref_defs > 0 2186 ? ctx->alloc_ref_defs + ctx->alloc_ref_defs / 2 2187 : 16); 2188 new_defs = (MD_REF_DEF*) realloc(ctx->ref_defs, ctx->alloc_ref_defs * sizeof(MD_REF_DEF)); 2189 if(new_defs == NULL) { 2190 MD_LOG("realloc() failed."); 2191 goto abort; 2192 } 2193 2194 ctx->ref_defs = new_defs; 2195 } 2196 def = &ctx->ref_defs[ctx->n_ref_defs]; 2197 memset(def, 0, sizeof(MD_REF_DEF)); 2198 2199 if(label_is_multiline) { 2200 MD_CHECK(md_merge_lines_alloc(ctx, label_contents_beg, label_contents_end, 2201 lines + label_contents_line_index, n_lines - label_contents_line_index, 2202 _T(' '), &def->label, &def->label_size)); 2203 def->label_needs_free = TRUE; 2204 } else { 2205 def->label = (CHAR*) STR(label_contents_beg); 2206 def->label_size = label_contents_end - label_contents_beg; 2207 } 2208 2209 if(title_is_multiline) { 2210 MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end, 2211 lines + title_contents_line_index, n_lines - title_contents_line_index, 2212 _T('\n'), &def->title, &def->title_size)); 2213 def->title_needs_free = TRUE; 2214 } else { 2215 def->title = (CHAR*) STR(title_contents_beg); 2216 def->title_size = title_contents_end - title_contents_beg; 2217 } 2218 2219 def->dest_beg = dest_contents_beg; 2220 def->dest_end = dest_contents_end; 2221 2222 /* Success. */ 2223 ctx->n_ref_defs++; 2224 return line_index + 1; 2225 2226 abort: 2227 /* Failure. 
*/ 2228 if(def != NULL && def->label_needs_free) 2229 free(def->label); 2230 if(def != NULL && def->title_needs_free) 2231 free(def->title); 2232 return ret; 2233 } 2234 2235 static int 2236 md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, int n_lines, 2237 OFF beg, OFF end, MD_LINK_ATTR* attr) 2238 { 2239 const MD_REF_DEF* def; 2240 const MD_LINE* beg_line; 2241 int is_multiline; 2242 CHAR* label; 2243 SZ label_size; 2244 int ret; 2245 2246 MD_ASSERT(CH(beg) == _T('[') || CH(beg) == _T('!')); 2247 MD_ASSERT(CH(end-1) == _T(']')); 2248 2249 beg += (CH(beg) == _T('!') ? 2 : 1); 2250 end--; 2251 2252 /* Find lines corresponding to the beg and end positions. */ 2253 beg_line = md_lookup_line(beg, lines, n_lines); 2254 is_multiline = (end > beg_line->end); 2255 2256 if(is_multiline) { 2257 MD_CHECK(md_merge_lines_alloc(ctx, beg, end, beg_line, 2258 (int)(n_lines - (beg_line - lines)), _T(' '), &label, &label_size)); 2259 } else { 2260 label = (CHAR*) STR(beg); 2261 label_size = end - beg; 2262 } 2263 2264 def = md_lookup_ref_def(ctx, label, label_size); 2265 if(def != NULL) { 2266 attr->dest_beg = def->dest_beg; 2267 attr->dest_end = def->dest_end; 2268 attr->title = def->title; 2269 attr->title_size = def->title_size; 2270 attr->title_needs_free = FALSE; 2271 } 2272 2273 if(is_multiline) 2274 free(label); 2275 2276 ret = (def != NULL); 2277 2278 abort: 2279 return ret; 2280 } 2281 2282 static int 2283 md_is_inline_link_spec(MD_CTX* ctx, const MD_LINE* lines, int n_lines, 2284 OFF beg, OFF* p_end, MD_LINK_ATTR* attr) 2285 { 2286 int line_index = 0; 2287 int tmp_line_index; 2288 OFF title_contents_beg; 2289 OFF title_contents_end; 2290 int title_contents_line_index; 2291 int title_is_multiline; 2292 OFF off = beg; 2293 int ret = FALSE; 2294 2295 while(off >= lines[line_index].end) 2296 line_index++; 2297 2298 MD_ASSERT(CH(off) == _T('(')); 2299 off++; 2300 2301 /* Optional white space with up to one line break. 
*/ 2302 while(off < lines[line_index].end && ISWHITESPACE(off)) 2303 off++; 2304 if(off >= lines[line_index].end && (off >= ctx->size || ISNEWLINE(off))) { 2305 line_index++; 2306 if(line_index >= n_lines) 2307 return FALSE; 2308 off = lines[line_index].beg; 2309 } 2310 2311 /* Link destination may be omitted, but only when not also having a title. */ 2312 if(off < ctx->size && CH(off) == _T(')')) { 2313 attr->dest_beg = off; 2314 attr->dest_end = off; 2315 attr->title = NULL; 2316 attr->title_size = 0; 2317 attr->title_needs_free = FALSE; 2318 off++; 2319 *p_end = off; 2320 return TRUE; 2321 } 2322 2323 /* Link destination. */ 2324 if(!md_is_link_destination(ctx, off, lines[line_index].end, 2325 &off, &attr->dest_beg, &attr->dest_end)) 2326 return FALSE; 2327 2328 /* (Optional) title. */ 2329 if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off, 2330 &off, &title_contents_line_index, &tmp_line_index, 2331 &title_contents_beg, &title_contents_end)) 2332 { 2333 title_is_multiline = (tmp_line_index != title_contents_line_index); 2334 title_contents_line_index += line_index; 2335 line_index += tmp_line_index; 2336 } else { 2337 /* Not a title. */ 2338 title_is_multiline = FALSE; 2339 title_contents_beg = off; 2340 title_contents_end = off; 2341 title_contents_line_index = 0; 2342 } 2343 2344 /* Optional whitespace followed with final ')'. 
*/ 2345 while(off < lines[line_index].end && ISWHITESPACE(off)) 2346 off++; 2347 if (off >= lines[line_index].end && (off >= ctx->size || ISNEWLINE(off))) { 2348 line_index++; 2349 if(line_index >= n_lines) 2350 return FALSE; 2351 off = lines[line_index].beg; 2352 } 2353 if(CH(off) != _T(')')) 2354 goto abort; 2355 off++; 2356 2357 if(title_contents_beg >= title_contents_end) { 2358 attr->title = NULL; 2359 attr->title_size = 0; 2360 attr->title_needs_free = FALSE; 2361 } else if(!title_is_multiline) { 2362 attr->title = (CHAR*) STR(title_contents_beg); 2363 attr->title_size = title_contents_end - title_contents_beg; 2364 attr->title_needs_free = FALSE; 2365 } else { 2366 MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end, 2367 lines + title_contents_line_index, n_lines - title_contents_line_index, 2368 _T('\n'), &attr->title, &attr->title_size)); 2369 attr->title_needs_free = TRUE; 2370 } 2371 2372 *p_end = off; 2373 ret = TRUE; 2374 2375 abort: 2376 return ret; 2377 } 2378 2379 static void 2380 md_free_ref_defs(MD_CTX* ctx) 2381 { 2382 int i; 2383 2384 for(i = 0; i < ctx->n_ref_defs; i++) { 2385 MD_REF_DEF* def = &ctx->ref_defs[i]; 2386 2387 if(def->label_needs_free) 2388 free(def->label); 2389 if(def->title_needs_free) 2390 free(def->title); 2391 } 2392 2393 free(ctx->ref_defs); 2394 } 2395 2396 2397 /****************************************** 2398 *** Processing Inlines (a.k.a Spans) *** 2399 ******************************************/ 2400 2401 /* We process inlines in few phases: 2402 * 2403 * (1) We go through the block text and collect all significant characters 2404 * which may start/end a span or some other significant position into 2405 * ctx->marks[]. Core of this is what md_collect_marks() does. 2406 * 2407 * We also do some very brief preliminary context-less analysis, whether 2408 * it might be opener or closer (e.g. of an emphasis span). 
2409 * 2410 * This speeds the other steps as we do not need to re-iterate over all 2411 * characters anymore. 2412 * 2413 * (2) We analyze each potential mark types, in order by their precedence. 2414 * 2415 * In each md_analyze_XXX() function, we re-iterate list of the marks, 2416 * skipping already resolved regions (in preceding precedences) and try to 2417 * resolve them. 2418 * 2419 * (2.1) For trivial marks, which are single (e.g. HTML entity), we just mark 2420 * them as resolved. 2421 * 2422 * (2.2) For range-type marks, we analyze whether the mark could be closer 2423 * and, if yes, whether there is some preceding opener it could satisfy. 2424 * 2425 * If not we check whether it could be really an opener and if yes, we 2426 * remember it so subsequent closers may resolve it. 2427 * 2428 * (3) Finally, when all marks were analyzed, we render the block contents 2429 * by calling MD_RENDERER::text() callback, interrupting by ::enter_span() 2430 * or ::close_span() whenever we reach a resolved mark. 2431 */ 2432 2433 2434 /* The mark structure. 2435 * 2436 * '\\': Maybe escape sequence. 2437 * '\0': NULL char. 2438 * '*': Maybe (strong) emphasis start/end. 2439 * '_': Maybe (strong) emphasis start/end. 2440 * '~': Maybe strikethrough start/end (needs MD_FLAG_STRIKETHROUGH). 2441 * '`': Maybe code span start/end. 2442 * '&': Maybe start of entity. 2443 * ';': Maybe end of entity. 2444 * '<': Maybe start of raw HTML or autolink. 2445 * '>': Maybe end of raw HTML or autolink. 2446 * '[': Maybe start of link label or link text. 2447 * '!': Equivalent of '[' for image. 2448 * ']': Maybe end of link label or link text. 2449 * '@': Maybe permissive e-mail auto-link (needs MD_FLAG_PERMISSIVEEMAILAUTOLINKS). 2450 * ':': Maybe permissive URL auto-link (needs MD_FLAG_PERMISSIVEURLAUTOLINKS). 2451 * '.': Maybe permissive WWW auto-link (needs MD_FLAG_PERMISSIVEWWWAUTOLINKS). 2452 * 'D': Dummy mark, it reserves a space for splitting a previous mark 2453 * (e.g. 
emphasis) or to make more space for storing some special data 2454 * related to the preceding mark (e.g. link). 2455 * 2456 * Note that not all instances of these chars in the text imply creation of the 2457 * structure. Only those which have (or may have, after we see more context) 2458 * the special meaning. 2459 * 2460 * (Keep this struct as small as possible to fit as much of them into CPU 2461 * cache line.) 2462 */ 2463 struct MD_MARK_tag { 2464 OFF beg; 2465 OFF end; 2466 2467 /* For unresolved openers, 'prev' and 'next' form the chain of open openers 2468 * of given type 'ch'. 2469 * 2470 * During resolving, we disconnect from the chain and point to the 2471 * corresponding counterpart so opener points to its closer and vice versa. 2472 */ 2473 int prev; 2474 int next; 2475 CHAR ch; 2476 unsigned char flags; 2477 }; 2478 2479 /* Mark flags (these apply to ALL mark types). */ 2480 #define MD_MARK_POTENTIAL_OPENER 0x01 /* Maybe opener. */ 2481 #define MD_MARK_POTENTIAL_CLOSER 0x02 /* Maybe closer. */ 2482 #define MD_MARK_OPENER 0x04 /* Definitely opener. */ 2483 #define MD_MARK_CLOSER 0x08 /* Definitely closer. */ 2484 #define MD_MARK_RESOLVED 0x10 /* Resolved in any definite way. */ 2485 2486 /* Mark flags specific for various mark types (so they can share bits). */ 2487 #define MD_MARK_EMPH_INTRAWORD 0x20 /* Helper for the "rule of 3". */ 2488 #define MD_MARK_EMPH_MOD3_0 0x40 2489 #define MD_MARK_EMPH_MOD3_1 0x80 2490 #define MD_MARK_EMPH_MOD3_2 (0x40 | 0x80) 2491 #define MD_MARK_EMPH_MOD3_MASK (0x40 | 0x80) 2492 #define MD_MARK_AUTOLINK 0x20 /* Distinguisher for '<', '>'. */ 2493 #define MD_MARK_VALIDPERMISSIVEAUTOLINK 0x20 /* For permissive autolinks. 
*/ 2494 #define MD_MARK_HASNESTEDBRACKETS 0x20 /* For '[' to rule out invalid link labels early */ 2495 2496 static MD_MARKCHAIN* 2497 md_asterisk_chain(MD_CTX* ctx, unsigned flags) 2498 { 2499 switch(flags & (MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_MASK)) { 2500 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_0: return &ASTERISK_OPENERS_intraword_mod3_0; 2501 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_1: return &ASTERISK_OPENERS_intraword_mod3_1; 2502 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_2: return &ASTERISK_OPENERS_intraword_mod3_2; 2503 case MD_MARK_EMPH_MOD3_0: return &ASTERISK_OPENERS_extraword_mod3_0; 2504 case MD_MARK_EMPH_MOD3_1: return &ASTERISK_OPENERS_extraword_mod3_1; 2505 case MD_MARK_EMPH_MOD3_2: return &ASTERISK_OPENERS_extraword_mod3_2; 2506 default: MD_UNREACHABLE(); 2507 } 2508 return NULL; 2509 } 2510 2511 static MD_MARKCHAIN* 2512 md_mark_chain(MD_CTX* ctx, int mark_index) 2513 { 2514 MD_MARK* mark = &ctx->marks[mark_index]; 2515 2516 switch(mark->ch) { 2517 case _T('*'): return md_asterisk_chain(ctx, mark->flags); 2518 case _T('_'): return &UNDERSCORE_OPENERS; 2519 case _T('~'): return (mark->end - mark->beg == 1) ? &TILDE_OPENERS_1 : &TILDE_OPENERS_2; 2520 /* case _T('!'): MD_FALLTHROUGH(); */ 2521 case _T('['): return &BRACKET_OPENERS; 2522 case _T('|'): return &TABLECELLBOUNDARIES; 2523 case _T('-'): return &FAINT_OPENERS; 2524 case _T('%'): return &INVERSE_OPENERS; 2525 case _T('!'): return &CONCEAL_OPENERS; 2526 case _T('^'): return &BLINK_OPENERS; 2527 default: return NULL; 2528 } 2529 } 2530 2531 static MD_MARK* 2532 md_push_mark(MD_CTX* ctx) 2533 { 2534 if(ctx->n_marks >= ctx->alloc_marks) { 2535 MD_MARK* new_marks; 2536 2537 ctx->alloc_marks = (ctx->alloc_marks > 0 2538 ? 
ctx->alloc_marks + ctx->alloc_marks / 2 2539 : 64); 2540 new_marks = realloc(ctx->marks, ctx->alloc_marks * sizeof(MD_MARK)); 2541 if(new_marks == NULL) { 2542 MD_LOG("realloc() failed."); 2543 return NULL; 2544 } 2545 2546 ctx->marks = new_marks; 2547 } 2548 2549 return &ctx->marks[ctx->n_marks++]; 2550 } 2551 2552 #define PUSH_MARK_() \ 2553 do { \ 2554 mark = md_push_mark(ctx); \ 2555 if(mark == NULL) { \ 2556 ret = -1; \ 2557 goto abort; \ 2558 } \ 2559 } while(0) 2560 2561 #define PUSH_MARK(ch_, beg_, end_, flags_) \ 2562 do { \ 2563 PUSH_MARK_(); \ 2564 mark->beg = (beg_); \ 2565 mark->end = (end_); \ 2566 mark->prev = -1; \ 2567 mark->next = -1; \ 2568 mark->ch = (char)(ch_); \ 2569 mark->flags = (flags_); \ 2570 } while(0) 2571 2572 2573 static void 2574 md_mark_chain_append(MD_CTX* ctx, MD_MARKCHAIN* chain, int mark_index) 2575 { 2576 if(chain->tail >= 0) 2577 ctx->marks[chain->tail].next = mark_index; 2578 else 2579 chain->head = mark_index; 2580 2581 ctx->marks[mark_index].prev = chain->tail; 2582 ctx->marks[mark_index].next = -1; 2583 chain->tail = mark_index; 2584 } 2585 2586 /* Sometimes, we need to store a pointer into the mark. It is quite rare 2587 * so we do not bother to make MD_MARK use union, and it can only happen 2588 * for dummy marks. */ 2589 static inline void 2590 md_mark_store_ptr(MD_CTX* ctx, int mark_index, void* ptr) 2591 { 2592 MD_MARK* mark = &ctx->marks[mark_index]; 2593 MD_ASSERT(mark->ch == 'D'); 2594 2595 /* Check only members beg and end are misused for this. 
*/ 2596 MD_ASSERT(sizeof(void*) <= 2 * sizeof(OFF)); 2597 memcpy(mark, &ptr, sizeof(void*)); 2598 } 2599 2600 static inline void* 2601 md_mark_get_ptr(MD_CTX* ctx, int mark_index) 2602 { 2603 void* ptr; 2604 MD_MARK* mark = &ctx->marks[mark_index]; 2605 MD_ASSERT(mark->ch == 'D'); 2606 memcpy(&ptr, mark, sizeof(void*)); 2607 return ptr; 2608 } 2609 2610 static void 2611 md_resolve_range(MD_CTX* ctx, MD_MARKCHAIN* chain, int opener_index, int closer_index) 2612 { 2613 MD_MARK* opener = &ctx->marks[opener_index]; 2614 MD_MARK* closer = &ctx->marks[closer_index]; 2615 2616 /* Remove opener from the list of openers. */ 2617 if(chain != NULL) { 2618 if(opener->prev >= 0) 2619 ctx->marks[opener->prev].next = opener->next; 2620 else 2621 chain->head = opener->next; 2622 2623 if(opener->next >= 0) 2624 ctx->marks[opener->next].prev = opener->prev; 2625 else 2626 chain->tail = opener->prev; 2627 } 2628 2629 /* Interconnect opener and closer and mark both as resolved. */ 2630 opener->next = closer_index; 2631 opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED; 2632 closer->prev = opener_index; 2633 closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED; 2634 } 2635 2636 2637 #define MD_ROLLBACK_ALL 0 2638 #define MD_ROLLBACK_CROSSING 1 2639 2640 /* In the range ctx->marks[opener_index] ... [closer_index], undo some or all 2641 * resolvings accordingly to these rules: 2642 * 2643 * (1) All openers BEFORE the range corresponding to any closer inside the 2644 * range are un-resolved and they are re-added to their respective chains 2645 * of unresolved openers. This ensures we can reuse the opener for closers 2646 * AFTER the range. 2647 * 2648 * (2) If 'how' is MD_ROLLBACK_ALL, then ALL resolved marks inside the range 2649 * are discarded. 2650 * 2651 * (3) If 'how' is MD_ROLLBACK_CROSSING, only closers with openers handled 2652 * in (1) are discarded. I.e. pairs of openers and closers which are both 2653 * inside the range are retained as well as any unpaired marks. 
2654 */ 2655 static void 2656 md_rollback(MD_CTX* ctx, int opener_index, int closer_index, int how) 2657 { 2658 int i; 2659 int mark_index; 2660 2661 /* Cut all unresolved openers at the mark index. */ 2662 for(i = OPENERS_CHAIN_FIRST; i < OPENERS_CHAIN_LAST+1; i++) { 2663 MD_MARKCHAIN* chain = &ctx->mark_chains[i]; 2664 2665 while(chain->tail >= opener_index) { 2666 int same = chain->tail == opener_index; 2667 chain->tail = ctx->marks[chain->tail].prev; 2668 if (same) break; 2669 } 2670 2671 if(chain->tail >= 0) 2672 ctx->marks[chain->tail].next = -1; 2673 else 2674 chain->head = -1; 2675 } 2676 2677 /* Go backwards so that unresolved openers are re-added into their 2678 * respective chains, in the right order. */ 2679 mark_index = closer_index - 1; 2680 while(mark_index > opener_index) { 2681 MD_MARK* mark = &ctx->marks[mark_index]; 2682 int mark_flags = mark->flags; 2683 int discard_flag = (how == MD_ROLLBACK_ALL); 2684 2685 if(mark->flags & MD_MARK_CLOSER) { 2686 int mark_opener_index = mark->prev; 2687 2688 /* Undo opener BEFORE the range. */ 2689 if(mark_opener_index < opener_index) { 2690 MD_MARK* mark_opener = &ctx->marks[mark_opener_index]; 2691 MD_MARKCHAIN* chain; 2692 2693 mark_opener->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED); 2694 chain = md_mark_chain(ctx, opener_index); 2695 if(chain != NULL) { 2696 md_mark_chain_append(ctx, chain, mark_opener_index); 2697 discard_flag = 1; 2698 } 2699 } 2700 } 2701 2702 /* And reset our flags. */ 2703 if(discard_flag) { 2704 /* Make zero-length closer a dummy mark as that's how it was born */ 2705 if((mark->flags & MD_MARK_CLOSER) && mark->beg == mark->end) 2706 mark->ch = 'D'; 2707 2708 mark->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED); 2709 } 2710 2711 /* Jump as far as we can over unresolved or non-interesting marks. 
*/ 2712 switch(how) { 2713 case MD_ROLLBACK_CROSSING: 2714 if((mark_flags & MD_MARK_CLOSER) && mark->prev > opener_index) { 2715 /* If we are closer with opener INSIDE the range, there may 2716 * not be any other crosser inside the subrange. */ 2717 mark_index = mark->prev; 2718 break; 2719 } 2720 MD_FALLTHROUGH(); 2721 default: 2722 mark_index--; 2723 break; 2724 } 2725 } 2726 } 2727 2728 static void 2729 md_build_mark_char_map(MD_CTX* ctx) 2730 { 2731 memset(ctx->mark_char_map, 0, sizeof(ctx->mark_char_map)); 2732 2733 ctx->mark_char_map['\\'] = 1; 2734 ctx->mark_char_map['^'] = 1; 2735 ctx->mark_char_map['%'] = 1; 2736 ctx->mark_char_map['-'] = 1; 2737 ctx->mark_char_map['*'] = 1; 2738 ctx->mark_char_map['_'] = 1; 2739 ctx->mark_char_map['`'] = 1; 2740 ctx->mark_char_map['&'] = 1; 2741 ctx->mark_char_map[';'] = 1; 2742 ctx->mark_char_map['<'] = 1; 2743 ctx->mark_char_map['>'] = 1; 2744 ctx->mark_char_map['['] = 1; 2745 ctx->mark_char_map['!'] = 1; 2746 ctx->mark_char_map[']'] = 1; 2747 ctx->mark_char_map['\0'] = 1; 2748 2749 if(ctx->parser.flags & MD_FLAG_STRIKETHROUGH) 2750 ctx->mark_char_map['~'] = 1; 2751 2752 if(ctx->parser.flags & MD_FLAG_LATEXMATHSPANS) 2753 ctx->mark_char_map['