💾 Archived View for gmi.noulin.net › gitRepositories › md › file › shpPackages › md4c › md4c.c.gmi captured on 2023-07-10 at 15:36:40. Gemini links have been rewritten to link to archived content

View Raw

More Information

⬅️ Previous capture (2023-01-29)

-=-=-=-=-=-=-

md

Log

Files

Refs

README

LICENSE

md4c.c (234850B)

     1 /* commit e9ff661ff818ee94a4a231958d9b6768dc6882c9 - mity/md4c repo
     2  * MD4C: Markdown parser for C
     3  * (http://github.com/mity/md4c)
     4  *
     5  * Copyright (c) 2016-2020 Martin Mitas
     6  *
     7  * Permission is hereby granted, free of charge, to any person obtaining a
     8  * copy of this software and associated documentation files (the "Software"),
     9  * to deal in the Software without restriction, including without limitation
    10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
    11  * and/or sell copies of the Software, and to permit persons to whom the
    12  * Software is furnished to do so, subject to the following conditions:
    13  *
    14  * The above copyright notice and this permission notice shall be included in
    15  * all copies or substantial portions of the Software.
    16  *
    17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
    18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    20  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    22  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
    23  * IN THE SOFTWARE.
    24  */
    25 
    26 #include "md4c.h"
    27 
    28 #include <limits.h>
    29 #include <stdio.h>
    30 #include <stdlib.h>
    31 #include <string.h>
    32 
    33 
    34 /*****************************
    35  ***  Miscellaneous Stuff  ***
    36  *****************************/
    37 
    /* The "inline" keyword was only introduced by C99 (__STDC_VERSION__ ==
     * 199901L). C89/C90, C94 (199409L) and old compilers in general may not
     * understand it, so map it to a compiler-specific spelling or drop it. */
    #if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L
        #if defined __GNUC__
            #define inline __inline__
        #elif defined _MSC_VER
            #define inline __inline
        #else
            #define inline
        #endif
    #endif
    48 
    /* Unless the build explicitly selects an encoding, assume UTF-8. */
    #if !defined(MD4C_USE_ASCII) && !defined(MD4C_USE_UTF8) && !defined(MD4C_USE_UTF16)
        #define MD4C_USE_UTF8
    #endif

    /* _T() wraps character and string literals: they become wide literals
     * when MD4C_USE_UTF16 is defined and are left untouched otherwise.
     * Any _T defined earlier is dropped first. */
    #if defined(_T)
        #undef _T
    #endif
    #if defined(MD4C_USE_UTF16)
        #define _T(x)           L##x
    #else
        #define _T(x)           x
    #endif
    63 
    /* Miscellaneous helper macros. */

    /* Element count of a real array (must not be used on a pointer). */
    #define SIZEOF_ARRAY(a)     (sizeof(a) / sizeof((a)[0]))

    /* Two-step stringification: STRINGIZE() expands its argument first,
     * STRINGIZE_() stringizes it verbatim. */
    #define STRINGIZE_(x)       #x
    #define STRINGIZE(x)        STRINGIZE_(x)

    /* Boolean constants, unless something else has provided them already. */
    #if !defined(TRUE)
        #define TRUE            1
        #define FALSE           0
    #endif
    74 
    /* Passes a diagnostic message to the debug_log callback of the parser,
     * when such a callback is set. Requires a variable named ctx in scope. */
    #define MD_LOG(msg)                                                     \
        do {                                                                \
            if(ctx->parser.debug_log)                                       \
                (*ctx->parser.debug_log)((msg), ctx->userdata);             \
        } while(0)
    80 
    /* MD_ASSERT(cond) and MD_UNREACHABLE():
     *  -- DEBUG builds: a failed assertion is reported through MD_LOG() and
     *     the process exits with status 1.
     *  -- Other builds: the condition is turned into an optimizer hint where
     *     the compiler offers one, or into nothing at all otherwise. */
    #if defined(DEBUG)
        #define MD_ASSERT(cond)                                             \
                do {                                                        \
                    if(!(cond)) {                                           \
                        MD_LOG(__FILE__ ":" STRINGIZE(__LINE__) ": "        \
                               "Assertion '" STRINGIZE(cond) "' failed.");  \
                        exit(1);                                            \
                    }                                                       \
                } while(0)

        #define MD_UNREACHABLE()        MD_ASSERT(1 == 0)
    #elif defined(__GNUC__)
        #define MD_ASSERT(cond)     do { if(!(cond)) __builtin_unreachable(); } while(0)
        #define MD_UNREACHABLE()    do { __builtin_unreachable(); } while(0)
    #elif defined _MSC_VER  &&  _MSC_VER > 120
        #define MD_ASSERT(cond)     do { __assume(cond); } while(0)
        #define MD_UNREACHABLE()    do { __assume(0); } while(0)
    #else
        #define MD_ASSERT(cond)     do {} while(0)
        #define MD_UNREACHABLE()    do {} while(0)
    #endif
   104 
   /* Marks an intentional fall through between case labels in a switch.
    * Expands to the fallthrough attribute where the compiler is known to
    * support it and to a no-op expression elsewhere. */
   #if defined __clang__ && __clang_major__ >= 12
       #define MD_FALLTHROUGH()        __attribute__((fallthrough))
   #elif defined __GNUC__ && __GNUC__ >= 7
       #define MD_FALLTHROUGH()        __attribute__((fallthrough))
   #else
       #define MD_FALLTHROUGH()        ((void)0)
   #endif

   /* Suppress "unused parameter" warnings. The argument is parenthesized so
    * that any expression (not just a plain identifier) is accepted. */
   #define MD_UNUSED(x)                ((void)(x))
   116 
   117 
   118 /************************
   119  ***  Internal Types  ***
   120  ************************/
   121 
   /* These types are used all over the place, so let's give them short names. */
   #define CHAR    MD_CHAR
   #define SZ      MD_SIZE
   #define OFF     MD_OFFSET

   /* Forward declarations of internal structures. */
   typedef struct MD_MARK_tag      MD_MARK;
   typedef struct MD_BLOCK_tag     MD_BLOCK;
   typedef struct MD_CONTAINER_tag MD_CONTAINER;
   typedef struct MD_REF_DEF_tag   MD_REF_DEF;
   131 
   132 
   /* While analyzing inline marks we have to keep several "mark chains" of
    * (so far unresolved) openers. This structure records both ends of one
    * such chain; the links between the marks themselves are kept in
    * MD_MARK::prev and MD_MARK::next.
    */
   typedef struct MD_MARKCHAIN_tag {
       int head;   /* Index of the first mark in the chain; -1 when empty. */
       int tail;   /* Index of the last mark in the chain; -1 when empty. */
   } MD_MARKCHAIN;
   142 
   143 /* Context propagated through all the parsing. */
   144 typedef struct MD_CTX_tag MD_CTX;
   145 struct MD_CTX_tag {
   146     /* Immutable stuff (parameters of md_parse()). */
   147     const CHAR* text;
   148     SZ size;
   149     MD_PARSER parser;
   150     void* userdata;
   151 
   152     /* When this is true, it allows some optimizations. */
   153     int doc_ends_with_newline;
   154 
   155     /* Helper temporary growing buffer. */
   156     CHAR* buffer;
   157     unsigned alloc_buffer;
   158 
   159     /* Reference definitions. */
   160     MD_REF_DEF* ref_defs;
   161     int n_ref_defs;
   162     int alloc_ref_defs;
   163     void** ref_def_hashtable;
   164     int ref_def_hashtable_size;
   165 
   166     /* Stack of inline/span markers.
   167      * This is only used for parsing a single block contents but by storing it
   168      * here we may reuse the stack for subsequent blocks; i.e. we have fewer
   169      * (re)allocations. */
   170     MD_MARK* marks;
   171     int n_marks;
   172     int alloc_marks;
   173 
   174 #if defined MD4C_USE_UTF16
   175     char mark_char_map[128];
   176 #else
   177     char mark_char_map[256];
   178 #endif
   179 
   180     /* For resolving of inline spans. */
   181     MD_MARKCHAIN mark_chains[17];
   182 #define PTR_CHAIN                               (ctx->mark_chains[0])
   183 #define TABLECELLBOUNDARIES                     (ctx->mark_chains[1])
   184 #define ASTERISK_OPENERS_extraword_mod3_0       (ctx->mark_chains[2])
   185 #define ASTERISK_OPENERS_extraword_mod3_1       (ctx->mark_chains[3])
   186 #define ASTERISK_OPENERS_extraword_mod3_2       (ctx->mark_chains[4])
   187 #define ASTERISK_OPENERS_intraword_mod3_0       (ctx->mark_chains[5])
   188 #define ASTERISK_OPENERS_intraword_mod3_1       (ctx->mark_chains[6])
   189 #define ASTERISK_OPENERS_intraword_mod3_2       (ctx->mark_chains[7])
   190 #define UNDERSCORE_OPENERS                      (ctx->mark_chains[8])
   191 #define TILDE_OPENERS_1                         (ctx->mark_chains[9])
   192 #define TILDE_OPENERS_2                         (ctx->mark_chains[10])
   193 #define BRACKET_OPENERS                         (ctx->mark_chains[11])
   194 #define DOLLAR_OPENERS                          (ctx->mark_chains[12])
   195 #define FAINT_OPENERS                           (ctx->mark_chains[13])
   196 #define INVERSE_OPENERS                         (ctx->mark_chains[14])
   197 #define CONCEAL_OPENERS                         (ctx->mark_chains[15])
   198 #define BLINK_OPENERS                           (ctx->mark_chains[16])
   199 #define OPENERS_CHAIN_FIRST                     1
   200 #define OPENERS_CHAIN_LAST                      16
   201 
   202     int n_table_cell_boundaries;
   203 
   204     /* For resolving links. */
   205     int unresolved_link_head;
   206     int unresolved_link_tail;
   207 
   208     /* For resolving raw HTML. */
   209     OFF html_comment_horizon;
   210     OFF html_proc_instr_horizon;
   211     OFF html_decl_horizon;
   212     OFF html_cdata_horizon;
   213 
   214     /* For block analysis.
   215      * Notes:
   216      *   -- It holds MD_BLOCK as well as MD_LINE structures. After each
   217      *      MD_BLOCK, its (multiple) MD_LINE(s) follow.
   218      *   -- For MD_BLOCK_HTML and MD_BLOCK_CODE, MD_VERBATIMLINE(s) are used
   219      *      instead of MD_LINE(s).
   220      */
   221     void* block_bytes;
   222     MD_BLOCK* current_block;
   223     int n_block_bytes;
   224     int alloc_block_bytes;
   225 
   226     /* For container block analysis. */
   227     MD_CONTAINER* containers;
   228     int n_containers;
   229     int alloc_containers;
   230 
   231     /* Minimal indentation to call the block "indented code block". */
   232     unsigned code_indent_offset;
   233 
   234     /* Contextual info for line analysis. */
   235     SZ code_fence_length;   /* For checking closing fence length. */
   236     int html_block_type;    /* For checking closing raw HTML condition. */
   237     int last_line_has_list_loosening_effect;
   238     int last_list_item_starts_with_two_blank_lines;
   239 };
   240 
   /* Kinds of lines recognized by the line analysis. */
   typedef enum MD_LINETYPE_tag {
       MD_LINE_BLANK,
       MD_LINE_HR,
       MD_LINE_ATXHEADER,
       MD_LINE_SETEXTHEADER,
       MD_LINE_SETEXTUNDERLINE,
       MD_LINE_INDENTEDCODE,
       MD_LINE_FENCEDCODE,
       MD_LINE_HTML,
       MD_LINE_TEXT,
       MD_LINE_TABLE,
       MD_LINE_TABLEUNDERLINE
   } MD_LINETYPE;
   255 
   256 typedef struct MD_LINE_ANALYSIS_tag MD_LINE_ANALYSIS;
   257 struct MD_LINE_ANALYSIS_tag {
   258     MD_LINETYPE type    : 16;
   259     unsigned data       : 16;
   260     OFF beg;
   261     OFF end;
   262     unsigned indent;        /* Indentation level. */
   263 };
   264 
   265 typedef struct MD_LINE_tag MD_LINE;
   266 struct MD_LINE_tag {
   267     OFF beg;
   268     OFF end;
   269 };
   270 
   271 typedef struct MD_VERBATIMLINE_tag MD_VERBATIMLINE;
   272 struct MD_VERBATIMLINE_tag {
   273     OFF beg;
   274     OFF end;
   275     OFF indent;
   276 };
   277 
   278 
   279 /*****************
   280  ***  Helpers  ***
   281  *****************/
   282 
   /* Access to the input text by offset; both expect ctx to be in scope. */
   #define CH(off)                 (ctx->text[(off)])
   #define STR(off)                (ctx->text + (off))

   /* Character classification.
    *
    * The macros with the trailing underscore take a character value, the
    * ones without it take an offset into the input text.
    *
    * Note: code points below 128 are assumed to be ASCII compatible. */

   /* Generic building blocks. */
   #define ISIN_(ch, ch_min, ch_max)       ((ch_min) <= (unsigned)(ch) && (unsigned)(ch) <= (ch_max))
   #define ISANYOF_(ch, palette)           ((ch) != _T('\0')  &&  md_strchr((palette), (ch)) != NULL)
   #define ISANYOF2_(ch, ch1, ch2)         ((ch) == (ch1) || (ch) == (ch2))
   #define ISANYOF3_(ch, ch1, ch2, ch3)    ((ch) == (ch1) || (ch) == (ch2) || (ch) == (ch3))

   /* Character classes. */
   #define ISASCII_(ch)                    ((unsigned)(ch) <= 127)
   #define ISBLANK_(ch)                    (ISANYOF2_((ch), _T(' '), _T('\t')))
   #define ISNEWLINE_(ch)                  (ISANYOF2_((ch), _T('\r'), _T('\n')))
   #define ISWHITESPACE_(ch)               (ISBLANK_(ch) || ISANYOF2_((ch), _T('\v'), _T('\f')))
   #define ISCNTRL_(ch)                    ((unsigned)(ch) <= 31 || (unsigned)(ch) == 127)
   #define ISPUNCT_(ch)                    (ISIN_(ch, 33, 47) || ISIN_(ch, 58, 64) || ISIN_(ch, 91, 96) || ISIN_(ch, 123, 126))
   #define ISUPPER_(ch)                    (ISIN_(ch, _T('A'), _T('Z')))
   #define ISLOWER_(ch)                    (ISIN_(ch, _T('a'), _T('z')))
   #define ISALPHA_(ch)                    (ISUPPER_(ch) || ISLOWER_(ch))
   #define ISDIGIT_(ch)                    (ISIN_(ch, _T('0'), _T('9')))
   #define ISXDIGIT_(ch)                   (ISDIGIT_(ch) || ISIN_(ch, _T('A'), _T('F')) || ISIN_(ch, _T('a'), _T('f')))
   #define ISALNUM_(ch)                    (ISALPHA_(ch) || ISDIGIT_(ch))

   /* The same checks applied to the character at the given offset. */
   #define ISANYOF(off, palette)           ISANYOF_(CH(off), (palette))
   #define ISANYOF2(off, ch1, ch2)         ISANYOF2_(CH(off), (ch1), (ch2))
   #define ISANYOF3(off, ch1, ch2, ch3)    ISANYOF3_(CH(off), (ch1), (ch2), (ch3))
   #define ISASCII(off)                    ISASCII_(CH(off))
   #define ISBLANK(off)                    ISBLANK_(CH(off))
   #define ISNEWLINE(off)                  ISNEWLINE_(CH(off))
   #define ISWHITESPACE(off)               ISWHITESPACE_(CH(off))
   #define ISCNTRL(off)                    ISCNTRL_(CH(off))
   #define ISPUNCT(off)                    ISPUNCT_(CH(off))
   #define ISUPPER(off)                    ISUPPER_(CH(off))
   #define ISLOWER(off)                    ISLOWER_(CH(off))
   #define ISALPHA(off)                    ISALPHA_(CH(off))
   #define ISDIGIT(off)                    ISDIGIT_(CH(off))
   #define ISXDIGIT(off)                   ISXDIGIT_(CH(off))
   #define ISALNUM(off)                    ISALNUM_(CH(off))


   /* strchr() flavor matching the character type in use (see ISANYOF_). */
   #if defined(MD4C_USE_UTF16)
       #define md_strchr wcschr
   #else
       #define md_strchr strchr
   #endif
   328 
   329 
   330 /* Case insensitive check of string equality. */
   331 static inline int
   332 md_ascii_case_eq(const CHAR* s1, const CHAR* s2, SZ n)
   333 {
   334     OFF i;
   335     for(i = 0; i < n; i++) {
   336         CHAR ch1 = s1[i];
   337         CHAR ch2 = s2[i];
   338 
   339         if(ISLOWER_(ch1))
   340             ch1 += ('A'-'a');
   341         if(ISLOWER_(ch2))
   342             ch2 += ('A'-'a');
   343         if(ch1 != ch2)
   344             return FALSE;
   345     }
   346     return TRUE;
   347 }
   348 
   349 static inline int
   350 md_ascii_eq(const CHAR* s1, const CHAR* s2, SZ n)
   351 {
   352     return memcmp(s1, s2, n * sizeof(CHAR)) == 0;
   353 }
   354 
   355 static int
   356 md_text_with_null_replacement(MD_CTX* ctx, MD_TEXTTYPE type, const CHAR* str, SZ size)
   357 {
   358     OFF off = 0;
   359     int ret = 0;
   360 
   361     while(1) {
   362         while(off < size  &&  str[off] != _T('\0'))
   363             off++;
   364 
   365         if(off > 0) {
   366             ret = ctx->parser.text(type, str, off, ctx->userdata);
   367             if(ret != 0)
   368                 return ret;
   369 
   370             str += off;
   371             size -= off;
   372             off = 0;
   373         }
   374 
   375         if(off >= size)
   376             return 0;
   377 
   378         ret = ctx->parser.text(MD_TEXT_NULLCHAR, _T(""), 1, ctx->userdata);
   379         if(ret != 0)
   380             return ret;
   381         off++;
   382     }
   383 }
   384 
   385 
   /* Stores the value of the expression in ret and jumps to the abort label
    * of the enclosing function when that value is negative. */
   #define MD_CHECK(func)                                                      \
       do {                                                                    \
           if((ret = (func)) < 0)                                              \
               goto abort;                                                     \
       } while(0)
   392 
   393 
   /* Makes sure ctx->buffer can hold at least sz bytes, growing it with
    * realloc() when it is too small (the new size adds 50 % plus some slack
    * and is rounded down to a multiple of 128).
    *
    * On allocation failure the old buffer stays valid, ret is set to -1 and
    * control jumps to the abort label of the enclosing function.
    *
    * The argument is parenthesized everywhere, so expressions like
    * (cond ? a : b) are compared as a whole. */
   #define MD_TEMP_BUFFER(sz)                                                  \
       do {                                                                    \
           if((sz) > ctx->alloc_buffer) {                                      \
               CHAR* new_buffer;                                               \
               SZ new_size = ((sz) + (sz) / 2 + 128) & ~127;                   \
                                                                               \
               new_buffer = realloc(ctx->buffer, new_size);                    \
               if(new_buffer == NULL) {                                        \
                   MD_LOG("realloc() failed.");                                \
                   ret = -1;                                                   \
                   goto abort;                                                 \
               }                                                               \
                                                                               \
               ctx->buffer = new_buffer;                                       \
               ctx->alloc_buffer = new_size;                                   \
           }                                                                   \
       } while(0)
   411 
   412 
   /* Wrappers around the enter/leave callbacks of the parser. Each one stores
    * the callback's return value in ret; a non-zero value is logged and makes
    * the enclosing function jump to its abort label. */

   #define MD_ENTER_BLOCK(type, arg)                                           \
       do {                                                                    \
           if((ret = ctx->parser.enter_block((type), (arg), ctx->userdata)) != 0) { \
               MD_LOG("Aborted from enter_block() callback.");                 \
               goto abort;                                                     \
           }                                                                   \
       } while(0)

   #define MD_LEAVE_BLOCK(type, arg)                                           \
       do {                                                                    \
           if((ret = ctx->parser.leave_block((type), (arg), ctx->userdata)) != 0) { \
               MD_LOG("Aborted from leave_block() callback.");                 \
               goto abort;                                                     \
           }                                                                   \
       } while(0)

   #define MD_ENTER_SPAN(type, arg)                                            \
       do {                                                                    \
           if((ret = ctx->parser.enter_span((type), (arg), ctx->userdata)) != 0) { \
               MD_LOG("Aborted from enter_span() callback.");                  \
               goto abort;                                                     \
           }                                                                   \
       } while(0)

   #define MD_LEAVE_SPAN(type, arg)                                            \
       do {                                                                    \
           if((ret = ctx->parser.leave_span((type), (arg), ctx->userdata)) != 0) { \
               MD_LOG("Aborted from leave_span() callback.");                  \
               goto abort;                                                     \
           }                                                                   \
       } while(0)
   448 
   /* Reports a piece of text through the text() callback; nothing happens
    * for an empty text. A non-zero return value of the callback is stored in
    * ret, logged, and makes the enclosing function jump to its abort label.
    * All arguments are parenthesized so arbitrary expressions are safe. */
   #define MD_TEXT(type, str, size)                                            \
       do {                                                                    \
           if((size) > 0) {                                                    \
               ret = ctx->parser.text((type), (str), (size), ctx->userdata);   \
               if(ret != 0) {                                                  \
                   MD_LOG("Aborted from text() callback.");                    \
                   goto abort;                                                 \
               }                                                               \
           }                                                                   \
       } while(0)

   /* Same as MD_TEXT(), but the text goes through
    * md_text_with_null_replacement() rather than directly to the callback. */
   #define MD_TEXT_INSECURE(type, str, size)                                   \
       do {                                                                    \
           if((size) > 0) {                                                    \
               ret = md_text_with_null_replacement(ctx, (type), (str), (size)); \
               if(ret != 0) {                                                  \
                   MD_LOG("Aborted from text() callback.");                    \
                   goto abort;                                                 \
               }                                                               \
           }                                                                   \
       } while(0)
   470 
   471 
   472 /* If the offset falls into a gap between line, we return the following
   473  * line. */
   474 static const MD_LINE*
   475 md_lookup_line(OFF off, const MD_LINE* lines, int n_lines)
   476 {
   477     int lo, hi;
   478     int pivot;
   479     const MD_LINE* line;
   480 
   481     lo = 0;
   482     hi = n_lines - 1;
   483     while(lo <= hi) {
   484         pivot = (lo + hi) / 2;
   485         line = &lines[pivot];
   486 
   487         if(off < line->beg) {
   488             hi = pivot - 1;
   489             if(hi < 0  ||  lines[hi].end <= off)
   490                 return line;
   491         } else if(off > line->end) {
   492             lo = pivot + 1;
   493         } else {
   494             return line;
   495         }
   496     }
   497 
   498     return NULL;
   499 }
   500 
   501 
   502 /*************************
   503  ***  Unicode Support  ***
   504  *************************/
   505 
   /* Up to three code points together with the count of those in use. */
   typedef struct MD_UNICODE_FOLD_INFO_tag {
       unsigned codepoints[3];
       unsigned n_codepoints;
   } MD_UNICODE_FOLD_INFO;
   511 
   512 
   513 #if defined MD4C_USE_UTF16 || defined MD4C_USE_UTF8
   /* Binary search in a sorted "map" of code points.
    *
    * A map record is either a single code point, or a range stored as two
    * consecutive records: (MIN_CODEPOINT | 0x40000000) followed by
    * (MAX_CODEPOINT | 0x80000000).
    *
    * Returns the index of the matching record (for a range, the index of
    * its MIN record), or -1 when the code point is not in the map. */
   static int
   md_unicode_bsearch__(unsigned codepoint, const unsigned* map, size_t map_size)
   {
       const unsigned RANGE_MIN_FLAG = 0x40000000u;
       const unsigned RANGE_MAX_FLAG = 0x80000000u;
       const unsigned CODEPOINT_MASK = 0x00ffffffu;
       int lo = 0;
       int hi = (int) map_size - 1;

       while(lo <= hi) {
           /* The probed record may be either end of a range; widen the probe
            * so that [first, last] always covers the whole record. */
           int first = (lo + hi) / 2;
           int last = first;

           if(map[last] & RANGE_MIN_FLAG)
               last++;
           if(map[first] & RANGE_MAX_FLAG)
               first--;

           if(codepoint < (map[first] & CODEPOINT_MASK)) {
               hi = first - 1;
               continue;
           }
           if(codepoint > (map[last] & CODEPOINT_MASK)) {
               lo = last + 1;
               continue;
           }

           return first;
       }

       return -1;
   }
   546 
   /* Returns non-zero if the code point counts as whitespace: for code
    * points up to 0x7f the ISWHITESPACE_() check decides, the others are
    * looked up in the table of the Unicode "Zs" category. */
   static int
   md_is_unicode_whitespace__(unsigned codepoint)
   {
   #define R(cp_min, cp_max)   ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
   #define S(cp)               (cp)
       /* Unicode "Zs" category.
        * (generated by scripts/build_whitespace_map.py) */
       static const unsigned WHITESPACE_MAP[] = {
           S(0x0020), S(0x00a0), S(0x1680), R(0x2000,0x200a), S(0x202f), S(0x205f), S(0x3000)
       };
   #undef R
   #undef S

       if(codepoint > 0x7f)
           return (md_unicode_bsearch__(codepoint, WHITESPACE_MAP, SIZEOF_ARRAY(WHITESPACE_MAP)) >= 0);

       /* The ASCII range is by far the most frequent one, and the CommonMark
        * specification asks for a few more characters in it. */
       return ISWHITESPACE_(codepoint);
   }
   567 
   /* Returns non-zero if the code point counts as punctuation: for code
    * points up to 0x7f the ISPUNCT_() check decides, the others are looked
    * up in the table of Unicode punctuation categories below. */
   static int
   md_is_unicode_punct__(unsigned codepoint)
   {
   #define R(cp_min, cp_max)   ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
   #define S(cp)               (cp)
       /* Unicode "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" categories.
        * (generated by scripts/build_punct_map.py) */
       static const unsigned PUNCT_MAP[] = {
           R(0x0021,0x0023), R(0x0025,0x002a), R(0x002c,0x002f), R(0x003a,0x003b), R(0x003f,0x0040),
           R(0x005b,0x005d), S(0x005f), S(0x007b), S(0x007d), S(0x00a1), S(0x00a7), S(0x00ab), R(0x00b6,0x00b7),
           S(0x00bb), S(0x00bf), S(0x037e), S(0x0387), R(0x055a,0x055f), R(0x0589,0x058a), S(0x05be), S(0x05c0),
           S(0x05c3), S(0x05c6), R(0x05f3,0x05f4), R(0x0609,0x060a), R(0x060c,0x060d), S(0x061b), R(0x061e,0x061f),
           R(0x066a,0x066d), S(0x06d4), R(0x0700,0x070d), R(0x07f7,0x07f9), R(0x0830,0x083e), S(0x085e),
           R(0x0964,0x0965), S(0x0970), S(0x09fd), S(0x0a76), S(0x0af0), S(0x0c77), S(0x0c84), S(0x0df4), S(0x0e4f),
           R(0x0e5a,0x0e5b), R(0x0f04,0x0f12), S(0x0f14), R(0x0f3a,0x0f3d), S(0x0f85), R(0x0fd0,0x0fd4),
           R(0x0fd9,0x0fda), R(0x104a,0x104f), S(0x10fb), R(0x1360,0x1368), S(0x1400), S(0x166e), R(0x169b,0x169c),
           R(0x16eb,0x16ed), R(0x1735,0x1736), R(0x17d4,0x17d6), R(0x17d8,0x17da), R(0x1800,0x180a),
           R(0x1944,0x1945), R(0x1a1e,0x1a1f), R(0x1aa0,0x1aa6), R(0x1aa8,0x1aad), R(0x1b5a,0x1b60),
           R(0x1bfc,0x1bff), R(0x1c3b,0x1c3f), R(0x1c7e,0x1c7f), R(0x1cc0,0x1cc7), S(0x1cd3), R(0x2010,0x2027),
           R(0x2030,0x2043), R(0x2045,0x2051), R(0x2053,0x205e), R(0x207d,0x207e), R(0x208d,0x208e),
           R(0x2308,0x230b), R(0x2329,0x232a), R(0x2768,0x2775), R(0x27c5,0x27c6), R(0x27e6,0x27ef),
           R(0x2983,0x2998), R(0x29d8,0x29db), R(0x29fc,0x29fd), R(0x2cf9,0x2cfc), R(0x2cfe,0x2cff), S(0x2d70),
           R(0x2e00,0x2e2e), R(0x2e30,0x2e4f), S(0x2e52), R(0x3001,0x3003), R(0x3008,0x3011), R(0x3014,0x301f),
           S(0x3030), S(0x303d), S(0x30a0), S(0x30fb), R(0xa4fe,0xa4ff), R(0xa60d,0xa60f), S(0xa673), S(0xa67e),
           R(0xa6f2,0xa6f7), R(0xa874,0xa877), R(0xa8ce,0xa8cf), R(0xa8f8,0xa8fa), S(0xa8fc), R(0xa92e,0xa92f),
           S(0xa95f), R(0xa9c1,0xa9cd), R(0xa9de,0xa9df), R(0xaa5c,0xaa5f), R(0xaade,0xaadf), R(0xaaf0,0xaaf1),
           S(0xabeb), R(0xfd3e,0xfd3f), R(0xfe10,0xfe19), R(0xfe30,0xfe52), R(0xfe54,0xfe61), S(0xfe63), S(0xfe68),
           R(0xfe6a,0xfe6b), R(0xff01,0xff03), R(0xff05,0xff0a), R(0xff0c,0xff0f), R(0xff1a,0xff1b),
           R(0xff1f,0xff20), R(0xff3b,0xff3d), S(0xff3f), S(0xff5b), S(0xff5d), R(0xff5f,0xff65), R(0x10100,0x10102),
           S(0x1039f), S(0x103d0), S(0x1056f), S(0x10857), S(0x1091f), S(0x1093f), R(0x10a50,0x10a58), S(0x10a7f),
           R(0x10af0,0x10af6), R(0x10b39,0x10b3f), R(0x10b99,0x10b9c), S(0x10ead), R(0x10f55,0x10f59),
           R(0x11047,0x1104d), R(0x110bb,0x110bc), R(0x110be,0x110c1), R(0x11140,0x11143), R(0x11174,0x11175),
           R(0x111c5,0x111c8), S(0x111cd), S(0x111db), R(0x111dd,0x111df), R(0x11238,0x1123d), S(0x112a9),
           R(0x1144b,0x1144f), R(0x1145a,0x1145b), S(0x1145d), S(0x114c6), R(0x115c1,0x115d7), R(0x11641,0x11643),
           R(0x11660,0x1166c), R(0x1173c,0x1173e), S(0x1183b), R(0x11944,0x11946), S(0x119e2), R(0x11a3f,0x11a46),
           R(0x11a9a,0x11a9c), R(0x11a9e,0x11aa2), R(0x11c41,0x11c45), R(0x11c70,0x11c71), R(0x11ef7,0x11ef8),
           S(0x11fff), R(0x12470,0x12474), R(0x16a6e,0x16a6f), S(0x16af5), R(0x16b37,0x16b3b), S(0x16b44),
           R(0x16e97,0x16e9a), S(0x16fe2), S(0x1bc9f), R(0x1da87,0x1da8b), R(0x1e95e,0x1e95f)
       };
   #undef R
   #undef S

       if(codepoint > 0x7f)
           return (md_unicode_bsearch__(codepoint, PUNCT_MAP, SIZEOF_ARRAY(PUNCT_MAP)) >= 0);

       /* The ASCII range is by far the most frequent one, and the CommonMark
        * specification asks for a few more characters in it. */
       return ISPUNCT_(codepoint);
   }
   617 
   618     static void
   619     md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info)
   620     {
   621 #define R(cp_min, cp_max)   ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
   622 #define S(cp)               (cp)
   623         /* Unicode case folding map; the matching folded values are stored in FOLD_MAP_1_DATA.
   624          * (generated by scripts/build_folding_map.py) */
   625         static const unsigned FOLD_MAP_1[] = {
   626             R(0x0041,0x005a), S(0x00b5), R(0x00c0,0x00d6), R(0x00d8,0x00de), R(0x0100,0x012e), R(0x0132,0x0136),
   627             R(0x0139,0x0147), R(0x014a,0x0176), S(0x0178), R(0x0179,0x017d), S(0x017f), S(0x0181), S(0x0182),
   628             S(0x0184), S(0x0186), S(0x0187), S(0x0189), S(0x018a), S(0x018b), S(0x018e), S(0x018f), S(0x0190),
   629             S(0x0191), S(0x0193), S(0x0194), S(0x0196), S(0x0197), S(0x0198), S(0x019c), S(0x019d), S(0x019f),
   630             R(0x01a0,0x01a4), S(0x01a6), S(0x01a7), S(0x01a9), S(0x01ac), S(0x01ae), S(0x01af), S(0x01b1), S(0x01b2),
   631             S(0x01b3), S(0x01b5), S(0x01b7), S(0x01b8), S(0x01bc), S(0x01c4), S(0x01c5), S(0x01c7), S(0x01c8),
   632             S(0x01ca), R(0x01cb,0x01db), R(0x01de,0x01ee), S(0x01f1), S(0x01f2), S(0x01f4), S(0x01f6), S(0x01f7),
   633             R(0x01f8,0x021e), S(0x0220), R(0x0222,0x0232), S(0x023a), S(0x023b), S(0x023d), S(0x023e), S(0x0241),
   634             S(0x0243), S(0x0244), S(0x0245), R(0x0246,0x024e), S(0x0345), S(0x0370), S(0x0372), S(0x0376), S(0x037f),
   635             S(0x0386), R(0x0388,0x038a), S(0x038c), S(0x038e), S(0x038f), R(0x0391,0x03a1), R(0x03a3,0x03ab),
   636             S(0x03c2), S(0x03cf), S(0x03d0), S(0x03d1), S(0x03d5), S(0x03d6), R(0x03d8,0x03ee), S(0x03f0), S(0x03f1),
   637             S(0x03f4), S(0x03f5), S(0x03f7), S(0x03f9), S(0x03fa), R(0x03fd,0x03ff), R(0x0400,0x040f),
   638             R(0x0410,0x042f), R(0x0460,0x0480), R(0x048a,0x04be), S(0x04c0), R(0x04c1,0x04cd), R(0x04d0,0x052e),
   639             R(0x0531,0x0556), R(0x10a0,0x10c5), S(0x10c7), S(0x10cd), R(0x13f8,0x13fd), S(0x1c80), S(0x1c81),
   640             S(0x1c82), S(0x1c83), S(0x1c84), S(0x1c85), S(0x1c86), S(0x1c87), S(0x1c88), R(0x1c90,0x1cba),
   641             R(0x1cbd,0x1cbf), R(0x1e00,0x1e94), S(0x1e9b), R(0x1ea0,0x1efe), R(0x1f08,0x1f0f), R(0x1f18,0x1f1d),
   642             R(0x1f28,0x1f2f), R(0x1f38,0x1f3f), R(0x1f48,0x1f4d), S(0x1f59), S(0x1f5b), S(0x1f5d), S(0x1f5f),
   643             R(0x1f68,0x1f6f), S(0x1fb8), S(0x1fb9), S(0x1fba), S(0x1fbb), S(0x1fbe), R(0x1fc8,0x1fcb), S(0x1fd8),
   644             S(0x1fd9), S(0x1fda), S(0x1fdb), S(0x1fe8), S(0x1fe9), S(0x1fea), S(0x1feb), S(0x1fec), S(0x1ff8),
   645             S(0x1ff9), S(0x1ffa), S(0x1ffb), S(0x2126), S(0x212a), S(0x212b), S(0x2132), R(0x2160,0x216f), S(0x2183),
   646             R(0x24b6,0x24cf), R(0x2c00,0x2c2e), S(0x2c60), S(0x2c62), S(0x2c63), S(0x2c64), R(0x2c67,0x2c6b),
   647             S(0x2c6d), S(0x2c6e), S(0x2c6f), S(0x2c70), S(0x2c72), S(0x2c75), S(0x2c7e), S(0x2c7f), R(0x2c80,0x2ce2),
   648             S(0x2ceb), S(0x2ced), S(0x2cf2), R(0xa640,0xa66c), R(0xa680,0xa69a), R(0xa722,0xa72e), R(0xa732,0xa76e),
   649             S(0xa779), S(0xa77b), S(0xa77d), R(0xa77e,0xa786), S(0xa78b), S(0xa78d), S(0xa790), S(0xa792),
   650             R(0xa796,0xa7a8), S(0xa7aa), S(0xa7ab), S(0xa7ac), S(0xa7ad), S(0xa7ae), S(0xa7b0), S(0xa7b1), S(0xa7b2),
   651             S(0xa7b3), R(0xa7b4,0xa7be), S(0xa7c2), S(0xa7c4), S(0xa7c5), S(0xa7c6), S(0xa7c7), S(0xa7c9), S(0xa7f5),
   652             R(0xab70,0xabbf), R(0xff21,0xff3a), R(0x10400,0x10427), R(0x104b0,0x104d3), R(0x10c80,0x10cb2),
   653             R(0x118a0,0x118bf), R(0x16e40,0x16e5f), R(0x1e900,0x1e921)
   654         };
   655         static const unsigned FOLD_MAP_1_DATA[] = {
   656             0x0061, 0x007a, 0x03bc, 0x00e0, 0x00f6, 0x00f8, 0x00fe, 0x0101, 0x012f, 0x0133, 0x0137, 0x013a, 0x0148,
   657             0x014b, 0x0177, 0x00ff, 0x017a, 0x017e, 0x0073, 0x0253, 0x0183, 0x0185, 0x0254, 0x0188, 0x0256, 0x0257,
   658             0x018c, 0x01dd, 0x0259, 0x025b, 0x0192, 0x0260, 0x0263, 0x0269, 0x0268, 0x0199, 0x026f, 0x0272, 0x0275,
   659             0x01a1, 0x01a5, 0x0280, 0x01a8, 0x0283, 0x01ad, 0x0288, 0x01b0, 0x028a, 0x028b, 0x01b4, 0x01b6, 0x0292,
   660             0x01b9, 0x01bd, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01cc, 0x01cc, 0x01dc, 0x01df, 0x01ef, 0x01f3, 0x01f3,
   661             0x01f5, 0x0195, 0x01bf, 0x01f9, 0x021f, 0x019e, 0x0223, 0x0233, 0x2c65, 0x023c, 0x019a, 0x2c66, 0x0242,
   662             0x0180, 0x0289, 0x028c, 0x0247, 0x024f, 0x03b9, 0x0371, 0x0373, 0x0377, 0x03f3, 0x03ac, 0x03ad, 0x03af,
   663             0x03cc, 0x03cd, 0x03ce, 0x03b1, 0x03c1, 0x03c3, 0x03cb, 0x03c3, 0x03d7, 0x03b2, 0x03b8, 0x03c6, 0x03c0,
   664             0x03d9, 0x03ef, 0x03ba, 0x03c1, 0x03b8, 0x03b5, 0x03f8, 0x03f2, 0x03fb, 0x037b, 0x037d, 0x0450, 0x045f,
   665             0x0430, 0x044f, 0x0461, 0x0481, 0x048b, 0x04bf, 0x04cf, 0x04c2, 0x04ce, 0x04d1, 0x052f, 0x0561, 0x0586,
   666             0x2d00, 0x2d25, 0x2d27, 0x2d2d, 0x13f0, 0x13f5, 0x0432, 0x0434, 0x043e, 0x0441, 0x0442, 0x0442, 0x044a,
   667             0x0463, 0xa64b, 0x10d0, 0x10fa, 0x10fd, 0x10ff, 0x1e01, 0x1e95, 0x1e61, 0x1ea1, 0x1eff, 0x1f00, 0x1f07,
   668             0x1f10, 0x1f15, 0x1f20, 0x1f27, 0x1f30, 0x1f37, 0x1f40, 0x1f45, 0x1f51, 0x1f53, 0x1f55, 0x1f57, 0x1f60,
   669             0x1f67, 0x1fb0, 0x1fb1, 0x1f70, 0x1f71, 0x03b9, 0x1f72, 0x1f75, 0x1fd0, 0x1fd1, 0x1f76, 0x1f77, 0x1fe0,
   670             0x1fe1, 0x1f7a, 0x1f7b, 0x1fe5, 0x1f78, 0x1f79, 0x1f7c, 0x1f7d, 0x03c9, 0x006b, 0x00e5, 0x214e, 0x2170,
   671             0x217f, 0x2184, 0x24d0, 0x24e9, 0x2c30, 0x2c5e, 0x2c61, 0x026b, 0x1d7d, 0x027d, 0x2c68, 0x2c6c, 0x0251,
   672             0x0271, 0x0250, 0x0252, 0x2c73, 0x2c76, 0x023f, 0x0240, 0x2c81, 0x2ce3, 0x2cec, 0x2cee, 0x2cf3, 0xa641,
   673             0xa66d, 0xa681, 0xa69b, 0xa723, 0xa72f, 0xa733, 0xa76f, 0xa77a, 0xa77c, 0x1d79, 0xa77f, 0xa787, 0xa78c,
   674             0x0265, 0xa791, 0xa793, 0xa797, 0xa7a9, 0x0266, 0x025c, 0x0261, 0x026c, 0x026a, 0x029e, 0x0287, 0x029d,
   675             0xab53, 0xa7b5, 0xa7bf, 0xa7c3, 0xa794, 0x0282, 0x1d8e, 0xa7c8, 0xa7ca, 0xa7f6, 0x13a0, 0x13ef, 0xff41,
   676             0xff5a, 0x10428, 0x1044f, 0x104d8, 0x104fb, 0x10cc0, 0x10cf2, 0x118c0, 0x118df, 0x16e60, 0x16e7f, 0x1e922,
   677             0x1e943
   678         };
   679         static const unsigned FOLD_MAP_2[] = {
   680             S(0x00df), S(0x0130), S(0x0149), S(0x01f0), S(0x0587), S(0x1e96), S(0x1e97), S(0x1e98), S(0x1e99),
   681             S(0x1e9a), S(0x1e9e), S(0x1f50), R(0x1f80,0x1f87), R(0x1f88,0x1f8f), R(0x1f90,0x1f97), R(0x1f98,0x1f9f),
   682             R(0x1fa0,0x1fa7), R(0x1fa8,0x1faf), S(0x1fb2), S(0x1fb3), S(0x1fb4), S(0x1fb6), S(0x1fbc), S(0x1fc2),
   683             S(0x1fc3), S(0x1fc4), S(0x1fc6), S(0x1fcc), S(0x1fd6), S(0x1fe4), S(0x1fe6), S(0x1ff2), S(0x1ff3),
   684             S(0x1ff4), S(0x1ff6), S(0x1ffc), S(0xfb00), S(0xfb01), S(0xfb02), S(0xfb05), S(0xfb06), S(0xfb13),
   685             S(0xfb14), S(0xfb15), S(0xfb16), S(0xfb17)
   686         };
   687         static const unsigned FOLD_MAP_2_DATA[] = {
   688             0x0073,0x0073, 0x0069,0x0307, 0x02bc,0x006e, 0x006a,0x030c, 0x0565,0x0582, 0x0068,0x0331, 0x0074,0x0308,
   689             0x0077,0x030a, 0x0079,0x030a, 0x0061,0x02be, 0x0073,0x0073, 0x03c5,0x0313, 0x1f00,0x03b9, 0x1f07,0x03b9,
   690             0x1f00,0x03b9, 0x1f07,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f60,0x03b9,
   691             0x1f67,0x03b9, 0x1f60,0x03b9, 0x1f67,0x03b9, 0x1f70,0x03b9, 0x03b1,0x03b9, 0x03ac,0x03b9, 0x03b1,0x0342,
   692             0x03b1,0x03b9, 0x1f74,0x03b9, 0x03b7,0x03b9, 0x03ae,0x03b9, 0x03b7,0x0342, 0x03b7,0x03b9, 0x03b9,0x0342,
   693             0x03c1,0x0313, 0x03c5,0x0342, 0x1f7c,0x03b9, 0x03c9,0x03b9, 0x03ce,0x03b9, 0x03c9,0x0342, 0x03c9,0x03b9,
   694             0x0066,0x0066, 0x0066,0x0069, 0x0066,0x006c, 0x0073,0x0074, 0x0073,0x0074, 0x0574,0x0576, 0x0574,0x0565,
   695             0x0574,0x056b, 0x057e,0x0576, 0x0574,0x056d
   696         };
   697         static const unsigned FOLD_MAP_3[] = {
   698             S(0x0390), S(0x03b0), S(0x1f52), S(0x1f54), S(0x1f56), S(0x1fb7), S(0x1fc7), S(0x1fd2), S(0x1fd3),
   699             S(0x1fd7), S(0x1fe2), S(0x1fe3), S(0x1fe7), S(0x1ff7), S(0xfb03), S(0xfb04)
   700         };
   701         static const unsigned FOLD_MAP_3_DATA[] = {
   702             0x03b9,0x0308,0x0301, 0x03c5,0x0308,0x0301, 0x03c5,0x0313,0x0300, 0x03c5,0x0313,0x0301,
   703             0x03c5,0x0313,0x0342, 0x03b1,0x0342,0x03b9, 0x03b7,0x0342,0x03b9, 0x03b9,0x0308,0x0300,
   704             0x03b9,0x0308,0x0301, 0x03b9,0x0308,0x0342, 0x03c5,0x0308,0x0300, 0x03c5,0x0308,0x0301,
   705             0x03c5,0x0308,0x0342, 0x03c9,0x0342,0x03b9, 0x0066,0x0066,0x0069, 0x0066,0x0066,0x006c
   706         };
   707 #undef R
   708 #undef S
   709         static const struct {
   710             const unsigned* map;
   711             const unsigned* data;
   712             size_t map_size;
   713             unsigned n_codepoints;
   714         } FOLD_MAP_LIST[] = {
   715             { FOLD_MAP_1, FOLD_MAP_1_DATA, SIZEOF_ARRAY(FOLD_MAP_1), 1 },
   716             { FOLD_MAP_2, FOLD_MAP_2_DATA, SIZEOF_ARRAY(FOLD_MAP_2), 2 },
   717             { FOLD_MAP_3, FOLD_MAP_3_DATA, SIZEOF_ARRAY(FOLD_MAP_3), 3 }
   718         };
   719 
   720         int i;
   721 
   722         /* Fast path for ASCII characters. */
   723         if(codepoint <= 0x7f) {
   724             info->codepoints[0] = codepoint;
   725             if(ISUPPER_(codepoint))
   726                 info->codepoints[0] += 'a' - 'A';
   727             info->n_codepoints = 1;
   728             return;
   729         }
   730 
   731         /* Try to locate the codepoint in any of the maps. */
   732         for(i = 0; i < (int) SIZEOF_ARRAY(FOLD_MAP_LIST); i++) {
   733             int index;
   734 
   735             index = md_unicode_bsearch__(codepoint, FOLD_MAP_LIST[i].map, FOLD_MAP_LIST[i].map_size);
   736             if(index >= 0) {
   737                 /* Found the mapping. */
   738                 unsigned n_codepoints = FOLD_MAP_LIST[i].n_codepoints;
   739                 const unsigned* map = FOLD_MAP_LIST[i].map;
   740                 const unsigned* codepoints = FOLD_MAP_LIST[i].data + (index * n_codepoints);
   741 
   742                 memcpy(info->codepoints, codepoints, sizeof(unsigned) * n_codepoints);
   743                 info->n_codepoints = n_codepoints;
   744 
   745                 if(FOLD_MAP_LIST[i].map[index] != codepoint) {
   746                     /* The found mapping maps whole range of codepoints,
   747                      * i.e. we have to offset info->codepoints[0] accordingly. */
   748                     if((map[index] & 0x00ffffff)+1 == codepoints[0]) {
   749                         /* Alternating type of the range. */
   750                         info->codepoints[0] = codepoint + ((codepoint & 0x1) == (map[index] & 0x1) ? 1 : 0);
   751                     } else {
   752                         /* Range to range kind of mapping. */
   753                         info->codepoints[0] += (codepoint - (map[index] & 0x00ffffff));
   754                     }
   755                 }
   756 
   757                 return;
   758             }
   759         }
   760 
   761         /* No mapping found. Map the codepoint to itself. */
   762         info->codepoints[0] = codepoint;
   763         info->n_codepoints = 1;
   764     }
   765 #endif
   766 
   767 
   768 #if defined MD4C_USE_UTF16
   769     #define IS_UTF16_SURROGATE_HI(word)     (((WORD)(word) & 0xfc00) == 0xd800)
   770     #define IS_UTF16_SURROGATE_LO(word)     (((WORD)(word) & 0xfc00) == 0xdc00)
   771     #define UTF16_DECODE_SURROGATE(hi, lo)  (0x10000 + ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0)))
   772 
   773     static unsigned
   774     md_decode_utf16le__(const CHAR* str, SZ str_size, SZ* p_size)
   775     {
   776         if(IS_UTF16_SURROGATE_HI(str[0])) {
   777             if(1 < str_size && IS_UTF16_SURROGATE_LO(str[1])) {
   778                 if(p_size != NULL)
   779                     *p_size = 2;
   780                 return UTF16_DECODE_SURROGATE(str[0], str[1]);
   781             }
   782         }
   783 
   784         if(p_size != NULL)
   785             *p_size = 1;
   786         return str[0];
   787     }
   788 
   789     static unsigned
   790     md_decode_utf16le_before__(MD_CTX* ctx, OFF off)
   791     {
   792         if(off > 2 && IS_UTF16_SURROGATE_HI(CH(off-2)) && IS_UTF16_SURROGATE_LO(CH(off-1)))
   793             return UTF16_DECODE_SURROGATE(CH(off-2), CH(off-1));
   794 
   795         return CH(off);
   796     }
   797 
   798     /* No whitespace uses surrogates, so no decoding needed here. */
   799     #define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint)
   800     #define ISUNICODEWHITESPACE(off)        md_is_unicode_whitespace__(CH(off))
   801     #define ISUNICODEWHITESPACEBEFORE(off)  md_is_unicode_whitespace__(CH((off)-1))
   802 
   803     #define ISUNICODEPUNCT(off)             md_is_unicode_punct__(md_decode_utf16le__(STR(off), ctx->size - (off), NULL))
   804     #define ISUNICODEPUNCTBEFORE(off)       md_is_unicode_punct__(md_decode_utf16le_before__(ctx, off))
   805 
   806     static inline int
   807     md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size)
   808     {
   809         return md_decode_utf16le__(str+off, str_size-off, p_char_size);
   810     }
   811 #elif defined MD4C_USE_UTF8
   812     #define IS_UTF8_LEAD1(byte)     ((unsigned char)(byte) <= 0x7f)
   813     #define IS_UTF8_LEAD2(byte)     (((unsigned char)(byte) & 0xe0) == 0xc0)
   814     #define IS_UTF8_LEAD3(byte)     (((unsigned char)(byte) & 0xf0) == 0xe0)
   815     #define IS_UTF8_LEAD4(byte)     (((unsigned char)(byte) & 0xf8) == 0xf0)
   816     #define IS_UTF8_TAIL(byte)      (((unsigned char)(byte) & 0xc0) == 0x80)
   817 
   818     static unsigned
   819     md_decode_utf8__(const CHAR* str, SZ str_size, SZ* p_size)
   820     {
   821         if(!IS_UTF8_LEAD1(str[0])) {
   822             if(IS_UTF8_LEAD2(str[0])) {
   823                 if(1 < str_size && IS_UTF8_TAIL(str[1])) {
   824                     if(p_size != NULL)
   825                         *p_size = 2;
   826 
   827                     return (((unsigned int)str[0] & 0x1f) << 6) |
   828                            (((unsigned int)str[1] & 0x3f) << 0);
   829                 }
   830             } else if(IS_UTF8_LEAD3(str[0])) {
   831                 if(2 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2])) {
   832                     if(p_size != NULL)
   833                         *p_size = 3;
   834 
   835                     return (((unsigned int)str[0] & 0x0f) << 12) |
   836                            (((unsigned int)str[1] & 0x3f) << 6) |
   837                            (((unsigned int)str[2] & 0x3f) << 0);
   838                 }
   839             } else if(IS_UTF8_LEAD4(str[0])) {
   840                 if(3 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2]) && IS_UTF8_TAIL(str[3])) {
   841                     if(p_size != NULL)
   842                         *p_size = 4;
   843 
   844                     return (((unsigned int)str[0] & 0x07) << 18) |
   845                            (((unsigned int)str[1] & 0x3f) << 12) |
   846                            (((unsigned int)str[2] & 0x3f) << 6) |
   847                            (((unsigned int)str[3] & 0x3f) << 0);
   848                 }
   849             }
   850         }
   851 
   852         if(p_size != NULL)
   853             *p_size = 1;
   854         return (unsigned) str[0];
   855     }
   856 
   857     static unsigned
   858     md_decode_utf8_before__(MD_CTX* ctx, OFF off)
   859     {
   860         if(!IS_UTF8_LEAD1(CH(off-1))) {
   861             if(off > 1 && IS_UTF8_LEAD2(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
   862                 return (((unsigned int)CH(off-2) & 0x1f) << 6) |
   863                        (((unsigned int)CH(off-1) & 0x3f) << 0);
   864 
   865             if(off > 2 && IS_UTF8_LEAD3(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
   866                 return (((unsigned int)CH(off-3) & 0x0f) << 12) |
   867                        (((unsigned int)CH(off-2) & 0x3f) << 6) |
   868                        (((unsigned int)CH(off-1) & 0x3f) << 0);
   869 
   870             if(off > 3 && IS_UTF8_LEAD4(CH(off-4)) && IS_UTF8_TAIL(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
   871                 return (((unsigned int)CH(off-4) & 0x07) << 18) |
   872                        (((unsigned int)CH(off-3) & 0x3f) << 12) |
   873                        (((unsigned int)CH(off-2) & 0x3f) << 6) |
   874                        (((unsigned int)CH(off-1) & 0x3f) << 0);
   875         }
   876 
   877         return (unsigned) CH(off-1);
   878     }
   879 
   880     #define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint)
   881     #define ISUNICODEWHITESPACE(off)        md_is_unicode_whitespace__(md_decode_utf8__(STR(off), ctx->size - (off), NULL))
   882     #define ISUNICODEWHITESPACEBEFORE(off)  md_is_unicode_whitespace__(md_decode_utf8_before__(ctx, off))
   883 
   884     #define ISUNICODEPUNCT(off)             md_is_unicode_punct__(md_decode_utf8__(STR(off), ctx->size - (off), NULL))
   885     #define ISUNICODEPUNCTBEFORE(off)       md_is_unicode_punct__(md_decode_utf8_before__(ctx, off))
   886 
   887     static inline unsigned
   888     md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size)
   889     {
   890         return md_decode_utf8__(str+off, str_size-off, p_char_size);
   891     }
   892 #else
   893     #define ISUNICODEWHITESPACE_(codepoint) ISWHITESPACE_(codepoint)
   894     #define ISUNICODEWHITESPACE(off)        ISWHITESPACE(off)
   895     #define ISUNICODEWHITESPACEBEFORE(off)  ISWHITESPACE((off)-1)
   896 
   897     #define ISUNICODEPUNCT(off)             ISPUNCT(off)
   898     #define ISUNICODEPUNCTBEFORE(off)       ISPUNCT((off)-1)
   899 
   900     static inline void
   901     md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info)
   902     {
   903         info->codepoints[0] = codepoint;
   904         if(ISUPPER_(codepoint))
   905             info->codepoints[0] += 'a' - 'A';
   906         info->n_codepoints = 1;
   907     }
   908 
   909     static inline unsigned
   910     md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_size)
   911     {
   912         *p_size = 1;
   913         return (unsigned) str[off];
   914     }
   915 #endif
   916 
   917 
   918 /*************************************
   919  ***  Helper string manipulations  ***
   920  *************************************/
   921 
   922 /* Fill buffer with copy of the string between 'beg' and 'end' but replace any
   923  * line breaks with given replacement character.
   924  *
   925  * NOTE: Caller is responsible to make sure the buffer is large enough.
   926  * (Given the output is always shorter then input, (end - beg) is good idea
   927  * what the caller should allocate.)
   928  */
   929 static void
   930 md_merge_lines(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines,
   931                CHAR line_break_replacement_char, CHAR* buffer, SZ* p_size)
   932 {
   933     CHAR* ptr = buffer;
   934     int line_index = 0;
   935     OFF off = beg;
   936 
   937     MD_UNUSED(n_lines);
   938 
   939     while(1) {
   940         const MD_LINE* line = &lines[line_index];
   941         OFF line_end = line->end;
   942         if(end < line_end)
   943             line_end = end;
   944 
   945         while(off < line_end) {
   946             *ptr = CH(off);
   947             ptr++;
   948             off++;
   949         }
   950 
   951         if(off >= end) {
   952             *p_size = (MD_SIZE)(ptr - buffer);
   953             return;
   954         }
   955 
   956         *ptr = line_break_replacement_char;
   957         ptr++;
   958 
   959         line_index++;
   960         off = lines[line_index].beg;
   961     }
   962 }
   963 
   964 /* Wrapper of md_merge_lines() which allocates new buffer for the output string.
   965  */
   966 static int
   967 md_merge_lines_alloc(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines,
   968                     CHAR line_break_replacement_char, CHAR** p_str, SZ* p_size)
   969 {
   970     CHAR* buffer;
   971 
   972     buffer = (CHAR*) malloc(sizeof(CHAR) * (end - beg));
   973     if(buffer == NULL) {
   974         MD_LOG("malloc() failed.");
   975         return -1;
   976     }
   977 
   978     md_merge_lines(ctx, beg, end, lines, n_lines,
   979                 line_break_replacement_char, buffer, p_size);
   980 
   981     *p_str = buffer;
   982     return 0;
   983 }
   984 
   985 static OFF
   986 md_skip_unicode_whitespace(const CHAR* label, OFF off, SZ size)
   987 {
   988     SZ char_size;
   989     unsigned codepoint;
   990 
   991     while(off < size) {
   992         codepoint = md_decode_unicode(label, off, size, &char_size);
   993         if(!ISUNICODEWHITESPACE_(codepoint)  &&  !ISNEWLINE_(label[off]))
   994             break;
   995         off += char_size;
   996     }
   997 
   998     return off;
   999 }
  1000 
  1001 
  1002 /******************************
  1003  ***  Recognizing raw HTML  ***
  1004  ******************************/
  1005 
  1006 /* md_is_html_tag() may be called when processing inlines (inline raw HTML)
  1007  * or when breaking document to blocks (checking for start of HTML block type 7).
  1008  *
  1009  * When breaking document to blocks, we do not yet know line boundaries, but
  1010  * in that case the whole tag has to live on a single line. We distinguish this
  1011  * by n_lines == 0.
  1012  */
  1013 static int
  1014 md_is_html_tag(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
  1015 {
  1016     int attr_state;
  1017     OFF off = beg;
  1018     OFF line_end = (n_lines > 0) ? lines[0].end : ctx->size;
  1019     int i = 0;
  1020 
  1021     MD_ASSERT(CH(beg) == _T('<'));
  1022 
  1023     if(off + 1 >= line_end)
  1024         return FALSE;
  1025     off++;
  1026 
  1027     /* For parsing attributes, we need a little state automaton below.
  1028      * State -1: no attributes are allowed.
  1029      * State 0: attribute could follow after some whitespace.
  1030      * State 1: after a whitespace (attribute name may follow).
  1031      * State 2: after attribute name ('=' MAY follow).
  1032      * State 3: after '=' (value specification MUST follow).
  1033      * State 41: in middle of unquoted attribute value.
  1034      * State 42: in middle of single-quoted attribute value.
  1035      * State 43: in middle of double-quoted attribute value.
  1036      */
  1037     attr_state = 0;
  1038 
  1039     if(CH(off) == _T('/')) {
  1040         /* Closer tag "</ ... >". No attributes may be present. */
  1041         attr_state = -1;
  1042         off++;
  1043     }
  1044 
  1045     /* Tag name */
  1046     if(off >= line_end  ||  !ISALPHA(off))
  1047         return FALSE;
  1048     off++;
  1049     while(off < line_end  &&  (ISALNUM(off)  ||  CH(off) == _T('-')))
  1050         off++;
  1051 
  1052     /* (Optional) attributes (if not closer), (optional) '/' (if not closer)
  1053      * and final '>'. */
  1054     while(1) {
  1055         while(off < line_end  &&  !ISNEWLINE(off)) {
  1056             if(attr_state > 40) {
  1057                 if(attr_state == 41 && (ISBLANK(off) || ISANYOF(off, _T("\"'=<>`")))) {
  1058                     attr_state = 0;
  1059                     off--;  /* Put the char back for re-inspection in the new state. */
  1060                 } else if(attr_state == 42 && CH(off) == _T('\'')) {
  1061                     attr_state = 0;
  1062                 } else if(attr_state == 43 && CH(off) == _T('"')) {
  1063                     attr_state = 0;
  1064                 }
  1065                 off++;
  1066             } else if(ISWHITESPACE(off)) {
  1067                 if(attr_state == 0)
  1068                     attr_state = 1;
  1069                 off++;
  1070             } else if(attr_state <= 2 && CH(off) == _T('>')) {
  1071                 /* End. */
  1072                 goto done;
  1073             } else if(attr_state <= 2 && CH(off) == _T('/') && off+1 < line_end && CH(off+1) == _T('>')) {
  1074                 /* End with digraph '/>' */
  1075                 off++;
  1076                 goto done;
  1077             } else if((attr_state == 1 || attr_state == 2) && (ISALPHA(off) || CH(off) == _T('_') || CH(off) == _T(':'))) {
  1078                 off++;
  1079                 /* Attribute name */
  1080                 while(off < line_end && (ISALNUM(off) || ISANYOF(off, _T("_.:-"))))
  1081                     off++;
  1082                 attr_state = 2;
  1083             } else if(attr_state == 2 && CH(off) == _T('=')) {
  1084                 /* Attribute assignment sign */
  1085                 off++;
  1086                 attr_state = 3;
  1087             } else if(attr_state == 3) {
  1088                 /* Expecting start of attribute value. */
  1089                 if(CH(off) == _T('"'))
  1090                     attr_state = 43;
  1091                 else if(CH(off) == _T('\''))
  1092                     attr_state = 42;
  1093                 else if(!ISANYOF(off, _T("\"'=<>`"))  &&  !ISNEWLINE(off))
  1094                     attr_state = 41;
  1095                 else
  1096                     return FALSE;
  1097                 off++;
  1098             } else {
  1099                 /* Anything unexpected. */
  1100                 return FALSE;
  1101             }
  1102         }
  1103 
  1104         /* We have to be on a single line. See definition of start condition
  1105          * of HTML block, type 7. */
  1106         if(n_lines == 0)
  1107             return FALSE;
  1108 
  1109         i++;
  1110         if(i >= n_lines)
  1111             return FALSE;
  1112 
  1113         off = lines[i].beg;
  1114         line_end = lines[i].end;
  1115 
  1116         if(attr_state == 0  ||  attr_state == 41)
  1117             attr_state = 1;
  1118 
  1119         if(off >= max_end)
  1120             return FALSE;
  1121     }
  1122 
  1123 done:
  1124     if(off >= max_end)
  1125         return FALSE;
  1126 
  1127     *p_end = off+1;
  1128     return TRUE;
  1129 }
  1130 
  1131 static int
  1132 md_scan_for_html_closer(MD_CTX* ctx, const MD_CHAR* str, MD_SIZE len,
  1133                         const MD_LINE* lines, int n_lines,
  1134                         OFF beg, OFF max_end, OFF* p_end,
  1135                         OFF* p_scan_horizon)
  1136 {
  1137     OFF off = beg;
  1138     int i = 0;
  1139 
  1140     if(off < *p_scan_horizon  &&  *p_scan_horizon >= max_end - len) {
  1141         /* We have already scanned the range up to the max_end so we know
  1142          * there is nothing to see. */
  1143         return FALSE;
  1144     }
  1145 
  1146     while(TRUE) {
  1147         while(off + len <= lines[i].end  &&  off + len <= max_end) {
  1148             if(md_ascii_eq(STR(off), str, len)) {
  1149                 /* Success. */
  1150                 *p_end = off + len;
  1151                 return TRUE;
  1152             }
  1153             off++;
  1154         }
  1155 
  1156         i++;
  1157         if(off >= max_end  ||  i >= n_lines) {
  1158             /* Failure. */
  1159             *p_scan_horizon = off;
  1160             return FALSE;
  1161         }
  1162 
  1163         off = lines[i].beg;
  1164     }
  1165 }
  1166 
  1167 static int
  1168 md_is_html_comment(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
  1169 {
  1170     OFF off = beg;
  1171 
  1172     MD_ASSERT(CH(beg) == _T('<'));
  1173 
  1174     if(off + 4 >= lines[0].end)
  1175         return FALSE;
  1176     if(CH(off+1) != _T('!')  ||  CH(off+2) != _T('-')  ||  CH(off+3) != _T('-'))
  1177         return FALSE;
  1178     off += 4;
  1179 
  1180     /* ">" and "->" must not follow the opening. */
  1181     if(off < lines[0].end  &&  CH(off) == _T('>'))
  1182         return FALSE;
  1183     if(off+1 < lines[0].end  &&  CH(off) == _T('-')  &&  CH(off+1) == _T('>'))
  1184         return FALSE;
  1185 
  1186     /* HTML comment must not contain "--", so we scan just for "--" instead
  1187      * of "-->" and verify manually that '>' follows. */
  1188     if(md_scan_for_html_closer(ctx, _T("--"), 2,
  1189                 lines, n_lines, off, max_end, p_end, &ctx->html_comment_horizon))
  1190     {
  1191         if(*p_end < max_end  &&  CH(*p_end) == _T('>')) {
  1192             *p_end = *p_end + 1;
  1193             return TRUE;
  1194         }
  1195     }
  1196 
  1197     return FALSE;
  1198 }
  1199 
  1200 static int
  1201 md_is_html_processing_instruction(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
  1202 {
  1203     OFF off = beg;
  1204 
  1205     if(off + 2 >= lines[0].end)
  1206         return FALSE;
  1207     if(CH(off+1) != _T('?'))
  1208         return FALSE;
  1209     off += 2;
  1210 
  1211     return md_scan_for_html_closer(ctx, _T("?>"), 2,
  1212                 lines, n_lines, off, max_end, p_end, &ctx->html_proc_instr_horizon);
  1213 }
  1214 
  1215 static int
  1216 md_is_html_declaration(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
  1217 {
  1218     OFF off = beg;
  1219 
  1220     if(off + 2 >= lines[0].end)
  1221         return FALSE;
  1222     if(CH(off+1) != _T('!'))
  1223         return FALSE;
  1224     off += 2;
  1225 
  1226     /* Declaration name. */
  1227     if(off >= lines[0].end  ||  !ISALPHA(off))
  1228         return FALSE;
  1229     off++;
  1230     while(off < lines[0].end  &&  ISALPHA(off))
  1231         off++;
  1232     if(off < lines[0].end  &&  !ISWHITESPACE(off))
  1233         return FALSE;
  1234 
  1235     return md_scan_for_html_closer(ctx, _T(">"), 1,
  1236                 lines, n_lines, off, max_end, p_end, &ctx->html_decl_horizon);
  1237 }
  1238 
  1239 static int
  1240 md_is_html_cdata(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
  1241 {
  1242     static const CHAR open_str[] = _T("<![CDATA[");
  1243     static const SZ open_size = SIZEOF_ARRAY(open_str) - 1;
  1244 
  1245     OFF off = beg;
  1246 
  1247     if(off + open_size >= lines[0].end)
  1248         return FALSE;
  1249     if(memcmp(STR(off), open_str, open_size) != 0)
  1250         return FALSE;
  1251     off += open_size;
  1252 
  1253     if(lines[n_lines-1].end < max_end)
  1254         max_end = lines[n_lines-1].end - 2;
  1255 
  1256     return md_scan_for_html_closer(ctx, _T("]]>"), 3,
  1257                 lines, n_lines, off, max_end, p_end, &ctx->html_cdata_horizon);
  1258 }
  1259 
  1260 static int
  1261 md_is_html_any(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
  1262 {
  1263     MD_ASSERT(CH(beg) == _T('<'));
  1264     return (md_is_html_tag(ctx, lines, n_lines, beg, max_end, p_end)  ||
  1265             md_is_html_comment(ctx, lines, n_lines, beg, max_end, p_end)  ||
  1266             md_is_html_processing_instruction(ctx, lines, n_lines, beg, max_end, p_end)  ||
  1267             md_is_html_declaration(ctx, lines, n_lines, beg, max_end, p_end)  ||
  1268             md_is_html_cdata(ctx, lines, n_lines, beg, max_end, p_end));
  1269 }
  1270 
  1271 
  1272 /****************************
  1273  ***  Recognizing Entity  ***
  1274  ****************************/
  1275 
  1276 static int
  1277 md_is_hex_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
  1278 {
  1279     OFF off = beg;
  1280     MD_UNUSED(ctx);
  1281 
  1282     while(off < max_end  &&  ISXDIGIT_(text[off])  &&  off - beg <= 8)
  1283         off++;
  1284 
  1285     if(1 <= off - beg  &&  off - beg <= 6) {
  1286         *p_end = off;
  1287         return TRUE;
  1288     } else {
  1289         return FALSE;
  1290     }
  1291 }
  1292 
  1293 static int
  1294 md_is_dec_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
  1295 {
  1296     OFF off = beg;
  1297     MD_UNUSED(ctx);
  1298 
  1299     while(off < max_end  &&  ISDIGIT_(text[off])  &&  off - beg <= 8)
  1300         off++;
  1301 
  1302     if(1 <= off - beg  &&  off - beg <= 7) {
  1303         *p_end = off;
  1304         return TRUE;
  1305     } else {
  1306         return FALSE;
  1307     }
  1308 }
  1309 
  1310 static int
  1311 md_is_named_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
  1312 {
  1313     OFF off = beg;
  1314     MD_UNUSED(ctx);
  1315 
  1316     if(off < max_end  &&  ISALPHA_(text[off]))
  1317         off++;
  1318     else
  1319         return FALSE;
  1320 
  1321     while(off < max_end  &&  ISALNUM_(text[off])  &&  off - beg <= 48)
  1322         off++;
  1323 
  1324     if(2 <= off - beg  &&  off - beg <= 48) {
  1325         *p_end = off;
  1326         return TRUE;
  1327     } else {
  1328         return FALSE;
  1329     }
  1330 }
  1331 
  1332 static int
  1333 md_is_entity_str(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
  1334 {
  1335     int is_contents;
  1336     OFF off = beg;
  1337 
  1338     MD_ASSERT(text[off] == _T('&'));
  1339     off++;
  1340 
  1341     if(off+2 < max_end  &&  text[off] == _T('#')  &&  (text[off+1] == _T('x') || text[off+1] == _T('X')))
  1342         is_contents = md_is_hex_entity_contents(ctx, text, off+2, max_end, &off);
  1343     else if(off+1 < max_end  &&  text[off] == _T('#'))
  1344         is_contents = md_is_dec_entity_contents(ctx, text, off+1, max_end, &off);
  1345     else
  1346         is_contents = md_is_named_entity_contents(ctx, text, off, max_end, &off);
  1347 
  1348     if(is_contents  &&  off < max_end  &&  text[off] == _T(';')) {
  1349         *p_end = off+1;
  1350         return TRUE;
  1351     } else {
  1352         return FALSE;
  1353     }
  1354 }
  1355 
  1356 static inline int
  1357 md_is_entity(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
  1358 {
  1359     return md_is_entity_str(ctx, ctx->text, beg, max_end, p_end);
  1360 }
  1361 
  1362 
  1363 /******************************
  1364  ***  Attribute Management  ***
  1365  ******************************/
  1366 
  1367 typedef struct MD_ATTRIBUTE_BUILD_tag MD_ATTRIBUTE_BUILD;
  1368 struct MD_ATTRIBUTE_BUILD_tag {
  1369     CHAR* text;
  1370     MD_TEXTTYPE* substr_types;
  1371     OFF* substr_offsets;
  1372     int substr_count;
  1373     int substr_alloc;
  1374     MD_TEXTTYPE trivial_types[1];
  1375     OFF trivial_offsets[2];
  1376 };
  1377 
  1378 
  1379 #define MD_BUILD_ATTR_NO_ESCAPES    0x0001
  1380 
  1381 static int
  1382 md_build_attr_append_substr(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build,
  1383                             MD_TEXTTYPE type, OFF off)
  1384 {
  1385     if(build->substr_count >= build->substr_alloc) {
  1386         MD_TEXTTYPE* new_substr_types;
  1387         OFF* new_substr_offsets;
  1388 
  1389         build->substr_alloc = (build->substr_alloc > 0
  1390                 ? build->substr_alloc + build->substr_alloc / 2
  1391                 : 8);
  1392         new_substr_types = (MD_TEXTTYPE*) realloc(build->substr_types,
  1393                                     build->substr_alloc * sizeof(MD_TEXTTYPE));
  1394         if(new_substr_types == NULL) {
  1395             MD_LOG("realloc() failed.");
  1396             return -1;
  1397         }
  1398         /* Note +1 to reserve space for final offset (== raw_size). */
  1399         new_substr_offsets = (OFF*) realloc(build->substr_offsets,
  1400                                     (build->substr_alloc+1) * sizeof(OFF));
  1401         if(new_substr_offsets == NULL) {
  1402             MD_LOG("realloc() failed.");
  1403             free(new_substr_types);
  1404             return -1;
  1405         }
  1406 
  1407         build->substr_types = new_substr_types;
  1408         build->substr_offsets = new_substr_offsets;
  1409     }
  1410 
  1411     build->substr_types[build->substr_count] = type;
  1412     build->substr_offsets[build->substr_count] = off;
  1413     build->substr_count++;
  1414     return 0;
  1415 }
  1416 
  1417 static void
  1418 md_free_attribute(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build)
  1419 {
  1420     MD_UNUSED(ctx);
  1421 
  1422     if(build->substr_alloc > 0) {
  1423         free(build->text);
  1424         free(build->substr_types);
  1425         free(build->substr_offsets);
  1426     }
  1427 }
  1428 
  1429 static int
  1430 md_build_attribute(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size,
  1431                    unsigned flags, MD_ATTRIBUTE* attr, MD_ATTRIBUTE_BUILD* build)
  1432 {
  1433     OFF raw_off, off;
  1434     int is_trivial;
  1435     int ret = 0;
  1436 
  1437     memset(build, 0, sizeof(MD_ATTRIBUTE_BUILD));
  1438 
  1439     /* If there is no backslash and no ampersand, build trivial attribute
  1440      * without any malloc(). */
  1441     is_trivial = TRUE;
  1442     for(raw_off = 0; raw_off < raw_size; raw_off++) {
  1443         if(ISANYOF3_(raw_text[raw_off], _T('\\'), _T('&'), _T('\0'))) {
  1444             is_trivial = FALSE;
  1445             break;
  1446         }
  1447     }
  1448 
  1449     if(is_trivial) {
  1450         build->text = (CHAR*) (raw_size ? raw_text : NULL);
  1451         build->substr_types = build->trivial_types;
  1452         build->substr_offsets = build->trivial_offsets;
  1453         build->substr_count = 1;
  1454         build->substr_alloc = 0;
  1455         build->trivial_types[0] = MD_TEXT_NORMAL;
  1456         build->trivial_offsets[0] = 0;
  1457         build->trivial_offsets[1] = raw_size;
  1458         off = raw_size;
  1459     } else {
  1460         build->text = (CHAR*) malloc(raw_size * sizeof(CHAR));
  1461         if(build->text == NULL) {
  1462             MD_LOG("malloc() failed.");
  1463             goto abort;
  1464         }
  1465 
  1466         raw_off = 0;
  1467         off = 0;
  1468 
  1469         while(raw_off < raw_size) {
  1470             if(raw_text[raw_off] == _T('\0')) {
  1471                 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NULLCHAR, off));
  1472                 memcpy(build->text + off, raw_text + raw_off, 1);
  1473                 off++;
  1474                 raw_off++;
  1475                 continue;
  1476             }
  1477 
  1478             if(raw_text[raw_off] == _T('&')) {
  1479                 OFF ent_end;
  1480 
  1481                 if(md_is_entity_str(ctx, raw_text, raw_off, raw_size, &ent_end)) {
  1482                     MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_ENTITY, off));
  1483                     memcpy(build->text + off, raw_text + raw_off, ent_end - raw_off);
  1484                     off += ent_end - raw_off;
  1485                     raw_off = ent_end;
  1486                     continue;
  1487                 }
  1488             }
  1489 
  1490             if(build->substr_count == 0  ||  build->substr_types[build->substr_count-1] != MD_TEXT_NORMAL)
  1491                 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NORMAL, off));
  1492 
  1493             if(!(flags & MD_BUILD_ATTR_NO_ESCAPES)  &&
  1494                raw_text[raw_off] == _T('\\')  &&  raw_off+1 < raw_size  &&
  1495                (ISPUNCT_(raw_text[raw_off+1]) || ISNEWLINE_(raw_text[raw_off+1])))
  1496                 raw_off++;
  1497 
  1498             build->text[off++] = raw_text[raw_off++];
  1499         }
  1500         build->substr_offsets[build->substr_count] = off;
  1501     }
  1502 
  1503     attr->text = build->text;
  1504     attr->size = off;
  1505     attr->substr_offsets = build->substr_offsets;
  1506     attr->substr_types = build->substr_types;
  1507     return 0;
  1508 
  1509 abort:
  1510     md_free_attribute(ctx, build);
  1511     return -1;
  1512 }
  1513 
  1514 
  1515 /*********************************************
  1516  ***  Dictionary of Reference Definitions  ***
  1517  *********************************************/
  1518 
  1519 #define MD_FNV1A_BASE       2166136261U
  1520 #define MD_FNV1A_PRIME      16777619U
  1521 
  1522 static inline unsigned
  1523 md_fnv1a(unsigned base, const void* data, size_t n)
  1524 {
  1525     const unsigned char* buf = (const unsigned char*) data;
  1526     unsigned hash = base;
  1527     size_t i;
  1528 
  1529     for(i = 0; i < n; i++) {
  1530         hash ^= buf[i];
  1531         hash *= MD_FNV1A_PRIME;
  1532     }
  1533 
  1534     return hash;
  1535 }
  1536 
  1537 
  1538 struct MD_REF_DEF_tag {
  1539     CHAR* label;
  1540     CHAR* title;
  1541     unsigned hash;
  1542     SZ label_size;
  1543     SZ title_size;
  1544     OFF dest_beg;
  1545     OFF dest_end;
  1546     unsigned char label_needs_free : 1;
  1547     unsigned char title_needs_free : 1;
  1548 };
  1549 
  1550 /* Label equivalence is quite complicated with regards to whitespace and case
  1551  * folding. This complicates computing a hash of it as well as direct comparison
  1552  * of two labels. */
  1553 
  1554 static unsigned
  1555 md_link_label_hash(const CHAR* label, SZ size)
  1556 {
  1557     unsigned hash = MD_FNV1A_BASE;
  1558     OFF off;
  1559     unsigned codepoint;
  1560     int is_whitespace = FALSE;
  1561 
  1562     off = md_skip_unicode_whitespace(label, 0, size);
  1563     while(off < size) {
  1564         SZ char_size;
  1565 
  1566         codepoint = md_decode_unicode(label, off, size, &char_size);
  1567         is_whitespace = ISUNICODEWHITESPACE_(codepoint) || ISNEWLINE_(label[off]);
  1568 
  1569         if(is_whitespace) {
  1570             codepoint = ' ';
  1571             hash = md_fnv1a(hash, &codepoint, sizeof(unsigned));
  1572             off = md_skip_unicode_whitespace(label, off, size);
  1573         } else {
  1574             MD_UNICODE_FOLD_INFO fold_info;
  1575 
  1576             md_get_unicode_fold_info(codepoint, &fold_info);
  1577             hash = md_fnv1a(hash, fold_info.codepoints, fold_info.n_codepoints * sizeof(unsigned));
  1578             off += char_size;
  1579         }
  1580     }
  1581 
  1582     return hash;
  1583 }
  1584 
  1585 static OFF
  1586 md_link_label_cmp_load_fold_info(const CHAR* label, OFF off, SZ size,
  1587                                  MD_UNICODE_FOLD_INFO* fold_info)
  1588 {
  1589     unsigned codepoint;
  1590     SZ char_size;
  1591 
  1592     if(off >= size) {
  1593         /* Treat end of a link label as a whitespace. */
  1594         goto whitespace;
  1595     }
  1596 
  1597     codepoint = md_decode_unicode(label, off, size, &char_size);
  1598     off += char_size;
  1599     if(ISUNICODEWHITESPACE_(codepoint)) {
  1600         /* Treat all whitespace as equivalent */
  1601         goto whitespace;
  1602     }
  1603 
  1604     /* Get real folding info. */
  1605     md_get_unicode_fold_info(codepoint, fold_info);
  1606     return off;
  1607 
  1608 whitespace:
  1609     fold_info->codepoints[0] = _T(' ');
  1610     fold_info->n_codepoints = 1;
  1611     return md_skip_unicode_whitespace(label, off, size);
  1612 }
  1613 
  1614 static int
  1615 md_link_label_cmp(const CHAR* a_label, SZ a_size, const CHAR* b_label, SZ b_size)
  1616 {
  1617     OFF a_off;
  1618     OFF b_off;
  1619     MD_UNICODE_FOLD_INFO a_fi = { { 0 }, 0 };
  1620     MD_UNICODE_FOLD_INFO b_fi = { { 0 }, 0 };
  1621     OFF a_fi_off = 0;
  1622     OFF b_fi_off = 0;
  1623     int cmp;
  1624 
  1625     a_off = md_skip_unicode_whitespace(a_label, 0, a_size);
  1626     b_off = md_skip_unicode_whitespace(b_label, 0, b_size);
  1627     while(a_off < a_size || a_fi_off < a_fi.n_codepoints ||
  1628           b_off < b_size || b_fi_off < b_fi.n_codepoints)
  1629     {
  1630         /* If needed, load fold info for next char. */
  1631         if(a_fi_off >= a_fi.n_codepoints) {
  1632             a_fi_off = 0;
  1633             a_off = md_link_label_cmp_load_fold_info(a_label, a_off, a_size, &a_fi);
  1634         }
  1635         if(b_fi_off >= b_fi.n_codepoints) {
  1636             b_fi_off = 0;
  1637             b_off = md_link_label_cmp_load_fold_info(b_label, b_off, b_size, &b_fi);
  1638         }
  1639 
  1640         cmp = b_fi.codepoints[b_fi_off] - a_fi.codepoints[a_fi_off];
  1641         if(cmp != 0)
  1642             return cmp;
  1643 
  1644         a_fi_off++;
  1645         b_fi_off++;
  1646     }
  1647 
  1648     return 0;
  1649 }
  1650 
  1651 typedef struct MD_REF_DEF_LIST_tag MD_REF_DEF_LIST;
  1652 struct MD_REF_DEF_LIST_tag {
  1653     int n_ref_defs;
  1654     int alloc_ref_defs;
  1655     MD_REF_DEF* ref_defs[];  /* Valid items always  point into ctx->ref_defs[] */
  1656 };
  1657 
  1658 static int
  1659 md_ref_def_cmp(const void* a, const void* b)
  1660 {
  1661     const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
  1662     const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
  1663 
  1664     if(a_ref->hash < b_ref->hash)
  1665         return -1;
  1666     else if(a_ref->hash > b_ref->hash)
  1667         return +1;
  1668     else
  1669         return md_link_label_cmp(a_ref->label, a_ref->label_size, b_ref->label, b_ref->label_size);
  1670 }
  1671 
  1672 static int
  1673 md_ref_def_cmp_for_sort(const void* a, const void* b)
  1674 {
  1675     int cmp;
  1676 
  1677     cmp = md_ref_def_cmp(a, b);
  1678 
  1679     /* Ensure stability of the sorting. */
  1680     if(cmp == 0) {
  1681         const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
  1682         const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
  1683 
  1684         if(a_ref < b_ref)
  1685             cmp = -1;
  1686         else if(a_ref > b_ref)
  1687             cmp = +1;
  1688         else
  1689             cmp = 0;
  1690     }
  1691 
  1692     return cmp;
  1693 }
  1694 
  1695 static int
  1696 md_build_ref_def_hashtable(MD_CTX* ctx)
  1697 {
  1698     int i, j;
  1699 
  1700     if(ctx->n_ref_defs == 0)
  1701         return 0;
  1702 
  1703     ctx->ref_def_hashtable_size = (ctx->n_ref_defs * 5) / 4;
  1704     ctx->ref_def_hashtable = malloc(ctx->ref_def_hashtable_size * sizeof(void*));
  1705     if(ctx->ref_def_hashtable == NULL) {
  1706         MD_LOG("malloc() failed.");
  1707         goto abort;
  1708     }
  1709     memset(ctx->ref_def_hashtable, 0, ctx->ref_def_hashtable_size * sizeof(void*));
  1710 
  1711     /* Each member of ctx->ref_def_hashtable[] can be:
  1712      *  -- NULL,
  1713      *  -- pointer to the MD_REF_DEF in ctx->ref_defs[], or
  1714      *  -- pointer to a MD_REF_DEF_LIST, which holds multiple pointers to
  1715      *     such MD_REF_DEFs.
  1716      */
  1717     for(i = 0; i < ctx->n_ref_defs; i++) {
  1718         MD_REF_DEF* def = &ctx->ref_defs[i];
  1719         void* bucket;
  1720         MD_REF_DEF_LIST* list;
  1721 
  1722         def->hash = md_link_label_hash(def->label, def->label_size);
  1723         bucket = ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size];
  1724 
  1725         if(bucket == NULL) {
  1726             /* The bucket is empty. Make it just point to the def. */
  1727             ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = def;
  1728             continue;
  1729         }
  1730 
  1731         if(ctx->ref_defs <= (MD_REF_DEF*) bucket  &&  (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
  1732             /* The bucket already contains one ref. def. Lets see whether it
  1733              * is the same label (ref. def. duplicate) or different one
  1734              * (hash conflict). */
  1735             MD_REF_DEF* old_def = (MD_REF_DEF*) bucket;
  1736 
  1737             if(md_link_label_cmp(def->label, def->label_size, old_def->label, old_def->label_size) == 0) {
  1738                 /* Duplicate label: Ignore this ref. def. */
  1739                 continue;
  1740             }
  1741 
  1742             /* Make the bucket complex, i.e. able to hold more ref. defs. */
  1743             list = (MD_REF_DEF_LIST*) malloc(sizeof(MD_REF_DEF_LIST) + 2 * sizeof(MD_REF_DEF*));
  1744             if(list == NULL) {
  1745                 MD_LOG("malloc() failed.");
  1746                 goto abort;
  1747             }
  1748             list->ref_defs[0] = old_def;
  1749             list->ref_defs[1] = def;
  1750             list->n_ref_defs = 2;
  1751             list->alloc_ref_defs = 2;
  1752             ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
  1753             continue;
  1754         }
  1755 
  1756         /* Append the def to the complex bucket list.
  1757          *
  1758          * Note in this case we ignore potential duplicates to avoid expensive
  1759          * iterating over the complex bucket. Below, we revisit all the complex
  1760          * buckets and handle it more cheaply after the complex bucket contents
  1761          * is sorted. */
  1762         list = (MD_REF_DEF_LIST*) bucket;
  1763         if(list->n_ref_defs >= list->alloc_ref_defs) {
  1764             int alloc_ref_defs = list->alloc_ref_defs + list->alloc_ref_defs / 2;
  1765             MD_REF_DEF_LIST* list_tmp = (MD_REF_DEF_LIST*) realloc(list,
  1766                         sizeof(MD_REF_DEF_LIST) + alloc_ref_defs * sizeof(MD_REF_DEF*));
  1767             if(list_tmp == NULL) {
  1768                 MD_LOG("realloc() failed.");
  1769                 goto abort;
  1770             }
  1771             list = list_tmp;
  1772             list->alloc_ref_defs = alloc_ref_defs;
  1773             ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
  1774         }
  1775 
  1776         list->ref_defs[list->n_ref_defs] = def;
  1777         list->n_ref_defs++;
  1778     }
  1779 
  1780     /* Sort the complex buckets so we can use bsearch() with them. */
  1781     for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
  1782         void* bucket = ctx->ref_def_hashtable[i];
  1783         MD_REF_DEF_LIST* list;
  1784 
  1785         if(bucket == NULL)
  1786             continue;
  1787         if(ctx->ref_defs <= (MD_REF_DEF*) bucket  &&  (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
  1788             continue;
  1789 
  1790         list = (MD_REF_DEF_LIST*) bucket;
  1791         qsort(list->ref_defs, list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp_for_sort);
  1792 
  1793         /* Disable all duplicates in the complex bucket by forcing all such
  1794          * records to point to the 1st such ref. def. I.e. no matter which
  1795          * record is found during the lookup, it will always point to the right
  1796          * ref. def. in ctx->ref_defs[]. */
  1797         for(j = 1; j < list->n_ref_defs; j++) {
  1798             if(md_ref_def_cmp(&list->ref_defs[j-1], &list->ref_defs[j]) == 0)
  1799                 list->ref_defs[j] = list->ref_defs[j-1];
  1800         }
  1801     }
  1802 
  1803     return 0;
  1804 
  1805 abort:
  1806     return -1;
  1807 }
  1808 
  1809 static void
  1810 md_free_ref_def_hashtable(MD_CTX* ctx)
  1811 {
  1812     if(ctx->ref_def_hashtable != NULL) {
  1813         int i;
  1814 
  1815         for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
  1816             void* bucket = ctx->ref_def_hashtable[i];
  1817             if(bucket == NULL)
  1818                 continue;
  1819             if(ctx->ref_defs <= (MD_REF_DEF*) bucket  &&  (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
  1820                 continue;
  1821             free(bucket);
  1822         }
  1823 
  1824         free(ctx->ref_def_hashtable);
  1825     }
  1826 }
  1827 
  1828 static const MD_REF_DEF*
  1829 md_lookup_ref_def(MD_CTX* ctx, const CHAR* label, SZ label_size)
  1830 {
  1831     unsigned hash;
  1832     void* bucket;
  1833 
  1834     if(ctx->ref_def_hashtable_size == 0)
  1835         return NULL;
  1836 
  1837     hash = md_link_label_hash(label, label_size);
  1838     bucket = ctx->ref_def_hashtable[hash % ctx->ref_def_hashtable_size];
  1839 
  1840     if(bucket == NULL) {
  1841         return NULL;
  1842     } else if(ctx->ref_defs <= (MD_REF_DEF*) bucket  &&  (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
  1843         const MD_REF_DEF* def = (MD_REF_DEF*) bucket;
  1844 
  1845         if(md_link_label_cmp(def->label, def->label_size, label, label_size) == 0)
  1846             return def;
  1847         else
  1848             return NULL;
  1849     } else {
  1850         MD_REF_DEF_LIST* list = (MD_REF_DEF_LIST*) bucket;
  1851         MD_REF_DEF key_buf;
  1852         const MD_REF_DEF* key = &key_buf;
  1853         const MD_REF_DEF** ret;
  1854 
  1855         key_buf.label = (CHAR*) label;
  1856         key_buf.label_size = label_size;
  1857         key_buf.hash = md_link_label_hash(key_buf.label, key_buf.label_size);
  1858 
  1859         ret = (const MD_REF_DEF**) bsearch(&key, list->ref_defs,
  1860                     list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp);
  1861         if(ret != NULL)
  1862             return *ret;
  1863         else
  1864             return NULL;
  1865     }
  1866 }
  1867 
  1868 
  1869 /***************************
  1870  ***  Recognizing Links  ***
  1871  ***************************/
  1872 
  1873 /* Note this code is partially shared between processing inlines and blocks
  1874  * as reference definitions and links share some helper parser functions.
  1875  */
  1876 
  1877 typedef struct MD_LINK_ATTR_tag MD_LINK_ATTR;
  1878 struct MD_LINK_ATTR_tag {
  1879     OFF dest_beg;
  1880     OFF dest_end;
  1881 
  1882     CHAR* title;
  1883     SZ title_size;
  1884     int title_needs_free;
  1885 };
  1886 
  1887 
  1888 static int
  1889 md_is_link_label(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
  1890                  OFF* p_end, int* p_beg_line_index, int* p_end_line_index,
  1891                  OFF* p_contents_beg, OFF* p_contents_end)
  1892 {
  1893     OFF off = beg;
  1894     OFF contents_beg = 0;
  1895     OFF contents_end = 0;
  1896     int line_index = 0;
  1897     int len = 0;
  1898 
  1899     if(CH(off) != _T('['))
  1900         return FALSE;
  1901     off++;
  1902 
  1903     while(1) {
  1904         OFF line_end = lines[line_index].end;
  1905 
  1906         while(off < line_end) {
  1907             if(CH(off) == _T('\\')  &&  off+1 < ctx->size  &&  (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
  1908                 if(contents_end == 0) {
  1909                     contents_beg = off;
  1910                     *p_beg_line_index = line_index;
  1911                 }
  1912                 contents_end = off + 2;
  1913                 off += 2;
  1914             } else if(CH(off) == _T('[')) {
  1915                 return FALSE;
  1916             } else if(CH(off) == _T(']')) {
  1917                 if(contents_beg < contents_end) {
  1918                     /* Success. */
  1919                     *p_contents_beg = contents_beg;
  1920                     *p_contents_end = contents_end;
  1921                     *p_end = off+1;
  1922                     *p_end_line_index = line_index;
  1923                     return TRUE;
  1924                 } else {
  1925                     /* Link label must have some non-whitespace contents. */
  1926                     return FALSE;
  1927                 }
  1928             } else {
  1929                 unsigned codepoint;
  1930                 SZ char_size;
  1931 
  1932                 codepoint = md_decode_unicode(ctx->text, off, ctx->size, &char_size);
  1933                 if(!ISUNICODEWHITESPACE_(codepoint)) {
  1934                     if(contents_end == 0) {
  1935                         contents_beg = off;
  1936                         *p_beg_line_index = line_index;
  1937                     }
  1938                     contents_end = off + char_size;
  1939                 }
  1940 
  1941                 off += char_size;
  1942             }
  1943 
  1944             len++;
  1945             if(len > 999)
  1946                 return FALSE;
  1947         }
  1948 
  1949         line_index++;
  1950         len++;
  1951         if(line_index < n_lines)
  1952             off = lines[line_index].beg;
  1953         else
  1954             break;
  1955     }
  1956 
  1957     return FALSE;
  1958 }
  1959 
  1960 static int
  1961 md_is_link_destination_A(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
  1962                          OFF* p_contents_beg, OFF* p_contents_end)
  1963 {
  1964     OFF off = beg;
  1965 
  1966     if(off >= max_end  ||  CH(off) != _T('<'))
  1967         return FALSE;
  1968     off++;
  1969 
  1970     while(off < max_end) {
  1971         if(CH(off) == _T('\\')  &&  off+1 < max_end  &&  ISPUNCT(off+1)) {
  1972             off += 2;
  1973             continue;
  1974         }
  1975 
  1976         if(ISNEWLINE(off)  ||  CH(off) == _T('<'))
  1977             return FALSE;
  1978 
  1979         if(CH(off) == _T('>')) {
  1980             /* Success. */
  1981             *p_contents_beg = beg+1;
  1982             *p_contents_end = off;
  1983             *p_end = off+1;
  1984             return TRUE;
  1985         }
  1986 
  1987         off++;
  1988     }
  1989 
  1990     return FALSE;
  1991 }
  1992 
  1993 static int
  1994 md_is_link_destination_B(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
  1995                          OFF* p_contents_beg, OFF* p_contents_end)
  1996 {
  1997     OFF off = beg;
  1998     int parenthesis_level = 0;
  1999 
  2000     while(off < max_end) {
  2001         if(CH(off) == _T('\\')  &&  off+1 < max_end  &&  ISPUNCT(off+1)) {
  2002             off += 2;
  2003             continue;
  2004         }
  2005 
  2006         if(ISWHITESPACE(off) || ISCNTRL(off))
  2007             break;
  2008 
  2009         /* Link destination may include balanced pairs of unescaped '(' ')'.
  2010          * Note we limit the maximal nesting level by 32 to protect us from
  2011          * https://github.com/jgm/cmark/issues/214 */
  2012         if(CH(off) == _T('(')) {
  2013             parenthesis_level++;
  2014             if(parenthesis_level > 32)
  2015                 return FALSE;
  2016         } else if(CH(off) == _T(')')) {
  2017             if(parenthesis_level == 0)
  2018                 break;
  2019             parenthesis_level--;
  2020         }
  2021 
  2022         off++;
  2023     }
  2024 
  2025     if(parenthesis_level != 0  ||  off == beg)
  2026         return FALSE;
  2027 
  2028     /* Success. */
  2029     *p_contents_beg = beg;
  2030     *p_contents_end = off;
  2031     *p_end = off;
  2032     return TRUE;
  2033 }
  2034 
  2035 static inline int
  2036 md_is_link_destination(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
  2037                        OFF* p_contents_beg, OFF* p_contents_end)
  2038 {
  2039     if(CH(beg) == _T('<'))
  2040         return md_is_link_destination_A(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end);
  2041     else
  2042         return md_is_link_destination_B(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end);
  2043 }
  2044 
  2045 static int
  2046 md_is_link_title(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
  2047                  OFF* p_end, int* p_beg_line_index, int* p_end_line_index,
  2048                  OFF* p_contents_beg, OFF* p_contents_end)
  2049 {
  2050     OFF off = beg;
  2051     CHAR closer_char;
  2052     int line_index = 0;
  2053 
  2054     /* White space with up to one line break. */
  2055     while(off < lines[line_index].end  &&  ISWHITESPACE(off))
  2056         off++;
  2057     if(off >= lines[line_index].end) {
  2058         line_index++;
  2059         if(line_index >= n_lines)
  2060             return FALSE;
  2061         off = lines[line_index].beg;
  2062     }
  2063     if(off == beg)
  2064         return FALSE;
  2065 
  2066     *p_beg_line_index = line_index;
  2067 
  2068     /* First char determines how to detect end of it. */
  2069     switch(CH(off)) {
  2070         case _T('"'):   closer_char = _T('"'); break;
  2071         case _T('\''):  closer_char = _T('\''); break;
  2072         case _T('('):   closer_char = _T(')'); break;
  2073         default:        return FALSE;
  2074     }
  2075     off++;
  2076 
  2077     *p_contents_beg = off;
  2078 
  2079     while(line_index < n_lines) {
  2080         OFF line_end = lines[line_index].end;
  2081 
  2082         while(off < line_end) {
  2083             if(CH(off) == _T('\\')  &&  off+1 < ctx->size  &&  (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
  2084                 off++;
  2085             } else if(CH(off) == closer_char) {
  2086                 /* Success. */
  2087                 *p_contents_end = off;
  2088                 *p_end = off+1;
  2089                 *p_end_line_index = line_index;
  2090                 return TRUE;
  2091             } else if(closer_char == _T(')')  &&  CH(off) == _T('(')) {
  2092                 /* ()-style title cannot contain (unescaped '(')) */
  2093                 return FALSE;
  2094             }
  2095 
  2096             off++;
  2097         }
  2098 
  2099         line_index++;
  2100     }
  2101 
  2102     return FALSE;
  2103 }
  2104 
  2105 /* Returns 0 if it is not a reference definition.
  2106  *
  2107  * Returns N > 0 if it is a reference definition. N then corresponds to the
  2108  * number of lines forming it). In this case the definition is stored for
  2109  * resolving any links referring to it.
  2110  *
  2111  * Returns -1 in case of an error (out of memory).
  2112  */
  2113 static int
  2114 md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
  2115 {
  2116     OFF label_contents_beg;
  2117     OFF label_contents_end;
  2118     int label_contents_line_index = -1;
  2119     int label_is_multiline = FALSE;
  2120     OFF dest_contents_beg;
  2121     OFF dest_contents_end;
  2122     OFF title_contents_beg;
  2123     OFF title_contents_end;
  2124     int title_contents_line_index;
  2125     int title_is_multiline = FALSE;
  2126     OFF off;
  2127     int line_index = 0;
  2128     int tmp_line_index;
  2129     MD_REF_DEF* def = NULL;
  2130     int ret = 0;
  2131 
  2132     /* Link label. */
  2133     if(!md_is_link_label(ctx, lines, n_lines, lines[0].beg,
  2134                 &off, &label_contents_line_index, &line_index,
  2135                 &label_contents_beg, &label_contents_end))
  2136         return FALSE;
  2137     label_is_multiline = (label_contents_line_index != line_index);
  2138 
  2139     /* Colon. */
  2140     if(off >= lines[line_index].end  ||  CH(off) != _T(':'))
  2141         return FALSE;
  2142     off++;
  2143 
  2144     /* Optional white space with up to one line break. */
  2145     while(off < lines[line_index].end  &&  ISWHITESPACE(off))
  2146         off++;
  2147     if(off >= lines[line_index].end) {
  2148         line_index++;
  2149         if(line_index >= n_lines)
  2150             return FALSE;
  2151         off = lines[line_index].beg;
  2152     }
  2153 
  2154     /* Link destination. */
  2155     if(!md_is_link_destination(ctx, off, lines[line_index].end,
  2156                 &off, &dest_contents_beg, &dest_contents_end))
  2157         return FALSE;
  2158 
  2159     /* (Optional) title. Note we interpret it as an title only if nothing
  2160      * more follows on its last line. */
  2161     if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off,
  2162                 &off, &title_contents_line_index, &tmp_line_index,
  2163                 &title_contents_beg, &title_contents_end)
  2164         &&  off >= lines[line_index + tmp_line_index].end)
  2165     {
  2166         title_is_multiline = (tmp_line_index != title_contents_line_index);
  2167         title_contents_line_index += line_index;
  2168         line_index += tmp_line_index;
  2169     } else {
  2170         /* Not a title. */
  2171         title_is_multiline = FALSE;
  2172         title_contents_beg = off;
  2173         title_contents_end = off;
  2174         title_contents_line_index = 0;
  2175     }
  2176 
  2177     /* Nothing more can follow on the last line. */
  2178     if(off < lines[line_index].end)
  2179         return FALSE;
  2180 
  2181     /* So, it _is_ a reference definition. Remember it. */
  2182     if(ctx->n_ref_defs >= ctx->alloc_ref_defs) {
  2183         MD_REF_DEF* new_defs;
  2184 
  2185         ctx->alloc_ref_defs = (ctx->alloc_ref_defs > 0
  2186                 ? ctx->alloc_ref_defs + ctx->alloc_ref_defs / 2
  2187                 : 16);
  2188         new_defs = (MD_REF_DEF*) realloc(ctx->ref_defs, ctx->alloc_ref_defs * sizeof(MD_REF_DEF));
  2189         if(new_defs == NULL) {
  2190             MD_LOG("realloc() failed.");
  2191             goto abort;
  2192         }
  2193 
  2194         ctx->ref_defs = new_defs;
  2195     }
  2196     def = &ctx->ref_defs[ctx->n_ref_defs];
  2197     memset(def, 0, sizeof(MD_REF_DEF));
  2198 
  2199     if(label_is_multiline) {
  2200         MD_CHECK(md_merge_lines_alloc(ctx, label_contents_beg, label_contents_end,
  2201                     lines + label_contents_line_index, n_lines - label_contents_line_index,
  2202                     _T(' '), &def->label, &def->label_size));
  2203         def->label_needs_free = TRUE;
  2204     } else {
  2205         def->label = (CHAR*) STR(label_contents_beg);
  2206         def->label_size = label_contents_end - label_contents_beg;
  2207     }
  2208 
  2209     if(title_is_multiline) {
  2210         MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end,
  2211                     lines + title_contents_line_index, n_lines - title_contents_line_index,
  2212                     _T('\n'), &def->title, &def->title_size));
  2213         def->title_needs_free = TRUE;
  2214     } else {
  2215         def->title = (CHAR*) STR(title_contents_beg);
  2216         def->title_size = title_contents_end - title_contents_beg;
  2217     }
  2218 
  2219     def->dest_beg = dest_contents_beg;
  2220     def->dest_end = dest_contents_end;
  2221 
  2222     /* Success. */
  2223     ctx->n_ref_defs++;
  2224     return line_index + 1;
  2225 
  2226 abort:
  2227     /* Failure. */
  2228     if(def != NULL  &&  def->label_needs_free)
  2229         free(def->label);
  2230     if(def != NULL  &&  def->title_needs_free)
  2231         free(def->title);
  2232     return ret;
  2233 }
  2234 
       /* Resolve a reference-style link label ("[label]" or "![label]") through
        * md_lookup_ref_def().
        *
        * 'beg' must be the offset of the opening '[' (or of the '!' of an image)
        * and 'end' the offset one past the closing ']'; both are asserted.
        * 'lines'/'n_lines' are the lines in which the label is looked up.
        *
        * Returns non-zero and fills 'attr' (destination offsets and title) when
        * a definition is found, zero when none is found. The title written into
        * 'attr' is the definition's own buffer, hence title_needs_free is FALSE.
        *
        * If merging a multi-line label fails, MD_CHECK() leaves through the
        * 'abort' label and 'ret' is returned as set by that macro (presumably a
        * negative error code -- confirm against the MD_CHECK definition).
        */
  2235 static int
  2236 md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
  2237                      OFF beg, OFF end, MD_LINK_ATTR* attr)
  2238 {
  2239     const MD_REF_DEF* def;
  2240     const MD_LINE* beg_line;
  2241     int is_multiline;
  2242     CHAR* label;
  2243     SZ label_size;
  2244     int ret;
  2245 
  2246     MD_ASSERT(CH(beg) == _T('[') || CH(beg) == _T('!'));
  2247     MD_ASSERT(CH(end-1) == _T(']'));
  2248 
           /* Strip the brackets: skip "![" (two chars) or "[" (one char) at the
            * start and the ']' at the end. */
  2249     beg += (CH(beg) == _T('!') ? 2 : 1);
  2250     end--;
  2251 
  2252     /* Find lines corresponding to the beg and end positions. */
  2253     beg_line = md_lookup_line(beg, lines, n_lines);
  2254     is_multiline = (end > beg_line->end);
  2255 
           /* A label spanning several lines is merged into a temporary buffer
            * (lines joined with a space); a single-line label is used in place. */
  2256     if(is_multiline) {
  2257         MD_CHECK(md_merge_lines_alloc(ctx, beg, end, beg_line,
  2258                  (int)(n_lines - (beg_line - lines)), _T(' '), &label, &label_size));
  2259     } else {
  2260         label = (CHAR*) STR(beg);
  2261         label_size = end - beg;
  2262     }
  2263 
  2264     def = md_lookup_ref_def(ctx, label, label_size);
  2265     if(def != NULL) {
  2266         attr->dest_beg = def->dest_beg;
  2267         attr->dest_end = def->dest_end;
  2268         attr->title = def->title;
  2269         attr->title_size = def->title_size;
  2270         attr->title_needs_free = FALSE;
  2271     }
  2272 
           /* Only the merged (multi-line) label was allocated here. */
  2273     if(is_multiline)
  2274         free(label);
  2275 
  2276     ret = (def != NULL);
  2277 
  2278 abort:
  2279     return ret;
  2280 }
  2281 
       /* Parse the parenthesized part of an inline link, i.e. the
        * "(destination title)" that follows the link text.
        *
        * 'beg' must be the offset of the opening '(' (asserted). 'lines' and
        * 'n_lines' are the lines the spec may span.
        *
        * On success returns TRUE, stores the offset just past the closing ')'
        * into *p_end and fills 'attr':
        *   - dest_beg/dest_end: destination offsets (equal when "()" is empty);
        *   - title/title_size: NULL/0 when there is no title, a pointer obtained
        *     via STR() for a single-line title, or a buffer produced by
        *     md_merge_lines_alloc() for a multi-line title. Only in the last
        *     case title_needs_free is TRUE.
        *
        * Returns FALSE when the text does not form a valid spec. If merging a
        * multi-line title fails, MD_CHECK() leaves through 'abort' with 'ret' as
        * set by that macro (presumably a negative error code -- confirm).
        */
  2282 static int
  2283 md_is_inline_link_spec(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
  2284                        OFF beg, OFF* p_end, MD_LINK_ATTR* attr)
  2285 {
  2286     int line_index = 0;
  2287     int tmp_line_index;
  2288     OFF title_contents_beg;
  2289     OFF title_contents_end;
  2290     int title_contents_line_index;
  2291     int title_is_multiline;
  2292     OFF off = beg;
  2293     int ret = FALSE;
  2294 
           /* Find the line which contains 'beg'. */
  2295     while(off >= lines[line_index].end)
  2296         line_index++;
  2297 
  2298     MD_ASSERT(CH(off) == _T('('));
  2299     off++;
  2300 
  2301     /* Optional white space with up to one line break. */
  2302     while(off < lines[line_index].end  &&  ISWHITESPACE(off))
  2303         off++;
  2304     if(off >= lines[line_index].end  &&  (off >= ctx->size  ||  ISNEWLINE(off))) {
  2305         line_index++;
  2306         if(line_index >= n_lines)
  2307             return FALSE;
  2308         off = lines[line_index].beg;
  2309     }
  2310 
  2311     /* Link destination may be omitted, but only when not also having a title. */
  2312     if(off < ctx->size  &&  CH(off) == _T(')')) {
  2313         attr->dest_beg = off;
  2314         attr->dest_end = off;
  2315         attr->title = NULL;
  2316         attr->title_size = 0;
  2317         attr->title_needs_free = FALSE;
  2318         off++;
  2319         *p_end = off;
  2320         return TRUE;
  2321     }
  2322 
  2323     /* Link destination. */
  2324     if(!md_is_link_destination(ctx, off, lines[line_index].end,
  2325                         &off, &attr->dest_beg, &attr->dest_end))
  2326         return FALSE;
  2327 
  2328     /* (Optional) title. */
  2329     if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off,
  2330                 &off, &title_contents_line_index, &tmp_line_index,
  2331                 &title_contents_beg, &title_contents_end))
  2332     {
               /* The indexes reported by md_is_link_title() are relative to
                * 'lines + line_index'; rebase them to 'lines'. */
  2333         title_is_multiline = (tmp_line_index != title_contents_line_index);
  2334         title_contents_line_index += line_index;
  2335         line_index += tmp_line_index;
  2336     } else {
  2337         /* Not a title. */
  2338         title_is_multiline = FALSE;
  2339         title_contents_beg = off;
  2340         title_contents_end = off;
  2341         title_contents_line_index = 0;
  2342     }
  2343 
  2344     /* Optional whitespace followed with final ')'. */
  2345     while(off < lines[line_index].end  &&  ISWHITESPACE(off))
  2346         off++;
  2347     if (off >= lines[line_index].end  &&  (off >= ctx->size || ISNEWLINE(off))) {
  2348         line_index++;
  2349         if(line_index >= n_lines)
  2350             return FALSE;
  2351         off = lines[line_index].beg;
  2352     }
  2353     if(CH(off) != _T(')'))
  2354         goto abort;
  2355     off++;
  2356 
           /* Empty title range means "no title" (see the else branch above). */
  2357     if(title_contents_beg >= title_contents_end) {
  2358         attr->title = NULL;
  2359         attr->title_size = 0;
  2360         attr->title_needs_free = FALSE;
  2361     } else if(!title_is_multiline) {
  2362         attr->title = (CHAR*) STR(title_contents_beg);
  2363         attr->title_size = title_contents_end - title_contents_beg;
  2364         attr->title_needs_free = FALSE;
  2365     } else {
               /* Multi-line title: its lines are joined with '\n' into a buffer
                * produced by md_merge_lines_alloc(). */
  2366         MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end,
  2367                     lines + title_contents_line_index, n_lines - title_contents_line_index,
  2368                     _T('\n'), &attr->title, &attr->title_size));
  2369         attr->title_needs_free = TRUE;
  2370     }
  2371 
  2372     *p_end = off;
  2373     ret = TRUE;
  2374 
  2375 abort:
  2376     return ret;
  2377 }
  2378 
       /* Release the table of link reference definitions: every label and title
        * flagged with label_needs_free / title_needs_free is freed, then the
        * ctx->ref_defs array itself. Neither ctx->ref_defs nor ctx->n_ref_defs
        * is reset here. */
  2379 static void
  2380 md_free_ref_defs(MD_CTX* ctx)
  2381 {
  2382     int i;
  2383 
  2384     for(i = 0; i < ctx->n_ref_defs; i++) {
  2385         MD_REF_DEF* def = &ctx->ref_defs[i];
  2386 
               /* Labels/titles without the flag are not freed here. */
  2387         if(def->label_needs_free)
  2388             free(def->label);
  2389         if(def->title_needs_free)
  2390             free(def->title);
  2391     }
  2392 
  2393     free(ctx->ref_defs);
  2394 }
  2395 
  2396 
  2397 /******************************************
  2398  ***  Processing Inlines (a.k.a Spans)  ***
  2399  ******************************************/
  2400 
  2401 /* We process inlines in few phases:
  2402  *
  2403  * (1) We go through the block text and collect all significant characters
  2404  *     which may start/end a span or some other significant position into
  2405  *     ctx->marks[]. Core of this is what md_collect_marks() does.
  2406  *
  2407  *     We also do some very brief preliminary context-less analysis, whether
  2408  *     it might be opener or closer (e.g. of an emphasis span).
  2409  *
  2410  *     This speeds the other steps as we do not need to re-iterate over all
  2411  *     characters anymore.
  2412  *
  2413  * (2) We analyze each potential mark type, in order of precedence.
  2414  *
  2415  *     In each md_analyze_XXX() function, we re-iterate list of the marks,
  2416  *     skipping already resolved regions (in preceding precedences) and try to
  2417  *     resolve them.
  2418  *
  2419  * (2.1) For trivial marks, which are single (e.g. HTML entity), we just mark
  2420  *       them as resolved.
  2421  *
  2422  * (2.2) For range-type marks, we analyze whether the mark could be closer
  2423  *       and, if yes, whether there is some preceding opener it could satisfy.
  2424  *
  2425  *       If not we check whether it could be really an opener and if yes, we
  2426  *       remember it so subsequent closers may resolve it.
  2427  *
  2428  * (3) Finally, when all marks were analyzed, we render the block contents
  2429  *     by calling MD_RENDERER::text() callback, interrupting by ::enter_span()
  2430  *     or ::close_span() whenever we reach a resolved mark.
  2431  */
  2432 
  2433 
  2434 /* The mark structure.
  2435  *
  2436  * '\\': Maybe escape sequence.
  2437  * '\0': NULL char.
  2438  *  '*': Maybe (strong) emphasis start/end.
  2439  *  '_': Maybe (strong) emphasis start/end.
  2440  *  '~': Maybe strikethrough start/end (needs MD_FLAG_STRIKETHROUGH).
  2441  *  '`': Maybe code span start/end.
  2442  *  '&': Maybe start of entity.
  2443  *  ';': Maybe end of entity.
  2444  *  '<': Maybe start of raw HTML or autolink.
  2445  *  '>': Maybe end of raw HTML or autolink.
  2446  *  '[': Maybe start of link label or link text.
  2447  *  '!': Equivalent of '[' for image.
        *       NOTE(review): md_mark_chain() maps '!' to CONCEAL_OPENERS (the
        *       conceal span "!text!"), not to the bracket chain -- confirm how
        *       image openers are chained.
  2448  *  ']': Maybe end of link label or link text.
  2449  *  '@': Maybe permissive e-mail auto-link (needs MD_FLAG_PERMISSIVEEMAILAUTOLINKS).
  2450  *  ':': Maybe permissive URL auto-link (needs MD_FLAG_PERMISSIVEURLAUTOLINKS).
  2451  *  '.': Maybe permissive WWW auto-link (needs MD_FLAG_PERMISSIVEWWWAUTOLINKS).
        *  '|': Maybe table cell boundary (enabled with MD_FLAG_TABLES or
        *       MD_FLAG_WIKILINKS).
        *  '-': Maybe faint span start/end ("-text-").
        *  '%': Maybe inverse span start/end ("%text%").
        *  '^': Maybe blink span start/end ("^text^").
  2452  *  'D': Dummy mark, it reserves a space for splitting a previous mark
  2453  *       (e.g. emphasis) or to make more space for storing some special data
  2454  *       related to the preceding mark (e.g. link).
  2455  *
  2456  * Note that not all instances of these chars in the text imply creation of the
  2457  * structure. Only those which have (or may have, after we see more context)
  2458  * the special meaning.
  2459  *
  2460  * (Keep this struct as small as possible so that as many of them as possible
  2461  * fit into a CPU cache line.)
  2462  */
  2463 struct MD_MARK_tag {
           /* Range of the mark. For dummy ('D') marks these two members may be
            * overwritten with a raw pointer, see md_mark_store_ptr(). */
  2464     OFF beg;
  2465     OFF end;
  2466 
  2467     /* For unresolved openers, 'prev' and 'next' form the chain of open openers
  2468      * of given type 'ch'.
  2469      *
  2470      * During resolving, we disconnect from the chain and point to the
  2471      * corresponding counterpart so opener points to its closer and vice versa.
  2472      */
  2473     int prev;
  2474     int next;
           /* Mark character, one of those listed above. */
  2475     CHAR ch;
           /* Combination of the MD_MARK_xxx flags. */
  2476     unsigned char flags;
  2477 };
  2478 
  2479 /* Mark flags (these apply to ALL mark types). */
  2480 #define MD_MARK_POTENTIAL_OPENER            0x01  /* Maybe opener. */
  2481 #define MD_MARK_POTENTIAL_CLOSER            0x02  /* Maybe closer. */
  2482 #define MD_MARK_OPENER                      0x04  /* Definitely opener. */
  2483 #define MD_MARK_CLOSER                      0x08  /* Definitely closer. */
  2484 #define MD_MARK_RESOLVED                    0x10  /* Resolved in any definite way. */
  2485 
  2486 /* Mark flags specific for various mark types (so they can share bits). */
  2487 #define MD_MARK_EMPH_INTRAWORD              0x20  /* Helper for the "rule of 3". */
       /* Two-bit field (mask 0xC0) with three used values; the value with no bit
        * set is not used (md_asterisk_chain() treats it as unreachable). Note
        * that MD_MARK_EMPH_MOD3_2 has the same numeric value as the mask. */
  2488 #define MD_MARK_EMPH_MOD3_0                 0x40
  2489 #define MD_MARK_EMPH_MOD3_1                 0x80
  2490 #define MD_MARK_EMPH_MOD3_2                 (0x40 | 0x80)
  2491 #define MD_MARK_EMPH_MOD3_MASK              (0x40 | 0x80)
       /* The following flags all reuse bit 0x20; each is meaningful only for the
        * mark type named in its comment. */
  2492 #define MD_MARK_AUTOLINK                    0x20  /* Distinguisher for '<', '>'. */
  2493 #define MD_MARK_VALIDPERMISSIVEAUTOLINK     0x20  /* For permissive autolinks. */
  2494 #define MD_MARK_HASNESTEDBRACKETS           0x20  /* For '[' to rule out invalid link labels early */
  2495 
       /* Select the chain of unresolved '*' openers matching the given mark
        * flags. Only MD_MARK_EMPH_INTRAWORD and the MD_MARK_EMPH_MOD3_xxx bits of
        * 'flags' are considered, giving six chains (intraword/extraword times
        * three mod-3 values).
        *
        * A combination with none of the MOD3 bits set is treated as unreachable;
        * the trailing 'return NULL' only serves builds where MD_UNREACHABLE()
        * does not stop execution.
        *
        * 'ctx' is not referenced directly; presumably the ASTERISK_OPENERS_xxx
        * macros expand to expressions using it -- confirm at their definition.
        */
  2496 static MD_MARKCHAIN*
  2497 md_asterisk_chain(MD_CTX* ctx, unsigned flags)
  2498 {
  2499     switch(flags & (MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_MASK)) {
  2500         case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_0:  return &ASTERISK_OPENERS_intraword_mod3_0;
  2501         case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_1:  return &ASTERISK_OPENERS_intraword_mod3_1;
  2502         case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_2:  return &ASTERISK_OPENERS_intraword_mod3_2;
  2503         case MD_MARK_EMPH_MOD3_0:                           return &ASTERISK_OPENERS_extraword_mod3_0;
  2504         case MD_MARK_EMPH_MOD3_1:                           return &ASTERISK_OPENERS_extraword_mod3_1;
  2505         case MD_MARK_EMPH_MOD3_2:                           return &ASTERISK_OPENERS_extraword_mod3_2;
  2506         default:                                            MD_UNREACHABLE();
  2507     }
  2508     return NULL;
  2509 }
  2510 
       /* Return the chain of unresolved openers to which the mark at
        * 'mark_index' belongs, selected by the mark character (and, for '*', by
        * its flags; for '~', by the mark length: 1 versus anything else).
        *
        * Returns NULL for mark characters which have no chain.
        *
        * Note that '!' maps to CONCEAL_OPENERS; the mapping of '!' to the bracket
        * chain is commented out below.
        */
  2511 static MD_MARKCHAIN*
  2512 md_mark_chain(MD_CTX* ctx, int mark_index)
  2513 {
  2514     MD_MARK* mark = &ctx->marks[mark_index];
  2515 
  2516     switch(mark->ch) {
  2517         case _T('*'):   return md_asterisk_chain(ctx, mark->flags);
  2518         case _T('_'):   return &UNDERSCORE_OPENERS;
  2519         case _T('~'):   return (mark->end - mark->beg == 1) ? &TILDE_OPENERS_1 : &TILDE_OPENERS_2;
  2520         /* case _T('!'):   MD_FALLTHROUGH(); */
  2521         case _T('['):   return &BRACKET_OPENERS;
  2522         case _T('|'):   return &TABLECELLBOUNDARIES;
  2523         case _T('-'):   return &FAINT_OPENERS;
  2524         case _T('%'):   return &INVERSE_OPENERS;
  2525         case _T('!'):   return &CONCEAL_OPENERS;
  2526         case _T('^'):   return &BLINK_OPENERS;
  2527         default:        return NULL;
  2528     }
  2529 }
  2530 
       /* Reserve one more slot at the end of ctx->marks[] and return a pointer to
        * it. The slot is NOT initialized; the caller fills it in (see the
        * PUSH_MARK() macro).
        *
        * When the array is full it is grown with realloc(): to 64 entries at
        * first, then by half of the current capacity each time. Because the
        * array may move, pointers returned by earlier calls must not be kept
        * across a call to this function.
        *
        * Returns NULL (after logging) when realloc() fails; ctx->marks is left
        * as it was.
        *
        * NOTE(review): ctx->alloc_marks is updated before realloc() is known to
        * have succeeded, so after a failure it no longer matches the real
        * capacity of ctx->marks -- confirm callers always abort parsing then.
        */
  2531 static MD_MARK*
  2532 md_push_mark(MD_CTX* ctx)
  2533 {
  2534     if(ctx->n_marks >= ctx->alloc_marks) {
  2535         MD_MARK* new_marks;
  2536 
  2537         ctx->alloc_marks = (ctx->alloc_marks > 0
  2538                 ? ctx->alloc_marks + ctx->alloc_marks / 2
  2539                 : 64);
  2540         new_marks = realloc(ctx->marks, ctx->alloc_marks * sizeof(MD_MARK));
  2541         if(new_marks == NULL) {
  2542             MD_LOG("realloc() failed.");
  2543             return NULL;
  2544         }
  2545 
  2546         ctx->marks = new_marks;
  2547     }
  2548 
  2549     return &ctx->marks[ctx->n_marks++];
  2550 }
  2551 
       /* Helper macros for appending a mark via md_push_mark().
        *
        * Both expect the enclosing function to provide the variables 'ctx',
        * 'mark' and 'ret' and an 'abort' label: when md_push_mark() returns NULL
        * they set ret = -1 and jump to 'abort'.
        *
        * PUSH_MARK_() only reserves the slot and leaves it in 'mark'.
        * PUSH_MARK() additionally fills in the character, the offsets and the
        * flags, and sets prev = next = -1 (the mark is in no chain).
        *
        * NOTE(review): PUSH_MARK() stores the character through a (char) cast
        * although MD_MARK::ch is declared as CHAR -- confirm this is intended
        * for MD4C_USE_UTF16 builds.
        */
  2552 #define PUSH_MARK_()                                                    \
  2553         do {                                                            \
  2554             mark = md_push_mark(ctx);                                   \
  2555             if(mark == NULL) {                                          \
  2556                 ret = -1;                                               \
  2557                 goto abort;                                             \
  2558             }                                                           \
  2559         } while(0)
  2560 
  2561 #define PUSH_MARK(ch_, beg_, end_, flags_)                              \
  2562         do {                                                            \
  2563             PUSH_MARK_();                                               \
  2564             mark->beg = (beg_);                                         \
  2565             mark->end = (end_);                                         \
  2566             mark->prev = -1;                                            \
  2567             mark->next = -1;                                            \
  2568             mark->ch = (char)(ch_);                                     \
  2569             mark->flags = (flags_);                                     \
  2570         } while(0)
  2571 
  2572 
       /* Append the mark at 'mark_index' to the tail of 'chain'.
        *
        * The chain is a doubly linked list threaded through MD_MARK::prev/next
        * by mark index, with -1 meaning "none"; chain->head and chain->tail are
        * updated accordingly. */
  2573 static void
  2574 md_mark_chain_append(MD_CTX* ctx, MD_MARKCHAIN* chain, int mark_index)
  2575 {
           /* Link from the old tail, or become the head of an empty chain. */
  2576     if(chain->tail >= 0)
  2577         ctx->marks[chain->tail].next = mark_index;
  2578     else
  2579         chain->head = mark_index;
  2580 
  2581     ctx->marks[mark_index].prev = chain->tail;
  2582     ctx->marks[mark_index].next = -1;
  2583     chain->tail = mark_index;
  2584 }
  2585 
  2586 /* Sometimes, we need to store a pointer into the mark. It is quite rare
  2587  * so we do not bother to make MD_MARK use union, and it can only happen
  2588  * for dummy marks.
        *
        * The pointer value is copied over the first sizeof(void*) bytes of the
        * mark at 'mark_index', i.e. over its leading members 'beg' and 'end',
        * which therefore no longer hold offsets afterwards. Read it back with
        * md_mark_get_ptr(). The mark must be a dummy ('D') mark (asserted). */
  2589 static inline void
  2590 md_mark_store_ptr(MD_CTX* ctx, int mark_index, void* ptr)
  2591 {
  2592     MD_MARK* mark = &ctx->marks[mark_index];
  2593     MD_ASSERT(mark->ch == 'D');
  2594 
  2595     /* Check only members beg and end are misused for this. */
  2596     MD_ASSERT(sizeof(void*) <= 2 * sizeof(OFF));
           /* memcpy() rather than a pointer cast: the mark need not be suitably
            * typed or aligned for a void*. */
  2597     memcpy(mark, &ptr, sizeof(void*));
  2598 }
  2599 
       /* Read back a pointer previously stored with md_mark_store_ptr() in the
        * dummy ('D') mark at 'mark_index' (asserted). The value is copied out of
        * the first sizeof(void*) bytes of the mark; the result is meaningful
        * only if a pointer was actually stored there before. */
  2600 static inline void*
  2601 md_mark_get_ptr(MD_CTX* ctx, int mark_index)
  2602 {
  2603     void* ptr;
  2604     MD_MARK* mark = &ctx->marks[mark_index];
  2605     MD_ASSERT(mark->ch == 'D');
  2606     memcpy(&ptr, mark, sizeof(void*));
  2607     return ptr;
  2608 }
  2609 
       /* Resolve the marks at 'opener_index' and 'closer_index' as a matching
        * opener/closer pair.
        *
        * If 'chain' is not NULL, the opener is first unlinked from it (the chain
        * is expected to be the one currently holding the opener). Afterwards
        * opener->next holds the closer index and closer->prev the opener index,
        * and both marks get MD_MARK_RESOLVED plus MD_MARK_OPENER or
        * MD_MARK_CLOSER respectively. */
  2610 static void
  2611 md_resolve_range(MD_CTX* ctx, MD_MARKCHAIN* chain, int opener_index, int closer_index)
  2612 {
  2613     MD_MARK* opener = &ctx->marks[opener_index];
  2614     MD_MARK* closer = &ctx->marks[closer_index];
  2615 
  2616     /* Remove opener from the list of openers. */
  2617     if(chain != NULL) {
  2618         if(opener->prev >= 0)
  2619             ctx->marks[opener->prev].next = opener->next;
  2620         else
  2621             chain->head = opener->next;
  2622 
  2623         if(opener->next >= 0)
  2624             ctx->marks[opener->next].prev = opener->prev;
  2625         else
  2626             chain->tail = opener->prev;
  2627     }
  2628 
  2629     /* Interconnect opener and closer and mark both as resolved. */
  2630     opener->next = closer_index;
  2631     opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
  2632     closer->prev = opener_index;
  2633     closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
  2634 }
  2635 
  2636 
  2637 #define MD_ROLLBACK_ALL         0
  2638 #define MD_ROLLBACK_CROSSING    1
  2639 
  2640 /* In the range ctx->marks[opener_index] ... [closer_index], undo some or all
  2641  * resolvings accordingly to these rules:
  2642  *
  2643  * (1) All openers BEFORE the range corresponding to any closer inside the
  2644  *     range are un-resolved and they are re-added to their respective chains
  2645  *     of unresolved openers. This ensures we can reuse the opener for closers
  2646  *     AFTER the range.
  2647  *
  2648  * (2) If 'how' is MD_ROLLBACK_ALL, then ALL resolved marks inside the range
  2649  *     are discarded.
  2650  *
  2651  * (3) If 'how' is MD_ROLLBACK_CROSSING, only closers with openers handled
  2652  *     in (1) are discarded. I.e. pairs of openers and closers which are both
  2653  *     inside the range are retained as well as any unpaired marks.
        *
        * The marks at 'opener_index' and 'closer_index' themselves are not
        * modified by the second loop (it visits only the indexes strictly
        * between them). "Discarding" a mark means clearing its MD_MARK_OPENER,
        * MD_MARK_CLOSER and MD_MARK_RESOLVED flags.
  2654  */
  2655 static void
  2656 md_rollback(MD_CTX* ctx, int opener_index, int closer_index, int how)
  2657 {
  2658     int i;
  2659     int mark_index;
  2660 
  2661     /* Cut all unresolved openers at the mark index. */
  2662     for(i = OPENERS_CHAIN_FIRST; i < OPENERS_CHAIN_LAST+1; i++) {
  2663         MD_MARKCHAIN* chain = &ctx->mark_chains[i];
  2664 
               /* Drop tail entries located at or after 'opener_index'; stop
                * right after the opener itself has been dropped. */
  2665         while(chain->tail >= opener_index) {
  2666             int same = chain->tail == opener_index;
  2667             chain->tail = ctx->marks[chain->tail].prev;
  2668             if (same) break;
  2669         }
  2670 
               /* Terminate the shortened chain, or mark it as empty. */
  2671         if(chain->tail >= 0)
  2672             ctx->marks[chain->tail].next = -1;
  2673         else
  2674             chain->head = -1;
  2675     }
  2676 
  2677     /* Go backwards so that unresolved openers are re-added into their
  2678      * respective chains, in the right order. */
  2679     mark_index = closer_index - 1;
  2680     while(mark_index > opener_index) {
  2681         MD_MARK* mark = &ctx->marks[mark_index];
               /* Snapshot of the flags, taken before they may be reset below;
                * the jump logic at the end of the loop relies on it. */
  2682         int mark_flags = mark->flags;
  2683         int discard_flag = (how == MD_ROLLBACK_ALL);
  2684 
  2685         if(mark->flags & MD_MARK_CLOSER) {
  2686             int mark_opener_index = mark->prev;
  2687 
  2688             /* Undo opener BEFORE the range. */
  2689             if(mark_opener_index < opener_index) {
  2690                 MD_MARK* mark_opener = &ctx->marks[mark_opener_index];
  2691                 MD_MARKCHAIN* chain;
  2692 
  2693                 mark_opener->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED);
                       /* NOTE(review): the chain is looked up through
                        * 'opener_index' (the opener of the rolled-back range),
                        * not through 'mark_opener_index' (the mark being
                        * re-added) -- confirm this is intended. */
  2694                 chain = md_mark_chain(ctx, opener_index);
  2695                 if(chain != NULL) {
  2696                     md_mark_chain_append(ctx, chain, mark_opener_index);
  2697                     discard_flag = 1;
  2698                 }
  2699             }
  2700         }
  2701 
  2702         /* And reset our flags. */
  2703         if(discard_flag) {
  2704             /* Make zero-length closer a dummy mark as that's how it was born */
  2705             if((mark->flags & MD_MARK_CLOSER)  &&  mark->beg == mark->end)
  2706                 mark->ch = 'D';
  2707 
  2708             mark->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED);
  2709         }
  2710 
  2711         /* Jump as far as we can over unresolved or non-interesting marks. */
  2712         switch(how) {
  2713             case MD_ROLLBACK_CROSSING:
  2714                 if((mark_flags & MD_MARK_CLOSER)  &&  mark->prev > opener_index) {
  2715                     /* If we are closer with opener INSIDE the range, there may
  2716                      * not be any other crosser inside the subrange. */
  2717                     mark_index = mark->prev;
  2718                     break;
  2719                 }
  2720                 MD_FALLTHROUGH();
  2721             default:
  2722                 mark_index--;
  2723                 break;
  2724         }
  2725     }
  2726 }
  2727 
       /* Fill ctx->mark_char_map[] so that every character which may be a mark
        * has a non-zero entry; all other entries are cleared first.
        *
        * Always enabled: '\\', '^', '%', '-', '*', '_', '`', '&', ';', '<', '>',
        * '[', '!', ']' and the NUL character. Enabled by parser flags:
        *   MD_FLAG_STRIKETHROUGH                  '~'
        *   MD_FLAG_LATEXMATHSPANS                 '$'
        *   MD_FLAG_PERMISSIVEEMAILAUTOLINKS       '@'
        *   MD_FLAG_PERMISSIVEURLAUTOLINKS         ':'
        *   MD_FLAG_PERMISSIVEWWWAUTOLINKS         '.'
        *   MD_FLAG_TABLES or MD_FLAG_WIKILINKS    '|'
        *   MD_FLAG_COLLAPSEWHITESPACE             every index for which
        *                                          ISWHITESPACE_() holds
        */
  2728 static void
  2729 md_build_mark_char_map(MD_CTX* ctx)
  2730 {
  2731     memset(ctx->mark_char_map, 0, sizeof(ctx->mark_char_map));
  2732 
  2733     ctx->mark_char_map['\\'] = 1;
  2734     ctx->mark_char_map['^'] = 1;
  2735     ctx->mark_char_map['%'] = 1;
  2736     ctx->mark_char_map['-'] = 1;
  2737     ctx->mark_char_map['*'] = 1;
  2738     ctx->mark_char_map['_'] = 1;
  2739     ctx->mark_char_map['`'] = 1;
  2740     ctx->mark_char_map['&'] = 1;
  2741     ctx->mark_char_map[';'] = 1;
  2742     ctx->mark_char_map['<'] = 1;
  2743     ctx->mark_char_map['>'] = 1;
  2744     ctx->mark_char_map['['] = 1;
  2745     ctx->mark_char_map['!'] = 1;
  2746     ctx->mark_char_map[']'] = 1;
  2747     ctx->mark_char_map['\0'] = 1;
  2748 
  2749     if(ctx->parser.flags & MD_FLAG_STRIKETHROUGH)
  2750         ctx->mark_char_map['~'] = 1;
  2751 
           /* The dollar sign delimits LaTeX math spans. */
  2752     if(ctx->parser.flags & MD_FLAG_LATEXMATHSPANS)
  2753         ctx->mark_char_map['$'] = 1;
  2754 
  2755     if(ctx->parser.flags & MD_FLAG_PERMISSIVEEMAILAUTOLINKS)
  2756         ctx->mark_char_map['@'] = 1;
  2757 
  2758     if(ctx->parser.flags & MD_FLAG_PERMISSIVEURLAUTOLINKS)
  2759         ctx->mark_char_map[':'] = 1;
  2760 
  2761     if(ctx->parser.flags & MD_FLAG_PERMISSIVEWWWAUTOLINKS)
  2762         ctx->mark_char_map['.'] = 1;
  2763 
  2764     if((ctx->parser.flags & MD_FLAG_TABLES) || (ctx->parser.flags & MD_FLAG_WIKILINKS))
  2765         ctx->mark_char_map['|'] = 1;
  2766 
  2767     if(ctx->parser.flags & MD_FLAG_COLLAPSEWHITESPACE) {
  2768         int i;
  2769 
               /* The map is indexed by character value; sizeof() is its
                * element count only as long as the elements are one byte
                * each (presumably char -- confirm at the declaration). */
  2770         for(i = 0; i < (int) sizeof(ctx->mark_char_map); i++) {
  2771             if(ISWHITESPACE_(i))
  2772                 ctx->mark_char_map[i] = 1;
  2773         }
  2774     }
  2775 }
  2776 
  2777 /* We limit the length of code span marks (runs of backticks). This solves
  2778  * the pathologic case of too many openers, each of different length: Their
  2779  * resolving would be then O(n^2).
        *
        * NOTE(review): md_is_code_span() rejects openers only when they are
        * longer than this value (so exactly 32 backticks are accepted), while
        * potential closers are remembered only when strictly shorter -- confirm
        * which bound is intended. */
  2780 #define CODESPAN_MARK_MAXLEN    32
  2781 
       /* Check whether the run of backticks starting at 'beg' opens a code span
        * and, if so, find the matching closer (a run of backticks of exactly the
        * same length).
        *
        * 'lines[0]' must be the line containing 'beg'; the closer is searched on
        * lines[0] .. lines[n_lines-1].
        *
        * Outputs:
        *   *p_opener_end  is set even on failure (end of the opening run).
        *   *p_opener_beg, *p_opener_end, *p_closer_beg, *p_closer_end are set on
        *   success. When the contents are not made of spaces only and are
        *   delimited by a space or a line break on both sides, one such
        *   delimiter on each side is included into the opener/closer ranges.
        *
        * 'last_potential_closers' is a cache shared between calls: entry [len-1]
        * holds the highest offset of a backtick run of length 'len' seen so far
        * during unsuccessful scans. Together with *p_reached_paragraph_end (set
        * to TRUE once a scan hits the end of the last line) it allows rejecting
        * an opener without rescanning.
        *
        * Returns TRUE when a code span was recognized, FALSE otherwise.
        */
  2782 static int
  2783 md_is_code_span(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
  2784                 OFF* p_opener_beg, OFF* p_opener_end,
  2785                 OFF* p_closer_beg, OFF* p_closer_end,
  2786                 OFF last_potential_closers[CODESPAN_MARK_MAXLEN],
  2787                 int* p_reached_paragraph_end)
  2788 {
  2789     OFF opener_beg = beg;
  2790     OFF opener_end;
  2791     OFF closer_beg;
  2792     OFF closer_end;
  2793     SZ mark_len;
  2794     OFF line_end;
  2795     int has_space_after_opener = FALSE;
  2796     int has_eol_after_opener = FALSE;
  2797     int has_space_before_closer = FALSE;
  2798     int has_eol_before_closer = FALSE;
  2799     int has_only_space = TRUE;
  2800     int line_index = 0;
  2801 
           /* Measure the opening run of backticks (it cannot cross the line end). */
  2802     line_end = lines[0].end;
  2803     opener_end = opener_beg;
  2804     while(opener_end < line_end  &&  CH(opener_end) == _T('`'))
  2805         opener_end++;
  2806     has_space_after_opener = (opener_end < line_end && CH(opener_end) == _T(' '));
  2807     has_eol_after_opener = (opener_end == line_end);
  2808 
  2809     /* The caller needs to know end of the opening mark even if we fail. */
  2810     *p_opener_end = opener_end;
  2811 
  2812     mark_len = opener_end - opener_beg;
  2813     if(mark_len > CODESPAN_MARK_MAXLEN)
  2814         return FALSE;
  2815 
  2816     /* Check whether we already know there is no closer of this length.
  2817      * If so, a re-scan makes no sense. This fixes issue #59. */
  2818     if(last_potential_closers[mark_len-1] >= lines[n_lines-1].end  ||
  2819        (*p_reached_paragraph_end  &&  last_potential_closers[mark_len-1] < opener_end))
  2820         return FALSE;
  2821 
  2822     closer_beg = opener_end;
  2823     closer_end = opener_end;
  2824 
  2825     /* Find closer mark. */
  2826     while(TRUE) {
               /* Skip non-backtick characters, tracking whether the contents
                * seen so far consist of spaces only. */
  2827         while(closer_beg < line_end  &&  CH(closer_beg) != _T('`')) {
  2828             if(CH(closer_beg) != _T(' '))
  2829                 has_only_space = FALSE;
  2830             closer_beg++;
  2831         }
               /* Measure the backtick run found (possibly empty at line end). */
  2832         closer_end = closer_beg;
  2833         while(closer_end < line_end  &&  CH(closer_end) == _T('`'))
  2834             closer_end++;
  2835 
  2836         if(closer_end - closer_beg == mark_len) {
  2837             /* Success. */
  2838             has_space_before_closer = (closer_beg > lines[line_index].beg && CH(closer_beg-1) == _T(' '));
  2839             has_eol_before_closer = (closer_beg == lines[line_index].beg);
  2840             break;
  2841         }
  2842 
  2843         if(closer_end - closer_beg > 0) {
  2844             /* We have found a back-tick which is not part of the closer. */
  2845             has_only_space = FALSE;
  2846 
  2847             /* But if we eventually fail, remember it as a potential closer
  2848              * of its own length for future attempts. This mitigates needs for
  2849              * rescans. */
  2850             if(closer_end - closer_beg < CODESPAN_MARK_MAXLEN) {
  2851                 if(closer_beg > last_potential_closers[closer_end - closer_beg - 1])
  2852                     last_potential_closers[closer_end - closer_beg - 1] = closer_beg;
  2853             }
  2854         }
  2855 
  2856         if(closer_end >= line_end) {
  2857             line_index++;
  2858             if(line_index >= n_lines) {
  2859                 /* Reached end of the paragraph and still nothing. */
  2860                 *p_reached_paragraph_end = TRUE;
  2861                 return FALSE;
  2862             }
  2863             /* Try on the next line. */
  2864             line_end = lines[line_index].end;
  2865             closer_beg = lines[line_index].beg;
  2866         } else {
  2867             closer_beg = closer_end;
  2868         }
  2869     }
  2870 
  2871     /* If there is a space or a new line both after and before the opener
  2872      * (and if the code span is not made of spaces only), consume one initial
  2873      * and one trailing space as part of the marks. */
  2874     if(!has_only_space  &&
  2875        (has_space_after_opener || has_eol_after_opener)  &&
  2876        (has_space_before_closer || has_eol_before_closer))
  2877     {
               /* An opener ending at the end of lines[0] implies the closer was
                * found on a later line, so lines[1] exists here. */
  2878         if(has_space_after_opener)
  2879             opener_end++;
  2880         else
  2881             opener_end = lines[1].beg;
  2882 
  2883         if(has_space_before_closer)
  2884             closer_beg--;
  2885         else {
  2886             closer_beg = lines[line_index-1].end;
  2887             /* We need to eat the preceding "\r\n" but not any line trailing
  2888              * spaces. */
  2889             while(closer_beg < ctx->size  &&  ISBLANK(closer_beg))
  2890                 closer_beg++;
  2891         }
  2892     }
  2893 
  2894     *p_opener_beg = opener_beg;
  2895     *p_opener_end = opener_end;
  2896     *p_closer_beg = closer_beg;
  2897     *p_closer_end = closer_end;
  2898     return TRUE;
  2899 }
  2900 
  2901 /* Detect anchors with syntax: [|anchorId]
        *
        * 'off' is the offset where the anchor starts. The two characters at 'off'
        * are skipped without being inspected, so the caller is expected to have
        * matched "[|" there already. The anchor must be closed by ']' on
        * lines[0] and its id must not be empty.
        *
        * Returns TRUE and stores the offset of the closing ']' into *p_closer_beg
        * when an anchor is found, FALSE otherwise (*p_closer_beg is then left
        * untouched).
        */
  2902 static int
  2903 md_is_anchor_span(MD_CTX* ctx, const MD_LINE* lines, OFF off, OFF* p_closer_beg)
  2904 {
  2905     OFF line_end = lines[0].end;
  2906     // Smallest anchor is [|x], i.e. four characters which must all lie
  2907     // before the line end. An anchor must be on a single line.
  2908     if (off+4 > line_end)
  2909         return FALSE;
  2910     off += 2;
  2911 
  2912     // Find closer mark
  2913     OFF opener_end = off;
  2914     while (off < line_end) {
  2915         if (CH(off) == _T(']')) {
  2916             // Check if there is an id for the anchor
  2917             if (off == opener_end)
  2918                 return FALSE;
  2919             *p_closer_beg = off;
  2920             return TRUE;
  2921         }
  2922         off++;
  2923     }
  2924     return FALSE;
  2925 }
  2926 
       /* IS_MARK_CHAR(off): non-zero when the character at offset 'off' has a
        * non-zero entry in ctx->mark_char_map[] (the table filled in by
        * md_build_mark_char_map()). The macro uses a variable named 'ctx' from
        * the scope where it is expanded. */
  2927 #ifdef MD4C_USE_UTF16
  2928     /* For UTF-16, mark_char_map[] covers only ASCII. */
  2929     #define IS_MARK_CHAR(off)   ((CH(off) < SIZEOF_ARRAY(ctx->mark_char_map))  &&  \
  2930                                 (ctx->mark_char_map[(unsigned char) CH(off)]))
  2931 #else
  2932     /* For 8-bit encodings, mark_char_map[] covers all 256 elements. */
  2933     #define IS_MARK_CHAR(off)   (ctx->mark_char_map[(unsigned char) CH(off)])
  2934 #endif
  2935 
  2936 /* Detect faint effect: -text text-
        *
        * 'beg' is the offset of the candidate opener; the character at 'beg'
        * itself is not checked here (presumably the caller has seen '-' there).
        * The whole span must lie on lines[0].
        *
        * Opener rule: the character right after 'beg' must not be Unicode
        * whitespace. Closer rule: a '-' at least two characters after 'beg'
        * which is not preceded by Unicode whitespace and which is followed by
        * the line end, by Unicode whitespace or by a mark character
        * (IS_MARK_CHAR). The first such '-' wins.
        *
        * Returns TRUE and stores the closer offset into *p_closer_beg on
        * success, FALSE otherwise.
        */
  2937 static int
  2938 md_is_faint_span(MD_CTX* ctx, const MD_LINE* lines, OFF beg, OFF* p_closer_beg)
  2939 {
  2940     OFF tmp;
  2941     OFF line_end;
  2942 
  2943     line_end = lines[0].end;
           /* Shortest possible span is three characters: -x- */
  2944     if (beg+2 >= line_end)
  2945         return FALSE;
  2946     if (ISUNICODEWHITESPACE(beg+1))
  2947         return FALSE;
  2948     tmp = beg+2;
  2949     while (tmp < line_end) {
               /* 'tmp+1 == line_end' is tested first so that the character
                * after the closer is examined only when it is inside the line. */
  2950         if (CH(tmp) == _T('-') && (tmp+1 == line_end || ISUNICODEWHITESPACE(tmp+1) || IS_MARK_CHAR(tmp+1))
  2951             && (!ISUNICODEWHITESPACE(tmp-1))) {
  2952             *p_closer_beg = tmp;
  2953             return TRUE;
  2954         }
  2955         tmp++;
  2956     }
  2957 
  2958     return FALSE;
  2959 }
  2960 
  2961 /* Detect inverse effect: %text text%
        *
        * 'beg' is the offset of the candidate opener; the character at 'beg'
        * itself is not checked here (presumably the caller has seen '%' there).
        * The whole span must lie on lines[0].
        *
        * Opener rule: the character right after 'beg' must not be Unicode
        * whitespace. Closer rule: a '%' at least two characters after 'beg'
        * which is not preceded by Unicode whitespace and which is followed by
        * the line end, by Unicode whitespace or by a mark character
        * (IS_MARK_CHAR). The first such '%' wins.
        *
        * Returns TRUE and stores the closer offset into *p_closer_beg on
        * success, FALSE otherwise.
        */
  2962 static int
  2963 md_is_inverse_span(MD_CTX* ctx, const MD_LINE* lines, OFF beg, OFF* p_closer_beg)
  2964 {
  2965     OFF tmp;
  2966     OFF line_end;
  2967 
  2968     line_end = lines[0].end;
           /* Shortest possible span is three characters: %x% */
  2969     if (beg+2 >= line_end)
  2970         return FALSE;
  2971     if (ISUNICODEWHITESPACE(beg+1))
  2972         return FALSE;
  2973     tmp = beg+2;
  2974     while (tmp < line_end) {
               /* 'tmp+1 == line_end' is tested first so that the character
                * after the closer is examined only when it is inside the line. */
  2975         if (CH(tmp) == _T('%') && (tmp+1 == line_end || ISUNICODEWHITESPACE(tmp+1) || IS_MARK_CHAR(tmp+1))
  2976             && (!ISUNICODEWHITESPACE(tmp-1))) {
  2977             *p_closer_beg = tmp;
  2978             return TRUE;
  2979         }
  2980         tmp++;
  2981     }
  2982 
  2983     return FALSE;
  2984 }
  2985 
  2986 /* Detect the conceal effect span: !text text!
        *
        * beg is the offset of the candidate opening '!'. Only lines[0] is
        * consulted, so the opener and the closer have to be on the same line.
        *
        * The candidate is rejected when fewer than two characters follow the
        * opener on the line, or when the character right after the opener is
        * Unicode whitespace. The closer is the first '!' at or after beg+2 which
        * is not preceded by Unicode whitespace and which is followed by the end
        * of the line, Unicode whitespace or a mark character (IS_MARK_CHAR).
        *
        * Returns TRUE and stores the offset of the closer in *p_closer_beg on
        * success; returns FALSE and leaves *p_closer_beg untouched otherwise.
        */
  2987 static int
  2988 md_is_conceal_span(MD_CTX* ctx, const MD_LINE* lines, OFF beg, OFF* p_closer_beg)
  2989 {
  2990     OFF tmp;
  2991     OFF line_end;
  2992 
  2993     line_end = lines[0].end;
           /* There must be room for one content character and the closer. */
  2994     if (beg+2 >= line_end)
  2995         return FALSE;
           /* The opener must be directly followed by a non-whitespace character. */
  2996     if (ISUNICODEWHITESPACE(beg+1))
  2997         return FALSE;
           /* beg+1 is content, so the earliest possible closer is at beg+2. */
  2998     tmp = beg+2;
  2999     while (tmp < line_end) {
               /* The tmp+1 == line_end test comes first so that CH(tmp+1) is
                * never read past the end of the line. */
  3000         if (CH(tmp) == _T('!') && (tmp+1 == line_end || ISUNICODEWHITESPACE(tmp+1) || IS_MARK_CHAR(tmp+1))
  3001             && (!ISUNICODEWHITESPACE(tmp-1))) {
  3002             *p_closer_beg = tmp;
  3003             return TRUE;
  3004         }
  3005         tmp++;
  3006     }
  3007 
  3008     return FALSE;
  3009 }
  3010 
  3011 /* Detect the blink effect span: ^text text^
        *
        * beg is the offset of the candidate opening '^'. Only lines[0] is
        * consulted, so the opener and the closer have to be on the same line.
        *
        * The candidate is rejected when fewer than two characters follow the
        * opener on the line, or when the character right after the opener is
        * Unicode whitespace. The closer is the first '^' at or after beg+2 which
        * is not preceded by Unicode whitespace and which is followed by the end
        * of the line, Unicode whitespace or a mark character (IS_MARK_CHAR).
        *
        * Returns TRUE and stores the offset of the closer in *p_closer_beg on
        * success; returns FALSE and leaves *p_closer_beg untouched otherwise.
        */
  3012 static int
  3013 md_is_blink_span(MD_CTX* ctx, const MD_LINE* lines, OFF beg, OFF* p_closer_beg)
  3014 {
  3015     OFF tmp;
  3016     OFF line_end;
  3017 
  3018     line_end = lines[0].end;
           /* There must be room for one content character and the closer. */
  3019     if (beg+2 >= line_end)
  3020         return FALSE;
           /* The opener must be directly followed by a non-whitespace character. */
  3021     if (ISUNICODEWHITESPACE(beg+1))
  3022         return FALSE;
           /* beg+1 is content, so the earliest possible closer is at beg+2. */
  3023     tmp = beg+2;
  3024     while (tmp < line_end) {
               /* The tmp+1 == line_end test comes first so that CH(tmp+1) is
                * never read past the end of the line. */
  3025         if (CH(tmp) == _T('^') && (tmp+1 == line_end || ISUNICODEWHITESPACE(tmp+1) || IS_MARK_CHAR(tmp+1))
  3026             && (!ISUNICODEWHITESPACE(tmp-1))) {
  3027             *p_closer_beg = tmp;
  3028             return TRUE;
  3029         }
  3030         tmp++;
  3031     }
  3032 
  3033     return FALSE;
  3034 }
  3035 
  3036 static int
  3037 md_is_autolink_uri(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
  3038 {
           /* Check whether "<scheme:path>" starts at beg, which has to be the
            * offset of a '<'. Nothing at or beyond max_end is examined.
            *
            * On success TRUE is returned and *p_end is set to the offset just
            * past the closing '>'. On failure FALSE is returned and *p_end is
            * left untouched. */
  3039     OFF off = beg+1;
  3040 
  3041     MD_ASSERT(CH(beg) == _T('<'));
  3042 
  3043     /* Check for scheme. */
           /* The first scheme character only has to pass ISASCII(). */
  3044     if(off >= max_end  ||  !ISASCII(off))
  3045         return FALSE;
  3046     off++;
  3047     while(1) {
  3048         if(off >= max_end)
  3049             return FALSE;
               /* The scheme (counted from the '<') may not grow beyond this. */
  3050         if(off - beg > 32)
  3051             return FALSE;
               /* ':' ends the scheme, but only after at least two scheme
                * characters; an earlier ':' is rejected by the test below. */
  3052         if(CH(off) == _T(':')  &&  off - beg >= 3)
  3053             break;
  3054         if(!ISALNUM(off) && CH(off) != _T('+') && CH(off) != _T('-') && CH(off) != _T('.'))
  3055             return FALSE;
  3056         off++;
  3057     }
  3058 
  3059     /* Check the path after the scheme. */
           /* The scan starts on the ':' itself and stops on the first '>'. */
  3060     while(off < max_end  &&  CH(off) != _T('>')) {
  3061         if(ISWHITESPACE(off) || ISCNTRL(off) || CH(off) == _T('<'))
  3062             return FALSE;
  3063         off++;
  3064     }
  3065 
           /* Ran out of input without finding the closing '>'. */
  3066     if(off >= max_end)
  3067         return FALSE;
  3068 
  3069     MD_ASSERT(CH(off) == _T('>'));
  3070     *p_end = off+1;
  3071     return TRUE;
  3072 }
  3073 
  3074 static int
  3075 md_is_autolink_email(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
  3076 {
           /* Check whether "<user@domain>" starts at beg, which has to be the
            * offset of a '<'. Nothing at or beyond max_end is examined.
            *
            * On success TRUE is returned and *p_end is set to the offset just
            * past the closing '>'. On failure FALSE is returned and *p_end is
            * left untouched. */
  3077     OFF off = beg + 1;
  3078     int label_len;
  3079 
  3080     MD_ASSERT(CH(beg) == _T('<'));
  3081 
  3082     /* The code should correspond to this regexp:
  3083             /^[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+
  3084             @[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
  3085             (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/
  3086      */
  3087 
  3088     /* Username (before '@'). */
  3089     while(off < max_end  &&  (ISALNUM(off) || ISANYOF(off, _T(".!#$%&'*+/=?^_`{|}~-"))))
  3090         off++;
           /* The username must not be empty. */
  3091     if(off <= beg+1)
  3092         return FALSE;
  3093 
  3094     /* '@' */
  3095     if(off >= max_end  ||  CH(off) != _T('@'))
  3096         return FALSE;
  3097     off++;
  3098 
  3099     /* Labels delimited with '.'; each label is sequence of 1 - 63 alnum
  3100      * characters or '-', but '-' is not allowed as first or last char. */
           /* label_len counts the characters of the label being scanned; it is
            * reset to zero by every '.' that legally ends a label. */
  3101     label_len = 0;
  3102     while(off < max_end) {
  3103         if(ISALNUM(off))
  3104             label_len++;
  3105         else if(CH(off) == _T('-')  &&  label_len > 0)
  3106             label_len++;
  3107         else if(CH(off) == _T('.')  &&  label_len > 0  &&  CH(off-1) != _T('-'))
  3108             label_len = 0;
  3109         else
  3110             break;
  3111 
  3112         if(label_len > 63)
  3113             return FALSE;
  3114 
  3115         off++;
  3116     }
  3117 
           /* The last label must be non-empty and must not end with '-', and the
            * scan must have stopped on the closing '>'. */
  3118     if(label_len <= 0  || off >= max_end  ||  CH(off) != _T('>') ||  CH(off-1) == _T('-'))
  3119         return FALSE;
  3120 
  3121     *p_end = off+1;
  3122     return TRUE;
  3123 }
  3124 
  3125 static int
  3126 md_is_autolink(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, int* p_missing_mailto)
  3127 {
           /* Check whether an autolink of either flavor starts at beg. The URI
            * form is tried first, the e-mail form second.
            *
            * On success TRUE is returned, *p_end is filled in by the matching
            * helper and *p_missing_mailto tells the two flavors apart: FALSE
            * for the URI form, TRUE for the e-mail form. When neither helper
            * matches, FALSE is returned and *p_missing_mailto is left untouched. */
  3128     if(md_is_autolink_uri(ctx, beg, max_end, p_end)) {
  3129         *p_missing_mailto = FALSE;
  3130         return TRUE;
  3131     }
  3132 
  3133     if(md_is_autolink_email(ctx, beg, max_end, p_end)) {
  3134         *p_missing_mailto = TRUE;
  3135         return TRUE;
  3136     }
  3137 
  3138     return FALSE;
  3139 }
  3140 
  3141 static int
  3142 md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
  3143 {
           /* Scan the n_lines lines of one block and push a mark (PUSH_MARK)
            * for every character sequence which may have a special meaning in
            * inline context: escapes, emphasis, code spans, the faint/inverse/
            * conceal/blink effect spans, entities, raw HTML and autolinks,
            * anchors, brackets, permissive autolinks, '|' (only in table_mode
            * or with MD_FLAG_WIKILINKS), strikethrough, equations, non-trivial
            * whitespace and NULL characters. A final dummy mark positioned at
            * ctx->size is always appended.
            *
            * Returns ret, which is initialized to zero.
            * NOTE(review): PUSH_MARK is defined outside this function; it
            * presumably assigns the local 'mark', and on failure sets 'ret' and
            * jumps to the 'abort' label -- confirm against the macro. */
  3144     const MD_LINE* line_term = lines + n_lines;
  3145     const MD_LINE* line;
  3146     int ret = 0;
  3147     MD_MARK* mark;
  3148     OFF codespan_last_potential_closers[CODESPAN_MARK_MAXLEN] = { 0 };
  3149     int codespan_scanned_till_paragraph_end = FALSE;
  3150 
  3151     for(line = lines; line < line_term; line++) {
  3152         OFF off = line->beg;
  3153         OFF line_end = line->end;
  3154 
  3155         while(TRUE) {
  3156             CHAR ch;
  3157 
  3158             /* Optimization: Use some loop unrolling. */
  3159             while(off + 3 < line_end  &&  !IS_MARK_CHAR(off+0)  &&  !IS_MARK_CHAR(off+1)
  3160                                       &&  !IS_MARK_CHAR(off+2)  &&  !IS_MARK_CHAR(off+3))
  3161                 off += 4;
  3162             while(off < line_end  &&  !IS_MARK_CHAR(off+0))
  3163                 off++;
  3164 
  3165             if(off >= line_end)
  3166                 break;
  3167 
  3168             ch = CH(off);
  3169 
  3170             /* A backslash escape.
  3171              * It can go beyond line->end as it may involve escaped new
  3172              * line to form a hard break. */
  3173             if(ch == _T('\\')  &&  off+1 < ctx->size  &&  (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
  3174                 /* Hard-break cannot be on the last line of the block. */
  3175                 if(!ISNEWLINE(off+1)  ||  line+1 < line_term)
  3176                     PUSH_MARK(ch, off, off+2, MD_MARK_RESOLVED);
  3177                 off += 2;
  3178                 continue;
  3179             }
  3180 
  3181             /* A potential (string) emphasis start/end. */
  3182             if(ch == _T('*')  ||  ch == _T('_')) {
  3183                 OFF tmp = off+1;
  3184                 int left_level;     /* What precedes: 0 = whitespace; 1 = punctuation; 2 = other char. */
  3185                 int right_level;    /* What follows: 0 = whitespace; 1 = punctuation; 2 = other char. */
  3186 
  3187                 while(tmp < line_end  &&  CH(tmp) == ch)
  3188                     tmp++;
  3189 
  3190                 if(off == line->beg  ||  ISUNICODEWHITESPACEBEFORE(off))
  3191                     left_level = 0;
  3192                 else if(ISUNICODEPUNCTBEFORE(off))
  3193                     left_level = 1;
  3194                 else
  3195                     left_level = 2;
  3196 
  3197                 if(tmp == line_end  ||  ISUNICODEWHITESPACE(tmp))
  3198                     right_level = 0;
  3199                 else if(ISUNICODEPUNCT(tmp))
  3200                     right_level = 1;
  3201                 else
  3202                     right_level = 2;
  3203 
  3204                 /* Intra-word underscore doesn't have special meaning. */
  3205                 if(ch == _T('_')  &&  left_level == 2  &&  right_level == 2) {
  3206                     left_level = 0;
  3207                     right_level = 0;
  3208                 }
  3209 
  3210                 if(left_level != 0  ||  right_level != 0) {
  3211                     unsigned flags = 0;
  3212 
  3213                     if(left_level > 0  &&  left_level >= right_level)
  3214                         flags |= MD_MARK_POTENTIAL_CLOSER;
  3215                     if(right_level > 0  &&  right_level >= left_level)
  3216                         flags |= MD_MARK_POTENTIAL_OPENER;
  3217                     if(left_level == 2  &&  right_level == 2)
  3218                         flags |= MD_MARK_EMPH_INTRAWORD;
  3219 
  3220                     /* For "the rule of three" we need to remember the original
  3221                      * size of the mark (modulo three), before we potentially
  3222                      * split the mark when being later resolved partially by some
  3223                      * shorter closer. */
  3224                     switch((tmp - off) % 3) {
  3225                         case 0: flags |= MD_MARK_EMPH_MOD3_0; break;
  3226                         case 1: flags |= MD_MARK_EMPH_MOD3_1; break;
  3227                         case 2: flags |= MD_MARK_EMPH_MOD3_2; break;
  3228                     }
  3229 
  3230                     PUSH_MARK(ch, off, tmp, flags);
  3231 
  3232                     /* During resolving, multiple asterisks may have to be
  3233                      * split into independent span start/ends. Consider e.g.
  3234                      * "**foo* bar*". Therefore we push also some empty dummy
  3235                      * marks to have enough space for that. */
  3236                     off++;
  3237                     while(off < tmp) {
  3238                         PUSH_MARK('D', off, off, 0);
  3239                         off++;
  3240                     }
  3241                     continue;
  3242                 }
  3243 
  3244                 off = tmp;
  3245                 continue;
  3246             }
  3247 
  3248             /* A potential code span start/end. */
  3249             if(ch == _T('`')) {
  3250                 OFF opener_beg, opener_end;
  3251                 OFF closer_beg, closer_end;
  3252                 int is_code_span;
  3253 
  3254                 is_code_span = md_is_code_span(ctx, line, line_term - line, off,
  3255                                     &opener_beg, &opener_end, &closer_beg, &closer_end,
  3256                                     codespan_last_potential_closers,
  3257                                     &codespan_scanned_till_paragraph_end);
  3258                 if(is_code_span) {
  3259                     PUSH_MARK(_T('`'), opener_beg, opener_end, MD_MARK_OPENER | MD_MARK_RESOLVED);
  3260                     PUSH_MARK(_T('`'), closer_beg, closer_end, MD_MARK_CLOSER | MD_MARK_RESOLVED);
  3261                     ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
  3262                     ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
  3263 
  3264                     off = closer_end;
  3265 
  3266                     /* Advance the current line accordingly. */
  3267                     if(off > line_end) {
  3268                         line = md_lookup_line(off, line, line_term - line);
  3269                         line_end = line->end;
  3270                     }
  3271                     continue;
  3272                 }
  3273 
  3274                 off = opener_end;
  3275                 continue;
  3276             }
  3277 
  3278             /* A potential faint span start/end.
                    *
                    * The '-' is considered as an opener only at the start of the
                    * line or after whitespace, punctuation or a mark character.
                    * The opener and closer are pushed as an already resolved,
                    * interconnected pair.
                    *
                    * NOTE(review): unlike the code span branch above, scanning
                    * resumes right after the opener (off++), so marks found inside
                    * the span are pushed after the closer mark and the closer
                    * character itself is visited again by this branch. The same
                    * holds for the inverse, conceal and blink branches below.
                    * Confirm the later passes cope with that. */
  3279             if(ch == _T('-')) {
  3280                 OFF closer_beg;
  3281                 int is_faint_span;
  3282 
  3283                 if (off == line->beg  ||  ISUNICODEWHITESPACEBEFORE(off) || ISUNICODEPUNCTBEFORE(off)
  3284                     || IS_MARK_CHAR(off-1)) {
  3285 
  3286                     is_faint_span = md_is_faint_span(ctx, line, off, &closer_beg);
  3287                     if(is_faint_span) {
  3288                         PUSH_MARK(_T('-'), off, off+1, MD_MARK_OPENER | MD_MARK_RESOLVED);
  3289                         PUSH_MARK(_T('-'), closer_beg, closer_beg+1, MD_MARK_CLOSER | MD_MARK_RESOLVED);
  3290                         ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
  3291                         ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
  3292                     }
  3293                 }
  3294                 off++;
  3295                 continue;
  3296             }
  3297 
  3298             /* A potential inverse span start/end. */
  3299             if(ch == _T('%')) {
  3300                 OFF closer_beg;
  3301                 int is_inverse_span;
  3302 
  3303                 if (off == line->beg  ||  ISUNICODEWHITESPACEBEFORE(off) || ISUNICODEPUNCTBEFORE(off)
  3304                     || IS_MARK_CHAR(off-1)) {
  3305 
  3306                     is_inverse_span = md_is_inverse_span(ctx, line, off, &closer_beg);
  3307                     if(is_inverse_span) {
  3308                         PUSH_MARK(_T('%'), off, off+1, MD_MARK_OPENER | MD_MARK_RESOLVED);
  3309                         PUSH_MARK(_T('%'), closer_beg, closer_beg+1, MD_MARK_CLOSER | MD_MARK_RESOLVED);
  3310                         ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
  3311                         ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
  3312 
  3313                     }
  3314                 }
  3315                 off++;
  3316                 continue;
  3317             }
  3318 
  3319             /* A potential conceal span start/end.
                    *
                    * NOTE(review): this branch always ends with 'continue', so it
                    * consumes every '!'. The "ch == '!' followed by '['" half of
                    * the link opener test further below can therefore never be
                    * true and "![" is never pushed as an image opener. Confirm
                    * this is intended. */
  3320             if(ch == _T('!')) {
  3321                 OFF closer_beg;
  3322                 int is_conceal_span;
  3323 
  3324                 if (off == line->beg  ||  ISUNICODEWHITESPACEBEFORE(off) || ISUNICODEPUNCTBEFORE(off)
  3325                     || IS_MARK_CHAR(off-1)) {
  3326 
  3327                     is_conceal_span = md_is_conceal_span(ctx, line, off, &closer_beg);
  3328                     if(is_conceal_span) {
  3329                         PUSH_MARK(_T('!'), off, off+1, MD_MARK_OPENER | MD_MARK_RESOLVED);
  3330                         PUSH_MARK(_T('!'), closer_beg, closer_beg+1, MD_MARK_CLOSER | MD_MARK_RESOLVED);
  3331                         ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
  3332                         ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
  3333 
  3334                     }
  3335                 }
  3336                 off++;
  3337                 continue;
  3338             }
  3339 
  3340             /* A potential blink span start/end. */
  3341             if(ch == _T('^')) {
  3342                 OFF closer_beg;
  3343                 int is_blink_span;
  3344 
  3345                 if (off == line->beg  ||  ISUNICODEWHITESPACEBEFORE(off) || ISUNICODEPUNCTBEFORE(off)
  3346                     || IS_MARK_CHAR(off-1)) {
  3347 
  3348                     is_blink_span = md_is_blink_span(ctx, line, off, &closer_beg);
  3349                     if(is_blink_span) {
  3350                         PUSH_MARK(_T('^'), off, off+1, MD_MARK_OPENER | MD_MARK_RESOLVED);
  3351                         PUSH_MARK(_T('^'), closer_beg, closer_beg+1, MD_MARK_CLOSER | MD_MARK_RESOLVED);
  3352                         ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
  3353                         ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
  3354 
  3355                     }
  3356                 }
  3357                 off++;
  3358                 continue;
  3359             }
  3360 
  3361             /* A potential entity start. */
  3362             if(ch == _T('&')) {
  3363                 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
  3364                 off++;
  3365                 continue;
  3366             }
  3367 
  3368             /* A potential entity end. */
  3369             if(ch == _T(';')) {
  3370                 /* We surely cannot be entity unless the previous mark is '&'. */
  3371                 if(ctx->n_marks > 0  &&  ctx->marks[ctx->n_marks-1].ch == _T('&'))
  3372                     PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER);
  3373 
  3374                 off++;
  3375                 continue;
  3376             }
  3377 
  3378             /* A potential autolink or raw HTML start/end. */
  3379             if(ch == _T('<')) {
  3380                 int is_autolink;
  3381                 OFF autolink_end;
  3382                 int missing_mailto;
  3383 
  3384                 if(!(ctx->parser.flags & MD_FLAG_NOHTMLSPANS)) {
  3385                     int is_html;
  3386                     OFF html_end;
  3387 
  3388                     /* Given the nature of the raw HTML, we have to recognize
  3389                      * it here. Doing so later in md_analyze_lt_gt() could
  3390                      * open can of worms of quadratic complexity. */
  3391                     is_html = md_is_html_any(ctx, line, line_term - line, off,
  3392                                     lines[n_lines-1].end, &html_end);
  3393                     if(is_html) {
  3394                         PUSH_MARK(_T('<'), off, off, MD_MARK_OPENER | MD_MARK_RESOLVED);
  3395                         PUSH_MARK(_T('>'), html_end, html_end, MD_MARK_CLOSER | MD_MARK_RESOLVED);
  3396                         ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
  3397                         ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
  3398                         off = html_end;
  3399 
  3400                         /* Advance the current line accordingly. */
  3401                         if(off > line_end) {
  3402                             line = md_lookup_line(off, line, line_term - line);
  3403                             line_end = line->end;
  3404                         }
  3405                         continue;
  3406                     }
  3407                 }
  3408 
  3409                 is_autolink = md_is_autolink(ctx, off, lines[n_lines-1].end,
  3410                                     &autolink_end, &missing_mailto);
  3411                 if(is_autolink) {
  3412                     PUSH_MARK((missing_mailto ? _T('@') : _T('<')), off, off+1,
  3413                                 MD_MARK_OPENER | MD_MARK_RESOLVED | MD_MARK_AUTOLINK);
  3414                     PUSH_MARK(_T('>'), autolink_end-1, autolink_end,
  3415                                 MD_MARK_CLOSER | MD_MARK_RESOLVED | MD_MARK_AUTOLINK);
  3416                     ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
  3417                     ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
  3418                     off = autolink_end;
  3419                     continue;
  3420                 }
  3421 
  3422                 off++;
  3423                 continue;
  3424             }
  3425 
  3426             /* A potential anchor */
  3427             if(ch == _T('[') && off+1 < line_end && CH(off+1) == _T('|')) {
  3428                 OFF closer_beg;
  3429                 int is_anchor_span = md_is_anchor_span(ctx, line, off, &closer_beg);
  3430                 if (is_anchor_span) {
  3431                     PUSH_MARK(_T('['), off, off+2, MD_MARK_OPENER | MD_MARK_RESOLVED);
  3432                     PUSH_MARK(_T(']'), closer_beg, closer_beg+1, MD_MARK_CLOSER | MD_MARK_RESOLVED);
  3433                     ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
  3434                     ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
  3435                     off = closer_beg+1;
  3436                     continue;
  3437                 }
  3438                 // continue analyzing [ mark
  3439             }
  3440 
  3441             /* A potential link or its part. */
  3442             if(ch == _T('[')  ||  (ch == _T('!') && off+1 < line_end && CH(off+1) == _T('['))) {
  3443                 OFF tmp = (ch == _T('[') ? off+1 : off+2);
  3444                 PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER);
  3445                 off = tmp;
  3446                 /* Two dummies to make enough place for data we need if it is
  3447                  * a link. */
  3448                 PUSH_MARK('D', off, off, 0);
  3449                 PUSH_MARK('D', off, off, 0);
  3450                 continue;
  3451             }
  3452             if(ch == _T(']')) {
  3453                 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER);
  3454                 off++;
  3455                 continue;
  3456             }
  3457 
  3458             /* A potential permissive e-mail autolink. */
  3459             if(ch == _T('@')) {
  3460                 if(line->beg + 1 <= off  &&  ISALNUM(off-1)  &&
  3461                     off + 3 < line->end  &&  ISALNUM(off+1))
  3462                 {
  3463                     PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
  3464                     /* Push a dummy as a reserve for a closer. */
  3465                     PUSH_MARK('D', off, off, 0);
  3466                 }
  3467 
  3468                 off++;
  3469                 continue;
  3470             }
  3471 
  3472             /* A potential permissive URL autolink. */
  3473             if(ch == _T(':')) {
  3474                 static struct {
  3475                     const CHAR* scheme;
  3476                     SZ scheme_size;
  3477                     const CHAR* suffix;
  3478                     SZ suffix_size;
  3479                 } scheme_map[] = {
  3480                     /* In the order from the most frequently used, arguably. */
  3481                     { _T("https"), 5,   _T("//"), 2 },
  3482                     { _T("gemini"), 6,   _T("//"), 2 },
  3483                     { _T("http"), 4,    _T("//"), 2 },
  3484                     { _T("gopher"), 6,   _T("//"), 2 },
  3485                     { _T("spartan"), 7,    _T("//"), 2 },
  3486                     { _T("ftp"), 3,     _T("//"), 2 }
  3487                 };
  3488                 int scheme_index;
  3489 
  3490                 for(scheme_index = 0; scheme_index < (int) SIZEOF_ARRAY(scheme_map); scheme_index++) {
  3491                     const CHAR* scheme = scheme_map[scheme_index].scheme;
  3492                     const SZ scheme_size = scheme_map[scheme_index].scheme_size;
  3493                     const CHAR* suffix = scheme_map[scheme_index].suffix;
  3494                     const SZ suffix_size = scheme_map[scheme_index].suffix_size;
  3495 
  3496                     if(line->beg + scheme_size <= off  &&  md_ascii_eq(STR(off-scheme_size), scheme, scheme_size)  &&
  3497                         (line->beg + scheme_size == off || ISWHITESPACE(off-scheme_size-1) || ISANYOF(off-scheme_size-1, _T("*_~([")))  &&
  3498                         off + 1 + suffix_size < line->end  &&  md_ascii_eq(STR(off+1), suffix, suffix_size))
  3499                     {
  3500                         PUSH_MARK(ch, off-scheme_size, off+1+suffix_size, MD_MARK_POTENTIAL_OPENER);
  3501                         /* Push a dummy as a reserve for a closer. */
  3502                         PUSH_MARK('D', off, off, 0);
  3503                         off += 1 + suffix_size;
  3504                         break;
  3505                     }
  3506                 }
  3507 
  3508                 off++;
  3509                 continue;
  3510             }
  3511 
  3512             /* A potential permissive WWW autolink. */
  3513             if(ch == _T('.')) {
  3514                 if(line->beg + 3 <= off  &&  md_ascii_eq(STR(off-3), _T("www"), 3)  &&
  3515                     (line->beg + 3 == off || ISWHITESPACE(off-4) || ISANYOF(off-4, _T("*_~([")))  &&
  3516                     off + 1 < line_end)
  3517                 {
  3518                     PUSH_MARK(ch, off-3, off+1, MD_MARK_POTENTIAL_OPENER);
  3519                     /* Push a dummy as a reserve for a closer. */
  3520                     PUSH_MARK('D', off, off, 0);
  3521                     off++;
  3522                     continue;
  3523                 }
  3524 
  3525                 off++;
  3526                 continue;
  3527             }
  3528 
  3529             /* A potential table cell boundary or wiki link label delimiter. */
  3530             if((table_mode || ctx->parser.flags & MD_FLAG_WIKILINKS) && ch == _T('|')) {
  3531                 PUSH_MARK(ch, off, off+1, 0);
  3532                 off++;
  3533                 continue;
  3534             }
  3535 
  3536             /* A potential strikethrough start/end. */
  3537             if(ch == _T('~')) {
  3538                 OFF tmp = off+1;
  3539 
  3540                 while(tmp < line_end  &&  CH(tmp) == _T('~'))
  3541                     tmp++;
  3542 
  3543                 if(tmp - off < 3) {
  3544                     unsigned flags = 0;
  3545 
  3546                     if(tmp < line_end  &&  !ISUNICODEWHITESPACE(tmp))
  3547                         flags |= MD_MARK_POTENTIAL_OPENER;
  3548                     if(off > line->beg  &&  !ISUNICODEWHITESPACEBEFORE(off))
  3549                         flags |= MD_MARK_POTENTIAL_CLOSER;
  3550                     if(flags != 0)
  3551                         PUSH_MARK(ch, off, tmp, flags);
  3552                 }
  3553 
  3554                 off = tmp;
  3555                 continue;
  3556             }
  3557 
  3558             /* A potential equation start/end */
  3559             if(ch == _T('$')) {
  3560                 /* We can have at most two consecutive $ signs,
  3561                  * where two dollar signs signify a display equation. */
  3562                 OFF tmp = off+1;
  3563 
  3564                 while(tmp < line_end && CH(tmp) == _T('$'))
  3565                     tmp++;
  3566 
  3567                 if (tmp - off <= 2)
  3568                     PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER | MD_MARK_POTENTIAL_CLOSER);
  3569                 off = tmp;
  3570                 continue;
  3571             }
  3572 
  3573             /* Turn non-trivial whitespace into single space. */
  3574             if(ISWHITESPACE_(ch)) {
  3575                 OFF tmp = off+1;
  3576 
  3577                 while(tmp < line_end  &&  ISWHITESPACE(tmp))
  3578                     tmp++;
  3579 
  3580                 if(tmp - off > 1  ||  ch != _T(' '))
  3581                     PUSH_MARK(ch, off, tmp, MD_MARK_RESOLVED);
  3582 
  3583                 off = tmp;
  3584                 continue;
  3585             }
  3586 
  3587             /* NULL character. */
  3588             if(ch == _T('\0')) {
  3589                 PUSH_MARK(ch, off, off+1, MD_MARK_RESOLVED);
  3590                 off++;
  3591                 continue;
  3592             }
  3593 
  3594             off++;
  3595         }
  3596     }
  3597 
  3598     /* Add a dummy mark at the end of the mark vector to simplify
  3599      * process_inlines(). */
  3600     PUSH_MARK(127, ctx->size, ctx->size, MD_MARK_RESOLVED);
  3601 
  3602 abort:
  3603     return ret;
  3604 }
  3605 
  3606 static void
  3607 md_analyze_bracket(MD_CTX* ctx, int mark_index)
  3608 {
  3609     /* We cannot really resolve links here as for that we would need
  3610      * more context. E.g. a following pair of brackets (reference link),
  3611      * or enclosing pair of brackets (if the inner is the link, the outer
  3612      * one cannot be.)
  3613      *
  3614      * Therefore we here only construct a list of '[' ']' pairs ordered by
  3615      * position of the closer. This allows us to analyze what is or is not
  3616      * link in the right order, from inside to outside in case of nested
  3617      * brackets.
  3618      *
  3619      * The resolving itself is deferred to md_resolve_links().
  3620      */
  3621 
  3622     MD_MARK* mark = &ctx->marks[mark_index];
  3623 
           /* An opener is only remembered in the BRACKET_OPENERS chain. If some
            * opener is already pending, that one (the current tail) gets flagged
            * as having nested brackets. */
  3624     if(mark->flags & MD_MARK_POTENTIAL_OPENER) {
  3625         if(BRACKET_OPENERS.head != -1)
  3626             ctx->marks[BRACKET_OPENERS.tail].flags |= MD_MARK_HASNESTEDBRACKETS;
  3627 
  3628         md_mark_chain_append(ctx, &BRACKET_OPENERS, mark_index);
  3629         return;
  3630     }
  3631 
           /* Otherwise the mark is handled as a closer and paired with the most
            * recently pushed pending opener. When no opener is pending, nothing
            * is done at all. */
  3632     if(BRACKET_OPENERS.tail >= 0) {
  3633         /* Pop the opener from the chain. */
  3634         int opener_index = BRACKET_OPENERS.tail;
  3635         MD_MARK* opener = &ctx->marks[opener_index];
  3636         if(opener->prev >= 0)
  3637             ctx->marks[opener->prev].next = -1;
  3638         else
  3639             BRACKET_OPENERS.head = -1;
  3640         BRACKET_OPENERS.tail = opener->prev;
  3641 
  3642         /* Interconnect the opener and closer. */
  3643         opener->next = mark_index;
  3644         mark->prev = opener_index;
  3645 
  3646         /* Add the pair into chain of potential links for md_resolve_links().
  3647          * Note we misuse opener->prev for this as opener->next points to its
  3648          * closer. */
               /* I.e. the list is singly linked through the 'prev' members of the
                * openers, starting at ctx->unresolved_link_head; the newest pair
                * is appended at the tail and terminates the list with -1. */
  3649         if(ctx->unresolved_link_tail >= 0)
  3650             ctx->marks[ctx->unresolved_link_tail].prev = opener_index;
  3651         else
  3652             ctx->unresolved_link_head = opener_index;
  3653         ctx->unresolved_link_tail = opener_index;
  3654         opener->prev = -1;
  3655     }
  3656 }
  3657 
  3658 /* Forward declaration. */
  3659 static void md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
  3660                                      int mark_beg, int mark_end);
  3661 
  3662 static int
  3663 md_resolve_links(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
  3664 {
  3665     int opener_index = ctx->unresolved_link_head;
  3666     OFF last_link_beg = 0;
  3667     OFF last_link_end = 0;
  3668     OFF last_img_beg = 0;
  3669     OFF last_img_end = 0;
  3670 
  3671     while(opener_index >= 0) {
  3672         MD_MARK* opener = &ctx->marks[opener_index];
  3673         int closer_index = opener->next;
  3674         MD_MARK* closer = &ctx->marks[closer_index];
  3675         int next_index = opener->prev;
  3676         MD_MARK* next_opener;
  3677         MD_MARK* next_closer;
  3678         MD_LINK_ATTR attr;
  3679         int is_link = FALSE;
  3680 
  3681         if(next_index >= 0) {
  3682             next_opener = &ctx->marks[next_index];
  3683             next_closer = &ctx->marks[next_opener->next];
  3684         } else {
  3685             next_opener = NULL;
  3686             next_closer = NULL;
  3687         }
  3688 
  3689         /* If nested ("[ [ ] ]"), we need to make sure that:
  3690          *   - The outer does not end inside of (...) belonging to the inner.
  3691          *   - The outer cannot be link if the inner is link (i.e. not image).
  3692          *
  3693          * (Note we here analyze from inner to outer as the marks are ordered
  3694          * by closer->beg.)
  3695          */
  3696         if((opener->beg < last_link_beg  &&  closer->end < last_link_end)  ||
  3697            (opener->beg < last_img_beg  &&  closer->end < last_img_end)  ||
  3698            (opener->beg < last_link_end  &&  opener->ch == '['))
  3699         {
  3700             opener_index = next_index;
  3701             continue;
  3702         }
  3703 
  3704         /* Recognize and resolve wiki links.
  3705          * Wiki-links maybe '[[destination]]' or '[[destination|label]]'.
  3706          */
  3707         if ((ctx->parser.flags & MD_FLAG_WIKILINKS) &&
  3708             (opener->end - opener->beg == 1) &&         /* not image */
  3709             next_opener != NULL &&                      /* double '[' opener */
  3710             next_opener->ch == '[' &&
  3711             (next_opener->beg == opener->beg - 1) &&
  3712             (next_opener->end - next_opener->beg == 1) &&
  3713             next_closer != NULL &&                      /* double ']' closer */
  3714             next_closer->ch == ']' &&
  3715             (next_closer->beg == closer->beg + 1) &&
  3716             (next_closer->end - next_closer->beg == 1))
  3717         {
  3718             MD_MARK* delim = NULL;
  3719             int delim_index;
  3720             OFF dest_beg, dest_end;
  3721 
  3722             is_link = TRUE;
  3723 
  3724             /* We don't allow destination to be longer than 100 characters.
  3725              * Lets scan to see whether there is '|'. (If not then the whole
  3726              * wiki-link has to be below the 100 characters.) */
  3727             delim_index = opener_index + 1;
  3728             while(delim_index < closer_index) {
  3729                 MD_MARK* m = &ctx->marks[delim_index];
  3730                 if(m->ch == '|') {
  3731                     delim = m;
  3732                     break;
  3733                 }
  3734                 if(m->ch != 'D'  &&  m->beg - opener->end > 100)
  3735                     break;
  3736                 delim_index++;
  3737             }
  3738             dest_beg = opener->end;
  3739             dest_end = (delim != NULL) ? delim->beg : closer->beg;
  3740             if(dest_end - dest_beg == 0 || dest_end - dest_beg > 100)
  3741                 is_link = FALSE;
  3742 
  3743             /* There may not be any new line in the destination. */
  3744             if(is_link) {
  3745                 OFF off;
  3746                 for(off = dest_beg; off < dest_end; off++) {
  3747                     if(ISNEWLINE(off)) {
  3748                         is_link = FALSE;
  3749                         break;
  3750                     }
  3751                 }
  3752             }
  3753 
  3754             if(is_link) {
  3755                 if(delim != NULL) {
  3756                     if(delim->end < closer->beg) {
  3757                         md_rollback(ctx, opener_index, delim_index, MD_ROLLBACK_ALL);
  3758                         md_rollback(ctx, delim_index, closer_index, MD_ROLLBACK_CROSSING);
  3759                         delim->flags |= MD_MARK_RESOLVED;
  3760                         opener->end = delim->beg;
  3761                     } else {
  3762                         /* The pipe is just before the closer: [[foo|]] */
  3763                         md_rollback(ctx, opener_index, closer_index, MD_ROLLBACK_ALL);
  3764                         closer->beg = delim->beg;
  3765                         delim = NULL;
  3766                     }
  3767                 }
  3768 
  3769                 opener->beg = next_opener->beg;
  3770                 opener->next = closer_index;
  3771                 opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
  3772 
  3773                 closer->end = next_closer->end;
  3774                 closer->prev = opener_index;
  3775                 closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
  3776 
  3777                 last_link_beg = opener->beg;
  3778                 last_link_end = closer->end;
  3779 
  3780                 if(delim != NULL)
  3781                     md_analyze_link_contents(ctx, lines, n_lines, delim_index+1, closer_index);
  3782 
  3783                 opener_index = next_opener->prev;
  3784                 continue;
  3785             }
  3786         }
  3787 
  3788         if(next_opener != NULL  &&  next_opener->beg == closer->end) {
  3789             if(next_closer->beg > closer->end + 1) {
  3790                 /* Might be full reference link. */
  3791                 if(!(next_opener->flags & MD_MARK_HASNESTEDBRACKETS))
  3792                     is_link = md_is_link_reference(ctx, lines, n_lines, next_opener->beg, next_closer->end, &attr);
  3793             } else {
  3794                 /* Might be shortcut reference link. */
  3795                 if(!(opener->flags & MD_MARK_HASNESTEDBRACKETS))
  3796                     is_link = md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr);
  3797             }
  3798 
  3799             if(is_link < 0)
  3800                 return -1;
  3801 
  3802             if(is_link) {
  3803                 /* Eat the 2nd "[...]". */
  3804                 closer->end = next_closer->end;
  3805 
  3806                 /* Do not analyze the label as a standalone link in the next
  3807                  * iteration. */
  3808                 next_index = ctx->marks[next_index].prev;
  3809             }
  3810         } else {
  3811             if(closer->end < ctx->size  &&  CH(closer->end) == _T('(')) {
  3812                 /* Might be inline link. */
  3813                 OFF inline_link_end = UINT_MAX;
  3814 
  3815                 is_link = md_is_inline_link_spec(ctx, lines, n_lines, closer->end, &inline_link_end, &attr);
  3816                 if(is_link < 0)
  3817                     return -1;
  3818 
  3819                 /* Check the closing ')' is not inside an already resolved range
  3820                  * (i.e. a range with a higher priority), e.g. a code span. */
  3821                 if(is_link) {
  3822                     int i = closer_index + 1;
  3823 
  3824                     while(i < ctx->n_marks) {
  3825                         MD_MARK* mark = &ctx->marks[i];
  3826 
  3827                         if(mark->beg >= inline_link_end)
  3828                             break;
  3829                         if((mark->flags & (MD_MARK_OPENER | MD_MARK_RESOLVED)) == (MD_MARK_OPENER | MD_MARK_RESOLVED)) {
  3830                             if(ctx->marks[mark->next].beg >= inline_link_end) {
  3831                                 /* Cancel the link status. */
  3832                                 if(attr.title_needs_free)
  3833                                     free(attr.title);
  3834                                 is_link = FALSE;
  3835                                 break;
  3836                             }
  3837 
  3838                             i = mark->next + 1;
  3839                         } else {
  3840                             i++;
  3841                         }
  3842                     }
  3843                 }
  3844 
  3845                 if(is_link) {
  3846                     /* Eat the "(...)" */
  3847                     closer->end = inline_link_end;
  3848                 }
  3849             }
  3850 
  3851             if(!is_link) {
  3852                 /* Might be collapsed reference link. */
  3853                 if(!(opener->flags & MD_MARK_HASNESTEDBRACKETS))
  3854                     is_link = md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr);
  3855                 if(is_link < 0)
  3856                     return -1;
  3857             }
  3858         }
  3859 
  3860         if(is_link) {
  3861             /* Resolve the brackets as a link. */
  3862             opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
  3863             closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
  3864 
  3865             /* If it is a link, we store the destination and title in the two
  3866              * dummy marks after the opener. */
  3867             MD_ASSERT(ctx->marks[opener_index+1].ch == 'D');
  3868             ctx->marks[opener_index+1].beg = attr.dest_beg;
  3869             ctx->marks[opener_index+1].end = attr.dest_end;
  3870 
  3871             MD_ASSERT(ctx->marks[opener_index+2].ch == 'D');
  3872             md_mark_store_ptr(ctx, opener_index+2, attr.title);
  3873             /* The title might or might not have been allocated for us. */
  3874             if(attr.title_needs_free)
  3875                 md_mark_chain_append(ctx, &PTR_CHAIN, opener_index+2);
  3876             ctx->marks[opener_index+2].prev = attr.title_size;
  3877 
  3878             if(opener->ch == '[') {
  3879                 last_link_beg = opener->beg;
  3880                 last_link_end = closer->end;
  3881             } else {
  3882                 last_img_beg = opener->beg;
  3883                 last_img_end = closer->end;
  3884             }
  3885 
  3886             md_analyze_link_contents(ctx, lines, n_lines, opener_index+1, closer_index);
  3887 
  3888             /* If the link text is formed by nothing but permissive autolink,
  3889              * suppress the autolink.
  3890              * See https://github.com/mity/md4c/issues/152 for more info. */
  3891             if(ctx->parser.flags & MD_FLAG_PERMISSIVEAUTOLINKS) {
  3892                 MD_MARK* first_nested;
  3893                 MD_MARK* last_nested;
  3894 
  3895                 first_nested = opener + 1;
  3896                 while(first_nested->ch == _T('D')  &&  first_nested < closer)
  3897                     first_nested++;
  3898 
  3899                 last_nested = closer - 1;
  3900                 while(first_nested->ch == _T('D')  &&  last_nested > opener)
  3901                     last_nested--;
  3902 
  3903                 if((first_nested->flags & MD_MARK_RESOLVED)  &&
  3904                    first_nested->beg == opener->end  &&
  3905                    ISANYOF_(first_nested->ch, _T("@:."))  &&
  3906                    first_nested->next == (last_nested - ctx->marks)  &&
  3907                    last_nested->end == closer->beg)
  3908                 {
  3909                     first_nested->ch = _T('D');
  3910                     first_nested->flags &= ~MD_MARK_RESOLVED;
  3911                     last_nested->ch = _T('D');
  3912                     last_nested->flags &= ~MD_MARK_RESOLVED;
  3913                 }
  3914             }
  3915         }
  3916 
  3917         opener_index = next_index;
  3918     }
  3919 
  3920     return 0;
  3921 }
  3922 
  3923 /* Analyze whether the mark '&' starts an HTML entity.
  3924  * If so, update its flags as well as flags of corresponding closer ';'.
        *
        * ctx        - parser context owning the ctx->marks array.
        * mark_index - index of the '&' mark inside ctx->marks.
        *
        * When md_is_entity() accepts the text between opener->beg and
        * closer->end, the two marks are resolved as a pair and the opener is
        * stretched to closer->end. Otherwise nothing is modified. */
  3925 static void
  3926 md_analyze_entity(MD_CTX* ctx, int mark_index)
  3927 {
  3928     MD_MARK* opener = &ctx->marks[mark_index];
  3929     MD_MARK* closer;
  3930     OFF off;
  3931 
  3932     /* Cannot be entity if there is no closer as the next mark.
  3933      * (Any other mark between would mean strange character which cannot be
  3934      * part of the entity.)
  3935      *
  3936      * So we can do all the work on '&' and do not call this later for the
  3937      * closing mark ';'.
  3938      */
  3939     if(mark_index + 1 >= ctx->n_marks)
  3940         return;
  3941     closer = &ctx->marks[mark_index+1];
  3942     if(closer->ch != ';')
  3943         return;
  3944 
  3945     if(md_is_entity(ctx, opener->beg, closer->end, &off)) {
  3946         MD_ASSERT(off == closer->end);
  3947 
  3948         md_resolve_range(ctx, NULL, mark_index, mark_index+1);
               /* Let the opener mark span the whole "&...;" text. */
  3949         opener->end = closer->end;
  3950     }
  3951 }
  3952 
       /* Register the mark at mark_index as a table cell boundary: flag it as
        * resolved, append it to the TABLECELLBOUNDARIES chain and increment
        * ctx->n_table_cell_boundaries. */
  3953 static void
  3954 md_analyze_table_cell_boundary(MD_CTX* ctx, int mark_index)
  3955 {
  3956     MD_MARK* mark = &ctx->marks[mark_index];
  3957     mark->flags |= MD_MARK_RESOLVED;
  3958 
  3959     md_mark_chain_append(ctx, &TABLECELLBOUNDARIES, mark_index);
  3960     ctx->n_table_cell_boundaries++;
  3961 }
  3962 
  3963 /* Split a longer mark into two. The new mark takes the given count of
  3964  * characters. May only be called if an adequate number of dummy 'D' marks
  3965  * follows.
        *
        * ctx        - parser context owning the ctx->marks array.
        * mark_index - index of the mark to split; it must be longer than n.
        * n          - number of trailing characters handed over to the new mark.
        *
        * The original mark keeps its first (length - n) characters. The new mark
        * is written into the dummy slot at mark_index + (length - n), starts as
        * a byte copy of the original (so it inherits all of its other fields)
        * and covers the last n characters.
        *
        * Returns the index of the new mark.
  3966  */
  3967 static int
  3968 md_split_emph_mark(MD_CTX* ctx, int mark_index, SZ n)
  3969 {
  3970     MD_MARK* mark = &ctx->marks[mark_index];
  3971     int new_mark_index = mark_index + (mark->end - mark->beg - n);
  3972     MD_MARK* dummy = &ctx->marks[new_mark_index];
  3973 
  3974     MD_ASSERT(mark->end - mark->beg > n);
  3975     MD_ASSERT(dummy->ch == 'D');
  3976 
           /* Clone first, then move the shared boundary: the clone keeps the old
            * end, the original gives up its last n characters. */
  3977     memcpy(dummy, mark, sizeof(MD_MARK));
  3978     mark->end -= n;
  3979     dummy->beg = mark->end;
  3980 
  3981     return new_mark_index;
  3982 }
  3983 
       /* Analyze an emphasis mark (dispatched for '*' and '_').
        *
        * ctx        - parser context owning the ctx->marks array.
        * mark_index - index of the mark to analyze.
        *
        * If the mark may close and a suitable opener is waiting in a chain, the
        * longer of the two marks is split so both have the same length, marks
        * crossing the pair are rolled back and the pair is resolved. Otherwise,
        * if the mark may open, it is appended to its own chain so that a later
        * closer can find it. */
  3984 static void
  3985 md_analyze_emph(MD_CTX* ctx, int mark_index)
  3986 {
  3987     MD_MARK* mark = &ctx->marks[mark_index];
  3988     MD_MARKCHAIN* chain = md_mark_chain(ctx, mark_index);
  3989 
  3990     /* If we can be a closer, try to resolve with the preceding opener. */
  3991     if(mark->flags & MD_MARK_POTENTIAL_CLOSER) {
  3992         MD_MARK* opener = NULL;
  3993         int opener_index = 0;
  3994 
  3995         if(mark->ch == _T('*')) {
  3996             MD_MARKCHAIN* opener_chains[6];
  3997             int i, n_opener_chains;
  3998             unsigned flags = mark->flags;
  3999 
  4000             /* Apply the "rule of three": build the list of opener chains
                    * this closer may pair with. The mod3_0 chains are always
                    * eligible; the mod3_1 and mod3_2 chains are filtered by the
                    * closer's own MOD3 bits and, for the extraword chains, by
                    * whether the closer is intraword. */
  4001             n_opener_chains = 0;
  4002             opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_0;
  4003             if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
  4004                 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_1;
  4005             if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
  4006                 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_2;
  4007             opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_0;
  4008             if(!(flags & MD_MARK_EMPH_INTRAWORD)  ||  (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
  4009                 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_1;
  4010             if(!(flags & MD_MARK_EMPH_INTRAWORD)  ||  (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
  4011                 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_2;
  4012 
  4013             /* Opener is the most recent mark from the allowed chains, i.e.
                    * the chain tail with the greatest end offset. */
  4014             for(i = 0; i < n_opener_chains; i++) {
  4015                 if(opener_chains[i]->tail >= 0) {
  4016                     int tmp_index = opener_chains[i]->tail;
  4017                     MD_MARK* tmp_mark = &ctx->marks[tmp_index];
  4018                     if(opener == NULL  ||  tmp_mark->end > opener->end) {
  4019                         opener_index = tmp_index;
  4020                         opener = tmp_mark;
  4021                     }
  4022                 }
  4023             }
  4024         } else {
  4025             /* Simple emph. mark: only the tail of the mark's own chain is
                    * considered. */
  4026             if(chain->tail >= 0) {
  4027                 opener_index = chain->tail;
  4028                 opener = &ctx->marks[opener_index];
  4029             }
  4030         }
  4031 
  4032         /* Resolve, if we have found matching opener. */
  4033         if(opener != NULL) {
  4034             SZ opener_size = opener->end - opener->beg;
  4035             SZ closer_size = mark->end - mark->beg;
                   /* Looked up before any split, i.e. for the original opener. */
  4036             MD_MARKCHAIN* opener_chain = md_mark_chain(ctx, opener_index);
  4037 
                   /* Equalize the lengths: split whichever mark is longer. When
                    * the opener is split, its new tail part becomes the opener
                    * of this pair and is appended to the opener's chain. */
  4038             if(opener_size > closer_size) {
  4039                 opener_index = md_split_emph_mark(ctx, opener_index, closer_size);
  4040                 md_mark_chain_append(ctx, opener_chain, opener_index);
  4041             } else if(opener_size < closer_size) {
  4042                 md_split_emph_mark(ctx, mark_index, closer_size - opener_size);
  4043             }
  4044 
  4045             md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_CROSSING);
  4046             md_resolve_range(ctx, opener_chain, opener_index, mark_index);
  4047             return;
  4048         }
  4049     }
  4050 
  4051     /* If we could not resolve as closer, we may yet be an opener. */
  4052     if(mark->flags & MD_MARK_POTENTIAL_OPENER)
  4053         md_mark_chain_append(ctx, chain, mark_index);
  4054 }
  4055 
       /* Analyze a '~' mark (strikethrough).
        *
        * ctx        - parser context owning the ctx->marks array.
        * mark_index - index of the '~' mark to analyze.
        *
        * A potential closer is paired with the mark at the head of its chain, if
        * the chain is not empty. Otherwise a potential opener is appended to the
        * chain. */
  4056 static void
  4057 md_analyze_tilde(MD_CTX* ctx, int mark_index)
  4058 {
  4059     MD_MARK* mark = &ctx->marks[mark_index];
  4060     MD_MARKCHAIN* chain = md_mark_chain(ctx, mark_index);
  4061 
  4062     /* We attempt to be Github Flavored Markdown compatible here. GFM accepts
  4063      * only tildes sequences of length 1 and 2, and the length of the opener
  4064      * and closer has to match.
            *
            * NOTE(review): no length comparison happens in this function, so
            * the matching presumably relies on md_mark_chain() returning a
            * separate chain per tilde length - confirm there. */
  4065 
  4066     if((mark->flags & MD_MARK_POTENTIAL_CLOSER)  &&  chain->head >= 0) {
  4067         int opener_index = chain->head;
  4068 
  4069         md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_CROSSING);
  4070         md_resolve_range(ctx, chain, opener_index, mark_index);
  4071         return;
  4072     }
  4073 
  4074     if(mark->flags & MD_MARK_POTENTIAL_OPENER)
  4075         md_mark_chain_append(ctx, chain, mark_index);
  4076 }
  4077 
       /* Analyze a '$' mark (LaTeX math span delimiter).
        *
        * ctx        - parser context owning the ctx->marks array.
        * mark_index - index of the '$' mark to analyze.
        *
        * If DOLLAR_OPENERS holds an opener of the same length, the pair is
        * resolved. In every other case the mark is appended to DOLLAR_OPENERS. */
  4078 static void
  4079 md_analyze_dollar(MD_CTX* ctx, int mark_index)
  4080 {
  4081     /* This should mimic the way inline equations work in LaTeX, so there
  4082      * can only ever be one item in the chain (i.e. the dollars can't be
  4083      * nested). This is basically the same as the md_analyze_tilde function,
  4084      * except that we require matching openers and closers to be of the same
  4085      * length.
  4086      *
  4087      * E.g.: $abc$def$ => abc (display equation) def (end equation) */
  4088     if(DOLLAR_OPENERS.head >= 0) {
  4089         /* If the potential closer has a non-matching number of $, discard */
  4090         MD_MARK* open = &ctx->marks[DOLLAR_OPENERS.head];
  4091         MD_MARK* close = &ctx->marks[mark_index];
  4092 
  4093         int opener_index = DOLLAR_OPENERS.head;
               /* Note the rollback happens before the length check, so it runs
                * even when the lengths turn out not to match. */
  4094         md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_ALL);
  4095         if (open->end - open->beg == close->end - close->beg) {
  4096             /* We are the matching closer */
  4097             md_resolve_range(ctx, &DOLLAR_OPENERS, opener_index, mark_index);
  4098             return;
  4099         }
  4100     }
  4101 
           /* No opener pending, or its length differs: queue this mark. */
  4102     md_mark_chain_append(ctx, &DOLLAR_OPENERS, mark_index);
  4103 }
  4104 
       /* Analyze whether the mark at mark_index starts a permissive URL
        * autolink (dispatched for the '.' and ':' marks).
        *
        * ctx        - parser context owning the ctx->marks array.
        * mark_index - index of the candidate opener; the slot right after it
        *              (mark_index + 1) is reused as the closer.
        *
        * Scanning starts at opener->end: first a domain, then an optional path,
        * then trailing punctuation is trimmed. On success the opener collapses
        * to zero length at opener->beg, the closer becomes a zero-length mark at
        * the end of the URL, and the pair is resolved. On failure the function
        * returns without modifying any mark. */
  4105 static void
  4106 md_analyze_permissive_url_autolink(MD_CTX* ctx, int mark_index)
  4107 {
  4108     MD_MARK* opener = &ctx->marks[mark_index];
  4109     int closer_index = mark_index + 1;
  4110     MD_MARK* closer = &ctx->marks[closer_index];
  4111     MD_MARK* next_resolved_mark;
  4112     OFF off = opener->end;
           /* NOTE(review): n_dots is a counter (see n_dots++ below) although it
            * is initialized with the boolean macro FALSE. */
  4113     int n_dots = FALSE;
  4114     int has_underscore_in_last_seg = FALSE;
  4115     int has_underscore_in_next_to_last_seg = FALSE;
  4116     int n_opened_parenthesis = 0;
  4117     int n_excess_parenthesis = 0;
  4118 
  4119     /* Check for domain: alphanumerics, '-', '.' and '_'. */
  4120     while(off < ctx->size) {
  4121         if(ISALNUM(off) || CH(off) == _T('-')) {
  4122             off++;
  4123         } else if(CH(off) == _T('.')) {
  4124             /* We must see at least one period. Each period starts a new
                    * segment, so the underscore flags shift by one segment. */
  4125             n_dots++;
  4126             has_underscore_in_next_to_last_seg = has_underscore_in_last_seg;
  4127             has_underscore_in_last_seg = FALSE;
  4128             off++;
  4129         } else if(CH(off) == _T('_')) {
  4130             /* No underscore may be present in the last two domain segments. */
  4131             has_underscore_in_last_seg = TRUE;
  4132             off++;
  4133         } else {
  4134             break;
  4135         }
  4136     }
           /* A trailing period is not part of the domain. */
  4137     if(off > opener->end  &&  CH(off-1) == _T('.')) {
  4138         off--;
  4139         n_dots--;
  4140     }
  4141     if(off <= opener->end || n_dots == 0 || has_underscore_in_next_to_last_seg || has_underscore_in_last_seg)
  4142         return;
  4143 
  4144     /* Check for path. The path may not run into the next resolved mark.
            * NOTE(review): this scan has no upper bound of its own; it presumably
            * relies on a resolved terminator mark always following - confirm. */
  4145     next_resolved_mark = closer + 1;
  4146     while(next_resolved_mark->ch == 'D' || !(next_resolved_mark->flags & MD_MARK_RESOLVED))
  4147         next_resolved_mark++;
  4148     while(off < next_resolved_mark->beg  &&  CH(off) != _T('<')  &&  !ISWHITESPACE(off)  &&  !ISNEWLINE(off)) {
  4149         /* Parenthesis must be balanced. */
  4150         if(CH(off) == _T('(')) {
  4151             n_opened_parenthesis++;
  4152         } else if(CH(off) == _T(')')) {
  4153             if(n_opened_parenthesis > 0)
  4154                 n_opened_parenthesis--;
  4155             else
  4156                 n_excess_parenthesis++;
  4157         }
  4158 
  4159         off++;
  4160     }
  4161 
  4162     /* Trim a trailing punctuation from the end. */
  4163     while(TRUE) {
  4164         if(ISANYOF(off-1, _T("?!.,:*_~"))) {
  4165             off--;
  4166         } else if(CH(off-1) == ')'  &&  n_excess_parenthesis > 0) {
  4167             /* Unmatched ')' can be in an interior of the path but not at the
  4168              * end of it, so the auto-link may be safely nested in a
  4169              * parenthesis pair. */
  4170             off--;
  4171             n_excess_parenthesis--;
  4172         } else {
  4173             break;
  4174         }
  4175     }
  4176 
  4177     /* Ok. Lets call it an auto-link. Adapt opener and create closer to zero
  4178      * length so all the contents becomes the link text. */
  4179     MD_ASSERT(closer->ch == 'D' ||
  4180               ((ctx->parser.flags & MD_FLAG_PERMISSIVEWWWAUTOLINKS) &&
  4181                (closer->ch == '.' || closer->ch == ':' || closer->ch == '@')));
  4182     opener->end = opener->beg;
  4183     closer->ch = opener->ch;
  4184     closer->beg = off;
  4185     closer->end = off;
  4186     md_resolve_range(ctx, NULL, mark_index, closer_index);
  4187 }
  4188 
  4189 /* The permissive autolinks do not have to be enclosed in '<' '>' but we
  4190  * instead impose stricter rules what is understood as an e-mail address
  4191  * here. Actually any non-alphanumeric characters with exception of '.'
  4192  * are prohibited both in username and after '@'.
        *
        * ctx        - parser context owning the ctx->marks array.
        * mark_index - index of the '@' mark; the slot right after it
        *              (mark_index + 1) is reused as the closer and must be a
        *              dummy 'D' mark, otherwise nothing is resolved.
        *
        * The characters actually accepted by the scans below are alphanumerics
        * plus ".-_+" before the '@' and alphanumerics plus ".-_" after it.
        *
        * On success the opener becomes a zero-length mark at the start of the
        * user name, the closer a zero-length mark at the end of the domain, and
        * the pair is resolved. */
  4193 static void
  4194 md_analyze_permissive_email_autolink(MD_CTX* ctx, int mark_index)
  4195 {
  4196     MD_MARK* opener = &ctx->marks[mark_index];
  4197     int closer_index;
  4198     MD_MARK* closer;
  4199     OFF beg = opener->beg;
  4200     OFF end = opener->end;
  4201     int dot_count = 0;
  4202 
  4203     MD_ASSERT(opener->ch == _T('@'));
  4204 
  4205     /* Scan for name before '@'. */
  4206     while(beg > 0  &&  (ISALNUM(beg-1) || ISANYOF(beg-1, _T(".-_+"))))
  4207         beg--;
  4208 
  4209     /* Scan for domain after '@'. */
  4210     while(end < ctx->size  &&  (ISALNUM(end) || ISANYOF(end, _T(".-_")))) {
  4211         if(CH(end) == _T('.'))
  4212             dot_count++;
  4213         end++;
  4214     }
  4215     if(CH(end-1) == _T('.')) {  /* Final '.' not part of it. */
  4216         dot_count--;
  4217         end--;
  4218     }
  4219     else if(ISANYOF2(end-1, _T('-'), _T('_'))) /* These are forbidden at the end. */
  4220         return;
           /* Reject an empty domain and a domain without any period. */
  4221     if(CH(end-1) == _T('@')  ||  dot_count == 0)
  4222         return;
  4223 
  4224     /* Ok. Lets call it auto-link. Adapt opener and create closer to zero
  4225      * length so all the contents becomes the link text. */
  4226     closer_index = mark_index + 1;
  4227     closer = &ctx->marks[closer_index];
  4228     if (closer->ch != 'D') return;
  4229 
  4230     opener->beg = beg;
  4231     opener->end = beg;
  4232     closer->ch = opener->ch;
  4233     closer->beg = end;
  4234     closer->end = end;
  4235     md_resolve_range(ctx, NULL, mark_index, closer_index);
  4236 }
  4237 
       /* Walk the marks in [mark_beg, mark_end) and dispatch every unresolved
        * mark whose character is listed in mark_chars to its analyzer.
        *
        * ctx        - parser context owning the ctx->marks array.
        * lines      - unused here.
        * n_lines    - unused here.
        * mark_beg   - index of the first mark to visit.
        * mark_end   - index one past the last mark to visit.
        * mark_chars - string of the mark characters to handle in this pass.
        *
        * An already resolved opener is skipped together with everything up to
        * and including its closer; other resolved marks are skipped one by one. */
  4238 static inline void
  4239 md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
  4240                  int mark_beg, int mark_end, const CHAR* mark_chars)
  4241 {
  4242     int i = mark_beg;
  4243     MD_UNUSED(lines);
  4244     MD_UNUSED(n_lines);
  4245 
  4246     while(i < mark_end) {
  4247         MD_MARK* mark = &ctx->marks[i];
  4248 
  4249         /* Skip resolved spans. */
  4250         if(mark->flags & MD_MARK_RESOLVED) {
  4251             if(mark->flags & MD_MARK_OPENER) {
  4252                 MD_ASSERT(i < mark->next);
  4253                 i = mark->next + 1;
  4254             } else {
  4255                 i++;
  4256             }
  4257             continue;
  4258         }
  4259 
  4260         /* Skip marks we do not want to deal with. */
  4261         if(!ISANYOF_(mark->ch, mark_chars)) {
  4262             i++;
  4263             continue;
  4264         }
  4265 
  4266         /* Analyze the mark. */
  4267         switch(mark->ch) {
  4268             case '[':   /* Pass through. */
  4269             case '!':   /* Pass through. */
  4270             case ']':   md_analyze_bracket(ctx, i); break;
  4271             case '&':   md_analyze_entity(ctx, i); break;
  4272             case '|':   md_analyze_table_cell_boundary(ctx, i); break;
  4273             case '_':   /* Pass through. */
  4274             case '*':   md_analyze_emph(ctx, i); break;
  4275             case '~':   md_analyze_tilde(ctx, i); break;
  4276             case '$':   md_analyze_dollar(ctx, i); break;
  4277             case '.':   /* Pass through. */
  4278             case ':':   md_analyze_permissive_url_autolink(ctx, i); break;
  4279             case '@':   md_analyze_permissive_email_autolink(ctx, i); break;
  4280         }
  4281 
  4282         i++;
  4283     }
  4284 }
  4285 
  4286 /* Analyze marks (build ctx->marks).
        *
        * ctx        - parser context; ctx->n_marks is reset and the marks are
        *              collected anew by md_collect_marks().
        * lines      - lines of the block being analyzed.
        * n_lines    - number of entries in lines (must be 1 in table mode).
        * table_mode - when non-zero, only links and '|' cell boundaries are
        *              analyzed and the function returns early; otherwise the
        *              remaining inline marks are analyzed over the whole range.
        *
        * Returns ret as left by the last MD_CHECK() - presumably 0 on success
        * and a negative value on failure, with MD_CHECK() jumping to the abort
        * label; confirm against the macro definition. */
  4287 static int
  4288 md_analyze_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
  4289 {
  4290     int ret;
  4291 
  4292     /* Reset the previously collected stack of marks. */
  4293     ctx->n_marks = 0;
  4294 
  4295     /* Collect all marks. */
  4296     MD_CHECK(md_collect_marks(ctx, lines, n_lines, table_mode));
  4297 
  4298     /* (1) Links. Afterwards the bracket chain and the unresolved-link list
            * are reset to empty. */
  4299     md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("[]!"));
  4300     MD_CHECK(md_resolve_links(ctx, lines, n_lines));
  4301     BRACKET_OPENERS.head = -1;
  4302     BRACKET_OPENERS.tail = -1;
  4303     ctx->unresolved_link_head = -1;
  4304     ctx->unresolved_link_tail = -1;
  4305 
  4306     if(table_mode) {
  4307         /* (2) Analyze table cell boundaries.
  4308          * Note we reset TABLECELLBOUNDARIES chain prior to the call md_analyze_marks(),
  4309          * not after, because caller may need it. */
  4310         MD_ASSERT(n_lines == 1);
  4311         TABLECELLBOUNDARIES.head = -1;
  4312         TABLECELLBOUNDARIES.tail = -1;
  4313         ctx->n_table_cell_boundaries = 0;
  4314         md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("|"));
  4315         return ret;
  4316     }
  4317 
  4318     /* (3) Emphasis and strong emphasis; permissive autolinks. */
  4319     md_analyze_link_contents(ctx, lines, n_lines, 0, ctx->n_marks);
  4320 
  4321 abort:
  4322     return ret;
  4323 }
  4324 
       /* Analyze the inline marks in [mark_beg, mark_end) that are not handled
        * by the link pass: first the '&' marks, then the "*_~$@:." marks in a
        * second pass. Afterwards every chain from OPENERS_CHAIN_FIRST to
        * OPENERS_CHAIN_LAST is reset to empty (head and tail set to -1).
        *
        * ctx                - parser context owning the marks and mark chains.
        * lines, n_lines     - passed through to md_analyze_marks().
        * mark_beg, mark_end - half-open range of mark indexes to analyze. */
  4325 static void
  4326 md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
  4327                          int mark_beg, int mark_end)
  4328 {
  4329     int i;
  4330 
  4331     md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("&"));
  4332     md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("*_~$@:."));
  4333 
  4334     for(i = OPENERS_CHAIN_FIRST; i <= OPENERS_CHAIN_LAST; i++) {
  4335         ctx->mark_chains[i].head = -1;
  4336         ctx->mark_chains[i].tail = -1;
  4337     }
  4338 }
  4339 
       /* Build the href and title attributes of a link-like span and report
        * entering or leaving that span.
        *
        * ctx                      - parser context.
        * enter                    - non-zero to report entering the span, zero
        *                            to report leaving it.
        * type                     - span type to report.
        * dest, dest_size          - raw destination text used for det.href.
        * prohibit_escapes_in_dest - when non-zero, the destination is built
        *                            with MD_BUILD_ATTR_NO_ESCAPES.
        * title, title_size        - raw title text used for det.title.
        *
        * Both attribute builds are released at the abort label on every path.
        * Returns ret, which starts at 0 - presumably MD_CHECK() and the
        * MD_ENTER_SPAN()/MD_LEAVE_SPAN() macros store a failure code in it and
        * jump to abort; confirm against the macro definitions. */
  4340 static int
  4341 md_enter_leave_span_a(MD_CTX* ctx, int enter, MD_SPANTYPE type,
  4342                       const CHAR* dest, SZ dest_size, int prohibit_escapes_in_dest,
  4343                       const CHAR* title, SZ title_size)
  4344 {
  4345     MD_ATTRIBUTE_BUILD href_build = { 0 };
  4346     MD_ATTRIBUTE_BUILD title_build = { 0 };
  4347     MD_SPAN_A_DETAIL det;
  4348     int ret = 0;
  4349 
  4350     /* Note we here rely on fact that MD_SPAN_A_DETAIL and
  4351      * MD_SPAN_IMG_DETAIL are binary-compatible. */
  4352     memset(&det, 0, sizeof(MD_SPAN_A_DETAIL));
  4353     MD_CHECK(md_build_attribute(ctx, dest, dest_size,
  4354                     (prohibit_escapes_in_dest ? MD_BUILD_ATTR_NO_ESCAPES : 0),
  4355                     &det.href, &href_build));
  4356     MD_CHECK(md_build_attribute(ctx, title, title_size, 0, &det.title, &title_build));
  4357 
  4358     if(enter)
  4359         MD_ENTER_SPAN(type, &det);
  4360     else
  4361         MD_LEAVE_SPAN(type, &det);
  4362 
  4363 abort:
  4364     md_free_attribute(ctx, &href_build);
  4365     md_free_attribute(ctx, &title_build);
  4366     return ret;
  4367 }
  4368 
       /* Build the target attribute of a wiki-link and report entering or
        * leaving the MD_SPAN_WIKILINK span.
        *
        * ctx                 - parser context.
        * enter               - non-zero to report entering the span, zero to
        *                       report leaving it.
        * target, target_size - raw target text used for det.target.
        *
        * The attribute build is released at the abort label on every path.
        * Returns ret, which starts at 0 - presumably the MD_CHECK(),
        * MD_ENTER_SPAN() and MD_LEAVE_SPAN() macros store a failure code in it
        * and jump to abort; confirm against the macro definitions. */
  4369 static int
  4370 md_enter_leave_span_wikilink(MD_CTX* ctx, int enter, const CHAR* target, SZ target_size)
  4371 {
  4372     MD_ATTRIBUTE_BUILD target_build = { 0 };
  4373     MD_SPAN_WIKILINK_DETAIL det;
  4374     int ret = 0;
  4375 
  4376     memset(&det, 0, sizeof(MD_SPAN_WIKILINK_DETAIL));
  4377     MD_CHECK(md_build_attribute(ctx, target, target_size, 0, &det.target, &target_build));
  4378 
  4379     if (enter)
  4380         MD_ENTER_SPAN(MD_SPAN_WIKILINK, &det);
  4381     else
  4382         MD_LEAVE_SPAN(MD_SPAN_WIKILINK, &det);
  4383 
  4384 abort:
  4385     md_free_attribute(ctx, &target_build);
  4386     return ret;
  4387 }
  4388 
  4389 
  4390 /* Render the output, accordingly to the analyzed ctx->marks. */
  4391 static int
  4392 md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
  4393 {
  4394     MD_TEXTTYPE text_type;
  4395     const MD_LINE* line = lines;
  4396     MD_MARK* prev_mark = NULL;
  4397     MD_MARK* mark;
  4398     OFF off = lines[0].beg;
  4399     OFF end = lines[n_lines-1].end;
  4400     int enforce_hardbreak = 0;
  4401     int ret = 0;
  4402 
  4403     /* Find first resolved mark. Note there is always at least one resolved
  4404      * mark,  the dummy last one after the end of the latest line we actually
  4405      * never really reach. This saves us of a lot of special checks and cases
  4406      * in this function. */
  4407     mark = ctx->marks;
  4408     while(!(mark->flags & MD_MARK_RESOLVED))
  4409         mark++;
  4410 
  4411     text_type = MD_TEXT_NORMAL;
  4412 
  4413     while(1) {
  4414         /* Process the text up to the next mark or end-of-line. */
  4415         OFF tmp = (line->end < mark->beg ? line->end : mark->beg);
  4416         if(tmp > off) {
  4417             MD_TEXT(text_type, STR(off), tmp - off);
  4418             off = tmp;
  4419         }
  4420 
  4421         /* If reached the mark, process it and move to next one. */
  4422         if(off >= mark->beg) {
  4423             switch(mark->ch) {
  4424                 case '\\':      /* Backslash escape. */
  4425                     if(ISNEWLINE(mark->beg+1))
  4426                         enforce_hardbreak = 1;
  4427                     else
  4428                         MD_TEXT(text_type, STR(mark->beg+1), 1);
  4429                     break;
  4430 
  4431                 case ' ':       /* Non-trivial space. */
  4432                     MD_TEXT(text_type, _T(" "), 1);
  4433                     break;
  4434 
  4435                 case '`':       /* Code span. */
  4436                     if(mark->flags & MD_MARK_OPENER) {
  4437                         MD_ENTER_SPAN(MD_SPAN_CODE, NULL);
  4438                         text_type = MD_TEXT_CODE;
  4439                     } else {
  4440                         MD_LEAVE_SPAN(MD_SPAN_CODE, NULL);
  4441                         text_type = MD_TEXT_NORMAL;
  4442                     }
  4443                     break;
  4444 
  4445                 case '-': /* faint */
  4446                     if(mark->flags & MD_MARK_OPENER) {
  4447                         MD_ENTER_SPAN(MD_SPAN_FNT, NULL);
  4448                     } else {
  4449                         MD_LEAVE_SPAN(MD_SPAN_FNT, NULL);
  4450                     }
  4451                     break;
  4452 
  4453                 case '%': /* inverse */
  4454                     if(mark->flags & MD_MARK_OPENER) {
  4455                         MD_ENTER_SPAN(MD_SPAN_INV, NULL);
  4456                     } else {
  4457                         MD_LEAVE_SPAN(MD_SPAN_INV, NULL);
  4458                     }
  4459                     break;
  4460 
  4461                 case '^': /* blink */
  4462                     if(mark->flags & MD_MARK_OPENER) {
  4463                         MD_ENTER_SPAN(MD_SPAN_BLI, NULL);
  4464                     } else {
  4465                         MD_LEAVE_SPAN(MD_SPAN_BLI, NULL);
  4466                     }
  4467                     break;
  4468 
  4469                 case '_':       /* Underline (or emphasis if we fall through). */
  4470                     if(ctx->parser.flags & MD_FLAG_UNDERLINE) {
  4471                         if(mark->flags & MD_MARK_OPENER) {
  4472                             /* while(off < mark->end) { */
  4473                             /*     MD_ENTER_SPAN(MD_SPAN_U, NULL); */
  4474                             /*     off++; */
  4475                             /* } */
  4476                             if((mark->end - off) % 2) {
  4477                                 MD_ENTER_SPAN(MD_SPAN_U, NULL);
  4478                                 off++;
  4479                             }
  4480                             while(off + 1 < mark->end) {
  4481                                 MD_ENTER_SPAN(MD_SPAN_STRONG, NULL);
  4482                                 off += 2;
  4483                             }
  4484                         } else {
  4485                             /* while(off < mark->end) { */
  4486                             /*     MD_LEAVE_SPAN(MD_SPAN_U, NULL); */
  4487                             /*     off++; */
  4488                             /* } */
  4489                             while(off + 1 < mark->end) {
  4490                                 MD_LEAVE_SPAN(MD_SPAN_STRONG, NULL);
  4491                                 off += 2;
  4492                             }
  4493                             if((mark->end - off) % 2) {
  4494                                 MD_LEAVE_SPAN(MD_SPAN_U, NULL);
  4495                                 off++;
  4496                             }
  4497                         }
  4498                         break;
  4499                     }
  4500                     MD_FALLTHROUGH();
  4501 
  4502                 case '*':       /* Emphasis, strong emphasis. */
  4503                     if(mark->flags & MD_MARK_OPENER) {
  4504                         if((mark->end - off) % 2) {
  4505                             MD_ENTER_SPAN(MD_SPAN_EM, NULL);
  4506                             off++;
  4507                         }
  4508                         while(off + 1 < mark->end) {
  4509                             MD_ENTER_SPAN(MD_SPAN_STRONG, NULL);
  4510                             off += 2;
  4511                         }
  4512                     } else {
  4513                         while(off + 1 < mark->end) {
  4514                             MD_LEAVE_SPAN(MD_SPAN_STRONG, NULL);
  4515                             off += 2;
  4516                         }
  4517                         if((mark->end - off) % 2) {
  4518                             MD_LEAVE_SPAN(MD_SPAN_EM, NULL);
  4519                             off++;
  4520                         }
  4521                     }
  4522                     break;
  4523 
  4524                 case '~': /* crossed */
  4525                     if(mark->flags & MD_MARK_OPENER)
  4526                         MD_ENTER_SPAN(MD_SPAN_DEL, NULL);
  4527                     else
  4528                         MD_LEAVE_SPAN(MD_SPAN_DEL, NULL);
  4529                     break;
  4530 
  4531                 case '


:
  4532                     if(mark->flags & MD_MARK_OPENER) {
  4533                         MD_ENTER_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
  4534                         text_type = MD_TEXT_LATEXMATH;
  4535                     } else {
  4536                         MD_LEAVE_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
  4537                         text_type = MD_TEXT_NORMAL;
  4538                     }
  4539                     break;
  4540 
  4541                 case '!': /* conceal/hidden */
  4542                     if (mark->prev == -1) {
  4543                         if (mark->flags & MD_MARK_OPENER) {
  4544                             MD_ENTER_SPAN(MD_SPAN_COC, NULL);
  4545                             break;
  4546                         }
  4547                     }
  4548                     else {
  4549                         if (ctx->marks[mark->prev].ch == '!' && !(mark->flags & MD_MARK_OPENER)) {
  4550                             MD_LEAVE_SPAN(MD_SPAN_COC, NULL);
  4551                             break;
  4552                         }
  4553                     }
  4554                 case '[':       /* Link, wiki link, image, anchor. */
  4555                 case ']':
  4556                 {
  4557                     const MD_MARK* opener = (mark->ch != ']' ? mark : &ctx->marks[mark->prev]);
  4558                     const MD_MARK* closer = &ctx->marks[opener->next];
  4559                     const MD_MARK* dest_mark;
  4560                     const MD_MARK* title_mark;
  4561 
  4562                     if ((opener->ch == '[' && closer->ch == ']') &&
  4563                         opener->end - opener->beg >= 2 &&
  4564                         closer->end - closer->beg >= 2)
  4565                     {
  4566                         int has_label = (opener->end - opener->beg > 2);
  4567                         SZ target_sz;
  4568 
  4569                         if(has_label)
  4570                             target_sz = opener->end - (opener->beg+2);
  4571                         else
  4572                             target_sz = closer->beg - opener->end;
  4573 
  4574                         MD_CHECK(md_enter_leave_span_wikilink(ctx, (mark->ch != ']'),
  4575                                  has_label ? STR(opener->beg+2) : STR(opener->end),
  4576                                  target_sz));
  4577 
  4578                         break;
  4579                     }
  4580 
  4581                     if ((opener->ch == '[' && closer->ch == ']') &&
  4582                         opener->end - opener->beg == 2 &&
  4583                         closer->end - closer->beg == 1 &&
  4584                         CH(opener->beg+1) == _T('|'))
  4585                     {
  4586                         if(mark->flags & MD_MARK_OPENER) {
  4587                             MD_ENTER_SPAN(MD_SPAN_ANCHOR, NULL);
  4588                         } else {
  4589                             MD_LEAVE_SPAN(MD_SPAN_ANCHOR, NULL);
  4590                         }
  4591                     }
  4592 
  4593                     dest_mark = opener+1;
  4594                     MD_ASSERT(dest_mark->ch == 'D');
  4595                     title_mark = opener+2;
  4596                     if (title_mark->ch != 'D') break;
  4597 
  4598                     MD_CHECK(md_enter_leave_span_a(ctx, (mark->ch != ']'),
  4599                                 (opener->ch == '!' ? MD_SPAN_IMG : MD_SPAN_A),
  4600                                 STR(dest_mark->beg), dest_mark->end - dest_mark->beg, FALSE,
  4601                                 md_mark_get_ptr(ctx, (int)(title_mark - ctx->marks)),
  4602                                                                 title_mark->prev));
  4603 
  4604                     /* link/image closer may span multiple lines. */
  4605                     if(mark->ch == ']') {
  4606                         while(mark->end > line->end)
  4607                             line++;
  4608                     }
  4609 
  4610                     break;
  4611                 }
  4612 
  4613                 case '<':
  4614                 case '>':       /* Autolink or raw HTML. */
  4615                     if(!(mark->flags & MD_MARK_AUTOLINK)) {
  4616                         /* Raw HTML. */
  4617                         if(mark->flags & MD_MARK_OPENER)
  4618                             text_type = MD_TEXT_HTML;
  4619                         else
  4620                             text_type = MD_TEXT_NORMAL;
  4621                         break;
  4622                     }
  4623                     /* Pass through, if auto-link. */
  4624                     MD_FALLTHROUGH();
  4625 
  4626                 case '@':       /* Permissive e-mail autolink. */
  4627                 case ':':       /* Permissive URL autolink. */
  4628                 case '.':       /* Permissive WWW autolink. */
  4629                 {
  4630                     MD_MARK* opener = ((mark->flags & MD_MARK_OPENER) ? mark : &ctx->marks[mark->prev]);
  4631                     MD_MARK* closer = &ctx->marks[opener->next];
  4632                     const CHAR* dest = STR(opener->end);
  4633                     SZ dest_size = closer->beg - opener->end;
  4634 
  4635                     /* For permissive auto-links we do not know closer mark
  4636                      * position at the time of md_collect_marks(), therefore
  4637                      * it can be out-of-order in ctx->marks[].
  4638                      *
  4639                      * With this flag, we make sure that we output the closer
  4640                      * only if we processed the opener. */
  4641                     if(mark->flags & MD_MARK_OPENER)
  4642                         closer->flags |= MD_MARK_VALIDPERMISSIVEAUTOLINK;
  4643 
  4644                     if(opener->ch == '@' || opener->ch == '.') {
  4645                         dest_size += 7;
  4646                         MD_TEMP_BUFFER(dest_size * sizeof(CHAR));
  4647                         memcpy(ctx->buffer,
  4648                                 (opener->ch == '@' ? _T("mailto:") : _T("http://")),
  4649                                 7 * sizeof(CHAR));
  4650                         memcpy(ctx->buffer + 7, dest, (dest_size-7) * sizeof(CHAR));
  4651                         dest = ctx->buffer;
  4652                     }
  4653 
  4654                     if(closer->flags & MD_MARK_VALIDPERMISSIVEAUTOLINK)
  4655                         MD_CHECK(md_enter_leave_span_a(ctx, (mark->flags & MD_MARK_OPENER),
  4656                                     MD_SPAN_A, dest, dest_size, TRUE, NULL, 0));
  4657                     break;
  4658                 }
  4659 
  4660                 case '&':       /* Entity. */
  4661                     MD_TEXT(MD_TEXT_ENTITY, STR(mark->beg), mark->end - mark->beg);
  4662                     break;
  4663 
  4664                 case '\0':
  4665                     MD_TEXT(MD_TEXT_NULLCHAR, _T(""), 1);
  4666                     break;
  4667 
  4668                 case 127:
  4669                     goto abort;
  4670             }
  4671 
  4672             off = mark->end;
  4673 
  4674             /* Move to next resolved mark. */
  4675             prev_mark = mark;
  4676             mark++;
  4677             while(!(mark->flags & MD_MARK_RESOLVED)  ||  mark->beg < off)
  4678                 mark++;
  4679         }
  4680 
  4681         /* If reached end of line, move to next one. */
  4682         if(off >= line->end) {
  4683             /* If it is the last line, we are done. */
  4684             if(off >= end)
  4685                 break;
  4686 
  4687             if(text_type == MD_TEXT_CODE || text_type == MD_TEXT_LATEXMATH) {
  4688                 OFF tmp;
  4689 
  4690                 MD_ASSERT(prev_mark != NULL);
  4691                 MD_ASSERT(ISANYOF2_(prev_mark->ch, '`', '


)  &&  (prev_mark->flags & MD_MARK_OPENER));
  4692                 MD_ASSERT(ISANYOF2_(mark->ch, '`', '


)  &&  (mark->flags & MD_MARK_CLOSER));
  4693 
  4694                 /* Inside a code span, trailing line whitespace has to be
  4695                  * outputted. */
  4696                 tmp = off;
  4697                 while(off < ctx->size  &&  ISBLANK(off))
  4698                     off++;
  4699                 if(off > tmp)
  4700                     MD_TEXT(text_type, STR(tmp), off-tmp);
  4701 
  4702                 /* and new lines are transformed into single spaces. */
  4703                 if(prev_mark->end < off  &&  off < mark->beg)
  4704                     MD_TEXT(text_type, _T(" "), 1);
  4705             } else if(text_type == MD_TEXT_HTML) {
  4706                 /* Inside raw HTML, we output the new line verbatim, including
  4707                  * any trailing spaces. */
  4708                 OFF tmp = off;
  4709 
  4710                 while(tmp < end  &&  ISBLANK(tmp))
  4711                     tmp++;
  4712                 if(tmp > off)
  4713                     MD_TEXT(MD_TEXT_HTML, STR(off), tmp - off);
  4714                 MD_TEXT(MD_TEXT_HTML, _T("\n"), 1);
  4715             } else {
  4716                 /* Output soft or hard line break. */
  4717                 MD_TEXTTYPE break_type = MD_TEXT_SOFTBR;
  4718 
  4719                 if(text_type == MD_TEXT_NORMAL) {
  4720                     if(enforce_hardbreak)
  4721                         break_type = MD_TEXT_BR;
  4722                     else if((CH(line->end) == _T(' ') && CH(line->end+1) == _T(' ')))
  4723                         break_type = MD_TEXT_BR;
  4724                 }
  4725 
  4726                 MD_TEXT(break_type, _T("\n"), 1);
  4727             }
  4728 
  4729             /* Move to the next line. */
  4730             line++;
  4731             off = line->beg;
  4732 
  4733             enforce_hardbreak = 0;
  4734         }
  4735     }
  4736 
  4737 abort:
  4738     return ret;
  4739 }
  4740 
  4741 
  4742 /***************************
  4743  ***  Processing Tables  ***
  4744  ***************************/
  4745 
  4746 static void
  4747 md_analyze_table_alignment(MD_CTX* ctx, OFF beg, OFF end, MD_ALIGN* align, int n_align)
  4748 {
  4749     static const MD_ALIGN align_map[] = { MD_ALIGN_DEFAULT, MD_ALIGN_LEFT, MD_ALIGN_RIGHT, MD_ALIGN_CENTER };
  4750     OFF off = beg;
  4751 
  4752     while(n_align > 0) {
  4753         int index = 0;  /* index into align_map[] */
  4754 
  4755         while(CH(off) != _T('-'))
  4756             off++;
  4757         if(off > beg  &&  CH(off-1) == _T(':'))
  4758             index |= 1;
  4759         while(off < end  &&  CH(off) == _T('-'))
  4760             off++;
  4761         if(off < end  &&  CH(off) == _T(':'))
  4762             index |= 2;
  4763 
  4764         *align = align_map[index];
  4765         align++;
  4766         n_align--;
  4767     }
  4768 
  4769 }
  4770 
  4771 /* Forward declaration. */
  4772 static int md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines);
  4773 
  4774 static int
  4775 md_process_table_cell(MD_CTX* ctx, MD_BLOCKTYPE cell_type, MD_ALIGN align, OFF beg, OFF end)
  4776 {
  4777     MD_LINE line;
  4778     MD_BLOCK_TD_DETAIL det;
  4779     int ret = 0;
  4780 
  4781     while(beg < end  &&  ISWHITESPACE(beg))
  4782         beg++;
  4783     while(end > beg  &&  ISWHITESPACE(end-1))
  4784         end--;
  4785 
  4786     det.align = align;
  4787     line.beg = beg;
  4788     line.end = end;
  4789 
  4790     MD_ENTER_BLOCK(cell_type, &det);
  4791     MD_CHECK(md_process_normal_block_contents(ctx, &line, 1));
  4792     MD_LEAVE_BLOCK(cell_type, &det);
  4793 
  4794 abort:
  4795     return ret;
  4796 }
  4797 
  4798 static int
  4799 md_process_table_row(MD_CTX* ctx, MD_BLOCKTYPE cell_type, OFF beg, OFF end,
  4800                      const MD_ALIGN* align, int col_count)
  4801 {
  4802     MD_LINE line;
  4803     OFF* pipe_offs = NULL;
  4804     int i, j, k, n;
  4805     int ret = 0;
  4806 
  4807     line.beg = beg;
  4808     line.end = end;
  4809 
  4810     /* Break the line into table cells by identifying pipe characters who
  4811      * form the cell boundary. */
  4812     MD_CHECK(md_analyze_inlines(ctx, &line, 1, TRUE));
  4813 
  4814     /* We have to remember the cell boundaries in local buffer because
  4815      * ctx->marks[] shall be reused during cell contents processing. */
  4816     n = ctx->n_table_cell_boundaries + 2;
  4817     pipe_offs = (OFF*) malloc(n * sizeof(OFF));
  4818     if(pipe_offs == NULL) {
  4819         MD_LOG("malloc() failed.");
  4820         ret = -1;
  4821         goto abort;
  4822     }
  4823     j = 0;
  4824     pipe_offs[j++] = beg;
  4825     for(i = TABLECELLBOUNDARIES.head; i >= 0; i = ctx->marks[i].next) {
  4826         MD_MARK* mark = &ctx->marks[i];
  4827         pipe_offs[j++] = mark->end;
  4828     }
  4829     pipe_offs[j++] = end+1;
  4830 
  4831     /* Process cells. */
  4832     MD_ENTER_BLOCK(MD_BLOCK_TR, NULL);
  4833     k = 0;
  4834     for(i = 0; i < j-1  &&  k < col_count; i++) {
  4835         if(pipe_offs[i] < pipe_offs[i+1]-1)
  4836             MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], pipe_offs[i], pipe_offs[i+1]-1));
  4837     }
  4838     /* Make sure we call enough table cells even if the current table contains
  4839      * too few of them. */
  4840     while(k < col_count)
  4841         MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], 0, 0));
  4842     MD_LEAVE_BLOCK(MD_BLOCK_TR, NULL);
  4843 
  4844 abort:
  4845     free(pipe_offs);
  4846 
  4847     /* Free any temporary memory blocks stored within some dummy marks. */
  4848     for(i = PTR_CHAIN.head; i >= 0; i = ctx->marks[i].next)
  4849         free(md_mark_get_ptr(ctx, i));
  4850     PTR_CHAIN.head = -1;
  4851     PTR_CHAIN.tail = -1;
  4852 
  4853     return ret;
  4854 }
  4855 
  4856 static int
  4857 md_process_table_block_contents(MD_CTX* ctx, int col_count, const MD_LINE* lines, int n_lines)
  4858 {
  4859     MD_ALIGN* align;
  4860     int i;
  4861     int ret = 0;
  4862 
  4863     /* At least two lines have to be present: The column headers and the line
  4864      * with the underlines. */
  4865     MD_ASSERT(n_lines >= 2);
  4866 
  4867     align = malloc(col_count * sizeof(MD_ALIGN));
  4868     if(align == NULL) {
  4869         MD_LOG("malloc() failed.");
  4870         ret = -1;
  4871         goto abort;
  4872     }
  4873 
  4874     md_analyze_table_alignment(ctx, lines[1].beg, lines[1].end, align, col_count);
  4875 
  4876     MD_ENTER_BLOCK(MD_BLOCK_THEAD, NULL);
  4877     MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TH,
  4878                         lines[0].beg, lines[0].end, align, col_count));
  4879     MD_LEAVE_BLOCK(MD_BLOCK_THEAD, NULL);
  4880 
  4881     if(n_lines > 2) {
  4882         MD_ENTER_BLOCK(MD_BLOCK_TBODY, NULL);
  4883         for(i = 2; i < n_lines; i++) {
  4884             MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TD,
  4885                      lines[i].beg, lines[i].end, align, col_count));
  4886         }
  4887         MD_LEAVE_BLOCK(MD_BLOCK_TBODY, NULL);
  4888     }
  4889 
  4890 abort:
  4891     free(align);
  4892     return ret;
  4893 }
  4894 
  4895 
  4896 /**************************
  4897  ***  Processing Block  ***
  4898  **************************/
  4899 
/* Bit flags stored in the `flags` member of struct MD_BLOCK_tag. */

/* The record opens a container block: md_process_all_blocks() reports it
 * through MD_ENTER_BLOCK(). */
#define MD_BLOCK_CONTAINER_OPENER   0x01
/* The record closes a container block: md_process_all_blocks() reports it
 * through MD_LEAVE_BLOCK(). */
#define MD_BLOCK_CONTAINER_CLOSER   0x02
/* Mask matching any container record (opener or closer). Records with
 * neither bit set are processed as leaf blocks. */
#define MD_BLOCK_CONTAINER          (MD_BLOCK_CONTAINER_OPENER | MD_BLOCK_CONTAINER_CLOSER)
/* The list is loose; md_process_all_blocks() then reports is_tight == FALSE
 * in the MD_BLOCK_UL / MD_BLOCK_OL detail. */
#define MD_BLOCK_LOOSE_LIST         0x04
/* NOTE(review): presumably marks a header created by a setext underline;
 * it is not used in this part of the file -- confirm at its use sites. */
#define MD_BLOCK_SETEXT_HEADER      0x08
  4905 
/* Record describing one block, stored in the ctx->block_bytes buffer.
 *
 * For a leaf block the record is immediately followed by its n_lines line
 * records: MD_VERBATIMLINE for MD_BLOCK_CODE and MD_BLOCK_HTML, MD_LINE for
 * any other leaf block (see md_process_all_blocks(), which steps over them
 * this way). Container records (MD_BLOCK_CONTAINER_OPENER/CLOSER set in
 * flags) are followed by no line records.
 */
struct MD_BLOCK_tag {
    MD_BLOCKTYPE type  :  8;
    /* Combination of the MD_BLOCK_xxxx flag bits defined above. */
    unsigned flags     :  8;

    /* MD_BLOCK_H:      Header level (1 - 6)
     * MD_BLOCK_CODE:   Non-zero if fenced, zero if indented.
     * MD_BLOCK_LI:     Task mark character (0 if not task list item, 'x', 'X' or ' ').
     * MD_BLOCK_TABLE:  Column count (as determined by the table underline).
     * MD_BLOCK_UL:     List mark character (reported as MD_BLOCK_UL_DETAIL::mark).
     * MD_BLOCK_OL:     Mark delimiter character (reported as
     *                  MD_BLOCK_OL_DETAIL::mark_delimiter).
     */
    unsigned data      : 16;

    /* Leaf blocks:     Count of lines (MD_LINE or MD_VERBATIMLINE) on the block.
     * MD_BLOCK_LI:     Task mark offset in the input doc.
     * MD_BLOCK_OL:     Start item number.
     */
    unsigned n_lines;
};
  4923 
/* Record describing one level of container nesting, stored in
 * ctx->containers[].
 *
 * During block processing (md_process_all_blocks()) the array is reused as a
 * stack of currently open lists and block quotes, and only the is_loose
 * member is used there.
 */
struct MD_CONTAINER_tag {
    /* NOTE(review): presumably the character identifying the container kind
     * (list mark, quote mark, ...) -- confirm at the use sites. */
    CHAR ch;
    /* Non-zero if the container is loose. md_process_leaf_block() reports
     * paragraphs as MD_BLOCK_P blocks only when the innermost container is
     * loose; block quotes are always treated as loose. */
    unsigned is_loose    : 8;
    /* NOTE(review): presumably non-zero for a task list item -- confirm. */
    unsigned is_task     : 8;
    /* NOTE(review): presumably the start number of an ordered list -- confirm. */
    unsigned start;
    /* NOTE(review): presumably the indentation of the container mark -- confirm. */
    unsigned mark_indent;
    /* NOTE(review): presumably the indentation required for the container
     * contents -- confirm. */
    unsigned contents_indent;
    /* NOTE(review): presumably the offset of the container's opener record
     * within ctx->block_bytes -- confirm. */
    OFF block_byte_off;
    /* NOTE(review): presumably the input offset of the task mark -- confirm. */
    OFF task_mark_off;
};
  4934 
  4935 
  4936 static int
  4937 md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
  4938 {
  4939     int i;
  4940     int ret;
  4941 
  4942     MD_CHECK(md_analyze_inlines(ctx, lines, n_lines, FALSE));
  4943     MD_CHECK(md_process_inlines(ctx, lines, n_lines));
  4944 
  4945 abort:
  4946     /* Free any temporary memory blocks stored within some dummy marks. */
  4947     for(i = PTR_CHAIN.head; i >= 0; i = ctx->marks[i].next)
  4948         free(md_mark_get_ptr(ctx, i));
  4949     PTR_CHAIN.head = -1;
  4950     PTR_CHAIN.tail = -1;
  4951 
  4952     return ret;
  4953 }
  4954 
  4955 static int
  4956 md_process_verbatim_block_contents(MD_CTX* ctx, MD_TEXTTYPE text_type, const MD_VERBATIMLINE* lines, int n_lines)
  4957 {
  4958     static const CHAR indent_chunk_str[] = _T("                ");
  4959     static const SZ indent_chunk_size = SIZEOF_ARRAY(indent_chunk_str) - 1;
  4960 
  4961     int i;
  4962     int ret = 0;
  4963 
  4964     for(i = 0; i < n_lines; i++) {
  4965         const MD_VERBATIMLINE* line = &lines[i];
  4966         int indent = line->indent;
  4967 
  4968         MD_ASSERT(indent >= 0);
  4969 
  4970         /* Output code indentation. */
  4971         while(indent > (int) indent_chunk_size) {
  4972             MD_TEXT(text_type, indent_chunk_str, indent_chunk_size);
  4973             indent -= indent_chunk_size;
  4974         }
  4975         if(indent > 0)
  4976             MD_TEXT(text_type, indent_chunk_str, indent);
  4977 
  4978         /* Output the code line itself. */
  4979         MD_TEXT_INSECURE(text_type, STR(line->beg), line->end - line->beg);
  4980 
  4981         /* Enforce end-of-line. */
  4982         MD_TEXT(text_type, _T("\n"), 1);
  4983     }
  4984 
  4985 abort:
  4986     return ret;
  4987 }
  4988 
  4989 static int
  4990 md_process_code_block_contents(MD_CTX* ctx, int is_fenced, const MD_VERBATIMLINE* lines, int n_lines)
  4991 {
  4992     if(is_fenced) {
  4993         /* Skip the first line in case of fenced code: It is the fence.
  4994          * (Only the starting fence is present due to logic in md_analyze_line().) */
  4995         lines++;
  4996         n_lines--;
  4997     } else {
  4998         /* Ignore blank lines at start/end of indented code block. */
  4999         while(n_lines > 0  &&  lines[0].beg == lines[0].end) {
  5000             lines++;
  5001             n_lines--;
  5002         }
  5003         while(n_lines > 0  &&  lines[n_lines-1].beg == lines[n_lines-1].end) {
  5004             n_lines--;
  5005         }
  5006     }
  5007 
  5008     if(n_lines == 0)
  5009         return 0;
  5010 
  5011     return md_process_verbatim_block_contents(ctx, MD_TEXT_CODE, lines, n_lines);
  5012 }
  5013 
  5014 static int
  5015 md_setup_fenced_code_detail(MD_CTX* ctx, const MD_BLOCK* block, MD_BLOCK_CODE_DETAIL* det,
  5016                             MD_ATTRIBUTE_BUILD* info_build, MD_ATTRIBUTE_BUILD* lang_build)
  5017 {
  5018     const MD_VERBATIMLINE* fence_line = (const MD_VERBATIMLINE*)(block + 1);
  5019     OFF beg = fence_line->beg;
  5020     OFF end = fence_line->end;
  5021     OFF lang_end;
  5022     CHAR fence_ch = CH(fence_line->beg);
  5023     int ret = 0;
  5024 
  5025     /* Skip the fence itself. */
  5026     while(beg < ctx->size  &&  CH(beg) == fence_ch)
  5027         beg++;
  5028     /* Trim initial spaces. */
  5029     while(beg < ctx->size  &&  CH(beg) == _T(' '))
  5030         beg++;
  5031 
  5032     /* Trim trailing spaces. */
  5033     while(end > beg  &&  CH(end-1) == _T(' '))
  5034         end--;
  5035 
  5036     /* Build info string attribute. */
  5037     MD_CHECK(md_build_attribute(ctx, STR(beg), end - beg, 0, &det->info, info_build));
  5038 
  5039     /* Build info string attribute. */
  5040     lang_end = beg;
  5041     while(lang_end < end  &&  !ISWHITESPACE(lang_end))
  5042         lang_end++;
  5043     MD_CHECK(md_build_attribute(ctx, STR(beg), lang_end - beg, 0, &det->lang, lang_build));
  5044 
  5045     det->fence_char = fence_ch;
  5046 
  5047 abort:
  5048     return ret;
  5049 }
  5050 
  5051 static int
  5052 md_process_leaf_block(MD_CTX* ctx, const MD_BLOCK* block)
  5053 {
  5054     union {
  5055         MD_BLOCK_H_DETAIL header;
  5056         MD_BLOCK_CODE_DETAIL code;
  5057         MD_BLOCK_TABLE_DETAIL table;
  5058     } det;
  5059     MD_ATTRIBUTE_BUILD info_build;
  5060     MD_ATTRIBUTE_BUILD lang_build;
  5061     int is_in_tight_list;
  5062     int clean_fence_code_detail = FALSE;
  5063     int ret = 0;
  5064 
  5065     memset(&det, 0, sizeof(det));
  5066 
  5067     if(ctx->n_containers == 0)
  5068         is_in_tight_list = FALSE;
  5069     else
  5070         is_in_tight_list = !ctx->containers[ctx->n_containers-1].is_loose;
  5071 
  5072     switch(block->type) {
  5073         case MD_BLOCK_H:
  5074             det.header.level = block->data;
  5075             break;
  5076 
  5077         case MD_BLOCK_CODE:
  5078             /* For fenced code block, we may need to set the info string. */
  5079             if(block->data != 0) {
  5080                 memset(&det.code, 0, sizeof(MD_BLOCK_CODE_DETAIL));
  5081                 clean_fence_code_detail = TRUE;
  5082                 MD_CHECK(md_setup_fenced_code_detail(ctx, block, &det.code, &info_build, &lang_build));
  5083             }
  5084             break;
  5085 
  5086         case MD_BLOCK_TABLE:
  5087             det.table.col_count = block->data;
  5088             det.table.head_row_count = 1;
  5089             det.table.body_row_count = block->n_lines - 2;
  5090             break;
  5091 
  5092         default:
  5093             /* Noop. */
  5094             break;
  5095     }
  5096 
  5097     if(!is_in_tight_list  ||  block->type != MD_BLOCK_P)
  5098         MD_ENTER_BLOCK(block->type, (void*) &det);
  5099 
  5100     /* Process the block contents accordingly to is type. */
  5101     switch(block->type) {
  5102         case MD_BLOCK_HR:
  5103             /* noop */
  5104             break;
  5105 
  5106         case MD_BLOCK_CODE:
  5107             MD_CHECK(md_process_code_block_contents(ctx, (block->data != 0),
  5108                             (const MD_VERBATIMLINE*)(block + 1), block->n_lines));
  5109             break;
  5110 
  5111         case MD_BLOCK_HTML:
  5112             MD_CHECK(md_process_verbatim_block_contents(ctx, MD_TEXT_HTML,
  5113                             (const MD_VERBATIMLINE*)(block + 1), block->n_lines));
  5114             break;
  5115 
  5116         case MD_BLOCK_TABLE:
  5117             MD_CHECK(md_process_table_block_contents(ctx, block->data,
  5118                             (const MD_LINE*)(block + 1), block->n_lines));
  5119             break;
  5120 
  5121         default:
  5122             MD_CHECK(md_process_normal_block_contents(ctx,
  5123                             (const MD_LINE*)(block + 1), block->n_lines));
  5124             break;
  5125     }
  5126 
  5127     if(!is_in_tight_list  ||  block->type != MD_BLOCK_P)
  5128         MD_LEAVE_BLOCK(block->type, (void*) &det);
  5129 
  5130 abort:
  5131     if(clean_fence_code_detail) {
  5132         md_free_attribute(ctx, &info_build);
  5133         md_free_attribute(ctx, &lang_build);
  5134     }
  5135     return ret;
  5136 }
  5137 
  5138 static int
  5139 md_process_all_blocks(MD_CTX* ctx)
  5140 {
  5141     int byte_off = 0;
  5142     int ret = 0;
  5143 
  5144     /* ctx->containers now is not needed for detection of lists and list items
  5145      * so we reuse it for tracking what lists are loose or tight. We rely
  5146      * on the fact the vector is large enough to hold the deepest nesting
  5147      * level of lists. */
  5148     ctx->n_containers = 0;
  5149 
  5150     while(byte_off < ctx->n_block_bytes) {
  5151         MD_BLOCK* block = (MD_BLOCK*)((char*)ctx->block_bytes + byte_off);
  5152         union {
  5153             MD_BLOCK_UL_DETAIL ul;
  5154             MD_BLOCK_OL_DETAIL ol;
  5155             MD_BLOCK_LI_DETAIL li;
  5156         } det;
  5157 
  5158         switch(block->type) {
  5159             case MD_BLOCK_UL:
  5160                 det.ul.is_tight = (block->flags & MD_BLOCK_LOOSE_LIST) ? FALSE : TRUE;
  5161                 det.ul.mark = (CHAR) block->data;
  5162                 break;
  5163 
  5164             case MD_BLOCK_OL:
  5165                 det.ol.start = block->n_lines;
  5166                 det.ol.is_tight =  (block->flags & MD_BLOCK_LOOSE_LIST) ? FALSE : TRUE;
  5167                 det.ol.mark_delimiter = (CHAR) block->data;
  5168                 break;
  5169 
  5170             case MD_BLOCK_LI:
  5171                 det.li.is_task = (block->data != 0);
  5172                 det.li.task_mark = (CHAR) block->data;
  5173                 det.li.task_mark_offset = (OFF) block->n_lines;
  5174                 break;
  5175 
  5176             default:
  5177                 /* noop */
  5178                 break;
  5179         }
  5180 
  5181         if(block->flags & MD_BLOCK_CONTAINER) {
  5182             if(block->flags & MD_BLOCK_CONTAINER_CLOSER) {
  5183                 MD_LEAVE_BLOCK(block->type, &det);
  5184 
  5185                 if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL || block->type == MD_BLOCK_QUOTE)
  5186                     ctx->n_containers--;
  5187             }
  5188 
  5189             if(block->flags & MD_BLOCK_CONTAINER_OPENER) {
  5190                 MD_ENTER_BLOCK(block->type, &det);
  5191 
  5192                 if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL) {
  5193                     ctx->containers[ctx->n_containers].is_loose = (block->flags & MD_BLOCK_LOOSE_LIST);
  5194                     ctx->n_containers++;
  5195                 } else if(block->type == MD_BLOCK_QUOTE) {
  5196                     /* This causes that any text in a block quote, even if
  5197                      * nested inside a tight list item, is wrapped with
  5198                      * <p>...</p>. */
  5199                     ctx->containers[ctx->n_containers].is_loose = TRUE;
  5200                     ctx->n_containers++;
  5201                 }
  5202             }
  5203         } else {
  5204             MD_CHECK(md_process_leaf_block(ctx, block));
  5205 
  5206             if(block->type == MD_BLOCK_CODE || block->type == MD_BLOCK_HTML)
  5207                 byte_off += block->n_lines * sizeof(MD_VERBATIMLINE);
  5208             else
  5209                 byte_off += block->n_lines * sizeof(MD_LINE);
  5210         }
  5211 
  5212         byte_off += sizeof(MD_BLOCK);
  5213     }
  5214 
  5215     ctx->n_block_bytes = 0;
  5216 
  5217 abort:
  5218     return ret;
  5219 }
  5220 
  5221 
  5222 /************************************
  5223  ***  Grouping Lines into Blocks  ***
  5224  ************************************/
  5225 
  5226 static void*
  5227 md_push_block_bytes(MD_CTX* ctx, int n_bytes)
  5228 {
  5229     void* ptr;
  5230 
  5231     if(ctx->n_block_bytes + n_bytes > ctx->alloc_block_bytes) {
  5232         void* new_block_bytes;
  5233 
  5234         ctx->alloc_block_bytes = (ctx->alloc_block_bytes > 0
  5235                 ? ctx->alloc_block_bytes + ctx->alloc_block_bytes / 2
  5236                 : 512);
  5237         new_block_bytes = realloc(ctx->block_bytes, ctx->alloc_block_bytes);
  5238         if(new_block_bytes == NULL) {
  5239             MD_LOG("realloc() failed.");
  5240             return NULL;
  5241         }
  5242 
  5243         /* Fix the ->current_block after the reallocation. */
  5244         if(ctx->current_block != NULL) {
  5245             OFF off_current_block = (OFF) ((char*) ctx->current_block - (char*) ctx->block_bytes);
  5246             ctx->current_block = (MD_BLOCK*) ((char*) new_block_bytes + off_current_block);
  5247         }
  5248 
  5249         ctx->block_bytes = new_block_bytes;
  5250     }
  5251 
  5252     ptr = (char*)ctx->block_bytes + ctx->n_block_bytes;
  5253     ctx->n_block_bytes += n_bytes;
  5254     return ptr;
  5255 }
  5256 
  5257 static int
  5258 md_start_new_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* line)
  5259 {
  5260     MD_BLOCK* block;
  5261 
  5262     MD_ASSERT(ctx->current_block == NULL);
  5263 
  5264     block = (MD_BLOCK*) md_push_block_bytes(ctx, sizeof(MD_BLOCK));
  5265     if(block == NULL)
  5266         return -1;
  5267 
  5268     switch(line->type) {
  5269         case MD_LINE_HR:
  5270             block->type = MD_BLOCK_HR;
  5271             break;
  5272 
  5273         case MD_LINE_ATXHEADER:
  5274         case MD_LINE_SETEXTHEADER:
  5275             block->type = MD_BLOCK_H;
  5276             break;
  5277 
  5278         case MD_LINE_FENCEDCODE:
  5279         case MD_LINE_INDENTEDCODE:
  5280             block->type = MD_BLOCK_CODE;
  5281             break;
  5282 
  5283         case MD_LINE_TEXT:
  5284             block->type = MD_BLOCK_P;
  5285             break;
  5286 
  5287         case MD_LINE_HTML:
  5288             block->type = MD_BLOCK_HTML;
  5289             break;
  5290 
  5291         case MD_LINE_BLANK:
  5292         case MD_LINE_SETEXTUNDERLINE:
  5293         case MD_LINE_TABLEUNDERLINE:
  5294         default:
  5295             MD_UNREACHABLE();
  5296             break;
  5297     }
  5298 
  5299     block->flags = 0;
  5300     block->data = line->data;
  5301     block->n_lines = 0;
  5302 
  5303     ctx->current_block = block;
  5304     return 0;
  5305 }
  5306 
  5307 /* Eat from start of current (textual) block any reference definitions and
  5308  * remember them so we can resolve any links referring to them.
  5309  *
  5310  * (Reference definitions can only be at start of it as they cannot break
  5311  * a paragraph.)
  5312  */
  5313 static int
  5314 md_consume_link_reference_definitions(MD_CTX* ctx)
  5315 {
  5316     MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1);
  5317     int n_lines = ctx->current_block->n_lines;
  5318     int n = 0;
  5319 
  5320     /* Compute how many lines at the start of the block form one or more
  5321      * reference definitions. */
  5322     while(n < n_lines) {
  5323         int n_link_ref_lines;
  5324 
  5325         n_link_ref_lines = md_is_link_reference_definition(ctx,
  5326                                     lines + n, n_lines - n);
  5327         /* Not a reference definition? */
  5328         if(n_link_ref_lines == 0)
  5329             break;
  5330 
  5331         /* We fail if it is the ref. def. but it could not be stored due
  5332          * a memory allocation error. */
  5333         if(n_link_ref_lines < 0)
  5334             return -1;
  5335 
  5336         n += n_link_ref_lines;
  5337     }
  5338 
  5339     /* If there was at least one reference definition, we need to remove
  5340      * its lines from the block, or perhaps even the whole block. */
  5341     if(n > 0) {
  5342         if(n == n_lines) {
  5343             /* Remove complete block. */
  5344             ctx->n_block_bytes -= n * sizeof(MD_LINE);
  5345             ctx->n_block_bytes -= sizeof(MD_BLOCK);
  5346             ctx->current_block = NULL;
  5347         } else {
  5348             /* Remove just some initial lines from the block. */
  5349             memmove(lines, lines + n, (n_lines - n) * sizeof(MD_LINE));
  5350             ctx->current_block->n_lines -= n;
  5351             ctx->n_block_bytes -= n * sizeof(MD_LINE);
  5352         }
  5353     }
  5354 
  5355     return 0;
  5356 }
  5357 
  5358 static int
  5359 md_end_current_block(MD_CTX* ctx)
  5360 {
  5361     int ret = 0;
  5362 
  5363     if(ctx->current_block == NULL)
  5364         return ret;
  5365 
  5366     /* Check whether there is a reference definition. (We do this here instead
  5367      * of in md_analyze_line() because reference definition can take multiple
  5368      * lines.) */
  5369     if(ctx->current_block->type == MD_BLOCK_P  ||
  5370        (ctx->current_block->type == MD_BLOCK_H  &&  (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER)))
  5371     {
  5372         MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1);
  5373         if(CH(lines[0].beg) == _T('[')) {
  5374             MD_CHECK(md_consume_link_reference_definitions(ctx));
  5375             if(ctx->current_block == NULL)
  5376                 return ret;
  5377         }
  5378     }
  5379 
  5380     if(ctx->current_block->type == MD_BLOCK_H  &&  (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER)) {
  5381         int n_lines = ctx->current_block->n_lines;
  5382 
  5383         if(n_lines > 1) {
  5384             /* Get rid of the underline. */
  5385             ctx->current_block->n_lines--;
  5386             ctx->n_block_bytes -= sizeof(MD_LINE);
  5387         } else {
  5388             /* Only the underline has left after eating the ref. defs.
  5389              * Keep the line as beginning of a new ordinary paragraph. */
  5390             ctx->current_block->type = MD_BLOCK_P;
  5391             return 0;
  5392         }
  5393     }
  5394 
  5395     /* Mark we are not building any block anymore. */
  5396     ctx->current_block = NULL;
  5397 
  5398 abort:
  5399     return ret;
  5400 }
  5401 
  5402 static int
  5403 md_add_line_into_current_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* analysis)
  5404 {
  5405     MD_ASSERT(ctx->current_block != NULL);
  5406 
  5407     if(ctx->current_block->type == MD_BLOCK_CODE || ctx->current_block->type == MD_BLOCK_HTML) {
  5408         MD_VERBATIMLINE* line;
  5409 
  5410         line = (MD_VERBATIMLINE*) md_push_block_bytes(ctx, sizeof(MD_VERBATIMLINE));
  5411         if(line == NULL)
  5412             return -1;
  5413 
  5414         line->indent = analysis->indent;
  5415         line->beg = analysis->beg;
  5416         line->end = analysis->end;
  5417     } else {
  5418         MD_LINE* line;
  5419 
  5420         line = (MD_LINE*) md_push_block_bytes(ctx, sizeof(MD_LINE));
  5421         if(line == NULL)
  5422             return -1;
  5423 
  5424         line->beg = analysis->beg;
  5425         line->end = analysis->end;
  5426     }
  5427     ctx->current_block->n_lines++;
  5428 
  5429     return 0;
  5430 }
  5431 
  5432 static int
  5433 md_push_container_bytes(MD_CTX* ctx, MD_BLOCKTYPE type, unsigned start,
  5434                         unsigned data, unsigned flags)
  5435 {
  5436     MD_BLOCK* block;
  5437     int ret = 0;
  5438 
  5439     MD_CHECK(md_end_current_block(ctx));
  5440 
  5441     block = (MD_BLOCK*) md_push_block_bytes(ctx, sizeof(MD_BLOCK));
  5442     if(block == NULL)
  5443         return -1;
  5444 
  5445     block->type = type;
  5446     block->flags = flags;
  5447     block->data = data;
  5448     block->n_lines = start;
  5449 
  5450 abort:
  5451     return ret;
  5452 }
  5453 
  5454 
  5455 
  5456 /***********************
  5457  ***  Line Analysis  ***
  5458  ***********************/
  5459 
  5460 static int
  5461 md_is_hr_line(MD_CTX* ctx, OFF beg, OFF* p_end, OFF* p_killer)
  5462 {
  5463     OFF off = beg + 1;
  5464     int n = 1;
  5465 
  5466     while(off < ctx->size  &&  (CH(off) == CH(beg) || CH(off) == _T(' ') || CH(off) == _T('\t'))) {
  5467         if(CH(off) == CH(beg))
  5468             n++;
  5469         off++;
  5470     }
  5471 
  5472     if(n < 3) {
  5473         *p_killer = off;
  5474         return FALSE;
  5475     }
  5476 
  5477     /* Nothing else can be present on the line. */
  5478     if(off < ctx->size  &&  !ISNEWLINE(off)) {
  5479         *p_killer = off;
  5480         return FALSE;
  5481     }
  5482 
  5483     *p_end = off;
  5484     return TRUE;
  5485 }
  5486 
  5487 static int
  5488 md_is_atxheader_line(MD_CTX* ctx, OFF beg, OFF* p_beg, OFF* p_end, unsigned* p_level)
  5489 {
  5490     int n;
  5491     OFF off = beg + 1;
  5492 
  5493     while(off < ctx->size  &&  CH(off) == _T('#')  &&  off - beg < 7)
  5494         off++;
  5495     n = off - beg;
  5496 
  5497     if(n > 6)
  5498         return FALSE;
  5499     *p_level = n;
  5500 
  5501     if(!(ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS)  &&  off < ctx->size  &&
  5502        CH(off) != _T(' ')  &&  CH(off) != _T('\t')  &&  !ISNEWLINE(off))
  5503         return FALSE;
  5504 
  5505     while(off < ctx->size  &&  CH(off) == _T(' '))
  5506         off++;
  5507     *p_beg = off;
  5508     *p_end = off;
  5509     return TRUE;
  5510 }
  5511 
  5512 static int
  5513 md_is_setext_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_level)
  5514 {
  5515     OFF off = beg + 1;
  5516 
  5517     while(off < ctx->size  &&  CH(off) == CH(beg))
  5518         off++;
  5519 
  5520     /* Optionally, space(s) can follow. */
  5521     while(off < ctx->size  &&  CH(off) == _T(' '))
  5522         off++;
  5523 
  5524     /* But nothing more is allowed on the line. */
  5525     if(off < ctx->size  &&  !ISNEWLINE(off))
  5526         return FALSE;
  5527 
  5528     *p_level = (CH(beg) == _T('=') ? 1 : 2);
  5529     *p_end = off;
  5530     return TRUE;
  5531 }
  5532 
  5533 static int
  5534 md_is_table_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_col_count)
  5535 {
  5536     OFF off = beg;
  5537     int found_pipe = FALSE;
  5538     unsigned col_count = 0;
  5539 
  5540     if(off < ctx->size  &&  CH(off) == _T('|')) {
  5541         found_pipe = TRUE;
  5542         off++;
  5543         while(off < ctx->size  &&  ISWHITESPACE(off))
  5544             off++;
  5545     }
  5546 
  5547     while(1) {
  5548         int delimited = FALSE;
  5549 
  5550         /* Cell underline ("-----", ":----", "----:" or ":----:") */
  5551         if(off < ctx->size  &&  CH(off) == _T(':'))
  5552             off++;
  5553         if(off >= ctx->size  ||  CH(off) != _T('-'))
  5554             return FALSE;
  5555         while(off < ctx->size  &&  CH(off) == _T('-'))
  5556             off++;
  5557         if(off < ctx->size  &&  CH(off) == _T(':'))
  5558             off++;
  5559 
  5560         col_count++;
  5561 
  5562         /* Pipe delimiter (optional at the end of line). */
  5563         while(off < ctx->size  &&  ISWHITESPACE(off))
  5564             off++;
  5565         if(off < ctx->size  &&  CH(off) == _T('|')) {
  5566             delimited = TRUE;
  5567             found_pipe =  TRUE;
  5568             off++;
  5569             while(off < ctx->size  &&  ISWHITESPACE(off))
  5570                 off++;
  5571         }
  5572 
  5573         /* Success, if we reach end of line. */
  5574         if(off >= ctx->size  ||  ISNEWLINE(off))
  5575             break;
  5576 
  5577         if(!delimited)
  5578             return FALSE;
  5579     }
  5580 
  5581     if(!found_pipe)
  5582         return FALSE;
  5583 
  5584     *p_end = off;
  5585     *p_col_count = col_count;
  5586     return TRUE;
  5587 }
  5588 
  5589 static int
  5590 md_is_opening_code_fence(MD_CTX* ctx, OFF beg, OFF* p_end)
  5591 {
  5592     OFF off = beg;
  5593 
  5594     while(off < ctx->size && CH(off) == CH(beg))
  5595         off++;
  5596 
  5597     /* Fence must have at least three characters. */
  5598     if(off - beg < 3)
  5599         return FALSE;
  5600 
  5601     ctx->code_fence_length = off - beg;
  5602 
  5603     /* Optionally, space(s) can follow. */
  5604     while(off < ctx->size  &&  CH(off) == _T(' '))
  5605         off++;
  5606 
  5607     /* Optionally, an info string can follow. */
  5608     while(off < ctx->size  &&  !ISNEWLINE(off)) {
  5609         /* Backtick-based fence must not contain '`' in the info string. */
  5610         if(CH(beg) == _T('`')  &&  CH(off) == _T('`'))
  5611             return FALSE;
  5612         off++;
  5613     }
  5614 
  5615     *p_end = off;
  5616     return TRUE;
  5617 }
  5618 
  5619 static int
  5620 md_is_closing_code_fence(MD_CTX* ctx, CHAR ch, OFF beg, OFF* p_end)
  5621 {
  5622     OFF off = beg;
  5623     int ret = FALSE;
  5624 
  5625     /* Closing fence must have at least the same length and use same char as
  5626      * opening one. */
  5627     while(off < ctx->size  &&  CH(off) == ch)
  5628         off++;
  5629     if(off - beg < ctx->code_fence_length)
  5630         goto out;
  5631 
  5632     /* Optionally, space(s) can follow */
  5633     while(off < ctx->size  &&  CH(off) == _T(' '))
  5634         off++;
  5635 
  5636     /* But nothing more is allowed on the line. */
  5637     if(off < ctx->size  &&  !ISNEWLINE(off))
  5638         goto out;
  5639 
  5640     ret = TRUE;
  5641 
  5642 out:
  5643     /* Note we set *p_end even on failure: If we are not closing fence, caller
  5644      * would eat the line anyway without any parsing. */
  5645     *p_end = off;
  5646     return ret;
  5647 }
  5648 
  5649 /* Returns type of the raw HTML block, or FALSE if it is not HTML block.
  5650  * (Refer to CommonMark specification for details about the types.)
  5651  */
  5652 static int
  5653 md_is_html_block_start_condition(MD_CTX* ctx, OFF beg)
  5654 {
  5655     typedef struct TAG_tag TAG;
  5656     struct TAG_tag {
  5657         const CHAR* name;
  5658         unsigned len    : 8;
  5659     };
  5660 
  5661     /* Type 6 is started by a long list of allowed tags. We use two-level
  5662      * tree to speed-up the search. */
  5663 #ifdef X
  5664     #undef X
  5665 #endif
  5666 #define X(name)     { _T(name), (sizeof(name)-1) / sizeof(CHAR) }
  5667 #define Xend        { NULL, 0 }
  5668     static const TAG t1[] = { X("pre"), X("script"), X("style"), X("textarea"), Xend };
  5669 
  5670     static const TAG a6[] = { X("address"), X("article"), X("aside"), Xend };
  5671     static const TAG b6[] = { X("base"), X("basefont"), X("blockquote"), X("body"), Xend };
  5672     static const TAG c6[] = { X("caption"), X("center"), X("col"), X("colgroup"), Xend };
  5673     static const TAG d6[] = { X("dd"), X("details"), X("dialog"), X("dir"),
  5674                               X("div"), X("dl"), X("dt"), Xend };
  5675     static const TAG f6[] = { X("fieldset"), X("figcaption"), X("figure"), X("footer"),
  5676                               X("form"), X("frame"), X("frameset"), Xend };
  5677     static const TAG h6[] = { X("h1"), X("head"), X("header"), X("hr"), X("html"), Xend };
  5678     static const TAG i6[] = { X("iframe"), Xend };
  5679     static const TAG l6[] = { X("legend"), X("li"), X("link"), Xend };
  5680     static const TAG m6[] = { X("main"), X("menu"), X("menuitem"), Xend };
  5681     static const TAG n6[] = { X("nav"), X("noframes"), Xend };
  5682     static const TAG o6[] = { X("ol"), X("optgroup"), X("option"), Xend };
  5683     static const TAG p6[] = { X("p"), X("param"), Xend };
  5684     static const TAG s6[] = { X("section"), X("source"), X("summary"), Xend };
  5685     static const TAG t6[] = { X("table"), X("tbody"), X("td"), X("tfoot"), X("th"),
  5686                               X("thead"), X("title"), X("tr"), X("track"), Xend };
  5687     static const TAG u6[] = { X("ul"), Xend };
  5688     static const TAG xx[] = { Xend };
  5689 #undef X
  5690 
  5691     static const TAG* map6[26] = {
  5692         a6, b6, c6, d6, xx, f6, xx, h6, i6, xx, xx, l6, m6,
  5693         n6, o6, p6, xx, xx, s6, t6, u6, xx, xx, xx, xx, xx
  5694     };
  5695     OFF off = beg + 1;
  5696     int i;
  5697 
  5698     /* Check for type 1: <script, <pre, or <style */
  5699     for(i = 0; t1[i].name != NULL; i++) {
  5700         if(off + t1[i].len <= ctx->size) {
  5701             if(md_ascii_case_eq(STR(off), t1[i].name, t1[i].len))
  5702                 return 1;
  5703         }
  5704     }
  5705 
  5706     /* Check for type 2: <!-- */
  5707     if(off + 3 < ctx->size  &&  CH(off) == _T('!')  &&  CH(off+1) == _T('-')  &&  CH(off+2) == _T('-'))
  5708         return 2;
  5709 
  5710     /* Check for type 3: <? */
  5711     if(off < ctx->size  &&  CH(off) == _T('?'))
  5712         return 3;
  5713 
  5714     /* Check for type 4 or 5: <! */
  5715     if(off < ctx->size  &&  CH(off) == _T('!')) {
  5716         /* Check for type 4: <! followed by uppercase letter. */
  5717         if(off + 1 < ctx->size  &&  ISASCII(off+1))
  5718             return 4;
  5719 
  5720         /* Check for type 5: <![CDATA[ */
  5721         if(off + 8 < ctx->size) {
  5722             if(md_ascii_eq(STR(off), _T("![CDATA["), 8))
  5723                 return 5;
  5724         }
  5725     }
  5726 
  5727     /* Check for type 6: Many possible starting tags listed above. */
  5728     if(off + 1 < ctx->size  &&  (ISALPHA(off) || (CH(off) == _T('/') && ISALPHA(off+1)))) {
  5729         int slot;
  5730         const TAG* tags;
  5731 
  5732         if(CH(off) == _T('/'))
  5733             off++;
  5734 
  5735         slot = (ISUPPER(off) ? CH(off) - 'A' : CH(off) - 'a');
  5736         tags = map6[slot];
  5737 
  5738         for(i = 0; tags[i].name != NULL; i++) {
  5739             if(off + tags[i].len <= ctx->size) {
  5740                 if(md_ascii_case_eq(STR(off), tags[i].name, tags[i].len)) {
  5741                     OFF tmp = off + tags[i].len;
  5742                     if(tmp >= ctx->size)
  5743                         return 6;
  5744                     if(ISBLANK(tmp) || ISNEWLINE(tmp) || CH(tmp) == _T('>'))
  5745                         return 6;
  5746                     if(tmp+1 < ctx->size && CH(tmp) == _T('/') && CH(tmp+1) == _T('>'))
  5747                         return 6;
  5748                     break;
  5749                 }
  5750             }
  5751         }
  5752     }
  5753 
  5754     /* Check for type 7: any COMPLETE other opening or closing tag. */
  5755     if(off + 1 < ctx->size) {
  5756         OFF end;
  5757 
  5758         if(md_is_html_tag(ctx, NULL, 0, beg, ctx->size, &end)) {
  5759             /* Only optional whitespace and new line may follow. */
  5760             while(end < ctx->size  &&  ISWHITESPACE(end))
  5761                 end++;
  5762             if(end >= ctx->size  ||  ISNEWLINE(end))
  5763                 return 7;
  5764         }
  5765     }
  5766 
  5767     return FALSE;
  5768 }
  5769 
  5770 /* Case sensitive check whether there is a substring 'what' between 'beg'
  5771  * and end of line. */
  5772 static int
  5773 md_line_contains(MD_CTX* ctx, OFF beg, const CHAR* what, SZ what_len, OFF* p_end)
  5774 {
  5775     OFF i;
  5776     for(i = beg; i + what_len < ctx->size; i++) {
  5777         if(ISNEWLINE(i))
  5778             break;
  5779         if(memcmp(STR(i), what, what_len * sizeof(CHAR)) == 0) {
  5780             *p_end = i + what_len;
  5781             return TRUE;
  5782         }
  5783     }
  5784 
  5785     *p_end = i;
  5786     return FALSE;
  5787 }
  5788 
  5789 /* Returns type of HTML block end condition or FALSE if not an end condition.
  5790  *
  5791  * Note it fills p_end even when it is not end condition as the caller
  5792  * does not need to analyze contents of a raw HTML block.
  5793  */
  5794 static int
  5795 md_is_html_block_end_condition(MD_CTX* ctx, OFF beg, OFF* p_end)
  5796 {
  5797     switch(ctx->html_block_type) {
  5798         case 1:
  5799         {
  5800             OFF off = beg;
  5801 
  5802             while(off < ctx->size  &&  !ISNEWLINE(off)) {
  5803                 if(CH(off) == _T('<')) {
  5804                   #define FIND_TAG_END(string, length) \
  5805                     if(off + length <= ctx->size && \
  5806                        md_ascii_case_eq(STR(off), _T(string), length)) { \
  5807                         *p_end = off + length; \
  5808                         return TRUE; \
  5809                     }
  5810                   FIND_TAG_END("</script>", 9)
  5811                   FIND_TAG_END("</style>", 8)
  5812                   FIND_TAG_END("</pre>", 6)
  5813                   #undef FIND_TAG_END
  5814                 }
  5815 
  5816                 off++;
  5817             }
  5818             *p_end = off;
  5819             return FALSE;
  5820         }
  5821 
  5822         case 2:
  5823             return (md_line_contains(ctx, beg, _T("-->"), 3, p_end) ? 2 : FALSE);
  5824 
  5825         case 3:
  5826             return (md_line_contains(ctx, beg, _T("?>"), 2, p_end) ? 3 : FALSE);
  5827 
  5828         case 4:
  5829             return (md_line_contains(ctx, beg, _T(">"), 1, p_end) ? 4 : FALSE);
  5830 
  5831         case 5:
  5832             return (md_line_contains(ctx, beg, _T("]]>"), 3, p_end) ? 5 : FALSE);
  5833 
  5834         case 6:     /* Pass through */
  5835         case 7:
  5836             *p_end = beg;
  5837             return (beg >= ctx->size || ISNEWLINE(beg) ? ctx->html_block_type : FALSE);
  5838 
  5839         default:
  5840             MD_UNREACHABLE();
  5841     }
  5842     return FALSE;
  5843 }
  5844 
  5845 
  5846 static int
  5847 md_is_container_compatible(const MD_CONTAINER* pivot, const MD_CONTAINER* container)
  5848 {
  5849     /* Block quote has no "items" like lists. */
  5850     if(container->ch == _T('>'))
  5851         return FALSE;
  5852 
  5853     if(container->ch != pivot->ch)
  5854         return FALSE;
  5855     if(container->mark_indent > pivot->contents_indent)
  5856         return FALSE;
  5857 
  5858     return TRUE;
  5859 }
  5860 
  5861 static int
  5862 md_push_container(MD_CTX* ctx, const MD_CONTAINER* container)
  5863 {
  5864     if(ctx->n_containers >= ctx->alloc_containers) {
  5865         MD_CONTAINER* new_containers;
  5866 
  5867         ctx->alloc_containers = (ctx->alloc_containers > 0
  5868                 ? ctx->alloc_containers + ctx->alloc_containers / 2
  5869                 : 16);
  5870         new_containers = realloc(ctx->containers, ctx->alloc_containers * sizeof(MD_CONTAINER));
  5871         if(new_containers == NULL) {
  5872             MD_LOG("realloc() failed.");
  5873             return -1;
  5874         }
  5875 
  5876         ctx->containers = new_containers;
  5877     }
  5878 
  5879     memcpy(&ctx->containers[ctx->n_containers++], container, sizeof(MD_CONTAINER));
  5880     return 0;
  5881 }
  5882 
  5883 static int
  5884 md_enter_child_containers(MD_CTX* ctx, int n_children)
  5885 {
  5886     int i;
  5887     int ret = 0;
  5888 
  5889     for(i = ctx->n_containers - n_children; i < ctx->n_containers; i++) {
  5890         MD_CONTAINER* c = &ctx->containers[i];
  5891         int is_ordered_list = FALSE;
  5892 
  5893         switch(c->ch) {
  5894             case _T(')'):
  5895             case _T('.'):
  5896                 is_ordered_list = TRUE;
  5897                 MD_FALLTHROUGH();
  5898 
  5899             case _T('-'):
  5900             case _T('+'):
  5901             case _T('*'):
  5902                 /* Remember offset in ctx->block_bytes so we can revisit the
  5903                  * block if we detect it is a loose list. */
  5904                 md_end_current_block(ctx);
  5905                 c->block_byte_off = ctx->n_block_bytes;
  5906 
  5907                 MD_CHECK(md_push_container_bytes(ctx,
  5908                                 (is_ordered_list ? MD_BLOCK_OL : MD_BLOCK_UL),
  5909                                 c->start, c->ch, MD_BLOCK_CONTAINER_OPENER));
  5910                 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
  5911                                 c->task_mark_off,
  5912                                 (c->is_task ? CH(c->task_mark_off) : 0),
  5913                                 MD_BLOCK_CONTAINER_OPENER));
  5914                 break;
  5915 
  5916             case _T('>'):
  5917                 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0, 0, MD_BLOCK_CONTAINER_OPENER));
  5918                 break;
  5919 
  5920             default:
  5921                 MD_UNREACHABLE();
  5922                 break;
  5923         }
  5924     }
  5925 
  5926 abort:
  5927     return ret;
  5928 }
  5929 
  5930 static int
  5931 md_leave_child_containers(MD_CTX* ctx, int n_keep)
  5932 {
  5933     int ret = 0;
  5934 
  5935     while(ctx->n_containers > n_keep) {
  5936         MD_CONTAINER* c = &ctx->containers[ctx->n_containers-1];
  5937         int is_ordered_list = FALSE;
  5938 
  5939         switch(c->ch) {
  5940             case _T(')'):
  5941             case _T('.'):
  5942                 is_ordered_list = TRUE;
  5943                 MD_FALLTHROUGH();
  5944 
  5945             case _T('-'):
  5946             case _T('+'):
  5947             case _T('*'):
  5948                 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
  5949                                 c->task_mark_off, (c->is_task ? CH(c->task_mark_off) : 0),
  5950                                 MD_BLOCK_CONTAINER_CLOSER));
  5951                 MD_CHECK(md_push_container_bytes(ctx,
  5952                                 (is_ordered_list ? MD_BLOCK_OL : MD_BLOCK_UL), 0,
  5953                                 c->ch, MD_BLOCK_CONTAINER_CLOSER));
  5954                 break;
  5955 
  5956             case _T('>'):
  5957                 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0,
  5958                                 0, MD_BLOCK_CONTAINER_CLOSER));
  5959                 break;
  5960 
  5961             default:
  5962                 MD_UNREACHABLE();
  5963                 break;
  5964         }
  5965 
  5966         ctx->n_containers--;
  5967     }
  5968 
  5969 abort:
  5970     return ret;
  5971 }
  5972 
  5973 static int
  5974 md_is_container_mark(MD_CTX* ctx, unsigned indent, OFF beg, OFF* p_end, MD_CONTAINER* p_container)
  5975 {
  5976     OFF off = beg;
  5977     OFF max_end;
  5978 
  5979     if(off >= ctx->size  ||  indent >= ctx->code_indent_offset)
  5980         return FALSE;
  5981 
  5982     /* Check for block quote mark. */
  5983     if(CH(off) == _T('>')) {
  5984         off++;
  5985         p_container->ch = _T('>');
  5986         p_container->is_loose = FALSE;
  5987         p_container->is_task = FALSE;
  5988         p_container->mark_indent = indent;
  5989         p_container->contents_indent = indent + 1;
  5990         *p_end = off;
  5991         return TRUE;
  5992     }
  5993 
  5994     /* Check for list item bullet mark. */
  5995     if(ISANYOF(off, _T("-+*"))  &&  (off+1 >= ctx->size || ISBLANK(off+1) || ISNEWLINE(off+1))) {
  5996         p_container->ch = CH(off);
  5997         p_container->is_loose = FALSE;
  5998         p_container->is_task = FALSE;
  5999         p_container->mark_indent = indent;
  6000         p_container->contents_indent = indent + 1;
  6001         *p_end = off+1;
  6002         return TRUE;
  6003     }
  6004 
  6005     /* Check for ordered list item marks. */
  6006     max_end = off + 9;
  6007     if(max_end > ctx->size)
  6008         max_end = ctx->size;
  6009     p_container->start = 0;
  6010     while(off < max_end  &&  ISDIGIT(off)) {
  6011         p_container->start = p_container->start * 10 + CH(off) - _T('0');
  6012         off++;
  6013     }
  6014     if(off > beg  &&
  6015        off < ctx->size  &&
  6016        (CH(off) == _T('.') || CH(off) == _T(')'))  &&
  6017        (off+1 >= ctx->size || ISBLANK(off+1) || ISNEWLINE(off+1)))
  6018     {
  6019         p_container->ch = CH(off);
  6020         p_container->is_loose = FALSE;
  6021         p_container->is_task = FALSE;
  6022         p_container->mark_indent = indent;
  6023         p_container->contents_indent = indent + off - beg + 1;
  6024         *p_end = off+1;
  6025         return TRUE;
  6026     }
  6027 
  6028     return FALSE;
  6029 }
  6030 
  6031 static unsigned
  6032 md_line_indentation(MD_CTX* ctx, unsigned total_indent, OFF beg, OFF* p_end)
  6033 {
  6034     OFF off = beg;
  6035     unsigned indent = total_indent;
  6036 
  6037     while(off < ctx->size  &&  ISBLANK(off)) {
  6038         if(CH(off) == _T('\t'))
  6039             indent = (indent + 4) & ~3;
  6040         else
  6041             indent++;
  6042         off++;
  6043     }
  6044 
  6045     *p_end = off;
  6046     return indent - total_indent;
  6047 }
  6048 
  6049 static const MD_LINE_ANALYSIS md_dummy_blank_line = { MD_LINE_BLANK, 0, 0, 0, 0 };
  6050 
  6051 /* Analyze type of the line and find some its properties. This serves as a
  6052  * main input for determining type and boundaries of a block. */
  6053 static int
  6054 md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end,
  6055                 const MD_LINE_ANALYSIS* pivot_line, MD_LINE_ANALYSIS* line)
  6056 {
  6057     unsigned total_indent = 0;
  6058     int n_parents = 0;
  6059     int n_brothers = 0;
  6060     int n_children = 0;
  6061     MD_CONTAINER container = { 0 };
  6062     int prev_line_has_list_loosening_effect = ctx->last_line_has_list_loosening_effect;
  6063     OFF off = beg;
  6064     OFF hr_killer = 0;
  6065     int ret = 0;
  6066 
  6067     line->indent = md_line_indentation(ctx, total_indent, off, &off);
  6068     total_indent += line->indent;
  6069     line->beg = off;
  6070 
  6071     /* Given the indentation and block quote marks '>', determine how many of
  6072      * the current containers are our parents. */
  6073     while(n_parents < ctx->n_containers) {
  6074         MD_CONTAINER* c = &ctx->containers[n_parents];
  6075 
  6076         if(c->ch == _T('>')  &&  line->indent < ctx->code_indent_offset  &&
  6077             off < ctx->size  &&  CH(off) == _T('>'))
  6078         {
  6079             /* Block quote mark. */
  6080             off++;
  6081             total_indent++;
  6082             line->indent = md_line_indentation(ctx, total_indent, off, &off);
  6083             total_indent += line->indent;
  6084 
  6085             /* The optional 1st space after '>' is part of the block quote mark. */
  6086             if(line->indent > 0)
  6087                 line->indent--;
  6088 
  6089             line->beg = off;
  6090 
  6091         } else if(c->ch != _T('>')  &&  line->indent >= c->contents_indent) {
  6092             /* List. */
  6093             line->indent -= c->contents_indent;
  6094         } else {
  6095             break;
  6096         }
  6097 
  6098         n_parents++;
  6099     }
  6100 
  6101     if(off >= ctx->size  ||  ISNEWLINE(off)) {
  6102         /* Blank line does not need any real indentation to be nested inside
  6103          * a list. */
  6104         if(n_brothers + n_children == 0) {
  6105             while(n_parents < ctx->n_containers  &&  ctx->containers[n_parents].ch != _T('>'))
  6106                 n_parents++;
  6107         }
  6108     }
  6109 
  6110     while(TRUE) {
  6111         /* Check whether we are fenced code continuation. */
  6112         if(pivot_line->type == MD_LINE_FENCEDCODE) {
  6113             line->beg = off;
  6114 
  6115             /* We are another MD_LINE_FENCEDCODE unless we are closing fence
  6116              * which we transform into MD_LINE_BLANK. */
  6117             if(line->indent < ctx->code_indent_offset) {
  6118                 if(md_is_closing_code_fence(ctx, CH(pivot_line->beg), off, &off)) {
  6119                     line->type = MD_LINE_BLANK;
  6120                     ctx->last_line_has_list_loosening_effect = FALSE;
  6121                     break;
  6122                 }
  6123             }
  6124 
  6125             /* Change indentation accordingly to the initial code fence. */
  6126             if(n_parents == ctx->n_containers) {
  6127                 if(line->indent > pivot_line->indent)
  6128                     line->indent -= pivot_line->indent;
  6129                 else
  6130                     line->indent = 0;
  6131 
  6132                 line->type = MD_LINE_FENCEDCODE;
  6133                 break;
  6134             }
  6135         }
  6136 
  6137         /* Check whether we are HTML block continuation. */
  6138         if(pivot_line->type == MD_LINE_HTML  &&  ctx->html_block_type > 0) {
  6139             if(n_parents < ctx->n_containers) {
  6140                 /* HTML block is implicitly ended if the enclosing container
  6141                  * block ends. */
  6142                 ctx->html_block_type = 0;
  6143             } else {
  6144                 int html_block_type;
  6145 
  6146                 html_block_type = md_is_html_block_end_condition(ctx, off, &off);
  6147                 if(html_block_type > 0) {
  6148                     MD_ASSERT(html_block_type == ctx->html_block_type);
  6149 
  6150                     /* Make sure this is the last line of the block. */
  6151                     ctx->html_block_type = 0;
  6152 
  6153                     /* Some end conditions serve as blank lines at the same time. */
  6154                     if(html_block_type == 6 || html_block_type == 7) {
  6155                         line->type = MD_LINE_BLANK;
  6156                         line->indent = 0;
  6157                         break;
  6158                     }
  6159                 }
  6160 
  6161                 line->type = MD_LINE_HTML;
  6162                 n_parents = ctx->n_containers;
  6163                 break;
  6164             }
  6165         }
  6166 
  6167         /* Check for blank line. */
  6168         if(off >= ctx->size  ||  ISNEWLINE(off)) {
  6169             if(pivot_line->type == MD_LINE_INDENTEDCODE  &&  n_parents == ctx->n_containers) {
  6170                 line->type = MD_LINE_INDENTEDCODE;
  6171                 if(line->indent > ctx->code_indent_offset)
  6172                     line->indent -= ctx->code_indent_offset;
  6173                 else
  6174                     line->indent = 0;
  6175                 ctx->last_line_has_list_loosening_effect = FALSE;
  6176             } else {
  6177                 line->type = MD_LINE_BLANK;
  6178                 ctx->last_line_has_list_loosening_effect = (n_parents > 0  &&
  6179                         n_brothers + n_children == 0  &&
  6180                         ctx->containers[n_parents-1].ch != _T('>'));
  6181 
  6182     #if 1
  6183                 /* See https://github.com/mity/md4c/issues/6
  6184                  *
  6185                  * This ugly checking tests we are in (yet empty) list item but
  6186                  * not its very first line (i.e. not the line with the list
  6187                  * item mark).
  6188                  *
  6189                  * If we are such a blank line, then any following non-blank
  6190                  * line which would be part of the list item actually has to
  6191                  * end the list because according to the specification, "a list
  6192                  * item can begin with at most one blank line."
  6193                  */
  6194                 if(n_parents > 0  &&  ctx->containers[n_parents-1].ch != _T('>')  &&
  6195                    n_brothers + n_children == 0  &&  ctx->current_block == NULL  &&
  6196                    ctx->n_block_bytes > (int) sizeof(MD_BLOCK))
  6197                 {
  6198                     MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK));
  6199                     if(top_block->type == MD_BLOCK_LI)
  6200                         ctx->last_list_item_starts_with_two_blank_lines = TRUE;
  6201                 }
  6202     #endif
  6203             }
  6204             break;
  6205         } else {
  6206     #if 1
  6207             /* This is the 2nd half of the hack. If the flag is set (i.e. there
  6208              * was a 2nd blank line at the beginning of the list item) and if
  6209              * we would otherwise still belong to the list item, we enforce
  6210              * the end of the list. */
  6211             ctx->last_line_has_list_loosening_effect = FALSE;
  6212             if(ctx->last_list_item_starts_with_two_blank_lines) {
  6213                 if(n_parents > 0  &&  ctx->containers[n_parents-1].ch != _T('>')  &&
  6214                    n_brothers + n_children == 0  &&  ctx->current_block == NULL  &&
  6215                    ctx->n_block_bytes > (int) sizeof(MD_BLOCK))
  6216                 {
  6217                     MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK));
  6218                     if(top_block->type == MD_BLOCK_LI)
  6219                         n_parents--;
  6220                 }
  6221 
  6222                 ctx->last_list_item_starts_with_two_blank_lines = FALSE;
  6223             }
  6224     #endif
  6225         }
  6226 
  6227         /* Check whether we are Setext underline. */
  6228         if(line->indent < ctx->code_indent_offset  &&  pivot_line->type == MD_LINE_TEXT
  6229             &&  off < ctx->size  &&  ISANYOF2(off, _T('='), _T('-'))
  6230             &&  (n_parents == ctx->n_containers))
  6231         {
  6232             unsigned level;
  6233 
  6234             if(md_is_setext_underline(ctx, off, &off, &level)) {
  6235                 line->type = MD_LINE_SETEXTUNDERLINE;
  6236                 line->data = level;
  6237                 break;
  6238             }
  6239         }
  6240 
  6241         /* Check for thematic break line. */
  6242         if(line->indent < ctx->code_indent_offset
  6243             &&  off < ctx->size  &&  off >= hr_killer
  6244             &&  ISANYOF(off, _T("-_*")))
  6245         {
  6246             if(md_is_hr_line(ctx, off, &off, &hr_killer)) {
  6247                 line->type = MD_LINE_HR;
  6248                 break;
  6249             }
  6250         }
  6251 
  6252         /* Check for "brother" container. I.e. whether we are another list item
  6253          * in already started list. */
  6254         if(n_parents < ctx->n_containers  &&  n_brothers + n_children == 0) {
  6255             OFF tmp;
  6256 
  6257             if(md_is_container_mark(ctx, line->indent, off, &tmp, &container)  &&
  6258                md_is_container_compatible(&ctx->containers[n_parents], &container))
  6259             {
  6260                 pivot_line = &md_dummy_blank_line;
  6261 
  6262                 off = tmp;
  6263 
  6264                 total_indent += container.contents_indent - container.mark_indent;
  6265                 line->indent = md_line_indentation(ctx, total_indent, off, &off);
  6266                 total_indent += line->indent;
  6267                 line->beg = off;
  6268 
  6269                 /* Some of the following whitespace actually still belongs to the mark. */
  6270                 if(off >= ctx->size || ISNEWLINE(off)) {
  6271                     container.contents_indent++;
  6272                 } else if(line->indent <= ctx->code_indent_offset) {
  6273                     container.contents_indent += line->indent;
  6274                     line->indent = 0;
  6275                 } else {
  6276                     container.contents_indent += 1;
  6277                     line->indent--;
  6278                 }
  6279 
  6280                 ctx->containers[n_parents].mark_indent = container.mark_indent;
  6281                 ctx->containers[n_parents].contents_indent = container.contents_indent;
  6282 
  6283                 n_brothers++;
  6284                 continue;
  6285             }
  6286         }
  6287 
  6288         /* Check for indented code.
  6289          * Note indented code block cannot interrupt a paragraph. */
  6290         if(line->indent >= ctx->code_indent_offset  &&
  6291             (pivot_line->type == MD_LINE_BLANK || pivot_line->type == MD_LINE_INDENTEDCODE))
  6292         {
  6293             line->type = MD_LINE_INDENTEDCODE;
  6294             MD_ASSERT(line->indent >= ctx->code_indent_offset);
  6295             line->indent -= ctx->code_indent_offset;
  6296             line->data = 0;
  6297             break;
  6298         }
  6299 
  6300         /* Check for start of a new container block. */
  6301         if(line->indent < ctx->code_indent_offset  &&
  6302            md_is_container_mark(ctx, line->indent, off, &off, &container))
  6303         {
  6304             if(pivot_line->type == MD_LINE_TEXT  &&  n_parents == ctx->n_containers  &&
  6305                         (off >= ctx->size || ISNEWLINE(off))  &&  container.ch != _T('>'))
  6306             {
  6307                 /* Noop. List mark followed by a blank line cannot interrupt a paragraph. */
  6308             } else if(pivot_line->type == MD_LINE_TEXT  &&  n_parents == ctx->n_containers  &&
  6309                         ISANYOF2_(container.ch, _T('.'), _T(')'))  &&  container.start != 1)
  6310             {
  6311                 /* Noop. Ordered list cannot interrupt a paragraph unless the start index is 1. */
  6312             } else {
  6313                 total_indent += container.contents_indent - container.mark_indent;
  6314                 line->indent = md_line_indentation(ctx, total_indent, off, &off);
  6315                 total_indent += line->indent;
  6316 
  6317                 line->beg = off;
  6318                 line->data = container.ch;
  6319 
  6320                 /* Some of the following whitespace actually still belongs to the mark. */
  6321                 if(off >= ctx->size || ISNEWLINE(off)) {
  6322                     container.contents_indent++;
  6323                 } else if(line->indent <= ctx->code_indent_offset) {
  6324                     container.contents_indent += line->indent;
  6325                     line->indent = 0;
  6326                 } else {
  6327                     container.contents_indent += 1;
  6328                     line->indent--;
  6329                 }
  6330 
  6331                 if(n_brothers + n_children == 0)
  6332                     pivot_line = &md_dummy_blank_line;
  6333 
  6334                 if(n_children == 0)
  6335                     MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers));
  6336 
  6337                 n_children++;
  6338                 MD_CHECK(md_push_container(ctx, &container));
  6339                 continue;
  6340             }
  6341         }
  6342 
  6343         /* Check whether we are table continuation. */
  6344         if(pivot_line->type == MD_LINE_TABLE  &&  n_parents == ctx->n_containers) {
  6345             line->type = MD_LINE_TABLE;
  6346             break;
  6347         }
  6348 
  6349         /* Check for ATX header. */
  6350         if(line->indent < ctx->code_indent_offset  &&
  6351                 off < ctx->size  &&  CH(off) == _T('#'))
  6352         {
  6353             unsigned level;
  6354 
  6355             if(md_is_atxheader_line(ctx, off, &line->beg, &off, &level)) {
  6356                 line->type = MD_LINE_ATXHEADER;
  6357                 line->data = level;
  6358                 break;
  6359             }
  6360         }
  6361 
  6362         /* Check whether we are starting code fence. */
  6363         if(off < ctx->size  &&  ISANYOF2(off, _T('`'), _T('~'))) {
  6364             if(md_is_opening_code_fence(ctx, off, &off)) {
  6365                 line->type = MD_LINE_FENCEDCODE;
  6366                 line->data = 1;
  6367                 break;
  6368             }
  6369         }
  6370 
  6371         /* Check for start of raw HTML block. */
  6372         if(off < ctx->size  &&  CH(off) == _T('<')
  6373             &&  !(ctx->parser.flags & MD_FLAG_NOHTMLBLOCKS))
  6374         {
  6375             ctx->html_block_type = md_is_html_block_start_condition(ctx, off);
  6376 
  6377             /* HTML block type 7 cannot interrupt paragraph. */
  6378             if(ctx->html_block_type == 7  &&  pivot_line->type == MD_LINE_TEXT)
  6379                 ctx->html_block_type = 0;
  6380 
  6381             if(ctx->html_block_type > 0) {
  6382                 /* The line itself also may immediately close the block. */
  6383                 if(md_is_html_block_end_condition(ctx, off, &off) == ctx->html_block_type) {
  6384                     /* Make sure this is the last line of the block. */
  6385                     ctx->html_block_type = 0;
  6386                 }
  6387 
  6388                 line->type = MD_LINE_HTML;
  6389                 break;
  6390             }
  6391         }
  6392 
  6393         /* Check for table underline. */
  6394         if((ctx->parser.flags & MD_FLAG_TABLES)  &&  pivot_line->type == MD_LINE_TEXT
  6395             &&  off < ctx->size  &&  ISANYOF3(off, _T('|'), _T('-'), _T(':'))
  6396             &&  n_parents == ctx->n_containers)
  6397         {
  6398             unsigned col_count;
  6399 
  6400             if(ctx->current_block != NULL  &&  ctx->current_block->n_lines == 1  &&
  6401                 md_is_table_underline(ctx, off, &off, &col_count))
  6402             {
  6403                 line->data = col_count;
  6404                 line->type = MD_LINE_TABLEUNDERLINE;
  6405                 break;
  6406             }
  6407         }
  6408 
  6409         /* By default, we are normal text line. */
  6410         line->type = MD_LINE_TEXT;
  6411         if(pivot_line->type == MD_LINE_TEXT  &&  n_brothers + n_children == 0) {
  6412             /* Lazy continuation. */
  6413             n_parents = ctx->n_containers;
  6414         }
  6415 
  6416         /* Check for task mark. */
  6417         if((ctx->parser.flags & MD_FLAG_TASKLISTS)  &&  n_brothers + n_children > 0  &&
  6418            ISANYOF_(ctx->containers[ctx->n_containers-1].ch, _T("-+*.)")))
  6419         {
  6420             OFF tmp = off;
  6421 
  6422             while(tmp < ctx->size  &&  tmp < off + 3  &&  ISBLANK(tmp))
  6423                 tmp++;
  6424             if(tmp + 2 < ctx->size  &&  CH(tmp) == _T('[')  &&
  6425                ISANYOF(tmp+1, _T("xX "))  &&  CH(tmp+2) == _T(']')  &&
  6426                (tmp + 3 == ctx->size  ||  ISBLANK(tmp+3)  ||  ISNEWLINE(tmp+3)))
  6427             {
  6428                 MD_CONTAINER* task_container = (n_children > 0 ? &ctx->containers[ctx->n_containers-1] : &container);
  6429                 task_container->is_task = TRUE;
  6430                 task_container->task_mark_off = tmp + 1;
  6431                 off = tmp + 3;
  6432                 while(off < ctx->size && ISWHITESPACE(off))
  6433                     off++;
  6434                 if (off == ctx->size) break;
  6435                 line->beg = off;
  6436             }
  6437         }
  6438 
  6439         break;
  6440     }
  6441 
  6442     /* Scan for end of the line.
  6443      *
  6444      * Note this is quite a bottleneck of the parsing as we here iterate almost
   6445      * over the complete document.
  6446      */
  6447 #if defined __linux__ && !defined MD4C_USE_UTF16
  6448     /* Recent glibc versions have superbly optimized strcspn(), even using
  6449      * vectorization if available. */
  6450     if(ctx->doc_ends_with_newline  &&  off < ctx->size) {
  6451         while(TRUE) {
  6452             off += (OFF) strcspn(STR(off), "\r\n");
  6453 
  6454             /* strcspn() can stop on zero terminator; but that can appear
   6455              * anywhere in the Markdown input... */
  6456             if(CH(off) == _T('\0'))
  6457                 off++;
  6458             else
  6459                 break;
  6460         }
  6461     } else
  6462 #endif
  6463     {
  6464         /* Optimization: Use some loop unrolling. */
  6465         while(off + 3 < ctx->size  &&  !ISNEWLINE(off+0)  &&  !ISNEWLINE(off+1)
  6466                                    &&  !ISNEWLINE(off+2)  &&  !ISNEWLINE(off+3))
  6467             off += 4;
  6468         while(off < ctx->size  &&  !ISNEWLINE(off))
  6469             off++;
  6470     }
  6471 
  6472     /* Set end of the line. */
  6473     line->end = off;
  6474 
  6475     /* But for ATX header, we should exclude the optional trailing mark. */
  6476     if(line->type == MD_LINE_ATXHEADER) {
  6477         OFF tmp = line->end;
  6478         while(tmp > line->beg && CH(tmp-1) == _T(' '))
  6479             tmp--;
  6480         while(tmp > line->beg && CH(tmp-1) == _T('#'))
  6481             tmp--;
  6482         if(tmp == line->beg || CH(tmp-1) == _T(' ') || (ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS))
  6483             line->end = tmp;
  6484     }
  6485 
  6486     /* Trim trailing spaces. */
  6487     if(line->type != MD_LINE_INDENTEDCODE  &&  line->type != MD_LINE_FENCEDCODE) {
  6488         while(line->end > line->beg && CH(line->end-1) == _T(' '))
  6489             line->end--;
  6490     }
  6491 
  6492     /* Eat also the new line. */
  6493     if(off < ctx->size && CH(off) == _T('\r'))
  6494         off++;
  6495     if(off < ctx->size && CH(off) == _T('\n'))
  6496         off++;
  6497 
  6498     *p_end = off;
  6499 
  6500     /* If we belong to a list after seeing a blank line, the list is loose. */
  6501     if(prev_line_has_list_loosening_effect  &&  line->type != MD_LINE_BLANK  &&  n_parents + n_brothers > 0) {
  6502         MD_CONTAINER* c = &ctx->containers[n_parents + n_brothers - 1];
  6503         if(c->ch != _T('>')) {
  6504             MD_BLOCK* block = (MD_BLOCK*) (((char*)ctx->block_bytes) + c->block_byte_off);
  6505             block->flags |= MD_BLOCK_LOOSE_LIST;
  6506         }
  6507     }
  6508 
  6509     /* Leave any containers we are not part of anymore. */
  6510     if(n_children == 0  &&  n_parents + n_brothers < ctx->n_containers)
  6511         MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers));
  6512 
  6513     /* Enter any container we found a mark for. */
  6514     if(n_brothers > 0) {
  6515         MD_ASSERT(n_brothers == 1);
  6516         MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
  6517                     ctx->containers[n_parents].task_mark_off,
  6518                     (ctx->containers[n_parents].is_task ? CH(ctx->containers[n_parents].task_mark_off) : 0),
  6519                     MD_BLOCK_CONTAINER_CLOSER));
  6520         MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
  6521                     container.task_mark_off,
  6522                     (container.is_task ? CH(container.task_mark_off) : 0),
  6523                     MD_BLOCK_CONTAINER_OPENER));
  6524         ctx->containers[n_parents].is_task = container.is_task;
  6525         ctx->containers[n_parents].task_mark_off = container.task_mark_off;
  6526     }
  6527 
  6528     if(n_children > 0)
  6529         MD_CHECK(md_enter_child_containers(ctx, n_children));
  6530 
  6531 abort:
  6532     return ret;
  6533 }
  6534 
  6535 static int
  6536 md_process_line(MD_CTX* ctx, const MD_LINE_ANALYSIS** p_pivot_line, MD_LINE_ANALYSIS* line)
  6537 {
  6538     const MD_LINE_ANALYSIS* pivot_line = *p_pivot_line;
  6539     int ret = 0;
  6540 
  6541     /* Blank line ends current leaf block. */
  6542     if(line->type == MD_LINE_BLANK) {
  6543         MD_CHECK(md_end_current_block(ctx));
  6544         *p_pivot_line = &md_dummy_blank_line;
  6545         return 0;
  6546     }
  6547 
  6548     /* Some line types form block on their own. */
  6549     if(line->type == MD_LINE_HR || line->type == MD_LINE_ATXHEADER) {
  6550         MD_CHECK(md_end_current_block(ctx));
  6551 
  6552         /* Add our single-line block. */
  6553         MD_CHECK(md_start_new_block(ctx, line));
  6554         MD_CHECK(md_add_line_into_current_block(ctx, line));
  6555         MD_CHECK(md_end_current_block(ctx));
  6556         *p_pivot_line = &md_dummy_blank_line;
  6557         return 0;
  6558     }
  6559 
  6560     /* MD_LINE_SETEXTUNDERLINE changes meaning of the current block and ends it. */
  6561     if(line->type == MD_LINE_SETEXTUNDERLINE) {
  6562         MD_ASSERT(ctx->current_block != NULL);
  6563         ctx->current_block->type = MD_BLOCK_H;
  6564         ctx->current_block->data = line->data;
  6565         ctx->current_block->flags |= MD_BLOCK_SETEXT_HEADER;
  6566         MD_CHECK(md_add_line_into_current_block(ctx, line));
  6567         MD_CHECK(md_end_current_block(ctx));
  6568         if(ctx->current_block == NULL) {
  6569             *p_pivot_line = &md_dummy_blank_line;
  6570         } else {
  6571             /* This happens if we have consumed all the body as link ref. defs.
  6572              * and downgraded the underline into start of a new paragraph block. */
  6573             line->type = MD_LINE_TEXT;
  6574             *p_pivot_line = line;
  6575         }
  6576         return 0;
  6577     }
  6578 
  6579     /* MD_LINE_TABLEUNDERLINE changes meaning of the current block. */
  6580     if(line->type == MD_LINE_TABLEUNDERLINE) {
  6581         MD_ASSERT(ctx->current_block != NULL);
  6582         MD_ASSERT(ctx->current_block->n_lines == 1);
  6583         ctx->current_block->type = MD_BLOCK_TABLE;
  6584         ctx->current_block->data = line->data;
  6585         MD_ASSERT(pivot_line != &md_dummy_blank_line);
  6586         ((MD_LINE_ANALYSIS*)pivot_line)->type = MD_LINE_TABLE;
  6587         MD_CHECK(md_add_line_into_current_block(ctx, line));
  6588         return 0;
  6589     }
  6590 
  6591     /* The current block also ends if the line has different type. */
  6592     if(line->type != pivot_line->type)
  6593         MD_CHECK(md_end_current_block(ctx));
  6594 
  6595     /* The current line may start a new block. */
  6596     if(ctx->current_block == NULL) {
  6597         MD_CHECK(md_start_new_block(ctx, line));
  6598         *p_pivot_line = line;
  6599     }
  6600 
  6601     /* In all other cases the line is just a continuation of the current block. */
  6602     MD_CHECK(md_add_line_into_current_block(ctx, line));
  6603 
  6604 abort:
  6605     return ret;
  6606 }
  6607 
  6608 static int
  6609 md_process_doc(MD_CTX *ctx)
  6610 {
  6611     const MD_LINE_ANALYSIS* pivot_line = &md_dummy_blank_line;
  6612     MD_LINE_ANALYSIS line_buf[2];
  6613     MD_LINE_ANALYSIS* line = &line_buf[0];
  6614     OFF off = 0;
  6615     int ret = 0;
  6616 
  6617     MD_ENTER_BLOCK(MD_BLOCK_DOC, NULL);
  6618 
  6619     while(off < ctx->size) {
  6620         if(line == pivot_line)
  6621             line = (line == &line_buf[0] ? &line_buf[1] : &line_buf[0]);
  6622 
  6623         MD_CHECK(md_analyze_line(ctx, off, &off, pivot_line, line));
  6624         MD_CHECK(md_process_line(ctx, &pivot_line, line));
  6625     }
  6626 
  6627     md_end_current_block(ctx);
  6628 
  6629     MD_CHECK(md_build_ref_def_hashtable(ctx));
  6630 
  6631     /* Process all blocks. */
  6632     MD_CHECK(md_leave_child_containers(ctx, 0));
  6633     MD_CHECK(md_process_all_blocks(ctx));
  6634 
  6635     MD_LEAVE_BLOCK(MD_BLOCK_DOC, NULL);
  6636 
  6637 abort:
  6638 
  6639 #if 0
  6640     /* Output some memory consumption statistics. */
  6641     {
  6642         char buffer[256];
  6643         sprintf(buffer, "Alloced %u bytes for block buffer.",
  6644                     (unsigned)(ctx->alloc_block_bytes));
  6645         MD_LOG(buffer);
  6646 
  6647         sprintf(buffer, "Alloced %u bytes for containers buffer.",
  6648                     (unsigned)(ctx->alloc_containers * sizeof(MD_CONTAINER)));
  6649         MD_LOG(buffer);
  6650 
  6651         sprintf(buffer, "Alloced %u bytes for marks buffer.",
  6652                     (unsigned)(ctx->alloc_marks * sizeof(MD_MARK)));
  6653         MD_LOG(buffer);
  6654 
  6655         sprintf(buffer, "Alloced %u bytes for aux. buffer.",
  6656                     (unsigned)(ctx->alloc_buffer * sizeof(MD_CHAR)));
  6657         MD_LOG(buffer);
  6658     }
  6659 #endif
  6660 
  6661     return ret;
  6662 }
  6663 
  6664 
  6665 /********************
  6666  ***  Public API  ***
  6667  ********************/
  6668 
  6669 int
  6670 md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userdata)
  6671 {
  6672     MD_CTX ctx = {.text = text,
  6673                   .size = size,
  6674                   .userdata = userdata,
  6675                   .code_indent_offset = (ctx.parser.flags & MD_FLAG_NOINDENTEDCODEBLOCKS) ? (OFF)(-1) : 4,
  6676                   .doc_ends_with_newline = (size > 0  &&  ISNEWLINE_(text[size-1]))};
  6677     int i;
  6678     int ret;
  6679 
  6680     if(parser->abi_version != 0) {
  6681         if(parser->debug_log != NULL)
  6682             parser->debug_log("Unsupported abi_version.", userdata);
  6683         return -1;
  6684     }
  6685 
  6686     /* Setup context structure. */
  6687     memcpy(&ctx.parser, parser, sizeof(MD_PARSER));
  6688     md_build_mark_char_map(&ctx);
  6689 
  6690     /* Reset all unresolved opener mark chains. */
  6691     for(i = 0; i < (int) SIZEOF_ARRAY(ctx.mark_chains); i++) {
  6692         ctx.mark_chains[i].head = -1;
  6693         ctx.mark_chains[i].tail = -1;
  6694     }
  6695     ctx.unresolved_link_head = -1;
  6696     ctx.unresolved_link_tail = -1;
  6697 
  6698     /* All the work. */
  6699     ret = md_process_doc(&ctx);
  6700 
  6701     /* Clean-up. */
  6702     md_free_ref_defs(&ctx);
  6703     md_free_ref_def_hashtable(&ctx);
  6704     free(ctx.buffer);
  6705     free(ctx.marks);
  6706     free(ctx.block_bytes);
  6707     free(ctx.containers);
  6708 
  6709     return ret;
  6710 }