md4c

Log

Files

Refs

README

LICENSE

md4c.h (15511B)

     1 /*
     2  * MD4C: Markdown parser for C
     3  * (http://github.com/mity/md4c)
     4  *
     5  * Copyright (c) 2016-2020 Martin Mitas
     6  *
     7  * Permission is hereby granted, free of charge, to any person obtaining a
     8  * copy of this software and associated documentation files (the "Software"),
     9  * to deal in the Software without restriction, including without limitation
    10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
    11  * and/or sell copies of the Software, and to permit persons to whom the
    12  * Software is furnished to do so, subject to the following conditions:
    13  *
    14  * The above copyright notice and this permission notice shall be included in
    15  * all copies or substantial portions of the Software.
    16  *
    17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
    18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    20  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    22  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
    23  * IN THE SOFTWARE.
    24  */
    25 
    26 #ifndef MD4C_H
    27 #define MD4C_H
    28 
    29 #ifdef __cplusplus
    30     extern "C" {
    31 #endif
    32 
    33 #if defined MD4C_USE_UTF16
    34     /* Magic to support UTF-16. Note that in order to use it, you have to define
    35      * the macro MD4C_USE_UTF16 both when building MD4C as well as when
    36      * including this header in your code. */
    37     #ifdef _WIN32
    38         #include <windows.h>
    39         typedef WCHAR       MD_CHAR;
    40     #else
    41         #error MD4C_USE_UTF16 is only supported on Windows.
    42     #endif
    43 #else
    44     typedef char            MD_CHAR;
    45 #endif
    46 
    47 typedef unsigned MD_SIZE;      /* Type of sizes, e.g. MD_ATTRIBUTE::size and the 'size' argument of md_parse(). */
    48 typedef unsigned MD_OFFSET;    /* Type of offsets, e.g. MD_ATTRIBUTE::substr_offsets and MD_BLOCK_LI_DETAIL::task_mark_offset. */
    49 
    50 
    51 /* Block represents a part of document hierarchy structure like a paragraph
    52  * or list item.
    53  */
    54 typedef enum MD_BLOCKTYPE {
    55     /* <body>...</body> */
    56     MD_BLOCK_DOC = 0,
    57 
    58     /* <blockquote>...</blockquote> */
    59     MD_BLOCK_QUOTE,
    60 
    61     /* <ul>...</ul>
    62      * Detail: Structure MD_BLOCK_UL_DETAIL. */
    63     MD_BLOCK_UL,
    64 
    65     /* <ol>...</ol>
    66      * Detail: Structure MD_BLOCK_OL_DETAIL. */
    67     MD_BLOCK_OL,
    68 
    69     /* <li>...</li>
    70      * Detail: Structure MD_BLOCK_LI_DETAIL. */
    71     MD_BLOCK_LI,
    72 
    73     /* <hr> */
    74     MD_BLOCK_HR,
    75 
    76     /* <h1>...</h1> (for levels up to 6)
    77      * Detail: Structure MD_BLOCK_H_DETAIL. */
    78     MD_BLOCK_H,
    79 
    80     /* <pre><code>...</code></pre>
    81      * Note the text lines within code blocks are terminated with '\n'
    82      * instead of explicit MD_TEXT_BR. Detail: Structure MD_BLOCK_CODE_DETAIL. */
    83     MD_BLOCK_CODE,
    84 
    85     /* Raw HTML block. This itself does not correspond to any particular HTML
    86      * tag. The contents of it _is_ raw HTML source intended to be put
    87      * in verbatim form to the HTML output. */
    88     MD_BLOCK_HTML,
    89 
    90     /* <p>...</p> */
    91     MD_BLOCK_P,
    92 
    93     /* <table>...</table> and its contents.
    94      * Detail: Structure MD_BLOCK_TABLE_DETAIL (for MD_BLOCK_TABLE),
    95      *         structure MD_BLOCK_TD_DETAIL (for MD_BLOCK_TH and MD_BLOCK_TD)
    96      * Note all of these are used only if extension MD_FLAG_TABLES is enabled. */
    97     MD_BLOCK_TABLE,
    98     MD_BLOCK_THEAD,
    99     MD_BLOCK_TBODY,
   100     MD_BLOCK_TR,
   101     MD_BLOCK_TH,
   102     MD_BLOCK_TD
   103 } MD_BLOCKTYPE;
   104 
   105 /* Span represents an in-line piece of a document which should be rendered with
   106  * the same font, color and other attributes. A sequence of spans forms a block
   107  * like paragraph or list item. */
   108 typedef enum MD_SPANTYPE {
   109     /* <em>...</em> */
   110     MD_SPAN_EM,
   111 
   112     /* <strong>...</strong> */
   113     MD_SPAN_STRONG,
   114 
   115     /* <a href="xxx">...</a>
   116      * Detail: Structure MD_SPAN_A_DETAIL. */
   117     MD_SPAN_A,
   118 
   119     /* <img src="xxx">...</img>
   120      * Detail: Structure MD_SPAN_IMG_DETAIL.
   121      * Note: Image text can contain nested spans and even nested images.
   122      * If rendered into the ALT attribute of an HTML <IMG> tag, it is the
   123      * responsibility of the caller's rendering callbacks to deal with it.
   124      */
   125     MD_SPAN_IMG,
   126 
   127     /* <code>...</code> */
   128     MD_SPAN_CODE,
   129 
   130     /* <del>...</del>
   131      * Note: Recognized only when MD_FLAG_STRIKETHROUGH is enabled.
   132      */
   133     MD_SPAN_DEL,
   134 
   135     /* For recognizing inline ($) and display ($$) equations
   136      * Note: Recognized only when MD_FLAG_LATEXMATHSPANS is enabled.
   137      */
   138     MD_SPAN_LATEXMATH,
   139     MD_SPAN_LATEXMATH_DISPLAY,
   140 
   141     /* Wiki links. Detail: Structure MD_SPAN_WIKILINK_DETAIL.
   142      * Note: Recognized only when MD_FLAG_WIKILINKS is enabled.
   143      */
   144     MD_SPAN_WIKILINK,
   145 
   146     /* <u>...</u>
   147      * Note: Recognized only when MD_FLAG_UNDERLINE is enabled. */
   148     MD_SPAN_U,
   149     MD_SPAN_FNT,    /* NOTE(review): MD_SPAN_FNT through MD_SPAN_ANCHOR carry no description in this header -- TODO document. */
   150     MD_SPAN_INV,
   151     MD_SPAN_COC,
   152     MD_SPAN_BLI,
   153     MD_SPAN_ANCHOR,
   154     /* This span type is issued by md4c
   155      * MD_SPAN_COLOR allows supporting RGB colors:
   156      * [text with colors](#1#13)
   157      * md4c treats colors as MD_SPAN_A and the parsing of the color
   158      * is done by the user.
   159      * NOTE(review): the trailing comma after MD_SPAN_COLOR is not valid in C89/C++98 and differs from the other enums here. */
   160     MD_SPAN_COLOR,
   161 } MD_SPANTYPE;
   162 
   163 /* Text is the actual textual contents of span. */
   164 typedef enum MD_TEXTTYPE {
   165     /* Normal text. */
   166     MD_TEXT_NORMAL = 0,
   167 
   168     /* NULL character. CommonMark requires replacing NULL character with
   169      * the replacement char U+FFFD, so this allows caller to do that easily. */
   170     MD_TEXT_NULLCHAR,
   171 
   172     /* Line breaks.
   173      * Note these are not sent from blocks with verbatim output (MD_BLOCK_CODE
   174      * or MD_BLOCK_HTML). In such cases, '\n' is part of the text itself. */
   175     MD_TEXT_BR,         /* <br> (hard break) */
   176     MD_TEXT_SOFTBR,     /* '\n' in source text where it is not semantically meaningful (soft break) */
   177 
   178     /* Entity.
   179      * (a) Named entity, e.g. &nbsp;
   180      *     (Note MD4C does not have a list of known entities.
   181      *     Anything matching the regexp /&[A-Za-z][A-Za-z0-9]{1,47};/ is
   182      *     treated as a named entity.)
   183      * (b) Numerical entity, e.g. &#1234;
   184      * (c) Hexadecimal entity, e.g. &#x12AB;
   185      *
   186      * As MD4C is mostly encoding agnostic, application gets the verbatim
   187      * entity text into the MD_PARSER::text() callback. */
   188     MD_TEXT_ENTITY,
   189 
   190     /* Text in a code block (inside MD_BLOCK_CODE) or inlined code (`code`).
   191      * If it is inside MD_BLOCK_CODE, it includes spaces for indentation and
   192      * '\n' for new lines. MD_TEXT_BR and MD_TEXT_SOFTBR are not sent for this
   193      * kind of text. */
   194     MD_TEXT_CODE,
   195 
   196     /* Text is a raw HTML. If it is contents of a raw HTML block (i.e. not
   197      * an inline raw HTML), then MD_TEXT_BR and MD_TEXT_SOFTBR are not used.
   198      * The text contains verbatim '\n' for the new lines. */
   199     MD_TEXT_HTML,
   200 
   201     /* Text is inside an equation. This is processed the same way as inlined code
   202      * spans (`code`). */
   203     MD_TEXT_LATEXMATH
   204 } MD_TEXTTYPE;
   205 
   206 
   207 /* Alignment enumeration (used by MD_BLOCK_TD_DETAIL::align). */
   208 typedef enum MD_ALIGN {
   209     MD_ALIGN_DEFAULT = 0,   /* When unspecified. */
   210     MD_ALIGN_LEFT,
   211     MD_ALIGN_CENTER,
   212     MD_ALIGN_RIGHT
   213 } MD_ALIGN;
   214 
   215 
   216 /* String attribute.
   217  *
   218  * This wraps strings which are outside of a normal text flow and which are
   219  * propagated within various detailed structures, but which still may contain
   220  * string portions of different types like e.g. entities.
   221  *
   222  * So, for example, let's consider this image:
   223  *
   224  *     ![image alt text](http://example.org/image.png 'foo &quot; bar')
   225  *
   226  * The image alt text is propagated as a normal text via the MD_PARSER::text()
   227  * callback. However, the image title ('foo &quot; bar') is propagated as
   228  * MD_ATTRIBUTE in MD_SPAN_IMG_DETAIL::title.
   229  *
   230  * Then the attribute MD_SPAN_IMG_DETAIL::title shall provide the following:
   231  *  -- [0]: "foo "   (substr_types[0] == MD_TEXT_NORMAL; substr_offsets[0] == 0)
   232  *  -- [1]: "&quot;" (substr_types[1] == MD_TEXT_ENTITY; substr_offsets[1] == 4)
   233  *  -- [2]: " bar"   (substr_types[2] == MD_TEXT_NORMAL; substr_offsets[2] == 10)
   234  *  -- [3]: (n/a)    (n/a                              ; substr_offsets[3] == 14)
   235  *
   236  * Note that these invariants are always guaranteed:
   237  *  -- substr_offsets[0] == 0
   238  *  -- substr_offsets[LAST+1] == size
   239  *  -- Currently, only MD_TEXT_NORMAL, MD_TEXT_ENTITY, MD_TEXT_NULLCHAR
   240  *     substrings can appear. This could change only if the specification
   241  *     changes.
   242  */
   243 typedef struct MD_ATTRIBUTE {
   244     const MD_CHAR* text;                /* The attribute string; generally not zero-terminated, use 'size'. */
   245     MD_SIZE size;                       /* Size of 'text'; equals the final entry of 'substr_offsets'. */
   246     const MD_TEXTTYPE* substr_types;    /* Type of each substring of 'text'. */
   247     const MD_OFFSET* substr_offsets;    /* Start offset of each substring, plus one terminating entry equal to 'size'. */
   248 } MD_ATTRIBUTE;
   249 
   250 
   251 /* Detailed info for MD_BLOCK_UL. */
   252 typedef struct MD_BLOCK_UL_DETAIL {
   253     int is_tight;           /* Non-zero if tight list, zero if loose. */
   254     MD_CHAR mark;           /* Item bullet character in MarkDown source of the list, e.g. '-', '+', '*'. */
   255 } MD_BLOCK_UL_DETAIL;
   256 
   257 /* Detailed info for MD_BLOCK_OL. */
   258 typedef struct MD_BLOCK_OL_DETAIL {
   259     unsigned start;         /* Start index of the ordered list. */
   260     int is_tight;           /* Non-zero if tight list, zero if loose. */
   261     MD_CHAR mark_delimiter; /* Character delimiting the item marks in MarkDown source, e.g. '.' or ')' */
   262 } MD_BLOCK_OL_DETAIL;
   263 
   264 /* Detailed info for MD_BLOCK_LI. */
   265 typedef struct MD_BLOCK_LI_DETAIL {
   266     int is_task;            /* Can be non-zero only with MD_FLAG_TASKLISTS */
   267     MD_CHAR task_mark;      /* If is_task, then one of 'x', 'X' or ' '. Undefined otherwise. */
   268     MD_OFFSET task_mark_offset;  /* If is_task, then offset in the input of the char between '[' and ']'. */
   269 } MD_BLOCK_LI_DETAIL;
   270 
   271 /* Detailed info for MD_BLOCK_H. */
   272 typedef struct MD_BLOCK_H_DETAIL {
   273     unsigned level;         /* Header level (1 - 6) */
   274 } MD_BLOCK_H_DETAIL;
   275 
   276 /* Detailed info for MD_BLOCK_CODE. */
   277 typedef struct MD_BLOCK_CODE_DETAIL {
   278     MD_ATTRIBUTE info;
   279     MD_ATTRIBUTE lang;
   280     MD_CHAR fence_char;     /* The character used for fenced code block; or zero for indented code block. */
   281 } MD_BLOCK_CODE_DETAIL;
   282 
   283 /* Detailed info for MD_BLOCK_TABLE. */
   284 typedef struct MD_BLOCK_TABLE_DETAIL {
   285     unsigned col_count;         /* Count of columns in the table. */
   286     unsigned head_row_count;    /* Count of rows in the table header (currently always 1) */
   287     unsigned body_row_count;    /* Count of rows in the table body */
   288 } MD_BLOCK_TABLE_DETAIL;
   289 
   290 /* Detailed info for MD_BLOCK_TH and MD_BLOCK_TD. */
   291 typedef struct MD_BLOCK_TD_DETAIL {
   292     MD_ALIGN align;
   293 } MD_BLOCK_TD_DETAIL;
   294 
   295 /* Detailed info for MD_SPAN_A. */
   296 typedef struct MD_SPAN_A_DETAIL {
   297     MD_ATTRIBUTE href;
   298     MD_ATTRIBUTE title;
   299 } MD_SPAN_A_DETAIL;
   300 
   301 /* Detailed info for MD_SPAN_IMG. */
   302 typedef struct MD_SPAN_IMG_DETAIL {
   303     MD_ATTRIBUTE src;
   304     MD_ATTRIBUTE title;
   305 } MD_SPAN_IMG_DETAIL;
   306 
   307 /* Detailed info for MD_SPAN_WIKILINK. NOTE(review): the struct tag below lacks the _DETAIL suffix used by the typedef name and by the sibling detail structs. */
   308 typedef struct MD_SPAN_WIKILINK {
   309     MD_ATTRIBUTE target;
   310 } MD_SPAN_WIKILINK_DETAIL;
   311 
   312 /* Flags specifying extensions/deviations from CommonMark specification.
   313  *
   314  * By default (when MD_PARSER::flags == 0), we follow CommonMark specification.
   315  * The following flags may allow some extensions or deviations from it.
   316  */
   317 #define MD_FLAG_COLLAPSEWHITESPACE          0x0001  /* In MD_TEXT_NORMAL, collapse non-trivial whitespace into single ' ' */
   318 #define MD_FLAG_PERMISSIVEATXHEADERS        0x0002  /* Do not require space in ATX headers ( ###header ) */
   319 #define MD_FLAG_PERMISSIVEURLAUTOLINKS      0x0004  /* Recognize URLs as autolinks even without '<', '>' */
   320 #define MD_FLAG_PERMISSIVEEMAILAUTOLINKS    0x0008  /* Recognize e-mails as autolinks even without '<', '>' and 'mailto:' */
   321 #define MD_FLAG_NOINDENTEDCODEBLOCKS        0x0010  /* Disable indented code blocks. (Only fenced code works.) */
   322 #define MD_FLAG_NOHTMLBLOCKS                0x0020  /* Disable raw HTML blocks. */
   323 #define MD_FLAG_NOHTMLSPANS                 0x0040  /* Disable raw HTML (inline). */
   324 #define MD_FLAG_TABLES                      0x0100  /* Enable tables extension. */
   325 #define MD_FLAG_STRIKETHROUGH               0x0200  /* Enable strikethrough extension. */
   326 #define MD_FLAG_PERMISSIVEWWWAUTOLINKS      0x0400  /* Enable WWW autolinks (even without any scheme prefix, if they begin with 'www.') */
   327 #define MD_FLAG_TASKLISTS                   0x0800  /* Enable task list extension. */
   328 #define MD_FLAG_LATEXMATHSPANS              0x1000  /* Enable $ (inline) and $$ (display) containing LaTeX equations. */
   329 #define MD_FLAG_WIKILINKS                   0x2000  /* Enable wiki links extension. */
   330 #define MD_FLAG_UNDERLINE                   0x4000  /* Enable underline extension (and disables '_' for normal emphasis). */
   331 
   332 #define MD_FLAG_PERMISSIVEAUTOLINKS         (MD_FLAG_PERMISSIVEEMAILAUTOLINKS | MD_FLAG_PERMISSIVEURLAUTOLINKS | MD_FLAG_PERMISSIVEWWWAUTOLINKS)
   333 #define MD_FLAG_NOHTML                      (MD_FLAG_NOHTMLBLOCKS | MD_FLAG_NOHTMLSPANS)
   334 
   335 /* Convenient sets of flags corresponding to well-known Markdown dialects.
   336  *
   337  * Note we may only support subset of features of the referred dialect.
   338  * The constant just enables those extensions which bring us as close as
   339  * possible given what features we implement.
   340  *
   341  * ABI compatibility note: Meaning of these can change in time as new
   342  * extensions, bringing the dialect closer to the original, are implemented.
   343  */
   344 #define MD_DIALECT_COMMONMARK               0
   345 #define MD_DIALECT_GITHUB                   (MD_FLAG_PERMISSIVEAUTOLINKS | MD_FLAG_TABLES | MD_FLAG_STRIKETHROUGH | MD_FLAG_TASKLISTS)
   346 
   347 /* Parser structure.
   348  */
   349 typedef struct MD_PARSER {
   350     /* Reserved. Set to zero.
   351      */
   352     unsigned abi_version;
   353 
   354     /* Dialect options. Bitmask of MD_FLAG_xxxx values.
   355      */
   356     unsigned flags;
   357 
   358     /* Caller-provided rendering callbacks.
   359      *
   360      * For some block/span types, more detailed information is provided in a
   361      * type-specific structure pointed by the argument 'detail'.
   362      *
   363      * The last argument of all callbacks, 'userdata', is just propagated from
   364      * md_parse() and is available for any use by the application.
   365      *
   366      * Note any strings provided to the callbacks as their arguments or as
   367      * members of any detail structure are generally not zero-terminated.
   368      * Application has to take the respective size information into account.
   369      *
   370      * Any rendering callback may abort further parsing of the document by
   371      * returning non-zero.
   372      */
   373     int (*enter_block)(MD_BLOCKTYPE /*type*/, void* /*detail*/, void* /*userdata*/);
   374     int (*leave_block)(MD_BLOCKTYPE /*type*/, void* /*detail*/, void* /*userdata*/);
   375 
   376     int (*enter_span)(MD_SPANTYPE /*type*/, void* /*detail*/, void* /*userdata*/);
   377     int (*leave_span)(MD_SPANTYPE /*type*/, void* /*detail*/, void* /*userdata*/);
   378 
   379     int (*text)(MD_TEXTTYPE /*type*/, const MD_CHAR* /*text*/, MD_SIZE /*size*/, void* /*userdata*/);
   380 
   381     /* Debug callback. Optional (may be NULL).
   382      *
   383      * If provided and something goes wrong, this function gets called.
   384      * This is intended for debugging and problem diagnosis for developers;
   385      * it is not intended to provide any errors suitable for displaying to an
   386      * end user.
   387      */
   388     void (*debug_log)(const char* /*msg*/, void* /*userdata*/);
   389 
   390     /* Reserved. Set to NULL.
   391      */
   392     void (*syntax)(void);
   393 } MD_PARSER;
   394 
   395 
   396 /* For backward compatibility. Do not use in new code.
   397  */
   398 typedef MD_PARSER MD_RENDERER;
   399 
   400 
   401 /* Parse the Markdown document stored in the string 'text' of size 'size'.
   402  * The parser provides callbacks to be called during the parsing so the
   403  * caller can render the document on the screen or convert the Markdown
   404  * to another format.
   405  *
   406  * Zero is returned on success. If a runtime error occurs (e.g. a memory
   407  * allocation fails), -1 is returned. If the processing is aborted due to any
   408  * callback returning non-zero, the return value of the callback is returned.
   409  */
   410 int md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userdata);
   411 
   412 
   413 #ifdef __cplusplus
   414     }  /* extern "C" { */
   415 #endif
   416 
   417 #endif  /* MD4C_H */