💾 Archived View for gmi.noulin.net › gitRepositories › md4c › file › md4c › md4c.h.gmi captured on 2023-01-29 at 13:23:31. Gemini links have been rewritten to link to archived content
-=-=-=-=-=-=-
md4c.h (15511B)
/*
 * MD4C: Markdown parser for C
 * (http://github.com/mity/md4c)
 *
 * Copyright (c) 2016-2020 Martin Mitas
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef MD4C_H
#define MD4C_H

#ifdef __cplusplus
    extern "C" {
#endif

#if defined MD4C_USE_UTF16
    /* Magic to support UTF-16. Note that in order to use it, you have to define
     * the macro MD4C_USE_UTF16 both when building MD4C as well as when
     * including this header in your code. */
    #ifdef _WIN32
        #include <windows.h>
        typedef WCHAR       MD_CHAR;
    #else
        #error MD4C_USE_UTF16 is only supported on Windows.
    #endif
#else
    typedef char            MD_CHAR;
#endif

/* Size of, and offset into, a string of MD_CHAR. */
typedef unsigned MD_SIZE;
typedef unsigned MD_OFFSET;


/* Block represents a part of document hierarchy structure like a paragraph
 * or list item.
 */
typedef enum MD_BLOCKTYPE {
    /* <body>...</body> */
    MD_BLOCK_DOC = 0,

    /* <blockquote>...</blockquote> */
    MD_BLOCK_QUOTE,

    /* <ul>...</ul>
     * Detail: Structure MD_BLOCK_UL_DETAIL. */
    MD_BLOCK_UL,

    /* <ol>...</ol>
     * Detail: Structure MD_BLOCK_OL_DETAIL. */
    MD_BLOCK_OL,

    /* <li>...</li>
     * Detail: Structure MD_BLOCK_LI_DETAIL. */
    MD_BLOCK_LI,

    /* <hr> */
    MD_BLOCK_HR,

    /* <h1>...</h1> (for levels up to 6)
     * Detail: Structure MD_BLOCK_H_DETAIL. */
    MD_BLOCK_H,

    /* <pre><code>...</code></pre>
     * Detail: Structure MD_BLOCK_CODE_DETAIL.
     * Note the text lines within code blocks are terminated with '\n'
     * instead of explicit MD_TEXT_BR. */
    MD_BLOCK_CODE,

    /* Raw HTML block. This itself does not correspond to any particular HTML
     * tag. The contents of it _is_ raw HTML source intended to be put
     * in verbatim form to the HTML output. */
    MD_BLOCK_HTML,

    /* <p>...</p> */
    MD_BLOCK_P,

    /* <table>...</table> and its contents.
     * Detail: Structure MD_BLOCK_TABLE_DETAIL (for MD_BLOCK_TABLE),
     *         structure MD_BLOCK_TD_DETAIL (for MD_BLOCK_TH and MD_BLOCK_TD)
     * Note all of these are used only if extension MD_FLAG_TABLES is enabled. */
    MD_BLOCK_TABLE,
    MD_BLOCK_THEAD,
    MD_BLOCK_TBODY,
    MD_BLOCK_TR,
    MD_BLOCK_TH,
    MD_BLOCK_TD
} MD_BLOCKTYPE;

/* Span represents an in-line piece of a document which should be rendered with
 * the same font, color and other attributes. A sequence of spans forms a block
 * like paragraph or list item. */
typedef enum MD_SPANTYPE {
    /* <em>...</em> */
    MD_SPAN_EM,

    /* <strong>...</strong> */
    MD_SPAN_STRONG,

    /* <a href="xxx">...</a>
     * Detail: Structure MD_SPAN_A_DETAIL. */
    MD_SPAN_A,

    /* <img src="xxx">
     * Detail: Structure MD_SPAN_IMG_DETAIL.
     * Note: Image text can contain nested spans and even nested images.
     * If rendered into ALT attribute of HTML <IMG> tag, it is the
     * responsibility of the parser to deal with it.
     */
    MD_SPAN_IMG,

    /* <code>...</code> */
    MD_SPAN_CODE,

    /* <del>...</del>
     * Note: Recognized only when MD_FLAG_STRIKETHROUGH is enabled.
     */
    MD_SPAN_DEL,

    /* For recognizing inline ($) and display ($$) equations
     * Note: Recognized only when MD_FLAG_LATEXMATHSPANS is enabled.
     */
    MD_SPAN_LATEXMATH,
    MD_SPAN_LATEXMATH_DISPLAY,

    /* Wiki links
     * Detail: Structure MD_SPAN_WIKILINK_DETAIL.
     * Note: Recognized only when MD_FLAG_WIKILINKS is enabled.
     */
    MD_SPAN_WIKILINK,

    /* <u>...</u>
     * Note: Recognized only when MD_FLAG_UNDERLINE is enabled. */
    MD_SPAN_U,

    /* Additional span types whose meaning is not documented in this header.
     * NOTE(review): confirm their semantics against the parser implementation
     * and document them here. */
    MD_SPAN_FNT,
    MD_SPAN_INV,
    MD_SPAN_COC,
    MD_SPAN_BLI,
    MD_SPAN_ANCHOR,

    /* This span type is issued by md4c
     * MD_SPAN_COLOR allows supporting RGB colors:
     * [text with colors](#1#13)
     * md4c treats colors as MD_SPAN_A and the parsing of the color
     * is done by the user.
     *
     * NOTE(review): the first and the last sentence above appear to disagree
     * on whether md4c itself ever reports MD_SPAN_COLOR -- confirm against
     * the parser implementation.
     */
    MD_SPAN_COLOR
} MD_SPANTYPE;

/* Text is the actual textual contents of span. */
typedef enum MD_TEXTTYPE {
    /* Normal text. */
    MD_TEXT_NORMAL = 0,

    /* NULL character. CommonMark requires replacing NULL character with
     * the replacement char U+FFFD, so this allows caller to do that easily. */
    MD_TEXT_NULLCHAR,

    /* Line breaks.
     * Note these are not sent from blocks with verbatim output (MD_BLOCK_CODE
     * or MD_BLOCK_HTML). In such cases, '\n' is part of the text itself. */
    MD_TEXT_BR,         /* <br> (hard break) */
    MD_TEXT_SOFTBR,     /* '\n' in source text where it is not semantically meaningful (soft break) */

    /* Entity.
     * (a) Named entity, e.g. &nbsp;
     *     (Note MD4C does not have a list of known entities.
     *     Anything matching the regexp /&[A-Za-z][A-Za-z0-9]{1,47};/ is
     *     treated as a named entity.)
     * (b) Numerical entity, e.g. &#1234;
     * (c) Hexadecimal entity, e.g. &#x12AB;
     *
     * As MD4C is mostly encoding agnostic, application gets the verbatim
     * entity text into the MD_PARSER::text() callback. */
    MD_TEXT_ENTITY,

    /* Text in a code block (inside MD_BLOCK_CODE) or inlined code (`code`).
     * If it is inside MD_BLOCK_CODE, it includes spaces for indentation and
     * '\n' for new lines. MD_TEXT_BR and MD_TEXT_SOFTBR are not sent for this
     * kind of text. */
    MD_TEXT_CODE,

    /* Text is a raw HTML. If it is contents of a raw HTML block (i.e. not
     * an inline raw HTML), then MD_TEXT_BR and MD_TEXT_SOFTBR are not used.
     * The text contains verbatim '\n' for the new lines. */
    MD_TEXT_HTML,

    /* Text is inside an equation. This is processed the same way as inlined code
     * spans (`code`). */
    MD_TEXT_LATEXMATH
} MD_TEXTTYPE;


/* Alignment enumeration. */
typedef enum MD_ALIGN {
    MD_ALIGN_DEFAULT = 0,   /* When unspecified. */
    MD_ALIGN_LEFT,
    MD_ALIGN_CENTER,
    MD_ALIGN_RIGHT
} MD_ALIGN;


/* String attribute.
 *
 * This wraps strings which are outside of a normal text flow and which are
 * propagated within various detailed structures, but which still may contain
 * string portions of different types like e.g. entities.
 *
 * So, for example, let's consider this image:
 *
 *     ![image alt text](http://example.org/image.png 'foo &quot; bar')
 *
 * The image alt text is propagated as a normal text via the MD_PARSER::text()
 * callback. However, the image title ('foo &quot; bar') is propagated as
 * MD_ATTRIBUTE in MD_SPAN_IMG_DETAIL::title.
 *
 * Then the attribute MD_SPAN_IMG_DETAIL::title shall provide the following:
 *  -- [0]: "foo "   (substr_types[0] == MD_TEXT_NORMAL; substr_offsets[0] == 0)
 *  -- [1]: "&quot;" (substr_types[1] == MD_TEXT_ENTITY; substr_offsets[1] == 4)
 *  -- [2]: " bar"   (substr_types[2] == MD_TEXT_NORMAL; substr_offsets[2] == 10)
 *  -- [3]: (n/a)    (n/a                              ; substr_offsets[3] == 14)
 *
 * Note that these invariants are always guaranteed:
 *  -- substr_offsets[0] == 0
 *  -- substr_offsets[LAST+1] == size
 *  -- Currently, only MD_TEXT_NORMAL, MD_TEXT_ENTITY, MD_TEXT_NULLCHAR
 *     substrings can appear. This could change only if the specification
 *     changes.
 */
typedef struct MD_ATTRIBUTE {
    const MD_CHAR* text;
    MD_SIZE size;
    const MD_TEXTTYPE* substr_types;
    const MD_OFFSET* substr_offsets;
} MD_ATTRIBUTE;


/* Detailed info for MD_BLOCK_UL. */
typedef struct MD_BLOCK_UL_DETAIL {
    int is_tight;           /* Non-zero if tight list, zero if loose. */
    MD_CHAR mark;           /* Item bullet character in MarkDown source of the list, e.g. '-', '+', '*'. */
} MD_BLOCK_UL_DETAIL;

/* Detailed info for MD_BLOCK_OL. */
typedef struct MD_BLOCK_OL_DETAIL {
    unsigned start;         /* Start index of the ordered list. */
    int is_tight;           /* Non-zero if tight list, zero if loose. */
    MD_CHAR mark_delimiter; /* Character delimiting the item marks in MarkDown source, e.g. '.' or ')' */
} MD_BLOCK_OL_DETAIL;

/* Detailed info for MD_BLOCK_LI. */
typedef struct MD_BLOCK_LI_DETAIL {
    int is_task;                /* Can be non-zero only with MD_FLAG_TASKLISTS */
    MD_CHAR task_mark;          /* If is_task, then one of 'x', 'X' or ' '. Undefined otherwise. */
    MD_OFFSET task_mark_offset; /* If is_task, then offset in the input of the char between '[' and ']'. */
} MD_BLOCK_LI_DETAIL;

/* Detailed info for MD_BLOCK_H. */
typedef struct MD_BLOCK_H_DETAIL {
    unsigned level;         /* Header level (1 - 6) */
} MD_BLOCK_H_DETAIL;

/* Detailed info for MD_BLOCK_CODE. */
typedef struct MD_BLOCK_CODE_DETAIL {
    MD_ATTRIBUTE info;
    MD_ATTRIBUTE lang;
    MD_CHAR fence_char;     /* The character used for fenced code block; or zero for indented code block. */
} MD_BLOCK_CODE_DETAIL;

/* Detailed info for MD_BLOCK_TABLE. */
typedef struct MD_BLOCK_TABLE_DETAIL {
    unsigned col_count;         /* Count of columns in the table. */
    unsigned head_row_count;    /* Count of rows in the table header (currently always 1) */
    unsigned body_row_count;    /* Count of rows in the table body */
} MD_BLOCK_TABLE_DETAIL;

/* Detailed info for MD_BLOCK_TH and MD_BLOCK_TD. */
typedef struct MD_BLOCK_TD_DETAIL {
    MD_ALIGN align;
} MD_BLOCK_TD_DETAIL;

/* Detailed info for MD_SPAN_A. */
typedef struct MD_SPAN_A_DETAIL {
    MD_ATTRIBUTE href;
    MD_ATTRIBUTE title;
} MD_SPAN_A_DETAIL;

/* Detailed info for MD_SPAN_IMG. */
typedef struct MD_SPAN_IMG_DETAIL {
    MD_ATTRIBUTE src;
    MD_ATTRIBUTE title;
} MD_SPAN_IMG_DETAIL;

/* Detailed info for MD_SPAN_WIKILINK.
 * Note the struct tag (MD_SPAN_WIKILINK) lacks the _DETAIL suffix used by the
 * typedef name; it is left as is so existing code using either name keeps
 * compiling. */
typedef struct MD_SPAN_WIKILINK {
    MD_ATTRIBUTE target;
} MD_SPAN_WIKILINK_DETAIL;

/* Flags specifying extensions/deviations from CommonMark specification.
 *
 * By default (when MD_PARSER::flags == 0), we follow CommonMark specification.
 * The following flags may allow some extensions or deviations from it.
 */
#define MD_FLAG_COLLAPSEWHITESPACE          0x0001  /* In MD_TEXT_NORMAL, collapse non-trivial whitespace into single ' ' */
#define MD_FLAG_PERMISSIVEATXHEADERS        0x0002  /* Do not require space in ATX headers ( ###header ) */
#define MD_FLAG_PERMISSIVEURLAUTOLINKS      0x0004  /* Recognize URLs as autolinks even without '<', '>' */
#define MD_FLAG_PERMISSIVEEMAILAUTOLINKS    0x0008  /* Recognize e-mails as autolinks even without '<', '>' and 'mailto:' */
#define MD_FLAG_NOINDENTEDCODEBLOCKS        0x0010  /* Disable indented code blocks. (Only fenced code works.) */
#define MD_FLAG_NOHTMLBLOCKS                0x0020  /* Disable raw HTML blocks. */
#define MD_FLAG_NOHTMLSPANS                 0x0040  /* Disable raw HTML (inline). */
#define MD_FLAG_TABLES                      0x0100  /* Enable tables extension. */
#define MD_FLAG_STRIKETHROUGH               0x0200  /* Enable strikethrough extension. */
#define MD_FLAG_PERMISSIVEWWWAUTOLINKS      0x0400  /* Enable WWW autolinks (even without any scheme prefix, if they begin with 'www.') */
#define MD_FLAG_TASKLISTS                   0x0800  /* Enable task list extension. */
#define MD_FLAG_LATEXMATHSPANS              0x1000  /* Enable $ and $$ containing LaTeX equations. */
#define MD_FLAG_WIKILINKS                   0x2000  /* Enable wiki links extension. */
#define MD_FLAG_UNDERLINE                   0x4000  /* Enable underline extension (and disables '_' for normal emphasis). */

#define MD_FLAG_PERMISSIVEAUTOLINKS         (MD_FLAG_PERMISSIVEEMAILAUTOLINKS | MD_FLAG_PERMISSIVEURLAUTOLINKS | MD_FLAG_PERMISSIVEWWWAUTOLINKS)
#define MD_FLAG_NOHTML                      (MD_FLAG_NOHTMLBLOCKS | MD_FLAG_NOHTMLSPANS)

/* Convenient sets of flags corresponding to well-known Markdown dialects.
 *
 * Note we may only support subset of features of the referred dialect.
 * The constant just enables those extensions which bring us as close as
 * possible given what features we implement.
 *
 * ABI compatibility note: Meaning of these can change in time as new
 * extensions, bringing the dialect closer to the original, are implemented.
 */
#define MD_DIALECT_COMMONMARK               0
#define MD_DIALECT_GITHUB                   (MD_FLAG_PERMISSIVEAUTOLINKS | MD_FLAG_TABLES | MD_FLAG_STRIKETHROUGH | MD_FLAG_TASKLISTS)

/* Parser structure.
 */
typedef struct MD_PARSER {
    /* Reserved. Set to zero.
     */
    unsigned abi_version;

    /* Dialect options. Bitmask of MD_FLAG_xxxx values.
     */
    unsigned flags;

    /* Caller-provided rendering callbacks.
     *
     * For some block/span types, more detailed information is provided in a
     * type-specific structure pointed by the argument 'detail'.
     *
     * The last argument of all callbacks, 'userdata', is just propagated from
     * md_parse() and is available for any use by the application.
     *
     * Note any strings provided to the callbacks as their arguments or as
     * members of any detail structure are generally not zero-terminated.
     * Application has to take the respective size information into account.
     *
     * Any rendering callback may abort further parsing of the document by
     * returning non-zero.
     */
    int (*enter_block)(MD_BLOCKTYPE /*type*/, void* /*detail*/, void* /*userdata*/);
    int (*leave_block)(MD_BLOCKTYPE /*type*/, void* /*detail*/, void* /*userdata*/);

    int (*enter_span)(MD_SPANTYPE /*type*/, void* /*detail*/, void* /*userdata*/);
    int (*leave_span)(MD_SPANTYPE /*type*/, void* /*detail*/, void* /*userdata*/);

    int (*text)(MD_TEXTTYPE /*type*/, const MD_CHAR* /*text*/, MD_SIZE /*size*/, void* /*userdata*/);

    /* Debug callback. Optional (may be NULL).
     *
     * If provided and something goes wrong, this function gets called.
     * This is intended for debugging and problem diagnosis for developers;
     * it is not intended to provide any errors suitable for displaying to an
     * end user.
     */
    void (*debug_log)(const char* /*msg*/, void* /*userdata*/);

    /* Reserved. Set to NULL.
     */
    void (*syntax)(void);
} MD_PARSER;


/* For backward compatibility. Do not use in new code.
 */
typedef MD_PARSER MD_RENDERER;


/* Parse the Markdown document stored in the string 'text' of size 'size'.
 * The parser provides callbacks to be called during the parsing so the
 * caller can render the document on the screen or convert the Markdown
 * to another format. The 'userdata' pointer is passed through unchanged as
 * the last argument of every callback.
 *
 * Zero is returned on success. If a runtime error occurs (e.g. a memory
 * allocation fails), -1 is returned. If the processing is aborted due to any
 * callback returning non-zero, the return value of the callback is returned.
 */
int md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userdata);


#ifdef __cplusplus
    }  /* extern "C" { */
#endif

#endif  /* MD4C_H */