💾 Archived View for gmi.noulin.net › gitRepositories › md2html › file › render_html.c.gmi captured on 2023-01-29 at 13:24:49. Gemini links have been rewritten to link to archived content

View Raw

More Information

-=-=-=-=-=-=-

md2html

Log

Files

Refs

README

render_html.c (16141B)

     1 /*
     2  * MD4C: Markdown parser for C
     3  * (http://github.com/mity/md4c)
     4  *
     5  * Copyright (c) 2016-2017 Martin Mitas
     6  *
     7  * Permission is hereby granted, free of charge, to any person obtaining a
     8  * copy of this software and associated documentation files (the "Software"),
     9  * to deal in the Software without restriction, including without limitation
    10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
    11  * and/or sell copies of the Software, and to permit persons to whom the
    12  * Software is furnished to do so, subject to the following conditions:
    13  *
    14  * The above copyright notice and this permission notice shall be included in
    15  * all copies or substantial portions of the Software.
    16  *
    17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
    18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    20  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    22  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
    23  * IN THE SOFTWARE.
    24  */
    25 
    26 #include <stdio.h>
    27 #include <string.h>
    28 
    29 #include "render_html.h"
    30 #include "entity.h"
    31 
    32 
/* Compiler compatibility shims. */
#ifdef _MSC_VER
    /* MSVC does not understand "inline" when building as pure C (not C++).
     * However it understands "__inline" */
    #ifndef __cplusplus
        #define inline __inline
    #endif
#endif

/* On Windows, map snprintf() to _snprintf().
 * NOTE(review): _snprintf() presumably does not NUL-terminate on truncation;
 * verify against the CRT documentation. The only snprintf() call visible in
 * this file writes a short "<ol start=...>" tag into a 64-byte buffer. */
#ifdef _WIN32
    #define snprintf _snprintf
#endif
    44 
    45 
    46 
/* State of one HTML rendering run. A pointer to it is handed to md_parse()
 * as the userdata of every parser callback in this file. */
typedef struct MD_RENDER_HTML_tag MD_RENDER_HTML;
struct MD_RENDER_HTML_tag {
    /* Sink called with (chunk, chunk size, userdata) for each piece of
     * generated output. */
    void (*process_output)(const MD_CHAR*, MD_SIZE, void*);
    /* Opaque pointer passed back to process_output() unchanged. */
    void* userdata;
    /* Renderer flags; tested against MD_RENDER_FLAG_xxxx constants. */
    unsigned flags;
    /* Greater than zero while the ALT attribute of an <img> tag is being
     * rendered; span tags are suppressed during that time. */
    int image_nesting_level;
};
    54 
    55 
    56 /*****************************************
    57  ***  HTML rendering helper functions  ***
    58  *****************************************/
    59 
/* ASCII character classification done with plain range comparisons.
 * These are macros: the argument is evaluated more than once, so it must
 * not have side effects. */
#define ISDIGIT(ch)     ('0' <= (ch) && (ch) <= '9')
#define ISLOWER(ch)     ('a' <= (ch) && (ch) <= 'z')
#define ISUPPER(ch)     ('A' <= (ch) && (ch) <= 'Z')
#define ISALNUM(ch)     (ISLOWER(ch) || ISUPPER(ch) || ISDIGIT(ch))
    64 
    65 
    66 static inline void
    67 render_text(MD_RENDER_HTML* r, const MD_CHAR* text, MD_SIZE size)
    68 {
    69     r->process_output(text, size, r->userdata);
    70 }
    71 
/* Emit a NUL-terminated string verbatim through render_text(); the length is
 * measured with strlen(). The 'literal' argument is evaluated twice. */
#define RENDER_LITERAL(r, literal)    render_text((r), (literal), strlen(literal))
    73 
    74 
    75 static void
    76 render_html_escaped(MD_RENDER_HTML* r, const MD_CHAR* data, MD_SIZE size)
    77 {
    78     MD_OFFSET beg = 0;
    79     MD_OFFSET off = 0;
    80 
    81     /* Some characters need to be escaped in normal HTML text. */
    82     #define HTML_NEED_ESCAPE(ch)                                            \
    83             ((ch) == '&' || (ch) == '<' || (ch) == '>' || (ch) == '"')
    84 
    85     while(1) {
    86         while(off < size  &&  !HTML_NEED_ESCAPE(data[off]))
    87             off++;
    88         if(off > beg)
    89             render_text(r, data + beg, off - beg);
    90 
    91         if(off < size) {
    92             switch(data[off]) {
    93                 case '&':   RENDER_LITERAL(r, "&amp;"); break;
    94                 case '<':   RENDER_LITERAL(r, "&lt;"); break;
    95                 case '>':   RENDER_LITERAL(r, "&gt;"); break;
    96                 case '"':   RENDER_LITERAL(r, "&quot;"); break;
    97             }
    98             off++;
    99         } else {
   100             break;
   101         }
   102         beg = off;
   103     }
   104 }
   105 
   106 static void
   107 render_url_escaped(MD_RENDER_HTML* r, const MD_CHAR* data, MD_SIZE size)
   108 {
   109     static const MD_CHAR hex_chars[] = "0123456789ABCDEF";
   110     MD_OFFSET beg = 0;
   111     MD_OFFSET off = 0;
   112 
   113     #define URL_NEED_ESCAPE(ch)                                             \
   114             (!ISALNUM(ch)  &&  strchr("-_.+!*'(),%#@?=;:/,+$", ch) == NULL)
   115 
   116     while(1) {
   117         while(off < size  &&  !URL_NEED_ESCAPE(data[off]))
   118             off++;
   119         if(off > beg)
   120             render_text(r, data + beg, off - beg);
   121 
   122         if(off < size) {
   123             char hex[3];
   124 
   125             switch(data[off]) {
   126                 case '&':   RENDER_LITERAL(r, "&amp;"); break;
   127                 case '\'':  RENDER_LITERAL(r, "&#x27;"); break;
   128                 default:
   129                     hex[0] = '%';
   130                     hex[1] = hex_chars[((unsigned)data[off] >> 4) & 0xf];
   131                     hex[2] = hex_chars[((unsigned)data[off] >> 0) & 0xf];
   132                     render_text(r, hex, 3);
   133                     break;
   134             }
   135             off++;
   136         } else {
   137             break;
   138         }
   139 
   140         beg = off;
   141     }
   142 }
   143 
   144 static unsigned
   145 hex_val(char ch)
   146 {
   147     if('0' <= ch && ch <= '9')
   148         return ch - '0';
   149     if('A' <= ch && ch <= 'Z')
   150         return ch - 'A' + 10;
   151     else
   152         return ch - 'a' + 10;
   153 }
   154 
   155 static void
   156 render_utf8_codepoint(MD_RENDER_HTML* r, unsigned codepoint,
   157                       void (*fn_append)(MD_RENDER_HTML*, const MD_CHAR*, MD_SIZE))
   158 {
   159     static const MD_CHAR utf8_replacement_char[] = { 0xef, 0xbf, 0xbd };
   160 
   161     unsigned char utf8[4];
   162     size_t n;
   163 
   164     if(codepoint <= 0x7f) {
   165         n = 1;
   166         utf8[0] = codepoint;
   167     } else if(codepoint <= 0x7ff) {
   168         n = 2;
   169         utf8[0] = 0xc0 | ((codepoint >>  6) & 0x1f);
   170         utf8[1] = 0x80 + ((codepoint >>  0) & 0x3f);
   171     } else if(codepoint <= 0xffff) {
   172         n = 3;
   173         utf8[0] = 0xe0 | ((codepoint >> 12) & 0xf);
   174         utf8[1] = 0x80 + ((codepoint >>  6) & 0x3f);
   175         utf8[2] = 0x80 + ((codepoint >>  0) & 0x3f);
   176     } else {
   177         n = 4;
   178         utf8[0] = 0xf0 | ((codepoint >> 18) & 0x7);
   179         utf8[1] = 0x80 + ((codepoint >> 12) & 0x3f);
   180         utf8[2] = 0x80 + ((codepoint >>  6) & 0x3f);
   181         utf8[3] = 0x80 + ((codepoint >>  0) & 0x3f);
   182     }
   183 
   184     if(0 < codepoint  &&  codepoint <= 0x10ffff)
   185         fn_append(r, (char*)utf8, n);
   186     else
   187         fn_append(r, utf8_replacement_char, 3);
   188 }
   189 
   190 /* Translate entity to its UTF-8 equivalent, or output the verbatim one
   191  * if such entity is unknown (or if the translation is disabled). */
   192 static void
   193 render_entity(MD_RENDER_HTML* r, const MD_CHAR* text, MD_SIZE size,
   194               void (*fn_append)(MD_RENDER_HTML*, const MD_CHAR*, MD_SIZE))
   195 {
   196     if(r->flags & MD_RENDER_FLAG_VERBATIM_ENTITIES) {
   197         fn_append(r, text, size);
   198         return;
   199     }
   200 
   201     /* We assume UTF-8 output is what is desired. */
   202     if(size > 3 && text[1] == '#') {
   203         unsigned codepoint = 0;
   204 
   205         if(text[2] == 'x' || text[2] == 'X') {
   206             /* Hexadecimal entity (e.g. "&#x1234abcd;")). */
   207             MD_SIZE i;
   208             for(i = 3; i < size-1; i++)
   209                 codepoint = 16 * codepoint + hex_val(text[i]);
   210         } else {
   211             /* Decimal entity (e.g. "&1234;") */
   212             MD_SIZE i;
   213             for(i = 2; i < size-1; i++)
   214                 codepoint = 10 * codepoint + (text[i] - '0');
   215         }
   216 
   217         render_utf8_codepoint(r, codepoint, fn_append);
   218         return;
   219     } else {
   220         /* Named entity (e.g. "&nbsp;"). */
   221         const struct entity* ent;
   222 
   223         ent = entity_lookup(text, size);
   224         if(ent != NULL) {
   225             render_utf8_codepoint(r, ent->codepoints[0], fn_append);
   226             if(ent->codepoints[1])
   227                 render_utf8_codepoint(r, ent->codepoints[1], fn_append);
   228             return;
   229         }
   230     }
   231 
   232     fn_append(r, text, size);
   233 }
   234 
   235 static void
   236 render_attribute(MD_RENDER_HTML* r, const MD_ATTRIBUTE* attr,
   237                  void (*fn_append)(MD_RENDER_HTML*, const MD_CHAR*, MD_SIZE))
   238 {
   239     int i;
   240 
   241     for(i = 0; attr->substr_offsets[i] < attr->size; i++) {
   242         MD_TEXTTYPE type = attr->substr_types[i];
   243         MD_OFFSET off = attr->substr_offsets[i];
   244         MD_SIZE size = attr->substr_offsets[i+1] - off;
   245         const MD_CHAR* text = attr->text + off;
   246 
   247         switch(type) {
   248             case MD_TEXT_NULLCHAR:  render_utf8_codepoint(r, 0x0000, render_text); break;
   249             case MD_TEXT_ENTITY:    render_entity(r, text, size, fn_append); break;
   250             default:                fn_append(r, text, size); break;
   251         }
   252     }
   253 }
   254 
   255 
   256 static void
   257 render_open_ol_block(MD_RENDER_HTML* r, const MD_BLOCK_OL_DETAIL* det)
   258 {
   259     char buf[64];
   260 
   261     if(det->start == 1) {
   262         RENDER_LITERAL(r, "<ol>\n");
   263         return;
   264     }
   265 
   266     snprintf(buf, sizeof(buf), "<ol start=\"%u\">\n", det->start);
   267     RENDER_LITERAL(r, buf);
   268 }
   269 
   270 static void
   271 render_open_code_block(MD_RENDER_HTML* r, const MD_BLOCK_CODE_DETAIL* det)
   272 {
   273     RENDER_LITERAL(r, "<pre><code");
   274 
   275     /* If known, output the HTML 5 attribute class="language-LANGNAME". */
   276     if(det->lang.text != NULL) {
   277         RENDER_LITERAL(r, " class=\"language-");
   278         render_attribute(r, &det->lang, render_html_escaped);
   279         RENDER_LITERAL(r, "\"");
   280     }
   281 
   282     RENDER_LITERAL(r, ">");
   283 }
   284 
   285 static void
   286 render_open_td_block(MD_RENDER_HTML* r, const MD_CHAR* cell_type, const MD_BLOCK_TD_DETAIL* det)
   287 {
   288     RENDER_LITERAL(r, "<");
   289     RENDER_LITERAL(r, cell_type);
   290 
   291     switch(det->align) {
   292         case MD_ALIGN_LEFT:     RENDER_LITERAL(r, " align=\"left\">"); break;
   293         case MD_ALIGN_CENTER:   RENDER_LITERAL(r, " align=\"center\">"); break;
   294         case MD_ALIGN_RIGHT:    RENDER_LITERAL(r, " align=\"right\">"); break;
   295         default:                RENDER_LITERAL(r, ">"); break;
   296     }
   297 }
   298 
   299 static void
   300 render_open_a_span(MD_RENDER_HTML* r, const MD_SPAN_A_DETAIL* det)
   301 {
   302     RENDER_LITERAL(r, "<a href=\"");
   303     render_attribute(r, &det->href, render_url_escaped);
   304 
   305     if(det->title.text != NULL) {
   306         RENDER_LITERAL(r, "\" title=\"");
   307         render_attribute(r, &det->title, render_html_escaped);
   308     }
   309 
   310     RENDER_LITERAL(r, "\">");
   311 }
   312 
   313 static void
   314 render_open_img_span(MD_RENDER_HTML* r, const MD_SPAN_IMG_DETAIL* det)
   315 {
   316     RENDER_LITERAL(r, "<img src=\"");
   317     render_attribute(r, &det->src, render_url_escaped);
   318 
   319     RENDER_LITERAL(r, "\" alt=\"");
   320 
   321     r->image_nesting_level++;
   322 }
   323 
   324 static void
   325 render_close_img_span(MD_RENDER_HTML* r, const MD_SPAN_IMG_DETAIL* det)
   326 {
   327     if(det->title.text != NULL) {
   328         RENDER_LITERAL(r, "\" title=\"");
   329         render_attribute(r, &det->title, render_html_escaped);
   330     }
   331 
   332     RENDER_LITERAL(r, "\">");
   333 
   334     r->image_nesting_level--;
   335 }
   336 
   337 
   338 /**************************************
   339  ***  HTML renderer implementation  ***
   340  **************************************/
   341 
   342 static int
   343 enter_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata)
   344 {
   345     static const MD_CHAR* head[6] = { "<h1>", "<h2>", "<h3>", "<h4>", "<h5>", "<h6>" };
   346     MD_RENDER_HTML* r = (MD_RENDER_HTML*) userdata;
   347 
   348     switch(type) {
   349         case MD_BLOCK_DOC:      /* noop */ break;
   350         case MD_BLOCK_QUOTE:    RENDER_LITERAL(r, "<blockquote>\n"); break;
   351         case MD_BLOCK_UL:       RENDER_LITERAL(r, "<ul>\n"); break;
   352         case MD_BLOCK_OL:       render_open_ol_block(r, (const MD_BLOCK_OL_DETAIL*)detail); break;
   353         case MD_BLOCK_LI:       RENDER_LITERAL(r, "<li>"); break;
   354         case MD_BLOCK_HR:       RENDER_LITERAL(r, "<hr>\n"); break;
   355         case MD_BLOCK_H:        RENDER_LITERAL(r, head[((MD_BLOCK_H_DETAIL*)detail)->level - 1]); break;
   356         case MD_BLOCK_CODE:     render_open_code_block(r, (const MD_BLOCK_CODE_DETAIL*) detail); break;
   357         case MD_BLOCK_HTML:     /* noop */ break;
   358         case MD_BLOCK_P:        RENDER_LITERAL(r, "<p>"); break;
   359         case MD_BLOCK_TABLE:    RENDER_LITERAL(r, "<table>\n"); break;
   360         case MD_BLOCK_THEAD:    RENDER_LITERAL(r, "<thead>\n"); break;
   361         case MD_BLOCK_TBODY:    RENDER_LITERAL(r, "<tbody>\n"); break;
   362         case MD_BLOCK_TR:       RENDER_LITERAL(r, "<tr>\n"); break;
   363         case MD_BLOCK_TH:       render_open_td_block(r, "th", (MD_BLOCK_TD_DETAIL*)detail); break;
   364         case MD_BLOCK_TD:       render_open_td_block(r, "td", (MD_BLOCK_TD_DETAIL*)detail); break;
   365     }
   366 
   367     return 0;
   368 }
   369 
   370 static int
   371 leave_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata)
   372 {
   373     static const MD_CHAR* head[6] = { "</h1>\n", "</h2>\n", "</h3>\n", "</h4>\n", "</h5>\n", "</h6>\n" };
   374     MD_RENDER_HTML* r = (MD_RENDER_HTML*) userdata;
   375 
   376     switch(type) {
   377         case MD_BLOCK_DOC:      /*noop*/ break;
   378         case MD_BLOCK_QUOTE:    RENDER_LITERAL(r, "</blockquote>\n"); break;
   379         case MD_BLOCK_UL:       RENDER_LITERAL(r, "</ul>\n"); break;
   380         case MD_BLOCK_OL:       RENDER_LITERAL(r, "</ol>\n"); break;
   381         case MD_BLOCK_LI:       RENDER_LITERAL(r, "</li>\n"); break;
   382         case MD_BLOCK_HR:       /*noop*/ break;
   383         case MD_BLOCK_H:        RENDER_LITERAL(r, head[((MD_BLOCK_H_DETAIL*)detail)->level - 1]); break;
   384         case MD_BLOCK_CODE:     RENDER_LITERAL(r, "</code></pre>\n"); break;
   385         case MD_BLOCK_HTML:     /* noop */ break;
   386         case MD_BLOCK_P:        RENDER_LITERAL(r, "</p>\n"); break;
   387         case MD_BLOCK_TABLE:    RENDER_LITERAL(r, "</table>\n"); break;
   388         case MD_BLOCK_THEAD:    RENDER_LITERAL(r, "</thead>\n"); break;
   389         case MD_BLOCK_TBODY:    RENDER_LITERAL(r, "</tbody>\n"); break;
   390         case MD_BLOCK_TR:       RENDER_LITERAL(r, "</tr>\n"); break;
   391         case MD_BLOCK_TH:       RENDER_LITERAL(r, "</th>\n"); break;
   392         case MD_BLOCK_TD:       RENDER_LITERAL(r, "</td>\n"); break;
   393     }
   394 
   395     return 0;
   396 }
   397 
   398 static int
   399 enter_span_callback(MD_SPANTYPE type, void* detail, void* userdata)
   400 {
   401     MD_RENDER_HTML* r = (MD_RENDER_HTML*) userdata;
   402 
   403     if(r->image_nesting_level > 0) {
   404         /* We are inside an image, i.e. rendering the ALT attribute of
   405          * <IMG> tag. */
   406         return 0;
   407     }
   408 
   409     switch(type) {
   410         case MD_SPAN_EM:        RENDER_LITERAL(r, "<em>"); break;
   411         case MD_SPAN_STRONG:    RENDER_LITERAL(r, "<strong>"); break;
   412         case MD_SPAN_A:         render_open_a_span(r, (MD_SPAN_A_DETAIL*) detail); break;
   413         case MD_SPAN_IMG:       render_open_img_span(r, (MD_SPAN_IMG_DETAIL*) detail); break;
   414         case MD_SPAN_CODE:      RENDER_LITERAL(r, "<code>"); break;
   415         case MD_SPAN_DEL:       RENDER_LITERAL(r, "<del>"); break;
   416     }
   417 
   418     return 0;
   419 }
   420 
   421 static int
   422 leave_span_callback(MD_SPANTYPE type, void* detail, void* userdata)
   423 {
   424     MD_RENDER_HTML* r = (MD_RENDER_HTML*) userdata;
   425 
   426     if(r->image_nesting_level > 0) {
   427         /* We are inside an image, i.e. rendering the ALT attribute of
   428          * <IMG> tag. */
   429         if(r->image_nesting_level == 1  &&  type == MD_SPAN_IMG)
   430             render_close_img_span(r, (MD_SPAN_IMG_DETAIL*) detail);
   431         return 0;
   432     }
   433 
   434     switch(type) {
   435         case MD_SPAN_EM:        RENDER_LITERAL(r, "</em>"); break;
   436         case MD_SPAN_STRONG:    RENDER_LITERAL(r, "</strong>"); break;
   437         case MD_SPAN_A:         RENDER_LITERAL(r, "</a>"); break;
   438         case MD_SPAN_IMG:       /*noop, handled above*/ break;
   439         case MD_SPAN_CODE:      RENDER_LITERAL(r, "</code>"); break;
   440         case MD_SPAN_DEL:       RENDER_LITERAL(r, "</del>"); break;
   441     }
   442 
   443     return 0;
   444 }
   445 
   446 static int
   447 text_callback(MD_TEXTTYPE type, const MD_CHAR* text, MD_SIZE size, void* userdata)
   448 {
   449     MD_RENDER_HTML* r = (MD_RENDER_HTML*) userdata;
   450 
   451     switch(type) {
   452         case MD_TEXT_NULLCHAR:  render_utf8_codepoint(r, 0x0000, render_text); break;
   453         case MD_TEXT_BR:        RENDER_LITERAL(r, (r->image_nesting_level == 0 ? "<br>\n" : " ")); break;
   454         case MD_TEXT_SOFTBR:    RENDER_LITERAL(r, (r->image_nesting_level == 0 ? "\n" : " ")); break;
   455         case MD_TEXT_HTML:      render_text(r, text, size); break;
   456         case MD_TEXT_ENTITY:    render_entity(r, text, size, render_html_escaped); break;
   457         default:                render_html_escaped(r, text, size); break;
   458     }
   459 
   460     return 0;
   461 }
   462 
   463 static void
   464 debug_log_callback(const char* msg, void* userdata)
   465 {
   466     MD_RENDER_HTML* r = (MD_RENDER_HTML*) userdata;
   467     if(r->flags & MD_RENDER_FLAG_DEBUG)
   468         fprintf(stderr, "MD4C: %s\n", msg);
   469 }
   470 
   471 int
   472 md_render_html(const MD_CHAR* input, MD_SIZE input_size,
   473                void (*process_output)(const MD_CHAR*, MD_SIZE, void*),
   474                void* userdata, unsigned parser_flags, unsigned renderer_flags)
   475 {
   476     MD_RENDER_HTML render = { process_output, userdata, renderer_flags, 0 };
   477 
   478     MD_RENDERER renderer = {
   479         enter_block_callback,
   480         leave_block_callback,
   481         enter_span_callback,
   482         leave_span_callback,
   483         text_callback,
   484         debug_log_callback,
   485         parser_flags
   486     };
   487 
   488     return md_parse(input, input_size, &renderer, (void*) &render);
   489 }
   490