From 1da5a8086c998471d47d85992ae0eac253a43ed8 Mon Sep 17 00:00:00 2001 From: chowette Date: Wed, 30 Mar 2022 18:37:25 +0200 Subject: [PATCH 01/35] First step for header identifier like gfm --- md2html/md2html.c | 4 ++++ scripts/run-tests.sh | 4 ++++ src/md4c-html.c | 17 +++++++++++++-- src/md4c.c | 36 +++++++++++++++++++++++++++++++- src/md4c.h | 4 +++- test/heading-auto-identifier.txt | 11 ++++++++++ 6 files changed, 72 insertions(+), 4 deletions(-) create mode 100644 test/heading-auto-identifier.txt diff --git a/md2html/md2html.c b/md2html/md2html.c index 06b2b74b..f0a4da60 100644 --- a/md2html/md2html.c +++ b/md2html/md2html.c @@ -220,6 +220,7 @@ static const CMDLINE_OPTION cmdline_options[] = { { 0, "funderline", '_', 0 }, { 0, "fverbatim-entities", 'E', 0 }, { 0, "fwiki-links", 'K', 0 }, + { 0, "fheading-auto-id", '#', 0 }, { 0, "fno-html-blocks", 'F', 0 }, { 0, "fno-html-spans", 'G', 0 }, @@ -269,6 +270,8 @@ usage(void) " --ftasklists Enable task lists\n" " --funderline Enable underline spans\n" " --fwiki-links Enable wiki links\n" + " --fheading-auto-id\n" + " Enable heading auto identifier\n" "\n" "Markdown suppression options:\n" " --fno-html-blocks\n" @@ -335,6 +338,7 @@ cmdline_callback(int opt, char const* value, void* data) case 'K': parser_flags |= MD_FLAG_WIKILINKS; break; case 'X': parser_flags |= MD_FLAG_TASKLISTS; break; case '_': parser_flags |= MD_FLAG_UNDERLINE; break; + case '#': parser_flags |= MD_FLAG_HEADINGAUTOID; break; default: fprintf(stderr, "Illegal option: %s\n", value); diff --git a/scripts/run-tests.sh b/scripts/run-tests.sh index c00b36a9..9d2359ca 100755 --- a/scripts/run-tests.sh +++ b/scripts/run-tests.sh @@ -70,6 +70,10 @@ echo echo "Underline extension:" $PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/underline.txt" -p "$PROGRAM --funderline" +echo +echo "Heading auto identifiers extension:" +$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/heading-auto-identifier.txt" -p "$PROGRAM --fheading-auto-id" + echo echo "Pathological 
input:" $PYTHON "$TEST_DIR/pathological_tests.py" -p "$PROGRAM" diff --git a/src/md4c-html.c b/src/md4c-html.c index d604aecb..6892000b 100644 --- a/src/md4c-html.c +++ b/src/md4c-html.c @@ -309,6 +309,20 @@ render_open_code_block(MD_HTML* r, const MD_BLOCK_CODE_DETAIL* det) RENDER_VERBATIM(r, ">"); } +static void +render_header_block(MD_HTML* r, const MD_BLOCK_H_DETAIL* det) +{ + static const MD_CHAR* head[6] = { "level- 1]); + if(det->identifier.text != NULL) { + RENDER_VERBATIM(r, " id=\""); + render_attribute(r, &det->identifier, render_html_escaped); + RENDER_VERBATIM(r, "\""); + } + RENDER_VERBATIM(r, ">"); +} + static void render_open_td_block(MD_HTML* r, const MD_CHAR* cell_type, const MD_BLOCK_TD_DETAIL* det) { @@ -378,7 +392,6 @@ render_open_wikilink_span(MD_HTML* r, const MD_SPAN_WIKILINK_DETAIL* det) static int enter_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata) { - static const MD_CHAR* head[6] = { "

", "

", "

", "

", "

", "
" }; MD_HTML* r = (MD_HTML*) userdata; switch(type) { @@ -388,7 +401,7 @@ enter_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata) case MD_BLOCK_OL: render_open_ol_block(r, (const MD_BLOCK_OL_DETAIL*)detail); break; case MD_BLOCK_LI: render_open_li_block(r, (const MD_BLOCK_LI_DETAIL*)detail); break; case MD_BLOCK_HR: RENDER_VERBATIM(r, (r->flags & MD_HTML_FLAG_XHTML) ? "
\n" : "
\n"); break; - case MD_BLOCK_H: RENDER_VERBATIM(r, head[((MD_BLOCK_H_DETAIL*)detail)->level - 1]); break; + case MD_BLOCK_H: render_header_block(r, (const MD_BLOCK_H_DETAIL*)detail); break; case MD_BLOCK_CODE: render_open_code_block(r, (const MD_BLOCK_CODE_DETAIL*) detail); break; case MD_BLOCK_HTML: /* noop */ break; case MD_BLOCK_P: RENDER_VERBATIM(r, "

"); break; diff --git a/src/md4c.c b/src/md4c.c index 3677c0e0..f20fd45b 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -4746,6 +4746,31 @@ md_setup_fenced_code_detail(MD_CTX* ctx, const MD_BLOCK* block, MD_BLOCK_CODE_DE return ret; } +static int +md_setup_H_identifier(MD_CTX* ctx, const MD_BLOCK* block, MD_BLOCK_H_DETAIL* det, + MD_ATTRIBUTE_BUILD* id_build) +{ + const MD_LINE* header_line = (const MD_LINE*)(block + 1); + OFF beg = header_line->beg; + OFF end = header_line->end; + + int ret = 0; + + /* Trim initial spaces. */ + while(beg < ctx->size && CH(beg) == _T(' ')) + beg++; + + /* Trim trailing spaces. */ + while(end > beg && CH(end-1) == _T(' ')) + end--; + + /* Build info string attribute. */ + MD_CHECK(md_build_attribute(ctx, STR(beg), end - beg, 0, &det->identifier, id_build)); + +abort: + return ret; +} + static int md_process_leaf_block(MD_CTX* ctx, const MD_BLOCK* block) { @@ -4754,6 +4779,8 @@ md_process_leaf_block(MD_CTX* ctx, const MD_BLOCK* block) MD_BLOCK_CODE_DETAIL code; MD_BLOCK_TABLE_DETAIL table; } det; + MD_ATTRIBUTE_BUILD identifier_build; + int clean_header_detail = FALSE; MD_ATTRIBUTE_BUILD info_build; MD_ATTRIBUTE_BUILD lang_build; int is_in_tight_list; @@ -4770,7 +4797,11 @@ md_process_leaf_block(MD_CTX* ctx, const MD_BLOCK* block) switch(block->type) { case MD_BLOCK_H: det.header.level = block->data; - break; + if (ctx->parser.flags & MD_FLAG_HEADINGAUTOID){ + clean_header_detail = TRUE; + MD_CHECK(md_setup_H_identifier(ctx, block, &det.header, &identifier_build )); + } + break; case MD_BLOCK_CODE: /* For fenced code block, we may need to set the info string. 
*/ @@ -4826,6 +4857,9 @@ md_process_leaf_block(MD_CTX* ctx, const MD_BLOCK* block) MD_LEAVE_BLOCK(block->type, (void*) &det); abort: + if(clean_header_detail) { + md_free_attribute(ctx, &identifier_build); + } if(clean_fence_code_detail) { md_free_attribute(ctx, &info_build); md_free_attribute(ctx, &lang_build); diff --git a/src/md4c.h b/src/md4c.h index 95f78f9b..cfff1ea5 100644 --- a/src/md4c.h +++ b/src/md4c.h @@ -259,6 +259,7 @@ typedef struct MD_BLOCK_LI_DETAIL { /* Detailed info for MD_BLOCK_H. */ typedef struct MD_BLOCK_H_DETAIL { unsigned level; /* Header level (1 - 6) */ + MD_ATTRIBUTE identifier; /* identifier, eg {#some-id} or autogenerated from the heading text*/ } MD_BLOCK_H_DETAIL; /* Detailed info for MD_BLOCK_CODE. */ @@ -316,6 +317,7 @@ typedef struct MD_SPAN_WIKILINK { #define MD_FLAG_LATEXMATHSPANS 0x1000 /* Enable $ and $$ containing LaTeX equations. */ #define MD_FLAG_WIKILINKS 0x2000 /* Enable wiki links extension. */ #define MD_FLAG_UNDERLINE 0x4000 /* Enable underline extension (and disables '_' for normal emphasis). */ +#define MD_FLAG_HEADINGAUTOID 0x8000 /* Enable header auto identifiers. */ #define MD_FLAG_PERMISSIVEAUTOLINKS (MD_FLAG_PERMISSIVEEMAILAUTOLINKS | MD_FLAG_PERMISSIVEURLAUTOLINKS | MD_FLAG_PERMISSIVEWWWAUTOLINKS) #define MD_FLAG_NOHTML (MD_FLAG_NOHTMLBLOCKS | MD_FLAG_NOHTMLSPANS) @@ -330,7 +332,7 @@ typedef struct MD_SPAN_WIKILINK { * extensions, bringing the dialect closer to the original, are implemented. */ #define MD_DIALECT_COMMONMARK 0 -#define MD_DIALECT_GITHUB (MD_FLAG_PERMISSIVEAUTOLINKS | MD_FLAG_TABLES | MD_FLAG_STRIKETHROUGH | MD_FLAG_TASKLISTS) +#define MD_DIALECT_GITHUB (MD_FLAG_PERMISSIVEAUTOLINKS | MD_FLAG_TABLES | MD_FLAG_STRIKETHROUGH | MD_FLAG_TASKLISTS | MD_FLAG_HEADINGAUTOID) /* Parser structure. 
*/ diff --git a/test/heading-auto-identifier.txt b/test/heading-auto-identifier.txt new file mode 100644 index 00000000..3ac6609f --- /dev/null +++ b/test/heading-auto-identifier.txt @@ -0,0 +1,11 @@ + +# Heading auto identifiers + +With the flag `MD_FLAG_HEADINGAUTOID`, MD4C generate an identifier for a heading. + +```````````````````````````````` example +# heading +. +

<h1 id="heading">heading</h1>

+```````````````````````````````` + From 340f15c978b3b3ec23aa3a2f1022a34f196055d1 Mon Sep 17 00:00:00 2001 From: chowette Date: Sun, 3 Apr 2022 16:14:20 +0200 Subject: [PATCH 02/35] Implement identifier transformation like github This imply scanning and transforming the heading text. As the heading text is transformed, we need to stor it somewhere. Instead of doing almost one malloc for each heading, allocate a large buffer and store all indentifier inside, and using index. We also remember the heading in the header Block to later retrieve it. --- src/md4c.c | 191 ++++++++++++++++++++++++++++++- src/md4c.h | 2 +- test/heading-auto-identifier.txt | 35 ++++++ 3 files changed, 225 insertions(+), 3 deletions(-) diff --git a/src/md4c.c b/src/md4c.c index f20fd45b..60141aad 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -128,7 +128,7 @@ typedef struct MD_MARK_tag MD_MARK; typedef struct MD_BLOCK_tag MD_BLOCK; typedef struct MD_CONTAINER_tag MD_CONTAINER; typedef struct MD_REF_DEF_tag MD_REF_DEF; - +typedef struct MD_HEADING_DEF_tag MD_HEADING_DEF; /* During analyzes of inline marks, we need to manage some "mark chains", * of (yet unresolved) openers. This structure holds start/end of the chain. @@ -163,6 +163,15 @@ struct MD_CTX_tag { void** ref_def_hashtable; int ref_def_hashtable_size; + /* Heading definitions. */ + MD_HEADING_DEF* heading_defs; + int n_heading_defs; + int alloc_heading_defs; + /* autogenerated identifiers for heading */ + CHAR* identifiers; + SZ identifiers_size; + SZ alloc_identifiers; + /* Stack of inline/span markers. * This is only used for parsing a single block contents but by storing it * here we may reuse the stack for subsequent blocks; i.e. 
we have fewer @@ -1507,6 +1516,133 @@ md_build_attribute(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size, return -1; } +/********************************************* + *** Dictionary of Heading Definitions *** + *********************************************/ + +struct MD_HEADING_DEF_tag { + CHAR* identifier; + SZ ident_size; +}; + +static int +md_push_heading_def(MD_CTX* ctx) +{ + if(ctx->n_heading_defs >= ctx->alloc_heading_defs) { + MD_HEADING_DEF* new_defs; + + ctx->alloc_heading_defs = (ctx->alloc_heading_defs > 0 + ? ctx->alloc_heading_defs + ctx->alloc_heading_defs / 2 + : 16); + new_defs = (MD_HEADING_DEF*) realloc(ctx->heading_defs, ctx->alloc_heading_defs * sizeof(MD_HEADING_DEF)); + if(new_defs == NULL) { + MD_LOG("realloc() failed."); + return -1; + } + + ctx->heading_defs = new_defs; + } + return 0; +} + +static int +md_alloc_identifiers(MD_CTX *ctx, MD_HEADING_DEF* def) +{ + if (ctx->identifiers_size + def->ident_size >= ctx->alloc_identifiers) + { + CHAR *new_identifiers; + + ctx->alloc_identifiers = (ctx->alloc_identifiers > 0 + ? ctx->alloc_identifiers + ctx->alloc_identifiers / 2 + : 512); + + new_identifiers = (CHAR *)realloc(ctx->identifiers, sizeof(CHAR) * ctx->alloc_identifiers); + if (new_identifiers == NULL) + { + MD_LOG("realloc() failed."); + return -1; + } + + ctx->identifiers = new_identifiers; + } + + def->identifier = &ctx->identifiers[ctx->identifiers_size]; + return 0; +} + +static int +md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_lines) +{ + int ret = 0; + + int line_index = 0; + OFF beg = lines[0].beg; + OFF end = lines[n_lines-1].end; + + /* Trim initial spaces. */ + while(beg < ctx->size && CH(beg) == _T(' ')) + beg++; + + /* Trim trailing spaces. 
*/ + while(end > beg && CH(end-1) == _T(' ')) + end--; + + + def->ident_size = end - beg; + MD_CHECK(md_alloc_identifiers(ctx, def)); + + /* copy the ident and transform as needed */ + OFF off = beg; + CHAR* ptr = def->identifier; + + while(1) { + const MD_LINE* line = &lines[line_index]; + OFF line_end = line->end; + if(end < line_end) + line_end = end; + + while(off < line_end) { + if(ISUNICODEWHITESPACE(off) || CH(off) == _T('-') ){ // space and '-' are replaced by a '-' + *ptr = _T('-'); + } else if (ISUNICODEPUNCT(off)) { + off++; + continue; + } else if(ISUPPER(off)){ // make uppercase lower + *ptr = CH(off)+('a'-'A'); + } else { + *ptr = CH(off); + } + ptr++; + off++; + } + + if(off >= end) { + // update real identifier size + def->ident_size = (MD_SIZE)(ptr - def->identifier); + break; + } + + *ptr = _T('-'); // end of line + ptr++; + + line_index++; + off = lines[line_index].beg; + } + + // update used identifier buffer size + ctx->identifiers_size += def->ident_size; + + return 0; +abort: + + return -1; +} + +static void +md_free_heading_defs(MD_CTX* ctx) +{ + free(ctx->heading_defs); +} /********************************************* *** Dictionary of Reference Definitions *** @@ -4617,6 +4753,10 @@ struct MD_BLOCK_tag { * MD_BLOCK_OL: Start item number. */ unsigned n_lines; + /* MD_BLOCK_H: reference definition index + */ + unsigned heading_def; // todo rename me to heading_idx ? + }; struct MD_CONTAINER_tag { @@ -4765,7 +4905,9 @@ md_setup_H_identifier(MD_CTX* ctx, const MD_BLOCK* block, MD_BLOCK_H_DETAIL* det end--; /* Build info string attribute. 
*/ - MD_CHECK(md_build_attribute(ctx, STR(beg), end - beg, 0, &det->identifier, id_build)); + + MD_HEADING_DEF * heading = &ctx->heading_defs[block->heading_def]; + MD_CHECK(md_build_attribute(ctx, heading->identifier, heading->ident_size, 0, &det->identifier, id_build)); abort: return ret; @@ -5087,6 +5229,32 @@ md_consume_link_reference_definitions(MD_CTX* ctx) return 0; } +/* Build the identifier for this heading and remember them so we can + * resolve any link referring to them. + * + */ +static int +md_make_heading(MD_CTX* ctx) +{ + int ret = 0; + + MD_BLOCK* block = ctx->current_block; + MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1); + + MD_HEADING_DEF * def = NULL; + MD_CHECK(md_push_heading_def(ctx)); + def = &ctx->heading_defs[ctx->n_heading_defs]; + memset(def, 0, sizeof(MD_HEADING_DEF)); + + // remplissage de la ref def + MD_CHECK(md_heading_build_ident(ctx, def, lines, block->n_lines)); + block->heading_def = ctx->n_heading_defs; + ctx->n_heading_defs++; + +abort: + return ret; +} + static int md_end_current_block(MD_CTX* ctx) { @@ -5124,6 +5292,10 @@ md_end_current_block(MD_CTX* ctx) } } + if(ctx->current_block->type == MD_BLOCK_H){ + MD_CHECK(md_make_heading(ctx)); + } + /* Mark we are not building any block anymore. */ ctx->current_block = NULL; @@ -6387,6 +6559,19 @@ md_process_doc(MD_CTX *ctx) sprintf(buffer, "Alloced %u bytes for aux. 
buffer.", (unsigned)(ctx->alloc_buffer * sizeof(MD_CHAR))); MD_LOG(buffer); + + sprintf(buffer, "Alloced %u bytes for reference definition buffer.", + (unsigned)(ctx->alloc_ref_defs * sizeof(MD_REF_DEF))); + MD_LOG(buffer); + + sprintf(buffer, "Alloced %u bytes for identifiers buffer.", + (unsigned)(ctx->alloc_identifiers * sizeof(MD_CHAR))); + MD_LOG(buffer); + + sprintf(buffer, "Alloced %u bytes for heading definition buffer.", + (unsigned)(ctx->alloc_heading_defs * sizeof(MD_REF_DEF))); + MD_LOG(buffer); + } #endif @@ -6433,6 +6618,8 @@ md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userd ret = md_process_doc(&ctx); /* Clean-up. */ + md_free_heading_defs(&ctx); + free(ctx.identifiers); md_free_ref_defs(&ctx); md_free_ref_def_hashtable(&ctx); free(ctx.buffer); diff --git a/src/md4c.h b/src/md4c.h index cfff1ea5..0c4984c6 100644 --- a/src/md4c.h +++ b/src/md4c.h @@ -317,7 +317,7 @@ typedef struct MD_SPAN_WIKILINK { #define MD_FLAG_LATEXMATHSPANS 0x1000 /* Enable $ and $$ containing LaTeX equations. */ #define MD_FLAG_WIKILINKS 0x2000 /* Enable wiki links extension. */ #define MD_FLAG_UNDERLINE 0x4000 /* Enable underline extension (and disables '_' for normal emphasis). */ -#define MD_FLAG_HEADINGAUTOID 0x8000 /* Enable header auto identifiers. */ +#define MD_FLAG_HEADINGAUTOID 0x8000 /* Enable header auto identifiers like github. */ #define MD_FLAG_PERMISSIVEAUTOLINKS (MD_FLAG_PERMISSIVEEMAILAUTOLINKS | MD_FLAG_PERMISSIVEURLAUTOLINKS | MD_FLAG_PERMISSIVEWWWAUTOLINKS) #define MD_FLAG_NOHTML (MD_FLAG_NOHTMLBLOCKS | MD_FLAG_NOHTMLSPANS) diff --git a/test/heading-auto-identifier.txt b/test/heading-auto-identifier.txt index 3ac6609f..c36fd5c4 100644 --- a/test/heading-auto-identifier.txt +++ b/test/heading-auto-identifier.txt @@ -9,3 +9,38 @@ With the flag `MD_FLAG_HEADINGAUTOID`, MD4C generate an identifier for a heading

<h1 id="heading">heading</h1>

```````````````````````````````` +Spaces are replaced by `-` and uppercase letters are replaced by lower case + +```````````````````````````````` example +# The Heading +. +

<h1 id="the-heading">The Heading</h1>

+```````````````````````````````` + +The non-alphanumeric characters are discarded except for `-. + +```````````````````````````````` example +# The %@!= stupid _ heading ! +. +

<h1 id="the--stupid--heading-">The %@!= stupid _ heading !</h1>

+```````````````````````````````` + +As a result, you can get some empty heading with no identifier. + +```````````````````````````````` example +# ! +. +

<h1>!</h1>

+```````````````````````````````` + +Heading srting with number are not treated differently + +```````````````````````````````` example +# 1.1 The start +. +

<h1 id="11-the-start">1.1 The start</h1>

+```````````````````````````````` + + + + From 5513c81929509f1fd16fc427ff9ac8d8ea67ee72 Mon Sep 17 00:00:00 2001 From: chowette Date: Mon, 4 Apr 2022 22:15:07 +0200 Subject: [PATCH 03/35] Fix : garbage can occur in large file. Do not hold pointer to reallocated memory. Realloc can copy to a new location and our pointers are now invalid. memorize the offset in the buffer instead, this remain valid even after reallochas relocated the buffer --- src/md4c.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/md4c.c b/src/md4c.c index 60141aad..48a8c126 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -1521,7 +1521,7 @@ md_build_attribute(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size, *********************************************/ struct MD_HEADING_DEF_tag { - CHAR* identifier; + OFF ident_beg; SZ ident_size; }; @@ -1566,7 +1566,7 @@ md_alloc_identifiers(MD_CTX *ctx, MD_HEADING_DEF* def) ctx->identifiers = new_identifiers; } - def->identifier = &ctx->identifiers[ctx->identifiers_size]; + def->ident_beg = ctx->identifiers_size; return 0; } @@ -1593,7 +1593,7 @@ md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_l /* copy the ident and transform as needed */ OFF off = beg; - CHAR* ptr = def->identifier; + CHAR* ptr = &ctx->identifiers[def->ident_beg]; while(1) { const MD_LINE* line = &lines[line_index]; @@ -1618,7 +1618,7 @@ md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_l if(off >= end) { // update real identifier size - def->ident_size = (MD_SIZE)(ptr - def->identifier); + def->ident_size = (MD_SIZE)(ptr - &ctx->identifiers[def->ident_beg]); break; } @@ -4907,7 +4907,7 @@ md_setup_H_identifier(MD_CTX* ctx, const MD_BLOCK* block, MD_BLOCK_H_DETAIL* det /* Build info string attribute. 
*/ MD_HEADING_DEF * heading = &ctx->heading_defs[block->heading_def]; - MD_CHECK(md_build_attribute(ctx, heading->identifier, heading->ident_size, 0, &det->identifier, id_build)); + MD_CHECK(md_build_attribute(ctx, &ctx->identifiers[heading->ident_beg], heading->ident_size, 0, &det->identifier, id_build)); abort: return ret; From 9215d8721f3e1c30bae4c64130485cc2704fabe7 Mon Sep 17 00:00:00 2001 From: chowette Date: Mon, 4 Apr 2022 22:35:01 +0200 Subject: [PATCH 04/35] no need to trim space when building identifiers. the block line(s) is always trimed when build --- src/md4c.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/src/md4c.c b/src/md4c.c index 48a8c126..11de197f 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -1579,15 +1579,6 @@ md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_l OFF beg = lines[0].beg; OFF end = lines[n_lines-1].end; - /* Trim initial spaces. */ - while(beg < ctx->size && CH(beg) == _T(' ')) - beg++; - - /* Trim trailing spaces. */ - while(end > beg && CH(end-1) == _T(' ')) - end--; - - def->ident_size = end - beg; MD_CHECK(md_alloc_identifiers(ctx, def)); @@ -4896,14 +4887,6 @@ md_setup_H_identifier(MD_CTX* ctx, const MD_BLOCK* block, MD_BLOCK_H_DETAIL* det int ret = 0; - /* Trim initial spaces. */ - while(beg < ctx->size && CH(beg) == _T(' ')) - beg++; - - /* Trim trailing spaces. */ - while(end > beg && CH(end-1) == _T(' ')) - end--; - /* Build info string attribute. 
*/ MD_HEADING_DEF * heading = &ctx->heading_defs[block->heading_def]; @@ -5246,7 +5229,7 @@ md_make_heading(MD_CTX* ctx) def = &ctx->heading_defs[ctx->n_heading_defs]; memset(def, 0, sizeof(MD_HEADING_DEF)); - // remplissage de la ref def + // filling of the heading def MD_CHECK(md_heading_build_ident(ctx, def, lines, block->n_lines)); block->heading_def = ctx->n_heading_defs; ctx->n_heading_defs++; From 53514005f719c94c0d7eb3f2d8b17072cc3bec83 Mon Sep 17 00:00:00 2001 From: chowette Date: Tue, 5 Apr 2022 23:34:30 +0200 Subject: [PATCH 05/35] =?UTF-8?q?convert=20uppercase=20unicode=20identifie?= =?UTF-8?q?r=20into=20lower=20case=20like=20`=CE=91=CE=93=CE=A9`=20is=20ch?= =?UTF-8?q?anged=20to=20`=CE=B1=CE=B3=CF=89`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/md4c.c | 75 ++++++++++++++++++++++++++++---- test/heading-auto-identifier.txt | 11 ++++- 2 files changed, 76 insertions(+), 10 deletions(-) diff --git a/src/md4c.c b/src/md4c.c index 11de197f..439751cc 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -859,6 +859,36 @@ struct MD_UNICODE_FOLD_INFO_tag { return (unsigned) str[0]; } +/* + * encode a codepoint into the corresponding utf8 byte sequence + * the string buffer passed must be large enough + * return the number of bytes written to the buffer + */ + static unsigned + md_encode_utf8__(unsigned codepoint, CHAR* str ) + { + if(codepoint <= 0x7f){ + *str++ = (char)codepoint; + return 1; + } else if (codepoint <= 0x7FF){ + *str++ = 0xc0 | (codepoint >> 6); + *str++ = 0x80 | ((codepoint >> 0) & 0x3f); + return 2; + } else if ( codepoint <= 0xFFFF) { + *str++ = 0xe0 | (codepoint >> 12); + *str++ = 0x80 | ((codepoint >> 6 ) & 0x3f); + *str++ = 0x80 | ((codepoint >> 0 ) & 0x3f); + return 3; + } else if ( codepoint <= 0x10FFFF) { + *str++ = 0xf0 | (codepoint >> 18); + *str++ = 0x80 | ((codepoint >> 12) & 0x3f); + *str++ = 0x80 | ((codepoint >> 6 ) & 0x3f); + *str++ = 0x80 | ((codepoint >> 0 ) & 0x3f); + return 4; + } 
+ return 0; + } + static unsigned md_decode_utf8_before__(MD_CTX* ctx, OFF off) { @@ -886,6 +916,7 @@ struct MD_UNICODE_FOLD_INFO_tag { #define ISUNICODEWHITESPACE(off) md_is_unicode_whitespace__(md_decode_utf8__(STR(off), ctx->size - (off), NULL)) #define ISUNICODEWHITESPACEBEFORE(off) md_is_unicode_whitespace__(md_decode_utf8_before__(ctx, off)) + #define ISUNICODEPUNCT_(codepoint) md_is_unicode_punct__(codepoint) #define ISUNICODEPUNCT(off) md_is_unicode_punct__(md_decode_utf8__(STR(off), ctx->size - (off), NULL)) #define ISUNICODEPUNCTBEFORE(off) md_is_unicode_punct__(md_decode_utf8_before__(ctx, off)) @@ -894,11 +925,18 @@ struct MD_UNICODE_FOLD_INFO_tag { { return md_decode_utf8__(str+off, str_size-off, p_char_size); } + + static inline unsigned + md_encode_unicode(unsigned codepoint, CHAR* str ) + { + return md_encode_utf8__(codepoint, str); + } #else #define ISUNICODEWHITESPACE_(codepoint) ISWHITESPACE_(codepoint) #define ISUNICODEWHITESPACE(off) ISWHITESPACE(off) #define ISUNICODEWHITESPACEBEFORE(off) ISWHITESPACE((off)-1) + #define ISUNICODEPUNCT_(codepoint) ISPUNCT(codepoint) #define ISUNICODEPUNCT(off) ISPUNCT(off) #define ISUNICODEPUNCTBEFORE(off) ISPUNCT((off)-1) @@ -911,6 +949,13 @@ struct MD_UNICODE_FOLD_INFO_tag { info->n_codepoints = 1; } + static unsigned + md_encode_unicode(unsigned codepoint, CHAR* str ) + { + *str = codepoint; + return 1; + } + static inline unsigned md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_size) { @@ -1593,18 +1638,30 @@ md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_l line_end = end; while(off < line_end) { - if(ISUNICODEWHITESPACE(off) || CH(off) == _T('-') ){ // space and '-' are replaced by a '-' - *ptr = _T('-'); - } else if (ISUNICODEPUNCT(off)) { + if( CH(off) == _T('-') ){ // '-' are not replaced + *ptr++ = _T('-'); off++; continue; - } else if(ISUPPER(off)){ // make uppercase lower - *ptr = CH(off)+('a'-'A'); - } else { - *ptr = CH(off); } - ptr++; - off++; + 
unsigned codepoint; + SZ char_size; + + codepoint = md_decode_unicode(ctx->text, off, line_end, &char_size); + if(ISUNICODEWHITESPACE_(codepoint) || ISNEWLINE(off)) {// replace white spaces by '-' + *ptr++ = _T('-'); + off = md_skip_unicode_whitespace(ctx->text, off, line_end); + } else if (ISUNICODEPUNCT_(codepoint)) { // skip ponctuation + off += char_size; + continue; + } else { // make lower case + MD_UNICODE_FOLD_INFO fold_info; + md_get_unicode_fold_info(codepoint, &fold_info); + for (unsigned i = 0; i < fold_info.n_codepoints; i++) { + SZ n = md_encode_unicode(fold_info.codepoints[i], ptr); + ptr += n; + } + off += char_size; + } } if(off >= end) { diff --git a/test/heading-auto-identifier.txt b/test/heading-auto-identifier.txt index c36fd5c4..e743542f 100644 --- a/test/heading-auto-identifier.txt +++ b/test/heading-auto-identifier.txt @@ -17,6 +17,15 @@ Spaces are replaced by `-` and upercase are replaced by lower case

The Heading

```````````````````````````````` +Unicode characters can also be put lower case + +```````````````````````````````` example +# ĀĄŁŇŢŰŽבあИЯ +. +

ĀĄŁŇŢŰŽבあИЯ

+```````````````````````````````` + + The non-alphanumeric characters are discarded except for `-. ```````````````````````````````` example @@ -33,7 +42,7 @@ As a result, you can get some empty heading with no identifier.

!

```````````````````````````````` -Heading srting with number are not treated differently +Heading starting with numbers are not treated differently ```````````````````````````````` example # 1.1 The start From d253fc3b5bcef247c8d0eda2d643411ef67a24a2 Mon Sep 17 00:00:00 2001 From: chowette Date: Thu, 7 Apr 2022 21:06:58 +0200 Subject: [PATCH 06/35] =?UTF-8?q?First=20proof=20of=20concept=20for=20dupl?= =?UTF-8?q?icate=20identifier=20case=20It=20works=20but=20need=20more=20wo?= =?UTF-8?q?rk=20because=20-=20bad=20O(n=C2=B2)=20algorithm=20to=20be=20rep?= =?UTF-8?q?laced=20by=20a=20hash[Map|Set]=20-=20use=20snprintf=20for=20sim?= =?UTF-8?q?ple=20int=5Fto=5Fstr=20implementation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/md4c.c | 388 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 252 insertions(+), 136 deletions(-) diff --git a/src/md4c.c b/src/md4c.c index 439751cc..f7dce737 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -171,6 +171,8 @@ struct MD_CTX_tag { CHAR* identifiers; SZ identifiers_size; SZ alloc_identifiers; + /* postfix identifier book keeping */ + SZ max_postfix; /* Stack of inline/span markers. * This is only used for parsing a single block contents but by storing it @@ -1476,6 +1478,77 @@ md_free_attribute(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build) } } +static int +md_build_trivial_attribute(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size, + MD_ATTRIBUTE* attr, MD_ATTRIBUTE_BUILD* build) +{ + MD_UNUSED(ctx); + memset(build, 0, sizeof(MD_ATTRIBUTE_BUILD)); + build->substr_types = build->trivial_types; + build->substr_offsets = build->trivial_offsets; + build->substr_count = 1; + build->substr_alloc = 0; + build->trivial_types[0] = MD_TEXT_NORMAL; + build->trivial_offsets[0] = 0; + build->trivial_offsets[1] = raw_size; + + attr->text = (CHAR*) (raw_size ? 
raw_text : NULL); + attr->size = raw_size; + attr->substr_offsets = build->substr_offsets; + attr->substr_types = build->substr_types; + return 0; +} + +static int +int_to_str( unsigned postfix, CHAR* dest){ + return snprintf(dest, 6,"%u", postfix); +} + +static int +md_build_attribute_postfix(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size, + unsigned postfix, MD_ATTRIBUTE* attr, MD_ATTRIBUTE_BUILD* build) +{ + OFF off; + + memset(build, 0, sizeof(MD_ATTRIBUTE_BUILD)); + + + build->substr_types = build->trivial_types; + build->substr_offsets = build->trivial_offsets; + build->substr_count = 1; + build->substr_alloc = 0; + build->trivial_types[0] = MD_TEXT_NORMAL; + build->trivial_offsets[0] = 0; + off = raw_size; + if (postfix > 0xffff ){ + // postfix is not allowed to be bigger than 65535 (2^16) , so maximum 5+1 char + postfix = 0xffff; + } + const SZ MAX_POSTFIX_SIZE= 6; + build->text = (CHAR*) malloc((raw_size + MAX_POSTFIX_SIZE) * sizeof(CHAR)); + if(build->text == NULL) { + MD_LOG("malloc() failed."); + goto abort; + } + + // copy original text + memcpy(build->text, raw_text, raw_size); + // append postfix + build->text[off++] = _T('-'); + off+= int_to_str(postfix, &build->text[off]); + + attr->text = build->text; + build->trivial_offsets[1] = off; + attr->size = off; + attr->substr_offsets = build->substr_offsets; + attr->substr_types = build->substr_types; + return 0; + +abort: + md_free_attribute(ctx, build); + return -1; +} + static int md_build_attribute(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size, unsigned flags, MD_ATTRIBUTE* attr, MD_ATTRIBUTE_BUILD* build) @@ -1561,137 +1634,6 @@ md_build_attribute(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size, return -1; } -/********************************************* - *** Dictionary of Heading Definitions *** - *********************************************/ - -struct MD_HEADING_DEF_tag { - OFF ident_beg; - SZ ident_size; -}; - -static int -md_push_heading_def(MD_CTX* ctx) -{ - if(ctx->n_heading_defs >= 
ctx->alloc_heading_defs) { - MD_HEADING_DEF* new_defs; - - ctx->alloc_heading_defs = (ctx->alloc_heading_defs > 0 - ? ctx->alloc_heading_defs + ctx->alloc_heading_defs / 2 - : 16); - new_defs = (MD_HEADING_DEF*) realloc(ctx->heading_defs, ctx->alloc_heading_defs * sizeof(MD_HEADING_DEF)); - if(new_defs == NULL) { - MD_LOG("realloc() failed."); - return -1; - } - - ctx->heading_defs = new_defs; - } - return 0; -} - -static int -md_alloc_identifiers(MD_CTX *ctx, MD_HEADING_DEF* def) -{ - if (ctx->identifiers_size + def->ident_size >= ctx->alloc_identifiers) - { - CHAR *new_identifiers; - - ctx->alloc_identifiers = (ctx->alloc_identifiers > 0 - ? ctx->alloc_identifiers + ctx->alloc_identifiers / 2 - : 512); - - new_identifiers = (CHAR *)realloc(ctx->identifiers, sizeof(CHAR) * ctx->alloc_identifiers); - if (new_identifiers == NULL) - { - MD_LOG("realloc() failed."); - return -1; - } - - ctx->identifiers = new_identifiers; - } - - def->ident_beg = ctx->identifiers_size; - return 0; -} - -static int -md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_lines) -{ - int ret = 0; - - int line_index = 0; - OFF beg = lines[0].beg; - OFF end = lines[n_lines-1].end; - - def->ident_size = end - beg; - MD_CHECK(md_alloc_identifiers(ctx, def)); - - /* copy the ident and transform as needed */ - OFF off = beg; - CHAR* ptr = &ctx->identifiers[def->ident_beg]; - - while(1) { - const MD_LINE* line = &lines[line_index]; - OFF line_end = line->end; - if(end < line_end) - line_end = end; - - while(off < line_end) { - if( CH(off) == _T('-') ){ // '-' are not replaced - *ptr++ = _T('-'); - off++; - continue; - } - unsigned codepoint; - SZ char_size; - - codepoint = md_decode_unicode(ctx->text, off, line_end, &char_size); - if(ISUNICODEWHITESPACE_(codepoint) || ISNEWLINE(off)) {// replace white spaces by '-' - *ptr++ = _T('-'); - off = md_skip_unicode_whitespace(ctx->text, off, line_end); - } else if (ISUNICODEPUNCT_(codepoint)) { // skip ponctuation - off += 
char_size; - continue; - } else { // make lower case - MD_UNICODE_FOLD_INFO fold_info; - md_get_unicode_fold_info(codepoint, &fold_info); - for (unsigned i = 0; i < fold_info.n_codepoints; i++) { - SZ n = md_encode_unicode(fold_info.codepoints[i], ptr); - ptr += n; - } - off += char_size; - } - } - - if(off >= end) { - // update real identifier size - def->ident_size = (MD_SIZE)(ptr - &ctx->identifiers[def->ident_beg]); - break; - } - - *ptr = _T('-'); // end of line - ptr++; - - line_index++; - off = lines[line_index].beg; - } - - // update used identifier buffer size - ctx->identifiers_size += def->ident_size; - - return 0; -abort: - - return -1; -} - -static void -md_free_heading_defs(MD_CTX* ctx) -{ - free(ctx->heading_defs); -} - /********************************************* *** Dictionary of Reference Definitions *** *********************************************/ @@ -2573,6 +2515,178 @@ md_free_ref_defs(MD_CTX* ctx) free(ctx->ref_defs); } +/********************************************* + *** Dictionary of Heading Definitions *** + *********************************************/ + +struct MD_HEADING_DEF_tag { + unsigned hash; + OFF ident_beg; + SZ ident_size; + unsigned postfix; +}; + +struct MD_POSTFIX_DEF_tag { + OFF ident_beg; + SZ ident_size; +}; + +static int +md_push_heading_def(MD_CTX* ctx) +{ + if(ctx->n_heading_defs >= ctx->alloc_heading_defs) { + MD_HEADING_DEF* new_defs; + + ctx->alloc_heading_defs = (ctx->alloc_heading_defs > 0 + ? 
ctx->alloc_heading_defs + ctx->alloc_heading_defs / 2 + : 16); + new_defs = (MD_HEADING_DEF*) realloc(ctx->heading_defs, ctx->alloc_heading_defs * sizeof(MD_HEADING_DEF)); + if(new_defs == NULL) { + MD_LOG("realloc() failed."); + return -1; + } + + ctx->heading_defs = new_defs; + } + return 0; +} + +static int +md_alloc_identifiers(MD_CTX *ctx, MD_HEADING_DEF* def) +{ + if (ctx->identifiers_size + def->ident_size >= ctx->alloc_identifiers) + { + CHAR *new_identifiers; + + ctx->alloc_identifiers = (ctx->alloc_identifiers > 0 + ? ctx->alloc_identifiers + ctx->alloc_identifiers / 2 + : 512); + + new_identifiers = (CHAR *)realloc(ctx->identifiers, sizeof(CHAR) * ctx->alloc_identifiers); + if (new_identifiers == NULL) + { + MD_LOG("realloc() failed."); + return -1; + } + + ctx->identifiers = new_identifiers; + } + + def->ident_beg = ctx->identifiers_size; + return 0; +} + +static int +md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_lines) +{ + int ret = 0; + + int line_index = 0; + OFF beg = lines[0].beg; + OFF end = lines[n_lines-1].end; + + def->ident_size = end - beg; + MD_CHECK(md_alloc_identifiers(ctx, def)); + + /* copy the ident and transform as needed */ + OFF off = beg; + CHAR* ptr = &ctx->identifiers[def->ident_beg]; + + while(1) { + const MD_LINE* line = &lines[line_index]; + OFF line_end = line->end; + if(end < line_end) + line_end = end; + + while(off < line_end) { + if( CH(off) == _T('-') ){ // '-' are not replaced + *ptr++ = _T('-'); + off++; + continue; + } + unsigned codepoint; + SZ char_size; + + codepoint = md_decode_unicode(ctx->text, off, line_end, &char_size); + if(ISUNICODEWHITESPACE_(codepoint) || ISNEWLINE(off)) {// replace white spaces by '-' + *ptr++ = _T('-'); + off = md_skip_unicode_whitespace(ctx->text, off, line_end); + } else if (ISUNICODEPUNCT_(codepoint)) { // skip ponctuation + off += char_size; + continue; + } else { // make lower case + MD_UNICODE_FOLD_INFO fold_info; + 
md_get_unicode_fold_info(codepoint, &fold_info); + for (unsigned i = 0; i < fold_info.n_codepoints; i++) { + SZ n = md_encode_unicode(fold_info.codepoints[i], ptr); + ptr += n; + } + off += char_size; + } + } + + if(off >= end) { + // update real identifier size + def->ident_size = (MD_SIZE)(ptr - &ctx->identifiers[def->ident_beg]); + break; + } + + *ptr = _T('-'); // end of line + ptr++; + + line_index++; + off = lines[line_index].beg; + } + // compute identifier hash reusing the link label function + def->hash = md_link_label_hash(&ctx->identifiers[def->ident_beg], def->ident_size); + + // update used identifier buffer size + ctx->identifiers_size += def->ident_size; + + return 0; +abort: + + return -1; +} + +static int +md_check_duplicate_identifier(MD_CTX* ctx) +{ + int i, j; + + if(ctx->n_heading_defs == 0) + return 0; + + // TODO: change this on purpose quadratic algo for now... + + for(i = 0; i < ctx->n_heading_defs; i++) { + MD_HEADING_DEF* defi= &ctx->heading_defs[i]; + for(j = i+1 ; j < ctx->n_heading_defs; j++) { + MD_HEADING_DEF* defj = &ctx->heading_defs[j]; + if ( defi->hash == defj->hash){ + // same hash, check for identifiers + MD_CHAR* defi_ident = &ctx->identifiers[defi->ident_beg]; + MD_CHAR* defj_ident = &ctx->identifiers[defj->ident_beg]; + + if(md_link_label_cmp(defi_ident, defi->ident_size, defj_ident, defj->ident_size) == 0) { + /* Duplicate identifier: increment counter */ + defj->postfix++; + // break; + if(defj->postfix > ctx->max_postfix) { + ctx->max_postfix = defj->postfix; + } + } + } + } + } + return 0; +} + +static void +md_free_heading_defs(MD_CTX* ctx) +{ + free(ctx->heading_defs); +} /****************************************** *** Processing Inlines (a.k.a Spans) *** @@ -4938,17 +5052,19 @@ static int md_setup_H_identifier(MD_CTX* ctx, const MD_BLOCK* block, MD_BLOCK_H_DETAIL* det, MD_ATTRIBUTE_BUILD* id_build) { - const MD_LINE* header_line = (const MD_LINE*)(block + 1); - OFF beg = header_line->beg; - OFF end = 
header_line->end; int ret = 0; /* Build info string attribute. */ MD_HEADING_DEF * heading = &ctx->heading_defs[block->heading_def]; - MD_CHECK(md_build_attribute(ctx, &ctx->identifiers[heading->ident_beg], heading->ident_size, 0, &det->identifier, id_build)); - + if(heading->postfix == 0) { + MD_CHECK(md_build_trivial_attribute(ctx, &ctx->identifiers[heading->ident_beg], + heading->ident_size, &det->identifier, id_build)); + } else { + MD_CHECK(md_build_attribute_postfix(ctx, &ctx->identifiers[heading->ident_beg], + heading->ident_size, heading->postfix, &det->identifier, id_build)); + } abort: return ret; } From 620fd9d3b0ee38f418b9ff2743644f501dc31113 Mon Sep 17 00:00:00 2001 From: chowette Date: Sat, 9 Apr 2022 12:40:41 +0200 Subject: [PATCH 07/35] better md_int16_to_str convertion function --- src/md4c.c | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/src/md4c.c b/src/md4c.c index f7dce737..9777c67a 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -1499,9 +1499,33 @@ md_build_trivial_attribute(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size, return 0; } +/* Convert a 16 bits unsigned word to a string +* the dest buffer must be at least 5 char long +* It does not nul terminat the string +* Return the number of characters used by the string +*/ static int -int_to_str( unsigned postfix, CHAR* dest){ - return snprintf(dest, 6,"%u", postfix); +md_int16_to_str(unsigned short n, CHAR* dest){ + char count = 5; + + if(n <10 ){ + static const CHAR numbers[] = _T("0123456789"); + *dest = numbers[n]; + return 1; + } + while(1){ + if(n< 100){ count = 2; break;} + if(n< 1000){ count = 3; break;} + if(n< 10000){ count = 4; break;} + break; + } + // start from end + dest += count; + while (n) { + *--dest = '0' + ( n % 10); + n /= 10; + } + return count; } static int @@ -1520,11 +1544,11 @@ md_build_attribute_postfix(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size, build->trivial_types[0] = MD_TEXT_NORMAL; 
build->trivial_offsets[0] = 0; off = raw_size; - if (postfix > 0xffff ){ - // postfix is not allowed to be bigger than 65535 (2^16) , so maximum 5+1 char + if (postfix > 0xffff) { + // postfix is not allowed to be bigger than 65535 (2^16) , so maximum 5 char postfix = 0xffff; } - const SZ MAX_POSTFIX_SIZE= 6; + const SZ MAX_POSTFIX_SIZE= 5; build->text = (CHAR*) malloc((raw_size + MAX_POSTFIX_SIZE) * sizeof(CHAR)); if(build->text == NULL) { MD_LOG("malloc() failed."); @@ -1535,7 +1559,7 @@ md_build_attribute_postfix(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size, memcpy(build->text, raw_text, raw_size); // append postfix build->text[off++] = _T('-'); - off+= int_to_str(postfix, &build->text[off]); + off+= md_int16_to_str(postfix, &build->text[off]); attr->text = build->text; build->trivial_offsets[1] = off; @@ -5061,7 +5085,7 @@ md_setup_H_identifier(MD_CTX* ctx, const MD_BLOCK* block, MD_BLOCK_H_DETAIL* det if(heading->postfix == 0) { MD_CHECK(md_build_trivial_attribute(ctx, &ctx->identifiers[heading->ident_beg], heading->ident_size, &det->identifier, id_build)); - } else { + } else { MD_CHECK(md_build_attribute_postfix(ctx, &ctx->identifiers[heading->ident_beg], heading->ident_size, heading->postfix, &det->identifier, id_build)); } @@ -6686,6 +6710,7 @@ md_process_doc(MD_CTX *ctx) md_end_current_block(ctx); + MD_CHECK(md_check_duplicate_identifier(ctx)); MD_CHECK(md_build_ref_def_hashtable(ctx)); /* Process all blocks. 
*/ From c8c4feafb6737d74c39dab8fa83340338966a550 Mon Sep 17 00:00:00 2001 From: chowette Date: Mon, 11 Apr 2022 19:44:48 +0200 Subject: [PATCH 08/35] =?UTF-8?q?replace=20bad=20O(n=C2=B2)=20algorithm=20?= =?UTF-8?q?by=20a=20hashMap=20of=20identifier=20for=20numbering?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/md4c.c | 188 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 160 insertions(+), 28 deletions(-) diff --git a/src/md4c.c b/src/md4c.c index 9777c67a..5e7d1268 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -167,12 +167,14 @@ struct MD_CTX_tag { MD_HEADING_DEF* heading_defs; int n_heading_defs; int alloc_heading_defs; + void** heading_def_hashtable; + int heading_def_hashtable_size; /* autogenerated identifiers for heading */ CHAR* identifiers; SZ identifiers_size; SZ alloc_identifiers; - /* postfix identifier book keeping */ - SZ max_postfix; + + /* Stack of inline/span markers. * This is only used for parsing a single block contents but by storing it @@ -1473,7 +1475,9 @@ md_free_attribute(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build) if(build->substr_alloc > 0) { free(build->text); + if( build->substr_types != build->trivial_types) free(build->substr_types); + if( build->substr_offsets != build->trivial_offsets) free(build->substr_offsets); } } @@ -1535,21 +1539,19 @@ md_build_attribute_postfix(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size, OFF off; memset(build, 0, sizeof(MD_ATTRIBUTE_BUILD)); - - build->substr_types = build->trivial_types; build->substr_offsets = build->trivial_offsets; build->substr_count = 1; - build->substr_alloc = 0; + build->substr_alloc = 1; build->trivial_types[0] = MD_TEXT_NORMAL; build->trivial_offsets[0] = 0; off = raw_size; if (postfix > 0xffff) { - // postfix is not allowed to be bigger than 65535 (2^16) , so maximum 5 char + // postfix is not allowed to be bigger than 65535 (2^16) , so maximum 5 char postfix = 0xffff; } - const SZ MAX_POSTFIX_SIZE= 5; - 
build->text = (CHAR*) malloc((raw_size + MAX_POSTFIX_SIZE) * sizeof(CHAR)); + const SZ MAX_POSTFIX_SIZE= 5; // but also add 1 for the '-' + build->text = (CHAR*) malloc((raw_size + MAX_POSTFIX_SIZE+1) * sizeof(CHAR)); if(build->text == NULL) { MD_LOG("malloc() failed."); goto abort; @@ -2544,6 +2546,7 @@ md_free_ref_defs(MD_CTX* ctx) *********************************************/ struct MD_HEADING_DEF_tag { + CHAR* identifier; // only valid after all heading are known unsigned hash; OFF ident_beg; SZ ident_size; @@ -2661,9 +2664,6 @@ md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_l line_index++; off = lines[line_index].beg; } - // compute identifier hash reusing the link label function - def->hash = md_link_label_hash(&ctx->identifiers[def->ident_beg], def->ident_size); - // update used identifier buffer size ctx->identifiers_size += def->ident_size; @@ -2673,37 +2673,168 @@ md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_l return -1; } +typedef struct MD_HEADING_DEF_LIST_tag MD_HEADING_DEF_LIST; +struct MD_HEADING_DEF_LIST_tag { + int n_heading_defs; + int alloc_heading_defs; + MD_HEADING_DEF* heading_defs[]; /* Valid items always point into ctx->heading_defs[] */ +}; + static int -md_check_duplicate_identifier(MD_CTX* ctx) +md_heading_def_cmp(const void* a, const void* b) +{ + const MD_HEADING_DEF* a_ref = *(const MD_HEADING_DEF**)a; + const MD_HEADING_DEF* b_ref = *(const MD_HEADING_DEF**)b; + + if(a_ref->hash < b_ref->hash) + return -1; + else if(a_ref->hash > b_ref->hash) + return +1; + else + return md_link_label_cmp(a_ref->identifier, a_ref->ident_size, + b_ref->identifier, b_ref->ident_size); +} + +static int +md_heading_def_cmp_for_sort(const void* a, const void* b) +{ + int cmp; + + cmp = md_heading_def_cmp(a, b); + + /* Ensure stability of the sorting. 
*/ + if(cmp == 0) { + const MD_HEADING_DEF* a_ref = *(const MD_HEADING_DEF**)a; + const MD_HEADING_DEF* b_ref = *(const MD_HEADING_DEF**)b; + + if(a_ref < b_ref) + cmp = -1; + else if(a_ref > b_ref) + cmp = +1; + else + cmp = 0; + } + + return cmp; +} + +static int +md_build_heading_def_hashtable(MD_CTX* ctx) { int i, j; if(ctx->n_heading_defs == 0) return 0; - // TODO: change this on purpose quadratic algo for now... + ctx->heading_def_hashtable_size = (ctx->n_heading_defs * 5) / 4; + ctx->heading_def_hashtable = malloc(ctx->heading_def_hashtable_size * sizeof(void*)); + if(ctx->heading_def_hashtable == NULL) { + MD_LOG("malloc() failed."); + goto abort; + } + memset(ctx->heading_def_hashtable, 0, ctx->heading_def_hashtable_size * sizeof(void*)); + /* Each member of ctx->heading_def_hashtable[] can be: + * -- NULL, + * -- pointer to the MD_HEADING_DEF in ctx->heading_defs[], or + * -- pointer to a MD_HEADING_DEF_LIST, which holds multiple pointers to + * such MD_HEADING_DEFs. + */ for(i = 0; i < ctx->n_heading_defs; i++) { - MD_HEADING_DEF* defi= &ctx->heading_defs[i]; - for(j = i+1 ; j < ctx->n_heading_defs; j++) { - MD_HEADING_DEF* defj = &ctx->heading_defs[j]; - if ( defi->hash == defj->hash){ - // same hash, check for identifiers - MD_CHAR* defi_ident = &ctx->identifiers[defi->ident_beg]; - MD_CHAR* defj_ident = &ctx->identifiers[defj->ident_beg]; + MD_HEADING_DEF* def = &ctx->heading_defs[i]; + void* bucket; + MD_HEADING_DEF_LIST* list; + + // compute identifier hash reusing the link label hash function + def->identifier = &ctx->identifiers[def->ident_beg]; + def->hash = md_link_label_hash(def->identifier, def->ident_size); + bucket = ctx->heading_def_hashtable[def->hash % ctx->heading_def_hashtable_size]; + + if(bucket == NULL) { + /* The bucket is empty. Make it just point to the def. 
*/ + ctx->heading_def_hashtable[def->hash % ctx->heading_def_hashtable_size] = def; + continue; + } + + if(ctx->heading_defs <= (MD_HEADING_DEF*) bucket && (MD_HEADING_DEF*) bucket < ctx->heading_defs + ctx->n_heading_defs) { + /* The bucket already contains one heading def.*/ + MD_HEADING_DEF* old_def = (MD_HEADING_DEF*) bucket; + + /* Make the bucket complex, i.e. able to hold more heading defs. */ + list = (MD_HEADING_DEF_LIST*) malloc(sizeof(MD_HEADING_DEF_LIST) + 2 * sizeof(MD_HEADING_DEF*)); + if(list == NULL) { + MD_LOG("malloc() failed."); + goto abort; + } + list->heading_defs[0] = old_def; + list->heading_defs[1] = def; + list->n_heading_defs = 2; + list->alloc_heading_defs = 2; + ctx->heading_def_hashtable[def->hash % ctx->heading_def_hashtable_size] = list; + continue; + } - if(md_link_label_cmp(defi_ident, defi->ident_size, defj_ident, defj->ident_size) == 0) { - /* Duplicate identifier: increment counter */ - defj->postfix++; - // break; - if(defj->postfix > ctx->max_postfix) { - ctx->max_postfix = defj->postfix; + /* Append the def to the complex bucket list. */ + list = (MD_HEADING_DEF_LIST*) bucket; + if(list->n_heading_defs >= list->alloc_heading_defs) { + int alloc_heading_defs = list->alloc_heading_defs + list->alloc_heading_defs / 2; + MD_HEADING_DEF_LIST* list_tmp = (MD_HEADING_DEF_LIST*) realloc(list, + sizeof(MD_HEADING_DEF_LIST) + alloc_heading_defs * sizeof(MD_HEADING_DEF*)); + if(list_tmp == NULL) { + MD_LOG("realloc() failed."); + goto abort; } + list = list_tmp; + list->alloc_heading_defs = alloc_heading_defs; + ctx->heading_def_hashtable[def->hash % ctx->heading_def_hashtable_size] = list; } + + list->heading_defs[list->n_heading_defs] = def; + list->n_heading_defs++; } + + /* Sort the complex buckets so we can use bsearch() with them. 
*/ + for(i = 0; i < ctx->heading_def_hashtable_size; i++) { + void* bucket = ctx->heading_def_hashtable[i]; + MD_HEADING_DEF_LIST* list; + + if(bucket == NULL) + continue; + if(ctx->heading_defs <= (MD_HEADING_DEF*) bucket && (MD_HEADING_DEF*) bucket < ctx->heading_defs + ctx->n_heading_defs) + continue; + + list = (MD_HEADING_DEF_LIST*) bucket; + qsort(list->heading_defs, list->n_heading_defs, sizeof(MD_HEADING_DEF*), md_heading_def_cmp_for_sort); + + for(j = 1; j < list->n_heading_defs; j++) { + if(md_heading_def_cmp(&list->heading_defs[j-1], &list->heading_defs[j]) == 0) + list->heading_defs[j]->postfix = list->heading_defs[j-1]->postfix + 1; } } + return 0; + +abort: + return -1; +} + +static void +md_free_heading_def_hashtable(MD_CTX* ctx) +{ + if(ctx->heading_def_hashtable != NULL) { + int i; + + for(i = 0; i < ctx->heading_def_hashtable_size; i++) { + void* bucket = ctx->heading_def_hashtable[i]; + if(bucket == NULL) + continue; + if(ctx->heading_defs <= (MD_HEADING_DEF*) bucket && (MD_HEADING_DEF*) bucket < ctx->heading_defs + ctx->n_heading_defs) + continue; + free(bucket); + } + + free(ctx->heading_def_hashtable); + } } static void @@ -6710,7 +6841,7 @@ md_process_doc(MD_CTX *ctx) md_end_current_block(ctx); - MD_CHECK(md_check_duplicate_identifier(ctx)); + MD_CHECK(md_build_heading_def_hashtable(ctx)); MD_CHECK(md_build_ref_def_hashtable(ctx)); /* Process all blocks. */ @@ -6750,7 +6881,7 @@ md_process_doc(MD_CTX *ctx) MD_LOG(buffer); sprintf(buffer, "Alloced %u bytes for heading definition buffer.", - (unsigned)(ctx->alloc_heading_defs * sizeof(MD_REF_DEF))); + (unsigned)(ctx->alloc_heading_defs * sizeof(MD_HEADING_DEF))); MD_LOG(buffer); } @@ -6800,6 +6931,7 @@ md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userd /* Clean-up. 
*/ md_free_heading_defs(&ctx); + md_free_heading_def_hashtable(&ctx); free(ctx.identifiers); md_free_ref_defs(&ctx); md_free_ref_def_hashtable(&ctx); From edf9bd93d0b189919aabf55f085919cea65abb53 Mon Sep 17 00:00:00 2001 From: chowette Date: Tue, 12 Apr 2022 21:45:28 +0200 Subject: [PATCH 09/35] heading with link correctly ignore url when generating heading identifier --- src/md4c.c | 203 ++++++++++++++++++++----------- test/heading-auto-identifier.txt | 24 ++++ 2 files changed, 159 insertions(+), 68 deletions(-) diff --git a/src/md4c.c b/src/md4c.c index 5e7d1268..aa43aecc 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -2603,75 +2603,9 @@ md_alloc_identifiers(MD_CTX *ctx, MD_HEADING_DEF* def) return 0; } +/** forward declaration */ static int -md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_lines) -{ - int ret = 0; - - int line_index = 0; - OFF beg = lines[0].beg; - OFF end = lines[n_lines-1].end; - - def->ident_size = end - beg; - MD_CHECK(md_alloc_identifiers(ctx, def)); - - /* copy the ident and transform as needed */ - OFF off = beg; - CHAR* ptr = &ctx->identifiers[def->ident_beg]; - - while(1) { - const MD_LINE* line = &lines[line_index]; - OFF line_end = line->end; - if(end < line_end) - line_end = end; - - while(off < line_end) { - if( CH(off) == _T('-') ){ // '-' are not replaced - *ptr++ = _T('-'); - off++; - continue; - } - unsigned codepoint; - SZ char_size; - - codepoint = md_decode_unicode(ctx->text, off, line_end, &char_size); - if(ISUNICODEWHITESPACE_(codepoint) || ISNEWLINE(off)) {// replace white spaces by '-' - *ptr++ = _T('-'); - off = md_skip_unicode_whitespace(ctx->text, off, line_end); - } else if (ISUNICODEPUNCT_(codepoint)) { // skip ponctuation - off += char_size; - continue; - } else { // make lower case - MD_UNICODE_FOLD_INFO fold_info; - md_get_unicode_fold_info(codepoint, &fold_info); - for (unsigned i = 0; i < fold_info.n_codepoints; i++) { - SZ n = md_encode_unicode(fold_info.codepoints[i], ptr); - ptr += 
n; - } - off += char_size; - } - } - - if(off >= end) { - // update real identifier size - def->ident_size = (MD_SIZE)(ptr - &ctx->identifiers[def->ident_beg]); - break; - } - - *ptr = _T('-'); // end of line - ptr++; - - line_index++; - off = lines[line_index].beg; - } - // update used identifier buffer size - ctx->identifiers_size += def->ident_size; - - return 0; -abort: - - return -1; -} +md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_lines); typedef struct MD_HEADING_DEF_LIST_tag MD_HEADING_DEF_LIST; struct MD_HEADING_DEF_LIST_tag { @@ -6243,6 +6177,139 @@ md_is_container_mark(MD_CTX* ctx, unsigned indent, OFF beg, OFF* p_end, MD_CONTA return FALSE; } +static int +md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_lines) +{ + int ret = 0; + + const MD_LINE* line = lines; + MD_MARK* mark; + OFF beg = lines[0].beg; + OFF end = lines[n_lines-1].end; + + /* Reset the previously collected stack of marks. */ + ctx->n_marks = 0; + + MD_CHECK(md_analyze_inlines(ctx, lines, n_lines, FALSE)); + + /* Find first resolved mark. Note there is always at least one resolved + * mark, the dummy last one after the end of the latest line we actually + * never really reach. This saves us of a lot of special checks and cases + * in this function. */ + mark = ctx->marks; + while(!(mark->flags & MD_MARK_RESOLVED)) + mark++; + + def->ident_size = end - beg; + MD_CHECK(md_alloc_identifiers(ctx, def)); + + /* copy the ident and transform as needed */ + OFF off = beg; + CHAR* ptr = &ctx->identifiers[def->ident_beg]; + + while(1) { + + OFF line_end = line->end; + if(end < line_end) + line_end = end; + /* Process the text up to the next mark or end-of-line. */ + OFF tmp = (line->end < mark->beg ? 
line->end : mark->beg); + + while(off < tmp) { + if( CH(off) == _T('-') ){ // '-' are not replaced + *ptr++ = _T('-'); + off++; + continue; + } + unsigned codepoint; + SZ char_size; + + codepoint = md_decode_unicode(ctx->text, off, line_end, &char_size); + if(ISUNICODEWHITESPACE_(codepoint) || ISNEWLINE(off)) {// replace white spaces by '-' + *ptr++ = _T('-'); + off = md_skip_unicode_whitespace(ctx->text, off, line_end); + } else if (ISUNICODEPUNCT_(codepoint)) { // skip ponctuation + off += char_size; + continue; + } else { // make lower case + MD_UNICODE_FOLD_INFO fold_info; + md_get_unicode_fold_info(codepoint, &fold_info); + for (unsigned i = 0; i < fold_info.n_codepoints; i++) { + SZ n = md_encode_unicode(fold_info.codepoints[i], ptr); + ptr += n; + } + off += char_size; + } + } + /* If reached the mark, process it and move to next one. */ + if(off >= mark->beg) { + switch(mark->ch) { + + case '[': /* Link, wiki link, image. */ + case '!': + case ']': + { + const MD_MARK* opener = (mark->ch != ']' ? mark : &ctx->marks[mark->prev]); + const MD_MARK* closer = &ctx->marks[opener->next]; + const MD_MARK* dest_mark; + const MD_MARK* title_mark; + + if ((opener->ch == '[' && closer->ch == ']') && + opener->end - opener->beg >= 2 && + closer->end - closer->beg >= 2) + { + break; + } + + dest_mark = opener+1; + MD_ASSERT(dest_mark->ch == 'D'); + title_mark = opener+2; + if (title_mark->ch != 'D') break; + + /* link/image closer may span multiple lines. */ + if(mark->ch == ']') { + while(mark->end > line->end) + line++; + } + + break; + } + } + + off = mark->end; + + /* Move to next resolved mark. */ + mark++; + while(!(mark->flags & MD_MARK_RESOLVED) || mark->beg < off) + mark++; + } + + /* If reached end of line, move to next one. */ + if(off >= line->end) { + /* If it is the last line, we are done. 
*/ + if(off >= end) { + // update real identifier size + def->ident_size = (MD_SIZE)(ptr - &ctx->identifiers[def->ident_beg]); + break; + } + + *ptr = _T('-'); // end of line + ptr++; + + /* Move to the next line. */ + line++; + off = line->beg; + } + } + // update used identifier buffer size + ctx->identifiers_size += def->ident_size; + + return 0; +abort: + + return -1; +} + static unsigned md_line_indentation(MD_CTX* ctx, unsigned total_indent, OFF beg, OFF* p_end) { diff --git a/test/heading-auto-identifier.txt b/test/heading-auto-identifier.txt index e743542f..aa9622cb 100644 --- a/test/heading-auto-identifier.txt +++ b/test/heading-auto-identifier.txt @@ -50,6 +50,30 @@ Heading starting with numbers are not treated differently

1.1 The start

```````````````````````````````` +Heading can contain link inside + +```````````````````````````````` example +# Title with a [link](hidden) inside +. +

Title with a link inside

+```````````````````````````````` + +Heading can contain wiki link inside but requiere the MD_FLAG_WIKILINKS + +```````````````````````````````` example +# Title with a [[hidden-wiki|link]] inside +. +

Title with a [[hidden-wiki|link]] inside

+```````````````````````````````` + +Heading can contain formatting + +```````````````````````````````` example +# Title with *emphasis* inside +. +

Title with emphasis inside

+```````````````````````````````` + From ef791385c5f1ba76a714ea4d10444215d426d5a0 Mon Sep 17 00:00:00 2001 From: chowette Date: Tue, 12 Apr 2022 21:56:50 +0200 Subject: [PATCH 10/35] make heading only when MD_FLAG_HEADINGAUTOID is set --- src/md4c.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/md4c.c b/src/md4c.c index aa43aecc..ad5184f5 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -5537,7 +5537,7 @@ md_end_current_block(MD_CTX* ctx) } } - if(ctx->current_block->type == MD_BLOCK_H){ + if(ctx->current_block->type == MD_BLOCK_H && (ctx->parser.flags & MD_FLAG_HEADINGAUTOID)){ MD_CHECK(md_make_heading(ctx)); } @@ -6908,7 +6908,9 @@ md_process_doc(MD_CTX *ctx) md_end_current_block(ctx); + if(ctx->parser.flags & MD_FLAG_HEADINGAUTOID) { MD_CHECK(md_build_heading_def_hashtable(ctx)); + } MD_CHECK(md_build_ref_def_hashtable(ctx)); /* Process all blocks. */ From 7812fdba64beee72b2379e7b535350c715fa364b Mon Sep 17 00:00:00 2001 From: chowette Date: Sat, 23 Apr 2022 20:31:43 +0200 Subject: [PATCH 11/35] Emoji are treated as ponctuation, unicode emoji are stripped --- scripts/build_symbol_map.py | 66 ++++++++++++++++++++++++++++++++ src/md4c.c | 66 +++++++++++++++++++++++++++++++- test/heading-auto-identifier.txt | 21 ++++++++++ 3 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 scripts/build_symbol_map.py diff --git a/scripts/build_symbol_map.py b/scripts/build_symbol_map.py new file mode 100644 index 00000000..bd19f5a5 --- /dev/null +++ b/scripts/build_symbol_map.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 + +import os +import sys +import textwrap + + +self_path = os.path.dirname(os.path.realpath(__file__)); +f = open(self_path + "/unicode/DerivedGeneralCategory.txt", "r") + +codepoint_list = [] +category_list = [ "Sm", "Sc", "Sk", "So" ] + +# Filter codepoints falling in the right category: +for line in f: + comment_off = line.find("#") + if comment_off >= 0: + line = line[:comment_off] + line = line.strip() + if not line: + 
continue + + char_range, category = line.split(";") + char_range = char_range.strip() + category = category.strip() + + if not category in category_list: + continue + + delim_off = char_range.find("..") + if delim_off >= 0: + codepoint0 = int(char_range[:delim_off], 16) + codepoint1 = int(char_range[delim_off+2:], 16) + for codepoint in range(codepoint0, codepoint1 + 1): + codepoint_list.append(codepoint) + else: + codepoint = int(char_range, 16) + codepoint_list.append(codepoint) +f.close() + + +codepoint_list.sort() + + +index0 = 0 +count = len(codepoint_list) + +records = list() +while index0 < count: + index1 = index0 + 1 + while index1 < count and codepoint_list[index1] == codepoint_list[index1-1] + 1: + index1 += 1 + + if index1 - index0 > 1: + # Range of codepoints + records.append("R(0x{:04x},0x{:04x})".format(codepoint_list[index0], codepoint_list[index1-1])) + else: + # Single codepoint + records.append("S(0x{:04x})".format(codepoint_list[index0])) + + index0 = index1 + +sys.stdout.write("static const unsigned SYMBOL_MAP[] = {\n") +sys.stdout.write("\n".join(textwrap.wrap(", ".join(records), 110, + initial_indent = " ", subsequent_indent=" "))) +sys.stdout.write("\n};\n\n") diff --git a/src/md4c.c b/src/md4c.c index ad5184f5..b05a642e 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -305,6 +305,7 @@ struct MD_VERBATIMLINE_tag { #define ISWHITESPACE_(ch) (ISBLANK_(ch) || ISANYOF2_((ch), _T('\v'), _T('\f'))) #define ISCNTRL_(ch) ((unsigned)(ch) <= 31 || (unsigned)(ch) == 127) #define ISPUNCT_(ch) (ISIN_(ch, 33, 47) || ISIN_(ch, 58, 64) || ISIN_(ch, 91, 96) || ISIN_(ch, 123, 126)) +#define ISSYMBOL_(ch) (ISANYOF3_(ch, _T('+'), _T('|'), _T('~')) || ISIN_(ch, 60, 62)) #define ISUPPER_(ch) (ISIN_(ch, _T('A'), _T('Z'))) #define ISLOWER_(ch) (ISIN_(ch, _T('a'), _T('z'))) #define ISALPHA_(ch) (ISUPPER_(ch) || ISLOWER_(ch)) @@ -321,6 +322,7 @@ struct MD_VERBATIMLINE_tag { #define ISWHITESPACE(off) ISWHITESPACE_(CH(off)) #define ISCNTRL(off) ISCNTRL_(CH(off)) #define 
ISPUNCT(off) ISPUNCT_(CH(off)) +#define ISSYMBOL(off) ISSYMBOL_(CH(off)) #define ISUPPER(off) ISUPPER_(CH(off)) #define ISLOWER(off) ISLOWER_(CH(off)) #define ISALPHA(off) ISALPHA_(CH(off)) @@ -624,6 +626,64 @@ struct MD_UNICODE_FOLD_INFO_tag { return (md_unicode_bsearch__(codepoint, PUNCT_MAP, SIZEOF_ARRAY(PUNCT_MAP)) >= 0); } + static int + md_is_unicode_symbol__(unsigned codepoint) + { +#define R(cp_min, cp_max) ((cp_min) | 0x40000000), ((cp_max) | 0x80000000) +#define S(cp) (cp) + /* Unicode "Sm", "Sc", "Sk", "So" categories. + * (generated by scripts/build_symbol_map.py) */ + static const unsigned SYMBOL_MAP[] = { + S(0x0024), S(0x002b), R(0x003c,0x003e), S(0x005e), S(0x0060), S(0x007c), S(0x007e), R(0x00a2,0x00a6), + R(0x00a8,0x00a9), S(0x00ac), R(0x00ae,0x00b1), S(0x00b4), S(0x00b8), S(0x00d7), S(0x00f7), + R(0x02c2,0x02c5), R(0x02d2,0x02df), R(0x02e5,0x02eb), S(0x02ed), R(0x02ef,0x02ff), S(0x0375), + R(0x0384,0x0385), S(0x03f6), S(0x0482), R(0x058d,0x058f), R(0x0606,0x0608), S(0x060b), R(0x060e,0x060f), + S(0x06de), S(0x06e9), R(0x06fd,0x06fe), S(0x07f6), R(0x07fe,0x07ff), R(0x09f2,0x09f3), R(0x09fa,0x09fb), + S(0x0af1), S(0x0b70), R(0x0bf3,0x0bfa), S(0x0c7f), S(0x0d4f), S(0x0d79), S(0x0e3f), R(0x0f01,0x0f03), + S(0x0f13), R(0x0f15,0x0f17), R(0x0f1a,0x0f1f), S(0x0f34), S(0x0f36), S(0x0f38), R(0x0fbe,0x0fc5), + R(0x0fc7,0x0fcc), R(0x0fce,0x0fcf), R(0x0fd5,0x0fd8), R(0x109e,0x109f), R(0x1390,0x1399), S(0x166d), + S(0x17db), S(0x1940), R(0x19de,0x19ff), R(0x1b61,0x1b6a), R(0x1b74,0x1b7c), S(0x1fbd), R(0x1fbf,0x1fc1), + R(0x1fcd,0x1fcf), R(0x1fdd,0x1fdf), R(0x1fed,0x1fef), R(0x1ffd,0x1ffe), S(0x2044), S(0x2052), + R(0x207a,0x207c), R(0x208a,0x208c), R(0x20a0,0x20bf), R(0x2100,0x2101), R(0x2103,0x2106), + R(0x2108,0x2109), S(0x2114), R(0x2116,0x2118), R(0x211e,0x2123), S(0x2125), S(0x2127), S(0x2129), + S(0x212e), R(0x213a,0x213b), R(0x2140,0x2144), R(0x214a,0x214d), S(0x214f), R(0x218a,0x218b), + R(0x2190,0x2307), R(0x230c,0x2328), R(0x232b,0x2426), 
R(0x2440,0x244a), R(0x249c,0x24e9), + R(0x2500,0x2767), R(0x2794,0x27c4), R(0x27c7,0x27e5), R(0x27f0,0x2982), R(0x2999,0x29d7), + R(0x29dc,0x29fb), R(0x29fe,0x2b73), R(0x2b76,0x2b95), R(0x2b97,0x2bff), R(0x2ce5,0x2cea), + R(0x2e50,0x2e51), R(0x2e80,0x2e99), R(0x2e9b,0x2ef3), R(0x2f00,0x2fd5), R(0x2ff0,0x2ffb), S(0x3004), + R(0x3012,0x3013), S(0x3020), R(0x3036,0x3037), R(0x303e,0x303f), R(0x309b,0x309c), R(0x3190,0x3191), + R(0x3196,0x319f), R(0x31c0,0x31e3), R(0x3200,0x321e), R(0x322a,0x3247), S(0x3250), R(0x3260,0x327f), + R(0x328a,0x32b0), R(0x32c0,0x33ff), R(0x4dc0,0x4dff), R(0xa490,0xa4c6), R(0xa700,0xa716), + R(0xa720,0xa721), R(0xa789,0xa78a), R(0xa828,0xa82b), R(0xa836,0xa839), R(0xaa77,0xaa79), S(0xab5b), + R(0xab6a,0xab6b), S(0xfb29), R(0xfbb2,0xfbc1), R(0xfdfc,0xfdfd), S(0xfe62), R(0xfe64,0xfe66), S(0xfe69), + S(0xff04), S(0xff0b), R(0xff1c,0xff1e), S(0xff3e), S(0xff40), S(0xff5c), S(0xff5e), R(0xffe0,0xffe6), + R(0xffe8,0xffee), R(0xfffc,0xfffd), R(0x10137,0x1013f), R(0x10179,0x10189), R(0x1018c,0x1018e), + R(0x10190,0x1019c), S(0x101a0), R(0x101d0,0x101fc), R(0x10877,0x10878), S(0x10ac8), S(0x1173f), + R(0x11fd5,0x11ff1), R(0x16b3c,0x16b3f), S(0x16b45), S(0x1bc9c), R(0x1d000,0x1d0f5), R(0x1d100,0x1d126), + R(0x1d129,0x1d164), R(0x1d16a,0x1d16c), R(0x1d183,0x1d184), R(0x1d18c,0x1d1a9), R(0x1d1ae,0x1d1e8), + R(0x1d200,0x1d241), S(0x1d245), R(0x1d300,0x1d356), S(0x1d6c1), S(0x1d6db), S(0x1d6fb), S(0x1d715), + S(0x1d735), S(0x1d74f), S(0x1d76f), S(0x1d789), S(0x1d7a9), S(0x1d7c3), R(0x1d800,0x1d9ff), + R(0x1da37,0x1da3a), R(0x1da6d,0x1da74), R(0x1da76,0x1da83), R(0x1da85,0x1da86), S(0x1e14f), S(0x1e2ff), + S(0x1ecac), S(0x1ecb0), S(0x1ed2e), R(0x1eef0,0x1eef1), R(0x1f000,0x1f02b), R(0x1f030,0x1f093), + R(0x1f0a0,0x1f0ae), R(0x1f0b1,0x1f0bf), R(0x1f0c1,0x1f0cf), R(0x1f0d1,0x1f0f5), R(0x1f10d,0x1f1ad), + R(0x1f1e6,0x1f202), R(0x1f210,0x1f23b), R(0x1f240,0x1f248), R(0x1f250,0x1f251), R(0x1f260,0x1f265), + R(0x1f300,0x1f6d7), R(0x1f6e0,0x1f6ec), 
R(0x1f6f0,0x1f6fc), R(0x1f700,0x1f773), R(0x1f780,0x1f7d8), + R(0x1f7e0,0x1f7eb), R(0x1f800,0x1f80b), R(0x1f810,0x1f847), R(0x1f850,0x1f859), R(0x1f860,0x1f887), + R(0x1f890,0x1f8ad), R(0x1f8b0,0x1f8b1), R(0x1f900,0x1f978), R(0x1f97a,0x1f9cb), R(0x1f9cd,0x1fa53), + R(0x1fa60,0x1fa6d), R(0x1fa70,0x1fa74), R(0x1fa78,0x1fa7a), R(0x1fa80,0x1fa86), R(0x1fa90,0x1faa8), + R(0x1fab0,0x1fab6), R(0x1fac0,0x1fac2), R(0x1fad0,0x1fad6), R(0x1fb00,0x1fb92), R(0x1fb94,0x1fbca) + }; + +#undef R +#undef S + + /* The ASCII ones are the most frequently used ones. */ + if(codepoint <= 0x7f) + return ISSYMBOL_(codepoint); + + return (md_unicode_bsearch__(codepoint, SYMBOL_MAP, SIZEOF_ARRAY(SYMBOL_MAP)) >= 0); + } + static void md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info) { @@ -924,6 +984,8 @@ struct MD_UNICODE_FOLD_INFO_tag { #define ISUNICODEPUNCT(off) md_is_unicode_punct__(md_decode_utf8__(STR(off), ctx->size - (off), NULL)) #define ISUNICODEPUNCTBEFORE(off) md_is_unicode_punct__(md_decode_utf8_before__(ctx, off)) + #define ISUNICODESYMBOL_(codepoint) md_is_unicode_symbol__(codepoint) + static inline unsigned md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size) { @@ -944,6 +1006,8 @@ struct MD_UNICODE_FOLD_INFO_tag { #define ISUNICODEPUNCT(off) ISPUNCT(off) #define ISUNICODEPUNCTBEFORE(off) ISPUNCT((off)-1) + #define ISUNICODESYMBOL_(codepoint) ISSYMBOL_(codepoint) + static inline void md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info) { @@ -6228,7 +6292,7 @@ md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_l if(ISUNICODEWHITESPACE_(codepoint) || ISNEWLINE(off)) {// replace white spaces by '-' *ptr++ = _T('-'); off = md_skip_unicode_whitespace(ctx->text, off, line_end); - } else if (ISUNICODEPUNCT_(codepoint)) { // skip ponctuation + } else if (ISUNICODEPUNCT_(codepoint) || ISUNICODESYMBOL_(codepoint)) { // skip ponctuation and symbols off += char_size; continue; } else { // make 
lower case diff --git a/test/heading-auto-identifier.txt b/test/heading-auto-identifier.txt index aa9622cb..5dcf8396 100644 --- a/test/heading-auto-identifier.txt +++ b/test/heading-auto-identifier.txt @@ -74,6 +74,27 @@ Heading can contain formatting

Title with emphasis inside

```````````````````````````````` +Heading can contain some emoji code like :emoji:, they are treated as normal text +```````````````````````````````` example +# emoji1 :+1: +# emoji2 :-1: +# emoji3 :100: +. +

emoji1 :+1:

+

emoji2 :-1:

+

emoji3 :100:

+```````````````````````````````` + +But unicode emoji characters are stripped +```````````````````````````````` example +# emoji4 👍 +# emoji5 💯 +# the + sign +. +

emoji4 👍

+

emoji5 💯

+

the + sign

+```````````````````````````````` From 3738b8a38d729692e87433516f9200a42ea7c6e5 Mon Sep 17 00:00:00 2001 From: chowette Date: Sat, 23 Apr 2022 20:54:37 +0200 Subject: [PATCH 12/35] fix -Wdeclaration-after-statement travis errors --- src/md4c.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/md4c.c b/src/md4c.c index b05a642e..8353d635 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -1601,6 +1601,7 @@ md_build_attribute_postfix(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size, unsigned postfix, MD_ATTRIBUTE* attr, MD_ATTRIBUTE_BUILD* build) { OFF off; + const SZ MAX_POSTFIX_SIZE = 5; // but also add 1 for the '-' memset(build, 0, sizeof(MD_ATTRIBUTE_BUILD)); build->substr_types = build->trivial_types; @@ -1614,7 +1615,7 @@ md_build_attribute_postfix(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size, // postfix is not allowed to be bigger than 65535 (2^16) , so maximum 5 char postfix = 0xffff; } - const SZ MAX_POSTFIX_SIZE= 5; // but also add 1 for the '-' + build->text = (CHAR*) malloc((raw_size + MAX_POSTFIX_SIZE+1) * sizeof(CHAR)); if(build->text == NULL) { MD_LOG("malloc() failed."); @@ -6186,8 +6187,9 @@ md_leave_child_containers(MD_CTX* ctx, int n_keep) static int md_is_container_mark(MD_CTX* ctx, unsigned indent, OFF beg, OFF* p_end, MD_CONTAINER* p_container) { - OFF off = beg; OFF max_end; + OFF off = beg; + if(off >= ctx->size || indent >= ctx->code_indent_offset) return FALSE; @@ -6244,11 +6246,13 @@ md_is_container_mark(MD_CTX* ctx, unsigned indent, OFF beg, OFF* p_end, MD_CONTA static int md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_lines) { + MD_MARK* mark; + CHAR* ptr; int ret = 0; const MD_LINE* line = lines; - MD_MARK* mark; OFF beg = lines[0].beg; + OFF off = beg; OFF end = lines[n_lines-1].end; /* Reset the previously collected stack of marks. 
*/ @@ -6268,25 +6272,25 @@ md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_l MD_CHECK(md_alloc_identifiers(ctx, def)); /* copy the ident and transform as needed */ - OFF off = beg; - CHAR* ptr = &ctx->identifiers[def->ident_beg]; + ptr = &ctx->identifiers[def->ident_beg]; while(1) { OFF line_end = line->end; - if(end < line_end) - line_end = end; /* Process the text up to the next mark or end-of-line. */ OFF tmp = (line->end < mark->beg ? line->end : mark->beg); + if(end < line_end) + line_end = end; while(off < tmp) { + unsigned codepoint; + SZ char_size; + if( CH(off) == _T('-') ){ // '-' are not replaced *ptr++ = _T('-'); off++; continue; } - unsigned codepoint; - SZ char_size; codepoint = md_decode_unicode(ctx->text, off, line_end, &char_size); if(ISUNICODEWHITESPACE_(codepoint) || ISNEWLINE(off)) {// replace white spaces by '-' From 2f3ff6fc36df88530b0450dda1e3050d8444f56a Mon Sep 17 00:00:00 2001 From: chowette Date: Sat, 23 Apr 2022 22:24:45 +0200 Subject: [PATCH 13/35] add more tests to improve coverage --- scripts/run-tests.sh | 4 ++ test/heading-auto-identifier.txt | 30 ++++++++++++++ test/pathological_auto_ident_tests.py | 57 +++++++++++++++++++++++++++ 3 files changed, 91 insertions(+) create mode 100755 test/pathological_auto_ident_tests.py diff --git a/scripts/run-tests.sh b/scripts/run-tests.sh index 9d2359ca..ae0003b1 100755 --- a/scripts/run-tests.sh +++ b/scripts/run-tests.sh @@ -77,3 +77,7 @@ $PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/heading-auto-identifier.txt" -p echo echo "Pathological input:" $PYTHON "$TEST_DIR/pathological_tests.py" -p "$PROGRAM" + +echo +echo "Heading auto identifiers pathological input:" +$PYTHON "$TEST_DIR/pathological_auto_ident_tests.py" -p "$PROGRAM --fheading-auto-id" diff --git a/test/heading-auto-identifier.txt b/test/heading-auto-identifier.txt index 5dcf8396..823ee75a 100644 --- a/test/heading-auto-identifier.txt +++ b/test/heading-auto-identifier.txt @@ -98,3 +98,33 @@ But 
unicode emoji characters are stripped

the + sign

```````````````````````````````` +Same heading get a suffix number. + +```````````````````````````````` example +# title +# title +## title +### title +# Title +# title +# ti!tle +# title +# title +# title +# title +# title +. +

title

+

title

+

title

+

title

+

Title

+

title

+

ti!tle

+

title

+

title

+

title

+

title

+

title

+```````````````````````````````` + diff --git a/test/pathological_auto_ident_tests.py b/test/pathological_auto_ident_tests.py new file mode 100755 index 00000000..fe23770e --- /dev/null +++ b/test/pathological_auto_ident_tests.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import re +import argparse +import sys +import platform +from cmark import CMark +from timeit import default_timer as timer + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Run cmark tests.') + parser.add_argument('-p', '--program', dest='program', nargs='?', default=None, + help='program to test') + parser.add_argument('--library-dir', dest='library_dir', nargs='?', + default=None, help='directory containing dynamic library') + args = parser.parse_args(sys.argv[1:]) + +cmark = CMark(prog=args.program, library_dir=args.library_dir) + +# list of pairs consisting of input and a regex that must match the output. +pathological = { + # note - some pythons have limit of 65535 for {num-matches} in re. + + "many identical heading": + (("# a\n" * (50000+1)), + re.compile("^

a

\n(

a

\n){50000}$")) +} + +whitespace_re = re.compile('/s+/') +passed = 0 +errored = 0 +failed = 0 + +#print("Testing pathological cases:") +for description in pathological: + (inp, regex) = pathological[description] + start = timer() + [rc, actual, err] = cmark.to_html(inp) + end = timer() + if rc != 0: + errored += 1 + print('{:35} [ERRORED (return code %d)]'.format(description, rc)) + print(err) + elif regex.search(actual): + print('{:35} [PASSED] {:.3f} secs'.format(description, end-start)) + passed += 1 + else: + print('{:35} [FAILED]'.format(description)) + print(repr(actual)) + failed += 1 + +print("%d passed, %d failed, %d errored" % (passed, failed, errored)) +if (failed == 0 and errored == 0): + exit(0) +else: + exit(1) From f572773d940e272f4148cc498b16973dcae3b7c7 Mon Sep 17 00:00:00 2001 From: chowette Date: Sun, 24 Apr 2022 00:01:58 +0200 Subject: [PATCH 14/35] add more tests to improve coverage --- test/heading-auto-identifier.txt | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/test/heading-auto-identifier.txt b/test/heading-auto-identifier.txt index 823ee75a..2955d1ed 100644 --- a/test/heading-auto-identifier.txt +++ b/test/heading-auto-identifier.txt @@ -128,3 +128,28 @@ Same heading get a suffix number.

title

```````````````````````````````` +# Coverage + +Additional tests to improve test coverage. + +No heading in a document + +```````````````````````````````` example +no heading +. +

no heading

+```````````````````````````````` + +A multi-line heading requires a link so it can contain a new line. + +```````````````````````````````` example +Title with a [multi +line +link](link) inside +====================== +. +

Title with a multi +line +link inside

+ +```````````````````````````````` From 12ccd2f4be130d43f795d5e9955dcff4b3ec3e95 Mon Sep 17 00:00:00 2001 From: chowette Date: Tue, 26 Apr 2022 20:32:46 +0200 Subject: [PATCH 15/35] fix use of wrong macro ISUNICODEPUNCT_() use a codepoint not an offset. use ISPUNCT_() --- src/md4c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/md4c.c b/src/md4c.c index 8353d635..53cdc1fc 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -1002,7 +1002,7 @@ struct MD_UNICODE_FOLD_INFO_tag { #define ISUNICODEWHITESPACE(off) ISWHITESPACE(off) #define ISUNICODEWHITESPACEBEFORE(off) ISWHITESPACE((off)-1) - #define ISUNICODEPUNCT_(codepoint) ISPUNCT(codepoint) + #define ISUNICODEPUNCT_(codepoint) ISPUNCT_(codepoint) #define ISUNICODEPUNCT(off) ISPUNCT(off) #define ISUNICODEPUNCTBEFORE(off) ISPUNCT((off)-1) From 138a104096a3fa69e4b6a28a2824c129b298139c Mon Sep 17 00:00:00 2001 From: chowette Date: Tue, 26 Apr 2022 20:39:21 +0200 Subject: [PATCH 16/35] remove unused struct MD_POSTFIX_DEF_tag some indentation cleaning --- src/md4c.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/src/md4c.c b/src/md4c.c index 53cdc1fc..dca3a844 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -1540,9 +1540,9 @@ md_free_attribute(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build) if(build->substr_alloc > 0) { free(build->text); if( build->substr_types != build->trivial_types) - free(build->substr_types); + free(build->substr_types); if( build->substr_offsets != build->trivial_offsets) - free(build->substr_offsets); + free(build->substr_offsets); } } @@ -1602,8 +1602,8 @@ md_build_attribute_postfix(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size, { OFF off; const SZ MAX_POSTFIX_SIZE = 5; // but also add 1 for the '-' - - memset(build, 0, sizeof(MD_ATTRIBUTE_BUILD)); + + memset(build, 0, sizeof(MD_ATTRIBUTE_BUILD)); build->substr_types = build->trivial_types; build->substr_offsets = build->trivial_offsets; build->substr_count = 1; @@ -1616,7 +1616,7 @@ 
md_build_attribute_postfix(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size, postfix = 0xffff; } - build->text = (CHAR*) malloc((raw_size + MAX_POSTFIX_SIZE+1) * sizeof(CHAR)); + build->text = (CHAR*) malloc((raw_size + MAX_POSTFIX_SIZE+1) * sizeof(CHAR)); if(build->text == NULL) { MD_LOG("malloc() failed."); goto abort; @@ -2618,11 +2618,6 @@ struct MD_HEADING_DEF_tag { unsigned postfix; }; -struct MD_POSTFIX_DEF_tag { - OFF ident_beg; - SZ ident_size; -}; - static int md_push_heading_def(MD_CTX* ctx) { @@ -2732,7 +2727,7 @@ md_build_heading_def_hashtable(MD_CTX* ctx) goto abort; } memset(ctx->heading_def_hashtable, 0, ctx->heading_def_hashtable_size * sizeof(void*)); - + /* Each member of ctx->heading_def_hashtable[] can be: * -- NULL, * -- pointer to the MD_HEADING_DEF in ctx->heading_defs[], or @@ -2772,7 +2767,7 @@ md_build_heading_def_hashtable(MD_CTX* ctx) ctx->heading_def_hashtable[def->hash % ctx->heading_def_hashtable_size] = list; continue; } - + /* Append the def to the complex bucket list. */ list = (MD_HEADING_DEF_LIST*) bucket; if(list->n_heading_defs >= list->alloc_heading_defs) { @@ -2782,15 +2777,15 @@ md_build_heading_def_hashtable(MD_CTX* ctx) if(list_tmp == NULL) { MD_LOG("realloc() failed."); goto abort; - } + } list = list_tmp; list->alloc_heading_defs = alloc_heading_defs; ctx->heading_def_hashtable[def->hash % ctx->heading_def_hashtable_size] = list; - } + } list->heading_defs[list->n_heading_defs] = def; list->n_heading_defs++; - } + } /* Sort the complex buckets so we can use bsearch() with them. 
*/ for(i = 0; i < ctx->heading_def_hashtable_size; i++) { @@ -2811,7 +2806,7 @@ md_build_heading_def_hashtable(MD_CTX* ctx) } } - return 0; + return 0; abort: return -1; @@ -6370,8 +6365,8 @@ md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_l } } // update used identifier buffer size - ctx->identifiers_size += def->ident_size; - + ctx->identifiers_size += def->ident_size; + return 0; abort: @@ -6977,7 +6972,7 @@ md_process_doc(MD_CTX *ctx) md_end_current_block(ctx); if(ctx->parser.flags & MD_FLAG_HEADINGAUTOID) { - MD_CHECK(md_build_heading_def_hashtable(ctx)); + MD_CHECK(md_build_heading_def_hashtable(ctx)); } MD_CHECK(md_build_ref_def_hashtable(ctx)); From 5d8a7e990100b32384f79185d243966a3643223e Mon Sep 17 00:00:00 2001 From: chowette Date: Tue, 26 Apr 2022 20:53:00 +0200 Subject: [PATCH 17/35] Change how struct `MD_REF_DEF` store dest: Use a pointer and a size instead of begining and ending index. We need a pointer because we want to store the heading identifier as destination, but the identifier is not part of the initial ctx->txt buffer. 
This is done like the `title` We also cascade change the `MD_LINK_ATTR` struct and supporting functions --- src/md4c.c | 53 +++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/src/md4c.c b/src/md4c.c index dca3a844..4455dbc7 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -1751,11 +1751,11 @@ md_fnv1a(unsigned base, const void* data, size_t n) struct MD_REF_DEF_tag { CHAR* label; CHAR* title; + CHAR* dest; unsigned hash; SZ label_size; SZ title_size; - OFF dest_beg; - OFF dest_end; + SZ dest_size; unsigned char label_needs_free : 1; unsigned char title_needs_free : 1; }; @@ -2089,8 +2089,8 @@ md_lookup_ref_def(MD_CTX* ctx, const CHAR* label, SZ label_size) typedef struct MD_LINK_ATTR_tag MD_LINK_ATTR; struct MD_LINK_ATTR_tag { - OFF dest_beg; - OFF dest_end; + CHAR* dest; + SZ dest_size; CHAR* title; SZ title_size; @@ -2172,7 +2172,7 @@ md_is_link_label(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, static int md_is_link_destination_A(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, - OFF* p_contents_beg, OFF* p_contents_end) + CHAR** p_contents, SZ* p_contents_size) { OFF off = beg; @@ -2191,8 +2191,8 @@ md_is_link_destination_A(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, if(CH(off) == _T('>')) { /* Success. */ - *p_contents_beg = beg+1; - *p_contents_end = off; + *p_contents = (CHAR*)STR(beg+1); + *p_contents_size = off - (beg+1); *p_end = off+1; return TRUE; } @@ -2205,7 +2205,7 @@ md_is_link_destination_A(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, static int md_is_link_destination_B(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, - OFF* p_contents_beg, OFF* p_contents_end) + CHAR** p_contents, SZ* p_contents_size) { OFF off = beg; int parenthesis_level = 0; @@ -2239,20 +2239,20 @@ md_is_link_destination_B(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, return FALSE; /* Success. 
*/ - *p_contents_beg = beg; - *p_contents_end = off; + *p_contents = (CHAR*)STR(beg); + *p_contents_size = off - beg; *p_end = off; return TRUE; } static inline int md_is_link_destination(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, - OFF* p_contents_beg, OFF* p_contents_end) + CHAR** p_contents, SZ* p_contents_size) { if(CH(beg) == _T('<')) - return md_is_link_destination_A(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end); + return md_is_link_destination_A(ctx, beg, max_end, p_end, p_contents, p_contents_size); else - return md_is_link_destination_B(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end); + return md_is_link_destination_B(ctx, beg, max_end, p_end, p_contents, p_contents_size); } static int @@ -2330,8 +2330,8 @@ md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines) OFF label_contents_end; int label_contents_line_index = -1; int label_is_multiline = FALSE; - OFF dest_contents_beg; - OFF dest_contents_end; + CHAR* dest_contents; + SZ dest_contents_size; OFF title_contents_beg; OFF title_contents_end; int title_contents_line_index; @@ -2366,7 +2366,7 @@ md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines) /* Link destination. */ if(!md_is_link_destination(ctx, off, lines[line_index].end, - &off, &dest_contents_beg, &dest_contents_end)) + &off, &dest_contents, &dest_contents_size)) return FALSE; /* (Optional) title. Note we interpret it as an title only if nothing @@ -2429,8 +2429,8 @@ md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines) def->title_size = title_contents_end - title_contents_beg; } - def->dest_beg = dest_contents_beg; - def->dest_end = dest_contents_end; + def->dest = dest_contents; + def->dest_size = dest_contents_size; /* Success. 
*/ ctx->n_ref_defs++; @@ -2476,8 +2476,8 @@ md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, int n_lines, def = md_lookup_ref_def(ctx, label, label_size); if(def != NULL) { - attr->dest_beg = def->dest_beg; - attr->dest_end = def->dest_end; + attr->dest = def->dest; + attr->dest_size = def->dest_size; attr->title = def->title; attr->title_size = def->title_size; attr->title_needs_free = FALSE; @@ -2523,8 +2523,8 @@ md_is_inline_link_spec(MD_CTX* ctx, const MD_LINE* lines, int n_lines, /* Link destination may be omitted, but only when not also having a title. */ if(off < ctx->size && CH(off) == _T(')')) { - attr->dest_beg = off; - attr->dest_end = off; + attr->dest = (CHAR*)STR(off); + attr->dest_size = 0; attr->title = NULL; attr->title_size = 0; attr->title_needs_free = FALSE; @@ -2535,7 +2535,7 @@ md_is_inline_link_spec(MD_CTX* ctx, const MD_LINE* lines, int n_lines, /* Link destination. */ if(!md_is_link_destination(ctx, off, lines[line_index].end, - &off, &attr->dest_beg, &attr->dest_end)) + &off, &attr->dest, &attr->dest_size)) return FALSE; /* (Optional) title. */ @@ -4074,8 +4074,8 @@ md_resolve_links(MD_CTX* ctx, const MD_LINE* lines, int n_lines) /* If it is a link, we store the destination and title in the two * dummy marks after the opener. */ MD_ASSERT(ctx->marks[opener_index+1].ch == 'D'); - ctx->marks[opener_index+1].beg = attr.dest_beg; - ctx->marks[opener_index+1].end = attr.dest_end; + md_mark_store_ptr(ctx, opener_index+1, attr.dest); + ctx->marks[opener_index+1].prev = attr.dest_size; MD_ASSERT(ctx->marks[opener_index+2].ch == 'D'); md_mark_store_ptr(ctx, opener_index+2, attr.title); @@ -4742,7 +4742,8 @@ md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines) MD_CHECK(md_enter_leave_span_a(ctx, (mark->ch != ']'), (opener->ch == '!' ? 
MD_SPAN_IMG : MD_SPAN_A), - STR(dest_mark->beg), dest_mark->end - dest_mark->beg, FALSE, + md_mark_get_ptr(ctx, (int)(dest_mark - ctx->marks)), + dest_mark->prev, FALSE, md_mark_get_ptr(ctx, (int)(title_mark - ctx->marks)), title_mark->prev)); From 1ecb4b8e10bc806031fe2eff6fc8624e5af6e8d2 Mon Sep 17 00:00:00 2001 From: chowette Date: Tue, 26 Apr 2022 20:59:36 +0200 Subject: [PATCH 18/35] extract reference definition so we can reuse it for heading --- src/md4c.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/src/md4c.c b/src/md4c.c index 4455dbc7..8045e16f 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -2315,6 +2315,26 @@ md_is_link_title(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, return FALSE; } +static int +md_push_ref_def(MD_CTX* ctx) +{ + if(ctx->n_ref_defs >= ctx->alloc_ref_defs) { + MD_REF_DEF* new_defs; + + ctx->alloc_ref_defs = (ctx->alloc_ref_defs > 0 + ? ctx->alloc_ref_defs + ctx->alloc_ref_defs / 2 + : 16); + new_defs = (MD_REF_DEF*) realloc(ctx->ref_defs, ctx->alloc_ref_defs * sizeof(MD_REF_DEF)); + if(new_defs == NULL) { + MD_LOG("realloc() failed."); + return -1; + } + + ctx->ref_defs = new_defs; + } + return 0; +} + /* Returns 0 if it is not a reference definition. * * Returns N > 0 if it is a reference definition. N then corresponds to the @@ -2392,20 +2412,7 @@ md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines) return FALSE; /* So, it _is_ a reference definition. Remember it. */ - if(ctx->n_ref_defs >= ctx->alloc_ref_defs) { - MD_REF_DEF* new_defs; - - ctx->alloc_ref_defs = (ctx->alloc_ref_defs > 0 - ? 
ctx->alloc_ref_defs + ctx->alloc_ref_defs / 2 - : 16); - new_defs = (MD_REF_DEF*) realloc(ctx->ref_defs, ctx->alloc_ref_defs * sizeof(MD_REF_DEF)); - if(new_defs == NULL) { - MD_LOG("realloc() failed."); - goto abort; - } - - ctx->ref_defs = new_defs; - } + MD_CHECK(md_push_ref_def(ctx)); def = &ctx->ref_defs[ctx->n_ref_defs]; memset(def, 0, sizeof(MD_REF_DEF)); From 2d15e712d4319429f62dd286b8227d395aadef29 Mon Sep 17 00:00:00 2001 From: chowette Date: Tue, 26 Apr 2022 21:32:38 +0200 Subject: [PATCH 19/35] store identifier with the leading # --- src/md4c.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/md4c.c b/src/md4c.c index 8045e16f..c94d1be7 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -2841,7 +2841,7 @@ md_free_heading_def_hashtable(MD_CTX* ctx) static void md_free_heading_defs(MD_CTX* ctx) { - free(ctx->heading_defs); + free(ctx->heading_defs); } /****************************************** @@ -5216,11 +5216,11 @@ md_setup_H_identifier(MD_CTX* ctx, const MD_BLOCK* block, MD_BLOCK_H_DETAIL* det MD_HEADING_DEF * heading = &ctx->heading_defs[block->heading_def]; if(heading->postfix == 0) { - MD_CHECK(md_build_trivial_attribute(ctx, &ctx->identifiers[heading->ident_beg], - heading->ident_size, &det->identifier, id_build)); + MD_CHECK(md_build_trivial_attribute(ctx, &ctx->identifiers[heading->ident_beg]+1, + heading->ident_size-1, &det->identifier, id_build)); } else { - MD_CHECK(md_build_attribute_postfix(ctx, &ctx->identifiers[heading->ident_beg], - heading->ident_size, heading->postfix, &det->identifier, id_build)); + MD_CHECK(md_build_attribute_postfix(ctx, &ctx->identifiers[heading->ident_beg]+1, + heading->ident_size-1, heading->postfix, &det->identifier, id_build)); } abort: return ret; @@ -6271,12 +6271,13 @@ md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_l while(!(mark->flags & MD_MARK_RESOLVED)) mark++; - def->ident_size = end - beg; + /* The identifier will not be bigger than the 
heading + '#' */ + def->ident_size = end - beg + 1; MD_CHECK(md_alloc_identifiers(ctx, def)); /* copy the ident and transform as needed */ ptr = &ctx->identifiers[def->ident_beg]; - + *ptr++ = _T('#'); // start with a '#' while(1) { OFF line_end = line->end; From ada2e6587d58e2b346594d000c95d540731b73fb Mon Sep 17 00:00:00 2001 From: chowette Date: Tue, 26 Apr 2022 21:34:00 +0200 Subject: [PATCH 20/35] store the heading --- src/md4c.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/md4c.c b/src/md4c.c index c94d1be7..ba19f627 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -2618,6 +2618,8 @@ md_free_ref_defs(MD_CTX* ctx) *********************************************/ struct MD_HEADING_DEF_tag { + CHAR* heading; + SZ heading_size; CHAR* identifier; // only valid after all heading are known unsigned hash; OFF ident_beg; @@ -6258,6 +6260,10 @@ md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_l OFF off = beg; OFF end = lines[n_lines-1].end; + /* store the heading */ + def->heading = (CHAR*)STR(beg); + def->heading_size = end-beg; + /* Reset the previously collected stack of marks. */ ctx->n_marks = 0; From 74f3e4b368c783b4648640ddbafb583f5d14ac3b Mon Sep 17 00:00:00 2001 From: chowette Date: Fri, 14 Oct 2022 21:21:58 +0200 Subject: [PATCH 21/35] add flag MD_FLAG_HEADINGAUTOID doc +typo --- README.md | 4 ++++ test/heading-auto-identifier.txt | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9abe987e..35d26cd4 100644 --- a/README.md +++ b/README.md @@ -128,6 +128,10 @@ extensions: * With the flag `MD_FLAG_UNDERLINE`, underscore (`_`) denotes an underline instead of an ordinary emphasis or strong emphasis. +* With the flag `MD_FLAG_HEADINGAUTOID`, unique identifiers are generated for + headings. The HTML render output them as `id` in the heading tag. For example + `

<h1 id="title">Title</h1>

`. + Few features of CommonMark (those some people see as mis-features) may be disabled with the following flags: diff --git a/test/heading-auto-identifier.txt b/test/heading-auto-identifier.txt index 2955d1ed..0d9b56cc 100644 --- a/test/heading-auto-identifier.txt +++ b/test/heading-auto-identifier.txt @@ -9,7 +9,7 @@ With the flag `MD_FLAG_HEADINGAUTOID`, MD4C generate an identifier for a heading

<h1 id="heading">heading</h1>

```````````````````````````````` -Spaces are replaced by `-` and upercase are replaced by lower case +Spaces are replaced by `-` and uppercase are replaced by lower case ```````````````````````````````` example # The Heading From 6bfc91dcdb707a2aaa77e9091d940c9dc72c117d Mon Sep 17 00:00:00 2001 From: chowette Date: Fri, 14 Oct 2022 21:26:18 +0200 Subject: [PATCH 22/35] remember the heading as a reference definition --- src/md4c.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/md4c.c b/src/md4c.c index ba19f627..dca69141 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -5557,6 +5557,7 @@ md_make_heading(MD_CTX* ctx) MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1); MD_HEADING_DEF * def = NULL; + MD_REF_DEF * rdef = NULL; MD_CHECK(md_push_heading_def(ctx)); def = &ctx->heading_defs[ctx->n_heading_defs]; memset(def, 0, sizeof(MD_HEADING_DEF)); @@ -5566,6 +5567,20 @@ md_make_heading(MD_CTX* ctx) block->heading_def = ctx->n_heading_defs; ctx->n_heading_defs++; + // remember the heading as a reference definition + MD_CHECK(md_push_ref_def(ctx)); + rdef = &ctx->ref_defs[ctx->n_ref_defs]; + memset(rdef, 0, sizeof(MD_REF_DEF)); + rdef->label = def->heading; + rdef->label_size = def->heading_size; + + rdef->dest = &ctx->identifiers[def->ident_beg]; + rdef->dest_size = def->ident_size; + + + /* Success. 
*/ + ctx->n_ref_defs++; + abort: return ret; } From 1ec0845bba1df2f2c8a8d6e0fd368718866760e2 Mon Sep 17 00:00:00 2001 From: chowette Date: Fri, 14 Oct 2022 21:27:19 +0200 Subject: [PATCH 23/35] rebuild identifier reference after a reallocation --- src/md4c.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/md4c.c b/src/md4c.c index dca69141..5e789acc 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -2664,7 +2664,17 @@ md_alloc_identifiers(MD_CTX *ctx, MD_HEADING_DEF* def) MD_LOG("realloc() failed."); return -1; } - + if (ctx->identifiers != new_identifiers){ + // rebuild all ref_def pointing to identifiers + int i; + for(i = 0; i < ctx->n_ref_defs; i++) { + MD_REF_DEF* def = &ctx->ref_defs[i]; + if (def->dest > ctx->identifiers + && def->dest <= ctx->identifiers+ctx->identifiers_size ){ + def->dest = new_identifiers + (def->dest - ctx->identifiers); + } + } + } ctx->identifiers = new_identifiers; } From 1eb9b0628392f0d09ef778bfee1146c03efe4f6b Mon Sep 17 00:00:00 2001 From: chowette Date: Fri, 14 Oct 2022 21:28:29 +0200 Subject: [PATCH 24/35] store the heading level --- src/md4c.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/md4c.c b/src/md4c.c index 5e789acc..7124b567 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -2625,6 +2625,7 @@ struct MD_HEADING_DEF_tag { OFF ident_beg; SZ ident_size; unsigned postfix; + unsigned level:8; }; static int @@ -2684,7 +2685,7 @@ md_alloc_identifiers(MD_CTX *ctx, MD_HEADING_DEF* def) /** forward declaration */ static int -md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_lines); +md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_lines, int level); typedef struct MD_HEADING_DEF_LIST_tag MD_HEADING_DEF_LIST; struct MD_HEADING_DEF_LIST_tag { @@ -5573,7 +5574,7 @@ md_make_heading(MD_CTX* ctx) memset(def, 0, sizeof(MD_HEADING_DEF)); // filling of the heading def - MD_CHECK(md_heading_build_ident(ctx, def, lines, 
block->n_lines)); + MD_CHECK(md_heading_build_ident(ctx, def, lines, block->n_lines, block->data)); block->heading_def = ctx->n_heading_defs; ctx->n_heading_defs++; @@ -6274,7 +6275,7 @@ md_is_container_mark(MD_CTX* ctx, unsigned indent, OFF beg, OFF* p_end, MD_CONTA } static int -md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_lines) +md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_lines, int level) { MD_MARK* mark; CHAR* ptr; @@ -6288,6 +6289,8 @@ md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_l /* store the heading */ def->heading = (CHAR*)STR(beg); def->heading_size = end-beg; + /* store the heading level */ + def->level = level; /* Reset the previously collected stack of marks. */ ctx->n_marks = 0; From 36bb1e98a884e50930b26bb5d220c736d1041d19 Mon Sep 17 00:00:00 2001 From: chowette Date: Fri, 14 Oct 2022 22:18:46 +0200 Subject: [PATCH 25/35] Output TOC at start of document --- src/md4c-html.c | 2 ++ src/md4c.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++ src/md4c.h | 3 ++- 3 files changed, 68 insertions(+), 1 deletion(-) diff --git a/src/md4c-html.c b/src/md4c-html.c index 6892000b..bcee1e65 100644 --- a/src/md4c-html.c +++ b/src/md4c-html.c @@ -411,6 +411,7 @@ enter_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata) case MD_BLOCK_TR: RENDER_VERBATIM(r, "\n"); break; case MD_BLOCK_TH: render_open_td_block(r, "th", (MD_BLOCK_TD_DETAIL*)detail); break; case MD_BLOCK_TD: render_open_td_block(r, "td", (MD_BLOCK_TD_DETAIL*)detail); break; + case MD_BLOCK_NAV: RENDER_VERBATIM(r, "\n"); break; } return 0; diff --git a/src/md4c.c b/src/md4c.c index 7124b567..972745db 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -6993,6 +6993,67 @@ md_process_line(MD_CTX* ctx, const MD_LINE_ANALYSIS** p_pivot_line, MD_LINE_ANAL return ret; } +static int +md_output_toc(MD_CTX *ctx) +{ + MD_HEADING_DEF *hd; + MD_BLOCK_LI_DETAIL li_det; + + MD_ATTRIBUTE_BUILD href_build = 
{0}; + MD_ATTRIBUTE_BUILD title_build = {0}; + MD_SPAN_A_DETAIL a_det; + li_det.is_task = FALSE; + int ret = 0; + int level = 0; + int i; + + MD_ENTER_BLOCK(MD_BLOCK_NAV, NULL); + + for (i = 0; i < ctx->n_heading_defs; ++i){ + hd = &ctx->heading_defs[i]; + while (hd->level > level){ + MD_ENTER_BLOCK(MD_BLOCK_UL, NULL); + ++level; + } + while (hd->level < level){ + MD_LEAVE_BLOCK(MD_BLOCK_UL, NULL); + --level; + } + + MD_ENTER_BLOCK(MD_BLOCK_LI, &li_det); + memset(&a_det, 0, sizeof(MD_SPAN_A_DETAIL)); + if (hd->postfix == 0){ + MD_CHECK(md_build_attribute(ctx, hd->identifier, hd->ident_size, + MD_BUILD_ATTR_NO_ESCAPES, + &a_det.href, &href_build)); + } else { + MD_CHECK(md_build_attribute_postfix(ctx, + hd->identifier, hd->ident_size, + hd->postfix, &a_det.href, &href_build)); + } + + MD_CHECK(md_build_attribute(ctx, NULL, 0, 0, &a_det.title, &title_build)); + + MD_ENTER_SPAN(MD_SPAN_A, &a_det); + + MD_TEXT(MD_TEXT_NORMAL, hd->heading, hd->heading_size); + MD_LEAVE_SPAN(MD_SPAN_A, NULL); + MD_LEAVE_BLOCK(MD_BLOCK_LI, NULL); + } + + // close remaining opened level + while (level > 0){ + MD_LEAVE_BLOCK(MD_BLOCK_UL, NULL); + --level; + } + MD_LEAVE_BLOCK(MD_BLOCK_NAV, NULL); + +abort: + md_free_attribute(ctx, &href_build); + md_free_attribute(ctx, &title_build); + return ret; +} + static int md_process_doc(MD_CTX *ctx) { @@ -7019,6 +7080,9 @@ md_process_doc(MD_CTX *ctx) } MD_CHECK(md_build_ref_def_hashtable(ctx)); + /* Output the TOC */ + MD_CHECK(md_output_toc(ctx)); + /* Process all blocks. 
*/ MD_CHECK(md_leave_child_containers(ctx, 0)); MD_CHECK(md_process_all_blocks(ctx)); diff --git a/src/md4c.h b/src/md4c.h index 0c4984c6..e0e98c70 100644 --- a/src/md4c.h +++ b/src/md4c.h @@ -99,7 +99,8 @@ typedef enum MD_BLOCKTYPE { MD_BLOCK_TBODY, MD_BLOCK_TR, MD_BLOCK_TH, - MD_BLOCK_TD + MD_BLOCK_TD, + MD_BLOCK_NAV } MD_BLOCKTYPE; /* Span represents an in-line piece of a document which should be rendered with From 8738b1e765798f5e5488d0d5abd3be8c94ad2915 Mon Sep 17 00:00:00 2001 From: chowette Date: Sun, 16 Oct 2022 18:48:09 +0200 Subject: [PATCH 26/35] Add TOC option to the parser parameter struct - depth for toc output - Increase the abi_version to 1 - add --table-of-content option to md2html - add --toc-depth=x to limit TOC levels --- md2html/md2html.c | 21 ++++++++++++++++++++- src/md4c-html.c | 6 ++++-- src/md4c-html.h | 5 ++++- src/md4c.c | 46 ++++++++++++++++++++++++++-------------------- src/md4c.h | 24 +++++++++++++++++++++++- 5 files changed, 77 insertions(+), 25 deletions(-) diff --git a/md2html/md2html.c b/md2html/md2html.c index f0a4da60..59d65020 100644 --- a/md2html/md2html.c +++ b/md2html/md2html.c @@ -42,8 +42,10 @@ static unsigned parser_flags = 0; #endif static int want_fullhtml = 0; static int want_xhtml = 0; +static int want_toc = 0; static int want_stat = 0; +MD_TOC_OPTIONS toc_options = { 3, NULL}; /********************************* *** Simple grow-able buffer *** @@ -142,7 +144,7 @@ process_file(FILE* in, FILE* out) t0 = clock(); ret = md_html(buf_in.data, (MD_SIZE)buf_in.size, process_output, (void*) &buf_out, - parser_flags, renderer_flags); + parser_flags, renderer_flags, &toc_options); t1 = clock(); if(ret != 0) { @@ -200,6 +202,8 @@ static const CMDLINE_OPTION cmdline_options[] = { { 'o', "output", 'o', CMDLINE_OPTFLAG_REQUIREDARG }, { 'f', "full-html", 'f', 0 }, { 'x', "xhtml", 'x', 0 }, + { 't', "table-of-content", 't', 0 }, + { 0, "toc-depth", 'd', CMDLINE_OPTFLAG_REQUIREDARG }, { 's', "stat", 's', 0 }, { 'h', "help", 'h', 0 }, { 
'v', "version", 'v', 0 }, @@ -241,6 +245,10 @@ usage(void) " -o --output=FILE Output file (default is standard output)\n" " -f, --full-html Generate full HTML document, including header\n" " -x, --xhtml Generate XHTML instead of HTML\n" + " -t, --table-of-content\n" + " Generate a table of content at start\n" + " --toc-depth=3 set the maximum level of heading in the table\n" + " of content. 1 to 6. Default is 3\n" " -s, --stat Measure time of input parsing\n" " -h, --help Display this help and exit\n" " -v, --version Display version and exit\n" @@ -298,6 +306,15 @@ version(void) static const char* input_path = NULL; static const char* output_path = NULL; +static int parse_toc_depth(char const* value){ + int depth = -1; + depth = *value - '0'; + if(depth<0 || depth > 6){ + depth = -1; + } + return depth; +} + static int cmdline_callback(int opt, char const* value, void* data) { @@ -314,6 +331,8 @@ cmdline_callback(int opt, char const* value, void* data) case 'o': output_path = value; break; case 'f': want_fullhtml = 1; break; case 'x': want_xhtml = 1; renderer_flags |= MD_HTML_FLAG_XHTML; break; + case 't': want_toc = 1; parser_flags |= MD_FLAG_HEADINGAUTOID; break; + case 'd': toc_options.depth = parse_toc_depth(value); break; case 's': want_stat = 1; break; case 'h': usage(); exit(0); break; case 'v': version(); exit(0); break; diff --git a/src/md4c-html.c b/src/md4c-html.c index bcee1e65..b6cdf7af 100644 --- a/src/md4c-html.c +++ b/src/md4c-html.c @@ -546,13 +546,14 @@ debug_log_callback(const char* msg, void* userdata) int md_html(const MD_CHAR* input, MD_SIZE input_size, void (*process_output)(const MD_CHAR*, MD_SIZE, void*), - void* userdata, unsigned parser_flags, unsigned renderer_flags) + void* userdata, unsigned parser_flags, unsigned renderer_flags, + MD_TOC_OPTIONS* toc_options) { MD_HTML render = { process_output, userdata, renderer_flags, 0, { 0 } }; int i; MD_PARSER parser = { - 0, + 1, parser_flags, enter_block_callback, leave_block_callback, @@ 
-560,6 +561,7 @@ md_html(const MD_CHAR* input, MD_SIZE input_size, leave_span_callback, text_callback, debug_log_callback, + *toc_options, NULL }; diff --git a/src/md4c-html.h b/src/md4c-html.h index 23d3f739..aeac7f52 100644 --- a/src/md4c-html.h +++ b/src/md4c-html.h @@ -52,13 +52,16 @@ * Param userdata is just propagated back to process_output() callback. * Param parser_flags are flags from md4c.h propagated to md_parse(). * Param render_flags is bitmask of MD_HTML_FLAG_xxxx. + * Param toc_options is a pointer to toc options from md4c.h propagated to md_parse(). * * Returns -1 on error (if md_parse() fails.) * Returns 0 on success. */ int md_html(const MD_CHAR* input, MD_SIZE input_size, void (*process_output)(const MD_CHAR*, MD_SIZE, void*), - void* userdata, unsigned parser_flags, unsigned renderer_flags); + void* userdata, unsigned parser_flags, unsigned renderer_flags, + MD_TOC_OPTIONS* toc_options + ); #ifdef __cplusplus diff --git a/src/md4c.c b/src/md4c.c index 972745db..bde80add 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -7012,33 +7012,38 @@ md_output_toc(MD_CTX *ctx) for (i = 0; i < ctx->n_heading_defs; ++i){ hd = &ctx->heading_defs[i]; while (hd->level > level){ - MD_ENTER_BLOCK(MD_BLOCK_UL, NULL); + if (level <= ctx->parser.toc_options.depth) + MD_ENTER_BLOCK(MD_BLOCK_UL, NULL); ++level; } while (hd->level < level){ - MD_LEAVE_BLOCK(MD_BLOCK_UL, NULL); + if (level <= ctx->parser.toc_options.depth) + MD_LEAVE_BLOCK(MD_BLOCK_UL, NULL); --level; } - MD_ENTER_BLOCK(MD_BLOCK_LI, &li_det); - memset(&a_det, 0, sizeof(MD_SPAN_A_DETAIL)); - if (hd->postfix == 0){ - MD_CHECK(md_build_attribute(ctx, hd->identifier, hd->ident_size, - MD_BUILD_ATTR_NO_ESCAPES, - &a_det.href, &href_build)); - } else { - MD_CHECK(md_build_attribute_postfix(ctx, - hd->identifier, hd->ident_size, - hd->postfix, &a_det.href, &href_build)); - } + if (level <= ctx->parser.toc_options.depth){ + MD_ENTER_BLOCK(MD_BLOCK_LI, &li_det); + memset(&a_det, 0, sizeof(MD_SPAN_A_DETAIL)); + if 
(hd->postfix == 0){ + MD_CHECK(md_build_attribute(ctx, hd->identifier, hd->ident_size, + MD_BUILD_ATTR_NO_ESCAPES, + &a_det.href, &href_build)); + } else { + MD_CHECK(md_build_attribute_postfix(ctx, + hd->identifier, hd->ident_size, + hd->postfix, &a_det.href, &href_build)); + } - MD_CHECK(md_build_attribute(ctx, NULL, 0, 0, &a_det.title, &title_build)); + MD_CHECK(md_build_attribute(ctx, NULL, 0, 0, &a_det.title, &title_build)); - MD_ENTER_SPAN(MD_SPAN_A, &a_det); + MD_ENTER_SPAN(MD_SPAN_A, &a_det); - MD_TEXT(MD_TEXT_NORMAL, hd->heading, hd->heading_size); - MD_LEAVE_SPAN(MD_SPAN_A, NULL); - MD_LEAVE_BLOCK(MD_BLOCK_LI, NULL); + MD_TEXT(MD_TEXT_NORMAL, hd->heading, hd->heading_size); + MD_LEAVE_SPAN(MD_SPAN_A, NULL); + MD_LEAVE_BLOCK(MD_BLOCK_LI, NULL); + } + } // close remaining opened level @@ -7081,7 +7086,8 @@ md_process_doc(MD_CTX *ctx) MD_CHECK(md_build_ref_def_hashtable(ctx)); /* Output the TOC */ - MD_CHECK(md_output_toc(ctx)); + if(ctx->parser.toc_options.depth > 0) + MD_CHECK(md_output_toc(ctx)); /* Process all blocks. */ MD_CHECK(md_leave_child_containers(ctx, 0)); @@ -7141,7 +7147,7 @@ md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userd int i; int ret; - if(parser->abi_version != 0) { + if(parser->abi_version != 1) { if(parser->debug_log != NULL) parser->debug_log("Unsupported abi_version.", userdata); return -1; diff --git a/src/md4c.h b/src/md4c.h index e0e98c70..8dbd417f 100644 --- a/src/md4c.h +++ b/src/md4c.h @@ -335,10 +335,26 @@ typedef struct MD_SPAN_WIKILINK { #define MD_DIALECT_COMMONMARK 0 #define MD_DIALECT_GITHUB (MD_FLAG_PERMISSIVEAUTOLINKS | MD_FLAG_TABLES | MD_FLAG_STRIKETHROUGH | MD_FLAG_TASKLISTS | MD_FLAG_HEADINGAUTOID) +/* Table of content option structure + */ +typedef struct MD_TOC_OPTIONS { + /* Specify the maximum level of heading to include in the table of contents. + * a value of 0 disable Table of content generation + */ + int depth; + + /* Specify a table of content placeholder. 
+ * + * Providing a empty or NULL placeholder will output the TOC at document start. + */ + const MD_CHAR* toc_placeholder; + +} MD_TOC_OPTIONS; + /* Parser structure. */ typedef struct MD_PARSER { - /* Reserved. Set to zero. + /* Reserved. Set to 1. */ unsigned abi_version; @@ -378,6 +394,12 @@ typedef struct MD_PARSER { */ void (*debug_log)(const char* /*msg*/, void* /*userdata*/); + /* Table of content parameters + * + * + */ + MD_TOC_OPTIONS toc_options; + /* Reserved. Set to NULL. */ void (*syntax)(void); From bc98da465e4760126d486a0658eca0a305bcb1a9 Mon Sep 17 00:00:00 2001 From: chowette Date: Wed, 19 Oct 2022 19:47:39 +0200 Subject: [PATCH 27/35] add optional table of content place holder MARK --table-of -content option has a parameter to set the mark --toc is a shorthand for the --table-of-content option wrong TOC depth is now an error --- md2html/md2html.c | 35 +++++++++++++++++---------- src/md4c.c | 60 ++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 76 insertions(+), 19 deletions(-) diff --git a/md2html/md2html.c b/md2html/md2html.c index 59d65020..9a5b3b9c 100644 --- a/md2html/md2html.c +++ b/md2html/md2html.c @@ -202,8 +202,9 @@ static const CMDLINE_OPTION cmdline_options[] = { { 'o', "output", 'o', CMDLINE_OPTFLAG_REQUIREDARG }, { 'f', "full-html", 'f', 0 }, { 'x', "xhtml", 'x', 0 }, - { 't', "table-of-content", 't', 0 }, - { 0, "toc-depth", 'd', CMDLINE_OPTFLAG_REQUIREDARG }, + { 't', "table-of-content", 't', CMDLINE_OPTFLAG_OPTIONALARG }, + { 0, "toc", 't', CMDLINE_OPTFLAG_OPTIONALARG }, + { 0, "toc-depth", 'd', CMDLINE_OPTFLAG_REQUIREDARG }, { 's', "stat", 's', 0 }, { 'h', "help", 'h', 0 }, { 'v', "version", 'v', 0 }, @@ -245,9 +246,10 @@ usage(void) " -o --output=FILE Output file (default is standard output)\n" " -f, --full-html Generate full HTML document, including header\n" " -x, --xhtml Generate XHTML instead of HTML\n" - " -t, --table-of-content\n" - " Generate a table of content at start\n" - " --toc-depth=3 set the maximum 
level of heading in the table\n" + " -t, --table-of-content=MARK, --toc=MARK\n" + " Generate a table of content in place of MARK line\n" + " If no MARK is given, the toc is generated at start\n" + " --toc-depth=D Set the maximum level of heading in the table\n" " of content. 1 to 6. Default is 3\n" " -s, --stat Measure time of input parsing\n" " -h, --help Display this help and exit\n" @@ -307,12 +309,9 @@ static const char* input_path = NULL; static const char* output_path = NULL; static int parse_toc_depth(char const* value){ - int depth = -1; - depth = *value - '0'; - if(depth<0 || depth > 6){ - depth = -1; - } - return depth; + toc_options.depth = -1; + toc_options.depth = *value - '0'; + return (toc_options.depth>0 && toc_options.depth <= 6); } static int @@ -331,8 +330,18 @@ cmdline_callback(int opt, char const* value, void* data) case 'o': output_path = value; break; case 'f': want_fullhtml = 1; break; case 'x': want_xhtml = 1; renderer_flags |= MD_HTML_FLAG_XHTML; break; - case 't': want_toc = 1; parser_flags |= MD_FLAG_HEADINGAUTOID; break; - case 'd': toc_options.depth = parse_toc_depth(value); break; + case 't': + want_toc = 1; + parser_flags |= MD_FLAG_HEADINGAUTOID; + toc_options.toc_placeholder = value; + break; + case 'd': + if(!parse_toc_depth(value)){ + fprintf(stderr, "Invalid toc-depth: %s\n", value); + fprintf(stderr, "Must be a number in the range 1-6\n"); + exit(1); + } + break; case 's': want_stat = 1; break; case 'h': usage(); exit(0); break; case 'v': version(); exit(0); break; diff --git a/src/md4c.c b/src/md4c.c index bde80add..2b95642a 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -174,7 +174,8 @@ struct MD_CTX_tag { SZ identifiers_size; SZ alloc_identifiers; - + /* Toc informations */ + int toc_found; /* Stack of inline/span markers. 
* This is only used for parsing a single block contents but by storing it @@ -258,7 +259,8 @@ enum MD_LINETYPE_tag { MD_LINE_HTML, MD_LINE_TEXT, MD_LINE_TABLE, - MD_LINE_TABLEUNDERLINE + MD_LINE_TABLEUNDERLINE, + MD_LINE_TOC }; typedef enum MD_LINETYPE_tag MD_LINETYPE; @@ -4615,6 +4617,8 @@ md_enter_leave_span_wikilink(MD_CTX* ctx, int enter, const CHAR* target, SZ targ return ret; } +/** forward declaration */ +static int md_output_toc(MD_CTX *ctx); /* Render the output, accordingly to the analyzed ctx->marks. */ static int @@ -5315,6 +5319,10 @@ md_process_leaf_block(MD_CTX* ctx, const MD_BLOCK* block) (const MD_LINE*)(block + 1), block->n_lines)); break; + case MD_BLOCK_NAV: + MD_CHECK(md_output_toc(ctx)); + break; + default: MD_CHECK(md_process_normal_block_contents(ctx, (const MD_LINE*)(block + 1), block->n_lines)); @@ -5488,6 +5496,10 @@ md_start_new_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* line) block->type = MD_BLOCK_HTML; break; + case MD_LINE_TOC: + block->type = MD_BLOCK_NAV; + break; + case MD_LINE_BLANK: case MD_LINE_SETEXTUNDERLINE: case MD_LINE_TABLEUNDERLINE: @@ -5831,6 +5843,33 @@ md_is_table_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_col_count) return TRUE; } +static int +md_is_toc_line(MD_CTX* ctx, OFF beg, OFF* p_beg, OFF* p_end) +{ + OFF off = beg; + + // allow for blank chars before the TOC mark + while(off < ctx->size && ISBLANK(off)) + off++; + + if(off < ctx->size && ISNEWLINE(off)) + return FALSE; + + const CHAR * toc = ctx->parser.toc_options.toc_placeholder; + + while(off < ctx->size && '\0' != *toc){ + if(CH(off) != *toc) + return FALSE; + toc++; + off++; + } + if('\0' == *toc){ + *p_beg = off; + *p_end = off; + } + return '\0' == *toc; +} + static int md_is_opening_code_fence(MD_CTX* ctx, OFF beg, OFF* p_end) { @@ -6794,6 +6833,15 @@ md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end, } } + /* check for TOC mark */ + if(ctx->parser.toc_options.toc_placeholder != NULL && !ctx->toc_found && + md_is_toc_line(ctx, off, 
&line->beg, &off)) + { + line->type = MD_LINE_TOC; + ctx->toc_found = TRUE; + break; + } + /* By default, we are normal text line. */ line->type = MD_LINE_TEXT; if(pivot_line->type == MD_LINE_TEXT && n_brothers + n_children == 0) { @@ -7007,8 +7055,6 @@ md_output_toc(MD_CTX *ctx) int level = 0; int i; - MD_ENTER_BLOCK(MD_BLOCK_NAV, NULL); - for (i = 0; i < ctx->n_heading_defs; ++i){ hd = &ctx->heading_defs[i]; while (hd->level > level){ @@ -7051,7 +7097,6 @@ md_output_toc(MD_CTX *ctx) MD_LEAVE_BLOCK(MD_BLOCK_UL, NULL); --level; } - MD_LEAVE_BLOCK(MD_BLOCK_NAV, NULL); abort: md_free_attribute(ctx, &href_build); @@ -7086,8 +7131,11 @@ md_process_doc(MD_CTX *ctx) MD_CHECK(md_build_ref_def_hashtable(ctx)); /* Output the TOC */ - if(ctx->parser.toc_options.depth > 0) + if(ctx->parser.toc_options.depth > 0 && !ctx->toc_found) { + MD_ENTER_BLOCK(MD_BLOCK_NAV, NULL); MD_CHECK(md_output_toc(ctx)); + MD_LEAVE_BLOCK(MD_BLOCK_NAV, NULL); + } /* Process all blocks. */ MD_CHECK(md_leave_child_containers(ctx, 0)); From 3d4fc52f9a0e85d4afdccc621c56a0382d8dd2dd Mon Sep 17 00:00:00 2001 From: chowette Date: Wed, 19 Oct 2022 19:48:33 +0200 Subject: [PATCH 28/35] fix probleme with table of content
    and
generation --- src/md4c.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/md4c.c b/src/md4c.c index 2b95642a..079cf026 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -7058,9 +7058,9 @@ md_output_toc(MD_CTX *ctx) for (i = 0; i < ctx->n_heading_defs; ++i){ hd = &ctx->heading_defs[i]; while (hd->level > level){ + ++level; if (level <= ctx->parser.toc_options.depth) MD_ENTER_BLOCK(MD_BLOCK_UL, NULL); - ++level; } while (hd->level < level){ if (level <= ctx->parser.toc_options.depth) @@ -7094,7 +7094,8 @@ md_output_toc(MD_CTX *ctx) // close remaining opened level while (level > 0){ - MD_LEAVE_BLOCK(MD_BLOCK_UL, NULL); + if (level <= ctx->parser.toc_options.depth) + MD_LEAVE_BLOCK(MD_BLOCK_UL, NULL); --level; } From 643423bd418db5f025671d5aa8ff74d4d8659a58 Mon Sep 17 00:00:00 2001 From: chowette Date: Wed, 19 Oct 2022 19:51:50 +0200 Subject: [PATCH 29/35] add some test to the TOC option --- scripts/run-tests.sh | 4 +++ test/toc.txt | 76 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 test/toc.txt diff --git a/scripts/run-tests.sh b/scripts/run-tests.sh index ae0003b1..0b794acc 100755 --- a/scripts/run-tests.sh +++ b/scripts/run-tests.sh @@ -81,3 +81,7 @@ $PYTHON "$TEST_DIR/pathological_tests.py" -p "$PROGRAM" echo echo "Heading auto identifiers pathological input:" $PYTHON "$TEST_DIR/pathological_auto_ident_tests.py" -p "$PROGRAM --fheading-auto-id" + +echo +echo "Table of content extension:" +$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/toc.txt" -p "$PROGRAM --table-of-content" diff --git a/test/toc.txt b/test/toc.txt new file mode 100644 index 00000000..ee010b4f --- /dev/null +++ b/test/toc.txt @@ -0,0 +1,76 @@ +# Table of content + +With the option `--table-of-content`, MD4C enables extension for output of +toc. + +Basic toc may look as follows: + +```````````````````````````````` example +# title +. + +

title

+```````````````````````````````` + +By default, the toc-depth is limited to heading of level 3 + +```````````````````````````````` example +# title level 1 +## title level 2 +### title level 3 +#### title level 4 +##### title level 5 +. + +

title level 1

+

title level 2

+

title level 3

+

title level 4

+
title level 5
+```````````````````````````````` + +The toc can skip some level + +```````````````````````````````` example +### title level 3 +# title level 1 +## title level 2 +##### title level 5 +### title level 3 again +. + +

title level 3

+

title level 1

+

title level 2

+
title level 5
+

title level 3 again

+```````````````````````````````` From 001494bc4a2093218b4935fcae91babea6e96e9c Mon Sep 17 00:00:00 2001 From: chowette Date: Wed, 19 Oct 2022 21:40:58 +0200 Subject: [PATCH 30/35] Table of content placement tests --- scripts/run-tests.sh | 4 +++ test/toc-mark.txt | 85 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100644 test/toc-mark.txt diff --git a/scripts/run-tests.sh b/scripts/run-tests.sh index 0b794acc..874345ce 100755 --- a/scripts/run-tests.sh +++ b/scripts/run-tests.sh @@ -85,3 +85,7 @@ $PYTHON "$TEST_DIR/pathological_auto_ident_tests.py" -p "$PROGRAM --fheading-aut echo echo "Table of content extension:" $PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/toc.txt" -p "$PROGRAM --table-of-content" + +echo +echo "Table of content placement extension :" +$PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/toc-mark.txt" -p "$PROGRAM --table-of-content=[[__TOC__]]" diff --git a/test/toc-mark.txt b/test/toc-mark.txt new file mode 100644 index 00000000..68283728 --- /dev/null +++ b/test/toc-mark.txt @@ -0,0 +1,85 @@ +# Table of content mark + +The TOC mark allow to place the toc where you need it. +Run the example with --toc=[[__TOC__]] + +```````````````````````````````` example +# title +# table of content +[[__TOC__]] +# some chapter +. +

title

+

table of content

+ +

some chapter

+```````````````````````````````` + +Only the first mark is replaced by the TOC + + + +```````````````````````````````` example +# title +[[__TOC__]] +[[__TOC__]] +. +

title

+ +

[[TOC]]

+```````````````````````````````` + +The TOC mark must be alone at start of a line or it is invalid: + +```````````````````````````````` example +# title +invalid [[__TOC__]] mark +. + +

title

+

invalid [[TOC]] mark

+ +```````````````````````````````` + +But you can have space at start of a line: + +```````````````````````````````` example +# title + [[__TOC__]] mark +. +

title

+ +```````````````````````````````` + +The text after the TOC mark is discarded: + +```````````````````````````````` example +# title +[[__TOC__]] discarded text +. +

title

+ +```````````````````````````````` + From 1ee979f9e16a44d3accc669cd4a7bc853952455f Mon Sep 17 00:00:00 2001 From: chowette Date: Wed, 19 Oct 2022 22:12:12 +0200 Subject: [PATCH 31/35] fix default TOC depth to properly handel case when no TOC is needed --- md2html/md2html.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/md2html/md2html.c b/md2html/md2html.c index 9a5b3b9c..139a6eda 100644 --- a/md2html/md2html.c +++ b/md2html/md2html.c @@ -45,7 +45,7 @@ static int want_xhtml = 0; static int want_toc = 0; static int want_stat = 0; -MD_TOC_OPTIONS toc_options = { 3, NULL}; +MD_TOC_OPTIONS toc_options = { 0, NULL}; /********************************* *** Simple grow-able buffer *** @@ -334,6 +334,8 @@ cmdline_callback(int opt, char const* value, void* data) want_toc = 1; parser_flags |= MD_FLAG_HEADINGAUTOID; toc_options.toc_placeholder = value; + if(toc_options.depth == 0) + toc_options.depth = 3; break; case 'd': if(!parse_toc_depth(value)){ From 3fca9196c27f61ad7011a6e1a8728950ce22b6e9 Mon Sep 17 00:00:00 2001 From: chowette Date: Wed, 19 Oct 2022 22:12:36 +0200 Subject: [PATCH 32/35] add some more pathological tests cases --- test/pathological_auto_ident_tests.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/pathological_auto_ident_tests.py b/test/pathological_auto_ident_tests.py index fe23770e..269fb917 100755 --- a/test/pathological_auto_ident_tests.py +++ b/test/pathological_auto_ident_tests.py @@ -24,7 +24,13 @@ "many identical heading": (("# a\n" * (50000+1)), - re.compile("^

a

\n(

a

\n){50000}$")) + re.compile("^

a

\n(

a

\n){50000}$")), + "too many identical heading": + (("# a\n" * (70000+2)), + re.compile("^

a

\n(

a

\n){70000}(

a

\n)$")), + "heading realocation": + (("# A long title to trigger a reallocation\n"*(300+1)), + re.compile("^

A long title to trigger a reallocation

\n(

A long title to trigger a reallocation

\n){300}$")) } whitespace_re = re.compile('/s+/') From d4f99b21928e87888fe765309e8bcf71a95bb3ea Mon Sep 17 00:00:00 2001 From: chowette Date: Thu, 20 Oct 2022 00:54:40 +0200 Subject: [PATCH 33/35] Fix declaration-after-statement build error in travis --- src/md4c.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/md4c.c b/src/md4c.c index 079cf026..b9398749 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -5847,6 +5847,7 @@ static int md_is_toc_line(MD_CTX* ctx, OFF beg, OFF* p_beg, OFF* p_end) { OFF off = beg; + const CHAR * toc = ctx->parser.toc_options.toc_placeholder; // allow for blank chars before the TOC mark while(off < ctx->size && ISBLANK(off)) @@ -5855,8 +5856,6 @@ md_is_toc_line(MD_CTX* ctx, OFF beg, OFF* p_beg, OFF* p_end) if(off < ctx->size && ISNEWLINE(off)) return FALSE; - const CHAR * toc = ctx->parser.toc_options.toc_placeholder; - while(off < ctx->size && '\0' != *toc){ if(CH(off) != *toc) return FALSE; @@ -7045,12 +7044,11 @@ static int md_output_toc(MD_CTX *ctx) { MD_HEADING_DEF *hd; - MD_BLOCK_LI_DETAIL li_det; + MD_BLOCK_LI_DETAIL li_det = {0}; MD_ATTRIBUTE_BUILD href_build = {0}; MD_ATTRIBUTE_BUILD title_build = {0}; MD_SPAN_A_DETAIL a_det; - li_det.is_task = FALSE; int ret = 0; int level = 0; int i; From f2fab2e4eba706382efa32940ddc223a2e2ccb7a Mon Sep 17 00:00:00 2001 From: chowette Date: Thu, 20 Oct 2022 00:58:31 +0200 Subject: [PATCH 34/35] Fix bug with empty heading found by @software-made-easy --- src/md4c.c | 10 +++++++--- test/heading-auto-identifier.txt | 8 ++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/md4c.c b/src/md4c.c index b9398749..89f4642c 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -6422,10 +6422,14 @@ md_heading_build_ident(MD_CTX* ctx, MD_HEADING_DEF* def, MD_LINE* lines, int n_l off = mark->end; - /* Move to next resolved mark. */ - mark++; - while(!(mark->flags & MD_MARK_RESOLVED) || mark->beg < off) + /* Move to next resolved mark. 
But not past the last mark */ + if(mark < &ctx->marks[ctx->n_marks]) mark++; + while((mark < &ctx->marks[ctx->n_marks]) && + ( !(mark->flags & MD_MARK_RESOLVED) || mark->beg < off)) + { + mark++; + } } /* If reached end of line, move to next one. */ diff --git a/test/heading-auto-identifier.txt b/test/heading-auto-identifier.txt index 0d9b56cc..43fe919f 100644 --- a/test/heading-auto-identifier.txt +++ b/test/heading-auto-identifier.txt @@ -153,3 +153,11 @@ line link inside ```````````````````````````````` + +We need to be able to parse empty title +```````````````````````````````` example +# +. +

+```````````````````````````````` + From a41ab752d59db079d2cbc2a33f211fa509655670 Mon Sep 17 00:00:00 2001 From: chowette Date: Mon, 24 Oct 2022 22:53:45 +0200 Subject: [PATCH 35/35] add more tests to improve coverage --- scripts/run-tests.sh | 2 +- test/heading-auto-identifier.txt | 4 ++-- test/toc.txt | 28 ++++++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/scripts/run-tests.sh b/scripts/run-tests.sh index 874345ce..6ed95ffa 100755 --- a/scripts/run-tests.sh +++ b/scripts/run-tests.sh @@ -87,5 +87,5 @@ echo "Table of content extension:" $PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/toc.txt" -p "$PROGRAM --table-of-content" echo -echo "Table of content placement extension :" +echo "Table of content placement extension:" $PYTHON "$TEST_DIR/spec_tests.py" -s "$TEST_DIR/toc-mark.txt" -p "$PROGRAM --table-of-content=[[__TOC__]]" diff --git a/test/heading-auto-identifier.txt b/test/heading-auto-identifier.txt index 43fe919f..09969626 100644 --- a/test/heading-auto-identifier.txt +++ b/test/heading-auto-identifier.txt @@ -20,9 +20,9 @@ Spaces are replaced by `-` and uppercase are replaced by lower case Unicode characters can also be put lower case ```````````````````````````````` example -# ĀĄŁŇŢŰŽבあИЯ +# ĀĄŁŇŢŰŽבあИЯ𐒰 . -

ĀĄŁŇŢŰŽבあИЯ

+

ĀĄŁŇŢŰŽבあИЯ𐒰

```````````````````````````````` diff --git a/test/toc.txt b/test/toc.txt index ee010b4f..a21fc788 100644 --- a/test/toc.txt +++ b/test/toc.txt @@ -74,3 +74,31 @@ The toc can skip some level
title level 5

title level 3 again

```````````````````````````````` + +# Coverage + +Additional test to improve test coverage. + +This sample will output TOC with heading suffix numbers. + + +```````````````````````````````` example +# title +## title +### title +. + +

title

+

title

+

title

+```````````````````````````````` \ No newline at end of file