From 64948d26cec15f05485bdab8b0d3b9026b10808a Mon Sep 17 00:00:00 2001 From: Vincent Gao Date: Thu, 25 Jun 2026 10:14:30 +0200 Subject: [PATCH] fix: decode HTML entities before slugifying header IDs When a markdown header contains HTML entities (e.g. `# &lt;othertext`), `header_id_from_text` passed the raw entity string to `_slugify` without HTML-decoding it first. The `&` and `;` were stripped as non-word characters but the entity name letters (e.g. `lt`) were kept, silently corrupting the generated ID (`ltothertext` instead of `othertext`). Fix: call `html.unescape()` on the header text before slugifying so that entity characters are resolved to their actual Unicode code points first, then stripped (or kept) by the slug logic as any other character would be. Closes #649 --- lib/markdown2.py | 3 ++- test/tm-cases/header_ids_entity.html | 5 +++++ test/tm-cases/header_ids_entity.opts | 1 + test/tm-cases/header_ids_entity.text | 5 +++++ 4 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 test/tm-cases/header_ids_entity.html create mode 100644 test/tm-cases/header_ids_entity.opts create mode 100644 test/tm-cases/header_ids_entity.text diff --git a/lib/markdown2.py b/lib/markdown2.py index 8dd1c48e..855ce806 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -116,6 +116,7 @@ __author__ = "Trent Mick" import argparse +import html import logging import re import sys @@ -1578,7 +1579,7 @@ def header_id_from_text(self, None to not have an id attribute and to exclude this header from the TOC (if the "toc" extra is specified). """ - header_id = _slugify(text) + header_id = _slugify(html.unescape(text)) if prefix and isinstance(prefix, str): header_id = prefix + '-' + header_id diff --git a/test/tm-cases/header_ids_entity.html b/test/tm-cases/header_ids_entity.html new file mode 100644 index 00000000..6c21b7fa --- /dev/null +++ b/test/tm-cases/header_ids_entity.html @@ -0,0 +1,5 @@ +

<escaped-lt

+ +

R&D Notes

+ +

Normal Header

diff --git a/test/tm-cases/header_ids_entity.opts b/test/tm-cases/header_ids_entity.opts new file mode 100644 index 00000000..38b8bfe8 --- /dev/null +++ b/test/tm-cases/header_ids_entity.opts @@ -0,0 +1 @@ +{"extras": ["header-ids"]} diff --git a/test/tm-cases/header_ids_entity.text b/test/tm-cases/header_ids_entity.text new file mode 100644 index 00000000..2dbfcc85 --- /dev/null +++ b/test/tm-cases/header_ids_entity.text @@ -0,0 +1,5 @@ +# &lt;escaped-lt + +# R&amp;D Notes + +# Normal Header