From 5fa7c7d1a670387dc04b6408a06e7b1a6dbbbf42 Mon Sep 17 00:00:00 2001
From: Vinta Chen
Date: Wed, 18 Mar 2026 17:20:23 +0800
Subject: [PATCH] feat(website): add markdown-it-py README parser and inline renderer tests

Introduce readme_parser.py which parses README.md into structured section
data using the markdown-it-py AST. Includes TypedDicts for
ParsedEntry/ParsedSection, slugify(), render_inline_html(), and
render_inline_text(). Add test_readme_parser.py covering HTML escaping,
link rendering, emphasis, strong, and code_inline for both renderers.

Co-Authored-By: Claude
---
 website/readme_parser.py            | 93 +++++++++++++++++++++++++++++
 website/tests/test_readme_parser.py | 69 +++++++++++++++++++++
 2 files changed, 162 insertions(+)
 create mode 100644 website/readme_parser.py
 create mode 100644 website/tests/test_readme_parser.py

diff --git a/website/readme_parser.py b/website/readme_parser.py
new file mode 100644
index 00000000..a98e0e0c
--- /dev/null
+++ b/website/readme_parser.py
@@ -0,0 +1,93 @@
+"""Parse README.md into structured section data using markdown-it-py AST."""
+
+from __future__ import annotations
+
+import re
+from typing import TypedDict
+
+from markdown_it.tree import SyntaxTreeNode
+from markupsafe import escape
+
+
+class AlsoSee(TypedDict):
+    name: str
+    url: str
+
+
+class ParsedEntry(TypedDict):
+    name: str
+    url: str
+    description: str  # inline HTML, properly escaped
+    also_see: list[AlsoSee]
+
+
+class ParsedSection(TypedDict):
+    name: str
+    slug: str
+    description: str  # plain text, links resolved to text
+    content: str  # raw markdown (backward compat)
+    entries: list[ParsedEntry]
+    entry_count: int
+    preview: str
+    content_html: str  # rendered HTML, properly escaped
+
+
+# --- Slugify ----------------------------------------------------------------
+
+_SLUG_NON_ALNUM_RE = re.compile(r"[^a-z0-9\s-]")
+_SLUG_WHITESPACE_RE = re.compile(r"[\s]+")
+_SLUG_MULTI_DASH_RE = re.compile(r"-+")
+
+
+def slugify(name: str) -> str:
+    """Convert a category name to a URL-friendly slug."""
+    slug = name.lower()
+    slug = _SLUG_NON_ALNUM_RE.sub("", slug)
+    slug = _SLUG_WHITESPACE_RE.sub("-", slug.strip())
+    slug = _SLUG_MULTI_DASH_RE.sub("-", slug)
+    return slug
+
+
+# --- Inline renderers -------------------------------------------------------
+
+
+def render_inline_html(children: list[SyntaxTreeNode]) -> str:
+    """Render inline AST nodes to HTML with proper escaping."""
+    parts: list[str] = []
+    for child in children:
+        match child.type:
+            case "text":
+                parts.append(str(escape(child.content)))
+            case "softbreak":
+                parts.append(" ")
+            case "link":
+                href = str(escape(child.attrGet("href") or ""))
+                inner = render_inline_html(child.children)
+                parts.append(
+                    f'<a href="{href}" target="_blank" rel="noopener">{inner}</a>'
+                )
+            case "em":
+                parts.append(f"<em>{render_inline_html(child.children)}</em>")
+            case "strong":
+                parts.append(f"<strong>{render_inline_html(child.children)}</strong>")
+            case "code_inline":
+                parts.append(f"<code>{escape(child.content)}</code>")
+            case "html_inline":
+                parts.append(str(escape(child.content)))
+    return "".join(parts)
+
+
+def render_inline_text(children: list[SyntaxTreeNode]) -> str:
+    """Render inline AST nodes to plain text (links become their text)."""
+    parts: list[str] = []
+    for child in children:
+        match child.type:
+            case "text":
+                parts.append(child.content)
+            case "softbreak":
+                parts.append(" ")
+            case "code_inline":
+                parts.append(child.content)
+            case "em" | "strong" | "link":
+                parts.append(render_inline_text(child.children))
+    return "".join(parts)
diff --git a/website/tests/test_readme_parser.py b/website/tests/test_readme_parser.py
new file mode 100644
index 00000000..974143e5
--- /dev/null
+++ b/website/tests/test_readme_parser.py
@@ -0,0 +1,69 @@
+"""Tests for the readme_parser module."""
+
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+from readme_parser import render_inline_html, render_inline_text
+
+from markdown_it import MarkdownIt
+from markdown_it.tree import SyntaxTreeNode
+
+
+def _parse_inline(md_text: str) -> list[SyntaxTreeNode]:
+    """Helper: parse a single paragraph and return its inline children."""
+    md = MarkdownIt("commonmark")
+    root = SyntaxTreeNode(md.parse(md_text))
+    # root > paragraph > inline > children
+    return root.children[0].children[0].children
+
+
+class TestRenderInlineHtml:
+    def test_plain_text_escapes_html(self):
+        children = _parse_inline("Hello <world> & friends")
+        assert render_inline_html(children) == "Hello &lt;world&gt; &amp; friends"
+
+    def test_link_with_target(self):
+        children = _parse_inline("[name](https://example.com)")
+        html = render_inline_html(children)
+        assert 'href="https://example.com"' in html
+        assert 'target="_blank"' in html
+        assert 'rel="noopener"' in html
+        assert ">name</a>" in html
+
+    def test_emphasis(self):
+        children = _parse_inline("*italic* text")
+        assert "<em>italic</em>" in render_inline_html(children)
+
+    def test_strong(self):
+        children = _parse_inline("**bold** text")
+        assert "<strong>bold</strong>" in render_inline_html(children)
+
+    def test_code_inline(self):
+        children = _parse_inline("`some code`")
+        assert "<code>some code</code>" in render_inline_html(children)
+
+    def test_mixed_link_and_text(self):
+        children = _parse_inline("See [foo](https://x.com) for details.")
+        html = render_inline_html(children)
+        assert "See " in html
+        assert ">foo</a>" in html
+        assert " for details." in html
+
+
+class TestRenderInlineText:
+    def test_plain_text(self):
+        children = _parse_inline("Hello world")
+        assert render_inline_text(children) == "Hello world"
+
+    def test_link_becomes_text(self):
+        children = _parse_inline("See [awesome-algos](https://github.com/x/y).")
+        assert render_inline_text(children) == "See awesome-algos."
+
+    def test_emphasis_stripped(self):
+        children = _parse_inline("*italic* text")
+        assert render_inline_text(children) == "italic text"
+
+    def test_code_inline_kept(self):
+        children = _parse_inline("`code` here")
+        assert render_inline_text(children) == "code here"