feat(website): add markdown-it-py README parser and inline renderer tests

Introduce readme_parser.py which parses README.md into structured section data using the markdown-it-py AST. Includes TypedDicts for ParsedEntry/ParsedSection, slugify(), render_inline_html(), and render_inline_text(). Add test_readme_parser.py covering HTML escaping, link rendering, emphasis, strong, and code_inline for both renderers. Co-Authored-By: Claude <noreply@anthropic.com>
2026-05-10 08:38:05 +08:00 · 2026-03-18 17:20:23 +08:00
parent c5caa5a5e1
commit 5fa7c7d1a6
2 changed files with 162 additions and 0 deletions
@@ -0,0 +1,93 @@
+"""Parse README.md into structured section data using markdown-it-py AST."""
+
+from __future__ import annotations
+
+import re
+from typing import TypedDict
+
+from markdown_it.tree import SyntaxTreeNode
+from markupsafe import escape
+
+
+class AlsoSee(TypedDict):
+    """A name/URL pair; the element type of ``ParsedEntry.also_see``."""
+
+    # Display name of the referenced item.
+    # NOTE(review): presumably the link text of a related project -- confirm
+    # against the parser that fills this in.
+    name: str
+    # Target URL of the referenced item.
+    url: str
+
+
+class ParsedEntry(TypedDict):
+    """One entry of a parsed section: a named link plus its description."""
+
+    # Entry name.
+    name: str
+    # Entry URL.
+    url: str
+    description: str  # inline HTML, properly escaped
+    # Additional name/URL pairs attached to this entry (see ``AlsoSee``).
+    also_see: list[AlsoSee]
+
+
+class ParsedSection(TypedDict):
+    """Structured data describing one section parsed from the README."""
+
+    # Section (category) name.
+    name: str
+    # URL-friendly identifier; presumably produced by ``slugify(name)`` --
+    # TODO confirm against the code that builds sections.
+    slug: str
+    description: str  # plain text, links resolved to text
+    content: str  # raw markdown (backward compat)
+    # Entries belonging to this section.
+    entries: list[ParsedEntry]
+    # NOTE(review): presumably ``len(entries)`` -- confirm against the builder.
+    entry_count: int
+    # NOTE(review): how the preview text is derived is not visible here.
+    preview: str
+    content_html: str  # rendered HTML, properly escaped
+
+
+# --- Slugify ----------------------------------------------------------------
+
+# Anything that is not a lowercase ASCII letter, digit, whitespace or hyphen.
+_SLUG_NON_ALNUM_RE = re.compile(r"[^a-z0-9\s-]")
+# One or more consecutive whitespace characters.
+_SLUG_WHITESPACE_RE = re.compile(r"[\s]+")
+# One or more consecutive hyphens.
+_SLUG_MULTI_DASH_RE = re.compile(r"-+")
+
+
+def slugify(name: str) -> str:
+    """Convert a category name to a URL-friendly slug.
+
+    The name is lowercased, every character other than ``a-z``, ``0-9``,
+    whitespace and ``-`` is removed, whitespace runs become a single
+    hyphen, hyphen runs are collapsed, and leading/trailing hyphens are
+    stripped.
+
+    Args:
+        name: Human-readable category name.
+
+    Returns:
+        The slug. It is empty when *name* contains no ASCII letters or
+        digits.
+
+    >>> slugify("Command-line Interface Development")
+    'command-line-interface-development'
+    >>> slugify("- Foo -")
+    'foo'
+    """
+    slug = _SLUG_NON_ALNUM_RE.sub("", name.lower())
+    slug = _SLUG_WHITESPACE_RE.sub("-", slug.strip())
+    slug = _SLUG_MULTI_DASH_RE.sub("-", slug)
+    # Hyphens left at either end (e.g. from "- Foo -" or "Foo -") would
+    # otherwise leak into the slug as "-foo-" / "foo-".
+    return slug.strip("-")
+
+
+# --- Inline renderers -------------------------------------------------------
+
+
+def render_inline_html(children: list[SyntaxTreeNode]) -> str:
+    """Render inline AST nodes to HTML with proper escaping.
+
+    Text, code spans, raw inline HTML and link targets are passed through
+    ``markupsafe.escape``, so nothing from the Markdown source reaches the
+    output as live markup. Only the tags built here (``<a>``, ``<em>``,
+    ``<strong>``, ``<code>``, ``<br>``) are emitted unescaped.
+
+    Args:
+        children: Inline nodes to render, e.g. the children of an
+            ``inline`` node.
+
+    Returns:
+        The concatenated HTML fragment; ``""`` for an empty list.
+    """
+    parts: list[str] = []
+    for child in children:
+        match child.type:
+            case "text" | "html_inline":
+                # Raw inline HTML is shown as literal text, never as markup.
+                parts.append(str(escape(child.content)))
+            case "softbreak":
+                parts.append(" ")
+            case "hardbreak":
+                # Previously dropped, which fused the words on either side.
+                parts.append("<br>")
+            case "link":
+                href = str(escape(child.attrGet("href") or ""))
+                inner = render_inline_html(child.children)
+                parts.append(
+                    f'<a href="{href}" target="_blank" rel="noopener">{inner}</a>'
+                )
+            case "em":
+                parts.append(f"<em>{render_inline_html(child.children)}</em>")
+            case "strong":
+                parts.append(f"<strong>{render_inline_html(child.children)}</strong>")
+            case "code_inline":
+                parts.append(f"<code>{escape(child.content)}</code>")
+            case _:
+                # Unrecognised node (e.g. an image): keep the text of its
+                # children instead of silently discarding it. Leaf nodes
+                # have no children and contribute nothing.
+                parts.append(render_inline_html(child.children))
+    return "".join(parts)
+
+
+def render_inline_text(children: list[SyntaxTreeNode]) -> str:
+    """Render inline AST nodes to plain text (links become their text).
+
+    Formatting is discarded: emphasis, strong and link nodes contribute
+    only the text of their children, code spans contribute their content,
+    and both soft and hard line breaks become a single space. Raw inline
+    HTML is omitted.
+
+    Args:
+        children: Inline nodes to render, e.g. the children of an
+            ``inline`` node.
+
+    Returns:
+        The concatenated, unescaped text; ``""`` for an empty list.
+    """
+    parts: list[str] = []
+    for child in children:
+        match child.type:
+            case "text" | "code_inline":
+                parts.append(child.content)
+            case "softbreak" | "hardbreak":
+                # A hard break used to be dropped, fusing adjacent words.
+                parts.append(" ")
+            case "html_inline":
+                # Markup has no plain-text equivalent; leave it out.
+                continue
+            case _:
+                # em / strong / link, plus any other container (e.g. an
+                # image's alt text): flatten to the text of the children.
+                parts.append(render_inline_text(child.children))
+    return "".join(parts)
@@ -0,0 +1,69 @@
+"""Tests for the readme_parser module."""
+
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+from readme_parser import render_inline_html, render_inline_text
+
+from markdown_it import MarkdownIt
+from markdown_it.tree import SyntaxTreeNode
+
+
+def _parse_inline(md_text: str) -> list[SyntaxTreeNode]:
+    """Parse *md_text* and return the inline nodes of its first block.
+
+    The input is expected to be a single paragraph, so the tree is walked
+    as root -> paragraph -> inline and the inline node's children are
+    returned.
+    """
+    tokens = MarkdownIt("commonmark").parse(md_text)
+    tree = SyntaxTreeNode(tokens)
+    paragraph = tree.children[0]
+    inline = paragraph.children[0]
+    return inline.children
+
+
+class TestRenderInlineHtml:
+    """Checks for ``render_inline_html`` on CommonMark inline input."""
+
+    def test_plain_text_escapes_html(self):
+        """HTML-significant characters are emitted as entities."""
+        nodes = _parse_inline("Hello <world> & friends")
+        rendered = render_inline_html(nodes)
+        assert rendered == "Hello &lt;world&gt; &amp; friends"
+
+    def test_link_with_target(self):
+        """A link carries its href, opens in a new tab and keeps its text."""
+        rendered = render_inline_html(_parse_inline("[name](https://example.com)"))
+        expected_fragments = (
+            'href="https://example.com"',
+            'target="_blank"',
+            'rel="noopener"',
+            ">name</a>",
+        )
+        for fragment in expected_fragments:
+            assert fragment in rendered
+
+    def test_emphasis(self):
+        """``*...*`` becomes an ``<em>`` element."""
+        rendered = render_inline_html(_parse_inline("*italic* text"))
+        assert "<em>italic</em>" in rendered
+
+    def test_strong(self):
+        """``**...**`` becomes a ``<strong>`` element."""
+        rendered = render_inline_html(_parse_inline("**bold** text"))
+        assert "<strong>bold</strong>" in rendered
+
+    def test_code_inline(self):
+        """A backtick span becomes a ``<code>`` element."""
+        rendered = render_inline_html(_parse_inline("`some code`"))
+        assert "<code>some code</code>" in rendered
+
+    def test_mixed_link_and_text(self):
+        """Text on both sides of a link is kept alongside the anchor."""
+        rendered = render_inline_html(
+            _parse_inline("See [foo](https://x.com) for details.")
+        )
+        for fragment in ("See ", ">foo</a>", " for details."):
+            assert fragment in rendered
+
+
+class TestRenderInlineText:
+    """Checks for ``render_inline_text`` on CommonMark inline input."""
+
+    def test_plain_text(self):
+        """Plain text passes through unchanged."""
+        flattened = render_inline_text(_parse_inline("Hello world"))
+        assert flattened == "Hello world"
+
+    def test_link_becomes_text(self):
+        """A link is reduced to its link text; the URL is discarded."""
+        nodes = _parse_inline("See [awesome-algos](https://github.com/x/y).")
+        flattened = render_inline_text(nodes)
+        assert flattened == "See awesome-algos."
+
+    def test_emphasis_stripped(self):
+        """Emphasis markers are removed, leaving only the inner text."""
+        flattened = render_inline_text(_parse_inline("*italic* text"))
+        assert flattened == "italic text"
+
+    def test_code_inline_kept(self):
+        """A code span keeps its content without the backticks."""
+        flattened = render_inline_text(_parse_inline("`code` here"))
+        assert flattened == "code here"