mirror of
https://github.com/vinta/awesome-python.git
synced 2026-03-24 01:24:43 +08:00
feat: replace regex README parser with markdown-it-py AST parser
Introduce parse_readme() which uses MarkdownIt to build a full AST instead of line-by-line regex matching. The function splits the document at the thematic break, groups nodes by h2 heading, extracts category descriptions from leading italic paragraphs, and separates the Categories, Resources, and Contributing sections cleanly. Add markdown-it-py==4.0.0 (+ mdurl) as a runtime dependency to support the new parser. Tests cover section counts, names, slugs, descriptions, content presence, boundary conditions (no separator, no description), and mixed description markup. Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -5,6 +5,7 @@ from __future__ import annotations
|
||||
import re
|
||||
from typing import TypedDict
|
||||
|
||||
from markdown_it import MarkdownIt
|
||||
from markdown_it.tree import SyntaxTreeNode
|
||||
from markupsafe import escape
|
||||
|
||||
@@ -91,3 +92,169 @@ def render_inline_text(children: list[SyntaxTreeNode]) -> str:
|
||||
case "em" | "strong" | "link":
|
||||
parts.append(render_inline_text(child.children))
|
||||
return "".join(parts)
|
||||
|
||||
|
||||
# --- AST helpers -------------------------------------------------------------
|
||||
|
||||
|
||||
def _heading_text(node: SyntaxTreeNode) -> str:
    """Return the plain-text content of a heading node ("" if it has none)."""
    inline = next((c for c in node.children if c.type == "inline"), None)
    if inline is None:
        return ""
    return render_inline_text(inline.children)
|
||||
|
||||
|
||||
def _extract_description(nodes: list[SyntaxTreeNode]) -> str:
    """Return the description text when the section opens with an italic paragraph.

    Pattern: _Libraries for foo._ -> "Libraries for foo."
    Returns "" when the first node is missing, not a paragraph, or not a
    single-<em> inline.
    """
    if not nodes or nodes[0].type != "paragraph":
        return ""
    for inline in nodes[0].children:
        if inline.type != "inline" or len(inline.children) != 1:
            continue
        (sole,) = inline.children
        if sole.type == "em":
            return render_inline_text(sole.children)
    return ""
|
||||
|
||||
|
||||
def _has_description(nodes: list[SyntaxTreeNode]) -> bool:
    """Report whether the section body starts with a description paragraph (_italic_)."""
    if not nodes:
        return False
    head = nodes[0]
    if head.type != "paragraph":
        return False
    # A description is a paragraph whose inline content is exactly one <em>.
    return any(
        child.type == "inline"
        and len(child.children) == 1
        and child.children[0].type == "em"
        for child in head.children
    )
|
||||
|
||||
|
||||
def _nodes_to_raw_markdown(nodes: list[SyntaxTreeNode], source_lines: list[str]) -> str:
    """Extract the raw markdown covering *nodes* via their source line mappings.

    Uses each node's ``map`` (start, end) line span; nodes without a map
    (e.g. inline-only nodes) are ignored.  Returns "" when nothing maps.
    """
    spans = [node.map for node in nodes if node.map is not None]
    if not spans:
        return ""
    start = min(span[0] for span in spans)
    end = max(span[1] for span in spans)
    return "\n".join(source_lines[start:end]).strip()
|
||||
|
||||
|
||||
# --- Stubs for Tasks 3 & 4 (replace in later tasks) -------------------------
|
||||
|
||||
|
||||
def _parse_section_entries(content_nodes: list[SyntaxTreeNode]) -> list[ParsedEntry]:
    """Stub (Task 3): will parse bullet-list entries out of a section's AST nodes."""
    return []
|
||||
|
||||
|
||||
def _render_section_html(content_nodes: list[SyntaxTreeNode]) -> str:
    """Stub (Task 4): will render a section's AST nodes to sanitized HTML."""
    return ""
|
||||
|
||||
|
||||
# --- Section splitting -------------------------------------------------------
|
||||
|
||||
|
||||
def _group_by_h2(
    nodes: list[SyntaxTreeNode],
    source_lines: list[str],
) -> list[ParsedSection]:
    """Group AST nodes into ParsedSection records, one per h2 heading.

    Nodes appearing before the first h2 are discarded; each section collects
    every node up to the next h2 (or end of input).
    """
    sections: list[ParsedSection] = []
    heading: str | None = None
    body: list[SyntaxTreeNode] = []

    def emit() -> None:
        # Finalize the section accumulated so far (no-op before the first h2).
        nonlocal heading
        if heading is None:
            return
        description = _extract_description(body)
        # Drop the leading italic paragraph from the content when present.
        rest = body[1:] if _has_description(body) else body
        parsed = _parse_section_entries(rest)
        sections.append(ParsedSection(
            name=heading,
            slug=slugify(heading),
            description=description,
            content=_nodes_to_raw_markdown(rest, source_lines),
            entries=parsed,
            # "also see" cross-references count toward the section total.
            entry_count=len(parsed) + sum(len(p["also_see"]) for p in parsed),
            preview=", ".join(p["name"] for p in parsed[:4]),
            content_html=_render_section_html(rest),
        ))
        heading = None

    for node in nodes:
        if node.type == "heading" and node.tag == "h2":
            emit()
            heading = _heading_text(node)
            body = []
        elif heading is not None:
            body.append(node)

    emit()
    return sections
|
||||
|
||||
|
||||
def parse_readme(text: str) -> tuple[list[ParsedSection], list[ParsedSection]]:
    """Parse README.md text into categories and resources.

    Returns (categories, resources) where each is a list of ParsedSection
    dicts.  Returns ([], []) when the document has no thematic break (---),
    since everything before the break is intro material rather than content.
    """
    md = MarkdownIt("commonmark")
    root = SyntaxTreeNode(md.parse(text))
    source_lines = text.split("\n")
    children = root.children

    # Categories begin after the first thematic break (---).
    hr_idx = next((i for i, node in enumerate(children) if node.type == "hr"), None)
    if hr_idx is None:
        return [], []

    # Locate the top-level "# Resources" / "# Contributing" boundaries.
    resources_idx: int | None = None
    contributing_idx: int | None = None
    for i, node in enumerate(children):
        if node.type == "heading" and node.tag == "h1":
            title = _heading_text(node)
            if title == "Resources":
                resources_idx = i
            elif title == "Contributing":
                contributing_idx = i

    # Slice into category and resource ranges.  Compare against None
    # explicitly: a boundary at index 0 is falsy, so `a or b` would
    # silently skip it.
    if resources_idx is not None:
        cat_end = resources_idx
    elif contributing_idx is not None:
        cat_end = contributing_idx
    else:
        cat_end = len(children)
    cat_nodes = children[hr_idx + 1 : cat_end]

    res_nodes: list[SyntaxTreeNode] = []
    if resources_idx is not None:
        res_end = contributing_idx if contributing_idx is not None else len(children)
        res_nodes = children[resources_idx + 1 : res_end]

    categories = _group_by_h2(cat_nodes, source_lines)
    resources = _group_by_h2(res_nodes, source_lines)

    return categories, resources
|
||||
|
||||
@@ -2,9 +2,10 @@
|
||||
|
||||
import os
|
||||
import sys
|
||||
import textwrap
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
from readme_parser import render_inline_html, render_inline_text
|
||||
from readme_parser import parse_readme, render_inline_html, render_inline_text
|
||||
|
||||
from markdown_it import MarkdownIt
|
||||
from markdown_it.tree import SyntaxTreeNode
|
||||
@@ -67,3 +68,135 @@ class TestRenderInlineText:
|
||||
def test_code_inline_kept(self):
    # Inline code spans contribute their literal text to the plain rendering.
    nodes = _parse_inline("`code` here")
    assert render_inline_text(nodes) == "code here"
|
||||
|
||||
|
||||
MINIMAL_README = textwrap.dedent("""\
|
||||
# Awesome Python
|
||||
|
||||
Some intro text.
|
||||
|
||||
---
|
||||
|
||||
## Alpha
|
||||
|
||||
_Libraries for alpha stuff._
|
||||
|
||||
- [lib-a](https://example.com/a) - Does A.
|
||||
- [lib-b](https://example.com/b) - Does B.
|
||||
|
||||
## Beta
|
||||
|
||||
_Tools for beta._
|
||||
|
||||
- [lib-c](https://example.com/c) - Does C.
|
||||
|
||||
# Resources
|
||||
|
||||
Where to discover resources.
|
||||
|
||||
## Newsletters
|
||||
|
||||
- [News One](https://example.com/n1)
|
||||
- [News Two](https://example.com/n2)
|
||||
|
||||
## Podcasts
|
||||
|
||||
- [Pod One](https://example.com/p1)
|
||||
|
||||
# Contributing
|
||||
|
||||
Please contribute!
|
||||
""")
|
||||
|
||||
|
||||
class TestParseReadmeSections:
    """End-to-end checks of parse_readme() against README fixtures."""

    def test_category_count(self):
        categories, _resources = parse_readme(MINIMAL_README)
        assert len(categories) == 2

    def test_resource_count(self):
        _categories, resources = parse_readme(MINIMAL_README)
        assert len(resources) == 2

    def test_category_names(self):
        categories, _ = parse_readme(MINIMAL_README)
        names = [section["name"] for section in categories]
        assert names[0] == "Alpha"
        assert names[1] == "Beta"

    def test_category_slugs(self):
        categories, _ = parse_readme(MINIMAL_README)
        slugs = [section["slug"] for section in categories]
        assert slugs[0] == "alpha"
        assert slugs[1] == "beta"

    def test_category_description(self):
        categories, _ = parse_readme(MINIMAL_README)
        assert categories[0]["description"] == "Libraries for alpha stuff."
        assert categories[1]["description"] == "Tools for beta."

    def test_category_content_has_entries(self):
        categories, _ = parse_readme(MINIMAL_README)
        alpha = categories[0]["content"]
        assert "lib-a" in alpha
        assert "lib-b" in alpha

    def test_resource_names(self):
        _, resources = parse_readme(MINIMAL_README)
        assert resources[0]["name"] == "Newsletters"
        assert resources[1]["name"] == "Podcasts"

    def test_resource_content(self):
        _, resources = parse_readme(MINIMAL_README)
        assert "News One" in resources[0]["content"]
        assert "Pod One" in resources[1]["content"]

    def test_contributing_skipped(self):
        # "# Contributing" is a boundary, never a parsed section.
        categories, resources = parse_readme(MINIMAL_README)
        all_names = [c["name"] for c in categories] + [r["name"] for r in resources]
        assert "Contributing" not in all_names

    def test_no_separator(self):
        # Without a thematic break there is nothing to parse.
        categories, resources = parse_readme("# Just a heading\n\nSome text.\n")
        assert categories == []
        assert resources == []

    def test_no_description(self):
        # A section whose first node is a list (no italic paragraph) gets "".
        readme = textwrap.dedent("""\
        # Title

        ---

        ## NullDesc

        - [item](https://x.com) - Thing.

        # Resources

        ## Tips

        - [tip](https://x.com)

        # Contributing

        Done.
        """)
        categories, _resources = parse_readme(readme)
        assert categories[0]["description"] == ""
        assert "item" in categories[0]["content"]

    def test_description_with_link_stripped(self):
        # Links inside the italic description collapse to their link text.
        readme = textwrap.dedent("""\
        # T

        ---

        ## Algos

        _Algorithms. Also see [awesome-algos](https://example.com)._

        - [lib](https://x.com) - Lib.

        # Contributing

        Done.
        """)
        categories, _ = parse_readme(readme)
        assert categories[0]["description"] == "Algorithms. Also see awesome-algos."
|
||||
|
||||
Reference in New Issue
Block a user