feat: replace regex README parser with markdown-it-py AST parser

Introduce parse_readme() which uses MarkdownIt to build a full AST
instead of line-by-line regex matching. The function splits the document
at the thematic break, groups nodes by h2 heading, extracts category
descriptions from leading italic paragraphs, and separates the
Categories, Resources, and Contributing sections cleanly.

Add markdown-it-py==4.0.0 (+ mdurl) as a runtime dependency to support
the new parser.

Tests cover section counts, names, slugs, descriptions, content
presence, boundary conditions (no separator, no description), and mixed
description markup.

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Vinta Chen
2026-03-18 17:21:49 +08:00
parent 5fa7c7d1a6
commit 1c67c9f0e6
4 changed files with 325 additions and 1 deletion

View File

@@ -7,6 +7,7 @@ dependencies = [
"httpx==0.28.1",
"jinja2==3.1.6",
"markdown==3.10.2",
"markdown-it-py==4.0.0",
]
[dependency-groups]

23
uv.lock generated
View File

@@ -22,6 +22,7 @@ dependencies = [
{ name = "httpx" },
{ name = "jinja2" },
{ name = "markdown" },
{ name = "markdown-it-py" },
]
[package.dev-dependencies]
@@ -35,6 +36,7 @@ requires-dist = [
{ name = "httpx", specifier = "==0.28.1" },
{ name = "jinja2", specifier = "==3.1.6" },
{ name = "markdown", specifier = "==3.10.2" },
{ name = "markdown-it-py", specifier = "==4.0.0" },
]
[package.metadata.requires-dev]
@@ -137,6 +139,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/de/1f/77fa3081e4f66ca3576c896ae5d31c3002ac6607f9747d2e3aa49227e464/markdown-3.10.2-py3-none-any.whl", hash = "sha256:e91464b71ae3ee7afd3017d9f358ef0baf158fd9a298db92f1d4761133824c36", size = 108180, upload-time = "2026-02-09T14:57:25.787Z" },
]
[[package]]
name = "markdown-it-py"
version = "4.0.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "mdurl" },
]
sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" },
]
[[package]]
name = "markupsafe"
version = "3.0.3"
@@ -189,6 +203,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = "2025-09-27T18:37:28.327Z" },
]
[[package]]
name = "mdurl"
version = "0.1.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" },
]
[[package]]
name = "packaging"
version = "26.0"

View File

@@ -5,6 +5,7 @@ from __future__ import annotations
import re
from typing import TypedDict
from markdown_it import MarkdownIt
from markdown_it.tree import SyntaxTreeNode
from markupsafe import escape
@@ -91,3 +92,169 @@ def render_inline_text(children: list[SyntaxTreeNode]) -> str:
case "em" | "strong" | "link":
parts.append(render_inline_text(child.children))
return "".join(parts)
# --- AST helpers -------------------------------------------------------------
def _heading_text(node: SyntaxTreeNode) -> str:
"""Extract plain text from a heading node."""
for child in node.children:
if child.type == "inline":
return render_inline_text(child.children)
return ""
def _extract_description(nodes: list[SyntaxTreeNode]) -> str:
"""Extract description from the first paragraph if it's a single <em> block.
Pattern: _Libraries for foo._ -> "Libraries for foo."
"""
if not nodes:
return ""
first = nodes[0]
if first.type != "paragraph":
return ""
for child in first.children:
if child.type == "inline" and len(child.children) == 1:
em = child.children[0]
if em.type == "em":
return render_inline_text(em.children)
return ""
def _has_description(nodes: list[SyntaxTreeNode]) -> bool:
"""Check if the first node is a description paragraph (_italic text_)."""
if not nodes:
return False
first = nodes[0]
if first.type != "paragraph":
return False
for child in first.children:
if child.type == "inline" and len(child.children) == 1:
if child.children[0].type == "em":
return True
return False
def _nodes_to_raw_markdown(nodes: list[SyntaxTreeNode], source_lines: list[str]) -> str:
"""Extract raw markdown text for AST nodes using source line mappings."""
if not nodes:
return ""
start_line = None
end_line = None
for node in nodes:
node_map = node.map
if node_map is not None:
if start_line is None or node_map[0] < start_line:
start_line = node_map[0]
if end_line is None or node_map[1] > end_line:
end_line = node_map[1]
if start_line is None:
return ""
return "\n".join(source_lines[start_line:end_line]).strip()
# --- Stubs for Tasks 3 & 4 (replace in later tasks) -------------------------
def _parse_section_entries(content_nodes: list[SyntaxTreeNode]) -> list[ParsedEntry]:
    """Stub (Task 3): will parse bullet-list items into ParsedEntry dicts."""
    return []
def _render_section_html(content_nodes: list[SyntaxTreeNode]) -> str:
    """Stub (Task 4): will render the section body nodes to HTML."""
    return ""
# --- Section splitting -------------------------------------------------------
def _group_by_h2(
    nodes: list[SyntaxTreeNode],
    source_lines: list[str],
) -> list[ParsedSection]:
    """Split a run of AST nodes into one ParsedSection per h2 heading.

    Nodes appearing before the first h2 heading are ignored; everything
    between one h2 and the next becomes that section's body.
    """
    sections: list[ParsedSection] = []
    heading: str | None = None
    body: list[SyntaxTreeNode] = []

    def close_section() -> None:
        # Finalize the section being accumulated, if any.
        nonlocal heading
        if heading is None:
            return
        description = _extract_description(body)
        # Drop the leading italic description paragraph from the content.
        content_nodes = body[1:] if _has_description(body) else body
        entries = _parse_section_entries(content_nodes)
        sections.append(ParsedSection(
            name=heading,
            slug=slugify(heading),
            description=description,
            content=_nodes_to_raw_markdown(content_nodes, source_lines),
            entries=entries,
            # Top-level entries plus their "also see" companions.
            entry_count=len(entries) + sum(len(e["also_see"]) for e in entries),
            preview=", ".join(e["name"] for e in entries[:4]),
            content_html=_render_section_html(content_nodes),
        ))
        heading = None

    for node in nodes:
        if node.type == "heading" and node.tag == "h2":
            close_section()
            heading = _heading_text(node)
            body = []
        elif heading is not None:
            body.append(node)
    close_section()
    return sections
def parse_readme(text: str) -> tuple[list[ParsedSection], list[ParsedSection]]:
    """Parse README.md text into categories and resources.

    The document is split at the first thematic break (---). Sections
    (h2 groups) before the "# Resources" h1 become categories; sections
    between "# Resources" and "# Contributing" become resources.

    Returns (categories, resources); both are empty when the document
    has no thematic break.
    """
    md = MarkdownIt("commonmark")
    tokens = md.parse(text)
    root = SyntaxTreeNode(tokens)
    source_lines = text.split("\n")
    children = root.children

    # Find the first thematic break (---); everything before it is intro.
    hr_idx = next((i for i, node in enumerate(children) if node.type == "hr"), None)
    if hr_idx is None:
        return [], []

    # Find the "# Resources" and "# Contributing" h1 boundaries.
    resources_idx: int | None = None
    contributing_idx: int | None = None
    for i, node in enumerate(children):
        if node.type == "heading" and node.tag == "h1":
            title = _heading_text(node)
            if title == "Resources":
                resources_idx = i
            elif title == "Contributing":
                contributing_idx = i

    # BUG FIX: the previous `resources_idx or contributing_idx or len(...)`
    # treated a legitimate index of 0 as "not found" (0 is falsy); compare
    # against None explicitly instead.
    if resources_idx is not None:
        cat_end = resources_idx
    elif contributing_idx is not None:
        cat_end = contributing_idx
    else:
        cat_end = len(children)
    cat_nodes = children[hr_idx + 1 : cat_end]

    res_nodes: list[SyntaxTreeNode] = []
    if resources_idx is not None:
        res_end = contributing_idx if contributing_idx is not None else len(children)
        res_nodes = children[resources_idx + 1 : res_end]

    categories = _group_by_h2(cat_nodes, source_lines)
    resources = _group_by_h2(res_nodes, source_lines)
    return categories, resources

View File

@@ -2,9 +2,10 @@
import os
import sys
import textwrap
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from readme_parser import render_inline_html, render_inline_text
from readme_parser import parse_readme, render_inline_html, render_inline_text
from markdown_it import MarkdownIt
from markdown_it.tree import SyntaxTreeNode
@@ -67,3 +68,135 @@ class TestRenderInlineText:
def test_code_inline_kept(self):
    """Inline code spans are flattened to their literal text."""
    parsed = _parse_inline("`code` here")
    assert render_inline_text(parsed) == "code here"
# Fixture README: intro, thematic break, two categories with italic
# descriptions, a Resources section with two h2 groups, and Contributing.
# NOTE: the blank line between "Some intro text." and "---" is essential:
# without it CommonMark parses the pair as a setext h2 heading rather than
# paragraph + thematic break, and parse_readme() would return ([], []).
MINIMAL_README = textwrap.dedent("""\
    # Awesome Python

    Some intro text.

    ---

    ## Alpha

    _Libraries for alpha stuff._

    - [lib-a](https://example.com/a) - Does A.
    - [lib-b](https://example.com/b) - Does B.

    ## Beta

    _Tools for beta._

    - [lib-c](https://example.com/c) - Does C.

    # Resources

    Where to discover resources.

    ## Newsletters

    - [News One](https://example.com/n1)
    - [News Two](https://example.com/n2)

    ## Podcasts

    - [Pod One](https://example.com/p1)

    # Contributing

    Please contribute!
    """)
class TestParseReadmeSections:
    """Integration tests for parse_readme() section splitting and metadata."""

    def test_category_count(self):
        """The two h2 sections between the hr and # Resources are categories."""
        cats, resources = parse_readme(MINIMAL_README)
        assert len(cats) == 2

    def test_resource_count(self):
        """The two h2 sections under # Resources are resource groups."""
        cats, resources = parse_readme(MINIMAL_README)
        assert len(resources) == 2

    def test_category_names(self):
        """Section names come from the h2 heading text."""
        cats, _ = parse_readme(MINIMAL_README)
        assert cats[0]["name"] == "Alpha"
        assert cats[1]["name"] == "Beta"

    def test_category_slugs(self):
        """Slugs are derived from the heading name via slugify()."""
        cats, _ = parse_readme(MINIMAL_README)
        assert cats[0]["slug"] == "alpha"
        assert cats[1]["slug"] == "beta"

    def test_category_description(self):
        """Leading italic paragraphs become descriptions with markup stripped."""
        cats, _ = parse_readme(MINIMAL_README)
        assert cats[0]["description"] == "Libraries for alpha stuff."
        assert cats[1]["description"] == "Tools for beta."

    def test_category_content_has_entries(self):
        """Raw markdown content retains the bullet-list entries."""
        cats, _ = parse_readme(MINIMAL_README)
        assert "lib-a" in cats[0]["content"]
        assert "lib-b" in cats[0]["content"]

    def test_resource_names(self):
        """Resource groups are named after their h2 headings."""
        _, resources = parse_readme(MINIMAL_README)
        assert resources[0]["name"] == "Newsletters"
        assert resources[1]["name"] == "Podcasts"

    def test_resource_content(self):
        """Resource content carries the raw markdown of each group."""
        _, resources = parse_readme(MINIMAL_README)
        assert "News One" in resources[0]["content"]
        assert "Pod One" in resources[1]["content"]

    def test_contributing_skipped(self):
        """The # Contributing section is excluded from both result lists."""
        cats, resources = parse_readme(MINIMAL_README)
        all_names = [c["name"] for c in cats] + [r["name"] for r in resources]
        assert "Contributing" not in all_names

    def test_no_separator(self):
        """Without a thematic break the parser returns two empty lists."""
        cats, resources = parse_readme("# Just a heading\n\nSome text.\n")
        assert cats == []
        assert resources == []

    def test_no_description(self):
        """A section without a leading italic paragraph gets an empty description."""
        readme = textwrap.dedent("""\
            # Title
            ---
            ## NullDesc
            - [item](https://x.com) - Thing.
            # Resources
            ## Tips
            - [tip](https://x.com)
            # Contributing
            Done.
            """)
        cats, resources = parse_readme(readme)
        assert cats[0]["description"] == ""
        assert "item" in cats[0]["content"]

    def test_description_with_link_stripped(self):
        """Links inside the italic description are reduced to their text."""
        readme = textwrap.dedent("""\
            # T
            ---
            ## Algos
            _Algorithms. Also see [awesome-algos](https://example.com)._
            - [lib](https://x.com) - Lib.
            # Contributing
            Done.
            """)
        cats, _ = parse_readme(readme)
        assert cats[0]["description"] == "Algorithms. Also see awesome-algos."