refactor: parse thematic groups from README bold markers instead of hardcoding them

The website builder previously relied on a hardcoded SECTION_GROUPS list in
build.py to organize categories into thematic groups. This was fragile: any
rename or addition to README.md required a matching code change.

Replace this with a parser-driven approach:
- readme_parser.py now detects bold-only paragraphs (**Group Name**) as
  group boundary markers and groups H2 categories beneath them into
  ParsedGroup structs.
- build.py drops SECTION_GROUPS entirely; group_categories() now just
  passes parsed groups through and appends the Resources group.
- sort.py is removed as it relied on the old flat section model.
- Tests updated throughout to reflect the new (groups, resources) return
  shape and to cover the new grouping logic.

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Vinta Chen
2026-03-20 18:43:09 +08:00
parent fd9b2665ed
commit 4322026817
5 changed files with 346 additions and 324 deletions
+107 -21
View File
@@ -32,6 +32,12 @@ class ParsedSection(TypedDict):
content_html: str # rendered HTML, properly escaped
class ParsedGroup(TypedDict):
name: str
slug: str
categories: list[ParsedSection]
# --- Slugify ----------------------------------------------------------------
_SLUG_NON_ALNUM_RE = re.compile(r"[^a-z0-9\s-]")
@@ -305,6 +311,25 @@ def _render_section_html(content_nodes: list[SyntaxTreeNode]) -> str:
# --- Section splitting -------------------------------------------------------
def _build_section(name: str, body: list[SyntaxTreeNode]) -> ParsedSection:
"""Build a ParsedSection from a heading name and its body nodes."""
desc = _extract_description(body)
content_nodes = body[1:] if desc else body
entries = _parse_section_entries(content_nodes)
entry_count = len(entries) + sum(len(e["also_see"]) for e in entries)
preview = ", ".join(e["name"] for e in entries[:4])
content_html = _render_section_html(content_nodes)
return ParsedSection(
name=name,
slug=slugify(name),
description=desc,
entries=entries,
entry_count=entry_count,
preview=preview,
content_html=content_html,
)
def _group_by_h2(
nodes: list[SyntaxTreeNode],
) -> list[ParsedSection]:
@@ -317,22 +342,7 @@ def _group_by_h2(
nonlocal current_name
if current_name is None:
return
desc = _extract_description(current_body)
content_nodes = current_body[1:] if desc else current_body
entries = _parse_section_entries(content_nodes)
entry_count = len(entries) + sum(len(e["also_see"]) for e in entries)
preview = ", ".join(e["name"] for e in entries[:4])
content_html = _render_section_html(content_nodes)
sections.append(ParsedSection(
name=current_name,
slug=slugify(current_name),
description=desc,
entries=entries,
entry_count=entry_count,
preview=preview,
content_html=content_html,
))
sections.append(_build_section(current_name, current_body))
current_name = None
for node in nodes:
@@ -347,10 +357,86 @@ def _group_by_h2(
return sections
def parse_readme(text: str) -> tuple[list[ParsedSection], list[ParsedSection]]:
"""Parse README.md text into categories and resources.
def _is_bold_marker(node: SyntaxTreeNode) -> str | None:
"""Detect a bold-only paragraph used as a group marker.
Returns (categories, resources) where each is a list of ParsedSection dicts.
Pattern: a paragraph whose only content is **Group Name** (possibly
surrounded by empty text nodes in the AST).
Returns the group name text, or None if not a group marker.
"""
if node.type != "paragraph":
return None
for child in node.children:
if child.type != "inline":
continue
# Filter out empty text nodes that markdown-it inserts around strong
meaningful = [c for c in child.children if not (c.type == "text" and c.content == "")]
if len(meaningful) == 1 and meaningful[0].type == "strong":
return render_inline_text(meaningful[0].children)
return None
def _parse_grouped_sections(
nodes: list[SyntaxTreeNode],
) -> list[ParsedGroup]:
"""Parse nodes into groups of categories using bold markers as group boundaries.
Bold-only paragraphs (**Group Name**) delimit groups. H2 headings under each
bold marker become categories within that group. Categories appearing before
any bold marker go into an "Other" group.
"""
groups: list[ParsedGroup] = []
current_group_name: str | None = None
current_group_cats: list[ParsedSection] = []
current_cat_name: str | None = None
current_cat_body: list[SyntaxTreeNode] = []
def flush_cat() -> None:
nonlocal current_cat_name
if current_cat_name is None:
return
current_group_cats.append(_build_section(current_cat_name, current_cat_body))
current_cat_name = None
def flush_group() -> None:
nonlocal current_group_name, current_group_cats
if not current_group_cats:
current_group_name = None
current_group_cats = []
return
name = current_group_name or "Other"
groups.append(ParsedGroup(
name=name,
slug=slugify(name),
categories=list(current_group_cats),
))
current_group_name = None
current_group_cats = []
for node in nodes:
bold_name = _is_bold_marker(node)
if bold_name is not None:
flush_cat()
flush_group()
current_group_name = bold_name
current_cat_body = []
elif node.type == "heading" and node.tag == "h2":
flush_cat()
current_cat_name = _heading_text(node)
current_cat_body = []
elif current_cat_name is not None:
current_cat_body.append(node)
flush_cat()
flush_group()
return groups
def parse_readme(text: str) -> tuple[list[ParsedGroup], list[ParsedSection]]:
"""Parse README.md text into grouped categories and resources.
Returns (groups, resources) where groups is a list of ParsedGroup dicts
containing nested categories, and resources is a flat list of ParsedSection.
"""
md = MarkdownIt("commonmark")
tokens = md.parse(text)
@@ -382,7 +468,7 @@ def parse_readme(text: str) -> tuple[list[ParsedSection], list[ParsedSection]]:
res_end = contributing_idx or len(children)
res_nodes = children[resources_idx + 1 : res_end]
categories = _group_by_h2(cat_nodes)
groups = _parse_grouped_sections(cat_nodes)
resources = _group_by_h2(res_nodes)
return categories, resources
return groups, resources