"""Parse README.md into structured section data using markdown-it-py AST."""
from __future__ import annotations
import re
from typing import TypedDict
from markdown_it import MarkdownIt
from markdown_it.tree import SyntaxTreeNode
from markupsafe import escape
class AlsoSee(TypedDict):
    """A named link stored in the ``also_see`` list of a ``ParsedEntry``."""

    name: str  # label of the link (populated by code not visible in this stub stage)
    url: str  # target of the link
class ParsedEntry(TypedDict):
    """One entry of a section, plus any related ``also_see`` links."""

    name: str  # entry name; the first few names form a section's preview
    url: str  # link target of the entry
    description: str  # inline HTML, properly escaped
    also_see: list[AlsoSee]  # related links; each one counts toward entry_count
class ParsedSection(TypedDict):
    """Structured data for one h2 section, as assembled by ``_group_by_h2``."""

    name: str  # plain text of the section's h2 heading
    slug: str  # slugify(name)
    description: str  # plain text, links resolved to text
    content: str  # raw markdown (backward compat)
    entries: list[ParsedEntry]  # entries parsed from the section body
    entry_count: int  # len(entries) plus the number of their also_see links
    preview: str  # names of the first four entries joined with ", "
    content_html: str  # rendered HTML, properly escaped
# --- Slugify ----------------------------------------------------------------
# Anything that is not a lowercase ASCII letter, digit, whitespace or dash.
_SLUG_NON_ALNUM_RE = re.compile(r"[^a-z0-9\s-]")
# One or more whitespace characters (collapsed into a single dash).
_SLUG_WHITESPACE_RE = re.compile(r"[\s]+")
# Runs of dashes (collapsed into a single dash).
_SLUG_MULTI_DASH_RE = re.compile(r"-+")


def slugify(name: str) -> str:
    """Turn a category name into a URL-friendly slug.

    The name is lowercased, stripped of every character other than
    ``a-z``, ``0-9``, whitespace and ``-``, trimmed, and then has each
    whitespace run and each dash run reduced to a single ``-``.
    """
    lowered = name.lower()
    cleaned = _SLUG_NON_ALNUM_RE.sub("", lowered).strip()
    dashed = _SLUG_WHITESPACE_RE.sub("-", cleaned)
    return _SLUG_MULTI_DASH_RE.sub("-", dashed)
# --- Inline renderers -------------------------------------------------------
def render_inline_html(children: list[SyntaxTreeNode]) -> str:
"""Render inline AST nodes to HTML with proper escaping."""
parts: list[str] = []
for child in children:
match child.type:
case "text":
parts.append(str(escape(child.content)))
case "softbreak":
parts.append(" ")
case "link":
href = str(escape(child.attrGet("href") or ""))
inner = render_inline_html(child.children)
parts.append(
f'{inner}'
)
case "em":
parts.append(f"{render_inline_html(child.children)}")
case "strong":
parts.append(f"{render_inline_html(child.children)}")
case "code_inline":
parts.append(f"{escape(child.content)}")
case "html_inline":
parts.append(str(escape(child.content)))
return "".join(parts)
def render_inline_text(children: list[SyntaxTreeNode]) -> str:
    """Flatten inline AST nodes into plain text.

    Text and inline code contribute their content, a soft break becomes a
    single space, and emphasis / strong / link nodes contribute the text
    of their children. Every other node type is ignored.
    """
    pieces: list[str] = []
    for node in children:
        kind = node.type
        if kind in ("text", "code_inline"):
            pieces.append(node.content)
        elif kind == "softbreak":
            pieces.append(" ")
        elif kind in ("em", "strong", "link"):
            # Containers: recurse and keep only the textual payload.
            pieces.append(render_inline_text(node.children))
    return "".join(pieces)
# --- AST helpers -------------------------------------------------------------
def _heading_text(node: SyntaxTreeNode) -> str:
    """Return the plain text of a heading node.

    The text comes from the heading's first ``inline`` child; a heading
    without one yields an empty string.
    """
    inline = next((c for c in node.children if c.type == "inline"), None)
    if inline is None:
        return ""
    return render_inline_text(inline.children)
def _extract_description(nodes: list[SyntaxTreeNode]) -> str:
    """Return the section description held in a leading italic paragraph.

    The description is recognised only when the first node is a paragraph
    whose inline content is exactly one emphasis node, e.g.
    ``_Libraries for foo._`` gives ``"Libraries for foo."``. In every
    other case the result is an empty string.
    """
    if not nodes or nodes[0].type != "paragraph":
        return ""
    for inline in nodes[0].children:
        if inline.type != "inline":
            continue
        kids = inline.children
        if len(kids) == 1 and kids[0].type == "em":
            return render_inline_text(kids[0].children)
    return ""
def _has_description(nodes: list[SyntaxTreeNode]) -> bool:
    """Tell whether the node list opens with a description paragraph.

    A description paragraph is a paragraph whose inline content consists
    of a single emphasis node (``_italic text_``).
    """
    if not nodes or nodes[0].type != "paragraph":
        return False
    return any(
        inline.type == "inline"
        and len(inline.children) == 1
        and inline.children[0].type == "em"
        for inline in nodes[0].children
    )
def _nodes_to_raw_markdown(nodes: list[SyntaxTreeNode], source_lines: list[str]) -> str:
    """Recover the raw markdown covered by a group of AST nodes.

    Each node's ``map`` is read as a ``(start, end)`` pair of indexes into
    ``source_lines``. The result is the stripped text from the smallest
    start to the largest end. Nodes without a map are skipped; if no node
    has one, the result is an empty string.
    """
    spans = [span for node in nodes if (span := node.map) is not None]
    if not spans:
        return ""
    first_line = min(span[0] for span in spans)
    last_line = max(span[1] for span in spans)
    return "\n".join(source_lines[first_line:last_line]).strip()
# --- Stubs for Tasks 3 & 4 (replace in later tasks) -------------------------
def _parse_section_entries(content_nodes: list[SyntaxTreeNode]) -> list[ParsedEntry]:
    """Placeholder: report no entries, whatever the section contains."""
    no_entries: list[ParsedEntry] = []
    return no_entries
def _render_section_html(content_nodes: list[SyntaxTreeNode]) -> str:
    """Placeholder: render every section body as an empty string."""
    rendered = ""
    return rendered
# --- Section splitting -------------------------------------------------------
def _group_by_h2(
    nodes: list[SyntaxTreeNode],
    source_lines: list[str],
) -> list[ParsedSection]:
    """Split a flat node list into one ``ParsedSection`` per h2 heading.

    Every h2 opens a new section named after the heading text; the nodes
    that follow, up to the next h2, form its body. Nodes that appear
    before the first h2 belong to no section and are discarded.
    """

    def build(title: str, body: list[SyntaxTreeNode]) -> ParsedSection:
        # A leading italic paragraph is the description, not content.
        description = _extract_description(body)
        content_nodes = body[1:] if _has_description(body) else body
        raw_markdown = _nodes_to_raw_markdown(content_nodes, source_lines)
        entries = _parse_section_entries(content_nodes)
        also_see_total = sum(len(entry["also_see"]) for entry in entries)
        return ParsedSection(
            name=title,
            slug=slugify(title),
            description=description,
            content=raw_markdown,
            entries=entries,
            entry_count=len(entries) + also_see_total,
            preview=", ".join(entry["name"] for entry in entries[:4]),
            content_html=_render_section_html(content_nodes),
        )

    grouped: list[tuple[str, list[SyntaxTreeNode]]] = []
    for node in nodes:
        if node.type == "heading" and node.tag == "h2":
            grouped.append((_heading_text(node), []))
        elif grouped:
            grouped[-1][1].append(node)
    return [build(title, body) for title, body in grouped]
def parse_readme(text: str) -> tuple[list[ParsedSection], list[ParsedSection]]:
    """Parse README.md text into categories and resources.

    Expected layout: introductory material, a thematic break (``---``),
    the category sections (h2), then optionally a ``# Resources`` h1
    followed by resource sections (h2), then optionally a
    ``# Contributing`` h1 that ends the parsed region.

    Only h1 headings located after the thematic break act as boundaries.
    If one of those titles occurs more than once, the last occurrence is
    used.

    Args:
        text: Full markdown source of the README.

    Returns:
        ``(categories, resources)`` where each is a list of ParsedSection
        dicts. Both lists are empty when the text has no thematic break.
    """
    md = MarkdownIt("commonmark")
    root = SyntaxTreeNode(md.parse(text))
    # _nodes_to_raw_markdown slices these lines using each node's map.
    source_lines = text.split("\n")
    children = root.children
    total = len(children)

    # Find thematic break (---): everything before it is preamble.
    hr_idx = next(
        (i for i, node in enumerate(children) if node.type == "hr"),
        None,
    )
    if hr_idx is None:
        return [], []

    # Find # Resources and # Contributing boundaries after the break.
    resources_idx: int | None = None
    contributing_idx: int | None = None
    for i in range(hr_idx + 1, total):
        node = children[i]
        if node.type != "heading" or node.tag != "h1":
            continue
        title = _heading_text(node)
        if title == "Resources":
            resources_idx = i
        elif title == "Contributing":
            contributing_idx = i

    # Categories stop at whichever boundary comes first. Compare against
    # None explicitly so a valid index is never mistaken for "missing".
    boundaries = [i for i in (resources_idx, contributing_idx) if i is not None]
    cat_end = min(boundaries, default=total)
    cat_nodes = children[hr_idx + 1 : cat_end]

    res_nodes: list[SyntaxTreeNode] = []
    if resources_idx is not None:
        # Contributing only ends the resources when it follows them;
        # otherwise the resources run to the end of the document.
        if contributing_idx is not None and contributing_idx > resources_idx:
            res_end = contributing_idx
        else:
            res_end = total
        res_nodes = children[resources_idx + 1 : res_end]

    categories = _group_by_h2(cat_nodes, source_lines)
    resources = _group_by_h2(res_nodes, source_lines)
    return categories, resources