"""Parse README.md into structured section data using markdown-it-py AST.""" from __future__ import annotations import re from typing import TypedDict from markdown_it import MarkdownIt from markdown_it.tree import SyntaxTreeNode from markupsafe import escape class AlsoSee(TypedDict): name: str url: str class ParsedEntry(TypedDict): name: str url: str description: str # inline HTML, properly escaped also_see: list[AlsoSee] class ParsedSection(TypedDict): name: str slug: str description: str # plain text, links resolved to text content: str # raw markdown (backward compat) entries: list[ParsedEntry] entry_count: int preview: str content_html: str # rendered HTML, properly escaped # --- Slugify ---------------------------------------------------------------- _SLUG_NON_ALNUM_RE = re.compile(r"[^a-z0-9\s-]") _SLUG_WHITESPACE_RE = re.compile(r"[\s]+") _SLUG_MULTI_DASH_RE = re.compile(r"-+") def slugify(name: str) -> str: """Convert a category name to a URL-friendly slug.""" slug = name.lower() slug = _SLUG_NON_ALNUM_RE.sub("", slug) slug = _SLUG_WHITESPACE_RE.sub("-", slug.strip()) slug = _SLUG_MULTI_DASH_RE.sub("-", slug) return slug # --- Inline renderers ------------------------------------------------------- def render_inline_html(children: list[SyntaxTreeNode]) -> str: """Render inline AST nodes to HTML with proper escaping.""" parts: list[str] = [] for child in children: match child.type: case "text": parts.append(str(escape(child.content))) case "softbreak": parts.append(" ") case "link": href = str(escape(child.attrGet("href") or "")) inner = render_inline_html(child.children) parts.append( f'{inner}' ) case "em": parts.append(f"{render_inline_html(child.children)}") case "strong": parts.append(f"{render_inline_html(child.children)}") case "code_inline": parts.append(f"{escape(child.content)}") case "html_inline": parts.append(str(escape(child.content))) return "".join(parts) def render_inline_text(children: list[SyntaxTreeNode]) -> str: """Render inline AST 
nodes to plain text (links become their text).""" parts: list[str] = [] for child in children: match child.type: case "text": parts.append(child.content) case "softbreak": parts.append(" ") case "code_inline": parts.append(child.content) case "em" | "strong" | "link": parts.append(render_inline_text(child.children)) return "".join(parts) # --- AST helpers ------------------------------------------------------------- def _heading_text(node: SyntaxTreeNode) -> str: """Extract plain text from a heading node.""" for child in node.children: if child.type == "inline": return render_inline_text(child.children) return "" def _extract_description(nodes: list[SyntaxTreeNode]) -> str: """Extract description from the first paragraph if it's a single block. Pattern: _Libraries for foo._ -> "Libraries for foo." """ if not nodes: return "" first = nodes[0] if first.type != "paragraph": return "" for child in first.children: if child.type == "inline" and len(child.children) == 1: em = child.children[0] if em.type == "em": return render_inline_text(em.children) return "" def _has_description(nodes: list[SyntaxTreeNode]) -> bool: """Check if the first node is a description paragraph (_italic text_).""" if not nodes: return False first = nodes[0] if first.type != "paragraph": return False for child in first.children: if child.type == "inline" and len(child.children) == 1: if child.children[0].type == "em": return True return False def _nodes_to_raw_markdown(nodes: list[SyntaxTreeNode], source_lines: list[str]) -> str: """Extract raw markdown text for AST nodes using source line mappings.""" if not nodes: return "" start_line = None end_line = None for node in nodes: node_map = node.map if node_map is not None: if start_line is None or node_map[0] < start_line: start_line = node_map[0] if end_line is None or node_map[1] > end_line: end_line = node_map[1] if start_line is None: return "" return "\n".join(source_lines[start_line:end_line]).strip() # --- Entry extraction 
_DESC_SEP_RE = re.compile(r"^\s*[-\u2013\u2014]\s*")


def _find_inline(node: SyntaxTreeNode) -> SyntaxTreeNode | None:
    """Locate the inline node inside a list_item's paragraph, if any."""
    for child in node.children:
        if child.type != "paragraph":
            continue
        for grandchild in child.children:
            if grandchild.type == "inline":
                return grandchild
    return None


def _find_first_link(inline: SyntaxTreeNode) -> SyntaxTreeNode | None:
    """Return the first link node among inline children, or None."""
    return next((c for c in inline.children if c.type == "link"), None)


def _find_child(node: SyntaxTreeNode, child_type: str) -> SyntaxTreeNode | None:
    """Return the first direct child of the given type, or None."""
    return next((c for c in node.children if c.type == child_type), None)


def _extract_description_html(inline: SyntaxTreeNode, first_link: SyntaxTreeNode) -> str:
    """Extract description HTML from inline content after the first link.

    AST: [link("name"), text(" - Description.")] -> "Description."
    The separator (- / en-dash / em-dash) is stripped.
    """
    idx = None
    for i, candidate in enumerate(inline.children):
        if candidate is first_link:
            idx = i
            break
    if idx is None:
        return ""
    tail = inline.children[idx + 1 :]
    if not tail:
        return ""
    return _DESC_SEP_RE.sub("", render_inline_html(tail))


def _parse_list_entries(bullet_list: SyntaxTreeNode) -> list[ParsedEntry]:
    """Extract entries from a bullet_list AST node.

    Handles three patterns:
    - Text-only list_item -> subcategory label -> recurse into nested list
    - Link list_item with nested link-only items -> entry with also_see
    - Link list_item without nesting -> simple entry
    """
    entries: list[ParsedEntry] = []
    for item in bullet_list.children:
        if item.type != "list_item":
            continue
        inline = _find_inline(item)
        if inline is None:
            continue
        lead_link = _find_first_link(inline)
        sublist = _find_child(item, "bullet_list")
        if lead_link is None:
            # Subcategory label — flatten entries out of the nested list.
            if sublist:
                entries.extend(_parse_list_entries(sublist))
            continue
        # Nested link-only items become "also see" companions of this entry.
        also_see: list[AlsoSee] = []
        if sublist:
            for sub in sublist.children:
                if sub.type != "list_item":
                    continue
                sub_inline = _find_inline(sub)
                sub_link = _find_first_link(sub_inline) if sub_inline else None
                if sub_link is not None:
                    also_see.append(AlsoSee(
                        name=render_inline_text(sub_link.children),
                        url=sub_link.attrGet("href") or "",
                    ))
        entries.append(ParsedEntry(
            name=render_inline_text(lead_link.children),
            url=lead_link.attrGet("href") or "",
            description=_extract_description_html(inline, lead_link),
            also_see=also_see,
        ))
    return entries


def _parse_section_entries(content_nodes: list[SyntaxTreeNode]) -> list[ParsedEntry]:
    """Extract all entries from a section's content nodes."""
    return [
        entry
        for node in content_nodes
        if node.type == "bullet_list"
        for entry in _parse_list_entries(node)
    ]


# --- Content HTML rendering (stub for Task 4) --------------------------------


def _render_section_html(content_nodes: list[SyntaxTreeNode]) -> str:
    """Render a section's content to HTML — not yet implemented (Task 4)."""
    return ""


# --- Section splitting -------------------------------------------------------
def _group_by_h2(
    nodes: list[SyntaxTreeNode],
    source_lines: list[str],
) -> list[ParsedSection]:
    """Group AST nodes into sections by h2 headings.

    Nodes before the first h2 are ignored; each h2 opens a section that
    collects following nodes until the next h2 (or the end).
    """
    sections: list[ParsedSection] = []
    current_name: str | None = None
    current_body: list[SyntaxTreeNode] = []

    def flush() -> None:
        # Finalize the currently open section, if any, and append it.
        nonlocal current_name
        if current_name is None:
            return
        desc = _extract_description(current_body)
        # Drop the leading _italic_ description paragraph from the content.
        content_nodes = current_body[1:] if _has_description(current_body) else current_body
        content = _nodes_to_raw_markdown(content_nodes, source_lines)
        entries = _parse_section_entries(content_nodes)
        # Count nested also_see items toward the section's entry total.
        entry_count = len(entries) + sum(len(e["also_see"]) for e in entries)
        preview = ", ".join(e["name"] for e in entries[:4])
        content_html = _render_section_html(content_nodes)
        sections.append(ParsedSection(
            name=current_name,
            slug=slugify(current_name),
            description=desc,
            content=content,
            entries=entries,
            entry_count=entry_count,
            preview=preview,
            content_html=content_html,
        ))
        current_name = None

    for node in nodes:
        if node.type == "heading" and node.tag == "h2":
            flush()
            current_name = _heading_text(node)
            current_body = []
        elif current_name is not None:
            current_body.append(node)
    flush()
    return sections


def parse_readme(text: str) -> tuple[list[ParsedSection], list[ParsedSection]]:
    """Parse README.md text into categories and resources.

    Returns (categories, resources) where each is a list of ParsedSection
    dicts. Categories are the h2 sections between the first thematic break
    (---) and the "# Resources" h1; resources are the h2 sections between
    "# Resources" and "# Contributing". Returns ([], []) if no thematic
    break is present.
    """
    md = MarkdownIt("commonmark")
    tokens = md.parse(text)
    root = SyntaxTreeNode(tokens)
    source_lines = text.split("\n")
    children = root.children

    # Find the thematic break (---); category sections start after it.
    hr_idx = None
    for i, node in enumerate(children):
        if node.type == "hr":
            hr_idx = i
            break
    if hr_idx is None:
        return [], []

    # Find "# Resources" and "# Contributing" h1 boundaries.
    resources_idx = None
    contributing_idx = None
    for i, node in enumerate(children):
        if node.type == "heading" and node.tag == "h1":
            text_content = _heading_text(node)
            if text_content == "Resources":
                resources_idx = i
            elif text_content == "Contributing":
                contributing_idx = i

    # FIX: compare against None explicitly — the previous
    # `resources_idx or contributing_idx or len(children)` tested truthiness,
    # so a heading at index 0 (a falsy but valid position) was skipped and
    # the slices silently mis-computed.
    if resources_idx is not None:
        cat_end = resources_idx
    elif contributing_idx is not None:
        cat_end = contributing_idx
    else:
        cat_end = len(children)
    cat_nodes = children[hr_idx + 1 : cat_end]

    res_nodes: list[SyntaxTreeNode] = []
    if resources_idx is not None:
        res_end = contributing_idx if contributing_idx is not None else len(children)
        res_nodes = children[resources_idx + 1 : res_end]

    categories = _group_by_h2(cat_nodes, source_lines)
    resources = _group_by_h2(res_nodes, source_lines)
    return categories, resources