mirror of
https://github.com/vinta/awesome-python.git
synced 2026-03-24 01:24:43 +08:00
Introduce parse_readme() which uses MarkdownIt to build a full AST instead of line-by-line regex matching. The function splits the document at the thematic break, groups nodes by h2 heading, extracts category descriptions from leading italic paragraphs, and separates the Categories, Resources, and Contributing sections cleanly. Add markdown-it-py==4.0.0 (+ mdurl) as a runtime dependency to support the new parser. Tests cover section counts, names, slugs, descriptions, content presence, boundary conditions (no separator, no description), and mixed description markup. Co-Authored-By: Claude <noreply@anthropic.com>
261 lines
8.1 KiB
Python
261 lines
8.1 KiB
Python
"""Parse README.md into structured section data using markdown-it-py AST."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from typing import TypedDict
|
|
|
|
from markdown_it import MarkdownIt
|
|
from markdown_it.tree import SyntaxTreeNode
|
|
from markupsafe import escape
|
|
|
|
|
|
class AlsoSee(TypedDict):
    """A name/URL pair stored in the ``also_see`` list of a ``ParsedEntry``."""

    name: str
    url: str
|
|
|
|
|
|
class ParsedEntry(TypedDict):
    """A single entry of a section: a named link, its description and extra links."""

    name: str
    url: str
    description: str  # inline HTML, properly escaped
    # Additional name/URL pairs attached to this entry; each one is counted
    # towards ``ParsedSection.entry_count`` by ``_group_by_h2``.
    also_see: list[AlsoSee]
|
|
|
|
|
|
class ParsedSection(TypedDict):
    """Everything extracted for one h2 section, as built by ``_group_by_h2``."""

    # Plain text of the h2 heading that opens the section.
    name: str
    # ``slugify(name)``.
    slug: str
    description: str  # plain text, links resolved to text
    content: str  # raw markdown (backward compat)
    entries: list[ParsedEntry]
    # Number of entries plus the number of ``also_see`` items across all entries.
    entry_count: int
    # Names of the first four entries joined with ", ".
    preview: str
    content_html: str  # rendered HTML, properly escaped
|
|
|
|
|
|
# --- Slugify ----------------------------------------------------------------
|
|
|
|
# Anything that is not a lowercase letter, digit, whitespace or dash is dropped.
_SLUG_NON_ALNUM_RE = re.compile(r"[^a-z0-9\s-]")
# Runs of whitespace become a single dash.
_SLUG_WHITESPACE_RE = re.compile(r"[\s]+")
# Runs of dashes are collapsed to one.
_SLUG_MULTI_DASH_RE = re.compile(r"-+")


def slugify(name: str) -> str:
    """Turn a category name into a URL-friendly slug.

    The name is lowercased, stripped of every character other than
    ``a-z``, ``0-9``, whitespace and ``-``, trimmed of surrounding
    whitespace, and then whitespace runs and dash runs are each reduced
    to a single ``-``.
    """
    cleaned = _SLUG_NON_ALNUM_RE.sub("", name.lower()).strip()
    hyphenated = _SLUG_WHITESPACE_RE.sub("-", cleaned)
    return _SLUG_MULTI_DASH_RE.sub("-", hyphenated)
|
|
|
|
|
|
# --- Inline renderers -------------------------------------------------------
|
|
|
|
|
|
def render_inline_html(children: list[SyntaxTreeNode]) -> str:
|
|
"""Render inline AST nodes to HTML with proper escaping."""
|
|
parts: list[str] = []
|
|
for child in children:
|
|
match child.type:
|
|
case "text":
|
|
parts.append(str(escape(child.content)))
|
|
case "softbreak":
|
|
parts.append(" ")
|
|
case "link":
|
|
href = str(escape(child.attrGet("href") or ""))
|
|
inner = render_inline_html(child.children)
|
|
parts.append(
|
|
f'<a href="{href}" target="_blank" rel="noopener">{inner}</a>'
|
|
)
|
|
case "em":
|
|
parts.append(f"<em>{render_inline_html(child.children)}</em>")
|
|
case "strong":
|
|
parts.append(f"<strong>{render_inline_html(child.children)}</strong>")
|
|
case "code_inline":
|
|
parts.append(f"<code>{escape(child.content)}</code>")
|
|
case "html_inline":
|
|
parts.append(str(escape(child.content)))
|
|
return "".join(parts)
|
|
|
|
|
|
def render_inline_text(children: list[SyntaxTreeNode]) -> str:
|
|
"""Render inline AST nodes to plain text (links become their text)."""
|
|
parts: list[str] = []
|
|
for child in children:
|
|
match child.type:
|
|
case "text":
|
|
parts.append(child.content)
|
|
case "softbreak":
|
|
parts.append(" ")
|
|
case "code_inline":
|
|
parts.append(child.content)
|
|
case "em" | "strong" | "link":
|
|
parts.append(render_inline_text(child.children))
|
|
return "".join(parts)
|
|
|
|
|
|
# --- AST helpers -------------------------------------------------------------
|
|
|
|
|
|
def _heading_text(node: SyntaxTreeNode) -> str:
|
|
"""Extract plain text from a heading node."""
|
|
for child in node.children:
|
|
if child.type == "inline":
|
|
return render_inline_text(child.children)
|
|
return ""
|
|
|
|
|
|
def _extract_description(nodes: list[SyntaxTreeNode]) -> str:
|
|
"""Extract description from the first paragraph if it's a single <em> block.
|
|
|
|
Pattern: _Libraries for foo._ -> "Libraries for foo."
|
|
"""
|
|
if not nodes:
|
|
return ""
|
|
first = nodes[0]
|
|
if first.type != "paragraph":
|
|
return ""
|
|
for child in first.children:
|
|
if child.type == "inline" and len(child.children) == 1:
|
|
em = child.children[0]
|
|
if em.type == "em":
|
|
return render_inline_text(em.children)
|
|
return ""
|
|
|
|
|
|
def _has_description(nodes: list[SyntaxTreeNode]) -> bool:
|
|
"""Check if the first node is a description paragraph (_italic text_)."""
|
|
if not nodes:
|
|
return False
|
|
first = nodes[0]
|
|
if first.type != "paragraph":
|
|
return False
|
|
for child in first.children:
|
|
if child.type == "inline" and len(child.children) == 1:
|
|
if child.children[0].type == "em":
|
|
return True
|
|
return False
|
|
|
|
|
|
def _nodes_to_raw_markdown(nodes: list[SyntaxTreeNode], source_lines: list[str]) -> str:
|
|
"""Extract raw markdown text for AST nodes using source line mappings."""
|
|
if not nodes:
|
|
return ""
|
|
start_line = None
|
|
end_line = None
|
|
for node in nodes:
|
|
node_map = node.map
|
|
if node_map is not None:
|
|
if start_line is None or node_map[0] < start_line:
|
|
start_line = node_map[0]
|
|
if end_line is None or node_map[1] > end_line:
|
|
end_line = node_map[1]
|
|
if start_line is None:
|
|
return ""
|
|
return "\n".join(source_lines[start_line:end_line]).strip()
|
|
|
|
|
|
# --- Stubs for Tasks 3 & 4 (replace in later tasks) -------------------------
|
|
|
|
|
|
def _parse_section_entries(content_nodes: list[SyntaxTreeNode]) -> list[ParsedEntry]:
|
|
return []
|
|
|
|
|
|
def _render_section_html(content_nodes: list[SyntaxTreeNode]) -> str:
|
|
return ""
|
|
|
|
|
|
# --- Section splitting -------------------------------------------------------
|
|
|
|
|
|
def _group_by_h2(
|
|
nodes: list[SyntaxTreeNode],
|
|
source_lines: list[str],
|
|
) -> list[ParsedSection]:
|
|
"""Group AST nodes into sections by h2 headings."""
|
|
sections: list[ParsedSection] = []
|
|
current_name: str | None = None
|
|
current_body: list[SyntaxTreeNode] = []
|
|
|
|
def flush() -> None:
|
|
nonlocal current_name
|
|
if current_name is None:
|
|
return
|
|
desc = _extract_description(current_body)
|
|
content_nodes = current_body[1:] if _has_description(current_body) else current_body
|
|
content = _nodes_to_raw_markdown(content_nodes, source_lines)
|
|
entries = _parse_section_entries(content_nodes)
|
|
entry_count = len(entries) + sum(len(e["also_see"]) for e in entries)
|
|
preview = ", ".join(e["name"] for e in entries[:4])
|
|
content_html = _render_section_html(content_nodes)
|
|
|
|
sections.append(ParsedSection(
|
|
name=current_name,
|
|
slug=slugify(current_name),
|
|
description=desc,
|
|
content=content,
|
|
entries=entries,
|
|
entry_count=entry_count,
|
|
preview=preview,
|
|
content_html=content_html,
|
|
))
|
|
current_name = None
|
|
|
|
for node in nodes:
|
|
if node.type == "heading" and node.tag == "h2":
|
|
flush()
|
|
current_name = _heading_text(node)
|
|
current_body = []
|
|
elif current_name is not None:
|
|
current_body.append(node)
|
|
|
|
flush()
|
|
return sections
|
|
|
|
|
|
def parse_readme(text: str) -> tuple[list[ParsedSection], list[ParsedSection]]:
    """Parse README.md text into categories and resources.

    The text is parsed with ``MarkdownIt("commonmark")`` and the top-level
    nodes are sliced as follows:

    * everything up to and including the first thematic break (``---``)
      is ignored;
    * nodes between the break and the first ``# Resources`` or
      ``# Contributing`` h1 heading form the categories;
    * nodes after ``# Resources`` up to a later ``# Contributing`` heading
      (or the end of the document) form the resources.

    Only h1 headings located after the thematic break act as boundaries.
    If a boundary heading occurs more than once, its last occurrence is used.

    Returns (categories, resources) where each is a list of ParsedSection
    dicts. Both lists are empty when the document has no thematic break.
    """
    md = MarkdownIt("commonmark")
    root = SyntaxTreeNode(md.parse(text))
    source_lines = text.split("\n")
    children = root.children

    # Find thematic break (---)
    hr_idx = next(
        (i for i, node in enumerate(children) if node.type == "hr"),
        None,
    )
    if hr_idx is None:
        return [], []

    # Find # Resources and # Contributing boundaries. Headings before the
    # thematic break belong to the ignored preamble and must not act as
    # boundaries, so the scan starts right after the break.
    resources_idx: int | None = None
    contributing_idx: int | None = None
    for i in range(hr_idx + 1, len(children)):
        node = children[i]
        if node.type == "heading" and node.tag == "h1":
            title = _heading_text(node)
            if title == "Resources":
                resources_idx = i
            elif title == "Contributing":
                contributing_idx = i

    # Categories end at whichever boundary comes first. Explicit None checks
    # are used instead of `a or b`, which would treat index 0 as "missing".
    boundaries = [i for i in (resources_idx, contributing_idx) if i is not None]
    cat_end = min(boundaries, default=len(children))
    cat_nodes = children[hr_idx + 1 : cat_end]

    res_nodes: list[SyntaxTreeNode] = []
    if resources_idx is not None:
        # Contributing only closes the resources when it follows them.
        if contributing_idx is not None and contributing_idx > resources_idx:
            res_end = contributing_idx
        else:
            res_end = len(children)
        res_nodes = children[resources_idx + 1 : res_end]

    categories = _group_by_h2(cat_nodes, source_lines)
    resources = _group_by_h2(res_nodes, source_lines)

    return categories, resources
|