From 5fa7c7d1a670387dc04b6408a06e7b1a6dbbbf42 Mon Sep 17 00:00:00 2001 From: Vinta Chen Date: Wed, 18 Mar 2026 17:20:23 +0800 Subject: [PATCH 01/16] feat(website): add markdown-it-py README parser and inline renderer tests Introduce readme_parser.py which parses README.md into structured section data using the markdown-it-py AST. Includes TypedDicts for ParsedEntry/ParsedSection, slugify(), render_inline_html(), and render_inline_text(). Add test_readme_parser.py covering HTML escaping, link rendering, emphasis, strong, and code_inline for both renderers. Co-Authored-By: Claude --- website/readme_parser.py | 93 +++++++++++++++++++++++++++++ website/tests/test_readme_parser.py | 69 +++++++++++++++++++++ 2 files changed, 162 insertions(+) create mode 100644 website/readme_parser.py create mode 100644 website/tests/test_readme_parser.py diff --git a/website/readme_parser.py b/website/readme_parser.py new file mode 100644 index 00000000..a98e0e0c --- /dev/null +++ b/website/readme_parser.py @@ -0,0 +1,93 @@ +"""Parse README.md into structured section data using markdown-it-py AST.""" + +from __future__ import annotations + +import re +from typing import TypedDict + +from markdown_it.tree import SyntaxTreeNode +from markupsafe import escape + + +class AlsoSee(TypedDict): + name: str + url: str + + +class ParsedEntry(TypedDict): + name: str + url: str + description: str # inline HTML, properly escaped + also_see: list[AlsoSee] + + +class ParsedSection(TypedDict): + name: str + slug: str + description: str # plain text, links resolved to text + content: str # raw markdown (backward compat) + entries: list[ParsedEntry] + entry_count: int + preview: str + content_html: str # rendered HTML, properly escaped + + +# --- Slugify ---------------------------------------------------------------- + +_SLUG_NON_ALNUM_RE = re.compile(r"[^a-z0-9\s-]") +_SLUG_WHITESPACE_RE = re.compile(r"[\s]+") +_SLUG_MULTI_DASH_RE = re.compile(r"-+") + + +def slugify(name: str) -> str: + """Convert a category name to a URL-friendly slug.""" + slug = name.lower() + slug = _SLUG_NON_ALNUM_RE.sub("", slug) + slug = _SLUG_WHITESPACE_RE.sub("-", slug.strip()) + slug = _SLUG_MULTI_DASH_RE.sub("-", slug) + return slug + + +# --- Inline renderers ------------------------------------------------------- + + +def render_inline_html(children: list[SyntaxTreeNode]) -> str: + """Render inline AST nodes to HTML with proper escaping.""" + parts: list[str] = [] + for child in children: + match child.type: + case "text": + parts.append(str(escape(child.content))) + case "softbreak": + parts.append(" ") + case "link": + href = str(escape(child.attrGet("href") or "")) + inner = render_inline_html(child.children) + parts.append( + f'{inner}' + ) + case "em": + parts.append(f"{render_inline_html(child.children)}") + case "strong": + parts.append(f"{render_inline_html(child.children)}") + case "code_inline": + parts.append(f"{escape(child.content)}") + case "html_inline": + parts.append(str(escape(child.content))) + return "".join(parts) + + +def render_inline_text(children: list[SyntaxTreeNode]) -> str: + """Render inline AST nodes to plain text (links become their text).""" + parts: list[str] = [] + for child in children: + match child.type: + case "text": + parts.append(child.content) + case "softbreak": + parts.append(" ") + case "code_inline": + parts.append(child.content) + case "em" | "strong" | "link": + parts.append(render_inline_text(child.children)) + return "".join(parts) diff --git a/website/tests/test_readme_parser.py b/website/tests/test_readme_parser.py new file mode 100644 index 00000000..974143e5 --- /dev/null +++ b/website/tests/test_readme_parser.py @@ -0,0 +1,69 @@ +"""Tests for the readme_parser module.""" + +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) +from readme_parser import render_inline_html, render_inline_text + +from markdown_it import MarkdownIt +from markdown_it.tree import SyntaxTreeNode + + +def _parse_inline(md_text: str) -> list[SyntaxTreeNode]: + """Helper: parse a single paragraph and return its inline children.""" + md = MarkdownIt("commonmark") + root = SyntaxTreeNode(md.parse(md_text)) + # root > paragraph > inline > children + return root.children[0].children[0].children + + +class TestRenderInlineHtml: + def test_plain_text_escapes_html(self): + children = _parse_inline("Hello & friends") + assert render_inline_html(children) == "Hello <world> & friends" + + def test_link_with_target(self): + children = _parse_inline("[name](https://example.com)") + html = render_inline_html(children) + assert 'href="https://example.com"' in html + assert 'target="_blank"' in html + assert 'rel="noopener"' in html + assert ">name" in html + + def test_emphasis(self): + children = _parse_inline("*italic* text") + assert "italic" in render_inline_html(children) + + def test_strong(self): + children = _parse_inline("**bold** text") + assert "bold" in render_inline_html(children) + + def test_code_inline(self): + children = _parse_inline("`some code`") + assert "some code" in render_inline_html(children) + + def test_mixed_link_and_text(self): + children = _parse_inline("See [foo](https://x.com) for details.") + html = render_inline_html(children) + assert "See " in html + assert ">foo" in html + assert " for details." in html + + +class TestRenderInlineText: + def test_plain_text(self): + children = _parse_inline("Hello world") + assert render_inline_text(children) == "Hello world" + + def test_link_becomes_text(self): + children = _parse_inline("See [awesome-algos](https://github.com/x/y).") + assert render_inline_text(children) == "See awesome-algos." + + def test_emphasis_stripped(self): + children = _parse_inline("*italic* text") + assert render_inline_text(children) == "italic text" + + def test_code_inline_kept(self): + children = _parse_inline("`code` here") + assert render_inline_text(children) == "code here" From 1c67c9f0e68718cd23f885d11a9168d8d0d53980 Mon Sep 17 00:00:00 2001 From: Vinta Chen Date: Wed, 18 Mar 2026 17:21:49 +0800 Subject: [PATCH 02/16] feat: replace regex README parser with markdown-it-py AST parser Introduce parse_readme() which uses MarkdownIt to build a full AST instead of line-by-line regex matching. The function splits the document at the thematic break, groups nodes by h2 heading, extracts category descriptions from leading italic paragraphs, and separates the Categories, Resources, and Contributing sections cleanly. Add markdown-it-py==4.0.0 (+ mdurl) as a runtime dependency to support the new parser. Tests cover section counts, names, slugs, descriptions, content presence, boundary conditions (no separator, no description), and mixed description markup. Co-Authored-By: Claude --- pyproject.toml | 1 + uv.lock | 23 ++++ website/readme_parser.py | 167 ++++++++++++++++++++++++++++ website/tests/test_readme_parser.py | 135 +++++++++++++++++++++- 4 files changed, 325 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d564cde9..3f03420a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,7 @@ dependencies = [ "httpx==0.28.1", "jinja2==3.1.6", "markdown==3.10.2", + "markdown-it-py==4.0.0", ] [dependency-groups] diff --git a/uv.lock b/uv.lock index 1f7b17c7..51bd6822 100644 --- a/uv.lock +++ b/uv.lock @@ -22,6 +22,7 @@ dependencies = [ { name = "httpx" }, { name = "jinja2" }, { name = "markdown" }, + { name = "markdown-it-py" }, ] [package.dev-dependencies] @@ -35,6 +36,7 @@ requires-dist = [ { name = "httpx", specifier = "==0.28.1" }, { name = "jinja2", specifier = "==3.1.6" }, { name = "markdown", specifier = "==3.10.2" }, + { name = "markdown-it-py", specifier = "==4.0.0" }, ] [package.metadata.requires-dev] @@ -137,6 +139,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/de/1f/77fa3081e4f66ca3576c896ae5d31c3002ac6607f9747d2e3aa49227e464/markdown-3.10.2-py3-none-any.whl", hash = "sha256:e91464b71ae3ee7afd3017d9f358ef0baf158fd9a298db92f1d4761133824c36", size = 108180, upload-time = "2026-02-09T14:57:25.787Z" }, ] +[[package]] +name = "markdown-it-py" +version = "4.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mdurl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" }, +] + [[package]] name = "markupsafe" version = "3.0.3" @@ -189,6 +203,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = "2025-09-27T18:37:28.327Z" }, ] +[[package]] +name = "mdurl" +version = "0.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, +] + [[package]] name = "packaging" version = "26.0" diff --git a/website/readme_parser.py b/website/readme_parser.py index a98e0e0c..62afd94c 100644 --- a/website/readme_parser.py +++ b/website/readme_parser.py @@ -5,6 +5,7 @@ from __future__ import annotations import re from typing import TypedDict +from markdown_it import MarkdownIt from markdown_it.tree import SyntaxTreeNode from markupsafe import escape @@ -91,3 +92,169 @@ def render_inline_text(children: list[SyntaxTreeNode]) -> str: case "em" | "strong" | "link": parts.append(render_inline_text(child.children)) return "".join(parts) + + +# --- AST helpers ------------------------------------------------------------- + + +def _heading_text(node: SyntaxTreeNode) -> str: + """Extract plain text from a heading node.""" + for child in node.children: + if child.type == "inline": + return render_inline_text(child.children) + return "" + + +def _extract_description(nodes: list[SyntaxTreeNode]) -> str: + """Extract description from the first paragraph if it's a single block. + + Pattern: _Libraries for foo._ -> "Libraries for foo." + """ + if not nodes: + return "" + first = nodes[0] + if first.type != "paragraph": + return "" + for child in first.children: + if child.type == "inline" and len(child.children) == 1: + em = child.children[0] + if em.type == "em": + return render_inline_text(em.children) + return "" + + +def _has_description(nodes: list[SyntaxTreeNode]) -> bool: + """Check if the first node is a description paragraph (_italic text_).""" + if not nodes: + return False + first = nodes[0] + if first.type != "paragraph": + return False + for child in first.children: + if child.type == "inline" and len(child.children) == 1: + if child.children[0].type == "em": + return True + return False + + +def _nodes_to_raw_markdown(nodes: list[SyntaxTreeNode], source_lines: list[str]) -> str: + """Extract raw markdown text for AST nodes using source line mappings.""" + if not nodes: + return "" + start_line = None + end_line = None + for node in nodes: + node_map = node.map + if node_map is not None: + if start_line is None or node_map[0] < start_line: + start_line = node_map[0] + if end_line is None or node_map[1] > end_line: + end_line = node_map[1] + if start_line is None: + return "" + return "\n".join(source_lines[start_line:end_line]).strip() + + +# --- Stubs for Tasks 3 & 4 (replace in later tasks) ------------------------- + + +def _parse_section_entries(content_nodes: list[SyntaxTreeNode]) -> list[ParsedEntry]: + return [] + + +def _render_section_html(content_nodes: list[SyntaxTreeNode]) -> str: + return "" + + +# --- Section splitting ------------------------------------------------------- + + +def _group_by_h2( + nodes: list[SyntaxTreeNode], + source_lines: list[str], +) -> list[ParsedSection]: + """Group AST nodes into sections by h2 headings.""" + sections: list[ParsedSection] = [] + current_name: str | None = None + current_body: list[SyntaxTreeNode] = [] + + def flush() -> None: + nonlocal current_name + if current_name is None: + return + desc = _extract_description(current_body) + content_nodes = current_body[1:] if _has_description(current_body) else current_body + content = _nodes_to_raw_markdown(content_nodes, source_lines) + entries = _parse_section_entries(content_nodes) + entry_count = len(entries) + sum(len(e["also_see"]) for e in entries) + preview = ", ".join(e["name"] for e in entries[:4]) + content_html = _render_section_html(content_nodes) + + sections.append(ParsedSection( + name=current_name, + slug=slugify(current_name), + description=desc, + content=content, + entries=entries, + entry_count=entry_count, + preview=preview, + content_html=content_html, + )) + current_name = None + + for node in nodes: + if node.type == "heading" and node.tag == "h2": + flush() + current_name = _heading_text(node) + current_body = [] + elif current_name is not None: + current_body.append(node) + + flush() + return sections + + +def parse_readme(text: str) -> tuple[list[ParsedSection], list[ParsedSection]]: + """Parse README.md text into categories and resources. + + Returns (categories, resources) where each is a list of ParsedSection dicts. + """ + md = MarkdownIt("commonmark") + tokens = md.parse(text) + root = SyntaxTreeNode(tokens) + source_lines = text.split("\n") + children = root.children + + # Find thematic break (---) + hr_idx = None + for i, node in enumerate(children): + if node.type == "hr": + hr_idx = i + break + if hr_idx is None: + return [], [] + + # Find # Resources and # Contributing boundaries + resources_idx = None + contributing_idx = None + for i, node in enumerate(children): + if node.type == "heading" and node.tag == "h1": + text_content = _heading_text(node) + if text_content == "Resources": + resources_idx = i + elif text_content == "Contributing": + contributing_idx = i + + # Slice into category and resource ranges + cat_end = resources_idx or contributing_idx or len(children) + cat_nodes = children[hr_idx + 1 : cat_end] + + res_nodes: list[SyntaxTreeNode] = [] + if resources_idx is not None: + res_end = contributing_idx or len(children) + res_nodes = children[resources_idx + 1 : res_end] + + categories = _group_by_h2(cat_nodes, source_lines) + resources = _group_by_h2(res_nodes, source_lines) + + return categories, resources diff --git a/website/tests/test_readme_parser.py b/website/tests/test_readme_parser.py index 974143e5..3f32e844 100644 --- a/website/tests/test_readme_parser.py +++ b/website/tests/test_readme_parser.py @@ -2,9 +2,10 @@ import os import sys +import textwrap sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) -from readme_parser import render_inline_html, render_inline_text +from readme_parser import parse_readme, render_inline_html, render_inline_text from markdown_it import MarkdownIt from markdown_it.tree import SyntaxTreeNode @@ -67,3 +68,135 @@ class TestRenderInlineText: def test_code_inline_kept(self): children = _parse_inline("`code` here") assert render_inline_text(children) == "code here" + + +MINIMAL_README = textwrap.dedent("""\ + # Awesome Python + + Some intro text. + + --- + + ## Alpha + + _Libraries for alpha stuff._ + + - [lib-a](https://example.com/a) - Does A. + - [lib-b](https://example.com/b) - Does B. + + ## Beta + + _Tools for beta._ + + - [lib-c](https://example.com/c) - Does C. + + # Resources + + Where to discover resources. + + ## Newsletters + + - [News One](https://example.com/n1) + - [News Two](https://example.com/n2) + + ## Podcasts + + - [Pod One](https://example.com/p1) + + # Contributing + + Please contribute! +""") + + +class TestParseReadmeSections: + def test_category_count(self): + cats, resources = parse_readme(MINIMAL_README) + assert len(cats) == 2 + + def test_resource_count(self): + cats, resources = parse_readme(MINIMAL_README) + assert len(resources) == 2 + + def test_category_names(self): + cats, _ = parse_readme(MINIMAL_README) + assert cats[0]["name"] == "Alpha" + assert cats[1]["name"] == "Beta" + + def test_category_slugs(self): + cats, _ = parse_readme(MINIMAL_README) + assert cats[0]["slug"] == "alpha" + assert cats[1]["slug"] == "beta" + + def test_category_description(self): + cats, _ = parse_readme(MINIMAL_README) + assert cats[0]["description"] == "Libraries for alpha stuff." + assert cats[1]["description"] == "Tools for beta." + + def test_category_content_has_entries(self): + cats, _ = parse_readme(MINIMAL_README) + assert "lib-a" in cats[0]["content"] + assert "lib-b" in cats[0]["content"] + + def test_resource_names(self): + _, resources = parse_readme(MINIMAL_README) + assert resources[0]["name"] == "Newsletters" + assert resources[1]["name"] == "Podcasts" + + def test_resource_content(self): + _, resources = parse_readme(MINIMAL_README) + assert "News One" in resources[0]["content"] + assert "Pod One" in resources[1]["content"] + + def test_contributing_skipped(self): + cats, resources = parse_readme(MINIMAL_README) + all_names = [c["name"] for c in cats] + [r["name"] for r in resources] + assert "Contributing" not in all_names + + def test_no_separator(self): + cats, resources = parse_readme("# Just a heading\n\nSome text.\n") + assert cats == [] + assert resources == [] + + def test_no_description(self): + readme = textwrap.dedent("""\ + # Title + + --- + + ## NullDesc + + - [item](https://x.com) - Thing. + + # Resources + + ## Tips + + - [tip](https://x.com) + + # Contributing + + Done. + """) + cats, resources = parse_readme(readme) + assert cats[0]["description"] == "" + assert "item" in cats[0]["content"] + + def test_description_with_link_stripped(self): + readme = textwrap.dedent("""\ + # T + + --- + + ## Algos + + _Algorithms. Also see [awesome-algos](https://example.com)._ + + - [lib](https://x.com) - Lib. + + # Contributing + + Done. + """) + cats, _ = parse_readme(readme) + assert cats[0]["description"] == "Algorithms. Also see awesome-algos." From 3d015bc63026635087701358237d9ddb60fb67d7 Mon Sep 17 00:00:00 2001 From: Vinta Chen Date: Wed, 18 Mar 2026 17:23:11 +0800 Subject: [PATCH 03/16] feat(parser): implement entry extraction from bullet list AST nodes Replace _parse_section_entries stub with full implementation that walks bullet_list AST nodes to extract ParsedEntry records, including support for subcategory labels (text-only list items) and also_see nested links. Add _parse_list_entries, helper finders (_find_inline, _find_first_link, _find_child), and _extract_description_html with separator stripping. Extend test suite with TestParseSectionEntries covering flat entries, link-only entries, subcategorized entries, also_see, entry_count, preview first-four, and XSS escaping in description HTML. Co-Authored-By: Claude --- website/readme_parser.py | 114 +++++++++++++++++++++++++++- website/tests/test_readme_parser.py | 105 ++++++++++++++++++++++++- 2 files changed, 216 insertions(+), 3 deletions(-) diff --git a/website/readme_parser.py b/website/readme_parser.py index 62afd94c..71a36742 100644 --- a/website/readme_parser.py +++ b/website/readme_parser.py @@ -155,11 +155,121 @@ def _nodes_to_raw_markdown(nodes: list[SyntaxTreeNode], source_lines: list[str]) return "\n".join(source_lines[start_line:end_line]).strip() -# --- Stubs for Tasks 3 & 4 (replace in later tasks) ------------------------- +# --- Entry extraction -------------------------------------------------------- + +_DESC_SEP_RE = re.compile(r"^\s*[-\u2013\u2014]\s*") + + +def _find_inline(node: SyntaxTreeNode) -> SyntaxTreeNode | None: + """Find the inline node in a list_item's paragraph.""" + for child in node.children: + if child.type == "paragraph": + for sub in child.children: + if sub.type == "inline": + return sub + return None + + +def _find_first_link(inline: SyntaxTreeNode) -> SyntaxTreeNode | None: + """Find the first link node among inline children.""" + for child in inline.children: + if child.type == "link": + return child + return None + + +def _find_child(node: SyntaxTreeNode, child_type: str) -> SyntaxTreeNode | None: + """Find first direct child of a given type.""" + for child in node.children: + if child.type == child_type: + return child + return None + + +def _extract_description_html(inline: SyntaxTreeNode, first_link: SyntaxTreeNode) -> str: + """Extract description HTML from inline content after the first link. + + AST: [link("name"), text(" - Description.")] -> "Description." + The separator (- / en-dash / em-dash) is stripped. + """ + link_idx = next((i for i, c in enumerate(inline.children) if c is first_link), None) + if link_idx is None: + return "" + desc_children = inline.children[link_idx + 1 :] + if not desc_children: + return "" + html = render_inline_html(desc_children) + return _DESC_SEP_RE.sub("", html) + + +def _parse_list_entries(bullet_list: SyntaxTreeNode) -> list[ParsedEntry]: + """Extract entries from a bullet_list AST node. + + Handles three patterns: + - Text-only list_item -> subcategory label -> recurse into nested list + - Link list_item with nested link-only items -> entry with also_see + - Link list_item without nesting -> simple entry + """ + entries: list[ParsedEntry] = [] + + for list_item in bullet_list.children: + if list_item.type != "list_item": + continue + + inline = _find_inline(list_item) + if inline is None: + continue + + first_link = _find_first_link(inline) + + if first_link is None: + # Subcategory label — recurse into nested bullet_list + nested = _find_child(list_item, "bullet_list") + if nested: + entries.extend(_parse_list_entries(nested)) + continue + + # Entry with a link + name = render_inline_text(first_link.children) + url = first_link.attrGet("href") or "" + desc_html = _extract_description_html(inline, first_link) + + # Collect also_see from nested bullet_list + also_see: list[AlsoSee] = [] + nested = _find_child(list_item, "bullet_list") + if nested: + for sub_item in nested.children: + if sub_item.type != "list_item": + continue + sub_inline = _find_inline(sub_item) + if sub_inline: + sub_link = _find_first_link(sub_inline) + if sub_link: + also_see.append(AlsoSee( + name=render_inline_text(sub_link.children), + url=sub_link.attrGet("href") or "", + )) + + entries.append(ParsedEntry( + name=name, + url=url, + description=desc_html, + also_see=also_see, + )) + + return entries def _parse_section_entries(content_nodes: list[SyntaxTreeNode]) -> list[ParsedEntry]: - return [] + """Extract all entries from a section's content nodes.""" + entries: list[ParsedEntry] = [] + for node in content_nodes: + if node.type == "bullet_list": + entries.extend(_parse_list_entries(node)) + return entries + + +# --- Content HTML rendering (stub for Task 4) -------------------------------- def _render_section_html(content_nodes: list[SyntaxTreeNode]) -> str: diff --git a/website/tests/test_readme_parser.py b/website/tests/test_readme_parser.py index 3f32e844..f0f53e92 100644 --- a/website/tests/test_readme_parser.py +++ b/website/tests/test_readme_parser.py @@ -5,7 +5,7 @@ import sys import textwrap sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) -from readme_parser import parse_readme, render_inline_html, render_inline_text +from readme_parser import _parse_section_entries, parse_readme, render_inline_html, render_inline_text from markdown_it import MarkdownIt from markdown_it.tree import SyntaxTreeNode @@ -200,3 +200,106 @@ class TestParseReadmeSections: """) cats, _ = parse_readme(readme) assert cats[0]["description"] == "Algorithms. Also see awesome-algos." + + +def _content_nodes(md_text: str) -> list[SyntaxTreeNode]: + """Helper: parse markdown and return all block nodes.""" + md = MarkdownIt("commonmark") + root = SyntaxTreeNode(md.parse(md_text)) + return root.children + + +class TestParseSectionEntries: + def test_flat_entries(self): + nodes = _content_nodes( + "- [django](https://example.com/d) - A web framework.\n" + "- [flask](https://example.com/f) - A micro framework.\n" + ) + entries = _parse_section_entries(nodes) + assert len(entries) == 2 + assert entries[0]["name"] == "django" + assert entries[0]["url"] == "https://example.com/d" + assert "web framework" in entries[0]["description"] + assert entries[0]["also_see"] == [] + assert entries[1]["name"] == "flask" + + def test_link_only_entry(self): + nodes = _content_nodes("- [tool](https://x.com)\n") + entries = _parse_section_entries(nodes) + assert len(entries) == 1 + assert entries[0]["name"] == "tool" + assert entries[0]["description"] == "" + + def test_subcategorized_entries(self): + nodes = _content_nodes( + "- Algorithms\n" + " - [algos](https://x.com/a) - Algo lib.\n" + " - [sorts](https://x.com/s) - Sort lib.\n" + "- Design Patterns\n" + " - [patterns](https://x.com/p) - Pattern lib.\n" + ) + entries = _parse_section_entries(nodes) + assert len(entries) == 3 + assert entries[0]["name"] == "algos" + assert entries[2]["name"] == "patterns" + + def test_also_see_sub_entries(self): + nodes = _content_nodes( + "- [asyncio](https://docs.python.org/3/library/asyncio.html) - Async I/O.\n" + " - [awesome-asyncio](https://github.com/timofurrer/awesome-asyncio)\n" + "- [trio](https://github.com/python-trio/trio) - Friendly async.\n" + ) + entries = _parse_section_entries(nodes) + assert len(entries) == 2 + assert entries[0]["name"] == "asyncio" + assert len(entries[0]["also_see"]) == 1 + assert entries[0]["also_see"][0]["name"] == "awesome-asyncio" + assert entries[1]["name"] == "trio" + assert entries[1]["also_see"] == [] + + def test_entry_count_includes_also_see(self): + readme = textwrap.dedent("""\ + # T + + --- + + ## Async + + - [asyncio](https://x.com) - Async I/O. + - [awesome-asyncio](https://y.com) + - [trio](https://z.com) - Friendly async. + + # Contributing + + Done. + """) + cats, _ = parse_readme(readme) + # 2 main entries + 1 also_see = 3 + assert cats[0]["entry_count"] == 3 + + def test_preview_first_four_names(self): + readme = textwrap.dedent("""\ + # T + + --- + + ## Libs + + - [alpha](https://x.com) - A. + - [beta](https://x.com) - B. + - [gamma](https://x.com) - C. + - [delta](https://x.com) - D. + - [epsilon](https://x.com) - E. + + # Contributing + + Done. + """) + cats, _ = parse_readme(readme) + assert cats[0]["preview"] == "alpha, beta, gamma, delta" + + def test_description_html_escapes_xss(self): + nodes = _content_nodes('- [lib](https://x.com) - A lib.\n') + entries = _parse_section_entries(nodes) + assert "\n") + html = _render_section_html(nodes) + assert "\n") html = _render_section_html(nodes) assert "