diff --git a/pyproject.toml b/pyproject.toml index d564cde9..3f03420a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,7 @@ dependencies = [ "httpx==0.28.1", "jinja2==3.1.6", "markdown==3.10.2", + "markdown-it-py==4.0.0", ] [dependency-groups] diff --git a/uv.lock b/uv.lock index 1f7b17c7..51bd6822 100644 --- a/uv.lock +++ b/uv.lock @@ -22,6 +22,7 @@ dependencies = [ { name = "httpx" }, { name = "jinja2" }, { name = "markdown" }, + { name = "markdown-it-py" }, ] [package.dev-dependencies] @@ -35,6 +36,7 @@ requires-dist = [ { name = "httpx", specifier = "==0.28.1" }, { name = "jinja2", specifier = "==3.1.6" }, { name = "markdown", specifier = "==3.10.2" }, + { name = "markdown-it-py", specifier = "==4.0.0" }, ] [package.metadata.requires-dev] @@ -137,6 +139,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/de/1f/77fa3081e4f66ca3576c896ae5d31c3002ac6607f9747d2e3aa49227e464/markdown-3.10.2-py3-none-any.whl", hash = "sha256:e91464b71ae3ee7afd3017d9f358ef0baf158fd9a298db92f1d4761133824c36", size = 108180, upload-time = "2026-02-09T14:57:25.787Z" }, ] +[[package]] +name = "markdown-it-py" +version = "4.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mdurl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" }, +] + [[package]] name = "markupsafe" version = "3.0.3" @@ -189,6 +203,15 @@ wheels = [ { url = 
# --- AST helpers -------------------------------------------------------------


def _heading_text(node: SyntaxTreeNode) -> str:
    """Return the plain text of a heading node.

    The text is produced by ``render_inline_text`` from the children of the
    heading's first ``inline`` child. Returns ``""`` when the heading has no
    ``inline`` child.
    """
    for child in node.children:
        if child.type == "inline":
            return render_inline_text(child.children)
    return ""


def _description_em(nodes: list[SyntaxTreeNode]) -> SyntaxTreeNode | None:
    """Return the ``em`` node of a leading description paragraph, or ``None``.

    A description is recognised when the first node is a ``paragraph`` that
    has an ``inline`` child with exactly one child, and that child is an
    ``em`` node (pattern: ``_Libraries for foo._``).

    Both ``_extract_description`` and ``_has_description`` go through this
    helper so the "is there a description?" and "what is it?" answers can
    never disagree.
    """
    if not nodes or nodes[0].type != "paragraph":
        return None
    for child in nodes[0].children:
        if child.type == "inline" and len(child.children) == 1:
            only = child.children[0]
            if only.type == "em":
                return only
    return None


def _extract_description(nodes: list[SyntaxTreeNode]) -> str:
    """Return the description text of a section body, or ``""`` if absent.

    Pattern: _Libraries for foo._ -> "Libraries for foo."
    """
    em = _description_em(nodes)
    if em is None:
        return ""
    return render_inline_text(em.children)


def _has_description(nodes: list[SyntaxTreeNode]) -> bool:
    """Return True if the first node is a description paragraph (_italic text_)."""
    return _description_em(nodes) is not None


def _nodes_to_raw_markdown(nodes: list[SyntaxTreeNode], source_lines: list[str]) -> str:
    """Return the raw markdown covered by ``nodes``.

    Each node's ``map`` is read as a ``(start, end)`` pair of indices into
    ``source_lines``. The result is the lines from the smallest start to the
    largest end (end exclusive), joined with ``"\\n"`` and stripped. Nodes
    whose ``map`` is ``None`` are ignored; if no node has a map, or ``nodes``
    is empty, ``""`` is returned.
    """
    spans = [node.map for node in nodes if node.map is not None]
    if not spans:
        return ""
    start_line = min(span[0] for span in spans)
    end_line = max(span[1] for span in spans)
    return "\n".join(source_lines[start_line:end_line]).strip()


# --- Stubs for Tasks 3 & 4 (replace in later tasks) -------------------------


def _parse_section_entries(content_nodes: list[SyntaxTreeNode]) -> list[ParsedEntry]:
    """Placeholder: currently returns an empty entry list for every section."""
    return []


def _render_section_html(content_nodes: list[SyntaxTreeNode]) -> str:
    """Placeholder: currently returns an empty HTML string for every section."""
    return ""


# --- Section splitting -------------------------------------------------------


def _build_section(
    name: str,
    body: list[SyntaxTreeNode],
    source_lines: list[str],
) -> ParsedSection:
    """Build one ``ParsedSection`` from an h2 title and the nodes below it."""
    description = _extract_description(body)
    # The description paragraph is reported separately, so it is dropped from
    # the nodes used for content, entries and HTML.
    content_nodes = body[1:] if _has_description(body) else body
    entries = _parse_section_entries(content_nodes)
    return ParsedSection(
        name=name,
        slug=slugify(name),
        description=description,
        content=_nodes_to_raw_markdown(content_nodes, source_lines),
        entries=entries,
        # Each entry counts once, plus one for every item in its "also_see".
        entry_count=len(entries) + sum(len(e["also_see"]) for e in entries),
        # Preview lists the names of the first four entries.
        preview=", ".join(e["name"] for e in entries[:4]),
        content_html=_render_section_html(content_nodes),
    )


def _group_by_h2(
    nodes: list[SyntaxTreeNode],
    source_lines: list[str],
) -> list[ParsedSection]:
    """Group AST nodes into sections by h2 headings.

    Every ``h2`` heading starts a new section named after its text; the nodes
    that follow, up to the next ``h2``, form its body. Nodes that appear
    before the first ``h2`` are ignored.
    """
    sections: list[ParsedSection] = []
    current_name: str | None = None
    current_body: list[SyntaxTreeNode] = []

    for node in nodes:
        if node.type == "heading" and node.tag == "h2":
            if current_name is not None:
                sections.append(_build_section(current_name, current_body, source_lines))
            current_name = _heading_text(node)
            current_body = []
        elif current_name is not None:
            current_body.append(node)

    # Emit the section that was still open when the nodes ran out.
    if current_name is not None:
        sections.append(_build_section(current_name, current_body, source_lines))
    return sections


def _section_bounds(
    hr_idx: int,
    resources_idx: int | None,
    contributing_idx: int | None,
    total: int,
) -> tuple[slice, slice]:
    """Return ``(category_slice, resource_slice)`` over the top-level nodes.

    Categories run from just after the thematic break up to the "Resources"
    heading, else the "Contributing" heading, else the end. Resources run from
    just after the "Resources" heading up to "Contributing", else the end;
    without a "Resources" heading the resource slice is empty.

    Indices are compared with ``is not None`` rather than truthiness, so a
    heading found at index 0 is still honoured as a boundary.
    """
    if resources_idx is not None:
        cat_end = resources_idx
    elif contributing_idx is not None:
        cat_end = contributing_idx
    else:
        cat_end = total
    category_slice = slice(hr_idx + 1, cat_end)

    if resources_idx is None:
        return category_slice, slice(0, 0)

    res_end = contributing_idx if contributing_idx is not None else total
    return category_slice, slice(resources_idx + 1, res_end)


def parse_readme(text: str) -> tuple[list[ParsedSection], list[ParsedSection]]:
    """Parse README.md text into categories and resources.

    Returns (categories, resources) where each is a list of ParsedSection dicts.
    If the document contains no thematic break (``---``), both lists are empty.
    """
    md = MarkdownIt("commonmark")
    root = SyntaxTreeNode(md.parse(text))
    source_lines = text.split("\n")
    children = root.children

    # Find the first thematic break (---); categories start right after it.
    hr_idx = next((i for i, node in enumerate(children) if node.type == "hr"), None)
    if hr_idx is None:
        return [], []

    # Find the "# Resources" and "# Contributing" boundaries. When a title
    # occurs more than once, the last occurrence wins.
    resources_idx: int | None = None
    contributing_idx: int | None = None
    for i, node in enumerate(children):
        if node.type == "heading" and node.tag == "h1":
            title = _heading_text(node)
            if title == "Resources":
                resources_idx = i
            elif title == "Contributing":
                contributing_idx = i

    category_slice, resource_slice = _section_bounds(
        hr_idx, resources_idx, contributing_idx, len(children)
    )
    categories = _group_by_h2(children[category_slice], source_lines)
    resources = _group_by_h2(children[resource_slice], source_lines)

    return categories, resources
# Small README fixture: intro, thematic break, two categories, a Resources
# part with two sections, and a trailing Contributing part.
MINIMAL_README = textwrap.dedent("""\
    # Awesome Python

    Some intro text.

    ---

    ## Alpha

    _Libraries for alpha stuff._

    - [lib-a](https://example.com/a) - Does A.
    - [lib-b](https://example.com/b) - Does B.

    ## Beta

    _Tools for beta._

    - [lib-c](https://example.com/c) - Does C.

    # Resources

    Where to discover resources.

    ## Newsletters

    - [News One](https://example.com/n1)
    - [News Two](https://example.com/n2)

    ## Podcasts

    - [Pod One](https://example.com/p1)

    # Contributing

    Please contribute!
""")


class TestParseReadmeSections:
    """Checks how ``parse_readme`` splits a README into named sections."""

    def test_category_count(self):
        categories, _ = parse_readme(MINIMAL_README)
        assert len(categories) == 2

    def test_resource_count(self):
        _, resource_sections = parse_readme(MINIMAL_README)
        assert len(resource_sections) == 2

    def test_category_names(self):
        categories = parse_readme(MINIMAL_README)[0]
        first, second = categories[0], categories[1]
        assert first["name"] == "Alpha"
        assert second["name"] == "Beta"

    def test_category_slugs(self):
        categories = parse_readme(MINIMAL_README)[0]
        first, second = categories[0], categories[1]
        assert first["slug"] == "alpha"
        assert second["slug"] == "beta"

    def test_category_description(self):
        categories = parse_readme(MINIMAL_README)[0]
        first, second = categories[0], categories[1]
        assert first["description"] == "Libraries for alpha stuff."
        assert second["description"] == "Tools for beta."

    def test_category_content_has_entries(self):
        alpha_content = parse_readme(MINIMAL_README)[0][0]["content"]
        for expected in ("lib-a", "lib-b"):
            assert expected in alpha_content

    def test_resource_names(self):
        resource_sections = parse_readme(MINIMAL_README)[1]
        first, second = resource_sections[0], resource_sections[1]
        assert first["name"] == "Newsletters"
        assert second["name"] == "Podcasts"

    def test_resource_content(self):
        resource_sections = parse_readme(MINIMAL_README)[1]
        first, second = resource_sections[0], resource_sections[1]
        assert "News One" in first["content"]
        assert "Pod One" in second["content"]

    def test_contributing_skipped(self):
        # "# Contributing" is a boundary, never a section of its own.
        categories, resource_sections = parse_readme(MINIMAL_README)
        every_section = [*categories, *resource_sections]
        assert all(section["name"] != "Contributing" for section in every_section)

    def test_no_separator(self):
        # Without a thematic break nothing is parsed at all.
        categories, resource_sections = parse_readme("# Just a heading\n\nSome text.\n")
        assert categories == []
        assert resource_sections == []

    def test_no_description(self):
        readme = textwrap.dedent("""\
            # Title

            ---

            ## NullDesc

            - [item](https://x.com) - Thing.

            # Resources

            ## Tips

            - [tip](https://x.com)

            # Contributing

            Done.
        """)
        categories, _ = parse_readme(readme)
        section = categories[0]
        assert section["description"] == ""
        assert "item" in section["content"]

    def test_description_with_link_stripped(self):
        # Link markup inside the description is reduced to its text.
        readme = textwrap.dedent("""\
            # T

            ---

            ## Algos

            _Algorithms. Also see [awesome-algos](https://example.com)._

            - [lib](https://x.com) - Lib.

            # Contributing

            Done.
        """)
        section = parse_readme(readme)[0][0]
        assert section["description"] == "Algorithms. Also see awesome-algos."