# --- Entry extraction --------------------------------------------------------

# Separator between an entry's link and its description, anchored at the start
# of the string: optional whitespace, exactly one hyphen, en dash (U+2013) or
# em dash (U+2014), then optional whitespace.
_DESC_SEP_RE = re.compile(r"^\s*[-\u2013\u2014]\s*")


def _find_inline(node: SyntaxTreeNode) -> SyntaxTreeNode | None:
    """Find the inline node in a list_item's paragraph.

    Direct ``paragraph`` children of *node* are inspected in order and the
    first ``inline`` node found under any of them is returned. Returns
    ``None`` when no paragraph child holds an inline node.
    """
    for child in node.children:
        if child.type != "paragraph":
            continue
        inline = _find_child(child, "inline")
        if inline is not None:
            return inline
    return None


def _find_first_link(inline: SyntaxTreeNode) -> SyntaxTreeNode | None:
    """Find the first link node among the direct children of *inline*.

    Links nested inside other inline nodes are not searched. Returns ``None``
    when there is no direct link child.
    """
    return _find_child(inline, "link")


def _find_child(node: SyntaxTreeNode, child_type: str) -> SyntaxTreeNode | None:
    """Return the first direct child of *node* whose type equals *child_type*.

    Returns ``None`` when no direct child matches.
    """
    return next((child for child in node.children if child.type == child_type), None)


def _extract_description_html(inline: SyntaxTreeNode, first_link: SyntaxTreeNode) -> str:
    """Extract description HTML from inline content after the first link.

    AST: [link("name"), text(" - Description.")] -> "Description."

    Everything that follows *first_link* among the children of *inline* is
    rendered with ``render_inline_html``; one leading separator
    (hyphen / en dash / em dash, with surrounding whitespace) is then stripped
    from the rendered string.

    Returns an empty string when *first_link* is not a direct child of
    *inline* (matched by identity) or when nothing follows it.
    """
    children = inline.children
    # Identity match: the caller passes the very node object found earlier.
    link_idx = next((i for i, child in enumerate(children) if child is first_link), None)
    if link_idx is None:
        return ""
    desc_children = children[link_idx + 1:]
    if not desc_children:
        return ""
    return _DESC_SEP_RE.sub("", render_inline_html(desc_children))


def _collect_also_see(list_item: SyntaxTreeNode) -> list[AlsoSee]:
    """Collect the "also see" links nested directly under an entry's list_item.

    Each ``list_item`` of the first nested ``bullet_list`` contributes the
    first link of its paragraph. Nested items without an inline node or
    without a link are skipped. Returns an empty list when *list_item* has no
    nested ``bullet_list``.
    """
    nested = _find_child(list_item, "bullet_list")
    if nested is None:
        return []

    also_see: list[AlsoSee] = []
    for sub_item in nested.children:
        if sub_item.type != "list_item":
            continue
        sub_inline = _find_inline(sub_item)
        if sub_inline is None:
            continue
        sub_link = _find_first_link(sub_inline)
        if sub_link is None:
            continue
        also_see.append(AlsoSee(
            name=render_inline_text(sub_link.children),
            url=sub_link.attrGet("href") or "",
        ))
    return also_see


def _parse_list_entries(bullet_list: SyntaxTreeNode) -> list[ParsedEntry]:
    """Extract entries from a bullet_list AST node.

    Handles three patterns:
    - Text-only list_item -> subcategory label -> recurse into nested list
    - Link list_item with nested link-only items -> entry with also_see
    - Link list_item without nesting -> simple entry

    Children that are not ``list_item`` nodes, and list items without an
    inline paragraph, are skipped. A subcategory label produces no entry of
    its own; the entries of its nested list are appended in place, so the
    result is flat.
    """
    entries: list[ParsedEntry] = []

    for list_item in bullet_list.children:
        if list_item.type != "list_item":
            continue

        inline = _find_inline(list_item)
        if inline is None:
            continue

        first_link = _find_first_link(inline)
        if first_link is None:
            # Subcategory label: recurse into the nested bullet_list, if any.
            nested = _find_child(list_item, "bullet_list")
            if nested is not None:
                entries.extend(_parse_list_entries(nested))
            continue

        # Entry with a link; a missing href falls back to an empty URL.
        entries.append(ParsedEntry(
            name=render_inline_text(first_link.children),
            url=first_link.attrGet("href") or "",
            description=_extract_description_html(inline, first_link),
            also_see=_collect_also_see(list_item),
        ))

    return entries


def _parse_section_entries(content_nodes: list[SyntaxTreeNode]) -> list[ParsedEntry]:
    """Extract all entries from a section's content nodes.

    Only nodes of type ``bullet_list`` contribute; every other node is
    ignored. Entries are returned in document order.
    """
    entries: list[ParsedEntry] = []
    for node in content_nodes:
        if node.type == "bullet_list":
            entries.extend(_parse_list_entries(node))
    return entries


# --- Content HTML rendering (stub for Task 4) --------------------------------
100644 --- a/website/tests/test_readme_parser.py +++ b/website/tests/test_readme_parser.py @@ -5,7 +5,7 @@ import sys import textwrap sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) -from readme_parser import parse_readme, render_inline_html, render_inline_text +from readme_parser import _parse_section_entries, parse_readme, render_inline_html, render_inline_text from markdown_it import MarkdownIt from markdown_it.tree import SyntaxTreeNode @@ -200,3 +200,106 @@ class TestParseReadmeSections: """) cats, _ = parse_readme(readme) assert cats[0]["description"] == "Algorithms. Also see awesome-algos." + + +def _content_nodes(md_text: str) -> list[SyntaxTreeNode]: + """Helper: parse markdown and return all block nodes.""" + md = MarkdownIt("commonmark") + root = SyntaxTreeNode(md.parse(md_text)) + return root.children + + +class TestParseSectionEntries: + def test_flat_entries(self): + nodes = _content_nodes( + "- [django](https://example.com/d) - A web framework.\n" + "- [flask](https://example.com/f) - A micro framework.\n" + ) + entries = _parse_section_entries(nodes) + assert len(entries) == 2 + assert entries[0]["name"] == "django" + assert entries[0]["url"] == "https://example.com/d" + assert "web framework" in entries[0]["description"] + assert entries[0]["also_see"] == [] + assert entries[1]["name"] == "flask" + + def test_link_only_entry(self): + nodes = _content_nodes("- [tool](https://x.com)\n") + entries = _parse_section_entries(nodes) + assert len(entries) == 1 + assert entries[0]["name"] == "tool" + assert entries[0]["description"] == "" + + def test_subcategorized_entries(self): + nodes = _content_nodes( + "- Algorithms\n" + " - [algos](https://x.com/a) - Algo lib.\n" + " - [sorts](https://x.com/s) - Sort lib.\n" + "- Design Patterns\n" + " - [patterns](https://x.com/p) - Pattern lib.\n" + ) + entries = _parse_section_entries(nodes) + assert len(entries) == 3 + assert entries[0]["name"] == "algos" + assert entries[2]["name"] == 
"patterns" + + def test_also_see_sub_entries(self): + nodes = _content_nodes( + "- [asyncio](https://docs.python.org/3/library/asyncio.html) - Async I/O.\n" + " - [awesome-asyncio](https://github.com/timofurrer/awesome-asyncio)\n" + "- [trio](https://github.com/python-trio/trio) - Friendly async.\n" + ) + entries = _parse_section_entries(nodes) + assert len(entries) == 2 + assert entries[0]["name"] == "asyncio" + assert len(entries[0]["also_see"]) == 1 + assert entries[0]["also_see"][0]["name"] == "awesome-asyncio" + assert entries[1]["name"] == "trio" + assert entries[1]["also_see"] == [] + + def test_entry_count_includes_also_see(self): + readme = textwrap.dedent("""\ + # T + + --- + + ## Async + + - [asyncio](https://x.com) - Async I/O. + - [awesome-asyncio](https://y.com) + - [trio](https://z.com) - Friendly async. + + # Contributing + + Done. + """) + cats, _ = parse_readme(readme) + # 2 main entries + 1 also_see = 3 + assert cats[0]["entry_count"] == 3 + + def test_preview_first_four_names(self): + readme = textwrap.dedent("""\ + # T + + --- + + ## Libs + + - [alpha](https://x.com) - A. + - [beta](https://x.com) - B. + - [gamma](https://x.com) - C. + - [delta](https://x.com) - D. + - [epsilon](https://x.com) - E. + + # Contributing + + Done. + """) + cats, _ = parse_readme(readme) + assert cats[0]["preview"] == "alpha, beta, gamma, delta" + + def test_description_html_escapes_xss(self): + nodes = _content_nodes('- [lib](https://x.com) - A lib.\n') + entries = _parse_section_entries(nodes) + assert "