diff --git a/website/readme_parser.py b/website/readme_parser.py
index 62afd94c..71a36742 100644
--- a/website/readme_parser.py
+++ b/website/readme_parser.py
@@ -155,11 +155,121 @@ def _nodes_to_raw_markdown(nodes: list[SyntaxTreeNode], source_lines: list[str])
return "\n".join(source_lines[start_line:end_line]).strip()
-# --- Stubs for Tasks 3 & 4 (replace in later tasks) -------------------------
+# --- Entry extraction --------------------------------------------------------
+
# Separator between an entry's link and its description: optional whitespace,
# then "-" (hyphen), "\u2013" (en dash) or "\u2014" (em dash), then optional whitespace.
# Applied to the rendered description HTML to strip the leading separator.
_DESC_SEP_RE = re.compile(r"^\s*[-\u2013\u2014]\s*")
+
+
+def _find_inline(node: SyntaxTreeNode) -> SyntaxTreeNode | None:
+ """Find the inline node in a list_item's paragraph."""
+ for child in node.children:
+ if child.type == "paragraph":
+ for sub in child.children:
+ if sub.type == "inline":
+ return sub
+ return None
+
+
+def _find_first_link(inline: SyntaxTreeNode) -> SyntaxTreeNode | None:
+ """Find the first link node among inline children."""
+ for child in inline.children:
+ if child.type == "link":
+ return child
+ return None
+
+
+def _find_child(node: SyntaxTreeNode, child_type: str) -> SyntaxTreeNode | None:
+ """Find first direct child of a given type."""
+ for child in node.children:
+ if child.type == child_type:
+ return child
+ return None
+
+
def _extract_description_html(inline: SyntaxTreeNode, first_link: SyntaxTreeNode) -> str:
    """Render the inline content that follows *first_link* as HTML.

    AST: [link("name"), text(" - Description.")] -> "Description."
    The leading separator (- / en-dash / em-dash) is stripped.
    Returns "" when the link is absent from *inline* or nothing follows it.
    """
    siblings = inline.children
    # Identity comparison: first_link was taken from this same children list.
    for idx, candidate in enumerate(siblings):
        if candidate is first_link:
            trailing = siblings[idx + 1 :]
            if not trailing:
                return ""
            return _DESC_SEP_RE.sub("", render_inline_html(trailing))
    return ""
+
+
def _parse_list_entries(bullet_list: SyntaxTreeNode) -> list[ParsedEntry]:
    """Extract entries from a bullet_list AST node.

    Handles three patterns:
    - Text-only list_item -> subcategory label -> recurse into nested list
    - Link list_item with nested link-only items -> entry with also_see
    - Link list_item without nesting -> simple entry
    """
    entries: list[ParsedEntry] = []

    for item in bullet_list.children:
        if item.type != "list_item":
            continue

        inline = _find_inline(item)
        if inline is None:
            continue

        link = _find_first_link(inline)
        nested = _find_child(item, "bullet_list")  # side-effect free; safe to hoist

        if link is None:
            # No link: the item is a subcategory label; its real entries
            # live in the nested bullet_list (if any).
            if nested:
                entries.extend(_parse_list_entries(nested))
            continue

        # Nested link-only items become also_see cross-references.
        also_see: list[AlsoSee] = []
        if nested:
            for sub in nested.children:
                if sub.type != "list_item":
                    continue
                sub_inline = _find_inline(sub)
                sub_link = _find_first_link(sub_inline) if sub_inline else None
                if sub_link:
                    also_see.append(AlsoSee(
                        name=render_inline_text(sub_link.children),
                        url=sub_link.attrGet("href") or "",
                    ))

        entries.append(ParsedEntry(
            name=render_inline_text(link.children),
            url=link.attrGet("href") or "",
            description=_extract_description_html(inline, link),
            also_see=also_see,
        ))

    return entries
def _parse_section_entries(content_nodes: list[SyntaxTreeNode]) -> list[ParsedEntry]:
    """Extract all entries from a section's content nodes.

    Only top-level bullet_list nodes contribute entries; every other block
    type among *content_nodes* is ignored.
    """
    collected: list[ParsedEntry] = []
    for block in content_nodes:
        if block.type != "bullet_list":
            continue
        collected.extend(_parse_list_entries(block))
    return collected
+
+
+# --- Content HTML rendering (stub for Task 4) --------------------------------
def _render_section_html(content_nodes: list[SyntaxTreeNode]) -> str:
diff --git a/website/tests/test_readme_parser.py b/website/tests/test_readme_parser.py
index 3f32e844..f0f53e92 100644
--- a/website/tests/test_readme_parser.py
+++ b/website/tests/test_readme_parser.py
@@ -5,7 +5,7 @@ import sys
import textwrap
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
-from readme_parser import parse_readme, render_inline_html, render_inline_text
+from readme_parser import _parse_section_entries, parse_readme, render_inline_html, render_inline_text
from markdown_it import MarkdownIt
from markdown_it.tree import SyntaxTreeNode
@@ -200,3 +200,106 @@ class TestParseReadmeSections:
""")
cats, _ = parse_readme(readme)
assert cats[0]["description"] == "Algorithms. Also see awesome-algos."
+
+
def _content_nodes(md_text: str) -> list[SyntaxTreeNode]:
    """Helper: parse markdown and return all block nodes."""
    parser = MarkdownIt("commonmark")
    tree = SyntaxTreeNode(parser.parse(md_text))
    return tree.children
+
+
+class TestParseSectionEntries:
+ def test_flat_entries(self):
+ nodes = _content_nodes(
+ "- [django](https://example.com/d) - A web framework.\n"
+ "- [flask](https://example.com/f) - A micro framework.\n"
+ )
+ entries = _parse_section_entries(nodes)
+ assert len(entries) == 2
+ assert entries[0]["name"] == "django"
+ assert entries[0]["url"] == "https://example.com/d"
+ assert "web framework" in entries[0]["description"]
+ assert entries[0]["also_see"] == []
+ assert entries[1]["name"] == "flask"
+
+ def test_link_only_entry(self):
+ nodes = _content_nodes("- [tool](https://x.com)\n")
+ entries = _parse_section_entries(nodes)
+ assert len(entries) == 1
+ assert entries[0]["name"] == "tool"
+ assert entries[0]["description"] == ""
+
+ def test_subcategorized_entries(self):
+ nodes = _content_nodes(
+ "- Algorithms\n"
+ " - [algos](https://x.com/a) - Algo lib.\n"
+ " - [sorts](https://x.com/s) - Sort lib.\n"
+ "- Design Patterns\n"
+ " - [patterns](https://x.com/p) - Pattern lib.\n"
+ )
+ entries = _parse_section_entries(nodes)
+ assert len(entries) == 3
+ assert entries[0]["name"] == "algos"
+ assert entries[2]["name"] == "patterns"
+
+ def test_also_see_sub_entries(self):
+ nodes = _content_nodes(
+ "- [asyncio](https://docs.python.org/3/library/asyncio.html) - Async I/O.\n"
+ " - [awesome-asyncio](https://github.com/timofurrer/awesome-asyncio)\n"
+ "- [trio](https://github.com/python-trio/trio) - Friendly async.\n"
+ )
+ entries = _parse_section_entries(nodes)
+ assert len(entries) == 2
+ assert entries[0]["name"] == "asyncio"
+ assert len(entries[0]["also_see"]) == 1
+ assert entries[0]["also_see"][0]["name"] == "awesome-asyncio"
+ assert entries[1]["name"] == "trio"
+ assert entries[1]["also_see"] == []
+
+ def test_entry_count_includes_also_see(self):
+ readme = textwrap.dedent("""\
+ # T
+
+ ---
+
+ ## Async
+
+ - [asyncio](https://x.com) - Async I/O.
+ - [awesome-asyncio](https://y.com)
+ - [trio](https://z.com) - Friendly async.
+
+ # Contributing
+
+ Done.
+ """)
+ cats, _ = parse_readme(readme)
+ # 2 main entries + 1 also_see = 3
+ assert cats[0]["entry_count"] == 3
+
+ def test_preview_first_four_names(self):
+ readme = textwrap.dedent("""\
+ # T
+
+ ---
+
+ ## Libs
+
+ - [alpha](https://x.com) - A.
+ - [beta](https://x.com) - B.
+ - [gamma](https://x.com) - C.
+ - [delta](https://x.com) - D.
+ - [epsilon](https://x.com) - E.
+
+ # Contributing
+
+ Done.
+ """)
+ cats, _ = parse_readme(readme)
+ assert cats[0]["preview"] == "alpha, beta, gamma, delta"
+
+ def test_description_html_escapes_xss(self):
+ nodes = _content_nodes('- [lib](https://x.com) - A lib.\n')
+ entries = _parse_section_entries(nodes)
+ assert "