refactor: parse thematic groups from README bold markers instead of hardcoding them

The website builder previously relied on a hardcoded SECTION_GROUPS list in
build.py to organize categories into thematic groups. This was fragile: any
rename or addition to README.md required a matching code change.

Replace this with a parser-driven approach:
- readme_parser.py now detects bold-only paragraphs (**Group Name**) as
  group boundary markers and groups H2 categories beneath them into
  ParsedGroup structs.
- build.py drops SECTION_GROUPS entirely; group_categories() now just
  passes parsed groups through and appends the Resources group.
- sort.py is removed as it relied on the old flat section model.
- Tests updated throughout to reflect the new (groups, resources) return
  shape and to cover the new grouping logic.

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Vinta Chen
2026-03-20 18:43:09 +08:00
parent fd9b2665ed
commit 4322026817
5 changed files with 346 additions and 324 deletions

83
sort.py
View File

@@ -1,83 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
"""
The approach taken is explained below. I decided to do it simply.
Initially I was considering parsing the data into some sort of
structure and then generating an appropriate README. I am still
considering doing it - but for now this should work. The only issue
I see is that it only sorts the entries at the lowest level, and that
the order of the top-level contents does not match the order of the actual
entries.
This could be extended by having nested blocks, sorting them recursively
and flattening the end structure into a list of lines. Revision 2 maybe ^.^.
"""
def sort_blocks():
    """Sort the second-level (##) sections of README.md in place.

    The file is split at the '- - -' separator: everything before it is the
    table of contents, everything after is the content. Top-level '# '
    blocks keep their order, but the leading block is re-sorted by its
    '##' sub-sections.
    """
    # Load the current README into memory.
    with open('README.md', 'r') as read_me_file:
        read_me = read_me_file.read()
    # Split once instead of twice; the original also wrapped each segment
    # in a redundant ''.join(...), which is a no-op on a string.
    # NOTE: only the segment between the first and second '- - -' is kept,
    # matching the original behavior.
    parts = read_me.split('- - -')
    table_of_contents = parts[0]
    blocks = parts[1].split('\n# ')
    for i in range(len(blocks)):
        # Re-attach the '# ' prefix consumed by split (the first block has
        # no heading) and terminate every block with a newline.
        if i == 0:
            blocks[i] = blocks[i] + '\n'
        else:
            blocks[i] = '# ' + blocks[i] + '\n'
    # Sort the '##' sub-sections of the leading block.
    inner_blocks = sorted(blocks[0].split('##'))
    for i in range(1, len(inner_blocks)):
        # Restore the '##' prefix consumed by split.
        if inner_blocks[i][0] != '#':
            inner_blocks[i] = '##' + inner_blocks[i]
    # Replace the unsorted leading block with the sorted one and write the
    # re-assembled README back to disk.
    blocks[0] = ''.join(inner_blocks)
    final_readme = table_of_contents + '- - -' + ''.join(blocks)
    with open('README.md', 'w+') as sorted_file:
        sorted_file.write(final_readme)
def main():
    """Sort sortable line blocks of README.md, then sort its sections.

    Lines are clustered into blocks by indentation: consecutive link lines
    ('* [' / '- [') at the same indent form one sortable block; every other
    line is its own single-line block. Each block is sorted
    case-insensitively and written back, after which sort_blocks()
    re-orders the '##' sections.
    """
    # Load the current README into memory as a list of lines.
    with open('README.md', 'r') as read_me_file:
        read_me = read_me_file.readlines()
    # Cluster the lines into blocks. Only link lines ([...](...)) are meant
    # to be sorted; clustering is done by indentation depth.
    blocks = []
    last_indent = None
    for line in read_me:
        s_line = line.lstrip()
        indent = len(line) - len(s_line)
        # str.startswith accepts a tuple of prefixes — clearer and cheaper
        # than any([...]) over a list comprehension.
        if s_line.startswith(('* [', '- [')):
            if indent == last_indent:
                # Same-indent link line: extend the current sortable block.
                blocks[-1].append(line)
            else:
                blocks.append([line])
            last_indent = indent
        else:
            # Non-link lines form unsortable single-line blocks.
            blocks.append([line])
            last_indent = None
    with open('README.md', 'w+') as sorted_file:
        # Sort each block individually (case-insensitive)...
        blocks = [
            ''.join(sorted(block, key=str.lower)) for block in blocks
        ]
        # ...and write the result back to README.md.
        sorted_file.write(''.join(blocks))
    # Finally, re-sort the '##' sections.
    sort_blocks()


if __name__ == "__main__":
    main()

View File

@@ -10,179 +10,20 @@ from typing import TypedDict
from jinja2 import Environment, FileSystemLoader
from readme_parser import parse_readme, slugify
# Thematic grouping of categories. Each category name must match exactly
# as it appears in README.md (the ## heading text); a rename or addition
# in README.md therefore requires a matching update here. The trailing
# "Resources" group is a placeholder populated dynamically at build time.
SECTION_GROUPS: list[tuple[str, list[str]]] = [
    (
        "Web & API",
        [
            "Admin Panels",
            "CMS",
            "Email",
            "Static Site Generator",
            "URL Manipulation",
            "Web Frameworks",
            "RESTful API",
            "GraphQL",
            "WebSocket",
            "ASGI Servers",
            "WSGI Servers",
            "HTTP Clients",
            "Template Engine",
            "Web Asset Management",
            "Web Content Extracting",
            "Web Crawling",
        ],
    ),
    (
        "AI & ML",
        [
            "AI and Agents",
            "Machine Learning",
            "Deep Learning",
            "Computer Vision",
            "Natural Language Processing",
            "Recommender Systems",
            "Robotics",
        ],
    ),
    (
        "Data & Science",
        [
            "Data Analysis",
            "Data Validation",
            "Data Visualization",
            "Geolocation",
            "Science",
            "Quantum Computing",
        ],
    ),
    (
        "DevOps & Infrastructure",
        [
            "DevOps Tools",
            "Distributed Computing",
            "Task Queues",
            "Job Scheduler",
            "Serverless Frameworks",
            "Logging",
            "Processes",
            "Shell",
            "Network Virtualization",
            "RPC Servers",
        ],
    ),
    (
        "Database & Storage",
        [
            "Database",
            "Database Drivers",
            "ORM",
            "Caching",
            "Search",
            "Serialization",
        ],
    ),
    (
        "Development Tools",
        [
            "Testing",
            "Debugging Tools",
            "Code Analysis",
            "Build Tools",
            "Algorithms and Design Patterns",
            "Refactoring",
            "Documentation",
            "Editor Plugins and IDEs",
            "Interactive Interpreter",
        ],
    ),
    (
        "CLI & GUI",
        [
            "Command-line Interface Development",
            "Command-line Tools",
            "GUI Development",
        ],
    ),
    (
        "Content & Media",
        [
            "Audio",
            "Video",
            "Game Development",
            "Image Processing",
            "Internationalization",
            "HTML Manipulation",
            "Text Processing",
            "Specific Formats Processing",
            "File Manipulation",
            "Downloader",
        ],
    ),
    (
        "System & Runtime",
        [
            "Asynchronous Programming",
            "Environment Management",
            "Package Management",
            "Package Repositories",
            "Date and Time",
            "Distribution",
            "Hardware",
            "Implementations",
            "Microsoft Windows",
            "Built-in Classes Enhancement",
            "Functional Programming",
            "Configuration Files",
        ],
    ),
    (
        "Security & Auth",
        [
            "Authentication",
            "Cryptography",
            "Penetration Testing",
            "Permissions",
        ],
    ),
    ("Resources", []),  # Filled dynamically from parsed resources
]
def group_categories(
categories: list[dict],
parsed_groups: list[dict],
resources: list[dict],
) -> list[dict]:
"""Organize categories and resources into thematic section groups."""
cat_by_name = {c["name"]: c for c in categories}
groups = []
grouped_names: set[str] = set()
"""Combine parsed groups with resources for template rendering."""
groups = list(parsed_groups)
for group_name, cat_names in SECTION_GROUPS:
grouped_names.update(cat_names)
if group_name == "Resources":
group_cats = list(resources)
else:
group_cats = [cat_by_name[n] for n in cat_names if n in cat_by_name]
if group_cats:
groups.append(
{
"name": group_name,
"slug": slugify(group_name),
"categories": group_cats,
}
)
# Any categories not in a group go into "Other"
ungrouped = [c for c in categories if c["name"] not in grouped_names]
if ungrouped:
if resources:
groups.append(
{
"name": "Other",
"slug": "other",
"categories": ungrouped,
"name": "Resources",
"slug": slugify("Resources"),
"categories": list(resources),
}
)
@@ -295,11 +136,11 @@ def build(repo_root: str) -> None:
subtitle = stripped
break
categories, resources = parse_readme(readme_text)
# All fields pre-computed: entry_count, content_html, preview, description
parsed_groups, resources = parse_readme(readme_text)
categories = [cat for g in parsed_groups for cat in g["categories"]]
total_entries = sum(c["entry_count"] for c in categories)
groups = group_categories(categories, resources)
groups = group_categories(parsed_groups, resources)
entries = extract_entries(categories, groups)
stars_data = load_stars(website / "data" / "github_stars.json")
@@ -344,7 +185,7 @@ def build(repo_root: str) -> None:
shutil.copy(repo / "README.md", site_dir / "llms.txt")
print(f"Built single page with {len(categories)} categories + {len(resources)} resources")
print(f"Built single page with {len(parsed_groups)} groups, {len(categories)} categories + {len(resources)} resources")
print(f"Total entries: {total_entries}")
print(f"Output: {site_dir}")

View File

@@ -32,6 +32,12 @@ class ParsedSection(TypedDict):
content_html: str # rendered HTML, properly escaped
class ParsedGroup(TypedDict):
name: str
slug: str
categories: list[ParsedSection]
# --- Slugify ----------------------------------------------------------------
_SLUG_NON_ALNUM_RE = re.compile(r"[^a-z0-9\s-]")
@@ -305,6 +311,25 @@ def _render_section_html(content_nodes: list[SyntaxTreeNode]) -> str:
# --- Section splitting -------------------------------------------------------
def _build_section(name: str, body: list[SyntaxTreeNode]) -> ParsedSection:
    """Build a ParsedSection from a heading name and its body nodes."""
    description = _extract_description(body)
    # When a description paragraph was found it is the first body node;
    # everything after it is the section content.
    nodes = body[1:] if description else body
    parsed_entries = _parse_section_entries(nodes)
    # Count main entries plus every "also see" cross-reference.
    total = len(parsed_entries)
    for entry in parsed_entries:
        total += len(entry["also_see"])
    return ParsedSection(
        name=name,
        slug=slugify(name),
        description=description,
        entries=parsed_entries,
        entry_count=total,
        preview=", ".join(entry["name"] for entry in parsed_entries[:4]),
        content_html=_render_section_html(nodes),
    )
def _group_by_h2(
nodes: list[SyntaxTreeNode],
) -> list[ParsedSection]:
@@ -317,22 +342,7 @@ def _group_by_h2(
nonlocal current_name
if current_name is None:
return
desc = _extract_description(current_body)
content_nodes = current_body[1:] if desc else current_body
entries = _parse_section_entries(content_nodes)
entry_count = len(entries) + sum(len(e["also_see"]) for e in entries)
preview = ", ".join(e["name"] for e in entries[:4])
content_html = _render_section_html(content_nodes)
sections.append(ParsedSection(
name=current_name,
slug=slugify(current_name),
description=desc,
entries=entries,
entry_count=entry_count,
preview=preview,
content_html=content_html,
))
sections.append(_build_section(current_name, current_body))
current_name = None
for node in nodes:
@@ -347,10 +357,86 @@ def _group_by_h2(
return sections
def parse_readme(text: str) -> tuple[list[ParsedSection], list[ParsedSection]]:
"""Parse README.md text into categories and resources.
def _is_bold_marker(node: SyntaxTreeNode) -> str | None:
    """Detect a bold-only paragraph used as a group marker.

    Pattern: a paragraph whose only content is **Group Name** (possibly
    surrounded by empty text nodes in the AST).

    Returns the group name text, or None if not a group marker.
    """
    if node.type != "paragraph":
        return None
    for child in node.children:
        if child.type != "inline":
            continue
        # markdown-it inserts empty text nodes around <strong>; drop them.
        content = [
            c for c in child.children
            if c.type != "text" or c.content != ""
        ]
        if len(content) == 1 and content[0].type == "strong":
            return render_inline_text(content[0].children)
    return None
def _parse_grouped_sections(
    nodes: list[SyntaxTreeNode],
) -> list[ParsedGroup]:
    """Parse nodes into groups of categories using bold markers as group boundaries.

    Bold-only paragraphs (**Group Name**) delimit groups. H2 headings under
    each bold marker become categories within that group. Categories that
    appear before any bold marker go into an "Other" group.
    """
    result: list[ParsedGroup] = []
    group_name: str | None = None
    group_cats: list[ParsedSection] = []
    cat_name: str | None = None
    cat_body: list[SyntaxTreeNode] = []

    def close_category() -> None:
        # Finalize the category currently being accumulated, if any.
        nonlocal cat_name
        if cat_name is not None:
            group_cats.append(_build_section(cat_name, cat_body))
            cat_name = None

    def close_group() -> None:
        # Finalize the current group; groups without categories are dropped.
        nonlocal group_name, group_cats
        if group_cats:
            label = group_name if group_name else "Other"
            result.append(ParsedGroup(
                name=label,
                slug=slugify(label),
                categories=list(group_cats),
            ))
        group_name = None
        group_cats = []

    for node in nodes:
        marker = _is_bold_marker(node)
        if marker is not None:
            # A new group starts: close out whatever came before it.
            close_category()
            close_group()
            group_name = marker
            cat_body = []
        elif node.type == "heading" and node.tag == "h2":
            # A new category starts within the current group.
            close_category()
            cat_name = _heading_text(node)
            cat_body = []
        elif cat_name is not None:
            # Body node belonging to the category currently being read.
            cat_body.append(node)
    close_category()
    close_group()
    return result
def parse_readme(text: str) -> tuple[list[ParsedGroup], list[ParsedSection]]:
"""Parse README.md text into grouped categories and resources.
Returns (groups, resources) where groups is a list of ParsedGroup dicts
containing nested categories, and resources is a flat list of ParsedSection.
"""
md = MarkdownIt("commonmark")
tokens = md.parse(text)
@@ -382,7 +468,7 @@ def parse_readme(text: str) -> tuple[list[ParsedSection], list[ParsedSection]]:
res_end = contributing_idx or len(children)
res_nodes = children[resources_idx + 1 : res_end]
categories = _group_by_h2(cat_nodes)
groups = _parse_grouped_sections(cat_nodes)
resources = _group_by_h2(res_nodes)
return categories, resources
return groups, resources

View File

@@ -48,28 +48,33 @@ class TestSlugify:
class TestGroupCategories:
def test_groups_known_categories(self):
cats = [
{"name": "Web Frameworks", "slug": "web-frameworks"},
{"name": "Testing", "slug": "testing"},
def test_appends_resources(self):
parsed_groups = [
{"name": "G1", "slug": "g1", "categories": [{"name": "Cat1"}]},
]
groups = group_categories(cats, [])
group_names = [g["name"] for g in groups]
assert "Web & API" in group_names
assert "Development Tools" in group_names
def test_ungrouped_go_to_other(self):
cats = [{"name": "Unknown Category", "slug": "unknown-category"}]
groups = group_categories(cats, [])
group_names = [g["name"] for g in groups]
assert "Other" in group_names
def test_resources_grouped(self):
resources = [{"name": "Newsletters", "slug": "newsletters"}]
groups = group_categories([], resources)
groups = group_categories(parsed_groups, resources)
group_names = [g["name"] for g in groups]
assert "G1" in group_names
assert "Resources" in group_names
def test_no_resources_no_extra_group(self):
parsed_groups = [
{"name": "G1", "slug": "g1", "categories": [{"name": "Cat1"}]},
]
groups = group_categories(parsed_groups, [])
assert len(groups) == 1
assert groups[0]["name"] == "G1"
def test_preserves_group_order(self):
parsed_groups = [
{"name": "Second", "slug": "second", "categories": [{"name": "C2"}]},
{"name": "First", "slug": "first", "categories": [{"name": "C1"}]},
]
groups = group_categories(parsed_groups, [])
assert groups[0]["name"] == "Second"
assert groups[1]["name"] == "First"
# ---------------------------------------------------------------------------
# build (integration)
@@ -114,6 +119,8 @@ class TestBuild:
---
**Tools**
## Widgets
_Widget libraries._
@@ -176,10 +183,14 @@ class TestBuild:
---
**Group A**
## Alpha
- [a](https://x.com) - A.
**Group B**
## Beta
- [b](https://x.com) - B.
@@ -194,6 +205,8 @@ class TestBuild:
index_html = (tmp_path / "website" / "output" / "index.html").read_text()
assert "Alpha" in index_html
assert "Beta" in index_html
assert "Group A" in index_html
assert "Group B" in index_html
def test_index_contains_preview_text(self, tmp_path):
readme = textwrap.dedent("""\

View File

@@ -115,27 +115,74 @@ MINIMAL_README = textwrap.dedent("""\
""")
GROUPED_README = textwrap.dedent("""\
# Awesome Python
Some intro text.
---
**Group One**
## Alpha
_Libraries for alpha stuff._
- [lib-a](https://example.com/a) - Does A.
- [lib-b](https://example.com/b) - Does B.
**Group Two**
## Beta
_Tools for beta._
- [lib-c](https://example.com/c) - Does C.
## Gamma
- [lib-d](https://example.com/d) - Does D.
# Resources
Where to discover resources.
## Newsletters
- [News One](https://example.com/n1)
# Contributing
Please contribute!
""")
class TestParseReadmeSections:
def test_category_count(self):
cats, resources = parse_readme(MINIMAL_README)
assert len(cats) == 2
def test_ungrouped_categories_go_to_other(self):
groups, resources = parse_readme(MINIMAL_README)
assert len(groups) == 1
assert groups[0]["name"] == "Other"
assert len(groups[0]["categories"]) == 2
def test_resource_count(self):
cats, resources = parse_readme(MINIMAL_README)
assert len(resources) == 2
def test_category_names(self):
cats, _ = parse_readme(MINIMAL_README)
def test_ungrouped_category_names(self):
groups, _ = parse_readme(MINIMAL_README)
cats = groups[0]["categories"]
assert cats[0]["name"] == "Alpha"
assert cats[1]["name"] == "Beta"
def test_resource_count(self):
_, resources = parse_readme(MINIMAL_README)
assert len(resources) == 2
def test_category_slugs(self):
cats, _ = parse_readme(MINIMAL_README)
groups, _ = parse_readme(MINIMAL_README)
cats = groups[0]["categories"]
assert cats[0]["slug"] == "alpha"
assert cats[1]["slug"] == "beta"
def test_category_description(self):
cats, _ = parse_readme(MINIMAL_README)
groups, _ = parse_readme(MINIMAL_README)
cats = groups[0]["categories"]
assert cats[0]["description"] == "Libraries for alpha stuff."
assert cats[1]["description"] == "Tools for beta."
@@ -145,13 +192,16 @@ class TestParseReadmeSections:
assert resources[1]["name"] == "Podcasts"
def test_contributing_skipped(self):
cats, resources = parse_readme(MINIMAL_README)
all_names = [c["name"] for c in cats] + [r["name"] for r in resources]
groups, resources = parse_readme(MINIMAL_README)
all_names = []
for g in groups:
all_names.extend(c["name"] for c in g["categories"])
all_names.extend(r["name"] for r in resources)
assert "Contributing" not in all_names
def test_no_separator(self):
cats, resources = parse_readme("# Just a heading\n\nSome text.\n")
assert cats == []
groups, resources = parse_readme("# Just a heading\n\nSome text.\n")
assert groups == []
assert resources == []
def test_no_description(self):
@@ -174,7 +224,8 @@ class TestParseReadmeSections:
Done.
""")
cats, resources = parse_readme(readme)
groups, resources = parse_readme(readme)
cats = groups[0]["categories"]
assert cats[0]["description"] == ""
assert cats[0]["entries"][0]["name"] == "item"
@@ -194,10 +245,114 @@ class TestParseReadmeSections:
Done.
""")
cats, _ = parse_readme(readme)
groups, _ = parse_readme(readme)
cats = groups[0]["categories"]
assert cats[0]["description"] == "Algorithms. Also see awesome-algos."
class TestParseGroupedReadme:
def test_group_count(self):
groups, _ = parse_readme(GROUPED_README)
assert len(groups) == 2
def test_group_names(self):
groups, _ = parse_readme(GROUPED_README)
assert groups[0]["name"] == "Group One"
assert groups[1]["name"] == "Group Two"
def test_group_slugs(self):
groups, _ = parse_readme(GROUPED_README)
assert groups[0]["slug"] == "group-one"
assert groups[1]["slug"] == "group-two"
def test_group_one_has_one_category(self):
groups, _ = parse_readme(GROUPED_README)
assert len(groups[0]["categories"]) == 1
assert groups[0]["categories"][0]["name"] == "Alpha"
def test_group_two_has_two_categories(self):
groups, _ = parse_readme(GROUPED_README)
assert len(groups[1]["categories"]) == 2
assert groups[1]["categories"][0]["name"] == "Beta"
assert groups[1]["categories"][1]["name"] == "Gamma"
def test_resources_still_parsed(self):
_, resources = parse_readme(GROUPED_README)
assert len(resources) == 1
assert resources[0]["name"] == "Newsletters"
def test_empty_group_skipped(self):
readme = textwrap.dedent("""\
# T
---
**Empty**
**HasCats**
## Cat
- [x](https://x.com) - X.
# Contributing
Done.
""")
groups, _ = parse_readme(readme)
assert len(groups) == 1
assert groups[0]["name"] == "HasCats"
def test_bold_with_extra_text_not_group_marker(self):
readme = textwrap.dedent("""\
# T
---
**Note:** This is not a group marker.
## Cat
- [x](https://x.com) - X.
# Contributing
Done.
""")
groups, _ = parse_readme(readme)
# "Note:" has text after the strong node, so it's not a group marker
# Category goes into "Other"
assert len(groups) == 1
assert groups[0]["name"] == "Other"
def test_categories_before_any_group_marker(self):
readme = textwrap.dedent("""\
# T
---
## Orphan
- [x](https://x.com) - X.
**A Group**
## Grouped
- [y](https://x.com) - Y.
# Contributing
Done.
""")
groups, _ = parse_readme(readme)
assert len(groups) == 2
assert groups[0]["name"] == "Other"
assert groups[0]["categories"][0]["name"] == "Orphan"
assert groups[1]["name"] == "A Group"
assert groups[1]["categories"][0]["name"] == "Grouped"
def _content_nodes(md_text: str) -> list[SyntaxTreeNode]:
"""Helper: parse markdown and return all block nodes."""
md = MarkdownIt("commonmark")
@@ -283,7 +438,8 @@ class TestParseSectionEntries:
Done.
""")
cats, _ = parse_readme(readme)
groups, _ = parse_readme(readme)
cats = groups[0]["categories"]
# 2 main entries + 1 also_see = 3
assert cats[0]["entry_count"] == 3
@@ -305,7 +461,8 @@ class TestParseSectionEntries:
Done.
""")
cats, _ = parse_readme(readme)
groups, _ = parse_readme(readme)
cats = groups[0]["categories"]
assert cats[0]["preview"] == "alpha, beta, gamma, delta"
def test_description_html_escapes_xss(self):
@@ -366,10 +523,17 @@ class TestParseRealReadme:
readme_path = os.path.join(os.path.dirname(__file__), "..", "..", "README.md")
with open(readme_path, encoding="utf-8") as f:
self.readme_text = f.read()
self.cats, self.resources = parse_readme(self.readme_text)
self.groups, self.resources = parse_readme(self.readme_text)
self.cats = [c for g in self.groups for c in g["categories"]]
def test_at_least_83_categories(self):
assert len(self.cats) >= 83
def test_at_least_11_groups(self):
assert len(self.groups) >= 11
def test_first_group_is_ai_ml(self):
assert self.groups[0]["name"] == "AI & ML"
def test_at_least_76_categories(self):
assert len(self.cats) >= 76
def test_resources_has_newsletters_and_podcasts(self):
names = [r["name"] for r in self.resources]
@@ -380,21 +544,17 @@ class TestParseRealReadme:
all_names = [c["name"] for c in self.cats] + [r["name"] for r in self.resources]
assert "Contributing" not in all_names
def test_first_category_is_admin_panels(self):
assert self.cats[0]["name"] == "Admin Panels"
assert self.cats[0]["slug"] == "admin-panels"
def test_first_category_is_ai_and_agents(self):
assert self.cats[0]["name"] == "AI and Agents"
assert self.cats[0]["slug"] == "ai-and-agents"
def test_last_category_is_wsgi_servers(self):
assert self.cats[-1]["name"] == "WSGI Servers"
assert self.cats[-1]["slug"] == "wsgi-servers"
def test_restful_api_slug(self):
def test_web_apis_slug(self):
slugs = [c["slug"] for c in self.cats]
assert "restful-api" in slugs
assert "web-apis" in slugs
def test_descriptions_extracted(self):
admin = self.cats[0]
assert admin["description"] == "Libraries for administrative interfaces."
ai = next(c for c in self.cats if c["name"] == "AI and Agents")
assert "AI applications" in ai["description"]
def test_entry_counts_nonzero(self):
for cat in self.cats:
@@ -422,3 +582,8 @@ class TestParseRealReadme:
algos = next(c for c in self.cats if c["name"] == "Algorithms and Design Patterns")
assert "awesome-algorithms" in algos["description"]
assert "https://" not in algos["description"]
def test_miscellaneous_in_own_group(self):
misc_group = next((g for g in self.groups if g["name"] == "Miscellaneous"), None)
assert misc_group is not None
assert any(c["name"] == "Miscellaneous" for c in misc_group["categories"])