refactor: parse thematic groups from README bold markers instead of hardcoding them

The website builder previously relied on a hardcoded SECTION_GROUPS list in
build.py to organize categories into thematic groups. This was fragile: any
rename or addition to README.md required a matching code change.

Replace this with a parser-driven approach:
- readme_parser.py now detects bold-only paragraphs (**Group Name**) as
  group boundary markers and collects the H2 categories beneath each
  marker into a ParsedGroup dict.
- build.py drops SECTION_GROUPS entirely; group_categories() now just
  passes parsed groups through and appends a Resources group when the
  parsed resources list is non-empty.
- sort.py is removed as it relied on the old flat section model.
- Tests updated throughout to reflect the new (groups, resources) return
  shape and to cover the new grouping logic.

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Vinta Chen
2026-03-20 18:43:09 +08:00
parent fd9b2665ed
commit 4322026817
5 changed files with 346 additions and 324 deletions
+11 -170
View File
@@ -10,179 +10,20 @@ from typing import TypedDict
from jinja2 import Environment, FileSystemLoader
from readme_parser import parse_readme, slugify
# Thematic grouping of categories. Each category name must match exactly
# as it appears in README.md (the ## heading text).
SECTION_GROUPS: list[tuple[str, list[str]]] = [
(
"Web & API",
[
"Admin Panels",
"CMS",
"Email",
"Static Site Generator",
"URL Manipulation",
"Web Frameworks",
"RESTful API",
"GraphQL",
"WebSocket",
"ASGI Servers",
"WSGI Servers",
"HTTP Clients",
"Template Engine",
"Web Asset Management",
"Web Content Extracting",
"Web Crawling",
],
),
(
"AI & ML",
[
"AI and Agents",
"Machine Learning",
"Deep Learning",
"Computer Vision",
"Natural Language Processing",
"Recommender Systems",
"Robotics",
],
),
(
"Data & Science",
[
"Data Analysis",
"Data Validation",
"Data Visualization",
"Geolocation",
"Science",
"Quantum Computing",
],
),
(
"DevOps & Infrastructure",
[
"DevOps Tools",
"Distributed Computing",
"Task Queues",
"Job Scheduler",
"Serverless Frameworks",
"Logging",
"Processes",
"Shell",
"Network Virtualization",
"RPC Servers",
],
),
(
"Database & Storage",
[
"Database",
"Database Drivers",
"ORM",
"Caching",
"Search",
"Serialization",
],
),
(
"Development Tools",
[
"Testing",
"Debugging Tools",
"Code Analysis",
"Build Tools",
"Algorithms and Design Patterns",
"Refactoring",
"Documentation",
"Editor Plugins and IDEs",
"Interactive Interpreter",
],
),
(
"CLI & GUI",
[
"Command-line Interface Development",
"Command-line Tools",
"GUI Development",
],
),
(
"Content & Media",
[
"Audio",
"Video",
"Game Development",
"Image Processing",
"Internationalization",
"HTML Manipulation",
"Text Processing",
"Specific Formats Processing",
"File Manipulation",
"Downloader",
],
),
(
"System & Runtime",
[
"Asynchronous Programming",
"Environment Management",
"Package Management",
"Package Repositories",
"Date and Time",
"Distribution",
"Hardware",
"Implementations",
"Microsoft Windows",
"Built-in Classes Enhancement",
"Functional Programming",
"Configuration Files",
],
),
(
"Security & Auth",
[
"Authentication",
"Cryptography",
"Penetration Testing",
"Permissions",
],
),
("Resources", []), # Filled dynamically from parsed resources
]
def group_categories(
categories: list[dict],
parsed_groups: list[dict],
resources: list[dict],
) -> list[dict]:
"""Organize categories and resources into thematic section groups."""
cat_by_name = {c["name"]: c for c in categories}
groups = []
grouped_names: set[str] = set()
"""Combine parsed groups with resources for template rendering."""
groups = list(parsed_groups)
for group_name, cat_names in SECTION_GROUPS:
grouped_names.update(cat_names)
if group_name == "Resources":
group_cats = list(resources)
else:
group_cats = [cat_by_name[n] for n in cat_names if n in cat_by_name]
if group_cats:
groups.append(
{
"name": group_name,
"slug": slugify(group_name),
"categories": group_cats,
}
)
# Any categories not in a group go into "Other"
ungrouped = [c for c in categories if c["name"] not in grouped_names]
if ungrouped:
if resources:
groups.append(
{
"name": "Other",
"slug": "other",
"categories": ungrouped,
"name": "Resources",
"slug": slugify("Resources"),
"categories": list(resources),
}
)
@@ -295,11 +136,11 @@ def build(repo_root: str) -> None:
subtitle = stripped
break
categories, resources = parse_readme(readme_text)
# All fields pre-computed: entry_count, content_html, preview, description
parsed_groups, resources = parse_readme(readme_text)
categories = [cat for g in parsed_groups for cat in g["categories"]]
total_entries = sum(c["entry_count"] for c in categories)
groups = group_categories(categories, resources)
groups = group_categories(parsed_groups, resources)
entries = extract_entries(categories, groups)
stars_data = load_stars(website / "data" / "github_stars.json")
@@ -344,7 +185,7 @@ def build(repo_root: str) -> None:
shutil.copy(repo / "README.md", site_dir / "llms.txt")
print(f"Built single page with {len(categories)} categories + {len(resources)} resources")
print(f"Built single page with {len(parsed_groups)} groups, {len(categories)} categories + {len(resources)} resources")
print(f"Total entries: {total_entries}")
print(f"Output: {site_dir}")