Files
awesome-python/website/build.py
Vinta Chen 4322026817 refactor: parse thematic groups from README bold markers instead of hardcoding them
The website builder previously relied on a hardcoded SECTION_GROUPS list in
build.py to organize categories into thematic groups. This was fragile: any
rename or addition to README.md required a matching code change.

Replace this with a parser-driven approach:
- readme_parser.py now detects bold-only paragraphs (**Group Name**) as
  group boundary markers and groups H2 categories beneath them into
  ParsedGroup structs.
- build.py drops SECTION_GROUPS entirely; group_categories() now just
  passes parsed groups through and appends the Resources group.
- sort.py is removed as it relied on the old flat section model.
- Tests updated throughout to reflect the new (groups, resources) return
  shape and to cover the new grouping logic.

Co-Authored-By: Claude <noreply@anthropic.com>
2026-03-20 18:43:09 +08:00

195 lines
5.7 KiB
Python

#!/usr/bin/env python3
"""Build a single-page HTML site from README.md for the awesome-python website."""
import json
import re
import shutil
from pathlib import Path
from typing import TypedDict
from jinja2 import Environment, FileSystemLoader
from readme_parser import parse_readme, slugify
def group_categories(
parsed_groups: list[dict],
resources: list[dict],
) -> list[dict]:
"""Combine parsed groups with resources for template rendering."""
groups = list(parsed_groups)
if resources:
groups.append(
{
"name": "Resources",
"slug": slugify("Resources"),
"categories": list(resources),
}
)
return groups
class Entry(TypedDict):
name: str
url: str
description: str
category: str
group: str
stars: int | None
owner: str | None
last_commit_at: str | None
class StarData(TypedDict):
stars: int
owner: str
last_commit_at: str
fetched_at: str
GITHUB_REPO_URL_RE = re.compile(r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$")
def extract_github_repo(url: str) -> str | None:
"""Extract owner/repo from a GitHub repo URL. Returns None for non-GitHub URLs."""
m = GITHUB_REPO_URL_RE.match(url)
return m.group(1) if m else None
def load_stars(path: Path) -> dict[str, StarData]:
"""Load star data from JSON. Returns empty dict if file doesn't exist or is corrupt."""
if path.exists():
try:
return json.loads(path.read_text(encoding="utf-8"))
except json.JSONDecodeError:
return {}
return {}
def sort_entries(entries: list[dict]) -> list[dict]:
"""Sort entries by stars descending, then name ascending. No-star entries go last."""
def sort_key(entry: dict) -> tuple[int, int, str]:
stars = entry["stars"]
name = entry["name"].lower()
if stars is None:
return (1, 0, name)
return (0, -stars, name)
return sorted(entries, key=sort_key)
def extract_entries(
categories: list[dict],
groups: list[dict],
) -> list[dict]:
"""Flatten categories into individual library entries for table display.
Entries appearing in multiple categories are merged into a single entry
with lists of categories and groups.
"""
cat_to_group: dict[str, str] = {}
for group in groups:
for cat in group["categories"]:
cat_to_group[cat["name"]] = group["name"]
seen: dict[str, dict] = {} # url -> entry
entries: list[dict] = []
for cat in categories:
group_name = cat_to_group.get(cat["name"], "Other")
for entry in cat["entries"]:
url = entry["url"]
if url in seen:
existing = seen[url]
if cat["name"] not in existing["categories"]:
existing["categories"].append(cat["name"])
if group_name not in existing["groups"]:
existing["groups"].append(group_name)
else:
merged = {
"name": entry["name"],
"url": url,
"description": entry["description"],
"categories": [cat["name"]],
"groups": [group_name],
"stars": None,
"owner": None,
"last_commit_at": None,
"also_see": entry["also_see"],
}
seen[url] = merged
entries.append(merged)
return entries
def build(repo_root: str) -> None:
"""Main build: parse README, render single-page HTML via Jinja2 templates."""
repo = Path(repo_root)
website = repo / "website"
readme_text = (repo / "README.md").read_text(encoding="utf-8")
subtitle = ""
for line in readme_text.split("\n"):
stripped = line.strip()
if stripped and not stripped.startswith("#"):
subtitle = stripped
break
parsed_groups, resources = parse_readme(readme_text)
categories = [cat for g in parsed_groups for cat in g["categories"]]
total_entries = sum(c["entry_count"] for c in categories)
groups = group_categories(parsed_groups, resources)
entries = extract_entries(categories, groups)
stars_data = load_stars(website / "data" / "github_stars.json")
for entry in entries:
repo_key = extract_github_repo(entry["url"])
if repo_key and repo_key in stars_data:
sd = stars_data[repo_key]
entry["stars"] = sd["stars"]
entry["owner"] = sd["owner"]
entry["last_commit_at"] = sd.get("last_commit_at", "")
entries = sort_entries(entries)
env = Environment(
loader=FileSystemLoader(website / "templates"),
autoescape=True,
)
site_dir = website / "output"
if site_dir.exists():
shutil.rmtree(site_dir)
site_dir.mkdir(parents=True)
tpl_index = env.get_template("index.html")
(site_dir / "index.html").write_text(
tpl_index.render(
categories=categories,
resources=resources,
groups=groups,
subtitle=subtitle,
entries=entries,
total_entries=total_entries,
total_categories=len(categories),
),
encoding="utf-8",
)
static_src = website / "static"
static_dst = site_dir / "static"
if static_src.exists():
shutil.copytree(static_src, static_dst, dirs_exist_ok=True)
shutil.copy(repo / "README.md", site_dir / "llms.txt")
print(f"Built single page with {len(parsed_groups)} groups, {len(categories)} categories + {len(resources)} resources")
print(f"Total entries: {total_entries}")
print(f"Output: {site_dir}")
if __name__ == "__main__":
build(str(Path(__file__).parent.parent))