Files
awesome-python/website/build.py
Vinta Chen 280f250ce0 feat: migrate README parser to markdown-it-py and refresh website
Switch readme_parser.py from regex-based parsing to markdown-it-py for
more robust and maintainable Markdown AST traversal. Update build pipeline,
templates, styles, and JS to support the new parser output. Refresh GitHub
stars data and update tests to match new parser behavior.

Co-Authored-By: Claude <noreply@anthropic.com>
2026-03-18 20:33:36 +08:00

251 lines
7.9 KiB
Python

#!/usr/bin/env python3
"""Build a single-page HTML site from README.md for the awesome-python website."""
import json
import re
import shutil
from pathlib import Path
from typing import TypedDict
from jinja2 import Environment, FileSystemLoader
from readme_parser import parse_readme, slugify
# Thematic grouping of categories. Each category name must match exactly
# as it appears in README.md (the ## heading text).
# The order of groups here is the order group_categories() emits them,
# i.e. the order sections appear in the rendered page.
# Any README category not listed in a group falls into a synthetic
# "Other" group (see group_categories).
SECTION_GROUPS: list[tuple[str, list[str]]] = [
    ("Web & API", [
        "Web Frameworks", "RESTful API", "GraphQL", "WebSocket",
        "ASGI Servers", "WSGI Servers", "HTTP Clients", "Template Engine",
        "Web Asset Management", "Web Content Extracting", "Web Crawling",
    ]),
    ("Data & ML", [
        "Data Analysis", "Data Validation", "Data Visualization",
        "Machine Learning", "Deep Learning", "Computer Vision",
        "Natural Language Processing", "Recommender Systems", "Science",
        "Quantum Computing",
    ]),
    ("DevOps & Infrastructure", [
        "DevOps Tools", "Distributed Computing", "Task Queues",
        "Job Scheduler", "Serverless Frameworks", "Logging", "Processes",
        "Shell", "Network Virtualization", "RPC Servers",
    ]),
    ("Database & Storage", [
        "Database", "Database Drivers", "ORM", "Caching", "Search",
        "Serialization",
    ]),
    ("Development Tools", [
        "Testing", "Debugging Tools", "Code Analysis", "Build Tools",
        "Refactoring", "Documentation", "Editor Plugins and IDEs",
        "Interactive Interpreter",
    ]),
    ("CLI & GUI", [
        "Command-line Interface Development", "Command-line Tools",
        "GUI Development",
    ]),
    ("Content & Media", [
        "Audio", "Video", "Image Processing", "HTML Manipulation",
        "Text Processing", "Specific Formats Processing",
        "File Manipulation", "Downloader",
    ]),
    ("System & Runtime", [
        "Asynchronous Programming", "Environment Management",
        "Package Management", "Package Repositories", "Distribution",
        "Implementations", "Built-in Classes Enhancement",
        "Functional Programming", "Configuration Files",
    ]),
    ("Security & Auth", [
        "Authentication", "Cryptography", "Penetration Testing",
        "Permissions",
    ]),
    ("Specialized", [
        "CMS", "Admin Panels", "Email", "Game Development", "Geolocation",
        "Hardware", "Internationalization", "Date and Time",
        "URL Manipulation", "Robotics", "Microsoft Windows", "Miscellaneous",
        "Algorithms and Design Patterns", "Static Site Generator",
    ]),
    ("Resources", []),  # Filled dynamically from parsed resources
]
def group_categories(
    categories: list[dict],
    resources: list[dict],
) -> list[dict]:
    """Organize parsed categories and resources into thematic section groups.

    Groups follow SECTION_GROUPS order; the special "Resources" group is
    populated from *resources* rather than from category names. Categories
    that appear in no group are collected into a trailing "Other" group.
    Empty groups are omitted.
    """
    by_name = {category["name"]: category for category in categories}
    assigned: set[str] = set()
    result: list[dict] = []

    for title, member_names in SECTION_GROUPS:
        assigned.update(member_names)
        if title == "Resources":
            # Resources come straight from the parser, not from categories.
            members = list(resources)
        else:
            members = [by_name[n] for n in member_names if n in by_name]
        if not members:
            continue
        result.append({
            "name": title,
            "slug": slugify(title),
            "categories": members,
        })

    # Anything the groups above did not claim lands in "Other".
    leftovers = [c for c in categories if c["name"] not in assigned]
    if leftovers:
        result.append({
            "name": "Other",
            "slug": "other",
            "categories": leftovers,
        })
    return result
class Entry(TypedDict):
    """Shape of one flattened library entry built by extract_entries.

    NOTE(review): extract_entries also attaches an "also_see" key to each
    entry dict that is not declared here — confirm its type against the
    parser output and add it so this TypedDict stays accurate.
    """
    name: str  # library name as written in README.md
    url: str  # project homepage or repository URL
    description: str
    category: str  # the ## heading the entry appeared under
    group: str  # thematic section group name (see SECTION_GROUPS)
    stars: int | None  # GitHub stars; None when no cached star data exists
    owner: str | None  # GitHub repo owner; None when no cached star data
    last_commit_at: str | None  # filled from cache; "" when the field is absent there
class StarData(TypedDict):
    """One record of cached GitHub metadata, keyed by "owner/repo" in the
    JSON file loaded by load_stars (website/data/github_stars.json)."""
    stars: int
    owner: str
    last_commit_at: str  # presumably an ISO timestamp — format not visible here; confirm
    fetched_at: str  # when this record was fetched from GitHub
GITHUB_REPO_URL_RE = re.compile(
r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$"
)
def extract_github_repo(url: str) -> str | None:
"""Extract owner/repo from a GitHub repo URL. Returns None for non-GitHub URLs."""
m = GITHUB_REPO_URL_RE.match(url)
return m.group(1) if m else None
def load_stars(path: Path) -> dict[str, StarData]:
    """Load cached GitHub star data from *path*.

    Returns an empty dict when the file is missing or contains invalid
    JSON, so callers can always treat the result as best-effort data.
    """
    # EAFP: read and handle the failure instead of an exists() check,
    # which races with concurrent deletion of the file (TOCTOU).
    try:
        raw = path.read_text(encoding="utf-8")
    except FileNotFoundError:
        return {}
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return {}
def sort_entries(entries: list[dict]) -> list[dict]:
    """Return entries sorted by stars descending, then name ascending.

    Entries with no star data (stars is None) always sort after starred
    ones. The input list is not modified.
    """
    return sorted(
        entries,
        key=lambda e: (
            e["stars"] is None,   # starless entries go last
            -(e["stars"] or 0),   # more stars first
            e["name"].lower(),    # ties broken alphabetically, case-insensitive
        ),
    )
def extract_entries(
    categories: list[dict],
    groups: list[dict],
) -> list[dict]:
    """Flatten categories into individual library entries for table display.

    Each entry dict carries its category and group name plus placeholder
    star fields (filled in later from cached GitHub data). Categories not
    found in any group are labelled "Other".
    """
    # Map each category name to the name of the group that contains it.
    group_of = {
        cat["name"]: grp["name"]
        for grp in groups
        for cat in grp["categories"]
    }

    flattened: list[dict] = []
    for category in categories:
        group_name = group_of.get(category["name"], "Other")
        flattened.extend(
            {
                "name": item["name"],
                "url": item["url"],
                "description": item["description"],
                "category": category["name"],
                "group": group_name,
                "stars": None,
                "owner": None,
                "last_commit_at": None,
                "also_see": item["also_see"],
            }
            for item in category["entries"]
        )
    return flattened
def _extract_subtitle(readme_text: str) -> str:
    """Return the first non-empty, non-heading line of the README.

    Used as the site subtitle; returns "" when no such line exists.
    """
    for line in readme_text.split("\n"):
        stripped = line.strip()
        if stripped and not stripped.startswith("#"):
            return stripped
    return ""


def _apply_star_data(entries: list[dict], stars_data: dict) -> None:
    """Fill stars/owner/last_commit_at on GitHub-hosted entries, in place.

    Entries whose URL is not a bare GitHub repo URL, or that have no cached
    record, keep their None placeholders.
    """
    for entry in entries:
        repo_key = extract_github_repo(entry["url"])
        if repo_key and repo_key in stars_data:
            sd = stars_data[repo_key]
            entry["stars"] = sd["stars"]
            entry["owner"] = sd["owner"]
            # Older cache records may predate this field; default to "".
            entry["last_commit_at"] = sd.get("last_commit_at", "")


def build(repo_root: str) -> None:
    """Main build: parse README, render single-page HTML via Jinja2 templates.

    Reads <repo_root>/README.md, renders the site into
    <repo_root>/website/output/ (recreated from scratch on every run),
    and copies website/static/ into the output directory.
    """
    repo = Path(repo_root)
    website = repo / "website"
    readme_text = (repo / "README.md").read_text(encoding="utf-8")
    subtitle = _extract_subtitle(readme_text)

    categories, resources = parse_readme(readme_text)
    # All fields pre-computed by the parser: entry_count, content_html,
    # preview, description.
    total_entries = sum(c["entry_count"] for c in categories)
    groups = group_categories(categories, resources)
    entries = extract_entries(categories, groups)

    _apply_star_data(entries, load_stars(website / "data" / "github_stars.json"))
    entries = sort_entries(entries)

    env = Environment(
        loader=FileSystemLoader(website / "templates"),
        autoescape=True,
    )

    # Rebuild the output directory from scratch so stale files never linger.
    site_dir = website / "output"
    if site_dir.exists():
        shutil.rmtree(site_dir)
    site_dir.mkdir(parents=True)

    tpl_index = env.get_template("index.html")
    (site_dir / "index.html").write_text(
        tpl_index.render(
            categories=categories,
            resources=resources,
            groups=groups,
            subtitle=subtitle,
            entries=entries,
            total_entries=total_entries,
            total_categories=len(categories),
        ),
        encoding="utf-8",
    )

    static_src = website / "static"
    static_dst = site_dir / "static"
    if static_src.exists():
        shutil.copytree(static_src, static_dst, dirs_exist_ok=True)

    print(f"Built single page with {len(categories)} categories + {len(resources)} resources")
    print(f"Total entries: {total_entries}")
    print(f"Output: {site_dir}")
if __name__ == "__main__":
    # This script lives in <repo>/website/, so the repo root is the
    # grandparent of this file.
    repo_root = Path(__file__).parent.parent
    build(str(repo_root))