#!/usr/bin/env python3
"""Build a single-page HTML site from README.md for the awesome-python website."""

import json
import re
import shutil
from datetime import datetime, timezone
from pathlib import Path
from typing import TypedDict
from urllib.parse import urlsplit

from jinja2 import Environment, FileSystemLoader

from readme_parser import parse_readme, slugify


def group_categories(
    parsed_groups: list[dict],
    resources: list[dict],
) -> list[dict]:
    """Combine parsed groups with resources for template rendering.

    Args:
        parsed_groups: Group dicts to render. The list is copied, so the
            caller's list is never mutated.
        resources: Resource categories. When non-empty they are appended as
            one extra group named "Resources".

    Returns:
        A new list of group dicts.
    """
    groups = list(parsed_groups)
    if resources:
        groups.append(
            {
                "name": "Resources",
                "slug": slugify("Resources"),
                "categories": list(resources),
            }
        )
    return groups


class StarData(TypedDict):
    """Shape of one per-repository record in the stars JSON file."""

    stars: int
    owner: str
    last_commit_at: str
    fetched_at: str


# Matches a URL that points at the root of a GitHub repository and captures
# "owner/repo"; an optional ".git" suffix and trailing slash are tolerated.
GITHUB_REPO_URL_RE = re.compile(r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$")

# Known non-GitHub hosts and the label shown for entries hosted there.
SOURCE_TYPE_DOMAINS = {
    "docs.python.org": "Built-in",
    "gitlab.com": "GitLab",
    "bitbucket.org": "Bitbucket",
}


def _host_matches(host: str, domain: str) -> bool:
    """Return True if *host* is *domain* itself or one of its subdomains."""
    return host == domain or host.endswith("." + domain)


def detect_source_type(url: str) -> str | None:
    """Detect source type from URL domain. Returns None for GitHub URLs.

    The decision is based on the URL's hostname rather than a substring
    search over the whole URL, so a domain name that merely appears in a
    path or query string (or as part of a longer hostname such as
    ``notgitlab.com``) does not cause a misclassification.

    Args:
        url: The entry's link target.

    Returns:
        A label from ``SOURCE_TYPE_DOMAINS`` when the host is one of the known
        domains (or a subdomain of one), ``None`` when the host is
        ``github.com`` (or a subdomain), and ``"External"`` otherwise.
    """
    if GITHUB_REPO_URL_RE.match(url):
        return None

    try:
        host = urlsplit(url).hostname or ""
    except ValueError:
        # Malformed netloc (e.g. unbalanced IPv6 brackets): no usable host.
        host = ""

    for domain, source_type in SOURCE_TYPE_DOMAINS.items():
        if _host_matches(host, domain):
            return source_type

    # Non-repo GitHub pages (org pages, sub-paths, gists, ...) are still GitHub.
    if _host_matches(host, "github.com"):
        return None
    return "External"


def extract_github_repo(url: str) -> str | None:
    """Extract owner/repo from a GitHub repo URL. Returns None for non-GitHub URLs."""
    m = GITHUB_REPO_URL_RE.match(url)
    return m.group(1) if m else None


def load_stars(path: Path) -> dict[str, StarData]:
    """Load star data from JSON. Returns empty dict if file doesn't exist or is corrupt.

    "Corrupt" covers invalid JSON, bytes that are not valid UTF-8, and valid
    JSON whose top-level value is not an object.

    Args:
        path: Location of the stars JSON file.

    Returns:
        The decoded mapping, or ``{}`` when nothing usable could be loaded.
    """
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (FileNotFoundError, NotADirectoryError):
        # No cached data yet.
        return {}
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Unreadable content is treated the same as a missing file.
        return {}
    # Callers index the result by repo key, so anything but a dict is unusable.
    return data if isinstance(data, dict) else {}


def sort_entries(entries: list[dict]) -> list[dict]:
    """Sort entries by stars descending, then name ascending.

    Three tiers: starred entries first, stdlib second, other non-starred last.
    Among starred entries with equal stars, "Built-in" entries sort after the
    others; remaining ties are broken by lower-cased name.

    Args:
        entries: Entry dicts with ``"stars"`` (int or None), ``"name"`` and
            optionally ``"source_type"``.

    Returns:
        A new sorted list; the input list is left untouched.
    """

    def sort_key(entry: dict) -> tuple[int, int, int, str]:
        stars = entry["stars"]
        name = entry["name"].lower()
        if stars is not None:
            builtin = 1 if entry.get("source_type") == "Built-in" else 0
            # Negate stars so an ascending sort yields most-starred first.
            return (0, -stars, builtin, name)
        if entry.get("source_type") == "Built-in":
            return (1, 0, 0, name)
        return (2, 0, 0, name)

    return sorted(entries, key=sort_key)


def extract_entries(
    categories: list[dict],
    groups: list[dict],
) -> list[dict]:
    """Flatten categories into individual library entries for table display.

    Entries appearing in multiple categories are merged into a single entry
    with lists of categories and groups. Two entries are considered the same
    when both their URL and name match; the description and ``also_see`` of
    the first occurrence are kept.

    Args:
        categories: Category dicts with ``"name"`` and ``"entries"``.
        groups: Group dicts with ``"name"`` and ``"categories"``, used to find
            the group each category belongs to. Categories not found in any
            group are assigned to ``"Other"``.

    Returns:
        One dict per unique entry, in first-seen order. ``stars``, ``owner``
        and ``last_commit_at`` are initialised to ``None``.
    """
    cat_to_group: dict[str, str] = {}
    for group in groups:
        for cat in group["categories"]:
            cat_to_group[cat["name"]] = group["name"]

    seen: dict[tuple[str, str], dict] = {}  # (url, name) -> entry
    entries: list[dict] = []
    for cat in categories:
        group_name = cat_to_group.get(cat["name"], "Other")
        for entry in cat["entries"]:
            url = entry["url"]
            key = (url, entry["name"])
            if key in seen:
                existing = seen[key]
                if cat["name"] not in existing["categories"]:
                    existing["categories"].append(cat["name"])
                if group_name not in existing["groups"]:
                    existing["groups"].append(group_name)
            else:
                merged = {
                    "name": entry["name"],
                    "url": url,
                    "description": entry["description"],
                    "categories": [cat["name"]],
                    "groups": [group_name],
                    "stars": None,
                    "owner": None,
                    "last_commit_at": None,
                    "source_type": detect_source_type(url),
                    "also_see": entry["also_see"],
                }
                seen[key] = merged
                entries.append(merged)
    return entries


def build(repo_root: str) -> None:
    """Main build: parse README, render single-page HTML via Jinja2 templates.

    Reads ``README.md`` from *repo_root*, enriches the entries with cached
    star data from ``website/data/github_stars.json``, then recreates
    ``website/output`` containing ``index.html``, a copy of
    ``website/static`` (when present) and ``llms.txt`` (the raw README text).

    Args:
        repo_root: Path to the repository root containing ``README.md`` and
            the ``website`` directory.
    """
    repo = Path(repo_root)
    website = repo / "website"
    readme_text = (repo / "README.md").read_text(encoding="utf-8")

    # The subtitle is the first non-blank line that is not a Markdown heading.
    subtitle = ""
    for line in readme_text.split("\n"):
        stripped = line.strip()
        if stripped and not stripped.startswith("#"):
            subtitle = stripped
            break

    # Parsed resources are not rendered; empty lists are passed on instead.
    parsed_groups, _resources = parse_readme(readme_text)

    categories = [cat for g in parsed_groups for cat in g["categories"]]
    total_entries = sum(c["entry_count"] for c in categories)
    groups = group_categories(parsed_groups, [])
    entries = extract_entries(categories, groups)

    stars_data = load_stars(website / "data" / "github_stars.json")
    for entry in entries:
        repo_key = extract_github_repo(entry["url"])
        # Standard-library entries borrow the star data of the CPython repo.
        if not repo_key and entry.get("source_type") == "Built-in":
            repo_key = "python/cpython"
        if repo_key and repo_key in stars_data:
            sd = stars_data[repo_key]
            entry["stars"] = sd["stars"]
            entry["owner"] = sd["owner"]
            entry["last_commit_at"] = sd.get("last_commit_at", "")

    entries = sort_entries(entries)

    env = Environment(
        loader=FileSystemLoader(website / "templates"),
        autoescape=True,
    )

    # Start from a clean output directory on every build.
    site_dir = website / "output"
    if site_dir.exists():
        shutil.rmtree(site_dir)
    site_dir.mkdir(parents=True)

    tpl_index = env.get_template("index.html")
    (site_dir / "index.html").write_text(
        tpl_index.render(
            categories=categories,
            resources=[],
            groups=groups,
            subtitle=subtitle,
            entries=entries,
            total_entries=total_entries,
            total_categories=len(categories),
            build_date=datetime.now(timezone.utc).strftime("%B %d, %Y"),
        ),
        encoding="utf-8",
    )

    static_src = website / "static"
    static_dst = site_dir / "static"
    if static_src.exists():
        shutil.copytree(static_src, static_dst, dirs_exist_ok=True)

    (site_dir / "llms.txt").write_text(readme_text, encoding="utf-8")

    print(f"Built single page with {len(parsed_groups)} groups, {len(categories)} categories")
    print(f"Total entries: {total_entries}")
    print(f"Output: {site_dir}")


if __name__ == "__main__":
    build(str(Path(__file__).parent.parent))