#!/usr/bin/env python3
"""Build a single-page HTML site from README.md for the awesome-python website."""

import json
import re
import shutil
from pathlib import Path
from typing import TypedDict

from jinja2 import Environment, FileSystemLoader

from readme_parser import parse_readme, slugify

# Thematic grouping of categories. Each category name must match exactly
# as it appears in README.md (the ## heading text).
SECTION_GROUPS: list[tuple[str, list[str]]] = [
    ("Web & API", [
        "Web Frameworks",
        "RESTful API",
        "GraphQL",
        "WebSocket",
        "ASGI Servers",
        "WSGI Servers",
        "HTTP Clients",
        "Template Engine",
        "Web Asset Management",
        "Web Content Extracting",
        "Web Crawling",
    ]),
    ("Data & ML", [
        "Data Analysis",
        "Data Validation",
        "Data Visualization",
        "Machine Learning",
        "Deep Learning",
        "Computer Vision",
        "Natural Language Processing",
        "Recommender Systems",
        "Science",
        "Quantum Computing",
    ]),
    ("DevOps & Infrastructure", [
        "DevOps Tools",
        "Distributed Computing",
        "Task Queues",
        "Job Scheduler",
        "Serverless Frameworks",
        "Logging",
        "Processes",
        "Shell",
        "Network Virtualization",
        "RPC Servers",
    ]),
    ("Database & Storage", [
        "Database",
        "Database Drivers",
        "ORM",
        "Caching",
        "Search",
        "Serialization",
    ]),
    ("Development Tools", [
        "Testing",
        "Debugging Tools",
        "Code Analysis",
        "Build Tools",
        "Refactoring",
        "Documentation",
        "Editor Plugins and IDEs",
        "Interactive Interpreter",
    ]),
    ("CLI & GUI", [
        "Command-line Interface Development",
        "Command-line Tools",
        "GUI Development",
    ]),
    ("Content & Media", [
        "Audio",
        "Video",
        "Image Processing",
        "HTML Manipulation",
        "Text Processing",
        "Specific Formats Processing",
        "File Manipulation",
        "Downloader",
    ]),
    ("System & Runtime", [
        "Asynchronous Programming",
        "Environment Management",
        "Package Management",
        "Package Repositories",
        "Distribution",
        "Implementations",
        "Built-in Classes Enhancement",
        "Functional Programming",
        "Configuration Files",
    ]),
    ("Security & Auth", [
        "Authentication",
        "Cryptography",
        "Penetration Testing",
        "Permissions",
    ]),
    ("Specialized", [
        "CMS",
        "Admin Panels",
        "Email",
        "Game Development",
        "Geolocation",
        "Hardware",
        "Internationalization",
        "Date and Time",
        "URL Manipulation",
        "Robotics",
        "Microsoft Windows",
        "Miscellaneous",
        "Algorithms and Design Patterns",
        "Static Site Generator",
    ]),
    ("Resources", []),  # Filled dynamically from parsed resources
]


def group_categories(
    categories: list[dict],
    resources: list[dict],
) -> list[dict]:
    """Organize categories and resources into thematic section groups.

    Args:
        categories: Parsed README categories (each a dict with at least a
            ``name`` key).
        resources: Parsed resource sections; these populate the "Resources"
            group verbatim.

    Returns:
        A list of group dicts (``name``, ``slug``, ``categories``). Empty
        groups are dropped; categories not named in ``SECTION_GROUPS`` are
        collected into a trailing "Other" group.
    """
    cat_by_name = {c["name"]: c for c in categories}
    groups = []
    grouped_names: set[str] = set()
    for group_name, cat_names in SECTION_GROUPS:
        grouped_names.update(cat_names)
        if group_name == "Resources":
            group_cats = list(resources)
        else:
            # Silently skip names missing from the README so a renamed
            # heading degrades to "Other" rather than crashing the build.
            group_cats = [cat_by_name[n] for n in cat_names if n in cat_by_name]
        if group_cats:
            groups.append({
                "name": group_name,
                "slug": slugify(group_name),
                "categories": group_cats,
            })
    # Any categories not in a group go into "Other"
    ungrouped = [c for c in categories if c["name"] not in grouped_names]
    if ungrouped:
        groups.append({
            "name": "Other",
            "slug": "other",
            "categories": ungrouped,
        })
    return groups


class Entry(TypedDict):
    """One library entry flattened for table display."""

    name: str
    url: str
    description: str
    category: str
    group: str
    stars: int | None  # None until GitHub star data is attached
    owner: str | None
    last_commit_at: str | None


class StarData(TypedDict):
    """Shape of one record in website/data/github_stars.json."""

    stars: int
    owner: str
    last_commit_at: str
    fetched_at: str


# Matches canonical GitHub repository URLs, capturing "owner/repo";
# tolerates an optional ".git" suffix and trailing slash.
GITHUB_REPO_URL_RE = re.compile(
    r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$"
)


def extract_github_repo(url: str) -> str | None:
    """Extract owner/repo from a GitHub repo URL. Returns None for non-GitHub URLs."""
    m = GITHUB_REPO_URL_RE.match(url)
    return m.group(1) if m else None


def load_stars(path: Path) -> dict[str, StarData]:
    """Load star data from JSON.

    Returns empty dict if file doesn't exist or is corrupt."""
    # EAFP: a single read attempt avoids the exists()/read race and also
    # covers permission errors and a directory sitting at ``path``.
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, json.JSONDecodeError):
        return {}
    # A file holding valid JSON that is not an object (e.g. a list) is
    # corrupt for our purposes — callers index it by repo key.
    return data if isinstance(data, dict) else {}


def sort_entries(entries: list[dict]) -> list[dict]:
    """Sort entries by stars descending, then name ascending. No-star entries go last."""

    def sort_key(entry: dict) -> tuple[int, int, str]:
        stars = entry["stars"]
        name = entry["name"].lower()
        if stars is None:
            # Leading 1 pushes unstarred entries after every starred one.
            return (1, 0, name)
        return (0, -stars, name)

    return sorted(entries, key=sort_key)


def extract_entries(
    categories: list[dict],
    groups: list[dict],
) -> list[dict]:
    """Flatten categories into individual library entries for table display.

    Star fields start as None and are filled in later from the cached
    GitHub data (see ``load_stars``).
    """
    cat_to_group: dict[str, str] = {}
    for group in groups:
        for cat in group["categories"]:
            cat_to_group[cat["name"]] = group["name"]
    entries: list[dict] = []
    for cat in categories:
        group_name = cat_to_group.get(cat["name"], "Other")
        for entry in cat["entries"]:
            entries.append({
                "name": entry["name"],
                "url": entry["url"],
                "description": entry["description"],
                "category": cat["name"],
                "group": group_name,
                "stars": None,
                "owner": None,
                "last_commit_at": None,
                "also_see": entry["also_see"],
            })
    return entries


def _extract_subtitle(readme_text: str) -> str:
    """Return the first non-empty, non-heading line of the README, or ""."""
    for line in readme_text.split("\n"):
        stripped = line.strip()
        if stripped and not stripped.startswith("#"):
            return stripped
    return ""


def _attach_star_data(
    entries: list[dict],
    stars_data: dict[str, StarData],
) -> None:
    """Fill stars/owner/last_commit_at in place for GitHub-hosted entries."""
    for entry in entries:
        repo_key = extract_github_repo(entry["url"])
        if repo_key and repo_key in stars_data:
            sd = stars_data[repo_key]
            entry["stars"] = sd["stars"]
            entry["owner"] = sd["owner"]
            # Older cache files may predate this field; default to "".
            entry["last_commit_at"] = sd.get("last_commit_at", "")


def _render_site(website: Path, context: dict) -> Path:
    """Render index.html into a fresh output dir and copy static assets.

    Returns the output directory path.
    """
    env = Environment(
        loader=FileSystemLoader(website / "templates"),
        autoescape=True,
    )
    site_dir = website / "output"
    # Rebuild from scratch so stale files never survive a rename.
    if site_dir.exists():
        shutil.rmtree(site_dir)
    site_dir.mkdir(parents=True)
    tpl_index = env.get_template("index.html")
    (site_dir / "index.html").write_text(
        tpl_index.render(**context),
        encoding="utf-8",
    )
    static_src = website / "static"
    static_dst = site_dir / "static"
    if static_src.exists():
        shutil.copytree(static_src, static_dst, dirs_exist_ok=True)
    return site_dir


def build(repo_root: str) -> None:
    """Main build: parse README, render single-page HTML via Jinja2 templates."""
    repo = Path(repo_root)
    website = repo / "website"
    readme_text = (repo / "README.md").read_text(encoding="utf-8")
    subtitle = _extract_subtitle(readme_text)

    categories, resources = parse_readme(readme_text)
    # All fields pre-computed: entry_count, content_html, preview, description
    total_entries = sum(c["entry_count"] for c in categories)
    groups = group_categories(categories, resources)

    entries = extract_entries(categories, groups)
    stars_data = load_stars(website / "data" / "github_stars.json")
    _attach_star_data(entries, stars_data)
    entries = sort_entries(entries)

    site_dir = _render_site(website, {
        "categories": categories,
        "resources": resources,
        "groups": groups,
        "subtitle": subtitle,
        "entries": entries,
        "total_entries": total_entries,
        "total_categories": len(categories),
    })

    print(f"Built single page with {len(categories)} categories + {len(resources)} resources")
    print(f"Total entries: {total_entries}")
    print(f"Output: {site_dir}")


if __name__ == "__main__":
    build(str(Path(__file__).parent.parent))