#!/usr/bin/env python3
"""Build a single-page HTML site from README.md for the awesome-python website."""
import json
import re
import shutil
from pathlib import Path
from typing import TypedDict
from jinja2 import Environment, FileSystemLoader
from readme_parser import parse_readme, slugify
# Section groups, in the order group_categories() emits them.
#
# Each item pairs a group title with the README category names it contains
# (the text of the `##` headings in README.md). Names are matched exactly:
# a name that is not found among the parsed categories is skipped, and a
# parsed category that is listed nowhere here ends up in the "Other" group.
SECTION_GROUPS: list[tuple[str, list[str]]] = [
    ("Web & API", [
        "Web Frameworks",
        "RESTful API",
        "GraphQL",
        "WebSocket",
        "ASGI Servers",
        "WSGI Servers",
        "HTTP Clients",
        "Template Engine",
        "Web Asset Management",
        "Web Content Extracting",
        "Web Crawling",
    ]),
    ("Data & ML", [
        "Data Analysis",
        "Data Validation",
        "Data Visualization",
        "Machine Learning",
        "Deep Learning",
        "Computer Vision",
        "Natural Language Processing",
        "Recommender Systems",
        "Science",
        "Quantum Computing",
    ]),
    ("DevOps & Infrastructure", [
        "DevOps Tools",
        "Distributed Computing",
        "Task Queues",
        "Job Scheduler",
        "Serverless Frameworks",
        "Logging",
        "Processes",
        "Shell",
        "Network Virtualization",
        "RPC Servers",
    ]),
    ("Database & Storage", [
        "Database",
        "Database Drivers",
        "ORM",
        "Caching",
        "Search",
        "Serialization",
    ]),
    ("Development Tools", [
        "Testing",
        "Debugging Tools",
        "Code Analysis",
        "Build Tools",
        "Refactoring",
        "Documentation",
        "Editor Plugins and IDEs",
        "Interactive Interpreter",
    ]),
    ("CLI & GUI", [
        "Command-line Interface Development",
        "Command-line Tools",
        "GUI Development",
    ]),
    ("Content & Media", [
        "Audio",
        "Video",
        "Image Processing",
        "HTML Manipulation",
        "Text Processing",
        "Specific Formats Processing",
        "File Manipulation",
        "Downloader",
    ]),
    ("System & Runtime", [
        "Asynchronous Programming",
        "Environment Management",
        "Package Management",
        "Package Repositories",
        "Distribution",
        "Implementations",
        "Built-in Classes Enhancement",
        "Functional Programming",
        "Configuration Files",
    ]),
    ("Security & Auth", [
        "Authentication",
        "Cryptography",
        "Penetration Testing",
        "Permissions",
    ]),
    ("Specialized", [
        "CMS",
        "Admin Panels",
        "Email",
        "Game Development",
        "Geolocation",
        "Hardware",
        "Internationalization",
        "Date and Time",
        "URL Manipulation",
        "Robotics",
        "Microsoft Windows",
        "Miscellaneous",
        "Algorithms and Design Patterns",
        "Static Site Generator",
    ]),
    # Deliberately empty: this group is populated at build time from the
    # resources parsed out of the README.
    ("Resources", []),
]
def group_categories(
    categories: list[dict],
    resources: list[dict],
) -> list[dict]:
    """Bucket parsed categories and resources into the configured section groups.

    Groups come out in SECTION_GROUPS order. A group with no members is left
    out. The group titled "Resources" ignores its configured names and takes
    a copy of ``resources`` instead. Categories that no group lists are
    collected, in their original order, into a trailing "Other" group.

    Args:
        categories: Parsed categories; each must have a ``"name"`` key.
        resources: Parsed resource sections.

    Returns:
        A list of dicts with ``"name"``, ``"slug"`` and ``"categories"`` keys.
    """
    by_name = {category["name"]: category for category in categories}
    sections: list[dict] = []
    claimed: set[str] = set()

    for title, member_names in SECTION_GROUPS:
        # Record every configured name, even ones missing from the README,
        # so the "Other" check below only sees truly unlisted categories.
        claimed.update(member_names)
        members = (
            list(resources)
            if title == "Resources"
            else [by_name[name] for name in member_names if name in by_name]
        )
        if not members:
            continue
        sections.append(
            {
                "name": title,
                "slug": slugify(title),
                "categories": members,
            }
        )

    leftovers = [category for category in categories if category["name"] not in claimed]
    if leftovers:
        sections.append(
            {
                "name": "Other",
                "slug": "other",
                "categories": leftovers,
            }
        )
    return sections
class Entry(TypedDict):
name: str
url: str
description: str
category: str
group: str
stars: int | None
owner: str | None
last_commit_at: str | None
class StarData(TypedDict):
    """Per-repository record stored in the star-data JSON read by ``load_stars``.

    ``build`` looks records up by the ``owner/repo`` string returned from
    ``extract_github_repo`` and copies ``stars``, ``owner`` and
    ``last_commit_at`` onto the matching entry.
    """

    stars: int
    owner: str
    # NOTE(review): build() reads this key with .get(..., ""), which suggests
    # some data files may lack it - confirm whether it is really required.
    last_commit_at: str
    # Not read anywhere in this file's visible code.
    fetched_at: str
GITHUB_REPO_URL_RE = re.compile(r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$")
def extract_github_repo(url: str) -> str | None:
"""Extract owner/repo from a GitHub repo URL. Returns None for non-GitHub URLs."""
m = GITHUB_REPO_URL_RE.match(url)
return m.group(1) if m else None
def load_stars(path: Path) -> dict[str, StarData]:
    """Load cached star data from a JSON file, tolerating a missing or bad file.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded top-level JSON object. An empty dict is returned when the
        file does not exist, is not valid UTF-8, is not valid JSON, or decodes
        to something other than an object (for example a list or ``null``).
    """
    if not path.exists():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError):
        # A corrupt file is treated the same as no file: the site still
        # builds, just without star data.
        return {}
    # Callers do membership tests and item lookups on the result, so anything
    # other than a JSON object is unusable and treated as corrupt too.
    return data if isinstance(data, dict) else {}
def sort_entries(entries: list[dict]) -> list[dict]:
    """Return a new list ordered by star count (highest first), then name.

    Entries whose ``"stars"`` value is None are placed after every starred
    entry. Names are compared case-insensitively and break ties. The input
    list is not modified.
    """

    def rank(item: dict) -> tuple[bool, int, str]:
        star_count = item["stars"]
        unstarred = star_count is None
        # False sorts before True, so starred entries come first; negating
        # the count turns the ascending sort into "most stars first".
        return (unstarred, 0 if unstarred else -star_count, item["name"].lower())

    return sorted(entries, key=rank)
def extract_entries(
    categories: list[dict],
    groups: list[dict],
) -> list[dict]:
    """Expand every category into one flat row per library entry.

    Each row carries the entry's own fields plus the name of its category and
    of the group containing that category ("Other" when no group does). The
    ``stars``, ``owner`` and ``last_commit_at`` fields are initialised to None.

    Args:
        categories: Parsed categories, each with ``"name"`` and ``"entries"``.
        groups: Section groups, each with ``"name"`` and ``"categories"``.

    Returns:
        Rows in category order, then entry order within each category.
    """
    # If a category name appears in several groups, the last group wins.
    group_of: dict[str, str] = {
        member["name"]: section["name"]
        for section in groups
        for member in section["categories"]
    }

    rows: list[dict] = []
    for category in categories:
        section_name = group_of.get(category["name"], "Other")
        rows.extend(
            {
                "name": item["name"],
                "url": item["url"],
                "description": item["description"],
                "category": category["name"],
                "group": section_name,
                "stars": None,
                "owner": None,
                "last_commit_at": None,
                "also_see": item["also_see"],
            }
            for item in category["entries"]
        )
    return rows
def build(repo_root: str) -> None:
    """Build the single-page site for the repository rooted at ``repo_root``.

    Reads ``README.md`` from the repository root, merges cached GitHub star
    data into the flattened entries, renders ``website/templates/index.html``
    into a freshly recreated ``website/output`` directory, then copies the
    static assets (if any) and the README (as ``llms.txt``) alongside it.

    Args:
        repo_root: Path to the repository root, containing ``README.md`` and
            the ``website`` directory.
    """
    root = Path(repo_root)
    website = root / "website"
    readme_path = root / "README.md"
    readme_text = readme_path.read_text(encoding="utf-8")

    # Subtitle is the first non-blank line that is not a Markdown heading;
    # it stays empty when the README has no such line.
    subtitle = next(
        (
            text
            for line in readme_text.split("\n")
            if (text := line.strip()) and not text.startswith("#")
        ),
        "",
    )

    categories, resources = parse_readme(readme_text)
    # parse_readme is expected to return categories with derived fields such
    # as entry_count already computed, so the total is a plain sum.
    total_entries = sum(category["entry_count"] for category in categories)
    groups = group_categories(categories, resources)
    entries = extract_entries(categories, groups)

    # Attach cached star data to entries whose URL is a GitHub repo root.
    stars_data = load_stars(website / "data" / "github_stars.json")
    for row in entries:
        repo_key = extract_github_repo(row["url"])
        if not repo_key or repo_key not in stars_data:
            continue
        cached = stars_data[repo_key]
        row["stars"] = cached["stars"]
        row["owner"] = cached["owner"]
        row["last_commit_at"] = cached.get("last_commit_at", "")
    entries = sort_entries(entries)

    env = Environment(
        loader=FileSystemLoader(website / "templates"),
        autoescape=True,
    )

    # Start from an empty output directory on every build.
    site_dir = website / "output"
    if site_dir.exists():
        shutil.rmtree(site_dir)
    site_dir.mkdir(parents=True)

    index_template = env.get_template("index.html")
    index_path = site_dir / "index.html"
    page_html = index_template.render(
        categories=categories,
        resources=resources,
        groups=groups,
        subtitle=subtitle,
        entries=entries,
        total_entries=total_entries,
        total_categories=len(categories),
    )
    index_path.write_text(page_html, encoding="utf-8")

    static_src = website / "static"
    if static_src.exists():
        shutil.copytree(static_src, site_dir / "static", dirs_exist_ok=True)

    # The raw README is published unchanged as llms.txt.
    shutil.copy(readme_path, site_dir / "llms.txt")

    print(f"Built single page with {len(categories)} categories + {len(resources)} resources")
    print(f"Total entries: {total_entries}")
    print(f"Output: {site_dir}")
if __name__ == "__main__":
    # The directory one level above this script's own directory is passed to
    # build() as the repository root.
    script_dir = Path(__file__).parent
    build(str(script_dir.parent))