mirror of
https://github.com/vinta/awesome-python.git
synced 2026-03-24 18:53:33 +08:00
Replaces MkDocs with a bespoke Python site generator using Jinja2 templates and Markdown. Adds uv for dependency management, GitHub Actions workflow for deployment, and Makefile targets for local development (fetch_stars, build, preview, deploy). Co-Authored-By: Claude <noreply@anthropic.com>
503 lines
16 KiB
Python
503 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""Build a single-page HTML site from README.md for the awesome-python website."""
|
|
|
|
import json
|
|
import re
|
|
import shutil
|
|
from pathlib import Path
|
|
from typing import TypedDict
|
|
|
|
import markdown
|
|
from jinja2 import Environment, FileSystemLoader
|
|
|
|
# Thematic grouping of categories, as (group name, [category names]) pairs.
#
# Each category name must match exactly as it appears in README.md (the
# "## " heading text): group_categories() looks categories up by name, skips
# names it cannot find, and collects parsed categories that are not listed in
# any group here into a trailing "Other" group.  Groups are emitted in the
# order they appear in this list.
SECTION_GROUPS: list[tuple[str, list[str]]] = [
    ("Web & API", [
        "Web Frameworks", "RESTful API", "GraphQL", "WebSocket",
        "ASGI Servers", "WSGI Servers", "HTTP Clients", "Template Engine",
        "Web Asset Management", "Web Content Extracting", "Web Crawling",
    ]),
    ("Data & ML", [
        "Data Analysis", "Data Validation", "Data Visualization",
        "Machine Learning", "Deep Learning", "Computer Vision",
        "Natural Language Processing", "Recommender Systems", "Science",
        "Quantum Computing",
    ]),
    ("DevOps & Infrastructure", [
        "DevOps Tools", "Distributed Computing", "Task Queues",
        "Job Scheduler", "Serverless Frameworks", "Logging", "Processes",
        "Shell", "Network Virtualization", "RPC Servers",
    ]),
    ("Database & Storage", [
        "Database", "Database Drivers", "ORM", "Caching", "Search",
        "Serialization",
    ]),
    ("Development Tools", [
        "Testing", "Debugging Tools", "Code Analysis", "Build Tools",
        "Refactoring", "Documentation", "Editor Plugins and IDEs",
        "Interactive Interpreter",
    ]),
    ("CLI & GUI", [
        "Command-line Interface Development", "Command-line Tools",
        "GUI Development",
    ]),
    ("Content & Media", [
        "Audio", "Video", "Image Processing", "HTML Manipulation",
        "Text Processing", "Specific Formats Processing",
        "File Manipulation", "Downloader",
    ]),
    ("System & Runtime", [
        "Asynchronous Programming", "Environment Management",
        "Package Management", "Package Repositories", "Distribution",
        "Implementations", "Built-in Classes Enhancement",
        "Functional Programming", "Configuration Files",
    ]),
    ("Security & Auth", [
        "Authentication", "Cryptography", "Penetration Testing",
        "Permissions",
    ]),
    ("Specialized", [
        "CMS", "Admin Panels", "Email", "Game Development", "Geolocation",
        "Hardware", "Internationalization", "Date and Time",
        "URL Manipulation", "Robotics", "Microsoft Windows", "Miscellaneous",
        "Algorithms and Design Patterns", "Static Site Generator",
    ]),
    # Intentionally empty: group_categories() special-cases the "Resources"
    # group name and fills it with the parsed resource sections instead.
    ("Resources", []),
]
|
|
|
|
|
|
def slugify(name: str) -> str:
    """Turn a category or group name into a lowercase, dash-separated slug.

    Characters other than ``a-z``, ``0-9``, whitespace and ``-`` are removed,
    surrounding whitespace is trimmed, whitespace runs become a single dash,
    and consecutive dashes are collapsed.
    """
    # Drop disallowed characters first so that e.g. "Web & API" leaves only
    # whitespace between the words before dashes are inserted.
    cleaned = re.sub(r"[^a-z0-9\s-]", "", name.lower()).strip()
    dashed = re.sub(r"[\s]+", "-", cleaned)
    return re.sub(r"-+", "-", dashed)
|
|
|
|
|
|
def count_entries(content: str) -> int:
    """Return how many lines of *content* are linked list items.

    A line counts when, after optional leading whitespace, it starts with a
    ``-`` or ``*`` bullet followed by whitespace and an opening ``[``.
    """
    bullet_link = re.compile(r"\s*[-*]\s+\[")
    matching_rows = [row for row in content.split("\n") if bullet_link.match(row)]
    return len(matching_rows)
|
|
|
|
|
|
def extract_preview(content: str, *, max_names: int = 4) -> str:
    """Return the first *max_names* main library names, joined with ", ".

    Only linked list items indented by at most 3 characters are considered;
    subcategory labels (items without a link) and deeper sub-entries are
    skipped.

    Args:
        content: Markdown body of one category.
        max_names: Maximum number of names to include. Values <= 0 yield "".

    Returns:
        A comma-separated preview string, or "" when nothing qualifies.
    """
    if max_names <= 0:
        return ""

    names: list[str] = []
    # Indentation is matched with [ \t] rather than \s: under re.MULTILINE a
    # leading \s* can start on a preceding blank line and swallow its newline,
    # inflating the measured indent and wrongly discarding the entry.
    pattern = r"^([ \t]*)[-*][ \t]+\[([^\]]+)\]"
    for m in re.finditer(pattern, content, re.MULTILINE):
        if len(m.group(1)) > 3:
            continue
        names.append(m.group(2))
        if len(names) >= max_names:
            break
    return ", ".join(names)
|
|
|
|
|
|
def render_content_html(content: str) -> str:
    """Render category markdown content to HTML with subcategory detection.

    Each input line is classified independently:

    - A list item without any ``[`` (e.g. "- Synchronous") is a subcategory
      label and becomes ``<div class="subcat">``.
    - A list item of the form ``[name](url)`` with an optional
      ``- description`` becomes ``<div class="entry">``; when indented by
      more than 3 characters it becomes ``<div class="entry-sub">`` and only
      the link is emitted (any description is dropped).
    - Any other line produces no output.

    Indent levels in the README:
    - 0 spaces: top-level entry or subcategory label
    - 2 spaces: entry under a subcategory (still a main entry)
    - 4+ spaces: sub-entry (e.g., awesome-django under django)

    All text taken from the markdown (label, link text, URL, description) is
    HTML-escaped before interpolation so characters such as ``&``, ``<`` and
    ``"`` cannot break the markup or inject tags/attributes.

    Args:
        content: Markdown body of one category.

    Returns:
        The generated ``<div>`` elements joined by newlines.
    """
    from html import escape

    out: list[str] = []

    for line in content.split("\n"):
        stripped = line.strip()
        indent_len = len(line) - len(line.lstrip())

        # Subcategory labels: list items without links.
        m = re.match(r"^[-*]\s+(.+)$", stripped)
        if m and "[" not in stripped:
            out.append(f'<div class="subcat">{escape(m.group(1))}</div>')
            continue

        # Linked entry, with or without a trailing "- Description".
        # The description group is None for link-only entries.
        m = re.match(
            r"^\s*[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s*(?:[-\u2013\u2014]\s*(.+))?$",
            line,
        )
        if not m:
            continue

        name, url, desc = m.groups()
        link = f'<a href="{escape(url)}">{escape(name)}</a>'
        if indent_len > 3:
            out.append(f'<div class="entry-sub">{link}</div>')
        elif desc is not None:
            out.append(
                f'<div class="entry">'
                f"{link}"
                f'<span class="sep">—</span>{escape(desc)}'
                f"</div>"
            )
        else:
            out.append(f'<div class="entry">{link}</div>')

    return "\n".join(out)
|
|
|
|
|
|
def parse_readme(text: str) -> tuple[list[dict], list[dict]]:
    """Split README.md text into category sections and resource sections.

    Categories are the ``##`` sections found after the first ``---`` line
    (ignoring one on the very first line) and before the ``# Resources``
    heading.  Resources are the ``##`` sections from ``# Resources`` up to the
    ``# Contributing`` heading, or to the end of the text when that heading is
    absent.  When a heading occurs more than once, the last occurrence is used.

    Returns:
        (categories, resources) where each is a list of dicts with keys:
        name, slug, description, content.  Both lists are empty when the
        text contains no ``---`` separator.
    """
    all_lines = text.split("\n")

    separator_idx = next(
        (
            pos
            for pos, raw in enumerate(all_lines)
            if pos > 0 and raw.strip() == "---"
        ),
        None,
    )
    if separator_idx is None:
        return [], []

    resources_idx = None
    contributing_idx = None
    for pos, raw in enumerate(all_lines):
        heading = raw.strip()
        if heading == "# Resources":
            resources_idx = pos
        elif heading == "# Contributing":
            contributing_idx = pos

    if resources_idx is None:
        # No resources heading: everything after the separator is categories.
        category_lines = all_lines[separator_idx + 1 :]
        resource_lines = []
    else:
        category_lines = all_lines[separator_idx + 1 : resources_idx]
        res_end = len(all_lines) if contributing_idx is None else contributing_idx
        resource_lines = all_lines[resources_idx:res_end]

    categories = _extract_sections(category_lines, level=2)
    resources = _extract_sections(resource_lines, level=2)
    return categories, resources
|
|
|
|
|
|
def _extract_sections(lines: list[str], *, level: int) -> list[dict]:
    """Collect the heading-delimited sections found in *lines*.

    A section starts at a line beginning with *level* ``#`` characters and a
    space (``## `` for level 2) and runs until the next such heading.  Lines
    before the first heading are discarded.  Each section is turned into a
    dict by ``_build_section``.
    """
    marker = "#" * level + " "
    excluded_marker = marker + "#"

    found = []
    title = None
    body: list[str] = []

    for raw in lines:
        is_heading = raw.startswith(marker) and not raw.startswith(excluded_marker)
        if not is_heading:
            # Body text only counts once a section has been opened.
            if title is not None:
                body.append(raw)
            continue

        # A new heading closes the section that was being collected.
        if title is not None:
            found.append(_build_section(title, body))
        title = raw[len(marker) :].strip()
        body = []

    if title is not None:
        found.append(_build_section(title, body))

    return found
|
|
|
|
|
|
def _build_section(name: str, lines: list[str]) -> dict:
    """Assemble the dict for one section from its heading text and body lines.

    Blank lines at both ends of the body are dropped.  If the first remaining
    line is wrapped in underscores (``_like this_``), its inner text becomes
    the description and is removed from the content, along with any blank
    lines that follow it.

    Returns:
        A dict with keys ``name``, ``slug``, ``description`` and ``content``.
    """
    # Trim blank lines from both ends by moving indices inward.
    start, stop = 0, len(lines)
    while start < stop and not lines[start].strip():
        start += 1
    while stop > start and not lines[stop - 1].strip():
        stop -= 1
    body = lines[start:stop]

    description = ""
    if body:
        italic = re.match(r"^_(.+)_$", body[0].strip())
        if italic:
            description = italic.group(1)
            body = body[1:]
            while body and not body[0].strip():
                body = body[1:]

    content = "\n".join(body).strip()

    return {
        "name": name,
        "slug": slugify(name),
        "description": description,
        "content": content,
    }
|
|
|
|
|
|
def render_markdown(text: str) -> str:
    """Convert markdown *text* to HTML using the "extra" extension set."""
    converter = markdown.Markdown(extensions=["extra"])
    rendered = converter.convert(text)
    return rendered
|
|
|
|
|
|
def strip_markdown_links(text: str) -> str:
    """Return *text* with every ``[label](url)`` reduced to just ``label``.

    Intended for plain-text contexts where link markup should not appear.
    """
    inline_link = re.compile(r"\[([^\]]+)\]\([^)]+\)")
    return inline_link.sub(r"\1", text)
|
|
|
|
|
|
def render_inline_markdown(text: str) -> str:
    """Render a fragment of inline markdown (links, bold, italic) to HTML.

    The result is wrapped in ``Markup`` so it is treated as already-safe HTML.
    """
    from markupsafe import Markup

    rendered = markdown.markdown(text).strip()
    # The converter wraps output in <p>...</p>; remove it for inline use.
    unwrapped = re.sub(r"^<p>(.*)</p>$", r"\1", rendered)
    # Make every link open in a new tab without exposing window.opener.
    with_targets = unwrapped.replace("<a ", '<a target="_blank" rel="noopener" ')
    return Markup(with_targets)
|
|
|
|
|
|
def group_categories(
    categories: list[dict],
    resources: list[dict],
) -> list[dict]:
    """Arrange categories and resources into the groups of ``SECTION_GROUPS``.

    Groups are emitted in ``SECTION_GROUPS`` order and only when they end up
    non-empty.  The "Resources" group takes the parsed *resources* as-is.
    Categories not named in any group are collected into a final "Other"
    group.

    Returns:
        A list of dicts with keys ``name``, ``slug`` and ``categories``.
    """
    by_name = {cat["name"]: cat for cat in categories}
    result = []
    known_names = set()

    for title, members in SECTION_GROUPS:
        known_names.update(members)

        if title == "Resources":
            # Resources group uses parsed resources directly.
            selected = list(resources)
        else:
            selected = [by_name[member] for member in members if member in by_name]

        if not selected:
            continue
        result.append({
            "name": title,
            "slug": slugify(title),
            "categories": selected,
        })

    # Anything the table above does not mention ends up in "Other".
    leftovers = [cat for cat in categories if cat["name"] not in known_names]
    if leftovers:
        result.append({
            "name": "Other",
            "slug": "other",
            "categories": leftovers,
        })

    return result
|
|
|
|
|
|
class Entry(TypedDict):
|
|
name: str
|
|
url: str
|
|
description: str
|
|
category: str
|
|
group: str
|
|
stars: int | None
|
|
owner: str | None
|
|
pushed_at: str | None
|
|
|
|
|
|
class StarData(TypedDict):
    """Per-repository record stored in the GitHub stars JSON file.

    ``load_stars`` returns a mapping of these records; ``build`` looks them
    up by the "owner/repo" key produced by ``extract_github_repo``.
    """

    stars: int  # copied onto the matching entry's "stars"
    owner: str  # copied onto the matching entry's "owner"
    # Copied onto the matching entry's "pushed_at"; build() reads it with
    # .get(), so it tolerates records where this key is missing.
    pushed_at: str
    # Not read anywhere in this file; presumably the time the record was
    # fetched -- TODO confirm against the script that writes the JSON.
    fetched_at: str
|
|
|
|
|
|
GITHUB_REPO_URL_RE = re.compile(
|
|
r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$"
|
|
)
|
|
|
|
|
|
def extract_github_repo(url: str) -> str | None:
|
|
"""Extract owner/repo from a GitHub repo URL. Returns None for non-GitHub URLs."""
|
|
m = GITHUB_REPO_URL_RE.match(url)
|
|
return m.group(1) if m else None
|
|
|
|
|
|
def load_stars(path: Path) -> dict[str, StarData]:
    """Load star data from a JSON file.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded mapping, or an empty dict when the file does not exist,
        is not valid UTF-8 JSON, or does not contain a JSON object at the
        top level.
    """
    # Read first and handle absence afterwards: avoids the window between an
    # exists() check and the read in which the file could disappear.
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return {}
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError both derive from
        # ValueError; either one means the file content is corrupt.
        return {}

    # Valid JSON that is not an object (e.g. [] or null) cannot be used for
    # key lookups by callers, so treat it as corrupt too.
    return data if isinstance(data, dict) else {}
|
|
|
|
|
|
def sort_entries(entries: list[dict]) -> list[dict]:
    """Return a new list ordered by star count, highest first.

    Ties are broken by case-insensitive name, ascending.  Entries whose
    ``stars`` is ``None`` are placed after all starred entries.
    """

    def rank(item: dict) -> tuple[int, int, str]:
        star_count = item["stars"]
        lowered = item["name"].lower()
        # The leading 0/1 bucket keeps star-less entries behind starred ones.
        return (0, -star_count, lowered) if star_count is not None else (1, 0, lowered)

    ordered = list(entries)
    ordered.sort(key=rank)
    return ordered
|
|
|
|
|
|
def extract_entries(
    categories: list[dict],
    resources: list[dict],
    groups: list[dict],
) -> list[dict]:
    """Flatten the categories into one list of library entries for the table.

    Every linked list item indented by at most 3 characters becomes an entry.
    A link-only item indented deeper than the most recent entry of the same
    category is attached to that entry's ``also_see`` list instead.  Other
    lines, and deeper items that carry a description, are ignored.

    Args:
        categories: Parsed category dicts (``name`` and ``content`` are read).
        resources: Accepted for interface compatibility; not read here.
        groups: Group dicts used to map each category name to its group name.

    Returns:
        Entry dicts with ``stars``, ``owner`` and ``pushed_at`` set to
        ``None`` and ``also_see`` holding ``{"name", "url"}`` dicts.
    """
    link_only_re = re.compile(r"\s*[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s*$")
    entry_re = re.compile(
        r"\s*[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s*(?:[-\u2013\u2014]\s*(.+))?$"
    )

    group_of: dict[str, str] = {
        member["name"]: grp["name"]
        for grp in groups
        for member in grp["categories"]
    }

    flat: list[dict] = []
    for section in categories:
        owning_group = group_of.get(section["name"], "Other")
        # -1 means "no entry seen yet in this category", so a nested link can
        # never attach to an entry that belongs to a previous category.
        parent_indent = -1

        for raw in section["content"].split("\n"):
            depth = len(raw) - len(raw.lstrip())

            # Link-only sub-item deeper than its parent -> "also see".
            nested = link_only_re.match(raw)
            has_parent = parent_indent >= 0 and bool(flat)
            if nested and has_parent and depth > parent_indent:
                flat[-1]["also_see"].append({
                    "name": nested.group(1),
                    "url": nested.group(2),
                })
                continue

            if depth > 3:
                continue

            found = entry_re.match(raw)
            if not found:
                continue

            parent_indent = depth
            title, link, blurb = found.groups()
            flat.append({
                "name": title,
                "url": link,
                "description": render_inline_markdown(blurb) if blurb else "",
                "category": section["name"],
                "group": owning_group,
                "stars": None,
                "owner": None,
                "pushed_at": None,
                "also_see": [],
            })

    return flat
|
|
|
|
|
|
def build(repo_root: str) -> None:
    """Build the single-page site from README.md using Jinja2 templates.

    Reads ``README.md`` and ``website/data/github_stars.json`` under
    *repo_root*, renders ``website/templates/index.html`` and writes the
    result to ``website/output``.  Any existing output directory is deleted
    first.  ``website/static`` is copied into the output when it exists, and
    a ``CNAME`` file is written.  A short summary is printed at the end.

    Args:
        repo_root: Path of the directory containing ``README.md`` and the
            ``website`` directory.
    """
    repo = Path(repo_root)
    website = repo / "website"
    readme_text = (repo / "README.md").read_text(encoding="utf-8")

    # Subtitle is the first non-empty line that does not start with "#".
    subtitle = ""
    for line in readme_text.split("\n"):
        stripped = line.strip()
        if stripped and not stripped.startswith("#"):
            subtitle = stripped
            break

    categories, resources = parse_readme(readme_text)

    # Enrich each section dict in place with entry counts, rendered HTML,
    # previews, and link-free descriptions.  This runs before grouping, and
    # the groups hold these same dict objects, so they see the added keys.
    for cat in categories + resources:
        cat["entry_count"] = count_entries(cat["content"])
        cat["content_html"] = render_content_html(cat["content"])
        cat["preview"] = extract_preview(cat["content"])
        cat["description"] = strip_markdown_links(cat["description"])

    # Only categories contribute to the total; resources are not counted.
    total_entries = sum(c["entry_count"] for c in categories)

    # Organize into groups
    groups = group_categories(categories, resources)

    # Flatten entries for table view
    entries = extract_entries(categories, resources, groups)

    # Load GitHub star data and merge it into entries whose URL is a GitHub
    # repository URL with a matching record; other entries keep None values.
    stars_data = load_stars(website / "data" / "github_stars.json")
    for entry in entries:
        repo_key = extract_github_repo(entry["url"])
        if repo_key and repo_key in stars_data:
            entry["stars"] = stars_data[repo_key]["stars"]
            entry["owner"] = stars_data[repo_key]["owner"]
            # pushed_at falls back to "" when the record lacks the key.
            entry["pushed_at"] = stars_data[repo_key].get("pushed_at", "")

    # Sort by stars descending
    entries = sort_entries(entries)

    # Set up Jinja2 with autoescaping enabled for template variables.
    env = Environment(
        loader=FileSystemLoader(website / "templates"),
        autoescape=True,
    )

    # Output directory: start from a clean slate on every build.
    site_dir = website / "output"
    if site_dir.exists():
        shutil.rmtree(site_dir)
    site_dir.mkdir(parents=True)

    # Generate single index.html
    tpl_index = env.get_template("index.html")
    (site_dir / "index.html").write_text(
        tpl_index.render(
            categories=categories,
            resources=resources,
            groups=groups,
            subtitle=subtitle,
            entries=entries,
            total_entries=total_entries,
            total_categories=len(categories),
        ),
        encoding="utf-8",
    )

    # Copy static assets (skipped when the source directory is absent).
    static_src = website / "static"
    static_dst = site_dir / "static"
    if static_src.exists():
        shutil.copytree(static_src, static_dst)

    # Write CNAME
    (site_dir / "CNAME").write_text("awesome-python.com\n", encoding="utf-8")

    print(f"Built single page with {len(categories)} categories + {len(resources)} resources")
    print(f"Total entries: {total_entries}")
    print(f"Output: {site_dir}")
|
|
|
|
|
|
if __name__ == "__main__":
    # The repository root is taken to be the parent of this script's directory.
    script_dir = Path(__file__).parent
    build(str(script_dir.parent))
|