#!/usr/bin/env python3
"""Build a single-page HTML site from README.md for the awesome-python website."""

import html
import json
import re
import shutil
from pathlib import Path
from typing import TypedDict

import markdown
from jinja2 import Environment, FileSystemLoader

# Thematic grouping of categories. Each category name must match exactly
# as it appears in README.md (the ## heading text).
SECTION_GROUPS: list[tuple[str, list[str]]] = [
    ("Web & API", [
        "Web Frameworks",
        "RESTful API",
        "GraphQL",
        "WebSocket",
        "ASGI Servers",
        "WSGI Servers",
        "HTTP Clients",
        "Template Engine",
        "Web Asset Management",
        "Web Content Extracting",
        "Web Crawling",
    ]),
    ("Data & ML", [
        "Data Analysis",
        "Data Validation",
        "Data Visualization",
        "Machine Learning",
        "Deep Learning",
        "Computer Vision",
        "Natural Language Processing",
        "Recommender Systems",
        "Science",
        "Quantum Computing",
    ]),
    ("DevOps & Infrastructure", [
        "DevOps Tools",
        "Distributed Computing",
        "Task Queues",
        "Job Scheduler",
        "Serverless Frameworks",
        "Logging",
        "Processes",
        "Shell",
        "Network Virtualization",
        "RPC Servers",
    ]),
    ("Database & Storage", [
        "Database",
        "Database Drivers",
        "ORM",
        "Caching",
        "Search",
        "Serialization",
    ]),
    ("Development Tools", [
        "Testing",
        "Debugging Tools",
        "Code Analysis",
        "Build Tools",
        "Refactoring",
        "Documentation",
        "Editor Plugins and IDEs",
        "Interactive Interpreter",
    ]),
    ("CLI & GUI", [
        "Command-line Interface Development",
        "Command-line Tools",
        "GUI Development",
    ]),
    ("Content & Media", [
        "Audio",
        "Video",
        "Image Processing",
        "HTML Manipulation",
        "Text Processing",
        "Specific Formats Processing",
        "File Manipulation",
        "Downloader",
    ]),
    ("System & Runtime", [
        "Asynchronous Programming",
        "Environment Management",
        "Package Management",
        "Package Repositories",
        "Distribution",
        "Implementations",
        "Built-in Classes Enhancement",
        "Functional Programming",
        "Configuration Files",
    ]),
    ("Security & Auth", [
        "Authentication",
        "Cryptography",
        "Penetration Testing",
        "Permissions",
    ]),
    ("Specialized", [
        "CMS",
        "Admin Panels",
        "Email",
        "Game Development",
        "Geolocation",
        "Hardware",
        "Internationalization",
        "Date and Time",
        "URL Manipulation",
        "Robotics",
        "Microsoft Windows",
        "Miscellaneous",
        "Algorithms and Design Patterns",
        "Static Site Generator",
    ]),
    ("Resources", []),  # Filled dynamically from parsed resources
]

# Compiled once at import time; each of these is applied to every README line.
# A list item whose text starts with a markdown link: "* [" or "- [".
_LINK_ITEM_RE = re.compile(r"\s*[-*]\s+\[")
# Same, capturing indent and link text. The indent class is "[ \t]" rather than
# "\s" so that, under MULTILINE, preceding blank lines are not counted as indent.
_PREVIEW_NAME_RE = re.compile(r"^([ \t]*)[-*]\s+\[([^\]]+)\]", re.MULTILINE)
# Any list item, applied to an already-stripped line.
_LABEL_RE = re.compile(r"^[-*]\s+(.+)$")
# "* [name](url) - Description." (hyphen, en dash or em dash as separator).
_ENTRY_WITH_DESC_RE = re.compile(
    r"^\s*[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s*[-\u2013\u2014]\s*(.+)$"
)
# "* [name](url)" with nothing after the link.
_ENTRY_LINK_ONLY_RE = re.compile(r"^\s*[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s*$")
# "* [name](url)" with an optional " - Description." tail.
_ENTRY_RE = re.compile(
    r"\s*[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s*(?:[-\u2013\u2014]\s*(.+))?$"
)
# An "_italic description_" line directly under a section heading.
_SECTION_DESCRIPTION_RE = re.compile(r"^_(.+)_$")


def slugify(name: str) -> str:
    """Convert a category name to a URL-friendly slug."""
    slug = name.lower()
    # Drop everything except ASCII letters, digits, whitespace and hyphens.
    slug = re.sub(r"[^a-z0-9\s-]", "", slug)
    slug = re.sub(r"[\s]+", "-", slug.strip())
    # Collapse runs of hyphens (e.g. "a - b" would otherwise give "a---b").
    slug = re.sub(r"-+", "-", slug)
    return slug


def count_entries(content: str) -> int:
    """Count library entries (lines starting with * [ or - [) in a content block."""
    return sum(1 for line in content.split("\n") if _LINK_ITEM_RE.match(line))


def extract_preview(content: str, *, max_names: int = 4) -> str:
    """Extract first N main library names from markdown content for preview text.

    Only includes top-level or single-indent entries (indent <= 3 spaces),
    skipping subcategory labels (items without links) and deep sub-entries.
    Returns the names joined by ", " (empty string when there are none).
    """
    names: list[str] = []
    for m in _PREVIEW_NAME_RE.finditer(content):
        if len(m.group(1)) > 3:
            continue
        names.append(m.group(2))
        if len(names) >= max_names:
            break
    return ", ".join(names)


def _entry_html(css_class: str, name: str, url: str, desc: str | None = None) -> str:
    """Render one linked entry as an HTML <div>, escaping all README text."""
    # NOTE(review): the HTML tags and class names in this helper and in
    # render_content_html were lost from the source under review and have been
    # reconstructed ("subcat", "entry", "entry-sub", "desc"). Confirm them
    # against the site's templates and stylesheet before merging.
    link = f'<a href="{html.escape(url)}">{html.escape(name)}</a>'
    tail = f'<span class="desc">{html.escape(desc)}</span>' if desc is not None else ""
    return f'<div class="{css_class}">{link}{tail}</div>'


def render_content_html(content: str) -> str:
    """Render category markdown content to HTML with subcategory detection.

    Lines that are list items without links (e.g., "- Synchronous") are
    treated as subcategory headers and rendered as dividers.

    Indent levels in the README:
    - 0 spaces: top-level entry or subcategory label
    - 2 spaces: entry under a subcategory (still a main entry)
    - 4+ spaces: sub-entry (e.g., awesome-django under django)

    Lines matching none of the recognised shapes are dropped. Text taken from
    the README is HTML-escaped before being placed into the markup.
    """
    out: list[str] = []

    for line in content.split("\n"):
        stripped = line.strip()
        indent_len = len(line) - len(line.lstrip())
        is_sub_entry = indent_len > 3

        # Detect subcategory labels: list items without links
        label_match = _LABEL_RE.match(stripped)
        if label_match and "[" not in stripped:
            out.append(f'<div class="subcat">{html.escape(label_match.group(1))}</div>')
            continue

        # Entry with link and description: * [name](url) - Description.
        m = _ENTRY_WITH_DESC_RE.match(line)
        if m:
            name, url, desc = m.groups()
            if is_sub_entry:
                # Sub-entries are rendered as a bare link; the description is dropped.
                out.append(_entry_html("entry-sub", name, url))
            else:
                out.append(_entry_html("entry", name, url, desc))
            continue

        # Link-only entry (no description): * [name](url)
        m = _ENTRY_LINK_ONLY_RE.match(line)
        if m:
            name, url = m.groups()
            out.append(_entry_html("entry-sub" if is_sub_entry else "entry", name, url))

    return "\n".join(out)


def parse_readme(text: str) -> tuple[list[dict], list[dict]]:
    """Parse README.md text into categories and resources.

    Categories are the "## " sections between the first "---" line (ignoring
    line 0) and the "# Resources" heading; resources are the "## " sections
    from "# Resources" up to "# Contributing" (or the end of the file).

    Returns:
        (categories, resources) where each is a list of dicts with keys:
        name, slug, description, content. Both lists are empty when the
        README has no "---" separator.
    """
    lines = text.split("\n")

    separator_idx = next(
        (i for i, line in enumerate(lines) if i > 0 and line.strip() == "---"),
        None,
    )
    if separator_idx is None:
        return [], []

    # If a heading occurs more than once, the last occurrence wins.
    resources_idx = None
    contributing_idx = None
    for i, line in enumerate(lines):
        heading = line.strip()
        if heading == "# Resources":
            resources_idx = i
        elif heading == "# Contributing":
            contributing_idx = i

    cat_end = resources_idx if resources_idx is not None else len(lines)
    category_lines = lines[separator_idx + 1 : cat_end]

    resource_lines: list[str] = []
    if resources_idx is not None:
        res_end = contributing_idx if contributing_idx is not None else len(lines)
        resource_lines = lines[resources_idx:res_end]

    categories = _extract_sections(category_lines, level=2)
    resources = _extract_sections(resource_lines, level=2)
    return categories, resources


def _extract_sections(lines: list[str], *, level: int) -> list[dict]:
    """Extract sections introduced by a heading of ``level`` '#' characters.

    Lines before the first matching heading are ignored. Each section is
    returned as the dict produced by ``_build_section``.
    """
    prefix = "#" * level + " "
    sections: list[dict] = []
    current_name = None
    current_lines: list[str] = []

    for line in lines:
        if line.startswith(prefix) and not line.startswith(prefix + "#"):
            if current_name is not None:
                sections.append(_build_section(current_name, current_lines))
            current_name = line[len(prefix) :].strip()
            current_lines = []
        elif current_name is not None:
            current_lines.append(line)

    if current_name is not None:
        sections.append(_build_section(current_name, current_lines))

    return sections


def _build_section(name: str, lines: list[str]) -> dict:
    """Build a section dict from a name and its content lines.

    Leading/trailing blank lines are discarded. If the first remaining line is
    an ``_italic_`` line, its text becomes the description and is removed from
    the content. The caller's list is not modified.
    """
    # Trim with indices instead of re-slicing once per blank line.
    start, end = 0, len(lines)
    while start < end and not lines[start].strip():
        start += 1
    while end > start and not lines[end - 1].strip():
        end -= 1
    lines = lines[start:end]

    description = ""
    content_lines = lines
    if lines:
        m = _SECTION_DESCRIPTION_RE.match(lines[0].strip())
        if m:
            description = m.group(1)
            content_lines = lines[1:]
            while content_lines and not content_lines[0].strip():
                content_lines = content_lines[1:]

    content = "\n".join(content_lines).strip()

    return {
        "name": name,
        "slug": slugify(name),
        "description": description,
        "content": content,
    }


def render_markdown(text: str) -> str:
    """Render markdown text to HTML."""
    md = markdown.Markdown(extensions=["extra"])
    return md.convert(text)


def strip_markdown_links(text: str) -> str:
    """Replace [text](url) with just text for plain-text contexts."""
    return re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)


def render_inline_markdown(text: str) -> str:
    """Render inline markdown (links, bold, italic) to HTML.

    The result is wrapped in ``markupsafe.Markup`` so that Jinja2 autoescaping
    leaves it untouched.
    """
    from markupsafe import Markup

    # NOTE(review): the HTML literals below ("<p>", "<a ", the target/rel
    # attributes) and the return statement were lost from the source under
    # review and have been reconstructed from the surviving comments -- confirm.
    rendered = markdown.markdown(text)
    # Strip wrapping <p>...</p> since this is inline content
    rendered = re.sub(r"^<p>(.*)</p>$", r"\1", rendered.strip())
    # Add target/rel to links for external navigation
    rendered = rendered.replace("<a ", '<a target="_blank" rel="noopener" ')
    return Markup(rendered)


# NOTE(review): this signature was lost from the source under review; it is
# reconstructed from the names the body uses and from the call in build().
def group_categories(categories: list[dict], resources: list[dict]) -> list[dict]:
    """Organize categories and resources into thematic section groups.

    Groups follow the order of SECTION_GROUPS; groups that end up empty are
    omitted. Categories not named in any group are collected in a trailing
    "Other" group.
    """
    cat_by_name = {c["name"]: c for c in categories}
    groups: list[dict] = []

    for group_name, cat_names in SECTION_GROUPS:
        if group_name == "Resources":
            # Resources group uses parsed resources directly
            group_cats = list(resources)
        else:
            group_cats = [cat_by_name[n] for n in cat_names if n in cat_by_name]

        if group_cats:
            groups.append({
                "name": group_name,
                "slug": slugify(group_name),
                "categories": group_cats,
            })

    # Any categories not in a group go into "Other"
    grouped_names = {name for _, cat_names in SECTION_GROUPS for name in cat_names}
    ungrouped = [c for c in categories if c["name"] not in grouped_names]
    if ungrouped:
        groups.append({
            "name": "Other",
            "slug": "other",
            "categories": ungrouped,
        })

    return groups


class Entry(TypedDict):
    # Shape of one flattened library entry. extract_entries() additionally
    # attaches an "also_see" list to each dict it produces.
    name: str
    url: str
    description: str
    category: str
    group: str
    stars: int | None
    owner: str | None
    pushed_at: str | None


class StarData(TypedDict):
    # One record of the GitHub star data file, keyed there by "owner/repo".
    stars: int
    owner: str
    pushed_at: str
    fetched_at: str


GITHUB_REPO_URL_RE = re.compile(
    r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$"
)


def extract_github_repo(url: str) -> str | None:
    """Extract owner/repo from a GitHub repo URL. Returns None for non-GitHub URLs."""
    m = GITHUB_REPO_URL_RE.match(url)
    return m.group(1) if m else None


def load_stars(path: Path) -> dict[str, StarData]:
    """Load star data from JSON. Returns empty dict if file doesn't exist or is corrupt.

    "Corrupt" covers invalid JSON, bytes that are not valid UTF-8, and a JSON
    document whose top level is not an object.
    """
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (FileNotFoundError, json.JSONDecodeError, UnicodeDecodeError):
        return {}
    # Callers index the result by repo key, so anything but a dict is unusable.
    return data if isinstance(data, dict) else {}


def sort_entries(entries: list[dict]) -> list[dict]:
    """Sort entries by stars descending, then name ascending. No-star entries go last."""

    def sort_key(entry: dict) -> tuple[int, int, str]:
        stars = entry["stars"]
        name = entry["name"].lower()
        if stars is None:
            return (1, 0, name)
        return (0, -stars, name)

    return sorted(entries, key=sort_key)


def extract_entries(
    categories: list[dict],
    resources: list[dict],
    groups: list[dict],
) -> list[dict]:
    """Flatten categories into individual library entries for table display.

    Only ``categories`` are flattened; ``resources`` is accepted but not read.
    A link-only item indented deeper than the preceding entry is attached to
    that entry's "also_see" list instead of becoming an entry of its own.
    """
    cat_to_group: dict[str, str] = {}
    for group in groups:
        for cat in group["categories"]:
            cat_to_group[cat["name"]] = group["name"]

    entries: list[dict] = []
    for cat in categories:
        group_name = cat_to_group.get(cat["name"], "Other")
        # Indent of the most recent entry in this category; -1 means "none".
        last_entry_indent = -1

        for line in cat["content"].split("\n"):
            stripped = line.strip()
            indent_len = len(line) - len(line.lstrip())

            # A subcategory label (list item without a link) at the same or a
            # shallower depth ends the previous entry's scope. Without this, a
            # link-only entry under the label would be filed as an "also see"
            # of the unrelated entry that precedes the label.
            if "[" not in stripped and _LABEL_RE.match(stripped):
                if indent_len <= last_entry_indent:
                    last_entry_indent = -1
                continue

            # Link-only sub-item deeper than parent -> "also see"
            m_sub = _ENTRY_LINK_ONLY_RE.match(line)
            if m_sub and indent_len > last_entry_indent >= 0 and entries:
                entries[-1]["also_see"].append({
                    "name": m_sub.group(1),
                    "url": m_sub.group(2),
                })
                continue

            if indent_len > 3:
                continue

            m = _ENTRY_RE.match(line)
            if m:
                last_entry_indent = indent_len
                entries.append({
                    "name": m.group(1),
                    "url": m.group(2),
                    "description": render_inline_markdown(m.group(3)) if m.group(3) else "",
                    "category": cat["name"],
                    "group": group_name,
                    "stars": None,
                    "owner": None,
                    "pushed_at": None,
                    "also_see": [],
                })

    return entries


def build(repo_root: str) -> None:
    """Main build: parse README, render single-page HTML via Jinja2 templates.

    Reads ``README.md`` and ``website/`` under ``repo_root`` and recreates
    ``website/output`` from scratch (any existing output directory is removed).
    """
    repo = Path(repo_root)
    website = repo / "website"
    readme_text = (repo / "README.md").read_text(encoding="utf-8")

    # Extract subtitle from the first non-empty, non-heading line
    subtitle = ""
    for line in readme_text.split("\n"):
        stripped = line.strip()
        if stripped and not stripped.startswith("#"):
            subtitle = stripped
            break

    categories, resources = parse_readme(readme_text)

    # Enrich with entry counts, rendered HTML, previews, and clean descriptions
    for cat in categories + resources:
        cat["entry_count"] = count_entries(cat["content"])
        cat["content_html"] = render_content_html(cat["content"])
        cat["preview"] = extract_preview(cat["content"])
        cat["description"] = strip_markdown_links(cat["description"])

    # Resources are deliberately left out of the headline entry count.
    total_entries = sum(c["entry_count"] for c in categories)

    # Organize into groups
    groups = group_categories(categories, resources)

    # Flatten entries for table view
    entries = extract_entries(categories, resources, groups)

    # Load and merge GitHub star data
    stars_data = load_stars(website / "data" / "github_stars.json")
    for entry in entries:
        repo_key = extract_github_repo(entry["url"])
        star = stars_data.get(repo_key) if repo_key else None
        if star is None:
            continue
        entry["stars"] = star["stars"]
        entry["owner"] = star["owner"]
        entry["pushed_at"] = star.get("pushed_at", "")

    # Sort by stars descending
    entries = sort_entries(entries)

    # Set up Jinja2
    env = Environment(
        loader=FileSystemLoader(website / "templates"),
        autoescape=True,
    )

    # Output directory
    site_dir = website / "output"
    if site_dir.exists():
        shutil.rmtree(site_dir)
    site_dir.mkdir(parents=True)

    # Generate single index.html
    tpl_index = env.get_template("index.html")
    (site_dir / "index.html").write_text(
        tpl_index.render(
            categories=categories,
            resources=resources,
            groups=groups,
            subtitle=subtitle,
            entries=entries,
            total_entries=total_entries,
            total_categories=len(categories),
        ),
        encoding="utf-8",
    )

    # Copy static assets
    static_src = website / "static"
    static_dst = site_dir / "static"
    if static_src.exists():
        shutil.copytree(static_src, static_dst)

    # Write CNAME
    (site_dir / "CNAME").write_text("awesome-python.com\n", encoding="utf-8")

    print(f"Built single page with {len(categories)} categories + {len(resources)} resources")
    print(f"Total entries: {total_entries}")
    print(f"Output: {site_dir}")


if __name__ == "__main__":
    build(str(Path(__file__).parent.parent))