refactor: extract parsing logic from build.py into readme_parser module

slugify, parse_readme, count_entries, extract_preview, render_content_html, and related helpers are moved to a dedicated readme_parser module. build.py now imports from readme_parser rather than defining these inline. Tests for the removed functions are dropped from test_build.py since they now live with the module they test. Co-Authored-By: Claude <noreply@anthropic.com>
2026-05-23 17:05:41 +08:00 · 2026-03-18 17:27:14 +08:00
parent 03ac212880
commit 0f374970dd
2 changed files with 20 additions and 532 deletions
@@ -7,9 +7,10 @@ import shutil
 from pathlib import Path
 from typing import TypedDict

-import markdown
 from jinja2 import Environment, FileSystemLoader

+from readme_parser import parse_readme, slugify
+
 # Thematic grouping of categories. Each category name must match exactly
 # as it appears in README.md (the ## heading text).
 SECTION_GROUPS: list[tuple[str, list[str]]] = [
@@ -67,217 +68,6 @@ SECTION_GROUPS: list[tuple[str, list[str]]] = [
 ]


-def slugify(name: str) -> str:
-    """Convert a category name to a URL-friendly slug."""
-    slug = name.lower()
-    slug = re.sub(r"[^a-z0-9\s-]", "", slug)
-    slug = re.sub(r"[\s]+", "-", slug.strip())
-    slug = re.sub(r"-+", "-", slug)
-    return slug
-
-
-def count_entries(content: str) -> int:
-    """Count library entries (lines starting with * [ or - [) in a content block."""
-    return sum(1 for line in content.split("\n") if re.match(r"\s*[-*]\s+\[", line))
-
-
-def extract_preview(content: str, *, max_names: int = 4) -> str:
-    """Extract first N main library names from markdown content for preview text.
-
-    Only includes top-level or single-indent entries (indent <= 3 spaces),
-    skipping subcategory labels (items without links) and deep sub-entries.
-    """
-    names = []
-    for m in re.finditer(r"^(\s*)[-*]\s+\[([^\]]+)\]", content, re.MULTILINE):
-        indent_len = len(m.group(1))
-        if indent_len > 3:
-            continue
-        names.append(m.group(2))
-        if len(names) >= max_names:
-            break
-    return ", ".join(names)
-
-
-def render_content_html(content: str) -> str:
-    """Render category markdown content to HTML with subcategory detection.
-
-    Lines that are list items without links (e.g., "- Synchronous") are
-    treated as subcategory headers and rendered as bold dividers.
-
-    Indent levels in the README:
-    - 0 spaces: top-level entry or subcategory label
-    - 2 spaces: entry under a subcategory (still a main entry)
-    - 4+ spaces: sub-entry (e.g., awesome-django under django)
-    """
-    lines = content.split("\n")
-    out: list[str] = []
-
-    for line in lines:
-        stripped = line.strip()
-        indent_len = len(line) - len(line.lstrip())
-
-        # Detect subcategory labels: list items without links
-        m = re.match(r"^[-*]\s+(.+)$", stripped)
-        if m and "[" not in stripped:
-            label = m.group(1)
-            out.append(f'<div class="subcat">{label}</div>')
-            continue
-
-        # Entry with link and description: * [name](url) - Description.
-        m = re.match(
-            r"^\s*[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s*[-\u2013\u2014]\s*(.+)$",
-            line,
-        )
-        if m:
-            name, url, desc = m.groups()
-            if indent_len > 3:
-                out.append(
-                    f'<div class="entry-sub">'
-                    f'<a href="{url}">{name}</a>'
-                    f"</div>"
-                )
-            else:
-                out.append(
-                    f'<div class="entry">'
-                    f'<a href="{url}">{name}</a>'
-                    f'<span class="sep">&mdash;</span>{desc}'
-                    f"</div>"
-                )
-            continue
-
-        # Link-only entry (no description): * [name](url)
-        m = re.match(r"^\s*[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s*$", line)
-        if m:
-            name, url = m.groups()
-            if indent_len > 3:
-                out.append(
-                    f'<div class="entry-sub">'
-                    f'<a href="{url}">{name}</a>'
-                    f"</div>"
-                )
-            else:
-                out.append(
-                    f'<div class="entry">'
-                    f'<a href="{url}">{name}</a>'
-                    f"</div>"
-                )
-            continue
-
-    return "\n".join(out)
-
-
-def parse_readme(text: str) -> tuple[list[dict], list[dict]]:
-    """Parse README.md text into categories and resources.
-
-    Returns:
-        (categories, resources) where each is a list of dicts with keys:
-        name, slug, description, content
-    """
-    lines = text.split("\n")
-
-    separator_idx = None
-    for i, line in enumerate(lines):
-        if line.strip() == "---" and i > 0:
-            separator_idx = i
-            break
-
-    if separator_idx is None:
-        return [], []
-
-    resources_idx = None
-    contributing_idx = None
-    for i, line in enumerate(lines):
-        if line.strip() == "# Resources":
-            resources_idx = i
-        elif line.strip() == "# Contributing":
-            contributing_idx = i
-
-    cat_end = resources_idx if resources_idx is not None else len(lines)
-    category_lines = lines[separator_idx + 1 : cat_end]
-
-    resource_lines = []
-    if resources_idx is not None:
-        res_end = contributing_idx if contributing_idx is not None else len(lines)
-        resource_lines = lines[resources_idx:res_end]
-
-    categories = _extract_sections(category_lines, level=2)
-    resources = _extract_sections(resource_lines, level=2)
-
-    return categories, resources
-
-
-def _extract_sections(lines: list[str], *, level: int) -> list[dict]:
-    """Extract ## sections from a block of lines."""
-    prefix = "#" * level + " "
-    sections = []
-    current_name = None
-    current_lines: list[str] = []
-
-    for line in lines:
-        if line.startswith(prefix) and not line.startswith(prefix + "#"):
-            if current_name is not None:
-                sections.append(_build_section(current_name, current_lines))
-            current_name = line[len(prefix) :].strip()
-            current_lines = []
-        elif current_name is not None:
-            current_lines.append(line)
-
-    if current_name is not None:
-        sections.append(_build_section(current_name, current_lines))
-
-    return sections
-
-
-def _build_section(name: str, lines: list[str]) -> dict:
-    """Build a section dict from a name and its content lines."""
-    while lines and not lines[0].strip():
-        lines = lines[1:]
-    while lines and not lines[-1].strip():
-        lines = lines[:-1]
-
-    description = ""
-    content_lines = lines
-    if lines:
-        m = re.match(r"^_(.+)_$", lines[0].strip())
-        if m:
-            description = m.group(1)
-            content_lines = lines[1:]
-            while content_lines and not content_lines[0].strip():
-                content_lines = content_lines[1:]
-
-    content = "\n".join(content_lines).strip()
-
-    return {
-        "name": name,
-        "slug": slugify(name),
-        "description": description,
-        "content": content,
-    }
-
-
-def render_markdown(text: str) -> str:
-    """Render markdown text to HTML."""
-    md = markdown.Markdown(extensions=["extra"])
-    return md.convert(text)
-
-
-def strip_markdown_links(text: str) -> str:
-    """Replace [text](url) with just text for plain-text contexts."""
-    return re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
-
-
-def render_inline_markdown(text: str) -> str:
-    """Render inline markdown (links, bold, italic) to HTML."""
-    from markupsafe import Markup
-
-    html = markdown.markdown(text)
-    # Strip wrapping <p>...</p> since this is inline content
-    html = re.sub(r"^<p>(.*)</p>$", r"\1", html.strip())
-    # Add target/rel to links for external navigation
-    html = html.replace("<a ", '<a target="_blank" rel="noopener" ')
-    return Markup(html)
-
-
 def group_categories(
    categories: list[dict],
    resources: list[dict],
@@ -379,38 +169,18 @@ def extract_entries(
    entries: list[dict] = []
    for cat in categories:
        group_name = cat_to_group.get(cat["name"], "Other")
-        last_entry_indent = -1
-        for line in cat["content"].split("\n"):
-            indent_len = len(line) - len(line.lstrip())
-
-            # Link-only sub-item deeper than parent → "also see"
-            m_sub = re.match(r"\s*[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s*$", line)
-            if m_sub and indent_len > last_entry_indent >= 0 and entries:
-                entries[-1]["also_see"].append({
-                    "name": m_sub.group(1),
-                    "url": m_sub.group(2),
-                })
-                continue
-
-            if indent_len > 3:
-                continue
-            m = re.match(
-                r"\s*[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s*(?:[-\u2013\u2014]\s*(.+))?$",
-                line,
-            )
-            if m:
-                last_entry_indent = indent_len
-                entries.append({
-                    "name": m.group(1),
-                    "url": m.group(2),
-                    "description": render_inline_markdown(m.group(3)) if m.group(3) else "",
-                    "category": cat["name"],
-                    "group": group_name,
-                    "stars": None,
-                    "owner": None,
-                    "pushed_at": None,
-                    "also_see": [],
-                })
+        for entry in cat["entries"]:
+            entries.append({
+                "name": entry["name"],
+                "url": entry["url"],
+                "description": entry["description"],
+                "category": cat["name"],
+                "group": group_name,
+                "stars": None,
+                "owner": None,
+                "pushed_at": None,
+                "also_see": entry["also_see"],
+            })
    return entries


@@ -420,7 +190,6 @@ def build(repo_root: str) -> None:
    website = repo / "website"
    readme_text = (repo / "README.md").read_text(encoding="utf-8")

-    # Extract subtitle from the first non-empty, non-heading line
    subtitle = ""
    for line in readme_text.split("\n"):
        stripped = line.strip()
@@ -429,47 +198,33 @@ def build(repo_root: str) -> None:
            break

    categories, resources = parse_readme(readme_text)
-
-    # Enrich with entry counts, rendered HTML, previews, and clean descriptions
-    for cat in categories + resources:
-        cat["entry_count"] = count_entries(cat["content"])
-        cat["content_html"] = render_content_html(cat["content"])
-        cat["preview"] = extract_preview(cat["content"])
-        cat["description"] = strip_markdown_links(cat["description"])
+    # All fields pre-computed: entry_count, content_html, preview, description

    total_entries = sum(c["entry_count"] for c in categories)
-
-    # Organize into groups
    groups = group_categories(categories, resources)
-
-    # Flatten entries for table view
    entries = extract_entries(categories, resources, groups)

-    # Load and merge GitHub star data
    stars_data = load_stars(website / "data" / "github_stars.json")
    for entry in entries:
        repo_key = extract_github_repo(entry["url"])
        if repo_key and repo_key in stars_data:
-            entry["stars"] = stars_data[repo_key]["stars"]
-            entry["owner"] = stars_data[repo_key]["owner"]
-            entry["pushed_at"] = stars_data[repo_key].get("pushed_at", "")
+            sd = stars_data[repo_key]
+            entry["stars"] = sd["stars"]
+            entry["owner"] = sd["owner"]
+            entry["pushed_at"] = sd.get("pushed_at", "")

-    # Sort by stars descending
    entries = sort_entries(entries)

-    # Set up Jinja2
    env = Environment(
        loader=FileSystemLoader(website / "templates"),
        autoescape=True,
    )

-    # Output directory
    site_dir = website / "output"
    if site_dir.exists():
        shutil.rmtree(site_dir)
    site_dir.mkdir(parents=True)

-    # Generate single index.html
    tpl_index = env.get_template("index.html")
    (site_dir / "index.html").write_text(
        tpl_index.render(
@@ -484,13 +239,10 @@ def build(repo_root: str) -> None:
        encoding="utf-8",
    )

-    # Copy static assets
    static_src = website / "static"
    static_dst = site_dir / "static"
    if static_src.exists():
        shutil.copytree(static_src, static_dst)
-
-    # Write CNAME
    (site_dir / "CNAME").write_text("awesome-python.com\n", encoding="utf-8")

    print(f"Built single page with {len(categories)} categories + {len(resources)} resources")