refactor: extract parsing logic from build.py into readme_parser module

slugify, parse_readme, count_entries, extract_preview, render_content_html, and related helpers are moved to a dedicated readme_parser module. build.py now imports from readme_parser rather than defining these inline. Tests for the removed functions are dropped from test_build.py since they now live with the module they test.

Co-Authored-By: Claude <noreply@anthropic.com>
2026-05-28 12:15:48 +08:00 · 2026-03-18 17:27:14 +08:00
parent 03ac212880
commit 0f374970dd
2 changed files with 20 additions and 532 deletions
@@ -7,9 +7,10 @@ import shutil
 from pathlib import Path
 from typing import TypedDict
 import markdown
 from jinja2 import Environment, FileSystemLoader
 from readme_parser import parse_readme, slugify
 # Thematic grouping of categories. Each category name must match exactly
 # as it appears in README.md (the ## heading text).
 SECTION_GROUPS: list[tuple[str, list[str]]] = [
@@ -67,217 +68,6 @@ SECTION_GROUPS: list[tuple[str, list[str]]] = [
 ]
 def slugify(name: str) -> str:
    """Turn a category name into a lowercase, hyphen-separated URL slug.

    After lowercasing, every character other than ``a-z``, ``0-9``,
    whitespace and ``-`` is dropped. Surrounding whitespace is trimmed,
    each whitespace run becomes one hyphen, and repeated hyphens collapse.
    """
    lowered = name.lower()
    kept = re.sub(r"[^a-z0-9\s-]", "", lowered).strip()
    hyphenated = re.sub(r"[\s]+", "-", kept)
    return re.sub(r"-+", "-", hyphenated)
 def count_entries(content: str) -> int:
    """Return how many lines of *content* open a linked list item.

    A line counts when, after optional leading whitespace, it has a ``-`` or
    ``*`` bullet, at least one whitespace character, and then ``[``.
    """
    total = 0
    for row in content.split("\n"):
        if re.match(r"\s*[-*]\s+\[", row):
            total += 1
    return total
 def extract_preview(content: str, *, max_names: int = 4) -> str:
    """Build a comma-separated preview of the first library names in *content*.

    Only linked list items indented by at most three characters are used, so
    sub-entries nested deeper are skipped. List items without a link
    (subcategory labels) never match.

    Args:
        content: Markdown body of one category.
        max_names: Upper bound on the number of names returned; values below
            one yield an empty preview.

    Returns:
        The collected link texts joined with ", " (empty string if none).
    """
    if max_names <= 0:
        return ""
    names: list[str] = []
    # The indent group uses [ \t]* instead of \s*: with re.MULTILINE, \s* would
    # also swallow blank lines preceding an entry, inflating the measured
    # indent and wrongly discarding top-level entries that follow a gap.
    for m in re.finditer(r"^([ \t]*)[-*]\s+\[([^\]]+)\]", content, re.MULTILINE):
        if len(m.group(1)) > 3:
            continue  # deeper than a main entry: a nested sub-entry
        names.append(m.group(2))
        if len(names) >= max_names:
            break
    return ", ".join(names)
 def render_content_html(content: str) -> str:
    """Render category markdown content to HTML with subcategory detection.

    Each line of *content* is classified independently:

    - a list item without any ``[`` (e.g. "- Synchronous") becomes a
      ``subcat`` divider;
    - a linked list item indented more than three characters becomes an
      ``entry-sub`` (its description, if any, is not rendered);
    - any other linked list item becomes an ``entry``, followed by its
      description when one is present after a ``-``, en dash or em dash.

    Lines matching none of these shapes are dropped. Names, URLs,
    descriptions and labels are HTML-escaped before being interpolated, so
    characters such as ``&``, ``<`` or a double quote in the README cannot
    produce malformed markup or break out of the ``href`` attribute.

    Indent levels in the README:
    - 0 spaces: top-level entry or subcategory label
    - 2 spaces: entry under a subcategory (still a main entry)
    - 4+ spaces: sub-entry (e.g., awesome-django under django)

    Returns:
        The generated ``<div>`` elements joined with newlines.
    """
    from html import escape  # function-scope import keeps the block self-contained

    out: list[str] = []
    for line in content.split("\n"):
        stripped = line.strip()
        indent_len = len(line) - len(line.lstrip())

        # Detect subcategory labels: list items without links
        m = re.match(r"^[-*]\s+(.+)$", stripped)
        if m and "[" not in stripped:
            label = escape(m.group(1), quote=False)
            out.append(f'<div class="subcat">{label}</div>')
            continue

        # Linked entry, with an optional "- Description." tail:
        #   * [name](url) - Description.
        #   * [name](url)
        m = re.match(
            r"^\s*[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s*(?:[-\u2013\u2014]\s*(.+))?$",
            line,
        )
        if not m:
            continue
        name, url, desc = m.groups()
        # quote=True (the default) for the attribute value, quote=False for text
        link = f'<a href="{escape(url)}">{escape(name, quote=False)}</a>'
        if indent_len > 3:
            out.append(f'<div class="entry-sub">{link}</div>')
        elif desc:
            out.append(
                f'<div class="entry">{link}'
                f'<span class="sep">&mdash;</span>{escape(desc, quote=False)}'
                f"</div>"
            )
        else:
            out.append(f'<div class="entry">{link}</div>')
    return "\n".join(out)
 def parse_readme(text: str) -> tuple[list[dict], list[dict]]:
    """Split README.md text into category sections and resource sections.

    Categories are the level-2 sections between the first ``---`` line (not
    counting line 0) and the ``# Resources`` heading. Resources are the
    level-2 sections from ``# Resources`` up to ``# Contributing``. When a
    heading is missing, the corresponding range runs to the end of the text.
    If either heading occurs more than once, its last occurrence is used.

    Returns:
        (categories, resources), each a list of dicts with keys
        name, slug, description, content. Both lists are empty when the
        text has no ``---`` separator line.
    """
    all_lines = text.split("\n")
    trimmed = [row.strip() for row in all_lines]

    sep_at = next(
        (pos for pos, row in enumerate(trimmed) if pos > 0 and row == "---"),
        None,
    )
    if sep_at is None:
        return [], []

    res_at = None
    contrib_at = None
    for pos, row in enumerate(trimmed):
        if row == "# Resources":
            res_at = pos
        elif row == "# Contributing":
            contrib_at = pos

    end_of_text = len(all_lines)
    if res_at is None:
        category_lines = all_lines[sep_at + 1 : end_of_text]
        resource_lines = []
    else:
        category_lines = all_lines[sep_at + 1 : res_at]
        res_stop = end_of_text if contrib_at is None else contrib_at
        resource_lines = all_lines[res_at:res_stop]

    return (
        _extract_sections(category_lines, level=2),
        _extract_sections(resource_lines, level=2),
    )
 def _extract_sections(lines: list[str], *, level: int) -> list[dict]:
    """Group *lines* into sections opened by headings of the given *level*.

    A heading is a line that starts with *level* ``#`` characters and a
    space, and is not immediately followed by another ``#``. Lines before
    the first heading are ignored. Each heading's text and the lines that
    follow it are passed to ``_build_section``.
    """
    heading = "#" * level + " "
    deeper = heading + "#"
    grouped: list[tuple[str, list[str]]] = []
    for row in lines:
        if row.startswith(heading) and not row.startswith(deeper):
            grouped.append((row[len(heading) :].strip(), []))
        elif grouped:
            grouped[-1][1].append(row)
    return [_build_section(title, body) for title, body in grouped]
 def _build_section(name: str, lines: list[str]) -> dict:
    """Assemble the dict for one section from its heading text and body lines.

    Blank lines around the body are discarded. If the first remaining line,
    once stripped, is wrapped in underscores (``_like this_``), the text
    between them becomes the description and is removed from the content.

    Returns:
        A dict with keys name, slug, description, content.
    """
    first, last = 0, len(lines)
    while first < last and not lines[first].strip():
        first += 1
    while last > first and not lines[last - 1].strip():
        last -= 1
    body = lines[first:last]

    description = ""
    if body:
        italic = re.match(r"^_(.+)_$", body[0].strip())
        if italic:
            description = italic.group(1)
            # Skip the description line and any blank lines right after it
            resume = 1
            while resume < len(body) and not body[resume].strip():
                resume += 1
            body = body[resume:]

    return {
        "name": name,
        "slug": slugify(name),
        "description": description,
        "content": "\n".join(body).strip(),
    }
 def render_markdown(text: str) -> str:
    """Convert markdown *text* to HTML with the ``extra`` extension enabled."""
    return markdown.Markdown(extensions=["extra"]).convert(text)
 def strip_markdown_links(text: str) -> str:
    """Return *text* with every ``[label](url)`` reduced to its label.

    Intended for plain-text contexts where link markup should not appear.
    """
    link = re.compile(r"\[([^\]]+)\]\([^)]+\)")
    return link.sub(lambda hit: hit.group(1), text)
 def render_inline_markdown(text: str) -> str:
    """Render inline markdown (links, bold, italic) to HTML wrapped in Markup.

    The rendered output has a single enclosing ``<p>...</p>`` removed when
    the whole (stripped) output is one such paragraph on one line, and every
    ``<a `` tag gains ``target="_blank" rel="noopener"``.
    """
    from markupsafe import Markup

    rendered = markdown.markdown(text).strip()
    # Inline content should not carry the paragraph wrapper
    unwrapped = re.sub(r"^<p>(.*)</p>$", r"\1", rendered)
    # Links are meant for external navigation, so open them in a new tab
    with_targets = unwrapped.replace("<a ", '<a target="_blank" rel="noopener" ')
    return Markup(with_targets)
 def group_categories(
    categories: list[dict],
    resources: list[dict],
@@ -379,37 +169,17 @@ def extract_entries(
    entries: list[dict] = []
    for cat in categories:
        group_name = cat_to_group.get(cat["name"], "Other")
-        last_entry_indent = -1
+        for entry in cat["entries"]:
        for line in cat["content"].split("\n"):
            indent_len = len(line) - len(line.lstrip())
            # Link-only sub-item deeper than parent → "also see"
            m_sub = re.match(r"\s*[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s*$", line)
            if m_sub and indent_len > last_entry_indent >= 0 and entries:
                entries[-1]["also_see"].append({
                    "name": m_sub.group(1),
                    "url": m_sub.group(2),
                })
                continue
            if indent_len > 3:
                continue
            m = re.match(
                r"\s*[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s*(?:[-\u2013\u2014]\s*(.+))?$",
                line,
            )
            if m:
                last_entry_indent = indent_len
            entries.append({
-                    "name": m.group(1),
+                "name": entry["name"],
-                    "url": m.group(2),
+                "url": entry["url"],
-                    "description": render_inline_markdown(m.group(3)) if m.group(3) else "",
+                "description": entry["description"],
                "category": cat["name"],
                "group": group_name,
                "stars": None,
                "owner": None,
                "pushed_at": None,
-                    "also_see": [],
+                "also_see": entry["also_see"],
            })
    return entries
@@ -420,7 +190,6 @@ def build(repo_root: str) -> None:
    website = repo / "website"
    readme_text = (repo / "README.md").read_text(encoding="utf-8")
    # Extract subtitle from the first non-empty, non-heading line
    subtitle = ""
    for line in readme_text.split("\n"):
        stripped = line.strip()
@@ -429,47 +198,33 @@ def build(repo_root: str) -> None:
            break
    categories, resources = parse_readme(readme_text)
-
+    # All fields pre-computed: entry_count, content_html, preview, description
    # Enrich with entry counts, rendered HTML, previews, and clean descriptions
    for cat in categories + resources:
        cat["entry_count"] = count_entries(cat["content"])
        cat["content_html"] = render_content_html(cat["content"])
        cat["preview"] = extract_preview(cat["content"])
        cat["description"] = strip_markdown_links(cat["description"])
    total_entries = sum(c["entry_count"] for c in categories)
    # Organize into groups
    groups = group_categories(categories, resources)
    # Flatten entries for table view
    entries = extract_entries(categories, resources, groups)
    # Load and merge GitHub star data
    stars_data = load_stars(website / "data" / "github_stars.json")
    for entry in entries:
        repo_key = extract_github_repo(entry["url"])
        if repo_key and repo_key in stars_data:
-            entry["stars"] = stars_data[repo_key]["stars"]
+            sd = stars_data[repo_key]
-            entry["owner"] = stars_data[repo_key]["owner"]
+            entry["stars"] = sd["stars"]
-            entry["pushed_at"] = stars_data[repo_key].get("pushed_at", "")
+            entry["owner"] = sd["owner"]
            entry["pushed_at"] = sd.get("pushed_at", "")
    # Sort by stars descending
    entries = sort_entries(entries)
    # Set up Jinja2
    env = Environment(
        loader=FileSystemLoader(website / "templates"),
        autoescape=True,
    )
    # Output directory
    site_dir = website / "output"
    if site_dir.exists():
        shutil.rmtree(site_dir)
    site_dir.mkdir(parents=True)
    # Generate single index.html
    tpl_index = env.get_template("index.html")
    (site_dir / "index.html").write_text(
        tpl_index.render(
@@ -484,13 +239,10 @@ def build(repo_root: str) -> None:
        encoding="utf-8",
    )
    # Copy static assets
    static_src = website / "static"
    static_dst = site_dir / "static"
    if static_src.exists():
        shutil.copytree(static_src, static_dst)
    # Write CNAME
    (site_dir / "CNAME").write_text("awesome-python.com\n", encoding="utf-8")
    print(f"Built single page with {len(categories)} categories + {len(resources)} resources")
@@ -7,21 +7,15 @@ import sys
 import textwrap
 from pathlib import Path
 import pytest
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
 from build import (
    build,
    count_entries,
    extract_github_repo,
    extract_preview,
    group_categories,
    load_stars,
    parse_readme,
    render_content_html,
    slugify,
    sort_entries,
 )
 from readme_parser import slugify
 # ---------------------------------------------------------------------------
 # slugify
@@ -51,244 +45,6 @@ class TestSlugify:
        assert slugify("  Date  and  Time  ") == "date-and-time"
 # ---------------------------------------------------------------------------
 # count_entries
 # ---------------------------------------------------------------------------
 class TestCountEntries:
    """Pins count_entries for dash, star, indented, mixed and empty content."""
    def test_counts_dash_entries(self):
        assert count_entries("- [a](url) - Desc.\n- [b](url) - Desc.") == 2
    def test_counts_star_entries(self):
        assert count_entries("* [a](url) - Desc.") == 1
    def test_ignores_non_entries(self):
        # Plain text lines around an entry must not be counted
        assert count_entries("Some text\n- [a](url) - Desc.\nMore text") == 1
    def test_counts_indented_entries(self):
        # Indented (nested) entries still count
        assert count_entries("    - [a](url) - Desc.") == 1
    def test_empty_content(self):
        assert count_entries("") == 0
 # ---------------------------------------------------------------------------
 # extract_preview
 # ---------------------------------------------------------------------------
 class TestExtractPreview:
    """Pins extract_preview: name order, the default cap of four, and label skipping."""
    def test_basic(self):
        content = "* [alpha](url) - A.\n* [beta](url) - B.\n* [gamma](url) - C."
        assert extract_preview(content) == "alpha, beta, gamma"
    def test_max_four(self):
        # Ten entries in, only the first four names come out
        content = "\n".join(f"* [lib{i}](url) - Desc." for i in range(10))
        assert extract_preview(content) == "lib0, lib1, lib2, lib3"
    def test_empty(self):
        assert extract_preview("") == ""
    def test_skips_subcategory_labels(self):
        # "* Synchronous" has no link, so it is not a library name
        content = "* Synchronous\n* [django](url) - Framework.\n* [flask](url) - Micro."
        assert extract_preview(content) == "django, flask"
 # ---------------------------------------------------------------------------
 # render_content_html
 # ---------------------------------------------------------------------------
 class TestRenderContentHtml:
    """Pins the HTML classes and fragments render_content_html emits per line shape."""
    def test_basic_entry(self):
        content = "* [django](https://example.com) - A web framework."
        html = render_content_html(content)
        assert 'href="https://example.com"' in html
        assert "django" in html
        assert "A web framework." in html
        assert 'class="entry"' in html
    def test_subcategory_label(self):
        # A list item without a link is rendered as a subcategory divider
        content = "* Synchronous\n* [django](https://x.com) - Framework."
        html = render_content_html(content)
        assert 'class="subcat"' in html
        assert "Synchronous" in html
    def test_sub_entry(self):
        # Four-space indent marks a sub-entry under the preceding entry
        content = "* [django](https://x.com) - Framework.\n    * [awesome-django](https://y.com)"
        html = render_content_html(content)
        assert 'class="entry-sub"' in html
        assert "awesome-django" in html
    def test_link_only_entry(self):
        content = "* [tool](https://x.com)"
        html = render_content_html(content)
        assert 'href="https://x.com"' in html
        assert "tool" in html
 # ---------------------------------------------------------------------------
 # parse_readme
 # ---------------------------------------------------------------------------
 # Fixture README: intro, a "---" separator, two categories (Alpha, Beta),
 # a "# Resources" part with two sections, and a trailing "# Contributing" part.
 MINIMAL_README = textwrap.dedent("""\
    # Awesome Python
    Some intro text.
    ---
    ## Alpha
    _Libraries for alpha stuff._
    - [lib-a](https://example.com/a) - Does A.
    - [lib-b](https://example.com/b) - Does B.
    ## Beta
    _Tools for beta._
    - [lib-c](https://example.com/c) - Does C.
    # Resources
    Where to discover resources.
    ## Newsletters
    - [News One](https://example.com/n1)
    - [News Two](https://example.com/n2)
    ## Podcasts
    - [Pod One](https://example.com/p1)
    # Contributing
    Please contribute!
 """)
 class TestParseReadme:
    """Pins parse_readme on the MINIMAL_README fixture and two small edge cases."""
    def test_category_count(self):
        cats, resources = parse_readme(MINIMAL_README)
        assert len(cats) == 2
    def test_resource_count(self):
        cats, resources = parse_readme(MINIMAL_README)
        assert len(resources) == 2
    def test_category_names(self):
        cats, _ = parse_readme(MINIMAL_README)
        assert cats[0]["name"] == "Alpha"
        assert cats[1]["name"] == "Beta"
    def test_category_slugs(self):
        cats, _ = parse_readme(MINIMAL_README)
        assert cats[0]["slug"] == "alpha"
        assert cats[1]["slug"] == "beta"
    def test_category_description(self):
        # The surrounding underscores of the italic line are not part of the description
        cats, _ = parse_readme(MINIMAL_README)
        assert cats[0]["description"] == "Libraries for alpha stuff."
        assert cats[1]["description"] == "Tools for beta."
    def test_category_content_has_entries(self):
        cats, _ = parse_readme(MINIMAL_README)
        assert "lib-a" in cats[0]["content"]
        assert "lib-b" in cats[0]["content"]
    def test_resources_names(self):
        _, resources = parse_readme(MINIMAL_README)
        assert resources[0]["name"] == "Newsletters"
        assert resources[1]["name"] == "Podcasts"
    def test_resources_content(self):
        _, resources = parse_readme(MINIMAL_README)
        assert "News One" in resources[0]["content"]
        assert "Pod One" in resources[1]["content"]
    def test_contributing_skipped(self):
        # "# Contributing" ends the resources range and is never a section itself
        cats, resources = parse_readme(MINIMAL_README)
        all_names = [c["name"] for c in cats] + [r["name"] for r in resources]
        assert "Contributing" not in all_names
    def test_no_separator(self):
        # Without a "---" line there is nothing to parse
        cats, resources = parse_readme("# Just a heading\n\nSome text.\n")
        assert cats == []
        assert resources == []
    def test_no_description(self):
        # A category whose first line is an entry has an empty description
        readme = textwrap.dedent("""\
            # Title
            ---
            ## NullDesc
            - [item](https://x.com) - Thing.
            # Resources
            ## Tips
            - [tip](https://x.com)
            # Contributing
            Done.
        """)
        cats, resources = parse_readme(readme)
        assert cats[0]["description"] == ""
        assert "item" in cats[0]["content"]
 # ---------------------------------------------------------------------------
 # parse_readme on real README
 # ---------------------------------------------------------------------------
 class TestParseRealReadme:
    """Runs parse_readme on the README.md two directories above this test file."""
    @pytest.fixture(autouse=True)
    def load_readme(self):
        # Parsed once per test; results are stored on the instance for the assertions
        readme_path = os.path.join(os.path.dirname(__file__), "..", "..", "README.md")
        with open(readme_path, encoding="utf-8") as f:
            self.readme_text = f.read()
        self.cats, self.resources = parse_readme(self.readme_text)
    def test_at_least_83_categories(self):
        assert len(self.cats) >= 83
    def test_resources_has_newsletters_and_podcasts(self):
        names = [r["name"] for r in self.resources]
        assert "Newsletters" in names
        assert "Podcasts" in names
    def test_contributing_not_in_results(self):
        all_names = [c["name"] for c in self.cats] + [
            r["name"] for r in self.resources
        ]
        assert "Contributing" not in all_names
    def test_first_category_is_admin_panels(self):
        assert self.cats[0]["name"] == "Admin Panels"
        assert self.cats[0]["slug"] == "admin-panels"
    def test_last_category_is_wsgi_servers(self):
        assert self.cats[-1]["name"] == "WSGI Servers"
        assert self.cats[-1]["slug"] == "wsgi-servers"
    def test_restful_api_slug(self):
        slugs = [c["slug"] for c in self.cats]
        assert "restful-api" in slugs
    def test_descriptions_extracted(self):
        admin = self.cats[0]
        assert admin["description"] == "Libraries for administrative interfaces."
 # ---------------------------------------------------------------------------
 # group_categories
 # ---------------------------------------------------------------------------
@@ -318,26 +74,6 @@ class TestGroupCategories:
        assert "Resources" in group_names
 # ---------------------------------------------------------------------------
 # render_markdown (kept for compatibility)
 # ---------------------------------------------------------------------------
 class TestRenderMarkdown:
    """Pins render_markdown output for a link list item and a plain paragraph."""
    def test_renders_link_list(self):
        # Imported inside the test rather than at module level
        from build import render_markdown
        html = render_markdown("- [lib](https://example.com) - Does stuff.")
        assert "<li>" in html
        assert '<a href="https://example.com">lib</a>' in html
    def test_renders_plain_text(self):
        from build import render_markdown
        html = render_markdown("Hello world")
        assert "<p>Hello world</p>" in html
 # ---------------------------------------------------------------------------
 # build (integration)
 # ---------------------------------------------------------------------------