From 0f374970dd6dce9c8827912a191da79fa0c3a5ec Mon Sep 17 00:00:00 2001 From: Vinta Chen Date: Wed, 18 Mar 2026 17:27:14 +0800 Subject: [PATCH] refactor: extract parsing logic from build.py into readme_parser module slugify, parse_readme, count_entries, extract_preview, render_content_html, and related helpers are moved to a dedicated readme_parser module. build.py now imports from readme_parser rather than defining these inline. Tests for the removed functions are dropped from test_build.py since they now live with the module they test. Co-Authored-By: Claude --- website/build.py | 286 +++--------------------------------- website/tests/test_build.py | 266 +-------------------------------- 2 files changed, 20 insertions(+), 532 deletions(-) diff --git a/website/build.py b/website/build.py index b8340eb5..5d9290b9 100644 --- a/website/build.py +++ b/website/build.py @@ -7,9 +7,10 @@ import shutil from pathlib import Path from typing import TypedDict -import markdown from jinja2 import Environment, FileSystemLoader +from readme_parser import parse_readme, slugify + # Thematic grouping of categories. Each category name must match exactly # as it appears in README.md (the ## heading text). SECTION_GROUPS: list[tuple[str, list[str]]] = [ @@ -67,217 +68,6 @@ SECTION_GROUPS: list[tuple[str, list[str]]] = [ ] -def slugify(name: str) -> str: - """Convert a category name to a URL-friendly slug.""" - slug = name.lower() - slug = re.sub(r"[^a-z0-9\s-]", "", slug) - slug = re.sub(r"[\s]+", "-", slug.strip()) - slug = re.sub(r"-+", "-", slug) - return slug - - -def count_entries(content: str) -> int: - """Count library entries (lines starting with * [ or - [) in a content block.""" - return sum(1 for line in content.split("\n") if re.match(r"\s*[-*]\s+\[", line)) - - -def extract_preview(content: str, *, max_names: int = 4) -> str: - """Extract first N main library names from markdown content for preview text. - - Only includes top-level or single-indent entries (indent <= 3 spaces), - skipping subcategory labels (items without links) and deep sub-entries. - """ - names = [] - for m in re.finditer(r"^(\s*)[-*]\s+\[([^\]]+)\]", content, re.MULTILINE): - indent_len = len(m.group(1)) - if indent_len > 3: - continue - names.append(m.group(2)) - if len(names) >= max_names: - break - return ", ".join(names) - - -def render_content_html(content: str) -> str: - """Render category markdown content to HTML with subcategory detection. - - Lines that are list items without links (e.g., "- Synchronous") are - treated as subcategory headers and rendered as bold dividers. - - Indent levels in the README: - - 0 spaces: top-level entry or subcategory label - - 2 spaces: entry under a subcategory (still a main entry) - - 4+ spaces: sub-entry (e.g., awesome-django under django) - """ - lines = content.split("\n") - out: list[str] = [] - - for line in lines: - stripped = line.strip() - indent_len = len(line) - len(line.lstrip()) - - # Detect subcategory labels: list items without links - m = re.match(r"^[-*]\s+(.+)$", stripped) - if m and "[" not in stripped: - label = m.group(1) - out.append(f'
{label}
') - continue - - # Entry with link and description: * [name](url) - Description. - m = re.match( - r"^\s*[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s*[-\u2013\u2014]\s*(.+)$", - line, - ) - if m: - name, url, desc = m.groups() - if indent_len > 3: - out.append( - f'
' - f'{name}' - f"
" - ) - else: - out.append( - f'
' - f'{name}' - f'{desc}' - f"
" - ) - continue - - # Link-only entry (no description): * [name](url) - m = re.match(r"^\s*[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s*$", line) - if m: - name, url = m.groups() - if indent_len > 3: - out.append( - f'
' - f'{name}' - f"
" - ) - else: - out.append( - f'
' - f'{name}' - f"
" - ) - continue - - return "\n".join(out) - - -def parse_readme(text: str) -> tuple[list[dict], list[dict]]: - """Parse README.md text into categories and resources. - - Returns: - (categories, resources) where each is a list of dicts with keys: - name, slug, description, content - """ - lines = text.split("\n") - - separator_idx = None - for i, line in enumerate(lines): - if line.strip() == "---" and i > 0: - separator_idx = i - break - - if separator_idx is None: - return [], [] - - resources_idx = None - contributing_idx = None - for i, line in enumerate(lines): - if line.strip() == "# Resources": - resources_idx = i - elif line.strip() == "# Contributing": - contributing_idx = i - - cat_end = resources_idx if resources_idx is not None else len(lines) - category_lines = lines[separator_idx + 1 : cat_end] - - resource_lines = [] - if resources_idx is not None: - res_end = contributing_idx if contributing_idx is not None else len(lines) - resource_lines = lines[resources_idx:res_end] - - categories = _extract_sections(category_lines, level=2) - resources = _extract_sections(resource_lines, level=2) - - return categories, resources - - -def _extract_sections(lines: list[str], *, level: int) -> list[dict]: - """Extract ## sections from a block of lines.""" - prefix = "#" * level + " " - sections = [] - current_name = None - current_lines: list[str] = [] - - for line in lines: - if line.startswith(prefix) and not line.startswith(prefix + "#"): - if current_name is not None: - sections.append(_build_section(current_name, current_lines)) - current_name = line[len(prefix) :].strip() - current_lines = [] - elif current_name is not None: - current_lines.append(line) - - if current_name is not None: - sections.append(_build_section(current_name, current_lines)) - - return sections - - -def _build_section(name: str, lines: list[str]) -> dict: - """Build a section dict from a name and its content lines.""" - while lines and not lines[0].strip(): - lines = lines[1:] - while lines and not lines[-1].strip(): - lines = lines[:-1] - - description = "" - content_lines = lines - if lines: - m = re.match(r"^_(.+)_$", lines[0].strip()) - if m: - description = m.group(1) - content_lines = lines[1:] - while content_lines and not content_lines[0].strip(): - content_lines = content_lines[1:] - - content = "\n".join(content_lines).strip() - - return { - "name": name, - "slug": slugify(name), - "description": description, - "content": content, - } - - -def render_markdown(text: str) -> str: - """Render markdown text to HTML.""" - md = markdown.Markdown(extensions=["extra"]) - return md.convert(text) - - -def strip_markdown_links(text: str) -> str: - """Replace [text](url) with just text for plain-text contexts.""" - return re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text) - - -def render_inline_markdown(text: str) -> str: - """Render inline markdown (links, bold, italic) to HTML.""" - from markupsafe import Markup - - html = markdown.markdown(text) - # Strip wrapping

...

since this is inline content - html = re.sub(r"^

(.*)

$", r"\1", html.strip()) - # Add target/rel to links for external navigation - html = html.replace(" last_entry_indent >= 0 and entries: - entries[-1]["also_see"].append({ - "name": m_sub.group(1), - "url": m_sub.group(2), - }) - continue - - if indent_len > 3: - continue - m = re.match( - r"\s*[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s*(?:[-\u2013\u2014]\s*(.+))?$", - line, - ) - if m: - last_entry_indent = indent_len - entries.append({ - "name": m.group(1), - "url": m.group(2), - "description": render_inline_markdown(m.group(3)) if m.group(3) else "", - "category": cat["name"], - "group": group_name, - "stars": None, - "owner": None, - "pushed_at": None, - "also_see": [], - }) + for entry in cat["entries"]: + entries.append({ + "name": entry["name"], + "url": entry["url"], + "description": entry["description"], + "category": cat["name"], + "group": group_name, + "stars": None, + "owner": None, + "pushed_at": None, + "also_see": entry["also_see"], + }) return entries @@ -420,7 +190,6 @@ def build(repo_root: str) -> None: website = repo / "website" readme_text = (repo / "README.md").read_text(encoding="utf-8") - # Extract subtitle from the first non-empty, non-heading line subtitle = "" for line in readme_text.split("\n"): stripped = line.strip() @@ -429,47 +198,33 @@ def build(repo_root: str) -> None: break categories, resources = parse_readme(readme_text) - - # Enrich with entry counts, rendered HTML, previews, and clean descriptions - for cat in categories + resources: - cat["entry_count"] = count_entries(cat["content"]) - cat["content_html"] = render_content_html(cat["content"]) - cat["preview"] = extract_preview(cat["content"]) - cat["description"] = strip_markdown_links(cat["description"]) + # All fields pre-computed: entry_count, content_html, preview, description total_entries = sum(c["entry_count"] for c in categories) - - # Organize into groups groups = group_categories(categories, resources) - - # Flatten entries for table view entries = extract_entries(categories, resources, groups) - # Load and merge GitHub star data stars_data = load_stars(website / "data" / "github_stars.json") for entry in entries: repo_key = extract_github_repo(entry["url"]) if repo_key and repo_key in stars_data: - entry["stars"] = stars_data[repo_key]["stars"] - entry["owner"] = stars_data[repo_key]["owner"] - entry["pushed_at"] = stars_data[repo_key].get("pushed_at", "") + sd = stars_data[repo_key] + entry["stars"] = sd["stars"] + entry["owner"] = sd["owner"] + entry["pushed_at"] = sd.get("pushed_at", "") - # Sort by stars descending entries = sort_entries(entries) - # Set up Jinja2 env = Environment( loader=FileSystemLoader(website / "templates"), autoescape=True, ) - # Output directory site_dir = website / "output" if site_dir.exists(): shutil.rmtree(site_dir) site_dir.mkdir(parents=True) - # Generate single index.html tpl_index = env.get_template("index.html") (site_dir / "index.html").write_text( tpl_index.render( @@ -484,13 +239,10 @@ def build(repo_root: str) -> None: encoding="utf-8", ) - # Copy static assets static_src = website / "static" static_dst = site_dir / "static" if static_src.exists(): shutil.copytree(static_src, static_dst) - - # Write CNAME (site_dir / "CNAME").write_text("awesome-python.com\n", encoding="utf-8") print(f"Built single page with {len(categories)} categories + {len(resources)} resources") diff --git a/website/tests/test_build.py b/website/tests/test_build.py index e551f954..e0fb2bf1 100644 --- a/website/tests/test_build.py +++ b/website/tests/test_build.py @@ -7,21 +7,15 @@ import sys import textwrap from pathlib import Path -import pytest - sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) from build import ( build, - count_entries, extract_github_repo, - extract_preview, group_categories, load_stars, - parse_readme, - render_content_html, - slugify, sort_entries, ) +from readme_parser import slugify # --------------------------------------------------------------------------- # slugify @@ -51,244 +45,6 @@ class TestSlugify: assert slugify(" Date and Time ") == "date-and-time" -# --------------------------------------------------------------------------- -# count_entries -# --------------------------------------------------------------------------- - - -class TestCountEntries: - def test_counts_dash_entries(self): - assert count_entries("- [a](url) - Desc.\n- [b](url) - Desc.") == 2 - - def test_counts_star_entries(self): - assert count_entries("* [a](url) - Desc.") == 1 - - def test_ignores_non_entries(self): - assert count_entries("Some text\n- [a](url) - Desc.\nMore text") == 1 - - def test_counts_indented_entries(self): - assert count_entries(" - [a](url) - Desc.") == 1 - - def test_empty_content(self): - assert count_entries("") == 0 - - -# --------------------------------------------------------------------------- -# extract_preview -# --------------------------------------------------------------------------- - - -class TestExtractPreview: - def test_basic(self): - content = "* [alpha](url) - A.\n* [beta](url) - B.\n* [gamma](url) - C." - assert extract_preview(content) == "alpha, beta, gamma" - - def test_max_four(self): - content = "\n".join(f"* [lib{i}](url) - Desc." for i in range(10)) - assert extract_preview(content) == "lib0, lib1, lib2, lib3" - - def test_empty(self): - assert extract_preview("") == "" - - def test_skips_subcategory_labels(self): - content = "* Synchronous\n* [django](url) - Framework.\n* [flask](url) - Micro." - assert extract_preview(content) == "django, flask" - - -# --------------------------------------------------------------------------- -# render_content_html -# --------------------------------------------------------------------------- - - -class TestRenderContentHtml: - def test_basic_entry(self): - content = "* [django](https://example.com) - A web framework." - html = render_content_html(content) - assert 'href="https://example.com"' in html - assert "django" in html - assert "A web framework." in html - assert 'class="entry"' in html - - def test_subcategory_label(self): - content = "* Synchronous\n* [django](https://x.com) - Framework." - html = render_content_html(content) - assert 'class="subcat"' in html - assert "Synchronous" in html - - def test_sub_entry(self): - content = "* [django](https://x.com) - Framework.\n * [awesome-django](https://y.com)" - html = render_content_html(content) - assert 'class="entry-sub"' in html - assert "awesome-django" in html - - def test_link_only_entry(self): - content = "* [tool](https://x.com)" - html = render_content_html(content) - assert 'href="https://x.com"' in html - assert "tool" in html - - -# --------------------------------------------------------------------------- -# parse_readme -# --------------------------------------------------------------------------- - -MINIMAL_README = textwrap.dedent("""\ - # Awesome Python - - Some intro text. - - --- - - ## Alpha - - _Libraries for alpha stuff._ - - - [lib-a](https://example.com/a) - Does A. - - [lib-b](https://example.com/b) - Does B. - - ## Beta - - _Tools for beta._ - - - [lib-c](https://example.com/c) - Does C. - - # Resources - - Where to discover resources. - - ## Newsletters - - - [News One](https://example.com/n1) - - [News Two](https://example.com/n2) - - ## Podcasts - - - [Pod One](https://example.com/p1) - - # Contributing - - Please contribute! -""") - - -class TestParseReadme: - def test_category_count(self): - cats, resources = parse_readme(MINIMAL_README) - assert len(cats) == 2 - - def test_resource_count(self): - cats, resources = parse_readme(MINIMAL_README) - assert len(resources) == 2 - - def test_category_names(self): - cats, _ = parse_readme(MINIMAL_README) - assert cats[0]["name"] == "Alpha" - assert cats[1]["name"] == "Beta" - - def test_category_slugs(self): - cats, _ = parse_readme(MINIMAL_README) - assert cats[0]["slug"] == "alpha" - assert cats[1]["slug"] == "beta" - - def test_category_description(self): - cats, _ = parse_readme(MINIMAL_README) - assert cats[0]["description"] == "Libraries for alpha stuff." - assert cats[1]["description"] == "Tools for beta." - - def test_category_content_has_entries(self): - cats, _ = parse_readme(MINIMAL_README) - assert "lib-a" in cats[0]["content"] - assert "lib-b" in cats[0]["content"] - - def test_resources_names(self): - _, resources = parse_readme(MINIMAL_README) - assert resources[0]["name"] == "Newsletters" - assert resources[1]["name"] == "Podcasts" - - def test_resources_content(self): - _, resources = parse_readme(MINIMAL_README) - assert "News One" in resources[0]["content"] - assert "Pod One" in resources[1]["content"] - - def test_contributing_skipped(self): - cats, resources = parse_readme(MINIMAL_README) - all_names = [c["name"] for c in cats] + [r["name"] for r in resources] - assert "Contributing" not in all_names - - def test_no_separator(self): - cats, resources = parse_readme("# Just a heading\n\nSome text.\n") - assert cats == [] - assert resources == [] - - def test_no_description(self): - readme = textwrap.dedent("""\ - # Title - - --- - - ## NullDesc - - - [item](https://x.com) - Thing. - - # Resources - - ## Tips - - - [tip](https://x.com) - - # Contributing - - Done. - """) - cats, resources = parse_readme(readme) - assert cats[0]["description"] == "" - assert "item" in cats[0]["content"] - - -# --------------------------------------------------------------------------- -# parse_readme on real README -# --------------------------------------------------------------------------- - - -class TestParseRealReadme: - @pytest.fixture(autouse=True) - def load_readme(self): - readme_path = os.path.join(os.path.dirname(__file__), "..", "..", "README.md") - with open(readme_path, encoding="utf-8") as f: - self.readme_text = f.read() - self.cats, self.resources = parse_readme(self.readme_text) - - def test_at_least_83_categories(self): - assert len(self.cats) >= 83 - - def test_resources_has_newsletters_and_podcasts(self): - names = [r["name"] for r in self.resources] - assert "Newsletters" in names - assert "Podcasts" in names - - def test_contributing_not_in_results(self): - all_names = [c["name"] for c in self.cats] + [ - r["name"] for r in self.resources - ] - assert "Contributing" not in all_names - - def test_first_category_is_admin_panels(self): - assert self.cats[0]["name"] == "Admin Panels" - assert self.cats[0]["slug"] == "admin-panels" - - def test_last_category_is_wsgi_servers(self): - assert self.cats[-1]["name"] == "WSGI Servers" - assert self.cats[-1]["slug"] == "wsgi-servers" - - def test_restful_api_slug(self): - slugs = [c["slug"] for c in self.cats] - assert "restful-api" in slugs - - def test_descriptions_extracted(self): - admin = self.cats[0] - assert admin["description"] == "Libraries for administrative interfaces." - - # --------------------------------------------------------------------------- # group_categories # --------------------------------------------------------------------------- @@ -318,26 +74,6 @@ class TestGroupCategories: assert "Resources" in group_names -# --------------------------------------------------------------------------- -# render_markdown (kept for compatibility) -# --------------------------------------------------------------------------- - - -class TestRenderMarkdown: - def test_renders_link_list(self): - from build import render_markdown - - html = render_markdown("- [lib](https://example.com) - Does stuff.") - assert "
  • " in html - assert 'lib' in html - - def test_renders_plain_text(self): - from build import render_markdown - - html = render_markdown("Hello world") - assert "

    Hello world

    " in html - - # --------------------------------------------------------------------------- # build (integration) # ---------------------------------------------------------------------------