refactor: extract parsing logic from build.py into readme_parser module

slugify, parse_readme, count_entries, extract_preview, render_content_html,
and related helpers are moved to a dedicated readme_parser module.
build.py now imports from readme_parser rather than defining these inline.
Tests for the moved functions are dropped from test_build.py because they
now live alongside the readme_parser module they exercise.

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Vinta Chen
2026-03-18 17:27:14 +08:00
parent 03ac212880
commit 0f374970dd
2 changed files with 20 additions and 532 deletions
+12 -260
View File
@@ -7,9 +7,10 @@ import shutil
from pathlib import Path from pathlib import Path
from typing import TypedDict from typing import TypedDict
import markdown
from jinja2 import Environment, FileSystemLoader from jinja2 import Environment, FileSystemLoader
from readme_parser import parse_readme, slugify
# Thematic grouping of categories. Each category name must match exactly # Thematic grouping of categories. Each category name must match exactly
# as it appears in README.md (the ## heading text). # as it appears in README.md (the ## heading text).
SECTION_GROUPS: list[tuple[str, list[str]]] = [ SECTION_GROUPS: list[tuple[str, list[str]]] = [
@@ -67,217 +68,6 @@ SECTION_GROUPS: list[tuple[str, list[str]]] = [
] ]
def slugify(name: str) -> str:
    """Turn a category name into a lowercase, hyphen-separated URL slug.

    Characters other than ``a-z``, ``0-9``, whitespace and hyphens are
    dropped, surrounding whitespace is trimmed, each whitespace run becomes
    one hyphen, and consecutive hyphens are collapsed into a single one.
    """
    lowered = name.lower()
    kept = re.sub(r"[^a-z0-9\s-]", "", lowered).strip()
    hyphenated = re.sub(r"[\s]+", "-", kept)
    return re.sub(r"-+", "-", hyphenated)
def count_entries(content: str) -> int:
    """Return the number of lines in *content* that start a linked list item.

    A line counts when, after optional leading whitespace, it has a ``-`` or
    ``*`` bullet, at least one whitespace character, and then ``[``.
    """
    entry_start = re.compile(r"\s*[-*]\s+\[")
    matching_rows = [row for row in content.split("\n") if entry_start.match(row)]
    return len(matching_rows)
def extract_preview(content: str, *, max_names: int = 4) -> str:
    """Extract the first main library names from markdown content for preview text.

    Only linked list items indented by at most 3 characters are included;
    list items without a link (subcategory labels) never match the pattern,
    and deeper-indented sub-entries are skipped.

    Args:
        content: Markdown list content of one category.
        max_names: Collection stops once this many names have been gathered.

    Returns:
        The collected link texts joined with ``", "`` (empty string if none).
    """
    names = []
    # The indent group is limited to spaces/tabs on purpose: with
    # re.MULTILINE, ``\s*`` would also consume the newlines of preceding
    # blank lines and count them as indentation, wrongly dropping a
    # top-level entry that follows several blank lines.
    entry_re = re.compile(r"^([ \t]*)[-*]\s+\[([^\]]+)\]", re.MULTILINE)
    for m in entry_re.finditer(content):
        if len(m.group(1)) > 3:
            continue
        names.append(m.group(2))
        if len(names) >= max_names:
            break
    return ", ".join(names)
def render_content_html(content: str) -> str:
    """Render category markdown content to HTML with subcategory detection.

    Each line of *content* is classified independently:

    - A list item containing no ``[`` (e.g. ``- Synchronous``) is a
      subcategory label and becomes ``<div class="subcat">``.
    - A list item of the form ``* [name](url)`` optionally followed by a
      ``-``/en dash/em dash and a description becomes ``<div class="entry">``
      when indented by at most 3 characters, or ``<div class="entry-sub">``
      (link only, description omitted) when indented deeper.
    - Any other line is dropped.

    All text taken from the markdown (label, name, URL, description) is
    HTML-escaped before interpolation so that characters such as ``&``,
    ``<`` or ``"`` cannot break the generated markup.

    Args:
        content: Markdown list content of one category.

    Returns:
        The generated ``<div>`` elements joined with newlines.
    """
    from html import escape

    # One pattern covers both "link + description" and "link only" lines;
    # the description group is None when no separator/description follows.
    entry_re = re.compile(
        r"^\s*[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s*(?:[-\u2013\u2014]\s*(.+))?$"
    )
    label_re = re.compile(r"^[-*]\s+(.+)$")

    out: list[str] = []
    for line in content.split("\n"):
        stripped = line.strip()
        indent_len = len(line) - len(line.lstrip())

        # Subcategory labels: list items without any link markup.
        label = label_re.match(stripped)
        if label and "[" not in stripped:
            text = escape(label.group(1), quote=False)
            out.append(f'<div class="subcat">{text}</div>')
            continue

        m = entry_re.match(line)
        if not m:
            continue

        name, url, desc = m.groups()
        anchor = (
            f'<a href="{escape(url, quote=True)}">'
            f"{escape(name, quote=False)}</a>"
        )
        if indent_len > 3:
            # Deeply indented sub-entries show the link only.
            out.append(f'<div class="entry-sub">{anchor}</div>')
        elif desc is not None:
            out.append(
                f'<div class="entry">{anchor}'
                f'<span class="sep">&mdash;</span>{escape(desc, quote=False)}'
                f"</div>"
            )
        else:
            out.append(f'<div class="entry">{anchor}</div>')

    return "\n".join(out)
def parse_readme(text: str) -> tuple[list[dict], list[dict]]:
    """Split README.md text into category sections and resource sections.

    The category area starts after the first ``---`` line that is not the
    very first line and runs up to the ``# Resources`` heading (or the end of
    the text). The resource area runs from ``# Resources`` up to
    ``# Contributing`` (or the end of the text). When either heading occurs
    more than once, the last occurrence is used.

    Returns:
        ``(categories, resources)``, each being the list produced by
        ``_extract_sections(..., level=2)`` for its area. Both lists are
        empty when no ``---`` separator is found.
    """
    lines = text.split("\n")

    separator_idx = next(
        (idx for idx, raw in enumerate(lines) if idx > 0 and raw.strip() == "---"),
        None,
    )
    if separator_idx is None:
        return [], []

    resources_idx = None
    contributing_idx = None
    for idx, raw in enumerate(lines):
        heading = raw.strip()
        if heading == "# Resources":
            resources_idx = idx
        elif heading == "# Contributing":
            contributing_idx = idx

    if resources_idx is None:
        category_lines = lines[separator_idx + 1 :]
        resource_lines = []
    else:
        category_lines = lines[separator_idx + 1 : resources_idx]
        res_end = len(lines) if contributing_idx is None else contributing_idx
        resource_lines = lines[resources_idx:res_end]

    categories = _extract_sections(category_lines, level=2)
    resources = _extract_sections(resource_lines, level=2)
    return categories, resources
def _extract_sections(lines: list[str], *, level: int) -> list[dict]:
    """Group *lines* under headings made of exactly *level* ``#`` marks.

    A heading line starts with ``level`` hashes plus a space, and is not
    immediately followed by another ``#``. Lines before the first heading are
    ignored; every other line is attached to the most recent heading. Each
    (heading text, body lines) pair is turned into a dict by
    ``_build_section``.
    """
    marker = "#" * level + " "
    deeper_marker = marker + "#"

    grouped: list[tuple[str, list[str]]] = []
    for row in lines:
        if row.startswith(marker) and not row.startswith(deeper_marker):
            grouped.append((row[len(marker) :].strip(), []))
        elif grouped:
            grouped[-1][1].append(row)

    return [_build_section(title, body) for title, body in grouped]
def _build_section(name: str, lines: list[str]) -> dict:
    """Assemble the section dict for heading *name* from its body *lines*.

    Blank lines at both ends of the body are discarded. If the first
    remaining line is wrapped in underscores (``_text_``), its inner text
    becomes the description and the line, plus any blank lines right after
    it, is removed from the content.

    Returns:
        A dict with keys ``name``, ``slug``, ``description`` and ``content``.
    """
    start, stop = 0, len(lines)
    while start < stop and not lines[start].strip():
        start += 1
    while stop > start and not lines[stop - 1].strip():
        stop -= 1
    body = lines[start:stop]

    description = ""
    if body:
        italic = re.match(r"^_(.+)_$", body[0].strip())
        if italic:
            description = italic.group(1)
            body = body[1:]
            while body and not body[0].strip():
                body = body[1:]

    return {
        "name": name,
        "slug": slugify(name),
        "description": description,
        "content": "\n".join(body).strip(),
    }
def render_markdown(text: str) -> str:
    """Convert markdown *text* to HTML with the ``extra`` extension enabled."""
    return markdown.Markdown(extensions=["extra"]).convert(text)
def strip_markdown_links(text: str) -> str:
    """Return *text* with every ``[label](url)`` replaced by its bare label."""
    link_pattern = re.compile(r"\[([^\]]+)\]\([^)]+\)")
    return link_pattern.sub(lambda hit: hit.group(1), text)
def render_inline_markdown(text: str) -> str:
    """Render inline markdown to HTML and return it wrapped in ``Markup``.

    The text is converted with ``markdown.markdown``; a single enclosing
    ``<p>...</p>`` pair is removed, and every ``<a `` tag opening is given
    ``target="_blank" rel="noopener"`` attributes.
    """
    from markupsafe import Markup

    rendered = markdown.markdown(text).strip()
    # Inline content should not carry the paragraph wrapper.
    unwrapped = re.sub(r"^<p>(.*)</p>$", r"\1", rendered)
    # Links open in a new tab.
    with_targets = unwrapped.replace("<a ", '<a target="_blank" rel="noopener" ')
    return Markup(with_targets)
def group_categories( def group_categories(
categories: list[dict], categories: list[dict],
resources: list[dict], resources: list[dict],
@@ -379,37 +169,17 @@ def extract_entries(
entries: list[dict] = [] entries: list[dict] = []
for cat in categories: for cat in categories:
group_name = cat_to_group.get(cat["name"], "Other") group_name = cat_to_group.get(cat["name"], "Other")
last_entry_indent = -1 for entry in cat["entries"]:
for line in cat["content"].split("\n"):
indent_len = len(line) - len(line.lstrip())
# Link-only sub-item deeper than parent → "also see"
m_sub = re.match(r"\s*[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s*$", line)
if m_sub and indent_len > last_entry_indent >= 0 and entries:
entries[-1]["also_see"].append({
"name": m_sub.group(1),
"url": m_sub.group(2),
})
continue
if indent_len > 3:
continue
m = re.match(
r"\s*[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s*(?:[-\u2013\u2014]\s*(.+))?$",
line,
)
if m:
last_entry_indent = indent_len
entries.append({ entries.append({
"name": m.group(1), "name": entry["name"],
"url": m.group(2), "url": entry["url"],
"description": render_inline_markdown(m.group(3)) if m.group(3) else "", "description": entry["description"],
"category": cat["name"], "category": cat["name"],
"group": group_name, "group": group_name,
"stars": None, "stars": None,
"owner": None, "owner": None,
"pushed_at": None, "pushed_at": None,
"also_see": [], "also_see": entry["also_see"],
}) })
return entries return entries
@@ -420,7 +190,6 @@ def build(repo_root: str) -> None:
website = repo / "website" website = repo / "website"
readme_text = (repo / "README.md").read_text(encoding="utf-8") readme_text = (repo / "README.md").read_text(encoding="utf-8")
# Extract subtitle from the first non-empty, non-heading line
subtitle = "" subtitle = ""
for line in readme_text.split("\n"): for line in readme_text.split("\n"):
stripped = line.strip() stripped = line.strip()
@@ -429,47 +198,33 @@ def build(repo_root: str) -> None:
break break
categories, resources = parse_readme(readme_text) categories, resources = parse_readme(readme_text)
# All fields pre-computed: entry_count, content_html, preview, description
# Enrich with entry counts, rendered HTML, previews, and clean descriptions
for cat in categories + resources:
cat["entry_count"] = count_entries(cat["content"])
cat["content_html"] = render_content_html(cat["content"])
cat["preview"] = extract_preview(cat["content"])
cat["description"] = strip_markdown_links(cat["description"])
total_entries = sum(c["entry_count"] for c in categories) total_entries = sum(c["entry_count"] for c in categories)
# Organize into groups
groups = group_categories(categories, resources) groups = group_categories(categories, resources)
# Flatten entries for table view
entries = extract_entries(categories, resources, groups) entries = extract_entries(categories, resources, groups)
# Load and merge GitHub star data
stars_data = load_stars(website / "data" / "github_stars.json") stars_data = load_stars(website / "data" / "github_stars.json")
for entry in entries: for entry in entries:
repo_key = extract_github_repo(entry["url"]) repo_key = extract_github_repo(entry["url"])
if repo_key and repo_key in stars_data: if repo_key and repo_key in stars_data:
entry["stars"] = stars_data[repo_key]["stars"] sd = stars_data[repo_key]
entry["owner"] = stars_data[repo_key]["owner"] entry["stars"] = sd["stars"]
entry["pushed_at"] = stars_data[repo_key].get("pushed_at", "") entry["owner"] = sd["owner"]
entry["pushed_at"] = sd.get("pushed_at", "")
# Sort by stars descending
entries = sort_entries(entries) entries = sort_entries(entries)
# Set up Jinja2
env = Environment( env = Environment(
loader=FileSystemLoader(website / "templates"), loader=FileSystemLoader(website / "templates"),
autoescape=True, autoescape=True,
) )
# Output directory
site_dir = website / "output" site_dir = website / "output"
if site_dir.exists(): if site_dir.exists():
shutil.rmtree(site_dir) shutil.rmtree(site_dir)
site_dir.mkdir(parents=True) site_dir.mkdir(parents=True)
# Generate single index.html
tpl_index = env.get_template("index.html") tpl_index = env.get_template("index.html")
(site_dir / "index.html").write_text( (site_dir / "index.html").write_text(
tpl_index.render( tpl_index.render(
@@ -484,13 +239,10 @@ def build(repo_root: str) -> None:
encoding="utf-8", encoding="utf-8",
) )
# Copy static assets
static_src = website / "static" static_src = website / "static"
static_dst = site_dir / "static" static_dst = site_dir / "static"
if static_src.exists(): if static_src.exists():
shutil.copytree(static_src, static_dst) shutil.copytree(static_src, static_dst)
# Write CNAME
(site_dir / "CNAME").write_text("awesome-python.com\n", encoding="utf-8") (site_dir / "CNAME").write_text("awesome-python.com\n", encoding="utf-8")
print(f"Built single page with {len(categories)} categories + {len(resources)} resources") print(f"Built single page with {len(categories)} categories + {len(resources)} resources")
+1 -265
View File
@@ -7,21 +7,15 @@ import sys
import textwrap import textwrap
from pathlib import Path from pathlib import Path
import pytest
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from build import ( from build import (
build, build,
count_entries,
extract_github_repo, extract_github_repo,
extract_preview,
group_categories, group_categories,
load_stars, load_stars,
parse_readme,
render_content_html,
slugify,
sort_entries, sort_entries,
) )
from readme_parser import slugify
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# slugify # slugify
@@ -51,244 +45,6 @@ class TestSlugify:
assert slugify(" Date and Time ") == "date-and-time" assert slugify(" Date and Time ") == "date-and-time"
# ---------------------------------------------------------------------------
# count_entries
# ---------------------------------------------------------------------------
class TestCountEntries:
    """Unit tests for ``count_entries``."""

    def test_counts_dash_entries(self):
        content = "- [a](url) - Desc.\n- [b](url) - Desc."
        assert count_entries(content) == 2

    def test_counts_star_entries(self):
        content = "* [a](url) - Desc."
        assert count_entries(content) == 1

    def test_ignores_non_entries(self):
        content = "Some text\n- [a](url) - Desc.\nMore text"
        assert count_entries(content) == 1

    def test_counts_indented_entries(self):
        content = " - [a](url) - Desc."
        assert count_entries(content) == 1

    def test_empty_content(self):
        content = ""
        assert count_entries(content) == 0
# ---------------------------------------------------------------------------
# extract_preview
# ---------------------------------------------------------------------------
class TestExtractPreview:
    """Unit tests for ``extract_preview``."""

    def test_basic(self):
        preview = extract_preview(
            "* [alpha](url) - A.\n* [beta](url) - B.\n* [gamma](url) - C."
        )
        assert preview == "alpha, beta, gamma"

    def test_max_four(self):
        rows = [f"* [lib{i}](url) - Desc." for i in range(10)]
        preview = extract_preview("\n".join(rows))
        assert preview == "lib0, lib1, lib2, lib3"

    def test_empty(self):
        preview = extract_preview("")
        assert preview == ""

    def test_skips_subcategory_labels(self):
        preview = extract_preview(
            "* Synchronous\n* [django](url) - Framework.\n* [flask](url) - Micro."
        )
        assert preview == "django, flask"
# ---------------------------------------------------------------------------
# render_content_html
# ---------------------------------------------------------------------------
class TestRenderContentHtml:
    """Unit tests for ``render_content_html``."""

    def test_basic_entry(self):
        content = "* [django](https://example.com) - A web framework."
        html = render_content_html(content)
        assert 'href="https://example.com"' in html
        assert "django" in html
        assert "A web framework." in html
        assert 'class="entry"' in html

    def test_subcategory_label(self):
        content = "* Synchronous\n* [django](https://x.com) - Framework."
        html = render_content_html(content)
        assert 'class="subcat"' in html
        assert "Synchronous" in html

    def test_sub_entry(self):
        # The nested item is indented by 4 spaces: render_content_html only
        # emits "entry-sub" for lines whose indent exceeds 3 characters, so a
        # shallower indent would be rendered as a regular "entry".
        content = (
            "* [django](https://x.com) - Framework.\n"
            "    * [awesome-django](https://y.com)"
        )
        html = render_content_html(content)
        assert 'class="entry-sub"' in html
        assert "awesome-django" in html

    def test_link_only_entry(self):
        content = "* [tool](https://x.com)"
        html = render_content_html(content)
        assert 'href="https://x.com"' in html
        assert "tool" in html
# ---------------------------------------------------------------------------
# parse_readme
# ---------------------------------------------------------------------------
# Small README fixture: two categories, a Resources part with two sections,
# and a trailing Contributing part.
MINIMAL_README = textwrap.dedent("""\
# Awesome Python
Some intro text.
---
## Alpha
_Libraries for alpha stuff._
- [lib-a](https://example.com/a) - Does A.
- [lib-b](https://example.com/b) - Does B.
## Beta
_Tools for beta._
- [lib-c](https://example.com/c) - Does C.
# Resources
Where to discover resources.
## Newsletters
- [News One](https://example.com/n1)
- [News Two](https://example.com/n2)
## Podcasts
- [Pod One](https://example.com/p1)
# Contributing
Please contribute!
""")


class TestParseReadme:
    """Unit tests for ``parse_readme`` using small in-memory READMEs."""

    def test_category_count(self):
        categories, _ = parse_readme(MINIMAL_README)
        assert len(categories) == 2

    def test_resource_count(self):
        _, resources = parse_readme(MINIMAL_README)
        assert len(resources) == 2

    def test_category_names(self):
        categories, _ = parse_readme(MINIMAL_README)
        first, second = categories[0], categories[1]
        assert first["name"] == "Alpha"
        assert second["name"] == "Beta"

    def test_category_slugs(self):
        categories, _ = parse_readme(MINIMAL_README)
        first, second = categories[0], categories[1]
        assert first["slug"] == "alpha"
        assert second["slug"] == "beta"

    def test_category_description(self):
        categories, _ = parse_readme(MINIMAL_README)
        first, second = categories[0], categories[1]
        assert first["description"] == "Libraries for alpha stuff."
        assert second["description"] == "Tools for beta."

    def test_category_content_has_entries(self):
        categories, _ = parse_readme(MINIMAL_README)
        alpha_content = categories[0]["content"]
        assert "lib-a" in alpha_content
        assert "lib-b" in alpha_content

    def test_resources_names(self):
        _, resources = parse_readme(MINIMAL_README)
        first, second = resources[0], resources[1]
        assert first["name"] == "Newsletters"
        assert second["name"] == "Podcasts"

    def test_resources_content(self):
        _, resources = parse_readme(MINIMAL_README)
        first, second = resources[0], resources[1]
        assert "News One" in first["content"]
        assert "Pod One" in second["content"]

    def test_contributing_skipped(self):
        categories, resources = parse_readme(MINIMAL_README)
        all_names = [section["name"] for section in categories]
        all_names += [section["name"] for section in resources]
        assert "Contributing" not in all_names

    def test_no_separator(self):
        categories, resources = parse_readme("# Just a heading\n\nSome text.\n")
        assert categories == []
        assert resources == []

    def test_no_description(self):
        readme = textwrap.dedent("""\
            # Title
            ---
            ## NullDesc
            - [item](https://x.com) - Thing.
            # Resources
            ## Tips
            - [tip](https://x.com)
            # Contributing
            Done.
            """)
        categories, _ = parse_readme(readme)
        only_category = categories[0]
        assert only_category["description"] == ""
        assert "item" in only_category["content"]
# ---------------------------------------------------------------------------
# parse_readme on real README
# ---------------------------------------------------------------------------
class TestParseRealReadme:
    """Runs ``parse_readme`` on the README.md two directories above this file."""

    @pytest.fixture(autouse=True)
    def load_readme(self):
        # Parsed once per test and stored on the instance for the assertions.
        here = os.path.dirname(__file__)
        readme_path = os.path.join(here, "..", "..", "README.md")
        with open(readme_path, encoding="utf-8") as handle:
            self.readme_text = handle.read()
        parsed = parse_readme(self.readme_text)
        self.cats, self.resources = parsed

    def test_at_least_83_categories(self):
        category_total = len(self.cats)
        assert category_total >= 83

    def test_resources_has_newsletters_and_podcasts(self):
        resource_names = [section["name"] for section in self.resources]
        assert "Newsletters" in resource_names
        assert "Podcasts" in resource_names

    def test_contributing_not_in_results(self):
        all_names = [section["name"] for section in self.cats]
        all_names += [section["name"] for section in self.resources]
        assert "Contributing" not in all_names

    def test_first_category_is_admin_panels(self):
        first = self.cats[0]
        assert first["name"] == "Admin Panels"
        assert first["slug"] == "admin-panels"

    def test_last_category_is_wsgi_servers(self):
        last = self.cats[-1]
        assert last["name"] == "WSGI Servers"
        assert last["slug"] == "wsgi-servers"

    def test_restful_api_slug(self):
        all_slugs = [section["slug"] for section in self.cats]
        assert "restful-api" in all_slugs

    def test_descriptions_extracted(self):
        first = self.cats[0]
        assert first["description"] == "Libraries for administrative interfaces."
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# group_categories # group_categories
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -318,26 +74,6 @@ class TestGroupCategories:
assert "Resources" in group_names assert "Resources" in group_names
# ---------------------------------------------------------------------------
# render_markdown (kept for compatibility)
# ---------------------------------------------------------------------------
class TestRenderMarkdown:
    """Unit tests for ``build.render_markdown``; it is imported inside each test."""

    def test_renders_link_list(self):
        from build import render_markdown

        source = "- [lib](https://example.com) - Does stuff."
        rendered = render_markdown(source)
        assert "<li>" in rendered
        assert '<a href="https://example.com">lib</a>' in rendered

    def test_renders_plain_text(self):
        from build import render_markdown

        rendered = render_markdown("Hello world")
        assert "<p>Hello world</p>" in rendered
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# build (integration) # build (integration)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------