feat: generate llms.txt from template and annotate entries with star counts

- Add llms.txt Jinja2 template with a `categories_md` placeholder
- Extract the categories body from the README and inject it into the template
- Annotate bullet-entry lines with GitHub star counts: "(N GitHub stars)" for the main index.md and bare numbers for llms.txt
- Add TestAnnotateEntriesWithStars unit tests

Co-Authored-By: Claude <noreply@anthropic.com>
2026-05-23 16:25:48 +08:00 · 2026-05-02 02:32:18 +08:00
parent d9f26a8635
commit 429c9b3d12
3 changed files with 169 additions and 3 deletions
@@ -14,6 +14,8 @@ from jinja2 import Environment, FileSystemLoader
 from readme_parser import ParsedGroup, ParsedSection, parse_readme, parse_sponsors

 GITHUB_REPO_URL_RE = re.compile(r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$")
+MARKDOWN_LINK_RE = re.compile(r"\[[^\]]+\]\(([^)\s]+)\)")  # inline [text](url); group 1 is the url
+BULLET_LINE_RE = re.compile(r"^\s*-\s")  # "-" list item at any indentation
 SITE_URL = "https://awesome-python.com/"
 SITEMAP_URL = f"{SITE_URL}sitemap.xml"
 SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"
@@ -104,6 +106,72 @@ def top_level_heading_text(line: str) -> str | None:
    return stripped.removeprefix("#").strip().strip("#").strip().strip("*").strip()


+LLMS_CATEGORIES_PLACEHOLDER = "{{ categories_md }}"  # literal marker swapped out by build_llms_txt
+
+
+def extract_categories_body(markdown: str) -> str:
+    """Return the body of the `# Categories` section, without the heading line itself.
+
+    Blank lines right after the heading are skipped; the body ends at the next
+    top-level heading or at end of input, and the result always ends with exactly
+    one newline. Returns "" when no top-level heading reads "categories".
+    """
+    rows = markdown.splitlines(keepends=True)
+    for first, row in enumerate(rows, start=1):  # `first` = index just past the heading
+        if (top_level_heading_text(row) or "").lower() == "categories":
+            break
+    else:
+        return ""
+    while first < len(rows) and not rows[first].strip():
+        first += 1
+    stop = first
+    while stop < len(rows) and top_level_heading_text(rows[stop]) is None:
+        stop += 1
+    return "".join(rows[first:stop]).rstrip() + "\n"
+
+
+def build_llms_txt(template_text: str, readme_text: str, stars_data: dict[str, dict]) -> str:
+    """Fill the llms.txt template with the README's Categories body and add bare star counts."""
+    categories_md = extract_categories_body(readme_text).rstrip()
+    filled = template_text.replace(LLMS_CATEGORIES_PLACEHOLDER, categories_md)
+    return annotate_entries_with_stars(filled, stars_data, format_stars=str)
+
+
+def annotate_entries_with_stars(
+    markdown: str,
+    stars_data: dict[str, dict],
+    *,
+    format_stars=None,
+) -> str:
+    """Append a parenthesized star count to bullet lines that link to a known GitHub repo.
+
+    Only lines matching BULLET_LINE_RE are considered. On such a line the
+    Markdown links are scanned left to right, and the first one whose URL
+    resolves (via extract_github_repo) to a key of `stars_data` holding a
+    "stars" value decides the annotation; every other line is left untouched.
+
+    `format_stars` maps the star count to the parenthesized text. It defaults
+    to "{N} GitHub stars"; pass `str` for a bare number.
+    """
+    if format_stars is None:
+        format_stars = "{} GitHub stars".format
+    out: list[str] = []
+    for line in markdown.splitlines(keepends=True):
+        # Split off the terminator so the annotation lands before it. "\r" is
+        # stripped too: for CRLF input it must not end up in front of the text.
+        text = line.rstrip("\r\n")
+        ending = line[len(text):]
+        if BULLET_LINE_RE.match(line):
+            for match in MARKDOWN_LINK_RE.finditer(text):
+                repo_key = extract_github_repo(match.group(1))
+                entry = stars_data.get(repo_key) if repo_key else None
+                if entry and "stars" in entry:
+                    text = f"{text} ({format_stars(entry['stars'])})"
+                    break
+        out.append(text + ending)
+    return "".join(out)
+
+
 def remove_sponsors_section(markdown: str) -> str:
    lines = markdown.splitlines(keepends=True)
    start_idx = None
@@ -243,11 +311,15 @@ def build(repo_root: Path) -> None:
    if static_src.exists():
        shutil.copytree(static_src, static_dst, dirs_exist_ok=True)

-    markdown_index = remove_sponsors_section(readme_text)
+    markdown_index = annotate_entries_with_stars(
+        remove_sponsors_section(readme_text), stars_data
+    )
+    llms_template = (website / "templates" / "llms.txt").read_text(encoding="utf-8")
+    llms_txt = build_llms_txt(llms_template, readme_text, stars_data)
    (site_dir / "robots.txt").write_text(build_robots_txt(), encoding="utf-8")
    write_sitemap_xml(site_dir / "sitemap.xml", [(SITE_URL, build_date.date().isoformat())])
    (site_dir / "index.md").write_text(markdown_index, encoding="utf-8")
-    (site_dir / "llms.txt").write_text(markdown_index, encoding="utf-8")
+    (site_dir / "llms.txt").write_text(llms_txt, encoding="utf-8")

    print(f"Built single page with {len(parsed_groups)} groups, {len(categories)} categories")
    print(f"Total entries: {total_entries}")