feat: generate llms.txt from template and annotate entries with star counts
Deploy Website / deploy (push) Has been cancelled
CI / test (push) Has been cancelled

- Add llms.txt Jinja2 template with a categories_md placeholder
- Extract categories body from README and inject it into the template
- Annotate bullet-entry lines with GitHub star counts: "(N GitHub stars)"
  in the main index.md and a bare "(N)" in llms.txt
- Add TestAnnotateEntriesWithStars unit tests

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Vinta Chen
2026-05-02 02:32:18 +08:00
parent d9f26a8635
commit 429c9b3d12
3 changed files with 169 additions and 3 deletions
+74 -2
View File
@@ -14,6 +14,8 @@ from jinja2 import Environment, FileSystemLoader
from readme_parser import ParsedGroup, ParsedSection, parse_readme, parse_sponsors
# Matches an http(s) github.com repository URL, tolerating an optional ".git"
# suffix and a trailing slash; group 1 captures the "owner/repo" part.
GITHUB_REPO_URL_RE = re.compile(r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$")
# Matches an inline Markdown link `[text](url)`; group 1 captures the URL,
# which may not contain whitespace or ")".
MARKDOWN_LINK_RE = re.compile(r"\[[^\]]+\]\(([^)\s]+)\)")
# Matches a line that begins with optional whitespace, a "-" and one whitespace character.
BULLET_LINE_RE = re.compile(r"^\s*-\s")
# Site root URL; it ends with a slash so relative paths can be appended directly.
SITE_URL = "https://awesome-python.com/"
# Sitemap location, derived from SITE_URL.
SITEMAP_URL = f"{SITE_URL}sitemap.xml"
# XML namespace URI used for sitemap documents.
SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"
@@ -104,6 +106,72 @@ def top_level_heading_text(line: str) -> str | None:
return stripped.removeprefix("#").strip().strip("#").strip().strip("*").strip()
# Literal token in the llms.txt template that build_llms_txt swaps (via plain
# string replacement) for the README's Categories body.
LLMS_CATEGORIES_PLACEHOLDER = "{{ categories_md }}"
def extract_categories_body(markdown: str) -> str:
    """Return the text that sits below the `# Categories` heading.

    The heading line itself and any blank lines directly after it are dropped.
    The body runs until the next line that `top_level_heading_text` recognizes
    as a heading, or to the end of the document. Trailing whitespace is trimmed
    and a single newline is appended. An empty string is returned when no
    "categories" heading (case-insensitive) is found.
    """
    all_lines = markdown.splitlines(keepends=True)
    total = len(all_lines)
    body_start = None
    body_end = total

    for pos, text in enumerate(all_lines):
        title = top_level_heading_text(text)
        if title is None:
            continue

        if body_start is not None:
            # Already inside the Categories section: the next heading closes it.
            if pos >= body_start:
                body_end = pos
                break
            continue

        if title.lower() != "categories":
            continue

        # Found the heading; the body begins at the first non-blank line after it.
        body_start = pos + 1
        while body_start < total and not all_lines[body_start].strip():
            body_start += 1

    if body_start is None:
        return ""

    section = "".join(all_lines[body_start:body_end])
    return section.rstrip() + "\n"
def build_llms_txt(template_text: str, readme_text: str, stars_data: dict[str, dict]) -> str:
    """Produce the llms.txt content from its template and the README.

    The README's Categories body (trailing whitespace removed) replaces every
    occurrence of `LLMS_CATEGORIES_PLACEHOLDER` in `template_text`. The filled-in
    text is then passed through `annotate_entries_with_stars` with `str` as the
    formatter, so star counts appear as bare numbers in parentheses.
    """
    categories_md = extract_categories_body(readme_text).rstrip()
    filled_template = template_text.replace(LLMS_CATEGORIES_PLACEHOLDER, categories_md)
    return annotate_entries_with_stars(
        filled_template,
        stars_data,
        format_stars=str,
    )
def annotate_entries_with_stars(
    markdown: str,
    stars_data: dict[str, dict],
    *,
    format_stars=None,
) -> str:
    """Append a star count to bullet lines that link to a GitHub repo with known stars.

    Every line matching `BULLET_LINE_RE` is scanned for Markdown links in order.
    The first link whose URL `extract_github_repo` resolves to a key that is
    present in `stars_data` with a "stars" field decides the annotation; links
    that are not GitHub repos, or that have no star data, are skipped. Lines
    without such a link, and all non-bullet lines, pass through unchanged.

    Args:
        markdown: The Markdown text to annotate.
        stars_data: Maps a repo key (as returned by `extract_github_repo`) to a
            dict; only its "stars" value is read here.
        format_stars: Callable that turns the star count into the parenthesized
            text. Defaults to "{N} GitHub stars". Pass `str` for a bare number.

    Returns:
        The annotated Markdown, with each line's original line ending preserved.
    """

    def _default_format(count) -> str:
        return f"{count} GitHub stars"

    render = _default_format if format_stars is None else format_stars

    out: list[str] = []
    for line in markdown.splitlines(keepends=True):
        if BULLET_LINE_RE.match(line):
            for match in MARKDOWN_LINK_RE.finditer(line):
                repo_key = extract_github_repo(match.group(1))
                if not repo_key:
                    continue
                entry = stars_data.get(repo_key)
                if not entry or "stars" not in entry:
                    continue
                # Strip "\r" as well as "\n" so that on CRLF input the annotation
                # is inserted before the whole line ending, not after the "\r".
                content = line.rstrip("\r\n")
                ending = line[len(content):]
                line = f"{content} ({render(entry['stars'])}){ending}"
                # Only the first link with star data annotates the line.
                break
        out.append(line)
    return "".join(out)
def remove_sponsors_section(markdown: str) -> str:
lines = markdown.splitlines(keepends=True)
start_idx = None
@@ -243,11 +311,15 @@ def build(repo_root: Path) -> None:
if static_src.exists():
shutil.copytree(static_src, static_dst, dirs_exist_ok=True)
markdown_index = remove_sponsors_section(readme_text)
markdown_index = annotate_entries_with_stars(
remove_sponsors_section(readme_text), stars_data
)
llms_template = (website / "templates" / "llms.txt").read_text(encoding="utf-8")
llms_txt = build_llms_txt(llms_template, readme_text, stars_data)
(site_dir / "robots.txt").write_text(build_robots_txt(), encoding="utf-8")
write_sitemap_xml(site_dir / "sitemap.xml", [(SITE_URL, build_date.date().isoformat())])
(site_dir / "index.md").write_text(markdown_index, encoding="utf-8")
(site_dir / "llms.txt").write_text(markdown_index, encoding="utf-8")
(site_dir / "llms.txt").write_text(llms_txt, encoding="utf-8")
print(f"Built single page with {len(parsed_groups)} groups, {len(categories)} categories")
print(f"Total entries: {total_entries}")