Improve SEO/AEO discovery surface for awesome-python.com (#3103)

* update gitignore * feat: tighten homepage metadata * fix: trim generated HTML whitespace * feat(website): add discovery files and markdown alternate * feat(website): add sitemap lastmod * feat(seo): add Content-Signal directive to robots.txt Signals search, ai-input, and ai-train to crawlers via the experimental Content-Signal header in robots.txt. Co-Authored-By: Claude <noreply@anthropic.com> --------- Co-authored-by: Claude <noreply@anthropic.com>
2026-05-21 01:32:09 +08:00 · 2026-05-02 01:53:19 +08:00
parent ccd4fb7591
commit d9f26a8635
5 changed files with 260 additions and 27 deletions
@@ -4,6 +4,8 @@
 import json
 import re
 import shutil
+import xml.etree.ElementTree as ET
+from collections.abc import Sequence
 from datetime import UTC, datetime
 from pathlib import Path
 from typing import Any
@@ -12,6 +14,9 @@ from jinja2 import Environment, FileSystemLoader
 from readme_parser import ParsedGroup, ParsedSection, parse_readme, parse_sponsors

 GITHUB_REPO_URL_RE = re.compile(r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$")
+SITE_URL = "https://awesome-python.com/"
+SITEMAP_URL = f"{SITE_URL}sitemap.xml"
+SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"

 SOURCE_TYPE_DOMAINS = {
    "docs.python.org": "Built-in",
@@ -67,6 +72,59 @@ def sort_entries(entries: list[dict]) -> list[dict]:
    return sorted(entries, key=sort_key)


+def build_robots_txt() -> str:
+    return (
+        "User-agent: *\n"
+        "Content-Signal: search=yes, ai-input=yes, ai-train=yes\n"
+        "Allow: /\n"
+        "\n"
+        f"Sitemap: {SITEMAP_URL}\n"
+    )
+
+
+def write_sitemap_xml(path: Path, urls: Sequence[tuple[str, str]]) -> None:
+    ET.register_namespace("", SITEMAP_NS)
+    urlset = ET.Element(f"{{{SITEMAP_NS}}}urlset")
+    for url, lastmod in urls:
+        url_el = ET.SubElement(urlset, f"{{{SITEMAP_NS}}}url")
+        loc_el = ET.SubElement(url_el, f"{{{SITEMAP_NS}}}loc")
+        loc_el.text = url
+        lastmod_el = ET.SubElement(url_el, f"{{{SITEMAP_NS}}}lastmod")
+        lastmod_el.text = lastmod
+
+    ET.ElementTree(urlset).write(path, encoding="utf-8", xml_declaration=True)
+    with path.open("ab") as f:
+        f.write(b"\n")
+
+
+def top_level_heading_text(line: str) -> str | None:
+    stripped = line.strip()
+    if not stripped.startswith("# "):
+        return None
+    return stripped.removeprefix("#").strip().strip("#").strip().strip("*").strip()
+
+
+def remove_sponsors_section(markdown: str) -> str:
+    lines = markdown.splitlines(keepends=True)
+    start_idx = None
+    for i, line in enumerate(lines):
+        heading = top_level_heading_text(line)
+        if heading and heading.lower() == "sponsors":
+            start_idx = i
+            break
+
+    if start_idx is None:
+        return markdown
+
+    end_idx = len(lines)
+    for i, line in enumerate(lines[start_idx + 1 :], start=start_idx + 1):
+        if top_level_heading_text(line):
+            end_idx = i
+            break
+
+    return "".join(lines[:start_idx] + lines[end_idx:])
+
+
 def extract_entries(
    categories: list[ParsedSection],
    groups: list[ParsedGroup],
@@ -131,6 +189,7 @@ def build(repo_root: Path) -> None:
    categories = [cat for g in parsed_groups for cat in g["categories"]]
    total_entries = sum(c["entry_count"] for c in categories)
    entries = extract_entries(categories, parsed_groups)
+    build_date = datetime.now(UTC)

    stars_data = load_stars(website / "data" / "github_stars.json")

@@ -155,6 +214,8 @@ def build(repo_root: Path) -> None:
    env = Environment(
        loader=FileSystemLoader(website / "templates"),
        autoescape=True,
+        trim_blocks=True,
+        lstrip_blocks=True,
    )

    site_dir = website / "output"
@@ -171,7 +232,7 @@ def build(repo_root: Path) -> None:
            total_entries=total_entries,
            total_categories=len(categories),
            repo_stars=repo_stars,
-            build_date=datetime.now(UTC).strftime("%B %d, %Y"),
+            build_date=build_date.strftime("%B %d, %Y"),
            sponsors=sponsors,
        ),
        encoding="utf-8",
@@ -182,7 +243,11 @@ def build(repo_root: Path) -> None:
    if static_src.exists():
        shutil.copytree(static_src, static_dst, dirs_exist_ok=True)

-    (site_dir / "llms.txt").write_text(readme_text, encoding="utf-8")
+    markdown_index = remove_sponsors_section(readme_text)
+    (site_dir / "robots.txt").write_text(build_robots_txt(), encoding="utf-8")
+    write_sitemap_xml(site_dir / "sitemap.xml", [(SITE_URL, build_date.date().isoformat())])
+    (site_dir / "index.md").write_text(markdown_index, encoding="utf-8")
+    (site_dir / "llms.txt").write_text(markdown_index, encoding="utf-8")

    print(f"Built single page with {len(parsed_groups)} groups, {len(categories)} categories")
    print(f"Total entries: {total_entries}")