Improve SEO/AEO discovery surface for awesome-python.com (#3103)

* update gitignore

* feat: tighten homepage metadata

* fix: trim generated HTML whitespace

* feat(website): add discovery files and markdown alternate

* feat(website): add sitemap lastmod

* feat(seo): add Content-Signal directive to robots.txt

Signals search, ai-input, and ai-train to crawlers
via the experimental Content-Signal header in robots.txt.

Co-Authored-By: Claude <noreply@anthropic.com>

---------

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Vinta Chen
2026-05-02 01:53:19 +08:00
committed by GitHub
parent ccd4fb7591
commit d9f26a8635
5 changed files with 260 additions and 27 deletions
+67 -2
View File
@@ -4,6 +4,8 @@
import json
import re
import shutil
import xml.etree.ElementTree as ET
from collections.abc import Sequence
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
@@ -12,6 +14,9 @@ from jinja2 import Environment, FileSystemLoader
from readme_parser import ParsedGroup, ParsedSection, parse_readme, parse_sponsors
GITHUB_REPO_URL_RE = re.compile(r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$")
SITE_URL = "https://awesome-python.com/"
SITEMAP_URL = f"{SITE_URL}sitemap.xml"
SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"
SOURCE_TYPE_DOMAINS = {
"docs.python.org": "Built-in",
@@ -67,6 +72,59 @@ def sort_entries(entries: list[dict]) -> list[dict]:
return sorted(entries, key=sort_key)
def build_robots_txt() -> str:
return (
"User-agent: *\n"
"Content-Signal: search=yes, ai-input=yes, ai-train=yes\n"
"Allow: /\n"
"\n"
f"Sitemap: {SITEMAP_URL}\n"
)
def write_sitemap_xml(path: Path, urls: Sequence[tuple[str, str]]) -> None:
ET.register_namespace("", SITEMAP_NS)
urlset = ET.Element(f"{{{SITEMAP_NS}}}urlset")
for url, lastmod in urls:
url_el = ET.SubElement(urlset, f"{{{SITEMAP_NS}}}url")
loc_el = ET.SubElement(url_el, f"{{{SITEMAP_NS}}}loc")
loc_el.text = url
lastmod_el = ET.SubElement(url_el, f"{{{SITEMAP_NS}}}lastmod")
lastmod_el.text = lastmod
ET.ElementTree(urlset).write(path, encoding="utf-8", xml_declaration=True)
with path.open("ab") as f:
f.write(b"\n")
def top_level_heading_text(line: str) -> str | None:
stripped = line.strip()
if not stripped.startswith("# "):
return None
return stripped.removeprefix("#").strip().strip("#").strip().strip("*").strip()
def remove_sponsors_section(markdown: str) -> str:
lines = markdown.splitlines(keepends=True)
start_idx = None
for i, line in enumerate(lines):
heading = top_level_heading_text(line)
if heading and heading.lower() == "sponsors":
start_idx = i
break
if start_idx is None:
return markdown
end_idx = len(lines)
for i, line in enumerate(lines[start_idx + 1 :], start=start_idx + 1):
if top_level_heading_text(line):
end_idx = i
break
return "".join(lines[:start_idx] + lines[end_idx:])
def extract_entries(
categories: list[ParsedSection],
groups: list[ParsedGroup],
@@ -131,6 +189,7 @@ def build(repo_root: Path) -> None:
categories = [cat for g in parsed_groups for cat in g["categories"]]
total_entries = sum(c["entry_count"] for c in categories)
entries = extract_entries(categories, parsed_groups)
build_date = datetime.now(UTC)
stars_data = load_stars(website / "data" / "github_stars.json")
@@ -155,6 +214,8 @@ def build(repo_root: Path) -> None:
env = Environment(
loader=FileSystemLoader(website / "templates"),
autoescape=True,
trim_blocks=True,
lstrip_blocks=True,
)
site_dir = website / "output"
@@ -171,7 +232,7 @@ def build(repo_root: Path) -> None:
total_entries=total_entries,
total_categories=len(categories),
repo_stars=repo_stars,
build_date=datetime.now(UTC).strftime("%B %d, %Y"),
build_date=build_date.strftime("%B %d, %Y"),
sponsors=sponsors,
),
encoding="utf-8",
@@ -182,7 +243,11 @@ def build(repo_root: Path) -> None:
if static_src.exists():
shutil.copytree(static_src, static_dst, dirs_exist_ok=True)
(site_dir / "llms.txt").write_text(readme_text, encoding="utf-8")
markdown_index = remove_sponsors_section(readme_text)
(site_dir / "robots.txt").write_text(build_robots_txt(), encoding="utf-8")
write_sitemap_xml(site_dir / "sitemap.xml", [(SITE_URL, build_date.date().isoformat())])
(site_dir / "index.md").write_text(markdown_index, encoding="utf-8")
(site_dir / "llms.txt").write_text(markdown_index, encoding="utf-8")
print(f"Built single page with {len(parsed_groups)} groups, {len(categories)} categories")
print(f"Total entries: {total_entries}")