Improve SEO/AEO discovery surface for awesome-python.com (#3103)

* update gitignore

* feat: tighten homepage metadata

* fix: trim generated HTML whitespace

* feat(website): add discovery files and markdown alternate

* feat(website): add sitemap lastmod

* feat(seo): add Content-Signal directive to robots.txt

Signals search, ai-input, and ai-train preferences to crawlers
via the experimental Content-Signal directive in robots.txt.

Co-Authored-By: Claude <noreply@anthropic.com>

---------

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Vinta Chen
2026-05-02 01:53:19 +08:00
committed by GitHub
parent ccd4fb7591
commit d9f26a8635
5 changed files with 260 additions and 27 deletions
+7 -7
View File
@@ -10,12 +10,12 @@ __pycache__/
website/output/ website/output/
website/data/ website/data/
# claude code # planning docs
.claude/skills/ docs/
.gstack/
.playwright-cli/
.superpowers/
skills-lock.json
# codex # agents
.agents/ .agents/
.claude/skills/
.superpowers/
.playwright-cli/
skills-lock.json
+1 -1
View File
@@ -1,6 +1,6 @@
# Awesome Python # Awesome Python
An opinionated list of Python frameworks, libraries, tools, and resources. An opinionated guide to the best Python frameworks, libraries, tools, and resources.
# **Sponsors** # **Sponsors**
+67 -2
View File
@@ -4,6 +4,8 @@
import json import json
import re import re
import shutil import shutil
import xml.etree.ElementTree as ET
from collections.abc import Sequence
from datetime import UTC, datetime from datetime import UTC, datetime
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
@@ -12,6 +14,9 @@ from jinja2 import Environment, FileSystemLoader
from readme_parser import ParsedGroup, ParsedSection, parse_readme, parse_sponsors from readme_parser import ParsedGroup, ParsedSection, parse_readme, parse_sponsors
GITHUB_REPO_URL_RE = re.compile(r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$") GITHUB_REPO_URL_RE = re.compile(r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$")
SITE_URL = "https://awesome-python.com/"
SITEMAP_URL = f"{SITE_URL}sitemap.xml"
SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"
SOURCE_TYPE_DOMAINS = { SOURCE_TYPE_DOMAINS = {
"docs.python.org": "Built-in", "docs.python.org": "Built-in",
@@ -67,6 +72,59 @@ def sort_entries(entries: list[dict]) -> list[dict]:
return sorted(entries, key=sort_key) return sorted(entries, key=sort_key)
def build_robots_txt() -> str:
    """Render the robots.txt body: allow all crawlers, declare Content-Signal
    preferences, and point at the sitemap.

    Returns the full file contents, newline-terminated.
    """
    directives = [
        "User-agent: *",
        "Content-Signal: search=yes, ai-input=yes, ai-train=yes",
        "Allow: /",
        "",
        f"Sitemap: {SITEMAP_URL}",
    ]
    return "\n".join(directives) + "\n"
def write_sitemap_xml(path: Path, urls: Sequence[tuple[str, str]]) -> None:
    """Write a sitemap.xml at *path* from (loc, lastmod) pairs.

    Each pair becomes one <url> element with <loc> and <lastmod> children in
    the sitemaps.org 0.9 namespace; the file gets an XML declaration and a
    trailing newline.
    """
    # Register an empty prefix so the namespace is emitted as a default xmlns.
    ET.register_namespace("", SITEMAP_NS)
    root = ET.Element(f"{{{SITEMAP_NS}}}urlset")
    for loc, lastmod in urls:
        entry = ET.SubElement(root, f"{{{SITEMAP_NS}}}url")
        ET.SubElement(entry, f"{{{SITEMAP_NS}}}loc").text = loc
        ET.SubElement(entry, f"{{{SITEMAP_NS}}}lastmod").text = lastmod
    ET.ElementTree(root).write(path, encoding="utf-8", xml_declaration=True)
    # ElementTree does not terminate the document with a newline; append one.
    with path.open("ab") as handle:
        handle.write(b"\n")
def top_level_heading_text(line: str) -> str | None:
stripped = line.strip()
if not stripped.startswith("# "):
return None
return stripped.removeprefix("#").strip().strip("#").strip().strip("*").strip()
def remove_sponsors_section(markdown: str) -> str:
    """Return *markdown* with its top-level "Sponsors" section removed.

    The section runs from the ``# Sponsors`` heading (matched
    case-insensitively, emphasis ignored) up to — but not including — the
    next level-1 heading, or to the end of the document. If no Sponsors
    heading exists, the input is returned unchanged.
    """
    lines = markdown.splitlines(keepends=True)

    start = None
    for index, line in enumerate(lines):
        title = top_level_heading_text(line)
        if title and title.lower() == "sponsors":
            start = index
            break
    if start is None:
        return markdown

    end = len(lines)
    for index in range(start + 1, len(lines)):
        if top_level_heading_text(lines[index]):
            end = index
            break

    return "".join(lines[:start] + lines[end:])
def extract_entries( def extract_entries(
categories: list[ParsedSection], categories: list[ParsedSection],
groups: list[ParsedGroup], groups: list[ParsedGroup],
@@ -131,6 +189,7 @@ def build(repo_root: Path) -> None:
categories = [cat for g in parsed_groups for cat in g["categories"]] categories = [cat for g in parsed_groups for cat in g["categories"]]
total_entries = sum(c["entry_count"] for c in categories) total_entries = sum(c["entry_count"] for c in categories)
entries = extract_entries(categories, parsed_groups) entries = extract_entries(categories, parsed_groups)
build_date = datetime.now(UTC)
stars_data = load_stars(website / "data" / "github_stars.json") stars_data = load_stars(website / "data" / "github_stars.json")
@@ -155,6 +214,8 @@ def build(repo_root: Path) -> None:
env = Environment( env = Environment(
loader=FileSystemLoader(website / "templates"), loader=FileSystemLoader(website / "templates"),
autoescape=True, autoescape=True,
trim_blocks=True,
lstrip_blocks=True,
) )
site_dir = website / "output" site_dir = website / "output"
@@ -171,7 +232,7 @@ def build(repo_root: Path) -> None:
total_entries=total_entries, total_entries=total_entries,
total_categories=len(categories), total_categories=len(categories),
repo_stars=repo_stars, repo_stars=repo_stars,
build_date=datetime.now(UTC).strftime("%B %d, %Y"), build_date=build_date.strftime("%B %d, %Y"),
sponsors=sponsors, sponsors=sponsors,
), ),
encoding="utf-8", encoding="utf-8",
@@ -182,7 +243,11 @@ def build(repo_root: Path) -> None:
if static_src.exists(): if static_src.exists():
shutil.copytree(static_src, static_dst, dirs_exist_ok=True) shutil.copytree(static_src, static_dst, dirs_exist_ok=True)
(site_dir / "llms.txt").write_text(readme_text, encoding="utf-8") markdown_index = remove_sponsors_section(readme_text)
(site_dir / "robots.txt").write_text(build_robots_txt(), encoding="utf-8")
write_sitemap_xml(site_dir / "sitemap.xml", [(SITE_URL, build_date.date().isoformat())])
(site_dir / "index.md").write_text(markdown_index, encoding="utf-8")
(site_dir / "llms.txt").write_text(markdown_index, encoding="utf-8")
print(f"Built single page with {len(parsed_groups)} groups, {len(categories)} categories") print(f"Built single page with {len(parsed_groups)} groups, {len(categories)} categories")
print(f"Total entries: {total_entries}") print(f"Total entries: {total_entries}")
+18 -17
View File
@@ -1,26 +1,27 @@
<!doctype html> <!doctype html>
<html lang="en"> <html lang="en">
<head> <head>
{% set default_meta_title = "Awesome Python" %}
{% set default_meta_description = "An opinionated guide to the best Python frameworks, libraries, and tools. Explore " ~ (entries | length) ~ " curated projects across " ~ total_categories ~ " categories, from AI and agents to data science and web development." %}
{% set canonical_url = "https://awesome-python.com/" %}
{% set social_image_url = "https://awesome-python.com/static/og-image.png" %}
{% set meta_title %}{% block title %}{{ default_meta_title }}{% endblock %}{% endset %}
{% set meta_description %}{% block description %}{{ default_meta_description }}{% endblock %}{% endset %}
<meta charset="utf-8" /> <meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" /> <meta name="viewport" content="width=device-width, initial-scale=1" />
<title>{% block title %}Awesome Python{% endblock %}</title> <title>{{ meta_title | trim }}</title>
<meta <meta name="description" content="{{ meta_description | trim }}" />
name="description" <link rel="canonical" href="{{ canonical_url }}" />
content="{% block description %}An opinionated list of Python frameworks, libraries, tools, and resources. {{ total_entries }} projects across {{ categories | length }} categories.{% endblock %}" <link rel="alternate" type="text/markdown" href="/index.md" />
/>
<link rel="canonical" href="https://awesome-python.com/" />
<meta property="og:type" content="website" /> <meta property="og:type" content="website" />
<meta property="og:title" content="Awesome Python" /> <meta property="og:title" content="{{ meta_title | trim }}" />
<meta <meta property="og:description" content="{{ meta_description | trim }}" />
property="og:description" <meta property="og:image" content="{{ social_image_url }}" />
content="An opinionated list of Python frameworks, libraries, tools, and resources." <meta property="og:url" content="{{ canonical_url }}" />
/> <meta name="twitter:card" content="summary_large_image" />
<meta <meta name="twitter:title" content="{{ meta_title | trim }}" />
property="og:image" <meta name="twitter:description" content="{{ meta_description | trim }}" />
content="https://awesome-python.com/static/og-image.png" <meta name="twitter:image" content="{{ social_image_url }}" />
/>
<meta property="og:url" content="https://awesome-python.com/" />
<meta name="twitter:card" content="summary" />
<meta name="theme-color" content="#1c1410" /> <meta name="theme-color" content="#1c1410" />
<link rel="icon" href="/static/favicon.svg" type="image/svg+xml" /> <link rel="icon" href="/static/favicon.svg" type="image/svg+xml" />
<link rel="preconnect" href="https://fonts.googleapis.com" /> <link rel="preconnect" href="https://fonts.googleapis.com" />
+167
View File
@@ -3,6 +3,9 @@
import json import json
import shutil import shutil
import textwrap import textwrap
import xml.etree.ElementTree as ET
from datetime import UTC, date, datetime
from html.parser import HTMLParser
from pathlib import Path from pathlib import Path
from build import ( from build import (
@@ -15,6 +18,40 @@ from build import (
) )
from readme_parser import parse_readme, slugify from readme_parser import parse_readme, slugify
class HeadMetadataParser(HTMLParser):
    """Collect <title>, <meta>, and <link> metadata from an HTML document.

    Test helper: after ``feed()``, assertions can inspect
    ``title`` / ``title_count``, ``meta_by_name`` / ``meta_by_property``
    (keyed by the name/property attribute, valued by content), and
    ``links_by_rel`` (one entry per whitespace-separated rel token).
    """

    def __init__(self):
        super().__init__()
        self.title_count = 0
        self.title = ""
        self.meta_by_name = {}
        self.meta_by_property = {}
        self.links_by_rel = {}
        self._in_title = False

    def handle_starttag(self, tag, attrs):
        attributes = dict(attrs)
        if tag == "meta":
            content = attributes.get("content", "")
            # A meta tag may carry name, property, or both; record each.
            if "name" in attributes:
                self.meta_by_name[attributes["name"]] = content
            if "property" in attributes:
                self.meta_by_property[attributes["property"]] = content
            return
        if tag == "link":
            rel_value = attributes.get("rel")
            if rel_value:
                href = attributes.get("href", "")
                # rel is space-separated; index the href under every token.
                for token in rel_value.split():
                    self.links_by_rel[token] = href
            return
        if tag == "title":
            self.title_count += 1
            self._in_title = True

    def handle_endtag(self, tag):
        if tag == "title":
            self._in_title = False

    def handle_data(self, data):
        if self._in_title:
            self.title += data
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# slugify # slugify
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -72,6 +109,11 @@ class TestBuild:
encoding="utf-8", encoding="utf-8",
) )
def _copy_real_templates(self, tmp_path):
    """Copy the real website templates into the tmp repo so build() renders
    the actual production HTML rather than test fixtures."""
    source = Path(__file__).parent / ".." / "templates"
    destination = tmp_path / "website" / "templates"
    shutil.copytree(source, destination)
def test_build_creates_single_page(self, tmp_path): def test_build_creates_single_page(self, tmp_path):
readme = textwrap.dedent("""\ readme = textwrap.dedent("""\
# Awesome Python # Awesome Python
@@ -114,6 +156,97 @@ class TestBuild:
# No category sub-pages # No category sub-pages
assert not (site / "categories").exists() assert not (site / "categories").exists()
def test_build_creates_root_discovery_files(self, tmp_path):
    """build() writes robots.txt (with Content-Signal) and a one-URL
    sitemap.xml whose lastmod is the build date."""
    # NOTE(review): blank lines inside this markdown literal reconstructed
    # from context — confirm against the repo's actual test fixture.
    readme = textwrap.dedent("""\
        # Awesome Python

        Intro.

        ---

        ## Widgets

        - [w1](https://example.com) - A widget.

        # Contributing

        Help!
        """)
    self._make_repo(tmp_path, readme)

    # Bracket the build so lastmod can be bounded without freezing the clock.
    lower_bound = datetime.now(UTC).date()
    build(tmp_path)
    upper_bound = datetime.now(UTC).date()

    site = tmp_path / "website" / "output"

    expected_robots = (
        "User-agent: *\n"
        "Content-Signal: search=yes, ai-input=yes, ai-train=yes\n"
        "Allow: /\n"
        "\n"
        "Sitemap: https://awesome-python.com/sitemap.xml\n"
    )
    assert (site / "robots.txt").read_text(encoding="utf-8") == expected_robots

    ns = {"sitemap": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    root = ET.parse(site / "sitemap.xml").getroot()
    assert root.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset"

    locs = [el.text for el in root.findall("sitemap:url/sitemap:loc", ns)]
    assert locs == ["https://awesome-python.com/"]
    assert all(loc.startswith("https://awesome-python.com/") for loc in locs)
    assert all("?" not in loc for loc in locs)

    lastmods = [el.text for el in root.findall("sitemap:url/sitemap:lastmod", ns)]
    assert len(lastmods) == 1
    assert lower_bound <= date.fromisoformat(lastmods[0]) <= upper_bound
def test_build_creates_markdown_alternate_without_sponsors(self, tmp_path):
    """index.md and llms.txt mirror the README with the Sponsors section
    stripped, and the HTML page advertises the markdown alternate."""
    # NOTE(review): blank lines inside this literal reconstructed — the
    # startswith assertion below requires them, but confirm against the repo.
    readme = textwrap.dedent("""\
        # Awesome Python

        Intro.

        # **Sponsors**

        - **[Sponsor](https://sponsor.example.com)**: Sponsored tool.

        > Become a sponsor: [Sponsor us](SPONSORSHIP.md).

        # Categories

        **Tools**

        - [Widgets](#widgets)

        ---

        ## Widgets

        - [w1](https://example.com) - A widget.

        # Contributing

        Help!
        """)
    (tmp_path / "README.md").write_text(readme, encoding="utf-8")
    self._copy_real_templates(tmp_path)

    build(tmp_path)

    site = tmp_path / "website" / "output"
    index_html = (site / "index.html").read_text(encoding="utf-8")
    index_md = (site / "index.md").read_text(encoding="utf-8")
    llms_txt = (site / "llms.txt").read_text(encoding="utf-8")

    # The rendered page links its markdown alternate.
    assert '<link rel="alternate" type="text/markdown" href="/index.md" />' in index_html

    # Both discovery files share one sponsor-free markdown body.
    assert index_md == llms_txt
    assert index_md.startswith("# Awesome Python\n\nIntro.\n\n# Categories")
    assert "# **Sponsors**" not in index_md
    assert "Sponsor" not in index_md
    assert "SPONSORSHIP.md" not in index_md
    assert "## Widgets" in index_md
    assert "- [w1](https://example.com) - A widget." in index_md
def test_build_cleans_stale_output(self, tmp_path): def test_build_cleans_stale_output(self, tmp_path):
readme = textwrap.dedent("""\ readme = textwrap.dedent("""\
# T # T
@@ -235,6 +368,40 @@ class TestBuild:
# Expand content present # Expand content present
assert "expand-content" in html assert "expand-content" in html
def test_index_contains_aligned_homepage_metadata(self, tmp_path):
    """The homepage <head> rendered from the real README carries one title
    and description/OG/Twitter tags that all agree with each other."""
    readme = (Path(__file__).parents[2] / "README.md").read_text(encoding="utf-8")
    (tmp_path / "README.md").write_text(readme, encoding="utf-8")
    self._copy_real_templates(tmp_path)

    build(tmp_path)

    # Recompute the counts the template interpolates into the description.
    parsed_groups = parse_readme(readme)
    categories = [cat for group in parsed_groups for cat in group["categories"]]
    entries = extract_entries(categories, parsed_groups)

    html = (tmp_path / "website" / "output" / "index.html").read_text(encoding="utf-8")
    head = HeadMetadataParser()
    head.feed(html)

    expected_title = "Awesome Python"
    expected_description = (
        "An opinionated guide to the best Python frameworks, libraries, and tools. "
        f"Explore {len(entries)} curated projects across {len(categories)} categories, "
        "from AI and agents to data science and web development."
    )
    expected_url = "https://awesome-python.com/"
    expected_image = "https://awesome-python.com/static/og-image.png"

    assert head.title_count == 1
    assert head.title.strip() == expected_title
    assert head.meta_by_name["description"] == expected_description
    assert head.links_by_rel["canonical"] == expected_url
    assert head.meta_by_property["og:type"] == "website"
    assert head.meta_by_property["og:title"] == expected_title
    assert head.meta_by_property["og:description"] == expected_description
    assert head.meta_by_property["og:image"] == expected_image
    assert head.meta_by_property["og:url"] == expected_url
    assert head.meta_by_name["twitter:card"] == "summary_large_image"
    assert head.meta_by_name["twitter:title"] == expected_title
    assert head.meta_by_name["twitter:description"] == expected_description
    assert head.meta_by_name["twitter:image"] == expected_image
    # trim_blocks/lstrip_blocks keep the Jinja set/block tags from leaving
    # blank lines at the top of <head>.
    assert "<head>\n <meta charset" in html
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# extract_github_repo # extract_github_repo