mirror of
https://github.com/vinta/awesome-python.git
synced 2026-05-10 07:09:08 +08:00
Improve SEO/AEO discovery surface for awesome-python.com (#3103)
* update gitignore * feat: tighten homepage metadata * fix: trim generated HTML whitespace * feat(website): add discovery files and markdown alternate * feat(website): add sitemap lastmod * feat(seo): add Content-Signal directive to robots.txt Signals search, ai-input, and ai-train to crawlers via the experimental Content-Signal header in robots.txt. Co-Authored-By: Claude <noreply@anthropic.com> --------- Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
+7
-7
@@ -10,12 +10,12 @@ __pycache__/
|
|||||||
website/output/
|
website/output/
|
||||||
website/data/
|
website/data/
|
||||||
|
|
||||||
# claude code
|
# planning docs
|
||||||
.claude/skills/
|
docs/
|
||||||
.gstack/
|
|
||||||
.playwright-cli/
|
|
||||||
.superpowers/
|
|
||||||
skills-lock.json
|
|
||||||
|
|
||||||
# codex
|
# agents
|
||||||
.agents/
|
.agents/
|
||||||
|
.claude/skills/
|
||||||
|
.superpowers/
|
||||||
|
.playwright-cli/
|
||||||
|
skills-lock.json
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# Awesome Python
|
# Awesome Python
|
||||||
|
|
||||||
An opinionated list of Python frameworks, libraries, tools, and resources.
|
An opinionated guide to the best Python frameworks, libraries, tools, and resources.
|
||||||
|
|
||||||
# **Sponsors**
|
# **Sponsors**
|
||||||
|
|
||||||
|
|||||||
+67
-2
@@ -4,6 +4,8 @@
|
|||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
from collections.abc import Sequence
|
||||||
from datetime import UTC, datetime
|
from datetime import UTC, datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
@@ -12,6 +14,9 @@ from jinja2 import Environment, FileSystemLoader
|
|||||||
from readme_parser import ParsedGroup, ParsedSection, parse_readme, parse_sponsors
|
from readme_parser import ParsedGroup, ParsedSection, parse_readme, parse_sponsors
|
||||||
|
|
||||||
GITHUB_REPO_URL_RE = re.compile(r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$")
|
GITHUB_REPO_URL_RE = re.compile(r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$")
|
||||||
|
SITE_URL = "https://awesome-python.com/"
|
||||||
|
SITEMAP_URL = f"{SITE_URL}sitemap.xml"
|
||||||
|
SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"
|
||||||
|
|
||||||
SOURCE_TYPE_DOMAINS = {
|
SOURCE_TYPE_DOMAINS = {
|
||||||
"docs.python.org": "Built-in",
|
"docs.python.org": "Built-in",
|
||||||
@@ -67,6 +72,59 @@ def sort_entries(entries: list[dict]) -> list[dict]:
|
|||||||
return sorted(entries, key=sort_key)
|
return sorted(entries, key=sort_key)
|
||||||
|
|
||||||
|
|
||||||
|
def build_robots_txt() -> str:
    """Return the text of the robots.txt file written to the site root.

    The rules apply to every user agent, allow the whole site, declare the
    Content-Signal values (search, ai-input and ai-train all ``yes``), and
    end with a ``Sitemap:`` line pointing at ``SITEMAP_URL``.
    """
    directives = [
        "User-agent: *",
        "Content-Signal: search=yes, ai-input=yes, ai-train=yes",
        "Allow: /",
        "",  # blank line separating the rule group from the sitemap reference
        f"Sitemap: {SITEMAP_URL}",
    ]
    return "\n".join(directives) + "\n"
|
||||||
|
|
||||||
|
|
||||||
|
def write_sitemap_xml(path: Path, urls: Sequence[tuple[str, str]]) -> None:
    """Write a sitemap ``urlset`` document to *path*.

    Args:
        path: Destination file; it is created or overwritten.
        urls: ``(loc, lastmod)`` pairs. Each pair becomes one ``<url>``
            element, in order, with the two values used verbatim as the
            text of its ``<loc>`` and ``<lastmod>`` children.
    """
    # Registering the empty prefix makes the serializer emit the sitemap
    # namespace as the default ``xmlns`` instead of an ``ns0:`` prefix.
    ET.register_namespace("", SITEMAP_NS)
    urlset = ET.Element(f"{{{SITEMAP_NS}}}urlset")
    for url, lastmod in urls:
        url_el = ET.SubElement(urlset, f"{{{SITEMAP_NS}}}url")
        ET.SubElement(url_el, f"{{{SITEMAP_NS}}}loc").text = url
        ET.SubElement(url_el, f"{{{SITEMAP_NS}}}lastmod").text = lastmod

    # Serialize in memory and write the document plus its trailing newline in
    # a single call, rather than writing the file and reopening it to append.
    document = ET.tostring(urlset, encoding="utf-8", xml_declaration=True)
    path.write_bytes(document + b"\n")
|
||||||
|
|
||||||
|
|
||||||
|
def top_level_heading_text(line: str) -> str | None:
|
||||||
|
stripped = line.strip()
|
||||||
|
if not stripped.startswith("# "):
|
||||||
|
return None
|
||||||
|
return stripped.removeprefix("#").strip().strip("#").strip().strip("*").strip()
|
||||||
|
|
||||||
|
|
||||||
|
def remove_sponsors_section(markdown: str) -> str:
|
||||||
|
lines = markdown.splitlines(keepends=True)
|
||||||
|
start_idx = None
|
||||||
|
for i, line in enumerate(lines):
|
||||||
|
heading = top_level_heading_text(line)
|
||||||
|
if heading and heading.lower() == "sponsors":
|
||||||
|
start_idx = i
|
||||||
|
break
|
||||||
|
|
||||||
|
if start_idx is None:
|
||||||
|
return markdown
|
||||||
|
|
||||||
|
end_idx = len(lines)
|
||||||
|
for i, line in enumerate(lines[start_idx + 1 :], start=start_idx + 1):
|
||||||
|
if top_level_heading_text(line):
|
||||||
|
end_idx = i
|
||||||
|
break
|
||||||
|
|
||||||
|
return "".join(lines[:start_idx] + lines[end_idx:])
|
||||||
|
|
||||||
|
|
||||||
def extract_entries(
|
def extract_entries(
|
||||||
categories: list[ParsedSection],
|
categories: list[ParsedSection],
|
||||||
groups: list[ParsedGroup],
|
groups: list[ParsedGroup],
|
||||||
@@ -131,6 +189,7 @@ def build(repo_root: Path) -> None:
|
|||||||
categories = [cat for g in parsed_groups for cat in g["categories"]]
|
categories = [cat for g in parsed_groups for cat in g["categories"]]
|
||||||
total_entries = sum(c["entry_count"] for c in categories)
|
total_entries = sum(c["entry_count"] for c in categories)
|
||||||
entries = extract_entries(categories, parsed_groups)
|
entries = extract_entries(categories, parsed_groups)
|
||||||
|
build_date = datetime.now(UTC)
|
||||||
|
|
||||||
stars_data = load_stars(website / "data" / "github_stars.json")
|
stars_data = load_stars(website / "data" / "github_stars.json")
|
||||||
|
|
||||||
@@ -155,6 +214,8 @@ def build(repo_root: Path) -> None:
|
|||||||
env = Environment(
|
env = Environment(
|
||||||
loader=FileSystemLoader(website / "templates"),
|
loader=FileSystemLoader(website / "templates"),
|
||||||
autoescape=True,
|
autoescape=True,
|
||||||
|
trim_blocks=True,
|
||||||
|
lstrip_blocks=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
site_dir = website / "output"
|
site_dir = website / "output"
|
||||||
@@ -171,7 +232,7 @@ def build(repo_root: Path) -> None:
|
|||||||
total_entries=total_entries,
|
total_entries=total_entries,
|
||||||
total_categories=len(categories),
|
total_categories=len(categories),
|
||||||
repo_stars=repo_stars,
|
repo_stars=repo_stars,
|
||||||
build_date=datetime.now(UTC).strftime("%B %d, %Y"),
|
build_date=build_date.strftime("%B %d, %Y"),
|
||||||
sponsors=sponsors,
|
sponsors=sponsors,
|
||||||
),
|
),
|
||||||
encoding="utf-8",
|
encoding="utf-8",
|
||||||
@@ -182,7 +243,11 @@ def build(repo_root: Path) -> None:
|
|||||||
if static_src.exists():
|
if static_src.exists():
|
||||||
shutil.copytree(static_src, static_dst, dirs_exist_ok=True)
|
shutil.copytree(static_src, static_dst, dirs_exist_ok=True)
|
||||||
|
|
||||||
(site_dir / "llms.txt").write_text(readme_text, encoding="utf-8")
|
markdown_index = remove_sponsors_section(readme_text)
|
||||||
|
(site_dir / "robots.txt").write_text(build_robots_txt(), encoding="utf-8")
|
||||||
|
write_sitemap_xml(site_dir / "sitemap.xml", [(SITE_URL, build_date.date().isoformat())])
|
||||||
|
(site_dir / "index.md").write_text(markdown_index, encoding="utf-8")
|
||||||
|
(site_dir / "llms.txt").write_text(markdown_index, encoding="utf-8")
|
||||||
|
|
||||||
print(f"Built single page with {len(parsed_groups)} groups, {len(categories)} categories")
|
print(f"Built single page with {len(parsed_groups)} groups, {len(categories)} categories")
|
||||||
print(f"Total entries: {total_entries}")
|
print(f"Total entries: {total_entries}")
|
||||||
|
|||||||
+18
-17
@@ -1,26 +1,27 @@
|
|||||||
<!doctype html>
|
<!doctype html>
|
||||||
<html lang="en">
|
<html lang="en">
|
||||||
<head>
|
<head>
|
||||||
|
{% set default_meta_title = "Awesome Python" %}
|
||||||
|
{% set default_meta_description = "An opinionated guide to the best Python frameworks, libraries, and tools. Explore " ~ (entries | length) ~ " curated projects across " ~ total_categories ~ " categories, from AI and agents to data science and web development." %}
|
||||||
|
{% set canonical_url = "https://awesome-python.com/" %}
|
||||||
|
{% set social_image_url = "https://awesome-python.com/static/og-image.png" %}
|
||||||
|
{% set meta_title %}{% block title %}{{ default_meta_title }}{% endblock %}{% endset %}
|
||||||
|
{% set meta_description %}{% block description %}{{ default_meta_description }}{% endblock %}{% endset %}
|
||||||
<meta charset="utf-8" />
|
<meta charset="utf-8" />
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||||
<title>{% block title %}Awesome Python{% endblock %}</title>
|
<title>{{ meta_title | trim }}</title>
|
||||||
<meta
|
<meta name="description" content="{{ meta_description | trim }}" />
|
||||||
name="description"
|
<link rel="canonical" href="{{ canonical_url }}" />
|
||||||
content="{% block description %}An opinionated list of Python frameworks, libraries, tools, and resources. {{ total_entries }} projects across {{ categories | length }} categories.{% endblock %}"
|
<link rel="alternate" type="text/markdown" href="/index.md" />
|
||||||
/>
|
|
||||||
<link rel="canonical" href="https://awesome-python.com/" />
|
|
||||||
<meta property="og:type" content="website" />
|
<meta property="og:type" content="website" />
|
||||||
<meta property="og:title" content="Awesome Python" />
|
<meta property="og:title" content="{{ meta_title | trim }}" />
|
||||||
<meta
|
<meta property="og:description" content="{{ meta_description | trim }}" />
|
||||||
property="og:description"
|
<meta property="og:image" content="{{ social_image_url }}" />
|
||||||
content="An opinionated list of Python frameworks, libraries, tools, and resources."
|
<meta property="og:url" content="{{ canonical_url }}" />
|
||||||
/>
|
<meta name="twitter:card" content="summary_large_image" />
|
||||||
<meta
|
<meta name="twitter:title" content="{{ meta_title | trim }}" />
|
||||||
property="og:image"
|
<meta name="twitter:description" content="{{ meta_description | trim }}" />
|
||||||
content="https://awesome-python.com/static/og-image.png"
|
<meta name="twitter:image" content="{{ social_image_url }}" />
|
||||||
/>
|
|
||||||
<meta property="og:url" content="https://awesome-python.com/" />
|
|
||||||
<meta name="twitter:card" content="summary" />
|
|
||||||
<meta name="theme-color" content="#1c1410" />
|
<meta name="theme-color" content="#1c1410" />
|
||||||
<link rel="icon" href="/static/favicon.svg" type="image/svg+xml" />
|
<link rel="icon" href="/static/favicon.svg" type="image/svg+xml" />
|
||||||
<link rel="preconnect" href="https://fonts.googleapis.com" />
|
<link rel="preconnect" href="https://fonts.googleapis.com" />
|
||||||
|
|||||||
@@ -3,6 +3,9 @@
|
|||||||
import json
|
import json
|
||||||
import shutil
|
import shutil
|
||||||
import textwrap
|
import textwrap
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
from datetime import UTC, date, datetime
|
||||||
|
from html.parser import HTMLParser
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from build import (
|
from build import (
|
||||||
@@ -15,6 +18,40 @@ from build import (
|
|||||||
)
|
)
|
||||||
from readme_parser import parse_readme, slugify
|
from readme_parser import parse_readme, slugify
|
||||||
|
|
||||||
|
|
||||||
|
class HeadMetadataParser(HTMLParser):
    """Collect ``<title>``, ``<meta>`` and ``<link>`` data from fed HTML.

    After ``feed()``:
      * ``title_count`` is the number of ``<title>`` start tags seen and
        ``title`` is the concatenated text found inside them.
      * ``meta_by_name`` / ``meta_by_property`` map a ``<meta>`` tag's
        ``name`` / ``property`` attribute to its ``content``.
      * ``links_by_rel`` maps every whitespace-separated token of a
        ``<link>`` tag's ``rel`` attribute to its ``href``.

    Later tags overwrite earlier ones that share a key. A missing or
    valueless ``content`` / ``href`` attribute is stored as ``""``.
    """

    def __init__(self) -> None:
        super().__init__()
        self.title_count: int = 0
        self.title: str = ""
        self.meta_by_name: dict = {}
        self.meta_by_property: dict = {}
        self.links_by_rel: dict = {}
        self._in_title: bool = False

    def handle_starttag(self, tag: str, attrs: list) -> None:
        """Record metadata carried by a ``title``, ``meta`` or ``link`` tag."""
        attr_map = dict(attrs)
        if tag == "title":
            self.title_count += 1
            self._in_title = True
        elif tag == "meta":
            # A valueless attribute is reported with value None, which a
            # plain .get(key, "") would pass through; normalise it to "".
            content = attr_map.get("content") or ""
            if "name" in attr_map:
                self.meta_by_name[attr_map["name"]] = content
            if "property" in attr_map:
                self.meta_by_property[attr_map["property"]] = content
        elif tag == "link" and attr_map.get("rel"):
            href = attr_map.get("href") or ""
            for rel in attr_map["rel"].split():
                self.links_by_rel[rel] = href

    def handle_endtag(self, tag: str) -> None:
        """Stop collecting title text once ``</title>`` is reached."""
        if tag == "title":
            self._in_title = False

    def handle_data(self, data: str) -> None:
        """Append text to ``title`` while inside a ``<title>`` element."""
        if self._in_title:
            self.title += data
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# slugify
|
# slugify
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -72,6 +109,11 @@ class TestBuild:
|
|||||||
encoding="utf-8",
|
encoding="utf-8",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _copy_real_templates(self, tmp_path):
    """Copy the ``templates`` directory that sits one level above this test
    file's directory into ``tmp_path/website/templates``."""
    source_templates = Path(__file__).parent / ".." / "templates"
    shutil.copytree(source_templates, tmp_path / "website" / "templates")
|
||||||
|
|
||||||
def test_build_creates_single_page(self, tmp_path):
|
def test_build_creates_single_page(self, tmp_path):
|
||||||
readme = textwrap.dedent("""\
|
readme = textwrap.dedent("""\
|
||||||
# Awesome Python
|
# Awesome Python
|
||||||
@@ -114,6 +156,97 @@ class TestBuild:
|
|||||||
# No category sub-pages
|
# No category sub-pages
|
||||||
assert not (site / "categories").exists()
|
assert not (site / "categories").exists()
|
||||||
|
|
||||||
|
def test_build_creates_root_discovery_files(self, tmp_path):
    """build() writes robots.txt and a single-URL sitemap.xml dated today."""
    readme = textwrap.dedent("""\
        # Awesome Python

        Intro.

        ---

        ## Widgets

        - [w1](https://example.com) - A widget.

        # Contributing

        Help!
    """)
    self._make_repo(tmp_path, readme)

    # Sample the UTC date on both sides of the build so the lastmod check
    # still holds if the build happens to run across midnight.
    date_before = datetime.now(UTC).date()
    build(tmp_path)
    date_after = datetime.now(UTC).date()

    output_dir = tmp_path / "website" / "output"

    expected_robots = (
        "User-agent: *\n"
        "Content-Signal: search=yes, ai-input=yes, ai-train=yes\n"
        "Allow: /\n"
        "\n"
        "Sitemap: https://awesome-python.com/sitemap.xml\n"
    )
    assert (output_dir / "robots.txt").read_text(encoding="utf-8") == expected_robots

    urlset = ET.parse(output_dir / "sitemap.xml").getroot()
    prefixes = {"sitemap": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    locs = [el.text for el in urlset.findall("sitemap:url/sitemap:loc", prefixes)]
    lastmods = [el.text for el in urlset.findall("sitemap:url/sitemap:lastmod", prefixes)]

    assert urlset.tag == "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset"
    assert locs == ["https://awesome-python.com/"]
    assert len(lastmods) == 1
    assert date_before <= date.fromisoformat(lastmods[0]) <= date_after
    for loc in locs:
        assert loc.startswith("https://awesome-python.com/")
        assert "?" not in loc
|
||||||
|
|
||||||
|
def test_build_creates_markdown_alternate_without_sponsors(self, tmp_path):
    """index.md and llms.txt are identical and omit the Sponsors section,
    and index.html links to the Markdown alternate."""
    readme = textwrap.dedent("""\
        # Awesome Python

        Intro.

        # **Sponsors**

        - **[Sponsor](https://sponsor.example.com)**: Sponsored tool.

        > Become a sponsor: [Sponsor us](SPONSORSHIP.md).

        # Categories

        **Tools**

        - [Widgets](#widgets)

        ---

        ## Widgets

        - [w1](https://example.com) - A widget.

        # Contributing

        Help!
    """)
    (tmp_path / "README.md").write_text(readme, encoding="utf-8")
    self._copy_real_templates(tmp_path)

    build(tmp_path)

    output_dir = tmp_path / "website" / "output"
    index_html = (output_dir / "index.html").read_text(encoding="utf-8")
    index_md = (output_dir / "index.md").read_text(encoding="utf-8")
    llms_txt = (output_dir / "llms.txt").read_text(encoding="utf-8")

    assert '<link rel="alternate" type="text/markdown" href="/index.md" />' in index_html
    assert index_md == llms_txt
    assert index_md.startswith("# Awesome Python\n\nIntro.\n\n# Categories")

    # Nothing from the sponsors section may survive in the Markdown output.
    for removed in ("# **Sponsors**", "Sponsor", "SPONSORSHIP.md"):
        assert removed not in index_md

    # Content after the sponsors section must still be present.
    for kept in ("## Widgets", "- [w1](https://example.com) - A widget."):
        assert kept in index_md
|
||||||
|
|
||||||
def test_build_cleans_stale_output(self, tmp_path):
|
def test_build_cleans_stale_output(self, tmp_path):
|
||||||
readme = textwrap.dedent("""\
|
readme = textwrap.dedent("""\
|
||||||
# T
|
# T
|
||||||
@@ -235,6 +368,40 @@ class TestBuild:
|
|||||||
# Expand content present
|
# Expand content present
|
||||||
assert "expand-content" in html
|
assert "expand-content" in html
|
||||||
|
|
||||||
|
def test_index_contains_aligned_homepage_metadata(self, tmp_path):
    """Title, description, canonical, Open Graph and Twitter tags rendered
    from the real README and templates all carry the same expected values."""
    readme = (Path(__file__).parents[2] / "README.md").read_text(encoding="utf-8")
    (tmp_path / "README.md").write_text(readme, encoding="utf-8")
    self._copy_real_templates(tmp_path)

    build(tmp_path)

    # Recompute the counts the description is expected to mention.
    groups = parse_readme(readme)
    categories = [category for group in groups for category in group["categories"]]
    entry_total = len(extract_entries(categories, groups))

    html = (tmp_path / "website" / "output" / "index.html").read_text(encoding="utf-8")
    head = HeadMetadataParser()
    head.feed(html)

    expected_title = "Awesome Python"
    expected_description = (
        "An opinionated guide to the best Python frameworks, libraries, and tools. "
        f"Explore {entry_total} curated projects across {len(categories)} categories, "
        "from AI and agents to data science and web development."
    )
    expected_url = "https://awesome-python.com/"
    expected_image = "https://awesome-python.com/static/og-image.png"

    assert head.title_count == 1
    assert head.title.strip() == expected_title
    assert head.meta_by_name["description"] == expected_description
    assert head.links_by_rel["canonical"] == expected_url

    expected_open_graph = {
        "og:type": "website",
        "og:title": expected_title,
        "og:description": expected_description,
        "og:image": expected_image,
        "og:url": expected_url,
    }
    for prop, value in expected_open_graph.items():
        assert head.meta_by_property[prop] == value

    expected_twitter = {
        "twitter:card": "summary_large_image",
        "twitter:title": expected_title,
        "twitter:description": expected_description,
        "twitter:image": expected_image,
    }
    for name, value in expected_twitter.items():
        assert head.meta_by_name[name] == value

    # NOTE(review): the whitespace after "\n" in this literal must match the
    # template's indentation of the first <meta> tag - confirm.
    assert "<head>\n <meta charset" in html
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# extract_github_repo
|
# extract_github_repo
|
||||||
|
|||||||
Reference in New Issue
Block a user