mirror of
https://github.com/vinta/awesome-python.git
synced 2026-05-21 01:32:09 +08:00
Improve SEO/AEO discovery surface for awesome-python.com (#3103)
* update gitignore * feat: tighten homepage metadata * fix: trim generated HTML whitespace * feat(website): add discovery files and markdown alternate * feat(website): add sitemap lastmod * feat(seo): add Content-Signal directive to robots.txt Signals search, ai-input, and ai-train to crawlers via the experimental Content-Signal header in robots.txt. Co-Authored-By: Claude <noreply@anthropic.com> --------- Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
+67
-2
@@ -4,6 +4,8 @@
|
||||
import json
|
||||
import re
|
||||
import shutil
|
||||
import xml.etree.ElementTree as ET
|
||||
from collections.abc import Sequence
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
@@ -12,6 +14,9 @@ from jinja2 import Environment, FileSystemLoader
|
||||
from readme_parser import ParsedGroup, ParsedSection, parse_readme, parse_sponsors
|
||||
|
||||
GITHUB_REPO_URL_RE = re.compile(r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$")
|
||||
SITE_URL = "https://awesome-python.com/"
|
||||
SITEMAP_URL = f"{SITE_URL}sitemap.xml"
|
||||
SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"
|
||||
|
||||
SOURCE_TYPE_DOMAINS = {
|
||||
"docs.python.org": "Built-in",
|
||||
@@ -67,6 +72,59 @@ def sort_entries(entries: list[dict]) -> list[dict]:
|
||||
return sorted(entries, key=sort_key)
|
||||
|
||||
|
||||
def build_robots_txt() -> str:
|
||||
return (
|
||||
"User-agent: *\n"
|
||||
"Content-Signal: search=yes, ai-input=yes, ai-train=yes\n"
|
||||
"Allow: /\n"
|
||||
"\n"
|
||||
f"Sitemap: {SITEMAP_URL}\n"
|
||||
)
|
||||
|
||||
|
||||
def write_sitemap_xml(path: Path, urls: Sequence[tuple[str, str]]) -> None:
|
||||
ET.register_namespace("", SITEMAP_NS)
|
||||
urlset = ET.Element(f"{{{SITEMAP_NS}}}urlset")
|
||||
for url, lastmod in urls:
|
||||
url_el = ET.SubElement(urlset, f"{{{SITEMAP_NS}}}url")
|
||||
loc_el = ET.SubElement(url_el, f"{{{SITEMAP_NS}}}loc")
|
||||
loc_el.text = url
|
||||
lastmod_el = ET.SubElement(url_el, f"{{{SITEMAP_NS}}}lastmod")
|
||||
lastmod_el.text = lastmod
|
||||
|
||||
ET.ElementTree(urlset).write(path, encoding="utf-8", xml_declaration=True)
|
||||
with path.open("ab") as f:
|
||||
f.write(b"\n")
|
||||
|
||||
|
||||
def top_level_heading_text(line: str) -> str | None:
|
||||
stripped = line.strip()
|
||||
if not stripped.startswith("# "):
|
||||
return None
|
||||
return stripped.removeprefix("#").strip().strip("#").strip().strip("*").strip()
|
||||
|
||||
|
||||
def remove_sponsors_section(markdown: str) -> str:
|
||||
lines = markdown.splitlines(keepends=True)
|
||||
start_idx = None
|
||||
for i, line in enumerate(lines):
|
||||
heading = top_level_heading_text(line)
|
||||
if heading and heading.lower() == "sponsors":
|
||||
start_idx = i
|
||||
break
|
||||
|
||||
if start_idx is None:
|
||||
return markdown
|
||||
|
||||
end_idx = len(lines)
|
||||
for i, line in enumerate(lines[start_idx + 1 :], start=start_idx + 1):
|
||||
if top_level_heading_text(line):
|
||||
end_idx = i
|
||||
break
|
||||
|
||||
return "".join(lines[:start_idx] + lines[end_idx:])
|
||||
|
||||
|
||||
def extract_entries(
|
||||
categories: list[ParsedSection],
|
||||
groups: list[ParsedGroup],
|
||||
@@ -131,6 +189,7 @@ def build(repo_root: Path) -> None:
|
||||
categories = [cat for g in parsed_groups for cat in g["categories"]]
|
||||
total_entries = sum(c["entry_count"] for c in categories)
|
||||
entries = extract_entries(categories, parsed_groups)
|
||||
build_date = datetime.now(UTC)
|
||||
|
||||
stars_data = load_stars(website / "data" / "github_stars.json")
|
||||
|
||||
@@ -155,6 +214,8 @@ def build(repo_root: Path) -> None:
|
||||
env = Environment(
|
||||
loader=FileSystemLoader(website / "templates"),
|
||||
autoescape=True,
|
||||
trim_blocks=True,
|
||||
lstrip_blocks=True,
|
||||
)
|
||||
|
||||
site_dir = website / "output"
|
||||
@@ -171,7 +232,7 @@ def build(repo_root: Path) -> None:
|
||||
total_entries=total_entries,
|
||||
total_categories=len(categories),
|
||||
repo_stars=repo_stars,
|
||||
build_date=datetime.now(UTC).strftime("%B %d, %Y"),
|
||||
build_date=build_date.strftime("%B %d, %Y"),
|
||||
sponsors=sponsors,
|
||||
),
|
||||
encoding="utf-8",
|
||||
@@ -182,7 +243,11 @@ def build(repo_root: Path) -> None:
|
||||
if static_src.exists():
|
||||
shutil.copytree(static_src, static_dst, dirs_exist_ok=True)
|
||||
|
||||
(site_dir / "llms.txt").write_text(readme_text, encoding="utf-8")
|
||||
markdown_index = remove_sponsors_section(readme_text)
|
||||
(site_dir / "robots.txt").write_text(build_robots_txt(), encoding="utf-8")
|
||||
write_sitemap_xml(site_dir / "sitemap.xml", [(SITE_URL, build_date.date().isoformat())])
|
||||
(site_dir / "index.md").write_text(markdown_index, encoding="utf-8")
|
||||
(site_dir / "llms.txt").write_text(markdown_index, encoding="utf-8")
|
||||
|
||||
print(f"Built single page with {len(parsed_groups)} groups, {len(categories)} categories")
|
||||
print(f"Total entries: {total_entries}")
|
||||
|
||||
Reference in New Issue
Block a user