Merge pull request #2972 from vinta/feature/fetch-stats-workflow

ci: consolidate star fetch into deploy workflow with Actions cache
This commit is contained in:
Vinta Chen
2026-03-18 22:57:43 +08:00
committed by GitHub
7 changed files with 296 additions and 3227 deletions

View File

@@ -4,6 +4,8 @@ on:
push:
branches:
- master
schedule:
- cron: "0 0 * * *"
permissions:
contents: read
@@ -26,10 +28,40 @@ jobs:
enable-cache: true
- name: Install dependencies
run: uv sync --no-dev
run: uv sync --group build
- name: Restore star data cache
id: cache-stars
uses: actions/cache/restore@v4
with:
path: website/data/github_stars.json
key: github-stars-${{ github.run_id }}
restore-keys: github-stars-
- name: Fetch GitHub stars
id: fetch-stars
continue-on-error: true
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: make fetch_github_stars
- name: Save star data cache
if: steps.fetch-stars.outcome == 'success'
uses: actions/cache/save@v4
with:
path: website/data/github_stars.json
key: github-stars-${{ github.run_id }}
- name: Verify star data exists
run: |
if [ ! -f website/data/github_stars.json ]; then
echo "::error::github_stars.json not found. No cache and fetch failed or was skipped."
exit 1
fi
echo "Star data found: $(wc -l < website/data/github_stars.json) lines"
- name: Build site
run: uv run python website/build.py
run: make build
- name: Upload artifact
uses: actions/upload-pages-artifact@v4

1
.gitignore vendored
View File

@@ -7,6 +7,7 @@
# website
website/output/
website/data/
# claude code
.claude/skills/

View File

@@ -4,7 +4,7 @@ export
install:
uv sync
fetch_stats:
fetch_github_stars:
uv run python website/fetch_github_stars.py
test:

View File

@@ -8,62 +8,144 @@ from pathlib import Path
from typing import TypedDict
from jinja2 import Environment, FileSystemLoader
from readme_parser import parse_readme, slugify
# Thematic grouping of categories. Each category name must match exactly
# as it appears in README.md (the ## heading text).
SECTION_GROUPS: list[tuple[str, list[str]]] = [
("Web & API", [
"Web Frameworks", "RESTful API", "GraphQL", "WebSocket",
"ASGI Servers", "WSGI Servers", "HTTP Clients", "Template Engine",
"Web Asset Management", "Web Content Extracting", "Web Crawling",
]),
("Data & ML", [
"Data Analysis", "Data Validation", "Data Visualization",
"Machine Learning", "Deep Learning", "Computer Vision",
"Natural Language Processing", "Recommender Systems", "Science",
"Quantum Computing",
]),
("DevOps & Infrastructure", [
"DevOps Tools", "Distributed Computing", "Task Queues",
"Job Scheduler", "Serverless Frameworks", "Logging", "Processes",
"Shell", "Network Virtualization", "RPC Servers",
]),
("Database & Storage", [
"Database", "Database Drivers", "ORM", "Caching", "Search",
"Serialization",
]),
("Development Tools", [
"Testing", "Debugging Tools", "Code Analysis", "Build Tools",
"Refactoring", "Documentation", "Editor Plugins and IDEs",
"Interactive Interpreter",
]),
("CLI & GUI", [
"Command-line Interface Development", "Command-line Tools",
"GUI Development",
]),
("Content & Media", [
"Audio", "Video", "Image Processing", "HTML Manipulation",
"Text Processing", "Specific Formats Processing",
"File Manipulation", "Downloader",
]),
("System & Runtime", [
"Asynchronous Programming", "Environment Management",
"Package Management", "Package Repositories", "Distribution",
"Implementations", "Built-in Classes Enhancement",
"Functional Programming", "Configuration Files",
]),
("Security & Auth", [
"Authentication", "Cryptography", "Penetration Testing",
"Permissions",
]),
("Specialized", [
"CMS", "Admin Panels", "Email", "Game Development", "Geolocation",
"Hardware", "Internationalization", "Date and Time",
"URL Manipulation", "Robotics", "Microsoft Windows", "Miscellaneous",
"Algorithms and Design Patterns", "Static Site Generator",
]),
(
"Web & API",
[
"Web Frameworks",
"RESTful API",
"GraphQL",
"WebSocket",
"ASGI Servers",
"WSGI Servers",
"HTTP Clients",
"Template Engine",
"Web Asset Management",
"Web Content Extracting",
"Web Crawling",
],
),
(
"Data & ML",
[
"Data Analysis",
"Data Validation",
"Data Visualization",
"Machine Learning",
"Deep Learning",
"Computer Vision",
"Natural Language Processing",
"Recommender Systems",
"Science",
"Quantum Computing",
],
),
(
"DevOps & Infrastructure",
[
"DevOps Tools",
"Distributed Computing",
"Task Queues",
"Job Scheduler",
"Serverless Frameworks",
"Logging",
"Processes",
"Shell",
"Network Virtualization",
"RPC Servers",
],
),
(
"Database & Storage",
[
"Database",
"Database Drivers",
"ORM",
"Caching",
"Search",
"Serialization",
],
),
(
"Development Tools",
[
"Testing",
"Debugging Tools",
"Code Analysis",
"Build Tools",
"Refactoring",
"Documentation",
"Editor Plugins and IDEs",
"Interactive Interpreter",
],
),
(
"CLI & GUI",
[
"Command-line Interface Development",
"Command-line Tools",
"GUI Development",
],
),
(
"Content & Media",
[
"Audio",
"Video",
"Image Processing",
"HTML Manipulation",
"Text Processing",
"Specific Formats Processing",
"File Manipulation",
"Downloader",
],
),
(
"System & Runtime",
[
"Asynchronous Programming",
"Environment Management",
"Package Management",
"Package Repositories",
"Distribution",
"Implementations",
"Built-in Classes Enhancement",
"Functional Programming",
"Configuration Files",
],
),
(
"Security & Auth",
[
"Authentication",
"Cryptography",
"Penetration Testing",
"Permissions",
],
),
(
"Specialized",
[
"CMS",
"Admin Panels",
"Email",
"Game Development",
"Geolocation",
"Hardware",
"Internationalization",
"Date and Time",
"URL Manipulation",
"Robotics",
"Microsoft Windows",
"Miscellaneous",
"Algorithms and Design Patterns",
"Static Site Generator",
],
),
("Resources", []), # Filled dynamically from parsed resources
]
@@ -85,20 +167,24 @@ def group_categories(
group_cats = [cat_by_name[n] for n in cat_names if n in cat_by_name]
if group_cats:
groups.append({
"name": group_name,
"slug": slugify(group_name),
"categories": group_cats,
})
groups.append(
{
"name": group_name,
"slug": slugify(group_name),
"categories": group_cats,
}
)
# Any categories not in a group go into "Other"
ungrouped = [c for c in categories if c["name"] not in grouped_names]
if ungrouped:
groups.append({
"name": "Other",
"slug": "other",
"categories": ungrouped,
})
groups.append(
{
"name": "Other",
"slug": "other",
"categories": ungrouped,
}
)
return groups
@@ -121,9 +207,7 @@ class StarData(TypedDict):
fetched_at: str
GITHUB_REPO_URL_RE = re.compile(
r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$"
)
GITHUB_REPO_URL_RE = re.compile(r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$")
def extract_github_repo(url: str) -> str | None:
@@ -144,12 +228,14 @@ def load_stars(path: Path) -> dict[str, StarData]:
def sort_entries(entries: list[dict]) -> list[dict]:
    """Sort entries by stars descending, then name ascending. No-star entries go last."""

    def sort_key(entry: dict) -> tuple[int, int, str]:
        # Bucket 1 (no star data) sorts after bucket 0; negating the star
        # count makes an ascending sort yield descending stars.
        star_count = entry["stars"]
        lowered = entry["name"].lower()
        return (1, 0, lowered) if star_count is None else (0, -star_count, lowered)

    return sorted(entries, key=sort_key)
@@ -167,17 +253,19 @@ def extract_entries(
for cat in categories:
group_name = cat_to_group.get(cat["name"], "Other")
for entry in cat["entries"]:
entries.append({
"name": entry["name"],
"url": entry["url"],
"description": entry["description"],
"category": cat["name"],
"group": group_name,
"stars": None,
"owner": None,
"last_commit_at": None,
"also_see": entry["also_see"],
})
entries.append(
{
"name": entry["name"],
"url": entry["url"],
"description": entry["description"],
"category": cat["name"],
"group": group_name,
"stars": None,
"owner": None,
"last_commit_at": None,
"also_see": entry["also_see"],
}
)
return entries
@@ -241,6 +329,8 @@ def build(repo_root: str) -> None:
if static_src.exists():
shutil.copytree(static_src, static_dst, dirs_exist_ok=True)
shutil.copy(repo / "README.md", site_dir / "llms.txt")
print(f"Built single page with {len(categories)} categories + {len(resources)} resources")
print(f"Total entries: {total_entries}")
print(f"Output: {site_dir}")

File diff suppressed because it is too large Load Diff

View File

@@ -12,7 +12,7 @@ import httpx
from build import extract_github_repo, load_stars
CACHE_MAX_AGE_DAYS = 7
CACHE_MAX_AGE_HOURS = 12
DATA_DIR = Path(__file__).parent / "data"
CACHE_FILE = DATA_DIR / "github_stars.json"
README_PATH = Path(__file__).parent.parent / "README.md"
@@ -120,8 +120,8 @@ def main() -> None:
entry = cache.get(repo)
if entry and "fetched_at" in entry:
fetched = datetime.fromisoformat(entry["fetched_at"])
age_days = (now - fetched).days
if age_days < CACHE_MAX_AGE_DAYS:
age_hours = (now - fetched).total_seconds() / 3600
if age_hours < CACHE_MAX_AGE_HOURS:
continue
to_fetch.append(repo)

View File

@@ -137,3 +137,95 @@ class TestParseGraphqlResponse:
assert len(result) == 2
assert result["a/x"]["stars"] == 100
assert result["b/y"]["stars"] == 200
class TestMainSkipsFreshCache:
    """Verify that main() skips fetching when all cache entries are fresh."""

    def test_skips_fetch_when_cache_is_fresh(self, tmp_path, monkeypatch, capsys):
        """A cache entry 1h old (< 12h threshold) means no repos are fetched."""
        from datetime import datetime, timedelta, timezone

        from fetch_github_stars import main

        # Set up a minimal README with one repo
        readme = tmp_path / "README.md"
        readme.write_text("* [req](https://github.com/psf/requests) - HTTP.\n")
        monkeypatch.setattr("fetch_github_stars.README_PATH", readme)

        # Pre-populate cache with a fresh entry (1 hour ago).
        # 1h < CACHE_MAX_AGE_HOURS (12 in this commit), so the entry is fresh
        # and main() should skip the network entirely.
        data_dir = tmp_path / "data"
        data_dir.mkdir()
        cache_file = data_dir / "github_stars.json"
        now = datetime.now(timezone.utc)
        fresh_cache = {
            "psf/requests": {
                "stars": 52000,
                "owner": "psf",
                "last_commit_at": "2025-01-01T00:00:00+00:00",
                "fetched_at": (now - timedelta(hours=1)).isoformat(),
            }
        }
        cache_file.write_text(json.dumps(fresh_cache), encoding="utf-8")
        monkeypatch.setattr("fetch_github_stars.CACHE_FILE", cache_file)
        monkeypatch.setattr("fetch_github_stars.DATA_DIR", data_dir)
        # NOTE(review): presumably main() requires GITHUB_TOKEN to be set even
        # when nothing is fetched — confirm against fetch_github_stars.main().
        monkeypatch.setenv("GITHUB_TOKEN", "fake-token")

        main()

        # Assert on stdout messages rather than internals: no fetch happened.
        output = capsys.readouterr().out
        assert "0 repos to fetch" in output
        assert "Cache is up to date" in output

    def test_fetches_when_cache_is_stale(self, tmp_path, monkeypatch, capsys):
        """A cache entry 24h old (> 12h threshold) triggers exactly one fetch."""
        from datetime import datetime, timedelta, timezone
        from unittest.mock import MagicMock

        from fetch_github_stars import main

        # Set up a minimal README with one repo
        readme = tmp_path / "README.md"
        readme.write_text("* [req](https://github.com/psf/requests) - HTTP.\n")
        monkeypatch.setattr("fetch_github_stars.README_PATH", readme)

        # Pre-populate cache with a stale entry (24 hours ago) — older than
        # the 12-hour freshness window, so the repo must be re-fetched.
        data_dir = tmp_path / "data"
        data_dir.mkdir()
        cache_file = data_dir / "github_stars.json"
        now = datetime.now(timezone.utc)
        stale_cache = {
            "psf/requests": {
                "stars": 52000,
                "owner": "psf",
                "last_commit_at": "2025-01-01T00:00:00+00:00",
                "fetched_at": (now - timedelta(hours=24)).isoformat(),
            }
        }
        cache_file.write_text(json.dumps(stale_cache), encoding="utf-8")
        monkeypatch.setattr("fetch_github_stars.CACHE_FILE", cache_file)
        monkeypatch.setattr("fetch_github_stars.DATA_DIR", data_dir)
        monkeypatch.setenv("GITHUB_TOKEN", "fake-token")

        # Mock httpx.Client to avoid real API calls.
        # NOTE(review): the response shape mirrors the GitHub GraphQL API
        # (stargazerCount / owner / defaultBranchRef) — keep in sync with the
        # query built in fetch_github_stars.
        mock_response = MagicMock()
        mock_response.json.return_value = {
            "data": {
                "repo_0": {
                    "stargazerCount": 53000,
                    "owner": {"login": "psf"},
                    "defaultBranchRef": {"target": {"committedDate": "2025-06-01T00:00:00Z"}},
                }
            }
        }
        mock_response.raise_for_status = MagicMock()
        # __enter__/__exit__ are wired explicitly so the mock works as a
        # context manager (``with httpx.Client(...) as client:``).
        mock_client = MagicMock()
        mock_client.__enter__ = MagicMock(return_value=mock_client)
        mock_client.__exit__ = MagicMock(return_value=False)
        mock_client.post.return_value = mock_response
        # lambda swallows constructor kwargs (headers, timeout, ...) and always
        # returns the same mock instance.
        monkeypatch.setattr("fetch_github_stars.httpx.Client", lambda **kwargs: mock_client)

        main()

        output = capsys.readouterr().out
        assert "1 repos to fetch" in output
        assert "Done. Fetched 1 repos" in output
        # Exactly one GraphQL POST: a single batch covering the single repo.
        mock_client.post.assert_called_once()