diff --git a/sort.py b/sort.py deleted file mode 100755 index 431bd02b..00000000 --- a/sort.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -""" - The approach taken is explained below. I decided to do it simply. - Initially I was considering parsing the data into some sort of - structure and then generating an appropriate README. I am still - considering doing it - but for now this should work. The only issue - I see is that it only sorts the entries at the lowest level, and that - the order of the top-level contents do not match the order of the actual - entries. - - This could be extended by having nested blocks, sorting them recursively - and flattening the end structure into a list of lines. Revision 2 maybe ^.^. -""" - -def sort_blocks(): - # First, we load the current README into memory - with open('README.md', 'r') as read_me_file: - read_me = read_me_file.read() - - # Separating the 'table of contents' from the contents (blocks) - table_of_contents = ''.join(read_me.split('- - -')[0]) - blocks = ''.join(read_me.split('- - -')[1]).split('\n# ') - for i in range(len(blocks)): - if i == 0: - blocks[i] = blocks[i] + '\n' - else: - blocks[i] = '# ' + blocks[i] + '\n' - - # Sorting the libraries - inner_blocks = sorted(blocks[0].split('##')) - for i in range(1, len(inner_blocks)): - if inner_blocks[i][0] != '#': - inner_blocks[i] = '##' + inner_blocks[i] - inner_blocks = ''.join(inner_blocks) - - # Replacing the non-sorted libraries by the sorted ones and gathering all at the final_README file - blocks[0] = inner_blocks - final_README = table_of_contents + '- - -' + ''.join(blocks) - - with open('README.md', 'w+') as sorted_file: - sorted_file.write(final_README) - -def main(): - # First, we load the current README into memory as an array of lines - with open('README.md', 'r') as read_me_file: - read_me = read_me_file.readlines() - - # Then we cluster the lines together as blocks - # Each block represents a collection of lines that should be sorted - # This was done by assuming only links ([...](...)) are meant to be sorted - # Clustering is done by indentation - blocks = [] - last_indent = None - for line in read_me: - s_line = line.lstrip() - indent = len(line) - len(s_line) - - if any([s_line.startswith(s) for s in ['* [', '- [']]): - if indent == last_indent: - blocks[-1].append(line) - else: - blocks.append([line]) - last_indent = indent - else: - blocks.append([line]) - last_indent = None - - with open('README.md', 'w+') as sorted_file: - # Then all of the blocks are sorted individually - blocks = [ - ''.join(sorted(block, key=str.lower)) for block in blocks - ] - # And the result is written back to README.md - sorted_file.write(''.join(blocks)) - - # Then we call the sorting method - sort_blocks() - - -if __name__ == "__main__": - main() diff --git a/website/build.py b/website/build.py index a2bf34a1..1645e83b 100644 --- a/website/build.py +++ b/website/build.py @@ -10,179 +10,20 @@ from typing import TypedDict from jinja2 import Environment, FileSystemLoader from readme_parser import parse_readme, slugify -# Thematic grouping of categories. Each category name must match exactly -# as it appears in README.md (the ## heading text). -SECTION_GROUPS: list[tuple[str, list[str]]] = [ - ( - "Web & API", - [ - "Admin Panels", - "CMS", - "Email", - "Static Site Generator", - "URL Manipulation", - "Web Frameworks", - "RESTful API", - "GraphQL", - "WebSocket", - "ASGI Servers", - "WSGI Servers", - "HTTP Clients", - "Template Engine", - "Web Asset Management", - "Web Content Extracting", - "Web Crawling", - ], - ), - ( - "AI & ML", - [ - "AI and Agents", - "Machine Learning", - "Deep Learning", - "Computer Vision", - "Natural Language Processing", - "Recommender Systems", - "Robotics", - ], - ), - ( - "Data & Science", - [ - "Data Analysis", - "Data Validation", - "Data Visualization", - "Geolocation", - "Science", - "Quantum Computing", - ], - ), - ( - "DevOps & Infrastructure", - [ - "DevOps Tools", - "Distributed Computing", - "Task Queues", - "Job Scheduler", - "Serverless Frameworks", - "Logging", - "Processes", - "Shell", - "Network Virtualization", - "RPC Servers", - ], - ), - ( - "Database & Storage", - [ - "Database", - "Database Drivers", - "ORM", - "Caching", - "Search", - "Serialization", - ], - ), - ( - "Development Tools", - [ - "Testing", - "Debugging Tools", - "Code Analysis", - "Build Tools", - "Algorithms and Design Patterns", - "Refactoring", - "Documentation", - "Editor Plugins and IDEs", - "Interactive Interpreter", - ], - ), - ( - "CLI & GUI", - [ - "Command-line Interface Development", - "Command-line Tools", - "GUI Development", - ], - ), - ( - "Content & Media", - [ - "Audio", - "Video", - "Game Development", - "Image Processing", - "Internationalization", - "HTML Manipulation", - "Text Processing", - "Specific Formats Processing", - "File Manipulation", - "Downloader", - ], - ), - ( - "System & Runtime", - [ - "Asynchronous Programming", - "Environment Management", - "Package Management", - "Package Repositories", - "Date and Time", - "Distribution", - "Hardware", - "Implementations", - "Microsoft Windows", - "Built-in Classes Enhancement", - "Functional Programming", - "Configuration Files", - ], - ), - ( - "Security & Auth", - [ - "Authentication", - "Cryptography", - "Penetration Testing", - "Permissions", - ], - ), - ("Resources", []), # Filled dynamically from parsed resources -] - def group_categories( - categories: list[dict], + parsed_groups: list[dict], resources: list[dict], ) -> list[dict]: - """Organize categories and resources into thematic section groups.""" - cat_by_name = {c["name"]: c for c in categories} - groups = [] - grouped_names: set[str] = set() + """Combine parsed groups with resources for template rendering.""" + groups = list(parsed_groups) - for group_name, cat_names in SECTION_GROUPS: - grouped_names.update(cat_names) - if group_name == "Resources": - group_cats = list(resources) - else: - group_cats = [cat_by_name[n] for n in cat_names if n in cat_by_name] - - if group_cats: - groups.append( - { - "name": group_name, - "slug": slugify(group_name), - "categories": group_cats, - } - ) - - # Any categories not in a group go into "Other" - ungrouped = [c for c in categories if c["name"] not in grouped_names] - if ungrouped: + if resources: groups.append( { - "name": "Other", - "slug": "other", - "categories": ungrouped, + "name": "Resources", + "slug": slugify("Resources"), + "categories": list(resources), } ) @@ -295,11 +136,11 @@ def build(repo_root: str) -> None: subtitle = stripped break - categories, resources = parse_readme(readme_text) - # All fields pre-computed: entry_count, content_html, preview, description + parsed_groups, resources = parse_readme(readme_text) + categories = [cat for g in parsed_groups for cat in g["categories"]] total_entries = sum(c["entry_count"] for c in categories) - groups = group_categories(categories, resources) + groups = group_categories(parsed_groups, resources) entries = extract_entries(categories, groups) stars_data = load_stars(website / "data" / "github_stars.json") @@ -344,7 +185,7 @@ def build(repo_root: str) -> None: shutil.copy(repo / "README.md", site_dir / "llms.txt") - print(f"Built single page with {len(categories)} categories + {len(resources)} resources") + print(f"Built single page with {len(parsed_groups)} groups, {len(categories)} categories + {len(resources)} resources") print(f"Total entries: {total_entries}") print(f"Output: {site_dir}") diff --git a/website/readme_parser.py b/website/readme_parser.py index 97fd37af..c0ecfc60 100644 --- a/website/readme_parser.py +++ b/website/readme_parser.py @@ -32,6 +32,12 @@ class ParsedSection(TypedDict): content_html: str # rendered HTML, properly escaped +class ParsedGroup(TypedDict): + name: str + slug: str + categories: list[ParsedSection] + + # --- Slugify ---------------------------------------------------------------- _SLUG_NON_ALNUM_RE = re.compile(r"[^a-z0-9\s-]") @@ -305,6 +311,25 @@ def _render_section_html(content_nodes: list[SyntaxTreeNode]) -> str: # --- Section splitting ------------------------------------------------------- +def _build_section(name: str, body: list[SyntaxTreeNode]) -> ParsedSection: + """Build a ParsedSection from a heading name and its body nodes.""" + desc = _extract_description(body) + content_nodes = body[1:] if desc else body + entries = _parse_section_entries(content_nodes) + entry_count = len(entries) + sum(len(e["also_see"]) for e in entries) + preview = ", ".join(e["name"] for e in entries[:4]) + content_html = _render_section_html(content_nodes) + return ParsedSection( + name=name, + slug=slugify(name), + description=desc, + entries=entries, + entry_count=entry_count, + preview=preview, + content_html=content_html, + ) + + def _group_by_h2( nodes: list[SyntaxTreeNode], ) -> list[ParsedSection]: @@ -317,22 +342,7 @@ def _group_by_h2( nonlocal current_name if current_name is None: return - desc = _extract_description(current_body) - content_nodes = current_body[1:] if desc else current_body - entries = _parse_section_entries(content_nodes) - entry_count = len(entries) + sum(len(e["also_see"]) for e in entries) - preview = ", ".join(e["name"] for e in entries[:4]) - content_html = _render_section_html(content_nodes) - - sections.append(ParsedSection( - name=current_name, - slug=slugify(current_name), - description=desc, - entries=entries, - entry_count=entry_count, - preview=preview, - content_html=content_html, - )) + sections.append(_build_section(current_name, current_body)) current_name = None for node in nodes: @@ -347,10 +357,86 @@ def _group_by_h2( return sections -def parse_readme(text: str) -> tuple[list[ParsedSection], list[ParsedSection]]: - """Parse README.md text into categories and resources. +def _is_bold_marker(node: SyntaxTreeNode) -> str | None: + """Detect a bold-only paragraph used as a group marker. - Returns (categories, resources) where each is a list of ParsedSection dicts. + Pattern: a paragraph whose only content is **Group Name** (possibly + surrounded by empty text nodes in the AST). + Returns the group name text, or None if not a group marker. + """ + if node.type != "paragraph": + return None + for child in node.children: + if child.type != "inline": + continue + # Filter out empty text nodes that markdown-it inserts around strong + meaningful = [c for c in child.children if not (c.type == "text" and c.content == "")] + if len(meaningful) == 1 and meaningful[0].type == "strong": + return render_inline_text(meaningful[0].children) + return None + + +def _parse_grouped_sections( + nodes: list[SyntaxTreeNode], +) -> list[ParsedGroup]: + """Parse nodes into groups of categories using bold markers as group boundaries. + + Bold-only paragraphs (**Group Name**) delimit groups. H2 headings under each + bold marker become categories within that group. Categories appearing before + any bold marker go into an "Other" group. + """ + groups: list[ParsedGroup] = [] + current_group_name: str | None = None + current_group_cats: list[ParsedSection] = [] + current_cat_name: str | None = None + current_cat_body: list[SyntaxTreeNode] = [] + + def flush_cat() -> None: + nonlocal current_cat_name + if current_cat_name is None: + return + current_group_cats.append(_build_section(current_cat_name, current_cat_body)) + current_cat_name = None + + def flush_group() -> None: + nonlocal current_group_name, current_group_cats + if not current_group_cats: + current_group_name = None + current_group_cats = [] + return + name = current_group_name or "Other" + groups.append(ParsedGroup( + name=name, + slug=slugify(name), + categories=list(current_group_cats), + )) + current_group_name = None + current_group_cats = [] + + for node in nodes: + bold_name = _is_bold_marker(node) + if bold_name is not None: + flush_cat() + flush_group() + current_group_name = bold_name + current_cat_body = [] + elif node.type == "heading" and node.tag == "h2": + flush_cat() + current_cat_name = _heading_text(node) + current_cat_body = [] + elif current_cat_name is not None: + current_cat_body.append(node) + + flush_cat() + flush_group() + return groups + + +def parse_readme(text: str) -> tuple[list[ParsedGroup], list[ParsedSection]]: + """Parse README.md text into grouped categories and resources. + + Returns (groups, resources) where groups is a list of ParsedGroup dicts + containing nested categories, and resources is a flat list of ParsedSection. """ md = MarkdownIt("commonmark") tokens = md.parse(text) @@ -382,7 +468,7 @@ def parse_readme(text: str) -> tuple[list[ParsedSection], list[ParsedSection]]: res_end = contributing_idx or len(children) res_nodes = children[resources_idx + 1 : res_end] - categories = _group_by_h2(cat_nodes) + groups = _parse_grouped_sections(cat_nodes) resources = _group_by_h2(res_nodes) - return categories, resources + return groups, resources diff --git a/website/tests/test_build.py b/website/tests/test_build.py index a75c2294..6302c3d3 100644 --- a/website/tests/test_build.py +++ b/website/tests/test_build.py @@ -48,28 +48,33 @@ class TestSlugify: class TestGroupCategories: - def test_groups_known_categories(self): - cats = [ - {"name": "Web Frameworks", "slug": "web-frameworks"}, - {"name": "Testing", "slug": "testing"}, + def test_appends_resources(self): + parsed_groups = [ + {"name": "G1", "slug": "g1", "categories": [{"name": "Cat1"}]}, ] - groups = group_categories(cats, []) - group_names = [g["name"] for g in groups] - assert "Web & API" in group_names - assert "Development Tools" in group_names - - def test_ungrouped_go_to_other(self): - cats = [{"name": "Unknown Category", "slug": "unknown-category"}] - groups = group_categories(cats, []) - group_names = [g["name"] for g in groups] - assert "Other" in group_names - - def test_resources_grouped(self): resources = [{"name": "Newsletters", "slug": "newsletters"}] - groups = group_categories([], resources) + groups = group_categories(parsed_groups, resources) group_names = [g["name"] for g in groups] + assert "G1" in group_names assert "Resources" in group_names + def test_no_resources_no_extra_group(self): + parsed_groups = [ + {"name": "G1", "slug": "g1", "categories": [{"name": "Cat1"}]}, + ] + groups = group_categories(parsed_groups, []) + assert len(groups) == 1 + assert groups[0]["name"] == "G1" + + def test_preserves_group_order(self): + parsed_groups = [ + {"name": "Second", "slug": "second", "categories": [{"name": "C2"}]}, + {"name": "First", "slug": "first", "categories": [{"name": "C1"}]}, + ] + groups = group_categories(parsed_groups, []) + assert groups[0]["name"] == "Second" + assert groups[1]["name"] == "First" + # --------------------------------------------------------------------------- # build (integration) @@ -114,6 +119,8 @@ class TestBuild: --- + **Tools** + ## Widgets _Widget libraries._ @@ -176,10 +183,14 @@ class TestBuild: --- + **Group A** + ## Alpha - [a](https://x.com) - A. + **Group B** + ## Beta - [b](https://x.com) - B. @@ -194,6 +205,8 @@ class TestBuild: index_html = (tmp_path / "website" / "output" / "index.html").read_text() assert "Alpha" in index_html assert "Beta" in index_html + assert "Group A" in index_html + assert "Group B" in index_html def test_index_contains_preview_text(self, tmp_path): readme = textwrap.dedent("""\ diff --git a/website/tests/test_readme_parser.py b/website/tests/test_readme_parser.py index 1e51036a..0b0236c7 100644 --- a/website/tests/test_readme_parser.py +++ b/website/tests/test_readme_parser.py @@ -115,27 +115,74 @@ MINIMAL_README = textwrap.dedent("""\ """) +GROUPED_README = textwrap.dedent("""\ + # Awesome Python + + Some intro text. + + --- + + **Group One** + + ## Alpha + + _Libraries for alpha stuff._ + + - [lib-a](https://example.com/a) - Does A. + - [lib-b](https://example.com/b) - Does B. + + **Group Two** + + ## Beta + + _Tools for beta._ + + - [lib-c](https://example.com/c) - Does C. + + ## Gamma + + - [lib-d](https://example.com/d) - Does D. + + # Resources + + Where to discover resources. + + ## Newsletters + + - [News One](https://example.com/n1) + + # Contributing + + Please contribute! +""") + + class TestParseReadmeSections: - def test_category_count(self): - cats, resources = parse_readme(MINIMAL_README) - assert len(cats) == 2 + def test_ungrouped_categories_go_to_other(self): + groups, resources = parse_readme(MINIMAL_README) + assert len(groups) == 1 + assert groups[0]["name"] == "Other" + assert len(groups[0]["categories"]) == 2 - def test_resource_count(self): - cats, resources = parse_readme(MINIMAL_README) - assert len(resources) == 2 - - def test_category_names(self): - cats, _ = parse_readme(MINIMAL_README) + def test_ungrouped_category_names(self): + groups, _ = parse_readme(MINIMAL_README) + cats = groups[0]["categories"] assert cats[0]["name"] == "Alpha" assert cats[1]["name"] == "Beta" + def test_resource_count(self): + _, resources = parse_readme(MINIMAL_README) + assert len(resources) == 2 + def test_category_slugs(self): - cats, _ = parse_readme(MINIMAL_README) + groups, _ = parse_readme(MINIMAL_README) + cats = groups[0]["categories"] assert cats[0]["slug"] == "alpha" assert cats[1]["slug"] == "beta" def test_category_description(self): - cats, _ = parse_readme(MINIMAL_README) + groups, _ = parse_readme(MINIMAL_README) + cats = groups[0]["categories"] assert cats[0]["description"] == "Libraries for alpha stuff." assert cats[1]["description"] == "Tools for beta." @@ -145,13 +192,16 @@ class TestParseReadmeSections: assert resources[1]["name"] == "Podcasts" def test_contributing_skipped(self): - cats, resources = parse_readme(MINIMAL_README) - all_names = [c["name"] for c in cats] + [r["name"] for r in resources] + groups, resources = parse_readme(MINIMAL_README) + all_names = [] + for g in groups: + all_names.extend(c["name"] for c in g["categories"]) + all_names.extend(r["name"] for r in resources) assert "Contributing" not in all_names def test_no_separator(self): - cats, resources = parse_readme("# Just a heading\n\nSome text.\n") - assert cats == [] + groups, resources = parse_readme("# Just a heading\n\nSome text.\n") + assert groups == [] assert resources == [] def test_no_description(self): @@ -174,7 +224,8 @@ class TestParseReadmeSections: Done. """) - cats, resources = parse_readme(readme) + groups, resources = parse_readme(readme) + cats = groups[0]["categories"] assert cats[0]["description"] == "" assert cats[0]["entries"][0]["name"] == "item" @@ -194,10 +245,114 @@ class TestParseReadmeSections: Done. """) - cats, _ = parse_readme(readme) + groups, _ = parse_readme(readme) + cats = groups[0]["categories"] assert cats[0]["description"] == "Algorithms. Also see awesome-algos." +class TestParseGroupedReadme: + def test_group_count(self): + groups, _ = parse_readme(GROUPED_README) + assert len(groups) == 2 + + def test_group_names(self): + groups, _ = parse_readme(GROUPED_README) + assert groups[0]["name"] == "Group One" + assert groups[1]["name"] == "Group Two" + + def test_group_slugs(self): + groups, _ = parse_readme(GROUPED_README) + assert groups[0]["slug"] == "group-one" + assert groups[1]["slug"] == "group-two" + + def test_group_one_has_one_category(self): + groups, _ = parse_readme(GROUPED_README) + assert len(groups[0]["categories"]) == 1 + assert groups[0]["categories"][0]["name"] == "Alpha" + + def test_group_two_has_two_categories(self): + groups, _ = parse_readme(GROUPED_README) + assert len(groups[1]["categories"]) == 2 + assert groups[1]["categories"][0]["name"] == "Beta" + assert groups[1]["categories"][1]["name"] == "Gamma" + + def test_resources_still_parsed(self): + _, resources = parse_readme(GROUPED_README) + assert len(resources) == 1 + assert resources[0]["name"] == "Newsletters" + + def test_empty_group_skipped(self): + readme = textwrap.dedent("""\ + # T + + --- + + **Empty** + + **HasCats** + + ## Cat + + - [x](https://x.com) - X. + + # Contributing + + Done. + """) + groups, _ = parse_readme(readme) + assert len(groups) == 1 + assert groups[0]["name"] == "HasCats" + + def test_bold_with_extra_text_not_group_marker(self): + readme = textwrap.dedent("""\ + # T + + --- + + **Note:** This is not a group marker. + + ## Cat + + - [x](https://x.com) - X. + + # Contributing + + Done. + """) + groups, _ = parse_readme(readme) + # "Note:" has text after the strong node, so it's not a group marker + # Category goes into "Other" + assert len(groups) == 1 + assert groups[0]["name"] == "Other" + + def test_categories_before_any_group_marker(self): + readme = textwrap.dedent("""\ + # T + + --- + + ## Orphan + + - [x](https://x.com) - X. + + **A Group** + + ## Grouped + + - [y](https://x.com) - Y. + + # Contributing + + Done. + """) + groups, _ = parse_readme(readme) + assert len(groups) == 2 + assert groups[0]["name"] == "Other" + assert groups[0]["categories"][0]["name"] == "Orphan" + assert groups[1]["name"] == "A Group" + assert groups[1]["categories"][0]["name"] == "Grouped" + + def _content_nodes(md_text: str) -> list[SyntaxTreeNode]: """Helper: parse markdown and return all block nodes.""" md = MarkdownIt("commonmark") @@ -283,7 +438,8 @@ class TestParseSectionEntries: Done. """) - cats, _ = parse_readme(readme) + groups, _ = parse_readme(readme) + cats = groups[0]["categories"] # 2 main entries + 1 also_see = 3 assert cats[0]["entry_count"] == 3 @@ -305,7 +461,8 @@ class TestParseSectionEntries: Done. """) - cats, _ = parse_readme(readme) + groups, _ = parse_readme(readme) + cats = groups[0]["categories"] assert cats[0]["preview"] == "alpha, beta, gamma, delta" def test_description_html_escapes_xss(self): @@ -366,10 +523,17 @@ class TestParseRealReadme: readme_path = os.path.join(os.path.dirname(__file__), "..", "..", "README.md") with open(readme_path, encoding="utf-8") as f: self.readme_text = f.read() - self.cats, self.resources = parse_readme(self.readme_text) + self.groups, self.resources = parse_readme(self.readme_text) + self.cats = [c for g in self.groups for c in g["categories"]] - def test_at_least_83_categories(self): - assert len(self.cats) >= 83 + def test_at_least_11_groups(self): + assert len(self.groups) >= 11 + + def test_first_group_is_ai_ml(self): + assert self.groups[0]["name"] == "AI & ML" + + def test_at_least_76_categories(self): + assert len(self.cats) >= 76 def test_resources_has_newsletters_and_podcasts(self): names = [r["name"] for r in self.resources] @@ -380,21 +544,17 @@ class TestParseRealReadme: all_names = [c["name"] for c in self.cats] + [r["name"] for r in self.resources] assert "Contributing" not in all_names - def test_first_category_is_admin_panels(self): - assert self.cats[0]["name"] == "Admin Panels" - assert self.cats[0]["slug"] == "admin-panels" + def test_first_category_is_ai_and_agents(self): + assert self.cats[0]["name"] == "AI and Agents" + assert self.cats[0]["slug"] == "ai-and-agents" - def test_last_category_is_wsgi_servers(self): - assert self.cats[-1]["name"] == "WSGI Servers" - assert self.cats[-1]["slug"] == "wsgi-servers" - - def test_restful_api_slug(self): + def test_web_apis_slug(self): slugs = [c["slug"] for c in self.cats] - assert "restful-api" in slugs + assert "web-apis" in slugs def test_descriptions_extracted(self): - admin = self.cats[0] - assert admin["description"] == "Libraries for administrative interfaces." + ai = next(c for c in self.cats if c["name"] == "AI and Agents") + assert "AI applications" in ai["description"] def test_entry_counts_nonzero(self): for cat in self.cats: @@ -422,3 +582,8 @@ class TestParseRealReadme: algos = next(c for c in self.cats if c["name"] == "Algorithms and Design Patterns") assert "awesome-algorithms" in algos["description"] assert "https://" not in algos["description"] + + def test_miscellaneous_in_own_group(self): + misc_group = next((g for g in self.groups if g["name"] == "Miscellaneous"), None) + assert misc_group is not None + assert any(c["name"] == "Miscellaneous" for c in misc_group["categories"])