mirror of
https://github.com/vinta/awesome-python.git
synced 2026-03-23 05:03:45 +08:00
Switch readme_parser.py from regex-based parsing to markdown-it-py for more robust and maintainable Markdown AST traversal. Update build pipeline, templates, styles, and JS to support the new parser output. Refresh GitHub stars data and update tests to match new parser behavior. Co-Authored-By: Claude <noreply@anthropic.com>
425 lines
14 KiB
Python
425 lines
14 KiB
Python
"""Tests for the readme_parser module."""
|
|
|
|
import os
|
|
import textwrap
|
|
|
|
import pytest
|
|
|
|
from readme_parser import (
|
|
_parse_section_entries,
|
|
_render_section_html,
|
|
parse_readme,
|
|
render_inline_html,
|
|
render_inline_text,
|
|
)
|
|
|
|
from markdown_it import MarkdownIt
|
|
from markdown_it.tree import SyntaxTreeNode
|
|
|
|
|
|
def _parse_inline(md_text: str) -> list[SyntaxTreeNode]:
    """Helper: parse a single paragraph and return its inline children.

    Args:
        md_text: Markdown source whose first block is expected to be a
            paragraph.

    Returns:
        The child nodes of the first block's inline node, in document order.
    """
    md = MarkdownIt("commonmark")
    root = SyntaxTreeNode(md.parse(md_text))
    # root > paragraph > inline > children
    return root.children[0].children[0].children
|
|
|
|
|
|
class TestRenderInlineHtml:
    """Tests for ``render_inline_html`` applied to parsed inline nodes."""

    def test_plain_text_escapes_html(self):
        """HTML-significant characters must be emitted as entities."""
        children = _parse_inline("Hello <world> & friends")
        assert render_inline_html(children) == "Hello &lt;world&gt; &amp; friends"

    def test_link_with_target(self):
        """A link keeps its href and text and gains target/rel attributes."""
        children = _parse_inline("[name](https://example.com)")
        html = render_inline_html(children)
        assert 'href="https://example.com"' in html
        assert 'target="_blank"' in html
        assert 'rel="noopener"' in html
        assert ">name</a>" in html

    def test_emphasis(self):
        """``*text*`` is rendered as an ``<em>`` element."""
        children = _parse_inline("*italic* text")
        assert "<em>italic</em>" in render_inline_html(children)

    def test_strong(self):
        """``**text**`` is rendered as a ``<strong>`` element."""
        children = _parse_inline("**bold** text")
        assert "<strong>bold</strong>" in render_inline_html(children)

    def test_code_inline(self):
        """A backtick span is rendered as a ``<code>`` element."""
        children = _parse_inline("`some code`")
        assert "<code>some code</code>" in render_inline_html(children)

    def test_mixed_link_and_text(self):
        """Text on both sides of a link survives next to the anchor."""
        children = _parse_inline("See [foo](https://x.com) for details.")
        html = render_inline_html(children)
        assert "See " in html
        assert ">foo</a>" in html
        assert " for details." in html
|
|
|
|
|
|
class TestRenderInlineText:
    """Tests for ``render_inline_text``, the plain-text inline renderer."""

    def test_plain_text(self):
        """Plain text passes through unchanged."""
        children = _parse_inline("Hello world")
        assert render_inline_text(children) == "Hello world"

    def test_link_becomes_text(self):
        """A link contributes only its label; the URL is dropped."""
        children = _parse_inline("See [awesome-algos](https://github.com/x/y).")
        assert render_inline_text(children) == "See awesome-algos."

    def test_emphasis_stripped(self):
        """Emphasis markers are removed while the emphasized text is kept."""
        children = _parse_inline("*italic* text")
        assert render_inline_text(children) == "italic text"

    def test_code_inline_kept(self):
        """The content of an inline code span is kept, without backticks."""
        children = _parse_inline("`code` here")
        assert render_inline_text(children) == "code here"
|
|
|
|
|
|
# Small README fixture: a title and intro, a thematic break, two "##"
# sections with list entries, a "# Resources" heading with two "##" sections,
# and a trailing "# Contributing" heading.
MINIMAL_README = textwrap.dedent("""\
    # Awesome Python

    Some intro text.

    ---

    ## Alpha

    _Libraries for alpha stuff._

    - [lib-a](https://example.com/a) - Does A.
    - [lib-b](https://example.com/b) - Does B.

    ## Beta

    _Tools for beta._

    - [lib-c](https://example.com/c) - Does C.

    # Resources

    Where to discover resources.

    ## Newsletters

    - [News One](https://example.com/n1)
    - [News Two](https://example.com/n2)

    ## Podcasts

    - [Pod One](https://example.com/p1)

    # Contributing

    Please contribute!
""")
|
|
|
|
|
|
class TestParseReadmeSections:
    """Tests for how ``parse_readme`` splits a README into categories and resources."""

    def test_category_count(self):
        """The two sections after the separator are reported as categories."""
        cats, _ = parse_readme(MINIMAL_README)
        assert len(cats) == 2

    def test_resource_count(self):
        """The two sections under "# Resources" are reported as resources."""
        _, resources = parse_readme(MINIMAL_README)
        assert len(resources) == 2

    def test_category_names(self):
        """Category names come from the section headings, in document order."""
        cats, _ = parse_readme(MINIMAL_README)
        assert cats[0]["name"] == "Alpha"
        assert cats[1]["name"] == "Beta"

    def test_category_slugs(self):
        """Each category carries a lowercase slug."""
        cats, _ = parse_readme(MINIMAL_README)
        assert cats[0]["slug"] == "alpha"
        assert cats[1]["slug"] == "beta"

    def test_category_description(self):
        """The italic line under a heading becomes the plain-text description."""
        cats, _ = parse_readme(MINIMAL_README)
        assert cats[0]["description"] == "Libraries for alpha stuff."
        assert cats[1]["description"] == "Tools for beta."

    def test_resource_names(self):
        """Resource names come from the headings under "# Resources"."""
        _, resources = parse_readme(MINIMAL_README)
        assert resources[0]["name"] == "Newsletters"
        assert resources[1]["name"] == "Podcasts"

    def test_contributing_skipped(self):
        """The "Contributing" heading appears in neither result list."""
        cats, resources = parse_readme(MINIMAL_README)
        all_names = [c["name"] for c in cats] + [r["name"] for r in resources]
        assert "Contributing" not in all_names

    def test_no_separator(self):
        """A README without a ``---`` separator yields two empty lists."""
        cats, resources = parse_readme("# Just a heading\n\nSome text.\n")
        assert cats == []
        assert resources == []

    def test_no_description(self):
        """A section without an italic description line gets an empty description."""
        readme = textwrap.dedent("""\
            # Title

            ---

            ## NullDesc

            - [item](https://x.com) - Thing.

            # Resources

            ## Tips

            - [tip](https://x.com)

            # Contributing

            Done.
        """)
        cats, _ = parse_readme(readme)
        assert cats[0]["description"] == ""
        assert cats[0]["entries"][0]["name"] == "item"

    def test_description_with_link_stripped(self):
        """A link inside the description is reduced to its label text."""
        readme = textwrap.dedent("""\
            # T

            ---

            ## Algos

            _Algorithms. Also see [awesome-algos](https://example.com)._

            - [lib](https://x.com) - Lib.

            # Contributing

            Done.
        """)
        cats, _ = parse_readme(readme)
        assert cats[0]["description"] == "Algorithms. Also see awesome-algos."
|
|
|
|
|
|
def _content_nodes(md_text: str) -> list[SyntaxTreeNode]:
    """Helper: parse markdown and return all block nodes.

    Args:
        md_text: Markdown source to parse with the CommonMark preset.

    Returns:
        The direct children of the syntax-tree root, in document order.
    """
    md = MarkdownIt("commonmark")
    root = SyntaxTreeNode(md.parse(md_text))
    return root.children
|
|
|
|
|
|
class TestParseSectionEntries:
    """Tests for entry extraction via ``_parse_section_entries`` and ``parse_readme``.

    Nested list items in the fixtures are indented by two spaces: CommonMark
    only treats an item as a child when it is indented at least to the
    parent's content column, so a one-space indent would produce siblings.
    """

    def test_flat_entries(self):
        """Each top-level linked bullet becomes one entry with name, url and description."""
        nodes = _content_nodes(
            "- [django](https://example.com/d) - A web framework.\n"
            "- [flask](https://example.com/f) - A micro framework.\n"
        )
        entries = _parse_section_entries(nodes)
        assert len(entries) == 2
        assert entries[0]["name"] == "django"
        assert entries[0]["url"] == "https://example.com/d"
        assert "web framework" in entries[0]["description"]
        assert entries[0]["also_see"] == []
        assert entries[1]["name"] == "flask"

    def test_link_only_entry(self):
        """A bullet that is only a link yields an entry with an empty description."""
        nodes = _content_nodes("- [tool](https://x.com)\n")
        entries = _parse_section_entries(nodes)
        assert len(entries) == 1
        assert entries[0]["name"] == "tool"
        assert entries[0]["description"] == ""

    def test_subcategorized_entries(self):
        """Plain-text parent bullets are labels; only their linked children are entries."""
        nodes = _content_nodes(
            "- Algorithms\n"
            "  - [algos](https://x.com/a) - Algo lib.\n"
            "  - [sorts](https://x.com/s) - Sort lib.\n"
            "- Design Patterns\n"
            "  - [patterns](https://x.com/p) - Pattern lib.\n"
        )
        entries = _parse_section_entries(nodes)
        assert len(entries) == 3
        assert entries[0]["name"] == "algos"
        assert entries[2]["name"] == "patterns"

    def test_text_before_link_is_subcategory(self):
        """A bullet that starts with text before its link is a label, not an entry."""
        nodes = _content_nodes(
            "- MySQL - [awesome-mysql](http://example.com/awesome-mysql/)\n"
            "  - [mysqlclient](https://example.com/mysqlclient) - MySQL connector.\n"
            "  - [pymysql](https://example.com/pymysql) - Pure Python MySQL driver.\n"
        )
        entries = _parse_section_entries(nodes)
        # awesome-mysql is a subcategory label, not an entry
        assert len(entries) == 2
        names = [e["name"] for e in entries]
        assert "awesome-mysql" not in names
        assert "mysqlclient" in names
        assert "pymysql" in names

    def test_also_see_sub_entries(self):
        """A linked bullet nested under a linked entry lands in that entry's ``also_see``."""
        nodes = _content_nodes(
            "- [asyncio](https://docs.python.org/3/library/asyncio.html) - Async I/O.\n"
            "  - [awesome-asyncio](https://github.com/timofurrer/awesome-asyncio)\n"
            "- [trio](https://github.com/python-trio/trio) - Friendly async.\n"
        )
        entries = _parse_section_entries(nodes)
        assert len(entries) == 2
        assert entries[0]["name"] == "asyncio"
        assert len(entries[0]["also_see"]) == 1
        assert entries[0]["also_see"][0]["name"] == "awesome-asyncio"
        assert entries[1]["name"] == "trio"
        assert entries[1]["also_see"] == []

    def test_entry_count_includes_also_see(self):
        """``entry_count`` counts nested also-see links as well as main entries."""
        readme = textwrap.dedent("""\
            # T

            ---

            ## Async

            - [asyncio](https://x.com) - Async I/O.
              - [awesome-asyncio](https://y.com)
            - [trio](https://z.com) - Friendly async.

            # Contributing

            Done.
        """)
        cats, _ = parse_readme(readme)
        # 2 main entries + 1 also_see = 3
        assert cats[0]["entry_count"] == 3

    def test_preview_first_four_names(self):
        """``preview`` joins only the first four entry names with ", "."""
        readme = textwrap.dedent("""\
            # T

            ---

            ## Libs

            - [alpha](https://x.com) - A.
            - [beta](https://x.com) - B.
            - [gamma](https://x.com) - C.
            - [delta](https://x.com) - D.
            - [epsilon](https://x.com) - E.

            # Contributing

            Done.
        """)
        cats, _ = parse_readme(readme)
        assert cats[0]["preview"] == "alpha, beta, gamma, delta"

    def test_description_html_escapes_xss(self):
        """Raw HTML in a description must appear escaped, never as a live tag."""
        nodes = _content_nodes('- [lib](https://x.com) - A <script>alert(1)</script> lib.\n')
        entries = _parse_section_entries(nodes)
        assert "<script>" not in entries[0]["description"]
        assert "&lt;script&gt;" in entries[0]["description"]
|
|
|
|
|
|
class TestRenderSectionHtml:
    """Tests for the HTML produced by ``_render_section_html``.

    Nested list items in the fixtures are indented by two spaces so that
    CommonMark parses them as children of the preceding item.
    """

    def test_basic_entry(self):
        """A linked bullet renders with the ``entry`` class, its href, name and description."""
        nodes = _content_nodes("- [django](https://example.com) - A web framework.\n")
        html = _render_section_html(nodes)
        assert 'class="entry"' in html
        assert 'href="https://example.com"' in html
        assert "django" in html
        assert "A web framework." in html

    def test_subcategory_label(self):
        """A plain-text parent bullet renders as ``subcat``; its child as ``entry``."""
        nodes = _content_nodes(
            "- Synchronous\n  - [django](https://x.com) - Framework.\n"
        )
        html = _render_section_html(nodes)
        assert 'class="subcat"' in html
        assert "Synchronous" in html
        assert 'class="entry"' in html

    def test_sub_entry(self):
        """A link nested under a linked entry renders with the ``entry-sub`` class."""
        nodes = _content_nodes(
            "- [django](https://x.com) - Framework.\n"
            "  - [awesome-django](https://y.com)\n"
        )
        html = _render_section_html(nodes)
        assert 'class="entry-sub"' in html
        assert "awesome-django" in html

    def test_link_only_entry(self):
        """A bullet that is only a link still renders as an ``entry``."""
        nodes = _content_nodes("- [tool](https://x.com)\n")
        html = _render_section_html(nodes)
        assert 'class="entry"' in html
        assert 'href="https://x.com"' in html
        assert "tool" in html

    def test_xss_escaped_in_name(self):
        """Markup in a link label is either dropped or emitted in escaped form."""
        nodes = _content_nodes('- [<img onerror=alert(1)>](https://x.com) - Bad.\n')
        html = _render_section_html(nodes)
        # Passes if the attribute text is gone, or if an escaped "<" is present.
        assert "onerror" not in html or "&lt;" in html

    def test_xss_escaped_in_subcat(self):
        """Raw HTML in a subcategory label must not survive as a live tag."""
        nodes = _content_nodes("- <script>alert(1)</script>\n")
        html = _render_section_html(nodes)
        assert "<script>" not in html
|
|
|
|
|
|
class TestParseRealReadme:
    """Tests that run ``parse_readme`` against the repository's real README.md.

    The README is looked up two directories above this test file, so these
    tests depend on the current content of that file.
    """

    @pytest.fixture(autouse=True)
    def load_readme(self):
        """Read README.md and store the text and parse results on the instance."""
        readme_path = os.path.join(os.path.dirname(__file__), "..", "..", "README.md")
        with open(readme_path, encoding="utf-8") as f:
            self.readme_text = f.read()
        self.cats, self.resources = parse_readme(self.readme_text)

    def test_at_least_83_categories(self):
        """The README yields at least 83 categories."""
        assert len(self.cats) >= 83

    def test_resources_has_newsletters_and_podcasts(self):
        """Both "Newsletters" and "Podcasts" are among the resources."""
        names = [r["name"] for r in self.resources]
        assert "Newsletters" in names
        assert "Podcasts" in names

    def test_contributing_not_in_results(self):
        """ "Contributing" is neither a category nor a resource."""
        all_names = [c["name"] for c in self.cats] + [r["name"] for r in self.resources]
        assert "Contributing" not in all_names

    def test_first_category_is_admin_panels(self):
        """The first category is "Admin Panels" with slug ``admin-panels``."""
        assert self.cats[0]["name"] == "Admin Panels"
        assert self.cats[0]["slug"] == "admin-panels"

    def test_last_category_is_wsgi_servers(self):
        """The last category is "WSGI Servers" with slug ``wsgi-servers``."""
        assert self.cats[-1]["name"] == "WSGI Servers"
        assert self.cats[-1]["slug"] == "wsgi-servers"

    def test_restful_api_slug(self):
        """Some category has the slug ``restful-api``."""
        slugs = [c["slug"] for c in self.cats]
        assert "restful-api" in slugs

    def test_descriptions_extracted(self):
        """The first category's description matches the README text exactly."""
        admin = self.cats[0]
        assert admin["description"] == "Libraries for administrative interfaces."

    def test_entry_counts_nonzero(self):
        """Every category reports at least one entry."""
        for cat in self.cats:
            assert cat["entry_count"] > 0, f"{cat['name']} has 0 entries"

    def test_previews_nonempty(self):
        """Every category has a non-empty preview."""
        for cat in self.cats:
            assert cat["preview"], f"{cat['name']} has empty preview"

    def test_content_html_nonempty(self):
        """Every category has non-empty rendered HTML."""
        for cat in self.cats:
            assert cat["content_html"], f"{cat['name']} has empty content_html"

    def test_algorithms_has_subcategories(self):
        """The algorithms category renders at least one ``subcat`` label."""
        algos = next(c for c in self.cats if c["name"] == "Algorithms and Design Patterns")
        assert 'class="subcat"' in algos["content_html"]

    def test_async_has_also_see(self):
        """The ``asyncio`` entry lists ``awesome-asyncio`` first in ``also_see``."""
        async_cat = next(c for c in self.cats if c["name"] == "Asynchronous Programming")
        asyncio_entry = next(e for e in async_cat["entries"] if e["name"] == "asyncio")
        assert len(asyncio_entry["also_see"]) >= 1
        assert asyncio_entry["also_see"][0]["name"] == "awesome-asyncio"

    def test_description_links_stripped_to_text(self):
        """Links in a category description are kept as label text without URLs."""
        algos = next(c for c in self.cats if c["name"] == "Algorithms and Design Patterns")
        assert "awesome-algorithms" in algos["description"]
        assert "https://" not in algos["description"]
|