mirror of
https://github.com/vinta/awesome-python.git
synced 2026-03-25 05:33:05 +08:00
feat(website): add markdown-it-py README parser and inline renderer tests
Introduce readme_parser.py which parses README.md into structured section data using the markdown-it-py AST. Includes TypedDicts for ParsedEntry/ParsedSection, slugify(), render_inline_html(), and render_inline_text(). Add test_readme_parser.py covering HTML escaping, link rendering, emphasis, strong, and code_inline for both renderers. Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
93
website/readme_parser.py
Normal file
93
website/readme_parser.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""Parse README.md into structured section data using markdown-it-py AST."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import TypedDict
|
||||
|
||||
from markdown_it.tree import SyntaxTreeNode
|
||||
from markupsafe import escape
|
||||
|
||||
|
||||
class AlsoSee(TypedDict):
    """A named link; ``ParsedEntry.also_see`` holds a list of these."""

    # Display name of the link.
    name: str
    # Target URL of the link.
    url: str
|
||||
|
||||
|
||||
class ParsedEntry(TypedDict):
    """Structured data for a single entry listed inside a section."""

    # Entry name.
    name: str
    # Entry URL.
    url: str
    # Inline HTML, already escaped.
    description: str
    # Related links attached to this entry.
    also_see: list[AlsoSee]
|
||||
|
||||
|
||||
class ParsedSection(TypedDict):
    """Structured data for one README section."""

    # Section name.
    name: str
    # Slug for the section name.
    slug: str
    # Plain text; links are reduced to their text.
    description: str
    # Raw markdown, kept for backward compatibility.
    content: str
    # Entries parsed out of the section.
    entries: list[ParsedEntry]
    # Count of entries in the section.
    entry_count: int
    # Preview string for the section.
    preview: str
    # Rendered HTML, already escaped.
    content_html: str
|
||||
|
||||
|
||||
# --- Slugify ----------------------------------------------------------------
|
||||
|
||||
# Anything that is not a lowercase ASCII letter, digit, whitespace or hyphen.
_SLUG_NON_ALNUM_RE = re.compile(r"[^a-z0-9\s-]")
# One or more whitespace characters.
_SLUG_WHITESPACE_RE = re.compile(r"[\s]+")
# One or more consecutive hyphens.
_SLUG_MULTI_DASH_RE = re.compile(r"-+")


def slugify(name: str) -> str:
    """Convert a category name to a URL-friendly slug.

    The name is lowercased, every character other than ``a-z``, ``0-9``,
    whitespace and ``-`` is removed, whitespace runs become a single hyphen,
    repeated hyphens are collapsed, and hyphens at either end are trimmed.

    Args:
        name: Human-readable category name.

    Returns:
        The slug. It is empty when *name* contains no ASCII letters or digits.
    """
    slug = _SLUG_NON_ALNUM_RE.sub("", name.lower())
    # Strip first so leading/trailing whitespace does not turn into hyphens.
    slug = _SLUG_WHITESPACE_RE.sub("-", slug.strip())
    slug = _SLUG_MULTI_DASH_RE.sub("-", slug)
    # Hyphens present in the input itself can still sit at the edges
    # (e.g. "- Foo -"); a slug should not start or end with one.
    return slug.strip("-")
|
||||
|
||||
|
||||
# --- Inline renderers -------------------------------------------------------
|
||||
|
||||
|
||||
def render_inline_html(children: list[SyntaxTreeNode]) -> str:
|
||||
"""Render inline AST nodes to HTML with proper escaping."""
|
||||
parts: list[str] = []
|
||||
for child in children:
|
||||
match child.type:
|
||||
case "text":
|
||||
parts.append(str(escape(child.content)))
|
||||
case "softbreak":
|
||||
parts.append(" ")
|
||||
case "link":
|
||||
href = str(escape(child.attrGet("href") or ""))
|
||||
inner = render_inline_html(child.children)
|
||||
parts.append(
|
||||
f'<a href="{href}" target="_blank" rel="noopener">{inner}</a>'
|
||||
)
|
||||
case "em":
|
||||
parts.append(f"<em>{render_inline_html(child.children)}</em>")
|
||||
case "strong":
|
||||
parts.append(f"<strong>{render_inline_html(child.children)}</strong>")
|
||||
case "code_inline":
|
||||
parts.append(f"<code>{escape(child.content)}</code>")
|
||||
case "html_inline":
|
||||
parts.append(str(escape(child.content)))
|
||||
return "".join(parts)
|
||||
|
||||
|
||||
def render_inline_text(children: list[SyntaxTreeNode]) -> str:
|
||||
"""Render inline AST nodes to plain text (links become their text)."""
|
||||
parts: list[str] = []
|
||||
for child in children:
|
||||
match child.type:
|
||||
case "text":
|
||||
parts.append(child.content)
|
||||
case "softbreak":
|
||||
parts.append(" ")
|
||||
case "code_inline":
|
||||
parts.append(child.content)
|
||||
case "em" | "strong" | "link":
|
||||
parts.append(render_inline_text(child.children))
|
||||
return "".join(parts)
|
||||
69
website/tests/test_readme_parser.py
Normal file
69
website/tests/test_readme_parser.py
Normal file
@@ -0,0 +1,69 @@
|
||||
"""Tests for the readme_parser module."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
from readme_parser import render_inline_html, render_inline_text
|
||||
|
||||
from markdown_it import MarkdownIt
|
||||
from markdown_it.tree import SyntaxTreeNode
|
||||
|
||||
|
||||
def _parse_inline(md_text: str) -> list[SyntaxTreeNode]:
    """Parse *md_text* as CommonMark and return the inline nodes of its first block.

    Intended for single-paragraph input: the tree is walked as
    root -> first block (paragraph) -> inline node -> children.
    """
    tokens = MarkdownIt("commonmark").parse(md_text)
    paragraph = SyntaxTreeNode(tokens).children[0]
    inline = paragraph.children[0]
    return inline.children
|
||||
|
||||
|
||||
class TestRenderInlineHtml:
    """Tests for ``render_inline_html``: escaping plus link/em/strong/code markup."""

    def test_plain_text_escapes_html(self):
        children = _parse_inline("Hello <world> & friends")
        # The renderer escapes text and raw inline HTML, so the expected value
        # must contain entities, not the literal "<", ">" and "&".
        assert render_inline_html(children) == "Hello &lt;world&gt; &amp; friends"

    def test_link_with_target(self):
        children = _parse_inline("[name](https://example.com)")
        html = render_inline_html(children)
        assert 'href="https://example.com"' in html
        assert 'target="_blank"' in html
        assert 'rel="noopener"' in html
        assert ">name</a>" in html

    def test_emphasis(self):
        children = _parse_inline("*italic* text")
        assert "<em>italic</em>" in render_inline_html(children)

    def test_strong(self):
        children = _parse_inline("**bold** text")
        assert "<strong>bold</strong>" in render_inline_html(children)

    def test_code_inline(self):
        children = _parse_inline("`some code`")
        assert "<code>some code</code>" in render_inline_html(children)

    def test_mixed_link_and_text(self):
        children = _parse_inline("See [foo](https://x.com) for details.")
        html = render_inline_html(children)
        # Text on both sides of the link must survive alongside the anchor.
        assert "See " in html
        assert ">foo</a>" in html
        assert " for details." in html
|
||||
|
||||
|
||||
class TestRenderInlineText:
    """Tests for ``render_inline_text``: markup is removed, visible text is kept."""

    def test_plain_text(self):
        rendered = render_inline_text(_parse_inline("Hello world"))
        assert rendered == "Hello world"

    def test_link_becomes_text(self):
        # Only the link label should remain; the URL is discarded.
        rendered = render_inline_text(
            _parse_inline("See [awesome-algos](https://github.com/x/y).")
        )
        assert rendered == "See awesome-algos."

    def test_emphasis_stripped(self):
        rendered = render_inline_text(_parse_inline("*italic* text"))
        assert rendered == "italic text"

    def test_code_inline_kept(self):
        rendered = render_inline_text(_parse_inline("`code` here"))
        assert rendered == "code here"
|
||||
Reference in New Issue
Block a user