diff --git a/lark_channel/channel/config.py b/lark_channel/channel/config.py index 4d799e7..969ff6b 100644 --- a/lark_channel/channel/config.py +++ b/lark_channel/channel/config.py @@ -243,7 +243,7 @@ class StreamThrottleConfig: class MarkdownConverter: enabled: bool = True table_mode: TableMode = "off" - tag_md_mode: TagMdMode = "structured" + tag_md_mode: TagMdMode = "native" @dataclass diff --git a/lark_channel/channel/normalize/converters/post.py b/lark_channel/channel/normalize/converters/post.py index 153bc7c..3479568 100644 --- a/lark_channel/channel/normalize/converters/post.py +++ b/lark_channel/channel/normalize/converters/post.py @@ -1,13 +1,18 @@ """Converter: PostContent → Markdown (headings / bold / italic / code / links).""" +import re from typing import Any, Dict, List, Tuple from ...types import PostContent, ResourceDescriptor +_AT_MENTION_RE = re.compile(r'(.*?)') +_IMAGE_KEY_RE = re.compile(r"!\[(.*?)\]\(([^)]+)\)") + def convert(content: PostContent) -> Tuple[str, List[ResourceDescriptor]]: - md = _post_to_markdown(content.post) if content.post else content.text + md, md_resources = _post_to_markdown(content.post) if content.post else (content.text or "", []) resources = _post_resources(content.post) if content.post else [] + resources.extend(md_resources) return md, resources @@ -19,16 +24,25 @@ def _iter_documents(post: Dict[str, Any]) -> List[Dict[str, Any]]: return [doc for doc in post.values() if isinstance(doc, dict)] -def _post_to_markdown(post: Dict[str, Any]) -> str: +def _post_to_markdown(post: Dict[str, Any]) -> Tuple[str, List[ResourceDescriptor]]: docs = _iter_documents(post) if not docs: - return "" + return "", [] locale = docs[0] + + # Choose source paragraphs: prefer content_v2, fallback to content. + content_v2 = locale.get("content_v2") + if isinstance(content_v2, list) and len(content_v2) > 0: + source_paragraphs = content_v2 + else: + source_paragraphs = locale.get("content") or [] + lines: List[str] = [] + resources: List[ResourceDescriptor] = [] title = locale.get("title") if title: lines.append(f"# {title}") - for para in locale.get("content") or []: + for para in source_paragraphs: chunks: List[str] = [] for el in para or []: if not isinstance(el, dict): @@ -64,11 +78,13 @@ def _post_to_markdown(post: Dict[str, Any]) -> str: elif tag == "hr": chunks.append("---") elif tag == "md": - chunks.append(el.get("text") or "") + text, res = _process_md_text(el.get("text") or "") + chunks.append(text) + resources.extend(res) line = "".join(chunks) if line: lines.append(line) - return "\n\n".join(lines).strip() + return "\n\n".join(lines).strip(), resources def _post_resources(post: Dict[str, Any]) -> List[ResourceDescriptor]: @@ -105,3 +121,42 @@ def add(kind: str, key: Any, *, file_name: Any = None) -> None: elif tag == "file": add("file", el.get("file_key"), file_name=el.get("file_name")) return resources + + +def _process_md_text(text: str) -> Tuple[str, List[ResourceDescriptor]]: + """Post-process raw markdown text from an "md" element. + + Splits by fenced code block delimiters (```) and only applies + transformations (at-mention replacement, image key extraction) + to text outside of properly paired code blocks. Unclosed fences + are treated as outside-code-block text. + """ + resources: List[ResourceDescriptor] = [] + parts = text.split("```") + total = len(parts) + for i, part in enumerate(parts): + # Odd-index segments are inside code blocks, UNLESS it's the last + # segment of an even-length split (unclosed fence). + is_inside = (i % 2 == 1) + if is_inside and total % 2 == 0 and i == total - 1: + is_inside = False + if not is_inside: + # Outside code block: apply at-mention replacement. + def _replace_at(m: re.Match) -> str: + user_id = m.group(4) + name = m.group(5) + if user_id in ("all", "all_members"): + return "@all" + return f"@{name}" if name else f"@{user_id}" + + parts[i] = _AT_MENTION_RE.sub(_replace_at, part) + + # Extract image keys from ![...](key) patterns. + for _alt, img_key in _IMAGE_KEY_RE.findall(parts[i]): + if img_key: + resources.append(ResourceDescriptor( + type="image", # type: ignore[arg-type] + file_key=img_key, + )) + # Inside code block: preserve as-is. + return "```".join(parts), resources diff --git a/lark_channel/channel/outbound/markdown/to_post.py b/lark_channel/channel/outbound/markdown/to_post.py index f8f05ff..6fb8738 100644 --- a/lark_channel/channel/outbound/markdown/to_post.py +++ b/lark_channel/channel/outbound/markdown/to_post.py @@ -91,7 +91,7 @@ def markdown_to_post_ast( locale: str = "zh_cn", mentions: "list[Identity] | None" = None, table_mode: str = "off", - tag_md_mode: str = "structured", + tag_md_mode: str = "native", ) -> Dict[str, Any]: """Produce a Lark post AST (`{locale: {title, content: [[...]]}}`) from Markdown. @@ -99,13 +99,14 @@ def markdown_to_post_ast( paragraph so the recipient actually gets notified. ``tag_md_mode``: - - ``"structured"`` (default): parse Markdown into explicit post nodes + - ``"native"`` (default): wrap the raw markdown into one or more + ``tag:md`` rows (split at code-fence boundaries) and let the Feishu + client's own markdown parser render natively. Renders headers / + blockquotes / lists with native styling, but rendering depends on + Feishu client version. + - ``"structured"``: parse Markdown into explicit post nodes (``tag:text`` with style attributes, ``tag:a`` for links, ``tag:code_block`` for fenced code, etc). Cross-client deterministic. - - ``"native"``: wrap the raw markdown into one or more ``tag:md`` rows - (split at code-fence boundaries) and let the Feishu client's own - markdown parser render natively. Renders headers/blockquotes/lists - with native styling, but rendering depends on Feishu client version. """ if tag_md_mode == "native": return _build_native_md_ast(md, title=title, locale=locale, mentions=mentions) diff --git a/lark_channel/channel/tests/test_flatten.py b/lark_channel/channel/tests/test_flatten.py index 9806b28..6da5e29 100644 --- a/lark_channel/channel/tests/test_flatten.py +++ b/lark_channel/channel/tests/test_flatten.py @@ -153,6 +153,156 @@ def test_post_direct_document_shape_flattens_text_and_resources(): assert r[0].file_key == "img_direct" +def test_post_content_v2_md_preferred_and_post_processed(): + post = { + "zh_cn": { + "title": "V2", + "content": [[{"tag": "text", "text": "legacy content"}]], + "content_v2": [ + [ + { + "tag": "md", + "text": ( + 'hello Alice ' + 'and All ' + "![diagram](img_v2)\n\n" + "```text\n" + 'Code ![ignored](img_code)\n' + "```" + ), + } + ] + ], + } + } + + t, r = flatten(PostContent(post=post)) + + assert "# V2" in t + assert "legacy content" not in t + assert "hello @Alice and @all ![diagram](img_v2)" in t + assert 'Code ![ignored](img_code)' in t + assert [(x.type, x.file_key) for x in r] == [("image", "img_v2")] + + +def test_post_content_v2_empty_falls_back_to_content(): + """An empty content_v2 list must fall back to legacy content paragraphs.""" + post = { + "zh_cn": { + "title": "Fallback", + "content_v2": [], + "content": [[{"tag": "text", "text": "from legacy"}]], + } + } + + t, r = flatten(PostContent(post=post)) + + assert "# Fallback" in t + assert "from legacy" in t + assert r == [] + + +def test_post_content_v2_non_list_falls_back_to_content(): + """A non-list content_v2 (malformed) must fall back to legacy content.""" + post = { + "zh_cn": { + "title": "Bad", + "content_v2": "not-a-list", + "content": [[{"tag": "text", "text": "still works"}]], + } + } + + t, _ = flatten(PostContent(post=post)) + + assert "still works" in t + + +def test_post_md_text_at_all_members_alias_and_unnamed_at(): + """`all_members` resolves to @all; without inner text falls back to user_id.""" + post = { + "zh_cn": { + "content_v2": [ + [ + { + "tag": "md", + "text": ( + 'hi ' + 'and done' + ), + } + ] + ], + } + } + + t, r = flatten(PostContent(post=post)) + + assert "hi @all and @ou_42 done" in t + assert r == [] + + +def test_post_md_text_unclosed_fence_is_treated_as_outside(): + """An unclosed code fence must not protect at-mentions / image keys after it.""" + post = { + "zh_cn": { + "content_v2": [ + [ + { + "tag": "md", + "text": ( + 'before Alice\n' + "```python\n" + "still no close fence ![pic](img_unclosed)\n" + 'Bob' + ), + } + ] + ], + } + } + + t, r = flatten(PostContent(post=post)) + + assert "before @Alice" in t + assert "@Bob" in t + assert [(x.type, x.file_key) for x in r] == [ + ("image", "img_unclosed"), + ] + + +def test_post_md_text_multiple_paired_fences_protect_inner_blocks(): + """Multiple complete fence pairs: only outside-of-fence transformations apply.""" + post = { + "zh_cn": { + "content_v2": [ + [ + { + "tag": "md", + "text": ( + "outer1 ![a](img_a)\n" + "```\nblock1 X\n```\n" + "outer2 ![b](img_b)\n" + "```\nblock2 ![c](img_c)\n```\n" + "outer3" + ), + } + ] + ], + } + } + + t, r = flatten(PostContent(post=post)) + + # Inside-fence content preserved verbatim; outside-fence transformed. + assert 'block1 X' in t + assert "block2 ![c](img_c)" in t + # Only outside-fence images extracted (img_a, img_b), inner img_c skipped. + assert [(x.type, x.file_key) for x in r] == [ + ("image", "img_a"), + ("image", "img_b"), + ] + + def test_merge_forward_flatten_recursive(): child = TextContent(text="child content") item = MergeForwardItem( diff --git a/lark_channel/channel/tests/test_markdown.py b/lark_channel/channel/tests/test_markdown.py index b06fa73..0d732a5 100644 --- a/lark_channel/channel/tests/test_markdown.py +++ b/lark_channel/channel/tests/test_markdown.py @@ -8,22 +8,26 @@ def _zh(ast): return ast["zh_cn"] +def _structured(md, **kwargs): + return markdown_to_post_ast(md, tag_md_mode="structured", **kwargs) + + def test_plain_paragraph(): - ast = markdown_to_post_ast("hello world") + ast = _structured("hello world") assert _zh(ast)["title"] == "" paras = _zh(ast)["content"] assert paras == [[{"tag": "text", "text": "hello world"}]] def test_heading_becomes_bold_text(): - ast = markdown_to_post_ast("# Title\n\nbody") + ast = _structured("# Title\n\nbody") paras = _zh(ast)["content"] assert paras[0][0]["style"] == ["bold"] and paras[0][0]["text"] == "Title" assert paras[1] == [{"tag": "text", "text": "body"}] def test_bold_italic_code_inline(): - ast = markdown_to_post_ast("**bold** and *it* and `code`") + ast = _structured("**bold** and *it* and `code`") paras = _zh(ast)["content"] runs = paras[0] styles = [(r.get("text"), r.get("style", [])) for r in runs if r["tag"] == "text"] @@ -33,7 +37,7 @@ def test_bold_italic_code_inline(): def test_link_emits_a_tag(): - ast = markdown_to_post_ast("see [docs](https://x.example)") + ast = _structured("see [docs](https://x.example)") runs = _zh(ast)["content"][0] a_tag = next(r for r in runs if r["tag"] == "a") assert a_tag["text"] == "docs" @@ -41,7 +45,7 @@ def test_link_emits_a_tag(): def test_code_block_fenced(): - ast = markdown_to_post_ast("```python\nprint(1)\n```") + ast = _structured("```python\nprint(1)\n```") paras = _zh(ast)["content"] cb = paras[0][0] assert cb["tag"] == "code_block" @@ -50,26 +54,26 @@ def test_code_block_fenced(): def test_bullet_list_each_paragraph(): - ast = markdown_to_post_ast("- one\n- two\n- three") + ast = _structured("- one\n- two\n- three") paras = _zh(ast)["content"] assert len(paras) == 3 assert paras[0][0]["text"].startswith("• one") def test_hr(): - ast = markdown_to_post_ast("top\n\n---\n\nbot") + ast = _structured("top\n\n---\n\nbot") paras = _zh(ast)["content"] assert any(p == [{"tag": "hr"}] for p in paras) def test_blockquote_marker(): - ast = markdown_to_post_ast("> quoted line") + ast = _structured("> quoted line") first = _zh(ast)["content"][0] assert first[0] == {"tag": "text", "text": "│ "} def test_mentions_injected(): - ast = markdown_to_post_ast( + ast = _structured( "hi", mentions=[Identity(open_id="ou_1", display_name="Alice")], ) @@ -82,12 +86,12 @@ def test_mentions_injected(): def test_table_mode_bullets(): md = "| name | age |\n|---|---|\n| Alice | 30 |\n| Bob | 25 |" - ast = markdown_to_post_ast(md, table_mode="bullets") + ast = _structured(md, table_mode="bullets") paras = _zh(ast)["content"] assert paras[0][0]["text"].startswith("• name: Alice") assert paras[1][0]["text"].startswith("• name: Bob") def test_empty_input_yields_empty_paragraph(): - ast = markdown_to_post_ast("") + ast = _structured("") assert _zh(ast)["content"] == [[]] or _zh(ast)["content"] == [[{"tag": "text", "text": ""}]] diff --git a/lark_channel/channel/tests/test_markdown_native_mode.py b/lark_channel/channel/tests/test_markdown_native_mode.py index ea016dd..3b4b13f 100644 --- a/lark_channel/channel/tests/test_markdown_native_mode.py +++ b/lark_channel/channel/tests/test_markdown_native_mode.py @@ -41,7 +41,7 @@ def test_markdown_converter_has_tag_md_mode_field_with_structured_default(): conv = MarkdownConverter() - assert conv.tag_md_mode == "structured" + assert conv.tag_md_mode == "native" def test_markdown_converter_accepts_native_tag_md_mode(): @@ -108,6 +108,10 @@ def test_two_fences_separated_by_prose(self): class TestNativeMode: + def test_default_mode_is_native(self): + out = markdown_to_post_ast("# Hello") + assert out["zh_cn"]["content"] == [[{"tag": "md", "text": "# Hello"}]] + def test_plain_text_native_returns_single_md_node(self): out = markdown_to_post_ast("hello world", tag_md_mode="native") assert out == { @@ -170,21 +174,21 @@ def _load_snapshot(): path = Path(__file__).parent / "snapshots" / "markdown_structured.json" return json.loads(path.read_text(encoding="utf-8")) - def test_structured_default_kwarg_equals_snapshot(self): + def test_structured_explicit_kwarg_equals_snapshot(self): snapshot = self._load_snapshot() for label, text in FIXTURES.items(): - actual = markdown_to_post_ast(text) + actual = markdown_to_post_ast(text, tag_md_mode="structured") assert actual == snapshot[label], ( f"structured-mode regression for {label!r}\n" f"expected: {json.dumps(snapshot[label], ensure_ascii=False)}\n" f"actual: {json.dumps(actual, ensure_ascii=False)}" ) - def test_structured_explicit_kwarg_equals_default(self): + def test_native_default_matches_explicit_native(self): for label, text in FIXTURES.items(): assert markdown_to_post_ast(text) == markdown_to_post_ast( - text, tag_md_mode="structured" - ), f"explicit and default disagree for {label!r}" + text, tag_md_mode="native" + ), f"native explicit and default disagree for {label!r}" class TestSenderBuildPost: @@ -291,12 +295,12 @@ async def test_outbound_post_default_still_produces_structured_payload(self): from lark_channel.channel.types import OutboundPost d, calls = make_driver() - s = OutboundSender(d) # default OutboundConfig: structured + s = OutboundSender(d) # default OutboundConfig: native await s.send(OutboundPost(markdown="# Hello"), receive_id="oc_x") content = json.loads(calls[0]["content"]) node = content["zh_cn"]["content"][0][0] - assert node["tag"] == "text" - assert "bold" in node.get("style", []) + assert node["tag"] == "md" + assert node["text"] == "# Hello" @pytest.mark.asyncio async def test_outbound_post_native_with_code_fence_produces_multi_row(self): diff --git a/lark_channel/channel/tests/test_media_caption.py b/lark_channel/channel/tests/test_media_caption.py index a070c75..d963903 100644 --- a/lark_channel/channel/tests/test_media_caption.py +++ b/lark_channel/channel/tests/test_media_caption.py @@ -180,7 +180,8 @@ async def test_video_caption_native_post_body_uses_media_tag(): @pytest.mark.asyncio async def test_image_caption_structured_adds_media_as_final_row(): d, calls = make_caption_driver(image_key="img_structured") - s = OutboundSender(d) + cfg = OutboundConfig(markdown_converter=MarkdownConverter(tag_md_mode="structured")) + s = OutboundSender(d, cfg) await s.send( OutboundImage(source=MediaSource(kind="buffer", buffer=b"png"), caption="**bold**"), receive_id="oc_1", diff --git a/lark_channel/channel/tests/test_sender.py b/lark_channel/channel/tests/test_sender.py index 0101eae..365faf5 100644 --- a/lark_channel/channel/tests/test_sender.py +++ b/lark_channel/channel/tests/test_sender.py @@ -104,7 +104,9 @@ async def test_post_from_markdown_emits_post_msg(): # Wrapping with "post" causes server error 230001 (invalid message content). assert "post" not in content zh = content["zh_cn"] - assert zh["content"][0][0]["text"] == "bold" + # Default tag_md_mode is now "native", so markdown is preserved as raw md node. + assert zh["content"][0][0]["tag"] == "md" + assert zh["content"][0][0]["text"] == "**bold**" @pytest.mark.asyncio diff --git a/lark_channel/channel/tests/test_sender_extras.py b/lark_channel/channel/tests/test_sender_extras.py index 1cddf76..f156033 100644 --- a/lark_channel/channel/tests/test_sender_extras.py +++ b/lark_channel/channel/tests/test_sender_extras.py @@ -58,7 +58,7 @@ async def noop(**kwargs): s = OutboundSender( SendDriver(create_message=create_message, reply_message=noop), - OutboundConfig(markdown_converter=MarkdownConverter(table_mode="bullets")), + OutboundConfig(markdown_converter=MarkdownConverter(table_mode="bullets", tag_md_mode="structured")), ) md = "| name | age |\n|---|---|\n| Alice | 30 |" await s.send(OutboundPost(markdown=md), receive_id="oc_1")