Skip to content

Commit 47c931c

Browse files
committed
refactor(chunking): remove tree-level chunk helper; add lossless flatten/greedy tests
- Drop DocumentTree.get_chunks_at_level and its unit test
- Add TestLosslessChunking to validate that flatten_tree + greedy_chunk + process_pipeline preserve all content
1 parent 642349c commit 47c931c

2 files changed

Lines changed: 95 additions & 72 deletions

File tree

src/doc2anki/parser/tree.py

Lines changed: 0 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -146,40 +146,5 @@ def iter_all_nodes(self) -> Iterator[HeadingNode]:
146146
yield child
147147
yield from child.iter_descendants()
148148

149-
def get_chunks_at_level(self, level: int) -> tuple[HeadingNode, ...]:
150-
"""
151-
获取在指定 level 切分的 chunks。
152-
153-
对于每个分支:
154-
- 如果有 level N 的子节点:返回这些 level N 节点
155-
- 如果深度不足:返回该分支的叶子节点
156-
157-
这确保没有内容被丢弃。
158-
159-
Args:
160-
level: 目标切分层级
161-
162-
Returns:
163-
Tuple of HeadingNode objects to be used as chunks
164-
"""
165-
result: list[HeadingNode] = []
166-
167-
def collect(node: HeadingNode) -> None:
168-
if node.level >= level:
169-
# 到达或超过目标 level,作为 chunk
170-
result.append(node)
171-
elif not node.children:
172-
# 叶子节点且 level < target,作为 chunk(深度不足)
173-
result.append(node)
174-
else:
175-
# 有子节点且 level < target,继续递归
176-
for child in node.children:
177-
collect(child)
178-
179-
for child in self.children:
180-
collect(child)
181-
182-
return tuple(result)
183-
184149
def __repr__(self) -> str:
185150
return f"DocumentTree(children={len(self.children)}, levels={set(self.get_all_levels())})"

tests/test_parser.py

Lines changed: 95 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -190,43 +190,6 @@ def test_tree_get_nodes_at_level(self):
190190
nodes = tree.get_nodes_at_level(level)
191191
assert all(n.level == level for n in nodes)
192192

193-
def test_get_chunks_at_level_includes_shallow_branches(self):
194-
"""Test that get_chunks_at_level doesn't discard branches without target level."""
195-
content = """# Title
196-
197-
## 1
198-
199-
### 1-1
200-
201-
Content 1-1
202-
203-
## 2
204-
205-
### 2-1
206-
207-
#### 2-1-1
208-
209-
Content 2-1-1
210-
211-
### 2-2
212-
213-
Content 2-2
214-
"""
215-
tree = build_document_tree(content, format="markdown")
216-
217-
# At level 4, we should get:
218-
# - 1-1 (leaf, depth insufficient)
219-
# - 2-1-1 (exactly level 4)
220-
# - 2-2 (leaf, depth insufficient)
221-
chunks = tree.get_chunks_at_level(4)
222-
223-
assert len(chunks) == 3
224-
225-
titles = [chunk.title for chunk in chunks]
226-
assert "1-1" in titles
227-
assert "2-1-1" in titles
228-
assert "2-2" in titles
229-
230193
def test_tree_iter_all_nodes(self):
231194
tree = build_document_tree(FIXTURES_DIR / "sample.md")
232195

@@ -285,3 +248,98 @@ def test_build_tree_auto_detect(self):
285248
org_content = "* Org Heading\nContent"
286249
tree = build_document_tree(org_content)
287250
assert tree.source_format == "org"
251+
252+
253+
class TestLosslessChunking:
254+
"""Tests for lossless chunking pipeline."""
255+
256+
def test_flatten_tree_preserves_all_content(self):
257+
"""Test that flatten_tree includes all document content."""
258+
from doc2anki.pipeline.processor import flatten_tree
259+
260+
content = """# Title
261+
Something
262+
263+
## 1
264+
Content under 1
265+
266+
### 1-1
267+
Content under 1-1
268+
269+
## 2
270+
Content under 2
271+
"""
272+
tree = build_document_tree(content, format="markdown")
273+
blocks = flatten_tree(tree)
274+
275+
# Should have 4 blocks: Title, 1, 1-1, 2
276+
assert len(blocks) == 4
277+
278+
# Verify all content is present
279+
all_text = "\n".join(b.to_text() for b in blocks)
280+
assert "# Title" in all_text
281+
assert "Something" in all_text
282+
assert "## 1" in all_text
283+
assert "Content under 1" in all_text
284+
assert "### 1-1" in all_text
285+
assert "Content under 1-1" in all_text
286+
assert "## 2" in all_text
287+
assert "Content under 2" in all_text
288+
289+
def test_greedy_chunk_single_chunk(self):
290+
"""Test that small documents become a single chunk."""
291+
from doc2anki.pipeline.processor import flatten_tree, greedy_chunk
292+
from doc2anki.parser.metadata import DocumentMetadata
293+
294+
content = """# Title
295+
Something
296+
297+
## Section
298+
More content
299+
"""
300+
tree = build_document_tree(content, format="markdown")
301+
blocks = flatten_tree(tree)
302+
chunks = greedy_chunk(blocks, max_tokens=10000, metadata=DocumentMetadata.empty())
303+
304+
# Should be a single chunk
305+
assert len(chunks) == 1
306+
307+
# Chunk should contain all content
308+
chunk_text = chunks[0].chunk_content
309+
assert "# Title" in chunk_text
310+
assert "Something" in chunk_text
311+
assert "## Section" in chunk_text
312+
assert "More content" in chunk_text
313+
314+
def test_process_pipeline_lossless(self):
315+
"""Test that process_pipeline preserves all content."""
316+
from doc2anki.pipeline import process_pipeline
317+
318+
content = """# Title
319+
Intro content
320+
321+
## 1
322+
Section 1 content
323+
324+
### 1-1
325+
Deep content
326+
327+
## 2
328+
Section 2 content
329+
"""
330+
tree = build_document_tree(content, format="markdown")
331+
chunks = process_pipeline(tree, max_tokens=10000)
332+
333+
# Should be a single chunk with all content
334+
assert len(chunks) == 1
335+
336+
chunk_text = chunks[0].chunk_content
337+
# Verify nothing is lost
338+
assert "# Title" in chunk_text
339+
assert "Intro content" in chunk_text
340+
assert "## 1" in chunk_text
341+
assert "Section 1 content" in chunk_text
342+
assert "### 1-1" in chunk_text
343+
assert "Deep content" in chunk_text
344+
assert "## 2" in chunk_text
345+
assert "Section 2 content" in chunk_text

0 commit comments

Comments (0)