Skip to content

Commit 47c931c

Browse files
committed
refactor(chunking): remove tree-level chunk helper; add lossless flatten/greedy tests
- Drop DocumentTree.get_chunks_at_level and its unit test
- Add TestLosslessChunking to validate that flatten_tree + greedy_chunk + process_pipeline preserve all content
1 parent 642349c commit 47c931c

2 files changed

Lines changed: 95 additions & 72 deletions

File tree

src/doc2anki/parser/tree.py

Lines changed: 0 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -146,40 +146,5 @@ def iter_all_nodes(self) -> Iterator[HeadingNode]:
146146
yield child
147147
yield from child.iter_descendants()
148148

149-
def get_chunks_at_level(self, level: int) -> tuple[HeadingNode, ...]:
150-
"""
151-
获取在指定 level 切分的 chunks。
152-
153-
对于每个分支:
154-
- 如果有 level N 的子节点:返回这些 level N 节点
155-
- 如果深度不足:返回该分支的叶子节点
156-
157-
这确保没有内容被丢弃。
158-
159-
Args:
160-
level: 目标切分层级
161-
162-
Returns:
163-
Tuple of HeadingNode objects to be used as chunks
164-
"""
165-
result: list[HeadingNode] = []
166-
167-
def collect(node: HeadingNode) -> None:
168-
if node.level >= level:
169-
# 到达或超过目标 level,作为 chunk
170-
result.append(node)
171-
elif not node.children:
172-
# 叶子节点且 level < target,作为 chunk(深度不足)
173-
result.append(node)
174-
else:
175-
# 有子节点且 level < target,继续递归
176-
for child in node.children:
177-
collect(child)
178-
179-
for child in self.children:
180-
collect(child)
181-
182-
return tuple(result)
183-
184149
def __repr__(self) -> str:
185150
return f"DocumentTree(children={len(self.children)}, levels={set(self.get_all_levels())})"

tests/test_parser.py

Lines changed: 95 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -190,43 +190,6 @@ def test_tree_get_nodes_at_level(self):
190190
nodes = tree.get_nodes_at_level(level)
191191
assert all(n.level == level for n in nodes)
192192

193-
def test_get_chunks_at_level_includes_shallow_branches(self):
194-
"""Test that get_chunks_at_level doesn't discard branches without target level."""
195-
content = """# Title
196-
197-
## 1
198-
199-
### 1-1
200-
201-
Content 1-1
202-
203-
## 2
204-
205-
### 2-1
206-
207-
#### 2-1-1
208-
209-
Content 2-1-1
210-
211-
### 2-2
212-
213-
Content 2-2
214-
"""
215-
tree = build_document_tree(content, format="markdown")
216-
217-
# At level 4, we should get:
218-
# - 1-1 (leaf, depth insufficient)
219-
# - 2-1-1 (exactly level 4)
220-
# - 2-2 (leaf, depth insufficient)
221-
chunks = tree.get_chunks_at_level(4)
222-
223-
assert len(chunks) == 3
224-
225-
titles = [chunk.title for chunk in chunks]
226-
assert "1-1" in titles
227-
assert "2-1-1" in titles
228-
assert "2-2" in titles
229-
230193
def test_tree_iter_all_nodes(self):
231194
tree = build_document_tree(FIXTURES_DIR / "sample.md")
232195

@@ -285,3 +248,98 @@ def test_build_tree_auto_detect(self):
285248
org_content = "* Org Heading\nContent"
286249
tree = build_document_tree(org_content)
287250
assert tree.source_format == "org"
251+
252+
253+
class TestLosslessChunking:
254+
"""Tests for lossless chunking pipeline."""
255+
256+
def test_flatten_tree_preserves_all_content(self):
257+
"""Test that flatten_tree includes all document content."""
258+
from doc2anki.pipeline.processor import flatten_tree
259+
260+
content = """# Title
261+
Something
262+
263+
## 1
264+
Content under 1
265+
266+
### 1-1
267+
Content under 1-1
268+
269+
## 2
270+
Content under 2
271+
"""
272+
tree = build_document_tree(content, format="markdown")
273+
blocks = flatten_tree(tree)
274+
275+
# Should have 4 blocks: Title, 1, 1-1, 2
276+
assert len(blocks) == 4
277+
278+
# Verify all content is present
279+
all_text = "\n".join(b.to_text() for b in blocks)
280+
assert "# Title" in all_text
281+
assert "Something" in all_text
282+
assert "## 1" in all_text
283+
assert "Content under 1" in all_text
284+
assert "### 1-1" in all_text
285+
assert "Content under 1-1" in all_text
286+
assert "## 2" in all_text
287+
assert "Content under 2" in all_text
288+
289+
def test_greedy_chunk_single_chunk(self):
290+
"""Test that small documents become a single chunk."""
291+
from doc2anki.pipeline.processor import flatten_tree, greedy_chunk
292+
from doc2anki.parser.metadata import DocumentMetadata
293+
294+
content = """# Title
295+
Something
296+
297+
## Section
298+
More content
299+
"""
300+
tree = build_document_tree(content, format="markdown")
301+
blocks = flatten_tree(tree)
302+
chunks = greedy_chunk(blocks, max_tokens=10000, metadata=DocumentMetadata.empty())
303+
304+
# Should be a single chunk
305+
assert len(chunks) == 1
306+
307+
# Chunk should contain all content
308+
chunk_text = chunks[0].chunk_content
309+
assert "# Title" in chunk_text
310+
assert "Something" in chunk_text
311+
assert "## Section" in chunk_text
312+
assert "More content" in chunk_text
313+
314+
def test_process_pipeline_lossless(self):
315+
"""Test that process_pipeline preserves all content."""
316+
from doc2anki.pipeline import process_pipeline
317+
318+
content = """# Title
319+
Intro content
320+
321+
## 1
322+
Section 1 content
323+
324+
### 1-1
325+
Deep content
326+
327+
## 2
328+
Section 2 content
329+
"""
330+
tree = build_document_tree(content, format="markdown")
331+
chunks = process_pipeline(tree, max_tokens=10000)
332+
333+
# Should be a single chunk with all content
334+
assert len(chunks) == 1
335+
336+
chunk_text = chunks[0].chunk_content
337+
# Verify nothing is lost
338+
assert "# Title" in chunk_text
339+
assert "Intro content" in chunk_text
340+
assert "## 1" in chunk_text
341+
assert "Section 1 content" in chunk_text
342+
assert "### 1-1" in chunk_text
343+
assert "Deep content" in chunk_text
344+
assert "## 2" in chunk_text
345+
assert "Section 2 content" in chunk_text

0 commit comments

Comments (0)