@@ -190,43 +190,6 @@ def test_tree_get_nodes_at_level(self):
190190 nodes = tree .get_nodes_at_level (level )
191191 assert all (n .level == level for n in nodes )
192192
def test_get_chunks_at_level_includes_shallow_branches(self):
    """Branches that never reach the requested level must still yield a chunk."""
    markdown = """# Title

## 1

### 1-1

Content 1-1

## 2

### 2-1

#### 2-1-1

Content 2-1-1

### 2-2

Content 2-2
"""
    doc_tree = build_document_tree(markdown, format="markdown")

    # Asking for level 4 is expected to return three chunks:
    #   1-1    -> a leaf whose branch stops before level 4
    #   2-1-1  -> sits exactly at level 4
    #   2-2    -> a leaf whose branch stops before level 4
    level_four = doc_tree.get_chunks_at_level(4)

    assert len(level_four) == 3

    found_titles = [c.title for c in level_four]
    for expected in ("1-1", "2-1-1", "2-2"):
        assert expected in found_titles
229-
230193 def test_tree_iter_all_nodes (self ):
231194 tree = build_document_tree (FIXTURES_DIR / "sample.md" )
232195
@@ -285,3 +248,98 @@ def test_build_tree_auto_detect(self):
285248 org_content = "* Org Heading\n Content"
286249 tree = build_document_tree (org_content )
287250 assert tree .source_format == "org"
251+
252+
class TestLosslessChunking:
    """Tests for the lossless chunking pipeline.

    Each test feeds a small markdown document through one stage of the
    pipeline and checks that every heading and paragraph from the input is
    still present in the output.
    """

    def test_flatten_tree_preserves_all_content(self):
        """flatten_tree yields one block per heading and drops no text."""
        from doc2anki.pipeline.processor import flatten_tree

        content = """# Title
Something

## 1
Content under 1

### 1-1
Content under 1-1

## 2
Content under 2
"""
        tree = build_document_tree(content, format="markdown")
        blocks = flatten_tree(tree)

        # Should have 4 blocks: Title, 1, 1-1, 2
        assert len(blocks) == 4

        all_text = "\n".join(b.to_text() for b in blocks)

        # Fragments that are not substrings of any other fragment can be
        # checked directly against the full text.
        for fragment in (
            "# Title",
            "Something",
            "### 1-1",
            "Content under 1-1",
            "## 2",
            "Content under 2",
        ):
            assert fragment in all_text

        # "## 1" is contained in "### 1-1" and "Content under 1" is contained
        # in "Content under 1-1", so a plain `in` check on the full text would
        # still pass if section 1 had been dropped. Remove the longer strings
        # first and look for the shorter ones in what remains.
        without_subsection = all_text.replace("### 1-1", "").replace(
            "Content under 1-1", ""
        )
        assert "## 1" in without_subsection
        assert "Content under 1" in without_subsection

    def test_greedy_chunk_single_chunk(self):
        """A document far below max_tokens ends up in exactly one chunk."""
        from doc2anki.pipeline.processor import flatten_tree, greedy_chunk
        from doc2anki.parser.metadata import DocumentMetadata

        content = """# Title
Something

## Section
More content
"""
        tree = build_document_tree(content, format="markdown")
        blocks = flatten_tree(tree)
        chunks = greedy_chunk(
            blocks, max_tokens=10000, metadata=DocumentMetadata.empty()
        )

        # Should be a single chunk
        assert len(chunks) == 1

        # The single chunk has to carry every heading and paragraph.
        chunk_text = chunks[0].chunk_content
        for fragment in ("# Title", "Something", "## Section", "More content"):
            assert fragment in chunk_text

    def test_process_pipeline_lossless(self):
        """process_pipeline keeps every heading and paragraph of the input."""
        from doc2anki.pipeline import process_pipeline

        content = """# Title
Intro content

## 1
Section 1 content

### 1-1
Deep content

## 2
Section 2 content
"""
        tree = build_document_tree(content, format="markdown")
        chunks = process_pipeline(tree, max_tokens=10000)

        # Should be a single chunk with all content
        assert len(chunks) == 1

        chunk_text = chunks[0].chunk_content
        for fragment in (
            "# Title",
            "Intro content",
            "Section 1 content",
            "### 1-1",
            "Deep content",
            "## 2",
            "Section 2 content",
        ):
            assert fragment in chunk_text

        # "## 1" also occurs inside "### 1-1"; strip the deeper heading so the
        # check only succeeds when the level-2 heading itself survived.
        assert "## 1" in chunk_text.replace("### 1-1", "")
0 commit comments