Skip to content

Commit 642349c

Browse files
committed
refactor(pipeline): replace heading-level chunking with greedy token-based chunker
- Drop --chunk-level and the auto-detect logic; chunk by flatten + greedy merge
- Improve the verbose chunk preview with a token count and a head/tail snippet
- Interactive mode uses the minimum heading level
- Add DocumentTree.get_chunks_at_level() and tests
1 parent cf42bda commit 642349c

5 files changed

Lines changed: 260 additions & 84 deletions

File tree

src/doc2anki/cli.py

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -257,11 +257,6 @@ def generate_cmd(
257257
"--extra-tags",
258258
help="Additional tags (comma-separated)",
259259
),
260-
chunk_level: Optional[int] = typer.Option(
261-
None,
262-
"--chunk-level",
263-
help="Heading level to chunk at (1-6, default: auto)",
264-
),
265260
include_parent_chain: bool = typer.Option(
266261
True,
267262
"--include-parent-chain/--no-parent-chain",
@@ -287,7 +282,7 @@ def generate_cmd(
287282
"""Generate Anki cards from documents."""
288283
# Import parser here to avoid circular imports and speed up CLI startup
289284
from .parser import build_document_tree
290-
from .pipeline import process_pipeline, auto_detect_level
285+
from .pipeline import process_pipeline
291286

292287
# Validate input path
293288
if not input_path.exists():
@@ -341,23 +336,19 @@ def generate_cmd(
341336
for key, value in tree.metadata.raw_data.items():
342337
console.print(f" - {key}: {value}")
343338

344-
# Determine chunk level
345-
actual_level = chunk_level
346-
if actual_level is None:
347-
actual_level = auto_detect_level(tree, max_tokens)
348-
349339
if verbose:
350340
console.print(f"[blue]Document tree:[/blue] {tree}")
351-
console.print(f"[blue]Chunk level:[/blue] {actual_level}")
352341

353342
# Interactive classification if requested
354343
classified_nodes = None
355344
if interactive:
356345
from .pipeline import run_interactive_session
357346

347+
# Use minimum heading level for interactive mode
348+
interactive_level = tree.min_level if tree.min_level > 0 else 2
358349
classified_nodes = run_interactive_session(
359350
tree=tree,
360-
level=actual_level,
351+
level=interactive_level,
361352
console=console,
362353
filename=str(file_path.name),
363354
)
@@ -370,7 +361,6 @@ def generate_cmd(
370361
try:
371362
chunk_contexts = process_pipeline(
372363
tree=tree,
373-
chunk_level=actual_level,
374364
max_tokens=max_tokens,
375365
include_parent_chain=include_parent_chain,
376366
classified_nodes=classified_nodes,
@@ -380,12 +370,22 @@ def generate_cmd(
380370
return
381371

382372
if verbose:
373+
from .parser import count_tokens
374+
383375
console.print(f"[blue]Chunks:[/blue] {len(chunk_contexts)}")
384376
for i, ctx in enumerate(chunk_contexts):
385-
preview = ctx.chunk_content[:100].replace("\n", " ")
377+
tokens = count_tokens(ctx.chunk_content)
386378
chain_str = " > ".join(ctx.parent_chain) if ctx.parent_chain else "(root)"
387-
console.print(f" [{i+1}] {chain_str}")
388-
console.print(f" {preview}...")
379+
380+
# Show beginning and ending of content
381+
content = ctx.chunk_content.replace("\n", " ")
382+
if len(content) > 80:
383+
preview = f"{content[:40]}...{content[-30:]}"
384+
else:
385+
preview = content
386+
387+
console.print(f" [{i+1}] {chain_str} (tokens: {tokens})")
388+
console.print(f" {preview}")
389389

390390
if dry_run:
391391
console.print(f"\n[green]Dry run complete for {file_path}[/green]")

src/doc2anki/parser/tree.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,5 +146,40 @@ def iter_all_nodes(self) -> Iterator[HeadingNode]:
146146
yield child
147147
yield from child.iter_descendants()
148148

149+
def get_chunks_at_level(self, level: int) -> tuple[HeadingNode, ...]:
150+
"""
151+
获取在指定 level 切分的 chunks。
152+
153+
对于每个分支:
154+
- 如果有 level N 的子节点:返回这些 level N 节点
155+
- 如果深度不足:返回该分支的叶子节点
156+
157+
这确保没有内容被丢弃。
158+
159+
Args:
160+
level: 目标切分层级
161+
162+
Returns:
163+
Tuple of HeadingNode objects to be used as chunks
164+
"""
165+
result: list[HeadingNode] = []
166+
167+
def collect(node: HeadingNode) -> None:
168+
if node.level >= level:
169+
# 到达或超过目标 level,作为 chunk
170+
result.append(node)
171+
elif not node.children:
172+
# 叶子节点且 level < target,作为 chunk(深度不足)
173+
result.append(node)
174+
else:
175+
# 有子节点且 level < target,继续递归
176+
for child in node.children:
177+
collect(child)
178+
179+
for child in self.children:
180+
collect(child)
181+
182+
return tuple(result)
183+
149184
def __repr__(self) -> str:
150185
return f"DocumentTree(children={len(self.children)}, levels={set(self.get_all_levels())})"

src/doc2anki/pipeline/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,12 @@
33
from .classifier import ChunkType, ClassifiedNode
44
from .context import ChunkWithContext
55
from .interactive import run_interactive_session
6-
from .processor import auto_detect_level, process_pipeline
6+
from .processor import process_pipeline
77

88
__all__ = [
99
"ChunkType",
1010
"ClassifiedNode",
1111
"ChunkWithContext",
12-
"auto_detect_level",
1312
"process_pipeline",
1413
"run_interactive_session",
1514
]

0 commit comments

Comments
 (0)