Skip to content

Commit 56d1721

Browse files
committed
refactor(interactive): adopt independent-classification with HeadingNode.own_text
- Add HeadingNode.own_text (heading + direct content only) to avoid subtree leakage
- Interactive mode now traverses all AST nodes (no level param), shows breadcrumb paths, and counts tokens from own_text
- Add oversized section warnings against max_tokens; CLI passes max_tokens into interactive session
- Interactive pipeline builds ContentBlocks from direct content, runs greedy_chunk, and accumulates context via own_text (not full_content)
1 parent 47c931c commit 56d1721

File tree

4 files changed

+118
-44
lines changed

4 files changed

+118
-44
lines changed

src/doc2anki/cli.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -344,13 +344,11 @@ def generate_cmd(
344344
if interactive:
345345
from .pipeline import run_interactive_session
346346

347-
# Use minimum heading level for interactive mode
348-
interactive_level = tree.min_level if tree.min_level > 0 else 2
349347
classified_nodes = run_interactive_session(
350348
tree=tree,
351-
level=interactive_level,
352349
console=console,
353350
filename=str(file_path.name),
351+
max_tokens=max_tokens,
354352
)
355353

356354
if not classified_nodes:

src/doc2anki/parser/tree.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,30 @@ def full_content(self) -> str:
5959

6060
return "\n\n".join(parts)
6161

62+
@property
63+
def own_text(self) -> str:
64+
"""
65+
Get only this node's own text (without children).
66+
67+
Used for "independent classification" semantics where each node
68+
is classified independently. Only includes:
69+
- Heading line (Markdown format)
70+
- Direct content (no descendants)
71+
72+
Format is consistent with full_content's heading line style.
73+
"""
74+
parts = []
75+
76+
# Heading line (Markdown format)
77+
heading_marker = "#" * self.level
78+
parts.append(f"{heading_marker} {self.title}")
79+
80+
# Direct content only (no children)
81+
if self.content.strip():
82+
parts.append(self.content.strip())
83+
84+
return "\n\n".join(parts)
85+
6286
def iter_descendants(self) -> Iterator[HeadingNode]:
6387
"""Iterate over all descendant nodes (depth-first)."""
6488
for child in self.children:

src/doc2anki/pipeline/interactive.py

Lines changed: 63 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -21,24 +21,21 @@
2121
"s": ChunkType.SKIP,
2222
}
2323

24-
# Warning threshold for large chunks (tokens)
25-
LARGE_CHUNK_THRESHOLD = 2000
26-
2724

2825
@dataclass
2926
class InteractiveSession:
3027
"""Manages interactive chunk classification."""
3128

3229
tree: DocumentTree
33-
level: int
3430
nodes: list[HeadingNode] = field(default_factory=list)
3531
classified: list[ClassifiedNode] = field(default_factory=list)
3632
current_index: int = 0
3733
accumulated_tokens: int = 0
3834

3935
def __post_init__(self) -> None:
4036
"""Initialize nodes and classified list from tree."""
41-
self.nodes = self.tree.get_nodes_at_level(self.level)
37+
# Traverse all nodes in document order (depth-first)
38+
self.nodes = list(self.tree.iter_all_nodes())
4239
# Initialize all as CARD_ONLY (default)
4340
self.classified = [
4441
ClassifiedNode(node=n, chunk_type=ChunkType.CARD_ONLY)
@@ -65,12 +62,14 @@ def classify_current(self, chunk_type: ChunkType) -> int:
6562
Classify the current node and advance.
6663
6764
Returns the token count of the classified chunk.
65+
Uses own_text (not full_content) for independent classification.
6866
"""
6967
if self.is_complete:
7068
return 0
7169

7270
node = self.nodes[self.current_index]
73-
tokens = count_tokens(node.full_content)
71+
# Use own_text for independent classification semantics
72+
tokens = count_tokens(node.own_text)
7473

7574
self.classified[self.current_index].chunk_type = chunk_type
7675

@@ -104,13 +103,24 @@ def display_section_summary(
104103
console: Console,
105104
nodes: list[HeadingNode],
106105
filename: str,
107-
level: int,
108-
) -> None:
109-
"""Display a summary table of all sections."""
106+
max_tokens: int,
107+
) -> list[tuple[str, int]]:
108+
"""
109+
Display a summary table of all sections.
110+
111+
Args:
112+
console: Rich console for output
113+
nodes: List of HeadingNode to display
114+
filename: Source filename for display
115+
max_tokens: Maximum tokens per chunk (for oversized warning)
116+
117+
Returns:
118+
List of (breadcrumb, tokens) tuples for oversized nodes
119+
"""
110120
console.print()
111121
console.print(
112122
Panel(
113-
f"Found [cyan]{len(nodes)}[/cyan] sections at level [cyan]{level}[/cyan]",
123+
f"Found [cyan]{len(nodes)}[/cyan] sections",
114124
title=f"[bold]Processing: {filename}[/bold]",
115125
border_style="blue",
116126
)
@@ -121,17 +131,28 @@ def display_section_summary(
121131
table.add_column("Section", style="cyan")
122132
table.add_column("Tokens", justify="right")
123133

134+
oversized: list[tuple[str, int]] = []
135+
124136
for i, node in enumerate(nodes, 1):
125-
tokens = count_tokens(node.full_content)
126-
# Add warning indicator for large chunks
127-
token_str = f"{tokens:,}"
128-
if tokens > LARGE_CHUNK_THRESHOLD:
137+
# Use own_text for independent classification semantics
138+
tokens = count_tokens(node.own_text)
139+
breadcrumb = " > ".join(node.path)
140+
141+
if tokens > max_tokens:
142+
oversized.append((breadcrumb, tokens))
129143
token_str = f"[yellow]{tokens:,}[/yellow] [yellow]![/yellow]"
130-
table.add_row(str(i), node.title, token_str)
144+
style = "yellow"
145+
else:
146+
token_str = f"{tokens:,}"
147+
style = None
148+
149+
table.add_row(str(i), breadcrumb, token_str, style=style)
131150

132151
console.print(table)
133152
console.print()
134153

154+
return oversized
155+
135156

136157
def display_classification_help(console: Console) -> None:
137158
"""Display classification options."""
@@ -147,15 +168,17 @@ def display_classification_help(console: Console) -> None:
147168

148169
def preview_chunk(console: Console, node: HeadingNode) -> None:
149170
"""Display a preview of the chunk content."""
150-
content = node.full_content
171+
# Use own_text for independent classification semantics
172+
content = node.own_text
151173
# Truncate if too long
152174
max_preview = 2000
153175
if len(content) > max_preview:
154176
content = content[:max_preview] + "\n... [dim](truncated)[/dim]"
155177

156-
# Detect syntax for highlighting
178+
# Use breadcrumb as title
179+
breadcrumb = " > ".join(node.path)
157180
syntax = Syntax(content, "markdown", theme="monokai", line_numbers=True)
158-
console.print(Panel(syntax, title=f"[bold]{node.title}[/bold]", border_style="cyan"))
181+
console.print(Panel(syntax, title=f"[bold]{breadcrumb}[/bold]", border_style="cyan"))
159182

160183

161184
def prompt_classification(
@@ -172,18 +195,20 @@ def prompt_classification(
172195
if node is None:
173196
return "done"
174197

175-
tokens = count_tokens(node.full_content)
198+
# Use own_text for independent classification semantics
199+
tokens = count_tokens(node.own_text)
176200
idx = session.current_index + 1
177201
total = session.total
178202

179-
# Build prompt
203+
# Build breadcrumb display
204+
breadcrumb = " > ".join(node.path)
205+
206+
# Build prompt with token info
180207
token_info = f"[dim]({tokens:,} tokens)[/dim]"
181-
if tokens > LARGE_CHUNK_THRESHOLD:
182-
token_info = f"[yellow]({tokens:,} tokens)[/yellow]"
183208

184209
console.print(
185210
f"Section [bold]{idx}/{total}[/bold] "
186-
f"[cyan]\"{node.title}\"[/cyan] {token_info}"
211+
f"[cyan]{breadcrumb}[/cyan] {token_info}"
187212
)
188213

189214
try:
@@ -243,30 +268,38 @@ def show_token_info(
243268

244269
def run_interactive_session(
245270
tree: DocumentTree,
246-
level: int,
247271
console: Console,
248272
filename: str = "",
273+
max_tokens: int = 3000,
249274
) -> list[ClassifiedNode]:
250275
"""
251276
Run an interactive classification session.
252277
253278
Args:
254279
tree: DocumentTree to classify
255-
level: Heading level to classify at
256280
console: Rich console for output
257281
filename: Source filename for display
282+
max_tokens: Maximum tokens per chunk (for oversized warnings)
258283
259284
Returns:
260285
List of ClassifiedNode with user classifications
261286
"""
262-
session = InteractiveSession(tree=tree, level=level)
287+
session = InteractiveSession(tree=tree)
263288

264289
if session.total == 0:
265-
console.print("[yellow]No sections found at this level.[/yellow]")
290+
console.print("[yellow]No sections found.[/yellow]")
266291
return []
267292

268-
# Display summary
269-
display_section_summary(console, session.nodes, filename, level)
293+
# Display summary and get oversized nodes
294+
oversized = display_section_summary(console, session.nodes, filename, max_tokens)
295+
296+
# Display oversized warnings
297+
if oversized:
298+
console.print(f"[yellow]Warning: {len(oversized)} section(s) exceed max_tokens ({max_tokens}):[/yellow]")
299+
for breadcrumb, tokens in oversized:
300+
console.print(f" [yellow]- {breadcrumb}: {tokens} tokens[/yellow]")
301+
console.print()
302+
270303
display_classification_help(console)
271304

272305
# Classification loop
@@ -288,7 +321,7 @@ def run_interactive_session(
288321
elif result == "reset":
289322
session.reset()
290323
console.print("[yellow]Reset. Starting over...[/yellow]\n")
291-
display_section_summary(console, session.nodes, filename, level)
324+
display_section_summary(console, session.nodes, filename, max_tokens)
292325
display_classification_help(console)
293326

294327
elif result == "done":

src/doc2anki/pipeline/processor.py

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,11 @@ def _process_with_classified_nodes(
179179
"""
180180
Process pre-classified nodes (interactive mode).
181181
182+
Uses "independent classification" semantics:
183+
- Each node is classified independently
184+
- Uses own_text (not full_content) to exclude child content
185+
- Parent=CARD + Child=SKIP means parent only includes its direct content
186+
182187
Args:
183188
tree: DocumentTree
184189
classified_nodes: Pre-classified nodes
@@ -188,26 +193,40 @@ def _process_with_classified_nodes(
188193
Returns:
189194
List of ChunkWithContext objects
190195
"""
191-
accumulated_ctx = ""
192-
result: list[ChunkWithContext] = []
196+
# Build ContentBlocks from CARD/FULL nodes (using own_text semantics)
197+
card_blocks: list[ContentBlock] = []
198+
context_content = ""
193199

194200
for cn in classified_nodes:
195201
if cn.chunk_type == ChunkType.SKIP:
196202
continue
197203

198-
# For nodes that generate cards, create ChunkWithContext
204+
# For nodes that generate cards, create ContentBlock
205+
# Use node.content (direct content only), not full_content
199206
if cn.should_generate_cards:
200-
chunk_ctx = ChunkWithContext(
201-
metadata=tree.metadata,
202-
accumulated_context=accumulated_ctx,
203-
parent_chain=cn.node.path if include_parent_chain else (),
204-
chunk_content=cn.node.full_content,
207+
card_blocks.append(
208+
ContentBlock(
209+
level=cn.node.level,
210+
heading="#" * cn.node.level + " " + cn.node.title,
211+
content=cn.node.content, # Direct content only
212+
path=cn.node.path if include_parent_chain else (),
213+
)
205214
)
206-
result.append(chunk_ctx)
207215

208-
# Update accumulated context for subsequent chunks
216+
# Accumulate context using own_text (not full_content)
209217
if cn.should_add_to_context:
210-
accumulated_ctx += f"\n\n{cn.node.full_content}"
218+
context_content += f"\n\n{cn.node.own_text}"
219+
220+
if not card_blocks:
221+
return []
222+
223+
# Execute greedy_chunk on card blocks
224+
result = greedy_chunk(card_blocks, max_tokens, tree.metadata)
225+
226+
# Attach accumulated context to all chunks
227+
if context_content.strip():
228+
for chunk in result:
229+
chunk.accumulated_context = context_content.strip()
211230

212231
return result
213232

0 commit comments

Comments
 (0)