Skip to content

Commit 56d1721

Browse files
committed
refactor(interactive): adopt independent-classification with HeadingNode.own_text
- Add HeadingNode.own_text (heading + direct content only) to avoid subtree leakage
- Interactive mode now traverses all AST nodes (no level param), shows breadcrumb paths, and counts tokens from own_text
- Add oversized section warnings against max_tokens; CLI passes max_tokens into interactive session
- Interactive pipeline builds ContentBlocks from direct content, runs greedy_chunk, and accumulates context via own_text (not full_content)
1 parent 47c931c commit 56d1721

File tree

4 files changed

+118
-44
lines changed

4 files changed

+118
-44
lines changed

src/doc2anki/cli.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -344,13 +344,11 @@ def generate_cmd(
344344
if interactive:
345345
from .pipeline import run_interactive_session
346346

347-
# Use minimum heading level for interactive mode
348-
interactive_level = tree.min_level if tree.min_level > 0 else 2
349347
classified_nodes = run_interactive_session(
350348
tree=tree,
351-
level=interactive_level,
352349
console=console,
353350
filename=str(file_path.name),
351+
max_tokens=max_tokens,
354352
)
355353

356354
if not classified_nodes:

src/doc2anki/parser/tree.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,30 @@ def full_content(self) -> str:
5959

6060
return "\n\n".join(parts)
6161

62+
@property
63+
def own_text(self) -> str:
64+
"""
65+
Get only this node's own text (without children).
66+
67+
Used for "independent classification" semantics where each node
68+
is classified independently. Only includes:
69+
- Heading line (Markdown format)
70+
- Direct content (no descendants)
71+
72+
Format is consistent with full_content's heading line style.
73+
"""
74+
parts = []
75+
76+
# Heading line (Markdown format)
77+
heading_marker = "#" * self.level
78+
parts.append(f"{heading_marker} {self.title}")
79+
80+
# Direct content only (no children)
81+
if self.content.strip():
82+
parts.append(self.content.strip())
83+
84+
return "\n\n".join(parts)
85+
6286
def iter_descendants(self) -> Iterator[HeadingNode]:
6387
"""Iterate over all descendant nodes (depth-first)."""
6488
for child in self.children:

src/doc2anki/pipeline/interactive.py

Lines changed: 63 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -21,24 +21,21 @@
2121
"s": ChunkType.SKIP,
2222
}
2323

24-
# Warning threshold for large chunks (tokens)
25-
LARGE_CHUNK_THRESHOLD = 2000
26-
2724

2825
@dataclass
2926
class InteractiveSession:
3027
"""Manages interactive chunk classification."""
3128

3229
tree: DocumentTree
33-
level: int
3430
nodes: list[HeadingNode] = field(default_factory=list)
3531
classified: list[ClassifiedNode] = field(default_factory=list)
3632
current_index: int = 0
3733
accumulated_tokens: int = 0
3834

3935
def __post_init__(self) -> None:
4036
"""Initialize nodes and classified list from tree."""
41-
self.nodes = self.tree.get_nodes_at_level(self.level)
37+
# Traverse all nodes in document order (depth-first)
38+
self.nodes = list(self.tree.iter_all_nodes())
4239
# Initialize all as CARD_ONLY (default)
4340
self.classified = [
4441
ClassifiedNode(node=n, chunk_type=ChunkType.CARD_ONLY)
@@ -65,12 +62,14 @@ def classify_current(self, chunk_type: ChunkType) -> int:
6562
Classify the current node and advance.
6663
6764
Returns the token count of the classified chunk.
65+
Uses own_text (not full_content) for independent classification.
6866
"""
6967
if self.is_complete:
7068
return 0
7169

7270
node = self.nodes[self.current_index]
73-
tokens = count_tokens(node.full_content)
71+
# Use own_text for independent classification semantics
72+
tokens = count_tokens(node.own_text)
7473

7574
self.classified[self.current_index].chunk_type = chunk_type
7675

@@ -104,13 +103,24 @@ def display_section_summary(
104103
console: Console,
105104
nodes: list[HeadingNode],
106105
filename: str,
107-
level: int,
108-
) -> None:
109-
"""Display a summary table of all sections."""
106+
max_tokens: int,
107+
) -> list[tuple[str, int]]:
108+
"""
109+
Display a summary table of all sections.
110+
111+
Args:
112+
console: Rich console for output
113+
nodes: List of HeadingNode to display
114+
filename: Source filename for display
115+
max_tokens: Maximum tokens per chunk (for oversized warning)
116+
117+
Returns:
118+
List of (breadcrumb, tokens) tuples for oversized nodes
119+
"""
110120
console.print()
111121
console.print(
112122
Panel(
113-
f"Found [cyan]{len(nodes)}[/cyan] sections at level [cyan]{level}[/cyan]",
123+
f"Found [cyan]{len(nodes)}[/cyan] sections",
114124
title=f"[bold]Processing: {filename}[/bold]",
115125
border_style="blue",
116126
)
@@ -121,17 +131,28 @@ def display_section_summary(
121131
table.add_column("Section", style="cyan")
122132
table.add_column("Tokens", justify="right")
123133

134+
oversized: list[tuple[str, int]] = []
135+
124136
for i, node in enumerate(nodes, 1):
125-
tokens = count_tokens(node.full_content)
126-
# Add warning indicator for large chunks
127-
token_str = f"{tokens:,}"
128-
if tokens > LARGE_CHUNK_THRESHOLD:
137+
# Use own_text for independent classification semantics
138+
tokens = count_tokens(node.own_text)
139+
breadcrumb = " > ".join(node.path)
140+
141+
if tokens > max_tokens:
142+
oversized.append((breadcrumb, tokens))
129143
token_str = f"[yellow]{tokens:,}[/yellow] [yellow]![/yellow]"
130-
table.add_row(str(i), node.title, token_str)
144+
style = "yellow"
145+
else:
146+
token_str = f"{tokens:,}"
147+
style = None
148+
149+
table.add_row(str(i), breadcrumb, token_str, style=style)
131150

132151
console.print(table)
133152
console.print()
134153

154+
return oversized
155+
135156

136157
def display_classification_help(console: Console) -> None:
137158
"""Display classification options."""
@@ -147,15 +168,17 @@ def display_classification_help(console: Console) -> None:
147168

148169
def preview_chunk(console: Console, node: HeadingNode) -> None:
149170
"""Display a preview of the chunk content."""
150-
content = node.full_content
171+
# Use own_text for independent classification semantics
172+
content = node.own_text
151173
# Truncate if too long
152174
max_preview = 2000
153175
if len(content) > max_preview:
154176
content = content[:max_preview] + "\n... [dim](truncated)[/dim]"
155177

156-
# Detect syntax for highlighting
178+
# Use breadcrumb as title
179+
breadcrumb = " > ".join(node.path)
157180
syntax = Syntax(content, "markdown", theme="monokai", line_numbers=True)
158-
console.print(Panel(syntax, title=f"[bold]{node.title}[/bold]", border_style="cyan"))
181+
console.print(Panel(syntax, title=f"[bold]{breadcrumb}[/bold]", border_style="cyan"))
159182

160183

161184
def prompt_classification(
@@ -172,18 +195,20 @@ def prompt_classification(
172195
if node is None:
173196
return "done"
174197

175-
tokens = count_tokens(node.full_content)
198+
# Use own_text for independent classification semantics
199+
tokens = count_tokens(node.own_text)
176200
idx = session.current_index + 1
177201
total = session.total
178202

179-
# Build prompt
203+
# Build breadcrumb display
204+
breadcrumb = " > ".join(node.path)
205+
206+
# Build prompt with token info
180207
token_info = f"[dim]({tokens:,} tokens)[/dim]"
181-
if tokens > LARGE_CHUNK_THRESHOLD:
182-
token_info = f"[yellow]({tokens:,} tokens)[/yellow]"
183208

184209
console.print(
185210
f"Section [bold]{idx}/{total}[/bold] "
186-
f"[cyan]\"{node.title}\"[/cyan] {token_info}"
211+
f"[cyan]{breadcrumb}[/cyan] {token_info}"
187212
)
188213

189214
try:
@@ -243,30 +268,38 @@ def show_token_info(
243268

244269
def run_interactive_session(
245270
tree: DocumentTree,
246-
level: int,
247271
console: Console,
248272
filename: str = "",
273+
max_tokens: int = 3000,
249274
) -> list[ClassifiedNode]:
250275
"""
251276
Run an interactive classification session.
252277
253278
Args:
254279
tree: DocumentTree to classify
255-
level: Heading level to classify at
256280
console: Rich console for output
257281
filename: Source filename for display
282+
max_tokens: Maximum tokens per chunk (for oversized warnings)
258283
259284
Returns:
260285
List of ClassifiedNode with user classifications
261286
"""
262-
session = InteractiveSession(tree=tree, level=level)
287+
session = InteractiveSession(tree=tree)
263288

264289
if session.total == 0:
265-
console.print("[yellow]No sections found at this level.[/yellow]")
290+
console.print("[yellow]No sections found.[/yellow]")
266291
return []
267292

268-
# Display summary
269-
display_section_summary(console, session.nodes, filename, level)
293+
# Display summary and get oversized nodes
294+
oversized = display_section_summary(console, session.nodes, filename, max_tokens)
295+
296+
# Display oversized warnings
297+
if oversized:
298+
console.print(f"[yellow]Warning: {len(oversized)} section(s) exceed max_tokens ({max_tokens}):[/yellow]")
299+
for breadcrumb, tokens in oversized:
300+
console.print(f" [yellow]- {breadcrumb}: {tokens} tokens[/yellow]")
301+
console.print()
302+
270303
display_classification_help(console)
271304

272305
# Classification loop
@@ -288,7 +321,7 @@ def run_interactive_session(
288321
elif result == "reset":
289322
session.reset()
290323
console.print("[yellow]Reset. Starting over...[/yellow]\n")
291-
display_section_summary(console, session.nodes, filename, level)
324+
display_section_summary(console, session.nodes, filename, max_tokens)
292325
display_classification_help(console)
293326

294327
elif result == "done":

src/doc2anki/pipeline/processor.py

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,11 @@ def _process_with_classified_nodes(
179179
"""
180180
Process pre-classified nodes (interactive mode).
181181
182+
Uses "independent classification" semantics:
183+
- Each node is classified independently
184+
- Uses own_text (not full_content) to exclude child content
185+
- Parent=CARD + Child=SKIP means parent only includes its direct content
186+
182187
Args:
183188
tree: DocumentTree
184189
classified_nodes: Pre-classified nodes
@@ -188,26 +193,40 @@ def _process_with_classified_nodes(
188193
Returns:
189194
List of ChunkWithContext objects
190195
"""
191-
accumulated_ctx = ""
192-
result: list[ChunkWithContext] = []
196+
# Build ContentBlocks from CARD/FULL nodes (using own_text semantics)
197+
card_blocks: list[ContentBlock] = []
198+
context_content = ""
193199

194200
for cn in classified_nodes:
195201
if cn.chunk_type == ChunkType.SKIP:
196202
continue
197203

198-
# For nodes that generate cards, create ChunkWithContext
204+
# For nodes that generate cards, create ContentBlock
205+
# Use node.content (direct content only), not full_content
199206
if cn.should_generate_cards:
200-
chunk_ctx = ChunkWithContext(
201-
metadata=tree.metadata,
202-
accumulated_context=accumulated_ctx,
203-
parent_chain=cn.node.path if include_parent_chain else (),
204-
chunk_content=cn.node.full_content,
207+
card_blocks.append(
208+
ContentBlock(
209+
level=cn.node.level,
210+
heading="#" * cn.node.level + " " + cn.node.title,
211+
content=cn.node.content, # Direct content only
212+
path=cn.node.path if include_parent_chain else (),
213+
)
205214
)
206-
result.append(chunk_ctx)
207215

208-
# Update accumulated context for subsequent chunks
216+
# Accumulate context using own_text (not full_content)
209217
if cn.should_add_to_context:
210-
accumulated_ctx += f"\n\n{cn.node.full_content}"
218+
context_content += f"\n\n{cn.node.own_text}"
219+
220+
if not card_blocks:
221+
return []
222+
223+
# Execute greedy_chunk on card blocks
224+
result = greedy_chunk(card_blocks, max_tokens, tree.metadata)
225+
226+
# Attach accumulated context to all chunks
227+
if context_content.strip():
228+
for chunk in result:
229+
chunk.accumulated_context = context_content.strip()
211230

212231
return result
213232

0 commit comments

Comments
 (0)