Skip to content

Commit cf42bda

Browse files
committed
feat!: migrate to AST-based parsing, remove context blocks
BREAKING CHANGE: Static context blocks (```context / #+BEGIN_CONTEXT) are no longer supported. Use interactive mode (-I) to classify chunks as CONTEXT_ONLY instead.

Replace regex-based parser with proper AST parsing:
- Markdown: tree-sitter-markdown for accurate structure extraction
- Org-mode: orgparse for native AST support

Benefits:
- Reliable heading hierarchy detection
- Correct handling of nested structures and code blocks
- No more false positives from content resembling markdown syntax
1 parent 6e8015d commit cf42bda

15 files changed

Lines changed: 847 additions & 690 deletions

File tree

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ authors = [{ name = "SOV710", email = "chris916911179@outlook.com" }]
88
dependencies = [
99
"genanki>=0.13.0",
1010
"jinja2>=3.0.0",
11-
"markdown-it-py>=3.0.0",
1211
"openai>=1.0.0",
1312
"orgparse>=0.4.0",
1413
"pydantic>=2.0.0",
@@ -17,6 +16,8 @@ dependencies = [
1716
"rich>=13.0.0",
1817
"tiktoken>=0.5.0",
1918
"tomli>=2.0.0",
19+
"tree-sitter>=0.24.0",
20+
"tree-sitter-markdown>=0.3.0",
2021
"typer>=0.9.0",
2122
]
2223

src/doc2anki/cli.py

Lines changed: 10 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,7 @@ def generate_cmd(
286286
) -> None:
287287
"""Generate Anki cards from documents."""
288288
# Import parser here to avoid circular imports and speed up CLI startup
289-
from .parser import parse_document, build_document_tree, detect_format
289+
from .parser import build_document_tree
290290
from .pipeline import process_pipeline, auto_detect_level
291291

292292
# Validate input path
@@ -329,25 +329,17 @@ def generate_cmd(
329329
if verbose:
330330
console.print(f"\n[blue]Processing:[/blue] {file_path}")
331331

332-
# Parse document
332+
# Build document tree (includes metadata extraction)
333333
try:
334-
global_context, content = parse_document(file_path)
334+
tree = build_document_tree(file_path)
335335
except Exception as e:
336336
fatal_exit(f"Failed to parse {file_path}: {e}")
337337
return
338338

339-
if verbose and global_context:
340-
console.print(f"[blue]Global context:[/blue] {len(global_context)} items")
341-
for term, definition in global_context.items():
342-
console.print(f" - {term}: {definition}")
343-
344-
# Build document tree
345-
try:
346-
doc_format = "org" if file_path.suffix.lower() == ".org" else "markdown"
347-
tree = build_document_tree(content, doc_format)
348-
except Exception as e:
349-
fatal_exit(f"Failed to build document tree for {file_path}: {e}")
350-
return
339+
if verbose and tree.metadata.raw_data:
340+
console.print(f"[blue]Metadata:[/blue] {len(tree.metadata.raw_data)} items")
341+
for key, value in tree.metadata.raw_data.items():
342+
console.print(f" - {key}: {value}")
351343

352344
# Determine chunk level
353345
actual_level = chunk_level
@@ -380,7 +372,6 @@ def generate_cmd(
380372
tree=tree,
381373
chunk_level=actual_level,
382374
max_tokens=max_tokens,
383-
global_context=global_context,
384375
include_parent_chain=include_parent_chain,
385376
classified_nodes=classified_nodes,
386377
)
@@ -398,7 +389,7 @@ def generate_cmd(
398389

399390
if dry_run:
400391
console.print(f"\n[green]Dry run complete for {file_path}[/green]")
401-
console.print(f" Global context items: {len(global_context)}")
392+
console.print(f" Metadata items: {len(tree.metadata.raw_data)}")
402393
console.print(f" Chunks: {len(chunk_contexts)}")
403394
continue
404395

@@ -418,13 +409,13 @@ def generate_cmd(
418409

419410
chunk_cards = generate_cards_for_chunk(
420411
chunk=ctx.chunk_content,
421-
global_context=ctx.global_context,
412+
global_context=dict(ctx.metadata.raw_data) if ctx.metadata.raw_data else {},
422413
client=client,
423414
model=provider_config.model,
424415
template=template,
425416
max_retries=max_retries,
426417
verbose=verbose,
427-
parent_chain=ctx.parent_chain if include_parent_chain else None,
418+
parent_chain=list(ctx.parent_chain) if include_parent_chain else None,
428419
)
429420
cards.extend(chunk_cards)
430421

src/doc2anki/parser/__init__.py

Lines changed: 42 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,61 +1,55 @@
11
"""Document parsing module for doc2anki."""
22

33
from pathlib import Path
4+
import re
45

5-
from .base import ParseResult
6+
from .tree import HeadingNode, DocumentTree
7+
from .metadata import DocumentMetadata
8+
from .builder import TreeBuilder
69
from .markdown import MarkdownParser
710
from .markdown import build_tree as build_markdown_tree
8-
from .orgmode import OrgModeParser
11+
from .orgmode import OrgParser
912
from .orgmode import build_tree as build_org_tree
1013
from .chunker import chunk_document, count_tokens, ChunkingError
11-
from .tree import HeadingNode, DocumentTree
12-
13-
14-
def parse_document(file_path: Path) -> tuple[dict[str, str], str]:
15-
"""
16-
Parse a document file and extract global context and content.
17-
18-
Args:
19-
file_path: Path to the document file (.md or .org)
20-
21-
Returns:
22-
Tuple of (global_context dict, content string)
23-
24-
Raises:
25-
ValueError: If file format is not supported
26-
"""
27-
file_path = Path(file_path)
28-
suffix = file_path.suffix.lower()
2914

30-
if suffix == ".md":
31-
parser = MarkdownParser()
32-
elif suffix == ".org":
33-
parser = OrgModeParser()
34-
else:
35-
raise ValueError(f"Unsupported file format: {suffix}. Supported: .md, .org")
36-
37-
result = parser.parse(file_path)
38-
return result.global_context, result.content
3915

40-
41-
def build_document_tree(content: str, format: str = "markdown") -> DocumentTree:
16+
def build_document_tree(source: str | Path, format: str | None = None) -> DocumentTree:
4217
"""
43-
Build a DocumentTree from document content.
18+
Build a DocumentTree from document content or file.
4419
4520
Args:
46-
content: Document content string
47-
format: Document format ("markdown" or "org")
21+
source: Document content string or Path to file
22+
format: Document format ("markdown" or "org").
23+
If None, auto-detect from file extension or content.
4824
4925
Returns:
50-
DocumentTree with parsed heading hierarchy
26+
Immutable DocumentTree with parsed heading hierarchy and metadata
5127
5228
Raises:
53-
ValueError: If format is not supported
29+
ValueError: If format is not supported or cannot be detected
5430
"""
31+
# Determine format
32+
if format is None:
33+
if isinstance(source, Path):
34+
suffix = source.suffix.lower()
35+
if suffix == ".md":
36+
format = "markdown"
37+
elif suffix == ".org":
38+
format = "org"
39+
else:
40+
# Try to detect from content
41+
content = source.read_text(encoding="utf-8")
42+
format = detect_format(content)
43+
else:
44+
format = detect_format(source)
45+
46+
# Parse based on format
5547
if format in ("markdown", "md"):
56-
return build_markdown_tree(content)
48+
parser = MarkdownParser()
49+
return parser.parse(source)
5750
elif format in ("org", "orgmode"):
58-
return build_org_tree(content)
51+
parser = OrgParser()
52+
return parser.parse(source)
5953
else:
6054
raise ValueError(f"Unsupported format: {format}. Supported: markdown, org")
6155

@@ -66,24 +60,26 @@ def detect_format(content: str) -> str:
6660
6761
Returns "markdown" or "org" based on heading patterns.
6862
"""
69-
import re
70-
7163
md_headings = len(re.findall(r"^#{1,6}\s+.+$", content, re.MULTILINE))
7264
org_headings = len(re.findall(r"^\*+\s+.+$", content, re.MULTILINE))
7365

7466
return "org" if org_headings > md_headings else "markdown"
7567

7668

7769
__all__ = [
78-
"parse_document",
70+
# Core types
71+
"HeadingNode",
72+
"DocumentTree",
73+
"DocumentMetadata",
74+
"TreeBuilder",
75+
# Parsers
76+
"MarkdownParser",
77+
"OrgParser",
78+
# Functions
7979
"build_document_tree",
8080
"detect_format",
81+
# Chunking
8182
"chunk_document",
8283
"count_tokens",
8384
"ChunkingError",
84-
"ParseResult",
85-
"MarkdownParser",
86-
"OrgModeParser",
87-
"HeadingNode",
88-
"DocumentTree",
8985
]

src/doc2anki/parser/base.py

Lines changed: 0 additions & 85 deletions
This file was deleted.

0 commit comments

Comments (0)