Skip to content

Commit cf42bda

Browse files
committed
feat!: migrate to AST-based parsing, remove context blocks
BREAKING CHANGE: Static context blocks (```context / #+BEGIN_CONTEXT) are no longer supported. Use interactive mode (-I) to classify chunks as CONTEXT_ONLY instead.

Replace regex-based parser with proper AST parsing:
- Markdown: tree-sitter-markdown for accurate structure extraction
- Org-mode: orgparse for native AST support

Benefits:
- Reliable heading hierarchy detection
- Correct handling of nested structures and code blocks
- No more false positives from content resembling markdown syntax
1 parent 6e8015d commit cf42bda

15 files changed

Lines changed: 847 additions & 690 deletions

File tree

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ authors = [{ name = "SOV710", email = "chris916911179@outlook.com" }]
88
dependencies = [
99
"genanki>=0.13.0",
1010
"jinja2>=3.0.0",
11-
"markdown-it-py>=3.0.0",
1211
"openai>=1.0.0",
1312
"orgparse>=0.4.0",
1413
"pydantic>=2.0.0",
@@ -17,6 +16,8 @@ dependencies = [
1716
"rich>=13.0.0",
1817
"tiktoken>=0.5.0",
1918
"tomli>=2.0.0",
19+
"tree-sitter>=0.24.0",
20+
"tree-sitter-markdown>=0.3.0",
2021
"typer>=0.9.0",
2122
]
2223

src/doc2anki/cli.py

Lines changed: 10 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,7 @@ def generate_cmd(
286286
) -> None:
287287
"""Generate Anki cards from documents."""
288288
# Import parser here to avoid circular imports and speed up CLI startup
289-
from .parser import parse_document, build_document_tree, detect_format
289+
from .parser import build_document_tree
290290
from .pipeline import process_pipeline, auto_detect_level
291291

292292
# Validate input path
@@ -329,25 +329,17 @@ def generate_cmd(
329329
if verbose:
330330
console.print(f"\n[blue]Processing:[/blue] {file_path}")
331331

332-
# Parse document
332+
# Build document tree (includes metadata extraction)
333333
try:
334-
global_context, content = parse_document(file_path)
334+
tree = build_document_tree(file_path)
335335
except Exception as e:
336336
fatal_exit(f"Failed to parse {file_path}: {e}")
337337
return
338338

339-
if verbose and global_context:
340-
console.print(f"[blue]Global context:[/blue] {len(global_context)} items")
341-
for term, definition in global_context.items():
342-
console.print(f" - {term}: {definition}")
343-
344-
# Build document tree
345-
try:
346-
doc_format = "org" if file_path.suffix.lower() == ".org" else "markdown"
347-
tree = build_document_tree(content, doc_format)
348-
except Exception as e:
349-
fatal_exit(f"Failed to build document tree for {file_path}: {e}")
350-
return
339+
if verbose and tree.metadata.raw_data:
340+
console.print(f"[blue]Metadata:[/blue] {len(tree.metadata.raw_data)} items")
341+
for key, value in tree.metadata.raw_data.items():
342+
console.print(f" - {key}: {value}")
351343

352344
# Determine chunk level
353345
actual_level = chunk_level
@@ -380,7 +372,6 @@ def generate_cmd(
380372
tree=tree,
381373
chunk_level=actual_level,
382374
max_tokens=max_tokens,
383-
global_context=global_context,
384375
include_parent_chain=include_parent_chain,
385376
classified_nodes=classified_nodes,
386377
)
@@ -398,7 +389,7 @@ def generate_cmd(
398389

399390
if dry_run:
400391
console.print(f"\n[green]Dry run complete for {file_path}[/green]")
401-
console.print(f" Global context items: {len(global_context)}")
392+
console.print(f" Metadata items: {len(tree.metadata.raw_data)}")
402393
console.print(f" Chunks: {len(chunk_contexts)}")
403394
continue
404395

@@ -418,13 +409,13 @@ def generate_cmd(
418409

419410
chunk_cards = generate_cards_for_chunk(
420411
chunk=ctx.chunk_content,
421-
global_context=ctx.global_context,
412+
global_context=dict(ctx.metadata.raw_data) if ctx.metadata.raw_data else {},
422413
client=client,
423414
model=provider_config.model,
424415
template=template,
425416
max_retries=max_retries,
426417
verbose=verbose,
427-
parent_chain=ctx.parent_chain if include_parent_chain else None,
418+
parent_chain=list(ctx.parent_chain) if include_parent_chain else None,
428419
)
429420
cards.extend(chunk_cards)
430421

src/doc2anki/parser/__init__.py

Lines changed: 42 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,61 +1,55 @@
11
"""Document parsing module for doc2anki."""
22

33
from pathlib import Path
4+
import re
45

5-
from .base import ParseResult
6+
from .tree import HeadingNode, DocumentTree
7+
from .metadata import DocumentMetadata
8+
from .builder import TreeBuilder
69
from .markdown import MarkdownParser
710
from .markdown import build_tree as build_markdown_tree
8-
from .orgmode import OrgModeParser
11+
from .orgmode import OrgParser
912
from .orgmode import build_tree as build_org_tree
1013
from .chunker import chunk_document, count_tokens, ChunkingError
11-
from .tree import HeadingNode, DocumentTree
12-
13-
14-
def parse_document(file_path: Path) -> tuple[dict[str, str], str]:
15-
"""
16-
Parse a document file and extract global context and content.
17-
18-
Args:
19-
file_path: Path to the document file (.md or .org)
20-
21-
Returns:
22-
Tuple of (global_context dict, content string)
23-
24-
Raises:
25-
ValueError: If file format is not supported
26-
"""
27-
file_path = Path(file_path)
28-
suffix = file_path.suffix.lower()
2914

30-
if suffix == ".md":
31-
parser = MarkdownParser()
32-
elif suffix == ".org":
33-
parser = OrgModeParser()
34-
else:
35-
raise ValueError(f"Unsupported file format: {suffix}. Supported: .md, .org")
36-
37-
result = parser.parse(file_path)
38-
return result.global_context, result.content
3915

40-
41-
def build_document_tree(content: str, format: str = "markdown") -> DocumentTree:
16+
def build_document_tree(source: str | Path, format: str | None = None) -> DocumentTree:
4217
"""
43-
Build a DocumentTree from document content.
18+
Build a DocumentTree from document content or file.
4419
4520
Args:
46-
content: Document content string
47-
format: Document format ("markdown" or "org")
21+
source: Document content string or Path to file
22+
format: Document format ("markdown" or "org").
23+
If None, auto-detect from file extension or content.
4824
4925
Returns:
50-
DocumentTree with parsed heading hierarchy
26+
Immutable DocumentTree with parsed heading hierarchy and metadata
5127
5228
Raises:
53-
ValueError: If format is not supported
29+
ValueError: If format is not supported or cannot be detected
5430
"""
31+
# Determine format
32+
if format is None:
33+
if isinstance(source, Path):
34+
suffix = source.suffix.lower()
35+
if suffix == ".md":
36+
format = "markdown"
37+
elif suffix == ".org":
38+
format = "org"
39+
else:
40+
# Try to detect from content
41+
content = source.read_text(encoding="utf-8")
42+
format = detect_format(content)
43+
else:
44+
format = detect_format(source)
45+
46+
# Parse based on format
5547
if format in ("markdown", "md"):
56-
return build_markdown_tree(content)
48+
parser = MarkdownParser()
49+
return parser.parse(source)
5750
elif format in ("org", "orgmode"):
58-
return build_org_tree(content)
51+
parser = OrgParser()
52+
return parser.parse(source)
5953
else:
6054
raise ValueError(f"Unsupported format: {format}. Supported: markdown, org")
6155

@@ -66,24 +60,26 @@ def detect_format(content: str) -> str:
6660
6761
Returns "markdown" or "org" based on heading patterns.
6862
"""
69-
import re
70-
7163
md_headings = len(re.findall(r"^#{1,6}\s+.+$", content, re.MULTILINE))
7264
org_headings = len(re.findall(r"^\*+\s+.+$", content, re.MULTILINE))
7365

7466
return "org" if org_headings > md_headings else "markdown"
7567

7668

7769
__all__ = [
78-
"parse_document",
70+
# Core types
71+
"HeadingNode",
72+
"DocumentTree",
73+
"DocumentMetadata",
74+
"TreeBuilder",
75+
# Parsers
76+
"MarkdownParser",
77+
"OrgParser",
78+
# Functions
7979
"build_document_tree",
8080
"detect_format",
81+
# Chunking
8182
"chunk_document",
8283
"count_tokens",
8384
"ChunkingError",
84-
"ParseResult",
85-
"MarkdownParser",
86-
"OrgModeParser",
87-
"HeadingNode",
88-
"DocumentTree",
8985
]

src/doc2anki/parser/base.py

Lines changed: 0 additions & 85 deletions
This file was deleted.

0 commit comments

Comments (0)