Skip to content

Commit c628cdd

Browse files
committed
fix(llm): harden JSON+cloze prompt constraints and validation
- Enforce strict JSON-only output (no markdown/code fences, no raw newlines in strings)
- Add explicit cloze validity rules (type=cloze must include {{cN::...}}; otherwise downgrade to basic)
- Reduce HTML/style verbosity to avoid line-wrapping/truncation and improve extractor/Pydantic pass rate
1 parent 356ed7d commit c628cdd

File tree

4 files changed

+194
-70
lines changed

4 files changed

+194
-70
lines changed

src/doc2anki/llm/client.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ def call_llm(
3434
client: OpenAI,
3535
model: str,
3636
prompt: str,
37+
max_tokens: int = 65536,
3738
use_json_mode: bool = True,
3839
) -> str:
3940
"""
@@ -55,6 +56,7 @@ def call_llm(
5556
kwargs = {
5657
"model": model,
5758
"messages": [{"role": "user", "content": prompt}],
59+
"max_tokens": max_tokens,
5860
}
5961

6062
if use_json_mode:
@@ -121,6 +123,13 @@ def generate_cards_for_chunk(
121123
console.print(f" [dim]Attempt {attempt + 1}/{max_retries}...[/dim]")
122124

123125
response = call_llm(client, model, prompt)
126+
127+
if verbose:
128+
console.print("\n" + "=" * 80)
129+
console.print("[dim]Raw LLM response (verbatim):[/dim]")
130+
console.print(response, markup=False)
131+
console.print("=" * 80 + "\n")
132+
124133
json_data = extract_json(response)
125134
output = CardOutput.model_validate(json_data)
126135

src/doc2anki/models/cards.py

Lines changed: 88 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,69 @@
1-
"""Card Pydantic models for validation."""
1+
"""Card Pydantic models for validation.
2+
3+
This module validates LLM-generated cards and normalizes fields to ensure:
4+
- HTML payloads (Tokyo Night styled) can pass length constraints
5+
- Cloze placeholders like [CLOZE:c1:...] are converted to Anki {{c1::...}} markers
6+
- Tags are normalized and robust to common LLM output shapes
7+
"""
8+
9+
from __future__ import annotations
210

311
import re
4-
from typing import Literal, Union, List, Optional
12+
from typing import Annotated, List, Literal, Optional, Union, Any
13+
14+
from pydantic import BaseModel, Field, field_validator, ConfigDict
15+
16+
17+
# HTML cards can be quite large (inline styles + <style> blocks).
18+
# Keep an upper bound to avoid runaway outputs while not rejecting valid cards.
19+
MAX_HTML_LEN = 20_000
20+
21+
# Accept both:
22+
# 1) Standard Anki cloze markers: {{c1::...}}
23+
# 2) Template-safe placeholders: [CLOZE:c1:...]
24+
_CLOZE_ANKI_RE = re.compile(r"\{\{c\d+::", re.IGNORECASE)
25+
_CLOZE_PLACEHOLDER_RE = re.compile(r"\[CLOZE:c(\d+):(.+?)\]", re.IGNORECASE | re.DOTALL)
26+
27+
# Characters that can break Anki tags / filesystem-ish conventions; each match is replaced with "_"
28+
_TAG_SANITIZE_RE = re.compile(r'[&/\\:*?"<>|]')
29+
30+
31+
def _normalize_tags(v: Any) -> list[str]:
32+
"""Normalize tags from common LLM outputs."""
33+
if v is None or v == "":
34+
return []
535

6-
from pydantic import BaseModel, Field, field_validator
7-
from typing_extensions import Annotated
36+
# LLM sometimes returns a single string: "tag1, tag2"
37+
if isinstance(v, str):
38+
# split on commas, newlines, or runs of 2+ whitespace characters; a single space does NOT split (kept simple)
39+
raw = [t for t in re.split(r"[,\n]\s*|\s{2,}", v) if t.strip()]
40+
elif isinstance(v, (list, tuple, set)):
41+
raw = [str(t) for t in v if str(t).strip()]
42+
else:
43+
# Unexpected type: coerce to string
44+
raw = [str(v).strip()] if str(v).strip() else []
45+
46+
return [_TAG_SANITIZE_RE.sub("_", t.lower().strip()) for t in raw if t.strip()]
47+
48+
49+
def _convert_cloze_placeholders_to_anki(text: str) -> str:
50+
"""Convert [CLOZE:cN:...] placeholders to {{cN::...}} markers."""
51+
def repl(m: re.Match) -> str:
52+
n = m.group(1)
53+
content = m.group(2).strip()
54+
return f"{{{{c{n}::{content}}}}}"
55+
56+
# Convert all occurrences
57+
return _CLOZE_PLACEHOLDER_RE.sub(repl, text)
858

959

1060
class BasicCard(BaseModel):
1161
"""Basic question-answer card."""
62+
model_config = ConfigDict(extra="ignore")
1263

1364
type: Literal["basic"]
14-
front: str = Field(min_length=5, max_length=1000)
15-
back: str = Field(min_length=1, max_length=3000)
65+
front: str = Field(min_length=5, max_length=MAX_HTML_LEN)
66+
back: str = Field(min_length=1, max_length=MAX_HTML_LEN)
1667
tags: List[str] = Field(default_factory=list)
1768

1869
# Runtime fields (not from LLM)
@@ -21,18 +72,16 @@ class BasicCard(BaseModel):
2172

2273
@field_validator("tags", mode="before")
2374
@classmethod
24-
def normalize_tags(cls, v):
25-
"""Normalize tags: remove special characters, lowercase."""
26-
if not v:
27-
return []
28-
return [re.sub(r'[&/\\:*?"<>|]', "_", tag.lower().strip()) for tag in v]
75+
def normalize_tags(cls, v: Any) -> list[str]:
76+
return _normalize_tags(v)
2977

3078

3179
class ClozeCard(BaseModel):
3280
"""Cloze deletion card."""
81+
model_config = ConfigDict(extra="ignore")
3382

3483
type: Literal["cloze"]
35-
text: str = Field(min_length=10, max_length=3000)
84+
text: str = Field(min_length=10, max_length=MAX_HTML_LEN)
3685
tags: List[str] = Field(default_factory=list)
3786

3887
# Runtime fields (not from LLM)
@@ -41,25 +90,42 @@ class ClozeCard(BaseModel):
4190

4291
@field_validator("text")
4392
@classmethod
44-
def must_have_cloze_marker(cls, v: str) -> str:
45-
"""Ensure cloze card has {{cN::...}} marker."""
46-
if not re.search(r"\{\{c\d+::", v):
47-
raise ValueError("Cloze card must contain {{cN::...}} marker")
48-
return v
93+
def ensure_cloze_marker(cls, v: str) -> str:
94+
"""
95+
Ensure cloze card contains valid cloze markers.
96+
97+
Accepts:
98+
- Standard Anki: {{cN::...}}
99+
- Placeholder: [CLOZE:cN:...], converted automatically
100+
"""
101+
if not isinstance(v, str):
102+
raise TypeError("Cloze card text must be a string")
103+
104+
text = v.strip()
105+
if not text:
106+
raise ValueError("Cloze card text cannot be empty")
107+
108+
# Convert placeholder form -> Anki form
109+
if _CLOZE_PLACEHOLDER_RE.search(text):
110+
text = _convert_cloze_placeholders_to_anki(text)
111+
112+
# Validate Anki cloze markers exist
113+
if not _CLOZE_ANKI_RE.search(text):
114+
raise ValueError("Cloze card must contain {{cN::...}} marker (or [CLOZE:cN:...] placeholder)")
115+
116+
return text
49117

50118
@field_validator("tags", mode="before")
51119
@classmethod
52-
def normalize_tags(cls, v):
53-
"""Normalize tags: remove special characters, lowercase."""
54-
if not v:
55-
return []
56-
return [re.sub(r'[&/\\:*?"<>|]', "_", tag.lower().strip()) for tag in v]
120+
def normalize_tags(cls, v: Any) -> list[str]:
121+
return _normalize_tags(v)
57122

58123

59124
Card = Annotated[Union[BasicCard, ClozeCard], Field(discriminator="type")]
60125

61126

62127
class CardOutput(BaseModel):
63128
"""Container for LLM-generated cards."""
129+
model_config = ConfigDict(extra="ignore")
64130

65131
cards: List[Card]

templates/generate_cards.j2

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,4 +46,7 @@
4646

4747
## 待处理内容
4848

49+
---
50+
---
51+
4952
{{ chunk_content }}

templates/tokyonight_themes_template.j2

Lines changed: 94 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -10,91 +10,137 @@
1010
---
1111
{% endif %}
1212

13-
请根据以下内容生成 Anki 学习卡片,使用 Tokyo Night 配色风格
13+
请根据以下内容生成 Anki 学习卡片。
1414

15-
## 卡片类型要求
15+
## 要求
1616

17-
1. **basic**(基础问答):适合概念理解、原理解释
18-
2. **cloze**(填空):适合记忆具体细节、列表项
19-
3. 每张卡片应该是原子化的,只测试一个知识点
20-
4. 问题应该清晰明确,答案应该简洁准确
17+
1. 卡片类型只能是 `basic`(基础问答)或 `cloze`(填空)
18+
2. 每张卡片应该是原子化的,只测试一个知识点
19+
3. 问题应该清晰明确,答案应该简洁准确
20+
4. 对于适合记忆具体细节的内容,优先使用 cloze 类型
21+
5. 对于适合理解概念的内容,使用 basic 类型
2122

22-
## HTML 样式规范
23+
## 样式规范
24+
25+
所有卡片必须使用 Tokyo Night 配色主题的 HTML + 内联 CSS 样式。
2326

2427
### Basic 卡片样式
2528

26-
**正面(Front):**
29+
**正面模板(Front):**
30+
- 使用居中布局,问题清晰
31+
- 关键词使用 `<strong style="color: #7aa2f7; font-weight: 600;">` 突出显示
32+
33+
**背面模板(Back):**
34+
- 根据答案复杂度选择合适的结构:
35+
- 简短答案:段落形式
36+
- 列表答案:使用 `<ol>` 或 `<ul>` 标签
37+
- 复杂解释:使用 `<h2>` 分节,配合段落和列表
38+
39+
**标准 Basic 卡片结构:**
2740
```html
41+
<!-- Front -->
2842
<div style="max-width: 800px; margin: 0 auto; background-color: #16161e; border-radius: 8px; padding: 32px; box-shadow: 0 4px 16px rgba(0, 0, 0, 0.3); border: 1px solid #292e42;">
29-
<p style="margin: 0; font-size: 18px; color: #c0caf5; line-height: 1.8;">问题内容,使用 <strong style="color: #7aa2f7; font-weight: 600;">蓝色</strong> 强调关键词</p>
43+
<p style="margin: 0; font-size: 18px; color: #c0caf5; line-height: 1.8;">问题内容,关键词用 <strong style="color: #7aa2f7; font-weight: 600;">加粗</strong></p>
44+
</div>
45+
46+
<!-- Back - 简短答案 -->
47+
<div style="max-width: 800px; margin: 0 auto; background-color: #16161e; border-radius: 8px; padding: 32px; box-shadow: 0 4px 16px rgba(0, 0, 0, 0.3); border: 1px solid #292e42;">
48+
<p style="margin: 0; font-size: 16px; color: #a9b1d6; line-height: 1.8;">答案内容</p>
49+
</div>
50+
51+
<!-- Back - 列表答案 -->
52+
<div style="max-width: 800px; margin: 0 auto; background-color: #16161e; border-radius: 8px; padding: 32px; box-shadow: 0 4px 16px rgba(0, 0, 0, 0.3); border: 1px solid #292e42;">
53+
<p style="margin: 0 0 24px 0; font-size: 17px; color: #c0caf5; text-indent: 0;"><strong style="color: #7aa2f7; font-weight: 600;">标题</strong>内容:</p>
54+
<ol style="margin: 20px 0; padding-left: 2.5em; list-style-position: outside;">
55+
<li style="margin-bottom: 10px; padding-left: 8px; color: #a9b1d6; font-size: 16px;">列表项 1</li>
56+
<li style="margin-bottom: 10px; padding-left: 8px; color: #a9b1d6; font-size: 16px;">列表项 2</li>
57+
</ol>
3058
</div>
31-
```
3259

33-
**背面(Back):**
34-
- 对于列表式答案,使用 `<ol>` 或 `<ul>` 标签
35-
- 对于段落式答案,使用 `<p>` 标签,必要时使用 `<h2>` 分节
36-
- 数学公式使用 `<anki-mathjax>` 标签包裹
37-
- 公式区块使用深色背景突出显示
60+
<!-- 需要在 Styling 中添加 -->
61+
<style>
62+
ol li::marker { color: #ff9e64; font-weight: 700; }
63+
ul li::marker { color: #ff9e64; }
64+
::selection { background-color: #283457; color: #c0caf5; }
65+
</style>
3866

39-
**样式要点:**
40-
- 主容器:`background-color: #16161e; border: 1px solid #292e42`
41-
- 主文本:`color: #a9b1d6; font-size: 16px`
42-
- 强调文字:`color: #7aa2f7` (蓝色) 或 `color: #9ece6a` (绿色)
43-
- 标题:`color: #7aa2f7; border-bottom: 2px solid #292e42`
44-
- 列表标记颜色:`#ff9e64` (橙色,通过 CSS ::marker 实现)
45-
- 公式背景:`background-color: #1a1b26; border: 1px solid #292e42`
67+
<!-- Back - 复杂解释(多节) -->
68+
<div style="max-width: 800px; margin: 0 auto; background-color: #16161e; border-radius: 8px; padding: 32px; box-shadow: 0 4px 16px rgba(0, 0, 0, 0.3); border: 1px solid #292e42;">
69+
<h2 style="margin: 0 0 16px 0; font-size: 18px; color: #7aa2f7; font-weight: 600; border-bottom: 2px solid #292e42; padding-bottom: 8px;">章节标题</h2>
70+
<p style="margin: 0 0 16px 0; font-size: 16px; color: #a9b1d6; line-height: 1.8;">段落内容</p>
71+
<!-- 可添加更多章节 -->
72+
</div>
73+
```
4674

4775
### Cloze 卡片样式
4876

49-
**文本内容:**
50-
- 使用 Anki 的挖空语法:用两个大括号 + c1/c2/c3 + 两个冒号 + 内容 + 两个大括号(避免被 jinja2 解析)
51-
- 示例表示方式:`[CLOZE:c1:内容]` - AI 需将此转换为正确的 Anki 语法
52-
- 可以有多个挖空:`[CLOZE:c1:第一个]`、`[CLOZE:c2:第二个]`
77+
Cloze 卡片使用 Anki 原生的格式,不需要特殊 HTML。
78+
79+
### 配色参考
80+
81+
- **主背景**: `#16161e`
82+
- **文本主色**: `#a9b1d6`
83+
- **标题/强调**: `#7aa2f7` (蓝色)
84+
- **次要强调**: `#9ece6a` (绿色)
85+
- **列表标记**: `#ff9e64` (橙色)
86+
- **边框**: `#292e42`
87+
- **选中高亮**: `#283457`
88+
- **公式背景**: `#1a1b26`
5389

54-
**HTML 结构:**
90+
### 样式使用指南
91+
92+
1. **无序列表** (`<ul>`):使用自定义 `<span>` 作为项目符号
5593
```html
56-
<div style="max-width: 800px; margin: 0 auto; background-color: #16161e; border-radius: 8px; padding: 32px; box-shadow: 0 4px 16px rgba(0, 0, 0, 0.3); border: 1px solid #292e42;">
57-
<p style="margin: 0; font-size: 16px; color: #a9b1d6; line-height: 1.8;">
58-
包含挖空的文本内容,<strong style="color: #7aa2f7; font-weight: 600;">关键词</strong>用蓝色强调
59-
</p>
60-
</div>
94+
<li style="margin-bottom: 10px; color: #a9b1d6; font-size: 16px; padding-left: 8px; position: relative;">
95+
<span style="position: absolute; left: -1.2em; color: #ff9e64;">•</span>
96+
列表内容
97+
</li>
6198
```
6299

63-
**Cloze 样式规范:**
64-
- 如果是列表型挖空,建议使用 `<ol>` 或 `<ul>` 包裹
65-
- 段落型挖空使用 `<p>` 标签
66-
- 保持与 basic 卡片一致的 Tokyo Night 配色
100+
2. **有序列表** (`<ol>`):使用 `::marker` 伪元素(需在 Styling 中定义)
101+
102+
3. **数学公式**:使用 `<anki-mathjax>` 标签,放在深色背景框中
103+
```html
104+
<p style="margin: 0 0 16px 0; font-size: 16px; color: #c0caf5; text-align: center; padding: 16px; background-color: #1a1b26; border-radius: 4px; border: 1px solid #292e42;">
105+
<anki-mathjax>公式内容</anki-mathjax>
106+
</p>
107+
```
108+
109+
4. **分类标题**(如"经济层面"):使用绿色强调
110+
```html
111+
<strong style="color: #9ece6a; font-weight: 600;">分类名称</strong>
112+
```
67113

68114
## 输出格式
69115

70-
请严格按照以下 JSON schema 输出
116+
请以 JSON 格式输出,严格遵循以下 schema
71117

72118
{% raw %}
73119
```json
74120
{
75121
"cards": [
76122
{
77123
"type": "basic",
78-
"front": "<div style=\"max-width: 800px; margin: 0 auto; background-color: #16161e; border-radius: 8px; padding: 32px; box-shadow: 0 4px 16px rgba(0, 0, 0, 0.3); border: 1px solid #292e42;\"><p style=\"margin: 0; font-size: 18px; color: #c0caf5; line-height: 1.8;\">问题内容</p></div>",
79-
"back": "<div style=\"max-width: 800px; margin: 0 auto; background-color: #16161e; border-radius: 8px; padding: 32px; box-shadow: 0 4px 16px rgba(0, 0, 0, 0.3); border: 1px solid #292e42;\"><p style=\"margin: 0; font-size: 16px; color: #a9b1d6; line-height: 1.8;\">答案内容</p></div>\n\n<style>\n::selection {\n background-color: #283457;\n color: #c0caf5;\n}\n</style>",
80-
"tags": ["tag1", "tag2"]
124+
"front": "完整的 HTML,包含内联样式",
125+
"back": "完整的 HTML,包含内联样式",
126+
"tags": ["tag1", "tag2"],
127+
"styling": "需要放在 Anki Styling 区域的 CSS(如果有的话)"
81128
},
82129
{
83130
"type": "cloze",
84-
"text": "<div style=\"max-width: 800px; margin: 0 auto; background-color: #16161e; border-radius: 8px; padding: 32px; box-shadow: 0 4px 16px rgba(0, 0, 0, 0.3); border: 1px solid #292e42;\"><p style=\"margin: 0; font-size: 16px; color: #a9b1d6; line-height: 1.8;\">这是[CLOZE:c1:填空内容]的示例</p></div>\n\n<style>\n::selection {\n background-color: #283457;\n color: #c0caf5;\n}\n</style>",
85-
"tags": ["tag1"]
131+
"text": "这是一个{{c1::填空}}示例",
132+
"tags": ["tag1"],
133+
"styling": null
86134
}
87135
]
88136
}
89137
```
90138
{% endraw %}
91139

92-
**重要说明:**
93-
1. 所有 HTML 必须是完整的、可直接使用的
94-
2. `[CLOZE:c1:内容]` 需要转换为 Anki 的标准挖空语法(两个花括号 c1 双冒号 内容 两个花括号)
95-
3. 样式部分的 `<style>` 标签必须包含在 back 或 text 字段中
96-
4. 对于列表型内容,必须使用 `<ol>` 或 `<ul>` 标签,并添加 `::marker` 样式
97-
5. JSON 中的 HTML 需要正确转义双引号
140+
**注意:**
141+
- `styling` 字段用于存放无法内联的 CSS(如 `::marker` 伪元素),如果没有则设为 `null`
142+
- Basic 卡片的 `front` 和 `back` 必须是完整的 HTML 代码,包含所有内联样式
143+
- Cloze 卡片的 `text` 使用原生格式,不需要 HTML 包装
98144

99145
## 待处理内容
100146

0 commit comments

Comments
 (0)