Skip to content

Commit 20465b3

Browse files
committed
Fix translation pipeline corrupting HTML tags and bold headings
- Unescape entity-encoded HTML tags (&lt;code&gt; → <code>) in translated content when the same tag appears as raw HTML in the English source
- Remove bare code-fence wrapping from bold heading lines (**...**) that the translation pipeline incorrectly wraps in fenced code blocks
1 parent 8ab9fb3 commit 20465b3

File tree

2 files changed

+102
-0
lines changed

2 files changed

+102
-0
lines changed

src/languages/lib/correct-translation-content.ts

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -884,6 +884,34 @@ export function correctTranslatedContentStrings(
884884
}
885885
}
886886

887+
// Unescape HTML entity-encoded tags (`&lt;tag&gt;` → `<tag>`) that Crowdin
888+
// introduces when the English source uses inline raw HTML — e.g.
889+
// `<code><a href="...">label</a></code>` inside table `<td>` cells.
890+
// Without this fix, those tags render as literal `<code>` text on translated
891+
// pages rather than as styled code elements.
892+
// Only unescape tag names present as raw HTML in the English source to avoid
893+
// incorrectly expanding intentional `&lt;` entity sequences.
894+
if (englishContent && content.includes('&lt;')) {
895+
const englishTagNames = new Set(
896+
[...englishContent.matchAll(/<([a-z][a-z0-9]*)/gi)].map((m) => m[1].toLowerCase()),
897+
)
898+
if (englishTagNames.size > 0) {
899+
content = content.replace(
900+
/&lt;(\/?[a-z][a-z0-9]*)(\s[^<>]*?)?&gt;/gi,
901+
(match, tag: string, attrs = '') => {
902+
const baseName = tag.replace(/^\//, '').toLowerCase()
903+
return englishTagNames.has(baseName) ? `<${tag}${attrs}>` : match
904+
},
905+
)
906+
}
907+
}
908+
909+
// Remove bare code-fence wrapping from bold heading lines. Translation pipelines
910+
// sometimes wrap `**heading**` lines in bare (no-language) fenced code blocks,
911+
// causing them to render as code instead of bold text. Strip the fences and
912+
// restore the heading as plain Markdown.
913+
content = content.replace(/^```\s*\n(\*\*[^\n]+\*\*)\s*\n```/gm, '$1')
914+
887915
// Collapsed Markdown table rows — restore linebreaks between `|` cells.
888916
content = content.replaceAll(' | | ', ' |\n| ')
889917

src/languages/tests/correct-translation-content.ts

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1355,6 +1355,80 @@ describe('correctTranslatedContentStrings', () => {
13551355
expect(fix('{{%raw %}', 'es')).toBe('{% raw %}')
13561356
expect(fix('{{% raw %}', 'es')).toBe('{% raw %}')
13571357
})
1358+
1359+
test('unescapes entity-encoded HTML tags when English source has matching raw HTML', () => {
1360+
const english =
1361+
'<td><code><a href="https://github.com/actions/runner-images/blob/main/images/ubuntu/Ubuntu2404-Readme.md">ubuntu-latest</a></code></td>'
1362+
1363+
expect(fix('&lt;code&gt;ubuntu-latest&lt;/code&gt;', 'ko', english)).toBe(
1364+
'<code>ubuntu-latest</code>',
1365+
)
1366+
expect(
1367+
fix('&lt;a href="https://example.com"&gt;ubuntu-latest&lt;/a&gt;', 'ko', english),
1368+
).toBe('<a href="https://example.com">ubuntu-latest</a>')
1369+
expect(
1370+
fix(
1371+
'&lt;code&gt;&lt;a href="https://github.com/actions/runner-images/blob/main/images/ubuntu/Ubuntu2404-Readme.md"&gt;ubuntu-latest&lt;/a&gt;&lt;/code&gt;',
1372+
'ko',
1373+
english,
1374+
),
1375+
).toBe(
1376+
'<code><a href="https://github.com/actions/runner-images/blob/main/images/ubuntu/Ubuntu2404-Readme.md">ubuntu-latest</a></code>',
1377+
)
1378+
})
1379+
1380+
test('does not unescape entity-encoded tags absent from English source', () => {
1381+
const english = '<p>Simple paragraph without code elements</p>'
1382+
const input = '&lt;code&gt;text&lt;/code&gt;'
1383+
expect(fix(input, 'ko', english)).toBe(input)
1384+
})
1385+
1386+
test('does not unescape entity-encoded tags when no English content provided', () => {
1387+
const input = '&lt;code&gt;ubuntu-latest&lt;/code&gt;'
1388+
expect(fix(input, 'ko')).toBe(input)
1389+
})
1390+
1391+
test('removes bare code-fence wrapping from bold heading lines', () => {
1392+
const input = '```\n**다음은 작업을 다운로드하는 데 필요합니다.**\n```'
1393+
expect(fix(input, 'ko')).toBe('**다음은 작업을 다운로드하는 데 필요합니다.**')
1394+
})
1395+
1396+
test('removes bare code-fence wrapping from bold headings between real code blocks', () => {
1397+
const input = [
1398+
'```shell copy',
1399+
'github.com',
1400+
'api.github.com',
1401+
'```',
1402+
'',
1403+
'```',
1404+
'**다음은 작업을 다운로드하는 데 필요합니다.**',
1405+
'```',
1406+
'',
1407+
'```shell copy',
1408+
'codeload.github.com',
1409+
'```',
1410+
].join('\n')
1411+
1412+
const expected = [
1413+
'```shell copy',
1414+
'github.com',
1415+
'api.github.com',
1416+
'```',
1417+
'',
1418+
'**다음은 작업을 다운로드하는 데 필요합니다.**',
1419+
'',
1420+
'```shell copy',
1421+
'codeload.github.com',
1422+
'```',
1423+
].join('\n')
1424+
1425+
expect(fix(input, 'ko')).toBe(expected)
1426+
})
1427+
1428+
test('does not strip language-specified code fences with bold content', () => {
1429+
const input = '```shell\n**not a heading**\n```'
1430+
expect(fix(input, 'ko')).toBe(input)
1431+
})
13581432
})
13591433

13601434
// ─── EDGE CASES ────────────────────────────────────────────────────

0 commit comments

Comments
 (0)