Skip to content

Commit b629baf

Browse files
committed
Allow optionally keeping Unicode escape sequences in stringToPDFString (PR 17331 follow-up)
Currently *some* of the links[1] on page three of the `issue19835.pdf` test-case aren't clickable, since the destination (of the LinkAnnotation) becomes empty.
The reason is that these destinations include the character `\x1b`, which is interpreted as the start of a Unicode escape sequence specifying the language of the string; please refer to section [7.9.2.2 Text String Type](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf#G6.1957385) in the PDF specification.

Hence it seems that we need a way to optionally disable that behaviour, to prevent a "badly" formatted string from becoming empty (or truncated), at least for cases where we are:
 - Parsing named destinations[2] and URLs.
 - Handling "strings" that are actually /Name-instances.
 - Building a lookup Object/Map based on some PDF data-structure.

*NOTE:* The issue that prompted this patch is obviously related to destinations, however I've gone through the `src/core/` folder and updated various other `stringToPDFString` call-sites that (directly or indirectly) fit the categories listed above.

---

[1] Try clicking on anything on the line containing "Item 7A. Quantitative and Qualitative Disclosures About Market Risk 27".

[2] Unfortunately just skipping `stringToPDFString` in this case would cause other issues, such as the named destination becoming "unusable" in the viewer; see e.g. issues 14847 and 14864.
1 parent 254431d commit b629baf

5 files changed

Lines changed: 63 additions & 16 deletions

File tree

src/core/catalog.js

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ function fetchRemoteDest(action) {
7676
dest = dest.name;
7777
}
7878
if (typeof dest === "string") {
79-
return stringToPDFString(dest);
79+
return stringToPDFString(dest, /* keepEscapeSequence = */ true);
8080
} else if (isValidExplicitDest(dest)) {
8181
return JSON.stringify(dest);
8282
}
@@ -674,15 +674,17 @@ class Catalog {
674674
for (const [key, value] of obj.getAll()) {
675675
const dest = fetchDest(value);
676676
if (dest) {
677-
dests[stringToPDFString(key)] = dest;
677+
dests[stringToPDFString(key, /* keepEscapeSequence = */ true)] =
678+
dest;
678679
}
679680
}
680681
} else if (obj instanceof Dict) {
681682
for (const [key, value] of obj) {
682683
const dest = fetchDest(value);
683684
if (dest) {
684685
// Always let the NameTree take precedence.
685-
dests[stringToPDFString(key)] ||= dest;
686+
dests[stringToPDFString(key, /* keepEscapeSequence = */ true)] ||=
687+
dest;
686688
}
687689
}
688690
}
@@ -1046,7 +1048,8 @@ class Catalog {
10461048
for (const [key, value] of nameTree.getAll()) {
10471049
const fs = new FileSpec(value, this.xref);
10481050
attachments ??= Object.create(null);
1049-
attachments[stringToPDFString(key)] = fs.serializable;
1051+
attachments[stringToPDFString(key, /* keepEscapeSequence = */ true)] =
1052+
fs.serializable;
10501053
}
10511054
}
10521055
return shadow(this, "attachments", attachments);
@@ -1060,7 +1063,10 @@ class Catalog {
10601063
const nameTree = new NameTree(obj.getRaw("XFAImages"), this.xref);
10611064
for (const [key, value] of nameTree.getAll()) {
10621065
xfaImages ??= new Dict(this.xref);
1063-
xfaImages.set(stringToPDFString(key), value);
1066+
xfaImages.set(
1067+
stringToPDFString(key, /* keepEscapeSequence = */ true),
1068+
value
1069+
);
10641070
}
10651071
}
10661072
return shadow(this, "xfaImages", xfaImages);
@@ -1084,7 +1090,10 @@ class Catalog {
10841090
} else if (typeof js !== "string") {
10851091
return;
10861092
}
1087-
js = stringToPDFString(js).replaceAll("\x00", "");
1093+
js = stringToPDFString(js, /* keepEscapeSequence = */ true).replaceAll(
1094+
"\x00",
1095+
""
1096+
);
10881097
// Skip empty entries, similar to the `_collectJS` function.
10891098
if (js) {
10901099
(javaScript ||= new Map()).set(name, js);
@@ -1094,7 +1103,10 @@ class Catalog {
10941103
if (obj instanceof Dict && obj.has("JavaScript")) {
10951104
const nameTree = new NameTree(obj.getRaw("JavaScript"), this.xref);
10961105
for (const [key, value] of nameTree.getAll()) {
1097-
appendIfJavaScriptDict(stringToPDFString(key), value);
1106+
appendIfJavaScriptDict(
1107+
stringToPDFString(key, /* keepEscapeSequence = */ true),
1108+
value
1109+
);
10981110
}
10991111
}
11001112
// Append OpenAction "JavaScript" actions, if any, to the JavaScript map.
@@ -1633,7 +1645,10 @@ class Catalog {
16331645
const name = target.get("N");
16341646

16351647
if (isName(relationship, "C") && typeof name === "string") {
1636-
attachment = docAttachments[stringToPDFString(name)];
1648+
attachment =
1649+
docAttachments[
1650+
stringToPDFString(name, /* keepEscapeSequence = */ true)
1651+
];
16371652
}
16381653
}
16391654

@@ -1699,7 +1714,11 @@ class Catalog {
16991714
js = jsAction;
17001715
}
17011716

1702-
const jsURL = js && recoverJsURL(stringToPDFString(js));
1717+
const jsURL =
1718+
js &&
1719+
recoverJsURL(
1720+
stringToPDFString(js, /* keepEscapeSequence = */ true)
1721+
);
17031722
if (jsURL) {
17041723
url = jsURL.url;
17051724
resultObj.newWindow = jsURL.newWindow;
@@ -1735,7 +1754,10 @@ class Catalog {
17351754
dest = dest.name;
17361755
}
17371756
if (typeof dest === "string") {
1738-
resultObj.dest = stringToPDFString(dest);
1757+
resultObj.dest = stringToPDFString(
1758+
dest,
1759+
/* keepEscapeSequence = */ true
1760+
);
17391761
} else if (isValidExplicitDest(dest)) {
17401762
resultObj.dest = dest;
17411763
}

src/core/core_utils.js

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -424,7 +424,10 @@ function _collectJS(entry, xref, list, parents) {
424424
} else if (typeof js === "string") {
425425
code = js;
426426
}
427-
code &&= stringToPDFString(code).replaceAll("\x00", "");
427+
code &&= stringToPDFString(
428+
code,
429+
/* keepEscapeSequence = */ true
430+
).replaceAll("\x00", "");
428431
if (code) {
429432
list.push(code);
430433
}

src/core/file_spec.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ class FileSpec {
7777

7878
const item = pickPlatformItem(this.root);
7979
if (item && typeof item === "string") {
80-
filename = stringToPDFString(item)
80+
filename = stringToPDFString(item, /* keepEscapeSequence = */ true)
8181
.replaceAll("\\\\", "\\")
8282
.replaceAll("\\/", "/")
8383
.replaceAll("\\", "/");

src/shared/util.js

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1022,9 +1022,9 @@ const PDFStringTranslateTable = [
10221022
0x131, 0x142, 0x153, 0x161, 0x17e, 0, 0x20ac,
10231023
];
10241024

1025-
function stringToPDFString(str) {
1025+
function stringToPDFString(str, keepEscapeSequence = false) {
10261026
// See section 7.9.2.2 Text String Type.
1027-
// The string can contain some language codes bracketed with 0x0b,
1027+
// The string can contain some language codes bracketed with 0x1b,
10281028
// so we must remove them.
10291029
if (str[0] >= "\xEF") {
10301030
let encoding;
@@ -1047,7 +1047,7 @@ function stringToPDFString(str) {
10471047
const decoder = new TextDecoder(encoding, { fatal: true });
10481048
const buffer = stringToBytes(str);
10491049
const decoded = decoder.decode(buffer);
1050-
if (!decoded.includes("\x1b")) {
1050+
if (keepEscapeSequence || !decoded.includes("\x1b")) {
10511051
return decoded;
10521052
}
10531053
return decoded.replaceAll(/\x1b[^\x1b]*(?:\x1b|$)/g, "");
@@ -1060,7 +1060,7 @@ function stringToPDFString(str) {
10601060
const strBuf = [];
10611061
for (let i = 0, ii = str.length; i < ii; i++) {
10621062
const charCode = str.charCodeAt(i);
1063-
if (charCode === 0x1b) {
1063+
if (!keepEscapeSequence && charCode === 0x1b) {
10641064
// eslint-disable-next-line no-empty
10651065
while (++i < ii && str.charCodeAt(i) !== 0x1b) {}
10661066
continue;

test/unit/api_spec.js

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1421,6 +1421,28 @@ describe("api", function () {
14211421
await loadingTask.destroy();
14221422
});
14231423

1424+
it("gets a destination containing Unicode escape sequence (\x1b), from /Dests dictionary with keys using PDFDocEncoding", async function () {
1425+
if (isNodeJS) {
1426+
pending("Linked test-cases are not supported in Node.js.");
1427+
}
1428+
const loadingTask = getDocument(buildGetDocumentParams("issue19835.pdf"));
1429+
const pdfDoc = await loadingTask.promise;
1430+
1431+
const page3 = await pdfDoc.getPage(3);
1432+
const annots = await page3.getAnnotations();
1433+
1434+
const annot = annots.find(x => x.id === "55R");
1435+
// Sanity check to make sure that we found the "correct" annotation.
1436+
expect(annot.dest).toEqual(
1437+
"\u02d9\u0064\u002a\u0010\u000e\u0061\u00d6\u0002\u005b\u00b7\u201a\u0022\u00c5\u00da\u017e\u00bb\u00d5\u0062\u02dd\u00d1"
1438+
);
1439+
1440+
const dest = await pdfDoc.getDestination(annot.dest);
1441+
expect(dest).toEqual([28, { name: "XYZ" }, 34.0799999, 73.5199999, 0]);
1442+
1443+
await loadingTask.destroy();
1444+
});
1445+
14241446
it("gets non-string destination", async function () {
14251447
let numberPromise = pdfDocument.getDestination(4.3);
14261448
let booleanPromise = pdfDocument.getDestination(true);

0 commit comments

Comments
 (0)