Optionally allow keeping Unicode escape sequences in `stringToPDFString` (PR 17331 follow-up)

Snuffleupagus · Snuffleupagus · commit b629bafd1ce7 · 2025-04-30T20:51:10.000+02:00
Currently *some* of the links[1] on page three of the `issue19835.pdf` test-case aren't clickable, since the destination (of the LinkAnnotation) becomes empty.
The reason is that these destinations include the character `\x1b`, which is interpreted as the start of a Unicode escape sequence specifying the language of the string; please refer to section [7.9.2.2 Text String Type](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf#G6.1957385) in the PDF specification.

Hence it seems that we need a way to optionally disable that behaviour, to prevent a "badly" formatted string from becoming empty (or truncated), at least for cases where we are:
 - Parsing named destinations[2] and URLs.
 - Handling "strings" that are actually /Name-instances.
 - Building a lookup Object/Map based on some PDF data-structure.

*NOTE:* The issue that prompted this patch is obviously related to destinations, however I've gone through the `src/core/` folder and updated various other `stringToPDFString` call-sites that (directly or indirectly) fit the categories listed above.

---

[1] Try clicking on anything on the line containing "Item 7A. Quantitative and Qualitative Disclosures About Market Risk 27".

[2] Unfortunately just skipping `stringToPDFString` in this case would cause other issues, such as the named destination becoming "unusable" in the viewer; see e.g. issues 14847 and 14864.
diff --git a/src/core/catalog.js b/src/core/catalog.js
@@ -76,7 +76,7 @@ function fetchRemoteDest(action) {
       dest = dest.name;
     }
     if (typeof dest === "string") {
-      return stringToPDFString(dest);
+      return stringToPDFString(dest, /* keepEscapeSequence = */ true);
     } else if (isValidExplicitDest(dest)) {
       return JSON.stringify(dest);
     }
@@ -674,15 +674,17 @@ class Catalog {
         for (const [key, value] of obj.getAll()) {
           const dest = fetchDest(value);
           if (dest) {
-            dests[stringToPDFString(key)] = dest;
+            dests[stringToPDFString(key, /* keepEscapeSequence = */ true)] =
+              dest;
           }
         }
       } else if (obj instanceof Dict) {
         for (const [key, value] of obj) {
           const dest = fetchDest(value);
           if (dest) {
             // Always let the NameTree take precedence.
-            dests[stringToPDFString(key)] ||= dest;
+            dests[stringToPDFString(key, /* keepEscapeSequence = */ true)] ||=
+              dest;
           }
         }
       }
@@ -1046,7 +1048,8 @@ class Catalog {
       for (const [key, value] of nameTree.getAll()) {
         const fs = new FileSpec(value, this.xref);
         attachments ??= Object.create(null);
-        attachments[stringToPDFString(key)] = fs.serializable;
+        attachments[stringToPDFString(key, /* keepEscapeSequence = */ true)] =
+          fs.serializable;
       }
     }
     return shadow(this, "attachments", attachments);
@@ -1060,7 +1063,10 @@ class Catalog {
       const nameTree = new NameTree(obj.getRaw("XFAImages"), this.xref);
       for (const [key, value] of nameTree.getAll()) {
         xfaImages ??= new Dict(this.xref);
-        xfaImages.set(stringToPDFString(key), value);
+        xfaImages.set(
+          stringToPDFString(key, /* keepEscapeSequence = */ true),
+          value
+        );
       }
     }
     return shadow(this, "xfaImages", xfaImages);
@@ -1084,7 +1090,10 @@ class Catalog {
       } else if (typeof js !== "string") {
         return;
       }
-      js = stringToPDFString(js).replaceAll("\x00", "");
+      js = stringToPDFString(js, /* keepEscapeSequence = */ true).replaceAll(
+        "\x00",
+        ""
+      );
       // Skip empty entries, similar to the `_collectJS` function.
       if (js) {
         (javaScript ||= new Map()).set(name, js);
@@ -1094,7 +1103,10 @@ class Catalog {
     if (obj instanceof Dict && obj.has("JavaScript")) {
       const nameTree = new NameTree(obj.getRaw("JavaScript"), this.xref);
       for (const [key, value] of nameTree.getAll()) {
-        appendIfJavaScriptDict(stringToPDFString(key), value);
+        appendIfJavaScriptDict(
+          stringToPDFString(key, /* keepEscapeSequence = */ true),
+          value
+        );
       }
     }
     // Append OpenAction "JavaScript" actions, if any, to the JavaScript map.
@@ -1633,7 +1645,10 @@ class Catalog {
             const name = target.get("N");
 
             if (isName(relationship, "C") && typeof name === "string") {
-              attachment = docAttachments[stringToPDFString(name)];
+              attachment =
+                docAttachments[
+                  stringToPDFString(name, /* keepEscapeSequence = */ true)
+                ];
             }
           }
 
@@ -1699,7 +1714,11 @@ class Catalog {
             js = jsAction;
           }
 
-          const jsURL = js && recoverJsURL(stringToPDFString(js));
+          const jsURL =
+            js &&
+            recoverJsURL(
+              stringToPDFString(js, /* keepEscapeSequence = */ true)
+            );
           if (jsURL) {
             url = jsURL.url;
             resultObj.newWindow = jsURL.newWindow;
@@ -1735,7 +1754,10 @@ class Catalog {
         dest = dest.name;
       }
       if (typeof dest === "string") {
-        resultObj.dest = stringToPDFString(dest);
+        resultObj.dest = stringToPDFString(
+          dest,
+          /* keepEscapeSequence = */ true
+        );
       } else if (isValidExplicitDest(dest)) {
         resultObj.dest = dest;
       }
diff --git a/src/core/core_utils.js b/src/core/core_utils.js
@@ -424,7 +424,10 @@ function _collectJS(entry, xref, list, parents) {
       } else if (typeof js === "string") {
         code = js;
       }
-      code &&= stringToPDFString(code).replaceAll("\x00", "");
+      code &&= stringToPDFString(
+        code,
+        /* keepEscapeSequence = */ true
+      ).replaceAll("\x00", "");
       if (code) {
         list.push(code);
       }
diff --git a/src/core/file_spec.js b/src/core/file_spec.js
@@ -77,7 +77,7 @@ class FileSpec {
 
     const item = pickPlatformItem(this.root);
     if (item && typeof item === "string") {
-      filename = stringToPDFString(item)
+      filename = stringToPDFString(item, /* keepEscapeSequence = */ true)
         .replaceAll("\\\\", "\\")
         .replaceAll("\\/", "/")
         .replaceAll("\\", "/");
diff --git a/src/shared/util.js b/src/shared/util.js
@@ -1022,9 +1022,9 @@ const PDFStringTranslateTable = [
   0x131, 0x142, 0x153, 0x161, 0x17e, 0, 0x20ac,
 ];
 
-function stringToPDFString(str) {
+function stringToPDFString(str, keepEscapeSequence = false) {
   // See section 7.9.2.2 Text String Type.
-  // The string can contain some language codes bracketed with 0x0b,
+  // The string can contain some language codes bracketed with 0x1b,
   // so we must remove them.
   if (str[0] >= "\xEF") {
     let encoding;
@@ -1047,7 +1047,7 @@ function stringToPDFString(str) {
         const decoder = new TextDecoder(encoding, { fatal: true });
         const buffer = stringToBytes(str);
         const decoded = decoder.decode(buffer);
-        if (!decoded.includes("\x1b")) {
+        if (keepEscapeSequence || !decoded.includes("\x1b")) {
           return decoded;
         }
         return decoded.replaceAll(/\x1b[^\x1b]*(?:\x1b|$)/g, "");
@@ -1060,7 +1060,7 @@ function stringToPDFString(str) {
   const strBuf = [];
   for (let i = 0, ii = str.length; i < ii; i++) {
     const charCode = str.charCodeAt(i);
-    if (charCode === 0x1b) {
+    if (!keepEscapeSequence && charCode === 0x1b) {
       // eslint-disable-next-line no-empty
       while (++i < ii && str.charCodeAt(i) !== 0x1b) {}
       continue;
diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js
@@ -1421,6 +1421,28 @@ describe("api", function () {
       await loadingTask.destroy();
     });
 
+    it("gets a destination containing Unicode escape sequence (\x1b), from /Dests dictionary with keys using PDFDocEncoding", async function () {
+      if (isNodeJS) {
+        pending("Linked test-cases are not supported in Node.js.");
+      }
+      const loadingTask = getDocument(buildGetDocumentParams("issue19835.pdf"));
+      const pdfDoc = await loadingTask.promise;
+
+      const page3 = await pdfDoc.getPage(3);
+      const annots = await page3.getAnnotations();
+
+      const annot = annots.find(x => x.id === "55R");
+      // Sanity check to make sure that we found the "correct" annotation.
+      expect(annot.dest).toEqual(
+        "\u02d9\u0064\u002a\u0010\u000e\u0061\u00d6\u0002\u005b\u00b7\u201a\u0022\u00c5\u00da\u017e\u00bb\u00d5\u0062\u02dd\u00d1"
+      );
+
+      const dest = await pdfDoc.getDestination(annot.dest);
+      expect(dest).toEqual([28, { name: "XYZ" }, 34.0799999, 73.5199999, 0]);
+
+      await loadingTask.destroy();
+    });
+
     it("gets non-string destination", async function () {
       let numberPromise = pdfDocument.getDestination(4.3);
       let booleanPromise = pdfDocument.getDestination(true);

Original file line number	Diff line number	Diff line change
`@@ -424,7 +424,10 @@ function _collectJS(entry, xref, list, parents) {`
`424`	`424`	`} else if (typeof js === "string") {`
`425`	`425`	`code = js;`
`426`	`426`	`}`
`427`		`- code &&= stringToPDFString(code).replaceAll("\x00", "");`
	`427`	`+ code &&= stringToPDFString(`
	`428`	`+ code,`
	`429`	`+ /* keepEscapeSequence = */ true`
	`430`	`+ ).replaceAll("\x00", "");`
`428`	`431`	`if (code) {`
`429`	`432`	`list.push(code);`
`430`	`433`	`}`