Merge pull request #20406 from gregtatum/text-extractor

calixteman · web-flow · commit 27bb5fb17392 · 2025-10-30T17:23:18.000+01:00
Add text extractor as an external service
diff --git a/web/app.js b/web/app.js
@@ -89,6 +89,7 @@ import { PDFPrintServiceFactory } from "web-print_service";
 import { PDFRenderingQueue } from "./pdf_rendering_queue.js";
 import { PDFScriptingManager } from "./pdf_scripting_manager.js";
 import { PDFSidebar } from "web-pdf_sidebar";
+import { PdfTextExtractor } from "./pdf_text_extractor.js";
 import { PDFThumbnailViewer } from "web-pdf_thumbnail_viewer";
 import { PDFViewer } from "./pdf_viewer.js";
 import { Preferences } from "web-preferences";
@@ -129,6 +130,8 @@ const PDFViewerApplication = {
   pdfDocumentProperties: null,
   /** @type {PDFLinkService} */
   pdfLinkService: null,
+  /** @type {PdfTextExtractor|null} */
+  pdfTextExtractor: null,
   /** @type {PDFHistory} */
   pdfHistory: null,
   /** @type {PDFSidebar} */
@@ -262,6 +265,8 @@ const PDFViewerApplication = {
     }
     await this._initializeViewerComponents();
 
+    this.pdfTextExtractor = new PdfTextExtractor(this.externalServices);
+
     // Bind the various event handlers *after* the viewer has been
     // initialized, to prevent errors if an event arrives too soon.
     this.bindEvents();
@@ -1144,6 +1149,7 @@ const PDFViewerApplication = {
       this.pdfViewer.setDocument(null);
       this.pdfLinkService.setDocument(null);
       this.pdfDocumentProperties?.setDocument(null);
+      this.pdfTextExtractor?.setViewer(null);
     }
     this.pdfLinkService.externalLinkEnabled = true;
     this.store = null;
@@ -1450,6 +1456,7 @@ const PDFViewerApplication = {
 
     const pdfViewer = this.pdfViewer;
     pdfViewer.setDocument(pdfDocument);
+    this.pdfTextExtractor.setViewer(pdfViewer);
     const { firstPagePromise, onePageRendered, pagesPromise } = pdfViewer;
 
     this.pdfThumbnailViewer?.setDocument(pdfDocument);
diff --git a/web/external_services.js b/web/external_services.js
@@ -33,6 +33,8 @@ class BaseExternalServices {
 
   reportTelemetry(data) {}
 
+  reportText(data) {} // No-op by default; overridden in firefoxcom.js.
+
   /**
    * @returns {Promise<IL10n>}
    */
diff --git a/web/firefoxcom.js b/web/firefoxcom.js
@@ -645,6 +645,10 @@ class ExternalServices extends BaseExternalServices {
     FirefoxCom.request("reportTelemetry", data);
   }
 
+  reportText(data) {
+    FirefoxCom.request("reportText", data); // The return value is unused.
+  }
+
   updateEditorStates(data) {
     FirefoxCom.request("updateEditorStates", data);
   }
diff --git a/web/pdf_text_extractor.js b/web/pdf_text_extractor.js
@@ -0,0 +1,89 @@
+/* Copyright 2024 Mozilla Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * This class manages the interaction of extracting the text content of the page
+ * and passing it back to the external service.
+ */
+class PdfTextExtractor {
+  /** @type {PDFViewer} */
+  #pdfViewer;
+
+  #externalServices;
+
+  /**
+   * @type {?Promise<string>}
+   */
+  #textPromise;
+
+  #pendingRequests = new Set();
+
+  constructor(externalServices) {
+    this.#externalServices = externalServices;
+
+    window.addEventListener("requestTextContent", ({ detail }) => {
+      this.extractTextContent(detail.requestId);
+    });
+  }
+
+  /**
+   * The PDF viewer is required to get the page text.
+   *
+   * @param {PDFViewer | null}
+   */
+  setViewer(pdfViewer) {
+    this.#pdfViewer = pdfViewer;
+    if (this.#pdfViewer && this.#pendingRequests.size) {
+      // Handle any pending requests that came in while things were loading.
+      for (const pendingRequest of this.#pendingRequests) {
+        this.extractTextContent(pendingRequest);
+      }
+      this.#pendingRequests.clear();
+    }
+  }
+
+  /**
+   * Builds up all of the text from a PDF.
+   *
+   * @param {number} requestId
+   */
+  async extractTextContent(requestId) {
+    if (!this.#pdfViewer) {
+      this.#pendingRequests.add(requestId);
+      return;
+    }
+
+    if (!this.#textPromise) {
+      const textPromise = (this.#textPromise = this.#pdfViewer.getAllText());
+
+      // After the text resolves, cache the text for a little bit in case
+      // multiple consumers call it.
+      textPromise.then(() => {
+        setTimeout(() => {
+          if (this.#textPromise === textPromise) {
+            this.#textPromise = null;
+          }
+        }, 5000);
+      });
+    }
+
+    this.#externalServices.reportText({
+      text: await this.#textPromise,
+      requestId,
+    });
+  }
+}
+
+export { PdfTextExtractor };

Original file line number	Diff line number	Diff line change
`@@ -645,6 +645,10 @@ class ExternalServices extends BaseExternalServices {`
`645`	`645`	`FirefoxCom.request("reportTelemetry", data);`
`646`	`646`	`}`
`647`	`647`
	`648`	`+ reportText(data) {`
	`649`	`+ FirefoxCom.request("reportText", data);`
	`650`	`+ }`
	`651`	`+`
`648`	`652`	`updateEditorStates(data) {`
`649`	`653`	`FirefoxCom.request("updateEditorStates", data);`
`650`	`654`	`}`