Skip to content

Commit c70ff5a

Browse files
committed
Improve and simplify the PdfTextExtractor implementation
Working on PR 20784, I couldn't help noticing that this code can be improved a little bit. - Only initialize `PdfTextExtractor` in development mode and MOZCENTRAL builds, since it's unused elsewhere. - Re-factor how `PdfTextExtractor` waits for the viewer to be available/ready, by using existing (internal) events. This simplifies the `PdfTextExtractor` class and removes its `setViewer` method, which improves general consistency since normally the viewer-components don't use such a method in that way (here it was effectively used as a stand-in for a `setDocument` method). - Finally, while slightly unrelated, rename the `#getAllTextInProgress` field in the `PDFViewer` class to `#copyAllInProgress` to clearly indicate what it's for since the `getAllText` method is used more generally now.
1 parent ae507c4 commit c70ff5a

3 files changed

Lines changed: 32 additions & 38 deletions

File tree

web/app.js

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -264,8 +264,6 @@ const PDFViewerApplication = {
264264
}
265265
await this._initializeViewerComponents();
266266

267-
this.pdfTextExtractor = new PdfTextExtractor(this.externalServices);
268-
269267
// Bind the various event handlers *after* the viewer has been
270268
// initialized, to prevent errors if an event arrives too soon.
271269
this.bindEvents();
@@ -779,6 +777,17 @@ const PDFViewerApplication = {
779777
);
780778
};
781779
}
780+
781+
if (
782+
typeof PDFJSDev === "undefined" ||
783+
PDFJSDev.test("TESTING || MOZCENTRAL")
784+
) {
785+
this.pdfTextExtractor = new PdfTextExtractor(
786+
externalServices,
787+
pdfViewer,
788+
eventBus
789+
);
790+
}
782791
},
783792

784793
async run(config) {
@@ -1151,7 +1160,6 @@ const PDFViewerApplication = {
11511160
this.pdfViewer.setDocument(null);
11521161
this.pdfLinkService.setDocument(null);
11531162
this.pdfDocumentProperties?.setDocument(null);
1154-
this.pdfTextExtractor?.setViewer(null);
11551163
}
11561164
this.pdfLinkService.externalLinkEnabled = true;
11571165
this.store = null;
@@ -1455,7 +1463,6 @@ const PDFViewerApplication = {
14551463

14561464
const pdfViewer = this.pdfViewer;
14571465
pdfViewer.setDocument(pdfDocument);
1458-
this.pdfTextExtractor.setViewer(pdfViewer);
14591466
const { firstPagePromise, onePageRendered, pagesPromise } = pdfViewer;
14601467

14611468
this.pdfThumbnailViewer?.setDocument(pdfDocument);

web/pdf_text_extractor.js

Lines changed: 17 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -18,55 +18,42 @@
1818
* and passing it back to the external service.
1919
*/
2020
class PdfTextExtractor {
21-
/** @type {PDFViewer} */
22-
#pdfViewer;
23-
21+
/** @type {BaseExternalServices} */
2422
#externalServices;
2523

26-
/**
27-
* @type {?Promise<string>}
28-
*/
24+
/** @type {?Promise<string>} */
2925
#textPromise;
3026

31-
#pendingRequests = new Set();
27+
#capability = Promise.withResolvers();
3228

33-
constructor(externalServices) {
29+
constructor(externalServices, pdfViewer, eventBus) {
3430
this.#externalServices = externalServices;
3531

32+
eventBus._on("pagesinit", () => {
33+
this.#capability.resolve(pdfViewer);
34+
});
35+
eventBus._on("pagesdestroy", () => {
36+
this.#capability.reject(new Error("pagesdestroy"));
37+
this.#textPromise = null;
38+
39+
this.#capability = Promise.withResolvers();
40+
});
41+
3642
window.addEventListener("requestTextContent", ({ detail }) => {
3743
this.extractTextContent(detail.requestId);
3844
});
3945
}
4046

41-
/**
42-
* The PDF viewer is required to get the page text.
43-
*
44-
* @param {PDFViewer | null}
45-
*/
46-
setViewer(pdfViewer) {
47-
this.#pdfViewer = pdfViewer;
48-
if (this.#pdfViewer && this.#pendingRequests.size) {
49-
// Handle any pending requests that came in while things were loading.
50-
for (const pendingRequest of this.#pendingRequests) {
51-
this.extractTextContent(pendingRequest);
52-
}
53-
this.#pendingRequests.clear();
54-
}
55-
}
56-
5747
/**
5848
* Builds up all of the text from a PDF.
5949
*
6050
* @param {number} requestId
6151
*/
6252
async extractTextContent(requestId) {
63-
if (!this.#pdfViewer) {
64-
this.#pendingRequests.add(requestId);
65-
return;
66-
}
67-
6853
if (!this.#textPromise) {
69-
const textPromise = (this.#textPromise = this.#pdfViewer.getAllText());
54+
const textPromise = (this.#textPromise = this.#capability.promise.then(
55+
pdfViewer => pdfViewer.getAllText()
56+
));
7057

7158
// After the text resolves, cache the text for a little bit in case
7259
// multiple consumers call it.

web/pdf_viewer.js

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ class PDFViewer {
268268

269269
#switchAnnotationEditorModeTimeoutId = null;
270270

271-
#getAllTextInProgress = false;
271+
#copyAllInProgress = false;
272272

273273
#hiddenCopyElement = null;
274274

@@ -816,13 +816,13 @@ class PDFViewer {
816816
// has been selected.
817817

818818
if (
819-
this.#getAllTextInProgress ||
819+
this.#copyAllInProgress ||
820820
textLayerMode === TextLayerMode.ENABLE_PERMISSIONS
821821
) {
822822
stopEvent(event);
823823
return;
824824
}
825-
this.#getAllTextInProgress = true;
825+
this.#copyAllInProgress = true;
826826

827827
// TODO: if all the pages are rendered we don't need to wait for
828828
// getAllText and we could just get text from the Selection object.
@@ -855,7 +855,7 @@ class PDFViewer {
855855
);
856856
})
857857
.finally(() => {
858-
this.#getAllTextInProgress = false;
858+
this.#copyAllInProgress = false;
859859
keydownAC.abort();
860860
classList.remove("copyAll");
861861
});

0 commit comments

Comments
 (0)