Skip to content

Commit 9d81faf

Browse files
committed
Add a new internal viewer to explore the structure of PDF files.
The one from pdf.js.utils is a bit too old: a lot of bugs have been fixed in the code that parses PDF files since then. It's just an internal development tool, so it doesn't need to be perfect, but it should be good enough to be useful.
1 parent d5653a1 commit 9d81faf

7 files changed

Lines changed: 1508 additions & 11 deletions

File tree

gulpfile.mjs

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ const IMAGE_DECODERS_LEGACY_DIR = BUILD_DIR + "image_decoders-legacy/";
6565
const DEFAULT_PREFERENCES_DIR = BUILD_DIR + "default_preferences/";
6666
const MINIFIED_DIR = BUILD_DIR + "minified/";
6767
const MINIFIED_LEGACY_DIR = BUILD_DIR + "minified-legacy/";
68+
const INTERNAL_VIEWER_DIR = BUILD_DIR + "internal-viewer/";
6869
const JSDOC_BUILD_DIR = BUILD_DIR + "jsdoc/";
6970
const GH_PAGES_DIR = BUILD_DIR + "gh-pages/";
7071
const DIST_DIR = BUILD_DIR + "dist/";
@@ -2368,6 +2369,52 @@ gulp.task("check_l10n", function (done) {
23682369
});
23692370
});
23702371

2372+
/**
 * Bundle the internal viewer entry point (`web/pdf_internal_viewer.js`) with
 * webpack, emitting it as `pdf_internal_viewer.mjs` with an ES module library
 * target.
 *
 * @param {Object} defines - Build defines forwarded to `createWebpackConfig`.
 * @returns {Object} The gulp stream resulting from piping the entry file
 *   through webpack.
 */
function createInternalViewerBundle(defines) {
  const outputOptions = {
    filename: "pdf_internal_viewer.mjs",
    library: { type: "module" },
  };
  const webpackConfig = createWebpackConfig(defines, outputOptions);
  const entryFile = gulp.src("./web/pdf_internal_viewer.js", {
    encoding: false,
  });

  return entryFile.pipe(webpack2Stream(webpackConfig));
}
2383+
2384+
/**
 * Build every artifact of the internal viewer into `dir`.
 *
 * The library and worker bundles go to `<dir>build`, the viewer bundle, HTML
 * and post-processed CSS go to `<dir>web`, and the wasm files go to
 * `<dir>web/wasm`.
 *
 * @param {Object} defines - Build defines shared by all the bundles.
 * @param {string} dir - Output directory; it is concatenated directly with
 *   the sub-directory names, so it is expected to end with a slash.
 * @returns {Object} The combined stream returned by `ordered`.
 */
function buildInternalViewer(defines, dir) {
  // Remove the output of any previous build before writing new files.
  fs.rmSync(dir, { recursive: true, force: true });

  const buildDir = dir + "build";
  const webDir = dir + "web";

  const mainBundle = createMainBundle(defines).pipe(gulp.dest(buildDir));
  const workerBundle = createWorkerBundle(defines).pipe(gulp.dest(buildDir));
  const viewerBundle = createInternalViewerBundle(defines).pipe(
    gulp.dest(webDir)
  );
  const viewerHTML = preprocessHTML(
    "web/pdf_internal_viewer.html",
    defines
  ).pipe(gulp.dest(webDir));

  const rawCSS = preprocessCSS("web/pdf_internal_viewer.css", defines);
  const viewerCSS = rawCSS
    .pipe(
      postcss([
        postcssDirPseudoClass(),
        discardCommentsCSS(),
        postcssNesting(),
        postcssLightDarkFunction({ preserve: true }),
        autoprefixer(AUTOPREFIXER_CONFIG),
      ])
    )
    .pipe(gulp.dest(webDir));

  const wasmFiles = createWasmBundle().pipe(gulp.dest(webDir + "/wasm"));

  return ordered([
    mainBundle,
    workerBundle,
    viewerBundle,
    viewerHTML,
    viewerCSS,
    wasmFiles,
  ]);
}
2408+
2409+
// "internal-viewer" task: create the build number first, then build the
// internal viewer into INTERNAL_VIEWER_DIR using the GENERIC defines.
gulp.task(
  "internal-viewer",
  gulp.series(createBuildNumber, function createInternalViewer() {
    console.log("\n### Creating internal viewer");
    return buildInternalViewer(
      { ...DEFINES, GENERIC: true },
      INTERNAL_VIEWER_DIR
    );
  })
);
2417+
23712418
function ghPagesPrepare() {
23722419
console.log("\n### Creating web site");
23732420

@@ -2391,6 +2438,13 @@ function ghPagesPrepare() {
23912438
gulp
23922439
.src(JSDOC_BUILD_DIR + "**/*", { base: JSDOC_BUILD_DIR, encoding: false })
23932440
.pipe(gulp.dest(GH_PAGES_DIR + "api/draft/")),
2441+
gulp
2442+
.src(INTERNAL_VIEWER_DIR + "**/*", {
2443+
base: INTERNAL_VIEWER_DIR,
2444+
encoding: false,
2445+
removeBOM: false,
2446+
})
2447+
.pipe(gulp.dest(GH_PAGES_DIR + "internal-viewer/")),
23942448
]);
23952449
}
23962450

@@ -2442,6 +2496,7 @@ gulp.task(
24422496
gulp.series(
24432497
"generic",
24442498
"generic-legacy",
2499+
"internal-viewer",
24452500
"jsdoc",
24462501
ghPagesPrepare,
24472502
"metalsmith"

src/core/document.js

Lines changed: 230 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import {
2222
isArrayEqual,
2323
makeArr,
2424
objectSize,
25+
OPS,
2526
PageActionEventType,
2627
RenderingIntentFlag,
2728
shadow,
@@ -37,6 +38,17 @@ import {
3738
PopupAnnotation,
3839
WidgetAnnotation,
3940
} from "./annotation.js";
41+
import {
42+
Cmd,
43+
Dict,
44+
EOF,
45+
isName,
46+
isRefsEqual,
47+
Name,
48+
Ref,
49+
RefSet,
50+
RefSetCache,
51+
} from "./primitives.js";
4052
import {
4153
collectActions,
4254
getInheritableProperty,
@@ -51,27 +63,21 @@ import {
5163
XRefEntryException,
5264
XRefParseException,
5365
} from "./core_utils.js";
54-
import {
55-
Dict,
56-
isName,
57-
isRefsEqual,
58-
Name,
59-
Ref,
60-
RefSet,
61-
RefSetCache,
62-
} from "./primitives.js";
66+
import { EvaluatorPreprocessor, PartialEvaluator } from "./evaluator.js";
6367
import { getXfaFontDict, getXfaFontName } from "./xfa_fonts.js";
68+
import { Lexer, Linearization, Parser } from "./parser.js";
6469
import { NullStream, Stream } from "./stream.js";
6570
import { BaseStream } from "./base_stream.js";
6671
import { calculateMD5 } from "./calculate_md5.js";
6772
import { Catalog } from "./catalog.js";
6873
import { clearGlobalCaches } from "./cleanup_helper.js";
6974
import { DatasetReader } from "./dataset_reader.js";
7075
import { Intersector } from "./intersector.js";
71-
import { Linearization } from "./parser.js";
76+
import { LocalColorSpaceCache } from "./image_utils.js";
7277
import { ObjectLoader } from "./object_loader.js";
7378
import { OperatorList } from "./operator_list.js";
74-
import { PartialEvaluator } from "./evaluator.js";
79+
import { PDFFunctionFactory } from "./function.js";
80+
import { PDFImage } from "./image.js";
7581
import { StreamsSequenceStream } from "./decode_stream.js";
7682
import { StructTreePage } from "./struct_tree.js";
7783
import { XFAFactory } from "./xfa/factory.js";
@@ -2030,6 +2036,219 @@ class PDFDocument {
20302036
AnnotationFactory.createGlobals(this.pdfManager)
20312037
);
20322038
}
2039+
2040+
async toJSObject(value, firstCall = true) {
2041+
if (typeof PDFJSDev !== "undefined" && PDFJSDev.test("MOZCENTRAL")) {
2042+
throw new Error("Not implemented: toJSObject");
2043+
}
2044+
2045+
if (value === null && firstCall) {
2046+
return this.toJSObject(this.xref.trailer, false);
2047+
}
2048+
if (value instanceof Dict) {
2049+
const obj = Object.create(null);
2050+
const isPage = isName(value.get("Type"), "Page");
2051+
for (const [key, val] of value.getRawEntries()) {
2052+
obj[key] =
2053+
isPage && key === "Contents"
2054+
? _getContentTokens(val, this.xref)
2055+
: await this.toJSObject(val, false);
2056+
}
2057+
return obj;
2058+
}
2059+
if (Array.isArray(value)) {
2060+
return Promise.all(value.map(v => this.toJSObject(v, false)));
2061+
}
2062+
if (value instanceof Ref) {
2063+
if (firstCall) {
2064+
return this.toJSObject(this.xref.fetch(value), false);
2065+
}
2066+
const result = Object.create(null);
2067+
result.num = value.num;
2068+
result.gen = value.gen;
2069+
return result;
2070+
}
2071+
if (value instanceof BaseStream) {
2072+
const { dict } = value;
2073+
const obj = Object.create(null);
2074+
obj.dict = await this.toJSObject(dict, false);
2075+
2076+
if (
2077+
isName(dict.get("Type"), "XObject") &&
2078+
isName(dict.get("Subtype"), "Image")
2079+
) {
2080+
try {
2081+
const pdfFunctionFactory = new PDFFunctionFactory({
2082+
xref: this.xref,
2083+
isEvalSupported: this.pdfManager.evaluatorOptions.isEvalSupported,
2084+
});
2085+
const imageObj = await PDFImage.buildImage({
2086+
xref: this.xref,
2087+
res: Dict.empty,
2088+
image: value,
2089+
pdfFunctionFactory,
2090+
globalColorSpaceCache: this.catalog.globalColorSpaceCache,
2091+
localColorSpaceCache: new LocalColorSpaceCache(),
2092+
});
2093+
const imgData = await imageObj.createImageData(
2094+
/* forceRGBA = */ true,
2095+
/* isOffscreenCanvasSupported = */ false
2096+
);
2097+
obj.imageData = {
2098+
width: imgData.width,
2099+
height: imgData.height,
2100+
kind: imgData.kind,
2101+
data: imgData.data,
2102+
};
2103+
return obj;
2104+
} catch {
2105+
// Fall through to regular byte stream if image decoding fails.
2106+
}
2107+
}
2108+
2109+
if (isName(dict.get("Subtype"), "Form")) {
2110+
obj.bytes = value.getString();
2111+
value.reset();
2112+
const { instructions, cmdNames } = _groupIntoInstructions(
2113+
_tokenizeStream(value, this.xref)
2114+
);
2115+
obj.contentStream = true;
2116+
obj.instructions = instructions;
2117+
obj.cmdNames = cmdNames;
2118+
return obj;
2119+
}
2120+
2121+
obj.bytes = value.getString();
2122+
return obj;
2123+
}
2124+
return value;
2125+
}
2126+
}
2127+
2128+
/**
 * Parse `stream` object by object and convert each parsed object into its
 * plain-JS token description.
 *
 * Parsing is best-effort: it stops at EOF or at the first object that fails
 * to parse, and whatever was collected up to that point is returned. Objects
 * for which `_tokenToJSObject` returns `null` are skipped.
 *
 * @param {Object} stream - The stream handed to the `Lexer`.
 * @param {Object} xref - Cross-reference table forwarded to the `Parser`.
 * @returns {Array<Object>} The collected tokens.
 */
function _tokenizeStream(stream, xref) {
  const parser = new Parser({
    lexer: new Lexer(stream),
    xref,
    allowStreams: false,
  });
  const collected = [];

  for (;;) {
    let parsed;
    try {
      parsed = parser.getObj();
    } catch {
      // Keep what was parsed so far; this is a development tool.
      return collected;
    }
    if (parsed === EOF) {
      return collected;
    }
    const converted = _tokenToJSObject(parsed);
    if (converted !== null) {
      collected.push(converted);
    }
  }
}
2152+
2153+
/**
 * Tokenize the content stream(s) of a page's /Contents entry and group the
 * tokens into instructions.
 *
 * @param {*} contentsVal - The raw /Contents value: a single entry or an
 *   array of entries (references and/or streams).
 * @param {Object} xref - Cross-reference table used to resolve references.
 * @returns {Object} `{ contentStream: true, instructions, cmdNames,
 *   rawContents }`, where `rawContents` lists `{ num, gen }` for every entry
 *   that was a reference (even when it did not resolve to a stream).
 */
function _getContentTokens(contentsVal, xref) {
  const refs = Array.isArray(contentsVal) ? contentsVal : [contentsVal];
  const rawContents = [];
  const tokens = [];
  for (const rawRef of refs) {
    if (rawRef instanceof Ref) {
      rawContents.push({ num: rawRef.num, gen: rawRef.gen });
    }
    const stream = xref.fetchIfRef(rawRef);
    if (!(stream instanceof BaseStream)) {
      // Entries that don't resolve to a stream contribute no tokens.
      continue;
    }
    // Append one token at a time: `tokens.push(...array)` passes every
    // element as a call argument and throws a RangeError for content streams
    // with a very large number of tokens.
    for (const token of _tokenizeStream(stream, xref)) {
      tokens.push(token);
    }
  }
  const { instructions, cmdNames } = _groupIntoInstructions(tokens);
  return { contentStream: true, instructions, cmdNames, rawContents };
}
2170+
2171+
// Cache for the reverse OPS lookup (numeric operator id → OPS property
// name); it stays `null` until the first call of `_getOpsIdToName`.
let _opsIdToName = null;

/**
 * Get the reverse OPS lookup table, building it on first use.
 *
 * @returns {Object} A null-prototype object mapping each OPS id to the name
 *   of the corresponding OPS property; the same object on every call.
 */
function _getOpsIdToName() {
  if (_opsIdToName) {
    return _opsIdToName;
  }
  const reverseMap = Object.create(null);
  for (const name of Object.keys(OPS)) {
    reverseMap[OPS[name]] = name;
  }
  _opsIdToName = reverseMap;
  return reverseMap;
}
2183+
2184+
/**
 * Group a flat token list into instructions, i.e. a command together with
 * the operands that precede it.
 *
 * Operands are buffered until a "cmd" token is found:
 *  - unknown commands and commands with variable arguments take every
 *    buffered operand;
 *  - fixed-argument commands take only the last `numArgs` operands, and any
 *    earlier surplus operand is emitted as an orphan instruction
 *    (`cmd: null`, one operand each).
 * Operands left over after the last command are emitted as orphans too.
 *
 * @param {Array<Object>} tokens - Tokens as produced by `_tokenToJSObject`.
 * @returns {Object} `{ instructions, cmdNames }`, where `cmdNames` is a
 *   null-prototype object mapping each known command that was encountered to
 *   its OPS name.
 */
function _groupIntoInstructions(tokens) {
  const { opMap } = EvaluatorPreprocessor;
  const idToName = _getOpsIdToName();
  const instructions = [];
  const cmdNames = Object.create(null);
  let pending = [];

  for (const token of tokens) {
    if (token.type !== "cmd") {
      pending.push(token);
      continue;
    }
    const cmd = token.value;
    const op = opMap[cmd];
    if (op && !(cmd in cmdNames)) {
      cmdNames[cmd] = idToName[op.id];
    }
    if (op && !op.variableArgs) {
      // Fixed args: the oldest operands in excess of numArgs are orphaned.
      const surplus = Math.max(0, pending.length - op.numArgs);
      for (const orphan of pending.splice(0, surplus)) {
        instructions.push({ cmd: null, args: [orphan] });
      }
    }
    // Whatever is still buffered belongs to this command.
    instructions.push({ cmd, args: pending });
    pending = [];
  }

  for (const orphan of pending) {
    instructions.push({ cmd: null, args: [orphan] });
  }
  return { instructions, cmdNames };
}
2218+
2219+
/**
 * Describe a parsed PDF object as a plain `{ type, ... }` token.
 *
 * Arrays and dictionaries are converted recursively; the converted entries
 * of a dictionary are stored on a null-prototype object.
 *
 * @param {*} obj - A value returned by the parser.
 * @returns {Object|null} The token, or `null` when `obj` is of a kind that
 *   isn't handled here.
 */
function _tokenToJSObject(obj) {
  // Primitive values first; none of them can match the `instanceof` checks.
  switch (typeof obj) {
    case "number":
      return { type: "number", value: obj };
    case "string":
      return { type: "string", value: obj };
    case "boolean":
      return { type: "boolean", value: obj };
  }
  if (obj === null) {
    return { type: "null" };
  }

  if (obj instanceof Cmd) {
    return { type: "cmd", value: obj.cmd };
  }
  if (obj instanceof Name) {
    return { type: "name", value: obj.name };
  }
  if (obj instanceof Ref) {
    const { num, gen } = obj;
    return { type: "ref", num, gen };
  }
  if (Array.isArray(obj)) {
    const items = obj.map(_tokenToJSObject);
    return { type: "array", value: items };
  }
  if (obj instanceof Dict) {
    const entries = Object.create(null);
    for (const [key, rawValue] of obj.getRawEntries()) {
      entries[key] = _tokenToJSObject(rawValue);
    }
    return { type: "dict", value: entries };
  }
  return null;
}
20342253

20352254
export { Page, PDFDocument };

src/core/worker.js

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -952,6 +952,22 @@ class WorkerMessageHandler {
952952
return pdfManager.fontFallback(data.id, handler);
953953
});
954954

955+
// Internal-viewer message: convert part of the document into a plain JS
// object via `PDFDocument.toJSObject`.
//  - `page` (1-based) takes precedence: the second element of the
//    `getPageDict` result is converted (presumably the page's reference —
//    confirm against `Catalog.getPageDict`).
//  - otherwise `ref` is used, given either as a string or as `{ num, gen }`.
//  - with neither, `null` is passed on to `toJSObject`.
handler.on("GetRawData", async function ({ ref, page }) {
  if (typeof PDFJSDev !== "undefined" && PDFJSDev.test("MOZCENTRAL")) {
    throw new Error("Not implemented: GetRawData");
  }

  let target = null;
  if (page >= 1) {
    const pageEntry = await pdfManager.ensureCatalog("getPageDict", [
      page - 1,
    ]);
    target = pageEntry[1];
  } else if (ref) {
    if (typeof ref === "string") {
      target = Ref.fromString(ref);
    } else {
      target = Ref.get(ref.num, ref.gen);
    }
  }
  return pdfManager.ensureDoc("toJSObject", [target]);
});
970+
955971
handler.on("Cleanup", function (data) {
956972
return pdfManager.cleanup(/* manuallyTriggered = */ true);
957973
});

0 commit comments

Comments
 (0)