Skip to content

Commit 44a6354

Browse files
Merge pull request #20831 from calixteman/internal_viewer
Add a new internal viewer to explore the structure of PDF files.
2 parents 15e58f3 + 9d81faf commit 44a6354

7 files changed

Lines changed: 1508 additions & 11 deletions

File tree

gulpfile.mjs

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ const IMAGE_DECODERS_LEGACY_DIR = BUILD_DIR + "image_decoders-legacy/";
6565
const DEFAULT_PREFERENCES_DIR = BUILD_DIR + "default_preferences/";
6666
const MINIFIED_DIR = BUILD_DIR + "minified/";
6767
const MINIFIED_LEGACY_DIR = BUILD_DIR + "minified-legacy/";
68+
const INTERNAL_VIEWER_DIR = BUILD_DIR + "internal-viewer/";
6869
const JSDOC_BUILD_DIR = BUILD_DIR + "jsdoc/";
6970
const GH_PAGES_DIR = BUILD_DIR + "gh-pages/";
7071
const DIST_DIR = BUILD_DIR + "dist/";
@@ -2368,6 +2369,52 @@ gulp.task("check_l10n", function (done) {
23682369
});
23692370
});
23702371

2372+
/**
 * Creates the stream that bundles the internal viewer entry point
 * (`./web/pdf_internal_viewer.js`) into `pdf_internal_viewer.mjs`, using a
 * webpack configuration whose library type is "module".
 *
 * @param {Object} defines - Build flags forwarded to `createWebpackConfig`.
 * @returns {Object} The source stream piped through `webpack2Stream`.
 */
function createInternalViewerBundle(defines) {
  const outputOptions = {
    filename: "pdf_internal_viewer.mjs",
    library: { type: "module" },
  };
  const webpackConfig = createWebpackConfig(defines, outputOptions);
  const entryStream = gulp.src("./web/pdf_internal_viewer.js", {
    encoding: false,
  });
  return entryStream.pipe(webpack2Stream(webpackConfig));
}
2383+
2384+
/**
 * Builds every artifact of the internal viewer into `dir`: the main and
 * worker bundles (under `build`), the viewer bundle, HTML and post-processed
 * CSS (under `web`), and the wasm bundle (under `web/wasm`).
 *
 * @param {Object} defines - Build flags forwarded to the bundle/preprocess
 *   helpers.
 * @param {string} dir - Output directory prefix; the sub-directory names are
 *   appended to it directly.
 * @returns {Object} The combined stream returned by `ordered`.
 */
function buildInternalViewer(defines, dir) {
  // Always start from an empty output directory.
  fs.rmSync(dir, { recursive: true, force: true });

  const buildDir = dir + "build";
  const webDir = dir + "web";

  const mainBundle = createMainBundle(defines).pipe(gulp.dest(buildDir));
  const workerBundle = createWorkerBundle(defines).pipe(gulp.dest(buildDir));
  const viewerBundle = createInternalViewerBundle(defines).pipe(
    gulp.dest(webDir)
  );
  const viewerHtml = preprocessHTML(
    "web/pdf_internal_viewer.html",
    defines
  ).pipe(gulp.dest(webDir));

  const rawCss = preprocessCSS("web/pdf_internal_viewer.css", defines);
  const cssPlugins = [
    postcssDirPseudoClass(),
    discardCommentsCSS(),
    postcssNesting(),
    postcssLightDarkFunction({ preserve: true }),
    autoprefixer(AUTOPREFIXER_CONFIG),
  ];
  const viewerCss = rawCss.pipe(postcss(cssPlugins)).pipe(gulp.dest(webDir));

  const wasmBundle = createWasmBundle().pipe(gulp.dest(dir + "web/wasm"));

  return ordered([
    mainBundle,
    workerBundle,
    viewerBundle,
    viewerHtml,
    viewerCss,
    wasmBundle,
  ]);
}
2408+
2409+
// Registers the "internal-viewer" task: runs `createBuildNumber` first, then
// builds the internal viewer into INTERNAL_VIEWER_DIR with the GENERIC flag
// set on top of the default DEFINES.
gulp.task(
  "internal-viewer",
  gulp.series(createBuildNumber, function createInternalViewer() {
    console.log("\n### Creating internal viewer");
    return buildInternalViewer(
      { ...DEFINES, GENERIC: true },
      INTERNAL_VIEWER_DIR
    );
  })
);
2417+
23712418
function ghPagesPrepare() {
23722419
console.log("\n### Creating web site");
23732420

@@ -2391,6 +2438,13 @@ function ghPagesPrepare() {
23912438
gulp
23922439
.src(JSDOC_BUILD_DIR + "**/*", { base: JSDOC_BUILD_DIR, encoding: false })
23932440
.pipe(gulp.dest(GH_PAGES_DIR + "api/draft/")),
2441+
gulp
2442+
.src(INTERNAL_VIEWER_DIR + "**/*", {
2443+
base: INTERNAL_VIEWER_DIR,
2444+
encoding: false,
2445+
removeBOM: false,
2446+
})
2447+
.pipe(gulp.dest(GH_PAGES_DIR + "internal-viewer/")),
23942448
]);
23952449
}
23962450

@@ -2442,6 +2496,7 @@ gulp.task(
24422496
gulp.series(
24432497
"generic",
24442498
"generic-legacy",
2499+
"internal-viewer",
24452500
"jsdoc",
24462501
ghPagesPrepare,
24472502
"metalsmith"

src/core/document.js

Lines changed: 230 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import {
2222
isArrayEqual,
2323
makeArr,
2424
objectSize,
25+
OPS,
2526
PageActionEventType,
2627
RenderingIntentFlag,
2728
shadow,
@@ -37,6 +38,17 @@ import {
3738
PopupAnnotation,
3839
WidgetAnnotation,
3940
} from "./annotation.js";
41+
import {
42+
Cmd,
43+
Dict,
44+
EOF,
45+
isName,
46+
isRefsEqual,
47+
Name,
48+
Ref,
49+
RefSet,
50+
RefSetCache,
51+
} from "./primitives.js";
4052
import {
4153
collectActions,
4254
getInheritableProperty,
@@ -51,27 +63,21 @@ import {
5163
XRefEntryException,
5264
XRefParseException,
5365
} from "./core_utils.js";
54-
import {
55-
Dict,
56-
isName,
57-
isRefsEqual,
58-
Name,
59-
Ref,
60-
RefSet,
61-
RefSetCache,
62-
} from "./primitives.js";
66+
import { EvaluatorPreprocessor, PartialEvaluator } from "./evaluator.js";
6367
import { getXfaFontDict, getXfaFontName } from "./xfa_fonts.js";
68+
import { Lexer, Linearization, Parser } from "./parser.js";
6469
import { NullStream, Stream } from "./stream.js";
6570
import { BaseStream } from "./base_stream.js";
6671
import { calculateMD5 } from "./calculate_md5.js";
6772
import { Catalog } from "./catalog.js";
6873
import { clearGlobalCaches } from "./cleanup_helper.js";
6974
import { DatasetReader } from "./dataset_reader.js";
7075
import { Intersector } from "./intersector.js";
71-
import { Linearization } from "./parser.js";
76+
import { LocalColorSpaceCache } from "./image_utils.js";
7277
import { ObjectLoader } from "./object_loader.js";
7378
import { OperatorList } from "./operator_list.js";
74-
import { PartialEvaluator } from "./evaluator.js";
79+
import { PDFFunctionFactory } from "./function.js";
80+
import { PDFImage } from "./image.js";
7581
import { StreamsSequenceStream } from "./decode_stream.js";
7682
import { StructTreePage } from "./struct_tree.js";
7783
import { XFAFactory } from "./xfa/factory.js";
@@ -2030,6 +2036,219 @@ class PDFDocument {
20302036
AnnotationFactory.createGlobals(this.pdfManager)
20312037
);
20322038
}
2039+
2040+
async toJSObject(value, firstCall = true) {
2041+
if (typeof PDFJSDev !== "undefined" && PDFJSDev.test("MOZCENTRAL")) {
2042+
throw new Error("Not implemented: toJSObject");
2043+
}
2044+
2045+
if (value === null && firstCall) {
2046+
return this.toJSObject(this.xref.trailer, false);
2047+
}
2048+
if (value instanceof Dict) {
2049+
const obj = Object.create(null);
2050+
const isPage = isName(value.get("Type"), "Page");
2051+
for (const [key, val] of value.getRawEntries()) {
2052+
obj[key] =
2053+
isPage && key === "Contents"
2054+
? _getContentTokens(val, this.xref)
2055+
: await this.toJSObject(val, false);
2056+
}
2057+
return obj;
2058+
}
2059+
if (Array.isArray(value)) {
2060+
return Promise.all(value.map(v => this.toJSObject(v, false)));
2061+
}
2062+
if (value instanceof Ref) {
2063+
if (firstCall) {
2064+
return this.toJSObject(this.xref.fetch(value), false);
2065+
}
2066+
const result = Object.create(null);
2067+
result.num = value.num;
2068+
result.gen = value.gen;
2069+
return result;
2070+
}
2071+
if (value instanceof BaseStream) {
2072+
const { dict } = value;
2073+
const obj = Object.create(null);
2074+
obj.dict = await this.toJSObject(dict, false);
2075+
2076+
if (
2077+
isName(dict.get("Type"), "XObject") &&
2078+
isName(dict.get("Subtype"), "Image")
2079+
) {
2080+
try {
2081+
const pdfFunctionFactory = new PDFFunctionFactory({
2082+
xref: this.xref,
2083+
isEvalSupported: this.pdfManager.evaluatorOptions.isEvalSupported,
2084+
});
2085+
const imageObj = await PDFImage.buildImage({
2086+
xref: this.xref,
2087+
res: Dict.empty,
2088+
image: value,
2089+
pdfFunctionFactory,
2090+
globalColorSpaceCache: this.catalog.globalColorSpaceCache,
2091+
localColorSpaceCache: new LocalColorSpaceCache(),
2092+
});
2093+
const imgData = await imageObj.createImageData(
2094+
/* forceRGBA = */ true,
2095+
/* isOffscreenCanvasSupported = */ false
2096+
);
2097+
obj.imageData = {
2098+
width: imgData.width,
2099+
height: imgData.height,
2100+
kind: imgData.kind,
2101+
data: imgData.data,
2102+
};
2103+
return obj;
2104+
} catch {
2105+
// Fall through to regular byte stream if image decoding fails.
2106+
}
2107+
}
2108+
2109+
if (isName(dict.get("Subtype"), "Form")) {
2110+
obj.bytes = value.getString();
2111+
value.reset();
2112+
const { instructions, cmdNames } = _groupIntoInstructions(
2113+
_tokenizeStream(value, this.xref)
2114+
);
2115+
obj.contentStream = true;
2116+
obj.instructions = instructions;
2117+
obj.cmdNames = cmdNames;
2118+
return obj;
2119+
}
2120+
2121+
obj.bytes = value.getString();
2122+
return obj;
2123+
}
2124+
return value;
2125+
}
2126+
}
2127+
2128+
/**
 * Reads objects from `stream` with a `Parser` (streams disallowed) and
 * converts each one with `_tokenToJSObject`, until `EOF` is reached or the
 * parser throws.
 *
 * @param {BaseStream} stream - The stream to tokenize.
 * @param {Object} xref - Cross-reference table handed to the parser.
 * @returns {Array<Object>} The converted tokens collected so far; objects
 *   for which `_tokenToJSObject` returns `null` are left out.
 */
function _tokenizeStream(stream, xref) {
  const lexer = new Lexer(stream);
  const parser = new Parser({ lexer, xref, allowStreams: false });
  const collected = [];

  for (;;) {
    let parsed;
    try {
      parsed = parser.getObj();
    } catch {
      // Best effort: any parser error ends tokenization and the tokens
      // gathered up to this point are returned.
      // NOTE(review): this also hides errors unrelated to malformed data;
      // confirm that is intended.
      return collected;
    }
    if (parsed === EOF) {
      return collected;
    }
    const converted = _tokenToJSObject(parsed);
    if (converted !== null) {
      collected.push(converted);
    }
  }
}
2152+
2153+
/**
 * Tokenizes the content stream(s) named by a page's `Contents` entry and
 * groups the tokens into instructions.
 *
 * @param {*} contentsVal - The raw `Contents` value: a single entry or an
 *   array of entries (typically references).
 * @param {Object} xref - Cross-reference table used to resolve references.
 * @returns {Object} `{ contentStream: true, instructions, cmdNames,
 *   rawContents }`, where `rawContents` lists `{ num, gen }` for every entry
 *   that is a `Ref`, including those that do not resolve to a stream.
 */
function _getContentTokens(contentsVal, xref) {
  const refs = Array.isArray(contentsVal) ? contentsVal : [contentsVal];
  const rawContents = [];
  const tokens = [];
  for (const rawRef of refs) {
    if (rawRef instanceof Ref) {
      rawContents.push({ num: rawRef.num, gen: rawRef.gen });
    }
    const stream = xref.fetchIfRef(rawRef);
    if (!(stream instanceof BaseStream)) {
      // Entries that do not resolve to a stream contribute no tokens.
      continue;
    }
    // Append one token at a time: spreading the whole token array into
    // `push(...)` passes every token as a call argument, which can exceed
    // the engine's argument limit (RangeError) for large content streams.
    for (const token of _tokenizeStream(stream, xref)) {
      tokens.push(token);
    }
  }
  const { instructions, cmdNames } = _groupIntoInstructions(tokens);
  return { contentStream: true, instructions, cmdNames, rawContents };
}
2170+
2171+
// Cache for the reverse lookup table of OPS (numeric id → property name);
// stays `null` until `_getOpsIdToName` is called for the first time.
let _opsIdToName = null;

/**
 * Returns the reverse lookup table of `OPS`, building it on first use.
 *
 * @returns {Object} A prototype-less object mapping each OPS id to the name
 *   of the OPS property holding it.
 */
function _getOpsIdToName() {
  if (_opsIdToName) {
    return _opsIdToName;
  }
  const reverse = Object.create(null);
  for (const name of Object.keys(OPS)) {
    reverse[OPS[name]] = name;
  }
  _opsIdToName = reverse;
  return reverse;
}
2183+
2184+
/**
 * Groups a flat token list into instructions, attaching to each command
 * token the operand tokens that precede it.
 *
 * For a command found in `EvaluatorPreprocessor.opMap` with a fixed argument
 * count, only the last `numArgs` pending operands are attached; earlier ones
 * are emitted as standalone instructions with `cmd: null`. Unknown commands
 * and commands with variable arguments take all pending operands. Operands
 * left after the last command are also emitted with `cmd: null`.
 *
 * @param {Array<Object>} tokens - Tokens as produced by `_tokenToJSObject`.
 * @returns {Object} `{ instructions, cmdNames }`, where `cmdNames` maps each
 *   known command string encountered to its OPS property name.
 */
function _groupIntoInstructions(tokens) {
  const knownOps = EvaluatorPreprocessor.opMap;
  const opsIdToName = _getOpsIdToName();
  const instructions = [];
  const cmdNames = Object.create(null);
  let pending = [];

  const emitOrphan = operand => {
    instructions.push({ cmd: null, args: [operand] });
  };

  for (const token of tokens) {
    if (token.type !== "cmd") {
      pending.push(token);
      continue;
    }
    const cmd = token.value;
    const op = knownOps[cmd];
    if (op && !(cmd in cmdNames)) {
      cmdNames[cmd] = opsIdToName[op.id];
    }
    if (op && !op.variableArgs) {
      // Fixed argument count: operands beyond `numArgs` are orphaned, in
      // their original order, ahead of the command itself.
      const surplus = pending.length - op.numArgs;
      if (surplus > 0) {
        for (const operand of pending.splice(0, surplus)) {
          emitOrphan(operand);
        }
      }
    }
    // Whatever is still pending belongs to this command.
    instructions.push({ cmd, args: pending });
    pending = [];
  }

  for (const operand of pending) {
    emitOrphan(operand);
  }
  return { instructions, cmdNames };
}
2218+
2219+
/**
 * Converts one parsed object into a plain, tagged description:
 * `{ type, value }` for commands, names, arrays, dictionaries, numbers,
 * strings and booleans; `{ type: "ref", num, gen }` for references; and
 * `{ type: "null" }` for `null`. Arrays and dictionaries are converted
 * recursively.
 *
 * @param {*} obj - The parsed object.
 * @returns {Object|null} The description, or `null` for any other kind of
 *   value.
 */
function _tokenToJSObject(obj) {
  if (obj === null) {
    return { type: "null" };
  }

  const primitiveType = typeof obj;
  switch (primitiveType) {
    case "number":
    case "string":
    case "boolean":
      return { type: primitiveType, value: obj };
  }

  if (obj instanceof Cmd) {
    return { type: "cmd", value: obj.cmd };
  }
  if (obj instanceof Name) {
    return { type: "name", value: obj.name };
  }
  if (obj instanceof Ref) {
    const { num, gen } = obj;
    return { type: "ref", num, gen };
  }
  if (Array.isArray(obj)) {
    const items = obj.map(item => _tokenToJSObject(item));
    return { type: "array", value: items };
  }
  if (obj instanceof Dict) {
    const entries = Object.create(null);
    for (const [key, raw] of obj.getRawEntries()) {
      entries[key] = _tokenToJSObject(raw);
    }
    return { type: "dict", value: entries };
  }
  // Anything else has no plain representation.
  return null;
}
20342253

20352254
export { Page, PDFDocument };

src/core/worker.js

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -952,6 +952,22 @@ class WorkerMessageHandler {
952952
return pdfManager.fontFallback(data.id, handler);
953953
});
954954

955+
// "GetRawData": resolves what to inspect and returns its `toJSObject`
// conversion. A 1-based `page` takes precedence and selects the second
// element of the `getPageDict` result; otherwise `ref` (a string, or an
// object with `num`/`gen`) is turned into a Ref. With neither, `null` is
// passed through to `toJSObject`.
handler.on("GetRawData", async function ({ ref, page }) {
  if (typeof PDFJSDev !== "undefined" && PDFJSDev.test("MOZCENTRAL")) {
    throw new Error("Not implemented: GetRawData");
  }
  let target = null;
  if (page >= 1) {
    const pageInfo = await pdfManager.ensureCatalog("getPageDict", [
      page - 1,
    ]);
    target = pageInfo[1];
  } else if (ref) {
    if (typeof ref === "string") {
      target = Ref.fromString(ref);
    } else {
      target = Ref.get(ref.num, ref.gen);
    }
  }
  return pdfManager.ensureDoc("toJSObject", [target]);
});
970+
955971
handler.on("Cleanup", function (data) {
956972
return pdfManager.cleanup(/* manuallyTriggered = */ true);
957973
});

0 commit comments

Comments
 (0)