From a67b9aec6c5f34d46e8a080f920c3b0e046464b6 Mon Sep 17 00:00:00 2001 From: "Richard Smith (smir)" Date: Mon, 19 Aug 2024 15:18:05 +0100 Subject: [PATCH] Send fetch requests for all page dict lookups in parallel - When adding page dict candidates to the lookup tree, also initiate fetching them from xref, so if they are not yet loaded at all, the XHR will be sent - Only at the top level - assume that if there is a /Pages tree, it is sensibly structured and the number of requests won't be too bad - We can then await on the cached Promise without making the requests pipeline - This has a significant performance improvement for load-on-demand (i.e. with auto-fetch turned off) when a PDF has a large number of pages in the top level /Pages collection, and those pages are spread through a file, so every candidate needs to be fetched separately - PDFs with many pages where each page is a big image and all the pages are at the top level are quite a common output for digitisation programmes - I would have liked to do something like "if it's the top level collection and page count = number of kids, then just fetch that page without traversing the tree" but unfortunately I agree with comments on #8088 that there is no good general solution to allow for /Pages nodes with empty /Kids arrays --- src/core/catalog.js | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/core/catalog.js b/src/core/catalog.js index 2c4551d3ab2c7..9dd20e0e73be2 100644 --- a/src/core/catalog.js +++ b/src/core/catalog.js @@ -143,6 +143,7 @@ class Catalog { this.globalImageCache = new GlobalImageCache(); this.pageKidsCountCache = new RefSetCache(); this.pageIndexCache = new RefSetCache(); + this.pageDictCache = new RefSetCache(); this.nonBlendModesSet = new RefSet(); this.systemFontCache = new Map(); } @@ -1161,6 +1162,7 @@ class Catalog { this.globalImageCache.clear(/* onlyData = */ manuallyTriggered); this.pageKidsCountCache.clear(); this.pageIndexCache.clear(); + this.pageDictCache.clear(); this.nonBlendModesSet.clear(); const translatedFonts = await Promise.all(this.fontCache); @@ -1184,7 +1186,8 @@ class Catalog { } const xref = this.xref, pageKidsCountCache = this.pageKidsCountCache, - pageIndexCache = this.pageIndexCache; + pageIndexCache = this.pageIndexCache, + pageDictCache = this.pageDictCache; let currentPageIndex = 0; while (nodesToVisit.length) { @@ -1203,7 +1206,8 @@ class Catalog { } visitedNodes.put(currentNode); - const obj = await xref.fetchAsync(currentNode); + const obj = await (pageDictCache.get(currentNode) || + xref.fetchAsync(currentNode)); if (obj instanceof Dict) { let type = obj.getRaw("Type"); if (type instanceof Ref) { @@ -1285,7 +1289,18 @@ class Catalog { // node further down in the tree (see issue5644.pdf, issue8088.pdf), // and to ensure that we actually find the correct `Page` dict. for (let last = kids.length - 1; last >= 0; last--) { - nodesToVisit.push(kids[last]); + const lastKid = kids[last]; + nodesToVisit.push(lastKid); + + // Launch all requests in parallel so we don't wait for each one in turn + // when looking for a page near the end, if all the pages are top level. + if ( + currentNode === this.toplevelPagesDict && + lastKid instanceof Ref && + !pageDictCache.has(lastKid) + ) { + pageDictCache.put(lastKid, xref.fetchAsync(lastKid)); + } } }