From a67b9aec6c5f34d46e8a080f920c3b0e046464b6 Mon Sep 17 00:00:00 2001
From: "Richard Smith (smir)" <smir@preservica.com>
Date: Mon, 19 Aug 2024 15:18:05 +0100
Subject: [PATCH] Send fetch requests for all page dict lookups in parallel

- When adding page dict candidates to the lookup tree, also initiate fetching
  them from xref, so that if they are not yet loaded at all, the XHR is sent
  immediately.
- Only do this at the top level: assume that if there is a /Pages tree, it is
  sensibly structured and the number of requests won't be too bad.
- We can then await the cached Promise, without the requests being issued
  one after another.
- This gives a significant performance improvement for load-on-demand (i.e.
  with auto-fetch turned off) when a PDF has a large number of pages in the
  top-level /Pages collection and those pages are spread throughout the file,
  so that every candidate needs to be fetched separately.
- PDFs with many pages, where each page is a big image and all the pages are
  at the top level, are quite a common output of digitisation programmes.
- I would have liked to do something like "if it's the top-level collection
  and the page count equals the number of kids, then just fetch that page
  without traversing the tree", but unfortunately I agree with the comments
  on issue #8088 that there is no good general solution that allows for
  /Pages nodes with empty /Kids arrays.

---
 src/core/catalog.js | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/src/core/catalog.js b/src/core/catalog.js
index 2c4551d3ab2c7..9dd20e0e73be2 100644
--- a/src/core/catalog.js
+++ b/src/core/catalog.js
@@ -143,6 +143,7 @@ class Catalog {
     this.globalImageCache = new GlobalImageCache();
     this.pageKidsCountCache = new RefSetCache();
     this.pageIndexCache = new RefSetCache();
+    this.pageDictCache = new RefSetCache();
     this.nonBlendModesSet = new RefSet();
     this.systemFontCache = new Map();
   }
@@ -1161,6 +1162,7 @@ class Catalog {
     this.globalImageCache.clear(/* onlyData = */ manuallyTriggered);
     this.pageKidsCountCache.clear();
     this.pageIndexCache.clear();
+    this.pageDictCache.clear();
     this.nonBlendModesSet.clear();
 
     const translatedFonts = await Promise.all(this.fontCache);
@@ -1184,7 +1186,8 @@ class Catalog {
     }
     const xref = this.xref,
       pageKidsCountCache = this.pageKidsCountCache,
-      pageIndexCache = this.pageIndexCache;
+      pageIndexCache = this.pageIndexCache,
+      pageDictCache = this.pageDictCache;
     let currentPageIndex = 0;
 
     while (nodesToVisit.length) {
@@ -1203,7 +1206,8 @@ class Catalog {
         }
         visitedNodes.put(currentNode);
 
-        const obj = await xref.fetchAsync(currentNode);
+        const obj = await (pageDictCache.get(currentNode) ||
+          xref.fetchAsync(currentNode));
         if (obj instanceof Dict) {
           let type = obj.getRaw("Type");
           if (type instanceof Ref) {
@@ -1285,7 +1289,18 @@ class Catalog {
       // node further down in the tree (see issue5644.pdf, issue8088.pdf),
       // and to ensure that we actually find the correct `Page` dict.
       for (let last = kids.length - 1; last >= 0; last--) {
-        nodesToVisit.push(kids[last]);
+        const lastKid = kids[last];
+        nodesToVisit.push(lastKid);
+
+        // Launch all requests in parallel so we don't wait for each one in turn
+        // when looking for a page near the end, if all the pages are top level.
+        if (
+          currentNode === this.toplevelPagesDict &&
+          lastKid instanceof Ref &&
+          !pageDictCache.has(lastKid)
+        ) {
+          pageDictCache.put(lastKid, xref.fetchAsync(lastKid));
+        }
       }
     }