From 2847d84d00de0e116067c6109ba8fbc0398c2d82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Ribaudo?= Date: Fri, 2 Aug 2024 18:08:56 +0200 Subject: [PATCH] Allow specifying custom match logic in PDFFindController This patch allows embedders of PDF.js to provide custom match logic for searching in PDFs. This is done by subclassing the PDFFindController class and overriding the `match` method. `match` is called once per PDF page, receives as parameters the search query, the page contents, and the page index, and returns an array of { index, length } objects representing the search results. --- test/unit/pdf_find_controller_spec.js | 87 ++++++++++++++++- web/pdf_find_controller.js | 130 ++++++++++++++------------ 2 files changed, 154 insertions(+), 63 deletions(-) diff --git a/test/unit/pdf_find_controller_spec.js b/test/unit/pdf_find_controller_spec.js index e455d5048c58f4..e1f3169d58fb8a 100644 --- a/test/unit/pdf_find_controller_spec.js +++ b/test/unit/pdf_find_controller_spec.js @@ -51,7 +51,8 @@ class MockLinkService extends SimpleLinkService { async function initPdfFindController( filename, - updateMatchesCountOnProgress = true + updateMatchesCountOnProgress = true, + matcher = undefined ) { const loadingTask = getDocument( buildGetDocumentParams(filename || tracemonkeyFileName, { @@ -65,7 +66,13 @@ async function initPdfFindController( const linkService = new MockLinkService(); linkService.setDocument(pdfDocument); - const pdfFindController = new PDFFindController({ + let FindControllerClass = PDFFindController; + if (matcher !== undefined) { + FindControllerClass = class extends PDFFindController {}; + FindControllerClass.prototype.match = matcher; + } + + const pdfFindController = new FindControllerClass({ linkService, eventBus, updateMatchesCountOnProgress, @@ -1054,4 +1061,80 @@ describe("pdf_find_controller", function () { const { eventBus } = await initPdfFindController(); await testOnFind({ eventBus }); }); + + describe("custom matcher", () => { 
+ it("calls to the matcher with the right arguments", async () => { + const QUERY = "Foo bar"; + + const spy = jasmine + .createSpy("custom find matcher") + .and.callFake(() => [{ index: 0, length: 1 }]); + + const { eventBus, pdfFindController } = await initPdfFindController( + null, + false, + spy + ); + + const PAGES_COUNT = 14; + + await testSearch({ + eventBus, + pdfFindController, + state: { query: QUERY }, + selectedMatch: { pageIndex: 0, matchIndex: 0 }, + matchesPerPage: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + }); + + expect(spy).toHaveBeenCalledTimes(PAGES_COUNT); + + for (let i = 0; i < PAGES_COUNT; i++) { + const args = spy.calls.argsFor(i); + expect(args[0]).withContext(`page ${i}`).toBe(QUERY); + expect(args[2]).withContext(`page ${i}`).toBe(i); + } + + expect(spy.calls.argsFor(0)[1]).toMatch(/^Trace-based /); + expect(spy.calls.argsFor(1)[1]).toMatch(/^Hence, recording and /); + expect(spy.calls.argsFor(12)[1]).toMatch(/Figure 12. Fraction of time /); + expect(spy.calls.argsFor(13)[1]).toMatch(/^not be interpreted as /); + }); + + it("uses the results returned by the custom matcher", async () => { + const QUERY = "Foo bar"; + + // prettier-ignore + const spy = jasmine.createSpy("custom find matcher") + .and.returnValue(undefined) + .withArgs(QUERY, jasmine.anything(), 0) + .and.returnValue([ + { index: 20, length: 3 }, + { index: 50, length: 8 }, + ]) + .withArgs(QUERY, jasmine.anything(), 2) + .and.returnValue([ + { index: 7, length: 19 } + ]) + .withArgs(QUERY, jasmine.anything(), 13) + .and.returnValue([ + { index: 50, length: 2 }, + { index: 54, length: 9 }, + { index: 80, length: 4 }, + ]); + + const { eventBus, pdfFindController } = await initPdfFindController( + null, + false, + spy + ); + + await testSearch({ + eventBus, + pdfFindController, + state: { query: QUERY }, + selectedMatch: { pageIndex: 0, matchIndex: 0 }, + matchesPerPage: [2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3], + }); + }); + }); }); diff --git 
a/web/pdf_find_controller.js b/web/pdf_find_controller.js index 3990b2eb6668fa..88d259c8bff6dd 100644 --- a/web/pdf_find_controller.js +++ b/web/pdf_find_controller.js @@ -670,37 +670,6 @@ class PDFFindController { return true; } - #calculateRegExpMatch(query, entireWord, pageIndex, pageContent) { - const matches = (this._pageMatches[pageIndex] = []); - const matchesLength = (this._pageMatchesLength[pageIndex] = []); - if (!query) { - // The query can be empty because some chars like diacritics could have - // been stripped out. - return; - } - const diffs = this._pageDiffs[pageIndex]; - let match; - while ((match = query.exec(pageContent)) !== null) { - if ( - entireWord && - !this.#isEntireWord(pageContent, match.index, match[0].length) - ) { - continue; - } - - const [matchPos, matchLen] = getOriginalIndex( - diffs, - match.index, - match[0].length - ); - - if (matchLen) { - matches.push(matchPos); - matchesLength.push(matchLen); - } - } - } - #convertToRegExpString(query, hasDiacritics) { const { matchDiacritics } = this.#state; let isUnicode = false; @@ -771,13 +740,65 @@ class PDFFindController { return [isUnicode, query]; } - #calculateMatch(pageIndex) { - let query = this.#query; + async #calculateMatch(pageIndex) { + const query = this.#query; if (query.length === 0) { return; // Do nothing: the matches should be wiped out already. 
} - const { caseSensitive, entireWord } = this.#state; const pageContent = this._pageContents[pageIndex]; + const matcherResult = await this.match(query, pageContent, pageIndex); + + const matches = (this._pageMatches[pageIndex] = []); + const matchesLength = (this._pageMatchesLength[pageIndex] = []); + const diffs = this._pageDiffs[pageIndex]; + + matcherResult?.forEach(({ index, length }) => { + const [matchPos, matchLen] = getOriginalIndex(diffs, index, length); + if (matchLen) { + matches.push(matchPos); + matchesLength.push(matchLen); + } + }); + + // When `highlightAll` is set, ensure that the matches on previously + // rendered (and still active) pages are correctly highlighted. + if (this.#state.highlightAll) { + this.#updatePage(pageIndex); + } + if (this._resumePageIdx === pageIndex) { + this._resumePageIdx = null; + this.#nextPageMatch(); + } + + // Update the match count. + const pageMatchesCount = this._pageMatches[pageIndex].length; + this._matchesCountTotal += pageMatchesCount; + if (this.#updateMatchesCountOnProgress) { + if (pageMatchesCount > 0) { + this.#updateUIResultsCount(); + } + } else if (++this.#visitedPagesCount === this._linkService.pagesCount) { + // For example, in GeckoView we want to have only the final update because + // the Java side provides only one object to update the counts. + this.#updateUIResultsCount(); + } + } + + /** + * @typedef {Object} SingleFindMatch + * @property {number} index - The start of the matched text in the page's string + * contents. + * @property {number} length - The length of the matched text. + */ + + /** + * @param {string | string[]} query - The search query. + * @param {string} pageContent - The text content of the page to search in. + * @param {number} pageIndex - The index of the page that is being processed. + * @returns {Promise<SingleFindMatch[]> | SingleFindMatch[] | undefined} An + * array of matches in the provided page. 
+ */ + match(query, pageContent, pageIndex) { const hasDiacritics = this._hasDiacritics[pageIndex]; let isUnicode = false; @@ -799,34 +820,22 @@ class PDFFindController { }) .join("|"); } + if (!query) { + return undefined; + } + const { caseSensitive, entireWord } = this.#state; const flags = `g${isUnicode ? "u" : ""}${caseSensitive ? "" : "i"}`; - query = query ? new RegExp(query, flags) : null; - - this.#calculateRegExpMatch(query, entireWord, pageIndex, pageContent); + query = new RegExp(query, flags); - // When `highlightAll` is set, ensure that the matches on previously - // rendered (and still active) pages are correctly highlighted. - if (this.#state.highlightAll) { - this.#updatePage(pageIndex); - } - if (this._resumePageIdx === pageIndex) { - this._resumePageIdx = null; - this.#nextPageMatch(); - } - - // Update the match count. - const pageMatchesCount = this._pageMatches[pageIndex].length; - this._matchesCountTotal += pageMatchesCount; - if (this.#updateMatchesCountOnProgress) { - if (pageMatchesCount > 0) { - this.#updateUIResultsCount(); + const matches = []; + for (const { index, 0: match } of pageContent.matchAll(query)) { + if (entireWord && !this.#isEntireWord(pageContent, index, match.length)) { + continue; } - } else if (++this.#visitedPagesCount === this._linkService.pagesCount) { - // For example, in GeckoView we want to have only the final update because - // the Java side provides only one object to update the counts. - this.#updateUIResultsCount(); + matches.push({ index, length: match.length }); } + return matches; } #extractText() { @@ -930,10 +939,9 @@ class PDFFindController { continue; } this._pendingFindMatches.add(i); - this._extractTextPromises[i].then(() => { - this._pendingFindMatches.delete(i); - this.#calculateMatch(i); - }); + this._extractTextPromises[i] + .then(() => this.#calculateMatch(i)) + .finally(() => this._pendingFindMatches.delete(i)); } }