Skip to content

Commit

Permalink
Very slightly improve the performance when searching in a PDF
Browse files Browse the repository at this point in the history
It helps to slightly decrease memory use by reducing the number of created arrays.
When searching for "a" in pdf.pdf, the time spent in getOriginalIndex is decreased by
around 30%.
  • Loading branch information
calixteman committed Nov 28, 2024
1 parent 65f20b0 commit 94d53d5
Showing 1 changed file with 26 additions and 19 deletions.
45 changes: 26 additions & 19 deletions web/pdf_find_controller.js
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ function normalize(text) {
}

let normalized = text.normalize("NFD");
const positions = [[0, 0]];
const positions = [0, 0];
let rawDiacriticsIndex = 0;
let syllableIndex = 0;
let shift = 0;
Expand All @@ -201,7 +201,7 @@ function normalize(text) {
const replacement = CHARACTERS_TO_NORMALIZE[p1];
const jj = replacement.length;
for (let j = 1; j < jj; j++) {
positions.push([i - shift + j, shift - j]);
positions.push(i - shift + j, shift - j);
}
shift -= jj - 1;
return replacement;
Expand All @@ -216,7 +216,7 @@ function normalize(text) {
}
const jj = replacement.length;
for (let j = 1; j < jj; j++) {
positions.push([i - shift + j, shift - j]);
positions.push(i - shift + j, shift - j);
}
shift -= jj - 1;
return replacement;
Expand All @@ -233,13 +233,13 @@ function normalize(text) {
} else {
// i is the position of the first diacritic
// so (i - 1) is the position for the letter before.
positions.push([i - 1 - shift + 1, shift - 1]);
positions.push(i - 1 - shift + 1, shift - 1);
shift -= 1;
shiftOrigin += 1;
}

// End-of-line.
positions.push([i - shift + 1, shift]);
positions.push(i - shift + 1, shift);
shiftOrigin += 1;
eol += 1;

Expand All @@ -261,7 +261,7 @@ function normalize(text) {
for (let j = 1; j <= jj; j++) {
// i is the position of the first diacritic
// so (i - 1) is the position for the letter before.
positions.push([i - 1 - shift + j, shift - j]);
positions.push(i - 1 - shift + j, shift - j);
}
shift -= jj;
shiftOrigin += jj;
Expand All @@ -270,7 +270,7 @@ function normalize(text) {
// Diacritics are followed by a -\n.
// See comments in `if (p6)` block.
i += len - 1;
positions.push([i - shift + 1, 1 + shift]);
positions.push(i - shift + 1, 1 + shift);
shift += 1;
shiftOrigin += 1;
eol += 1;
Expand All @@ -296,7 +296,7 @@ function normalize(text) {
// The \n isn't in the original text so here y = i, n = X.len - 2 and
// o = X.len - 1.
const len = p6.length - 2;
positions.push([i - shift + len, 1 + shift]);
positions.push(i - shift + len, 1 + shift);
shift += 1;
shiftOrigin += 1;
eol += 1;
Expand All @@ -308,7 +308,7 @@ function normalize(text) {
// white space.
// A CJK character can be encoded in UTF-32, hence its length isn't always 1.
const len = p7.length - 1;
positions.push([i - shift + len, shift]);
positions.push(i - shift + len, shift);
shiftOrigin += 1;
eol += 1;
return p7.slice(0, -1);
Expand All @@ -317,7 +317,7 @@ function normalize(text) {
if (p8) {
// eol is replaced by space: "foo\nbar" is likely equivalent to
// "foo bar".
positions.push([i - shift + 1, shift - 1]);
positions.push(i - shift + 1, shift - 1);
shift -= 1;
shiftOrigin += 1;
eol += 1;
Expand All @@ -331,7 +331,7 @@ function normalize(text) {
const newCharLen = syllablePositions[syllableIndex][0] - 1;
++syllableIndex;
for (let j = 1; j <= newCharLen; j++) {
positions.push([i - (shift - j), shift - j]);
positions.push(i - (shift - j), shift - j);
}
shift -= newCharLen;
shiftOrigin += newCharLen;
Expand All @@ -340,9 +340,15 @@ function normalize(text) {
}
);

positions.push([normalized.length, shift]);
positions.push(normalized.length, shift);
const starts = new Uint32Array(positions.length >> 1);
const shifts = new Int32Array(positions.length >> 1);
for (let i = 0, ii = positions.length; i < ii; i += 2) {
starts[i >> 1] = positions[i];
shifts[i >> 1] = positions[i + 1];
}

return [normalized, positions, hasDiacritics];
return [normalized, [starts, shifts], hasDiacritics];
}

// Determine the original, non-normalized, match index such that highlighting of
Expand All @@ -353,25 +359,26 @@ function getOriginalIndex(diffs, pos, len) {
return [pos, len];
}

const [starts, shifts] = diffs;
// First char in the new string.
const start = pos;
// Last char in the new string.
const end = pos + len - 1;
let i = binarySearchFirstItem(diffs, x => x[0] >= start);
if (diffs[i][0] > start) {
let i = binarySearchFirstItem(starts, x => x >= start);
if (starts[i] > start) {
--i;
}

let j = binarySearchFirstItem(diffs, x => x[0] >= end, i);
if (diffs[j][0] > end) {
let j = binarySearchFirstItem(starts, x => x >= end, i);
if (starts[j] > end) {
--j;
}

// First char in the old string.
const oldStart = start + diffs[i][1];
const oldStart = start + shifts[i];

// Last char in the old string.
const oldEnd = end + diffs[j][1];
const oldEnd = end + shifts[j];
const oldLen = oldEnd + 1 - oldStart;

return [oldStart, oldLen];
Expand Down

0 comments on commit 94d53d5

Please sign in to comment.