From 06f9d8002d04b91113d3d7e44ce6bc1f7770d835 Mon Sep 17 00:00:00 2001 From: Calixte Denizet Date: Tue, 10 Sep 2024 22:00:04 +0200 Subject: [PATCH] Consider foo-\nBar as a compound word Fixes #18693. --- test/pdfs/.gitignore | 1 + test/pdfs/issue18693.pdf | Bin 0 -> 9641 bytes test/unit/pdf_find_controller_spec.js | 20 +++++++++++++++++ web/pdf_find_controller.js | 30 +++++++++++++++++--------- 4 files changed, 41 insertions(+), 10 deletions(-) create mode 100755 test/pdfs/issue18693.pdf diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 2cacef5120edd..01dd1d125d352 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -666,3 +666,4 @@ !highlight.pdf !bug1708040.pdf !issue18694.pdf +!issue18693.pdf diff --git a/test/pdfs/issue18693.pdf b/test/pdfs/issue18693.pdf new file mode 100755 index 0000000000000000000000000000000000000000..68c628063762a45a9832248e214a8d22a42a8204 GIT binary patch literal 9641 zcmeHNc{o&k`$uJ$wNPPfWgBy5WyDwpku^)GFk@y&mSK#&$6l5YvKB%MMQI@+B3pzW zB8j5NR>_(o`W^C=_j&!^_kFJG_g>fUk7urH&Nb(pd->j<@BKYTrTfRTXCF>`o!loC7Fi2pnbz!D3Kwm9FB|PPF}SE+3dr`jsSl7(58^6WQr+~=H>-@#8X|p-Q2w( zD3~dYLUtz6sWb>2fy2Rcon7e^8cf%fNT>Wzf$3`>RYjw9;czXu79I|0B9u{Dn%bIp zlr~&TNgIv9BLECeRSCc&v^4>31O@=`S}0{L9UX+SCLXJ)gV922Yh!S>FnxD2#qTH> zI9z%4#E}YtuMW@*W=L^&q92D~fYs6XtOkmOnNg{9a7b(S%|U1&2rSIP--80PHlsMg ztd7DV2nZa6z>7|!5Z$=gkH2`_U+lTrra#ff)L&)8zJRg2@NlD z(?i9xLnA+)R$pR1%@F?Sq*%^`j#tj&WXL>11{O9$W!fslU{7H?w8^;F5L-_xBMO~J zCen!@QmZJNc+*|MMDv1R)=o`{PT+6s+V3@B@t~+H)#+DO!I8gW#8chr6nDB8n0RX% zcpBA1iwe#r04Sm%XgERD=ML|wlE_wxrnQB;C0CA7XwNH5G5>l zw1t7$2-;j9gBit(>P;h2R?|}#^tPI%fIR>~tUCnk!6{c-bq=PfDUC`pr_hhWz*Pma zp!m_jOaN1c4rVO`_>m{8@ok6&^(&T5=1bR^=0%5~f6f^Kj=+G+PU(j!6NeM#bhnLj z*KgxO>EI-R@A^vnpTwF><=dzizP7XvIC-j!tAX)^gdt&9cr{%~>8<<^_<_@!^ zH+~Q9&ffPL#kIE{xm0rJ@V3Gtb7Ln;)xpCDC|eE=rNqf$BS#}T1Lwa?e^_Yib+~&h zOI;azwCLWGk8O3m?;F0(1XPy$PtCb5J5PJ~EY43%ICw_)98~uY!Pi+Z- z5;f|HhF26hMA(++Xld_1_PEE^>Jsi~NC%oL^>#Jvi!_RM8|rwtWS>%7Z8tHYTUMP3bGc^lGA8kZ4yv2L zoZ!hk7E9sW6)W_F!9k&oa9k)*lN6rFDIOq{qm;mVTnd#SUaxUJaciqVod7<>+zhr< zQ!40sGD7?STTR>4SJwh_Wr2yjtDXWvjB4!c3k{kYtfB3pyQQBCXcG>=Lc1FuZ-4PB zEUOuKrY)aC{{7wqKm}h6v?ue6-WnI)nf9hz*tY)Q?et`th&nQE&^D(=h_B=Fy-OXJ zU!K-jifwSJQjZ}A?8886JGbO*Q!`19iu#0%gC1176iUhqvx*48>2c_%ov`jq?NG3w zMvguiF>TbtH~7^)e$i#A-5a;h?ZwXa`T%Zezt)Sw*A#mAD6`k-yG-lP-*8&tL@z>; zm=9cP43N5xV7f)WqyLySp|rd9+Q~B!Gp*%=A0@neX7+Gil2SV+$kr?H0;?!=IJ%8s zJ;fSYJt!Qrz$UE)?dvuTXFNw`zh~ANKC$(7cSG=wSgGxYYPOxWk4Sn#$VT$#SXaR+q2p5K`-akXPA%GGB# z;JQa^Y$dWJe5^E?F&E-Ea#a2?PkPkUc8C6Aj>y)+UaJHS0r5u6bMEKGW zl*H-R%2Sr<7)7j*<2$7Sc(a)#+?mPFUYhe*4yNUbFQR(PL=r0tVCr=d`=a+uyG6yz zetcxmy6ckDAv~a{q}wf-anuK0Z_MYGE6!ytSPG?32qGMPEO+jRFAn$d0la^Hol3KuvjaRONqwW=jDq?Zwc6dP_tc4n+>q|+ zsQD(B!!@p1{gE~)b@KZ~tUqh>lBC|j_ZpuIZzRY2pOcnk@H_BI?5=r>oy)U}+F{0g zB2scSNU51}2~opOaZ!p{O}ejCle1%Q$tNbfj+#sLTVz#5NA7|ehW85^;d~rg7tPXTD z??d=uL;Go*iLzFAb|1}ZCBD*poEcbin>+V{FZ+_QC0;mRqnDeguwb1%tvVER?mnri zsl~hXzPZZo_gWuxOy29vh4?-n7wK1Gs~Ft-`qb$WgAz;TgNp)NxRcD@+x#hi`C??6OS3hA?I**7WE>{dcuV`;D8yT_LdzJBabdnTbbgWChe>pGYO~LT#p0I!fDL%ZrZY z7yIok4tk_H*Q3s_dHuEUX==yqYWVCGcNR{|oAk->T1HQ?&$pZK_s=ByaLxh-s`aAW zgvX!bxikivU!QgSqCUlZV&e0)O8=ObBEL=Wdu3$>lqJtv*-Z z_|-OSK`uG`JJ)M|scR1Qy!tI&YNhvVU=LcnF3>EX0SBZ{IFSMtHtNPjSPYLRcz@+p8$(` z0)a>%cxC)v6T&IP2m&X`%lLLP^Q#5ozcmj>D5GKK-VXHD{RCcLd$rtLEr9|2MzT0-z%e|d;;N=s#B&zI+h%H2yh$i2}&F=*Y0Lt~Lmg9?-})XVzx@2_lw zWN*p7U{c_JMzXQSq=w(9zj@~C%Cy1h<_KfOChHFZI5BmJJZ%o0Ptn;iAKwaAt!RGf zgS3-6bA+Cs6>W)s{yovCrbHZiY9z-I#ey=(K?R)V^U)w23Jn#)Gletp8_yVOhIfxn zapV|t;>y`^&kJ|y`t-%KXWeb+ z@jSfEyS~a4!q?@K>$BUlr}C8&eut5S<9#(`)sfxDbRsJD2ebK{iz&P*-)~@8nsIUa z6UQQ7uWcS5)ek5BzJM%s>b8$%g4!Tr?Ik1i4^YBFEFTAa%)Y`P2( zicG>n-|^q0t3$8nEvsxBs@TrC1yU{eDK4a2J%;$rE2b2%JTvl*H}kHB$`=QOI*^@QQZtvmzE^VyGgK+khcnvTYgjH1U*Jh036 zc~8UR;}<8}hs3f><*TP7bH|uoScVRq4JqY?y9$f!y|X2I5WOSagyCz1ztkO<3lwOIYcz#v$uPRec^w6x zz_h?pO2i{E(H0)V`kYi>Yd-qy_9@}oiaT<%4^DCve+irCuZZP}|!$ zO-gz?|J-zsx#(7dsM)3MV(f*!r2&O~-Yo{2)Ph{J#w1s%H@)C z*AVKS8azS5a7$eS-XJ$yE5JCQ6f^cJc{J1Mf?c-0d&(|*0aYb+=_i+UwbReFp8LY% zd*O_-yHzkBYfy)lsK%*1o-O<{ADVY-%-{ri!pXy$&gwkes*{1Lfcs!6H?OXbz6nE{ zCop1qdG=U=_pOX?fp!|6W5J@r3%2aBTfN@nZ#KFvq*lPQTf*3O$6eO?5<8S*ymI29 z?u@h6ym7%m=_Qd^s>?v{U3|wirMC6G+|gPPHcK9<;bq7L9Z{b^aC!W9u5h>Wl})JE#O7}bw`c|88?<|8RH_iw(tq~{{9=ca(`Pk**hp>_2w=TC?orl#j5-&Y!W{);b7qv-@kjcG&T1I%+JS=#xWnVM( zjl;Y_fpOBD86x0t%XK{)(eVUcktkXovLiZ*&eHjgts`Yjwi7O`!YgvAvhr-Vll4B; zREX2)qi>$0o>Sgrx2b8UPYI93d*Vaze+#fWm9-hFt31rV{og`hQ2&5xf)R)+0K z28DAv38bVFMiQ$rr`=p+g<6P0oUJC5&BpG_=ad8Dr`V;Y;h*$pPg}K>!c`c87Y^SO z%IVSd&;0N~S}QUf<9ac?6VV@SjAd`~LzO)d@NhMx+8ko|9@QZGWgAA2vL6I>WJiD- z^NW*~A9wog-@gzpd7eQySz6}95FYlZDuy@ooLf(Ubq}#u=jp|_UY@MjX&hSe#-nDo zXuEVqo5Ti7twT+g2-JlAvTS*UT@Z)K#8C?;n;@u7#bFf-?MVKP@$|9GG3+6;?4+4= z>Fd$g&yV?B-98z7qpCi!Ra4KyO#%0~D_l6;PetKlN0gR!5EwMnLYy{3^IwAb zgX@8#{@~|+P(8^1jp{*|1Ezo#aNFXR=`EDmt=k>G?XWiaLqM1B69qX%#)HB#TgR8A z9%YE9Y`eC&azz@O;eV~c$47%JUgHKNBcwjhBF7jRb*D8c(}F8Mv;KkWsgU|P3!__R zX~*%TTXb!*z+=0dIR7hH?Cr(c#mTSYUwwQSzyj)O{yxkmv8{4>>xr_y){qDQ_*u#OLCRHV|JU~rIcgaHT`0s#djpkDs&bfO>B-AnR^ zkw5HcQ@lttXAinF)g7{Gm*_zCrmIOv{Ni#!$@Re8JbpTI_flL7QjtV;gZU9XU~okM zwwV%{^i#*fo94PUG%^WBaizGe(xsptd{aFTQzN6Fs++oVbK6wYV*?)lpaoi9-1VC5pX{u8pwcX;R9D=OjMF61s18Ir1f+!_ zG8h!p!I>@#L7)_ois0HnqkaUpQEh`oGD*deN^>L9aYPRfS7#D&HCJFDn5wW}w>HSH z&76uhmE^rT!}{7dZ*OO^3Y>@{I--$CC`k!JhN37G43tQwC_zyuB$fma9ViIWk6{=Y zZPfiIyZY|nn+JCiu~^rxf~ysLQow#bD%Ng+>G$9L{0j2F z=;v=Ie+k^b;rbh{zl6YF68<~8{)X!>A@G-k|IV)eGq~7)Z8pG$3N;B|a4WINW@bZ! z!6r2KkB!>TeHse;qY-A6v^DvsH}ZEZ0MiYdvxRm;*?0_!nVq>|BGO{#rE6xgryQ(mS;mr3 z!WDkv^$S)!douflf#d?`F2H1yz&5;0_ { it("calls to the matcher with the right arguments", async () => { const QUERY = "Foo bar"; diff --git a/web/pdf_find_controller.js b/web/pdf_find_controller.js index 09e61fb2d9385..71855127a4b7b 100644 --- a/web/pdf_find_controller.js +++ b/web/pdf_find_controller.js @@ -131,7 +131,8 @@ function normalize(text) { // 30A0-30FF: Katakana const CJK = "(?:\\p{Ideographic}|[\u3040-\u30FF])"; const HKDiacritics = "(?:\u3099|\u309A)"; - const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(${HKDiacritics}\\n)|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(${CJK}\\n)|(\\n)`; + const CompoundWord = "\\p{Ll}-\\n\\p{Lu}"; + const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(${HKDiacritics}\\n)|(\\p{M}+(?:-\\n)?)|(${CompoundWord})|(\\S-\\n)|(${CJK}\\n)|(\\n)`; if (syllablePositions.length === 0) { // Most of the syllables belong to Hangul so there are no need @@ -193,7 +194,7 @@ function normalize(text) { normalized = normalized.replace( normalizationRegex, - (match, p1, p2, p3, p4, p5, p6, p7, p8, i) => { + (match, p1, p2, p3, p4, p5, p6, p7, p8, p9, i) => { i -= shiftOrigin; if (p1) { // Maybe fractions or quotations mark... @@ -267,7 +268,7 @@ function normalize(text) { if (hasTrailingDashEOL) { // Diacritics are followed by a -\n. - // See comments in `if (p5)` block. + // See comments in `if (p6)` block. i += len - 1; positions.push([i - shift + 1, 1 + shift]); shift += 1; @@ -280,32 +281,41 @@ function normalize(text) { } if (p5) { + // Compound word with a line break after the hyphen. + positions.push([i - shift + 3, 1 + shift]); + shift += 1; + shiftOrigin += 1; + eol += 1; + return p5.replace("\n", ""); + } + + if (p6) { // "X-\n" is removed because an hyphen at the end of a line // with not a space before is likely here to mark a break // in a word. // If X is encoded with UTF-32 then it can have a length greater than 1. // The \n isn't in the original text so here y = i, n = X.len - 2 and // o = X.len - 1. - const len = p5.length - 2; + const len = p6.length - 2; positions.push([i - shift + len, 1 + shift]); shift += 1; shiftOrigin += 1; eol += 1; - return p5.slice(0, -2); + return p6.slice(0, -2); } - if (p6) { + if (p7) { // An ideographic at the end of a line doesn't imply adding an extra // white space. // A CJK can be encoded in UTF-32, hence their length isn't always 1. - const len = p6.length - 1; + const len = p7.length - 1; positions.push([i - shift + len, shift]); shiftOrigin += 1; eol += 1; - return p6.slice(0, -1); + return p7.slice(0, -1); } - if (p7) { + if (p8) { // eol is replaced by space: "foo\nbar" is likely equivalent to // "foo bar". positions.push([i - shift + 1, shift - 1]); @@ -327,7 +337,7 @@ function normalize(text) { shift -= newCharLen; shiftOrigin += newCharLen; } - return p8; + return p9; } );