From d0abda2cb40f94b26f876599dc80ba8e44a845f0 Mon Sep 17 00:00:00 2001 From: 3y3 <3y3@ya.ru> Date: Mon, 30 Sep 2024 18:08:07 +0300 Subject: [PATCH] fix: Fix search clauses generation --- jest.config.js | 10 +++ package.json | 4 +- src/indexer/index.ts | 25 ++++++-- src/types.ts | 2 +- src/worker/format.ts | 4 +- src/worker/search.ts | 62 +++++++++--------- test/__snapshots__/index.spec.ts.snap | 57 +++++++++++++++++ test/index.spec.ts | 91 +++++++++++++++++++++++++++ tsconfig.json | 2 +- 9 files changed, 220 insertions(+), 37 deletions(-) create mode 100644 jest.config.js create mode 100644 test/__snapshots__/index.spec.ts.snap create mode 100644 test/index.spec.ts diff --git a/jest.config.js b/jest.config.js new file mode 100644 index 0000000..81dcba0 --- /dev/null +++ b/jest.config.js @@ -0,0 +1,10 @@ +/** @type {import('ts-jest/dist/types').InitialOptionsTsJest} */ +module.exports = { + snapshotFormat: { + escapeString: true, + printBasicPrototype: true, + }, + transform: { + '^.+\\.tsx?$': ['esbuild-jest', {tsconfig: './tsconfig.json'}], + }, +}; diff --git a/package.json b/package.json index 69bb743..aa40269 100644 --- a/package.json +++ b/package.json @@ -9,7 +9,7 @@ "build:code": "node esbuild/build.mjs && tsc --emitDeclarationOnly", "build:clean": "rm -rf lib", "prepublishOnly": "npm run build", - "test": "exit 0", + "test": "jest", "typecheck": "tsc --noEmit", "lint": "lint update && lint", "lint:fix": "lint update && lint fix", @@ -36,8 +36,10 @@ "@diplodoc/lint": "^1.1.3", "@diplodoc/tsconfig": "^1.0.2", "@esbuild-plugins/tsconfig-paths": "^0.1.2", + "@types/jest": "^29.5.13", "@types/lunr": "^2.3.7", "esbuild": "^0.23.1", + "jest": "^29.7.0", "ts-dedent": "^2.2.0", "typescript": "^5.6.2" }, diff --git a/src/indexer/index.ts b/src/indexer/index.ts index 31aac5c..e60ce77 100644 --- a/src/indexer/index.ts +++ b/src/indexer/index.ts @@ -13,6 +13,11 @@ type DocumentInfo = { keywords: string[]; }; +export enum ReleaseFormat { + JSONP = 'jsonp', + RAW = 'raw', +} + export class Indexer { private indices: Record = {}; @@ -34,7 +39,11 @@ export class Indexer { * * @returns {void} */ - add(lang: string, url: string, data: DocPageData) { + add( + lang: string, + url: string, + data: Pick, + ) { if (!this.indices[lang]) { this.init(lang); } @@ -61,12 +70,20 @@ export class Indexer { * Dumps index and registry for target language. * * @param lang - index language + * @param format - output format * * @returns {{index: Index, registry: Registry}} */ - release(lang: string) { - const index = 'self.index=' + JSON.stringify(this.indices[lang].build()); - const registry = 'self.registry=' + JSON.stringify(this.docs[lang]); + release(lang: string, format = ReleaseFormat.JSONP) { + const index = this.indices[lang].build(); + const registry = this.docs[lang]; + + if (format === ReleaseFormat.JSONP) { + return { + index: 'self.index=' + JSON.stringify(index), + registry: 'self.registry=' + JSON.stringify(registry), + }; + } return {index, registry}; } diff --git a/src/types.ts b/src/types.ts index 589edd3..71e7a4f 100644 --- a/src/types.ts +++ b/src/types.ts @@ -7,7 +7,7 @@ enum Confidence { export interface WorkerConfig extends ISearchWorkerConfig { tolerance: number; - confidence: Confidence; + confidence: `${Confidence}` | Confidence; resources: { index: string; registry: string; diff --git a/src/worker/format.ts b/src/worker/format.ts index 92ca28e..af29a5c 100644 --- a/src/worker/format.ts +++ b/src/worker/format.ts @@ -9,7 +9,7 @@ const SHORT_HEAD = 20; type Trimmer = (text: string, score: Score) => [string, Position[]]; export function format( - {base, mark}: WorkerConfig, + {base, mark}: Pick, results: SearchResult[], registry: Registry, trim: Trimmer, @@ -18,7 +18,7 @@ export function format( const doc = registry[entry.ref]; const item = { type: 'page', - link: `${base}/${entry.ref}`, + link: `${base.replace(/\/?$/, '')}/${entry.ref.replace(/&\/?/, '')}`, title: doc.title, description: doc.content.slice(0, MAX_LENGTH), } as SearchSuggestPageItem; diff --git a/src/worker/search.ts b/src/worker/search.ts index eef3c19..f1b4911 100644 --- a/src/worker/search.ts +++ b/src/worker/search.ts @@ -9,12 +9,11 @@ import {INDEX_FIELDS} from '../constants'; import {phrased, sparsed} from './score'; -const withIndex = (index: Index) => (builder: Index.QueryBuilder | false) => - function withIndex() { - if (!builder) { - return false; - } +const isStrategy = (candidate: unknown): candidate is Index.QueryBuilder => + typeof candidate === 'function'; +const withIndex = (index: Index) => (builder: Index.QueryBuilder) => + function withIndex() { return index.query(builder); }; @@ -32,49 +31,48 @@ const makeStrategies = (tolerance: number, index: Index, clauses: FixedClause[], [ tolerance >= 0 && function precise(query: Query) { - query.clauses = clauses.slice(); + query.clauses = copy(clauses); }, tolerance >= 0 && - function precise(query: Query) { - query.clauses = clauses.slice(); - - if (!sealed) { - for (let i = query.clauses.length - 1; i >= 0; i--) { - const clause = query.clauses[i] as FixedClause; - if (clause.presence !== Query.presence.PROHIBITED) { - wildcard(clause, Query.wildcard.TRAILING); - break; - } + !sealed && + function preciseUnsealed(query: Query) { + query.clauses = copy(clauses); + + for (let i = query.clauses.length - 1; i >= 0; i--) { + const clause = query.clauses[i] as FixedClause; + if (clause.presence !== Query.presence.PROHIBITED) { + query.clauses[i] = wildcard(clause, Query.wildcard.TRAILING); + break; } } }, tolerance >= 1 && function trailingWildcard(query: Query) { - query.clauses = clauses.map((clause) => { + query.clauses = copy(clauses).map((clause) => { if (clause.presence !== Query.presence.PROHIBITED) { - wildcard(clause, Query.wildcard.TRAILING); + return wildcard(clause, Query.wildcard.TRAILING); } return clause; }); }, tolerance >= 2 && function bothWildcard(query: Query) { - query.clauses = clauses.map((clause) => { + query.clauses = copy(clauses).map((clause) => { if (clause.presence !== Query.presence.PROHIBITED) { // eslint-disable-next-line no-bitwise - wildcard(clause, Query.wildcard.LEADING | Query.wildcard.TRAILING); + return wildcard(clause, Query.wildcard.LEADING | Query.wildcard.TRAILING); } return clause; }); }, ] - .filter(Boolean) + .filter(isStrategy) .map(withIndex(index)); export type SearchResult = Index.Result & {scores: Record}; export function search( - {tolerance, confidence}: WorkerConfig, + {tolerance, confidence}: Pick, index: Index, query: string, count: number, @@ -108,6 +106,8 @@ export function search( } function wildcard(clause: FixedClause, mode: Query.wildcard) { + const result = {...clause}; + const requiredLength = [ // eslint-disable-next-line no-bitwise @@ -116,22 +116,24 @@ function wildcard(clause: FixedClause, mode: Query.wildcard) { mode & Query.wildcard.LEADING ? 2 : 0, ].reduce((a, b) => a + b, 0) + 1; - if (clause.term.length < requiredLength) { - return; + if (result.term.length < requiredLength) { + return result; } // eslint-disable-next-line no-bitwise if (mode & Query.wildcard.TRAILING) { - clause.term = clause.term + '*'; + result.term = result.term + '*'; } // eslint-disable-next-line no-bitwise if (mode & Query.wildcard.LEADING) { - clause.term = '*' + clause.term; + result.term = '*' + result.term; } - clause.wildcard = mode; - clause.usePipeline = false; + result.wildcard = mode; + result.usePipeline = false; + + return result; } function byMaxScore(a: SearchResult, b: SearchResult) { @@ -152,3 +154,7 @@ function getMaxScore(result: SearchResult) { return score; } + +function copy(clauses: FixedClause[]) { + return clauses.slice().map((clause) => ({...clause})); +} diff --git a/test/__snapshots__/index.spec.ts.snap b/test/__snapshots__/index.spec.ts.snap new file mode 100644 index 0000000..ac13115 --- /dev/null +++ b/test/__snapshots__/index.spec.ts.snap @@ -0,0 +1,57 @@ +// Jest Snapshot v1, https://goo.gl/fbAQLP + +exports[`suggest should match code 1`] = ` +Array [ + " + +
+
crm.stagehistory.list
+
+", +] +`; + +exports[`suggest should match html content 1`] = ` +Array [ + " + +
+
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer sit amet enim velit.
+
+", +] +`; + +exports[`suggest should match title content 1`] = ` +Array [ + " + +
Lorem ipsum 1
+
Integer sit amet enim velit. Nam facilisis eget magna non blandit.
+
+", + " + +
Lorem ipsum 2
+
Nam facilisis eget magna non blandit. Sed semper, dui ut suscipit semper, nibh justo tempor purus, quis placerat enim dolor vitae neque.
+
+", +] +`; + +exports[`suggest should score longest phrase 1`] = ` +Array [ + " + +
+
...urus, quis placerat enim dolor vitae neque. Vivamus dignissim nunc et tortor vulputate maximus.
+
+", + " + +
+
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer sit amet enim velit. Nam facilisis eget magna non blandit.
+
+", +] +`; diff --git a/test/index.spec.ts b/test/index.spec.ts new file mode 100644 index 0000000..fdeef77 --- /dev/null +++ b/test/index.spec.ts @@ -0,0 +1,91 @@ +import type {Index} from 'lunr'; +import type {Registry, WorkerConfig} from '../src/types'; +import type {SearchSuggestPageItem} from '@diplodoc/components'; + +import {Indexer, ReleaseFormat} from '../src/indexer'; +import {search} from '../src/worker/search'; +import {format, short} from '../src/worker/format'; + +const Lorem = [ + 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.', + 'Integer sit amet enim velit.', + 'Nam facilisis eget magna non blandit.', + 'Sed semper, dui ut suscipit semper, nibh justo tempor purus, quis placerat enim dolor vitae neque.', + 'Vivamus dignissim nunc et tortor vulputate maximus.', + 'Fusce lobortis pretium lectus, non pretium mi rhoncus quis.', + 'Curabitur blandit imperdiet metus id luctus.', + 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.', + 'Aenean lobortis ligula a mauris posuere, luctus pretium mauris ultrices.', +]; + +const Code = 'crm.stagehistory.list'; + +const item = ({link, title, description}: SearchSuggestPageItem) => ` + +
${title}
+
${description}
+
+`; + +describe('suggest', () => { + const lang = 'ru'; + let indexer: Indexer; + let uid = 1; + + function suggest(query: string, config: Pick) { + const {index, registry} = indexer.release(lang, ReleaseFormat.RAW); + + const results = search(config, index as Index, query, 10, false); + + return format({base: './', mark: 'mark'}, results, registry as Registry, short).map(item); + } + + function add(html: string, title = '') { + indexer.add(lang, String(uid++), { + html, + title, + leading: false, + meta: {}, + toc: {items: [], href: ''}, + }); + } + + beforeEach(() => { + indexer = new Indexer(); + }); + + it('should match html content', () => { + add(Lorem.slice(0, 2).join(' ')); + add(Lorem.slice(1, 3).join(' ')); + + const config = {confidence: 'phrased', tolerance: 2} as const; + + expect(suggest('Lorem ipsum', config)).toMatchSnapshot(); + }); + + it('should match title content', () => { + add(Lorem.slice(1, 3).join(' '), 'Lorem ipsum 1'); + add(Lorem.slice(2, 4).join(' '), 'Lorem ipsum 2'); + + const config = {confidence: 'phrased', tolerance: 2} as const; + + expect(suggest('Lorem ipsum', config)).toMatchSnapshot(); + }); + + it('should score longest phrase', () => { + add(Lorem.slice(0, 3).join(' ')); + add(Lorem.slice(1, 5).join(' ')); + + const config = {confidence: 'phrased', tolerance: 2} as const; + + expect(suggest('enim dolor vitae', config)).toMatchSnapshot(); + }); + + it('should match code', () => { + add(Code); + + const config = {confidence: 'phrased', tolerance: 2} as const; + + expect(suggest('stagehistory', config)).toMatchSnapshot(); + }); +}); diff --git a/tsconfig.json b/tsconfig.json index 3f28eab..815761f 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -9,5 +9,5 @@ "baseUrl": ".", "outDir": "lib" }, - "include": ["src"] + "include": ["src", "test"] }