Skip to content

Commit

Permalink
Always search folded index
Browse files Browse the repository at this point in the history
Fixes #57. Non-normalised fields are no longer indexed, which reduces index size: the original term is stored, but the index is built from terms folded according to per-language rules. All searches now normalise the search text and use the normalised index. Remove the 'fold' option from the APIs as it is now unnecessary. Bump the index version, as this version is incompatible with indexes created without a folded index.
  • Loading branch information
wardle committed Sep 9, 2023
1 parent 550b38c commit 368ff49
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 59 deletions.
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1275,7 +1275,6 @@ such as 'Wegener's Granulomatosis' will be surprised that their search fails to r
Search parameters:
* `s` - the text to search
* `fold` - whether to normalize (fold) the search text and to use folded index (default, `false`)
* `constraint` - an ECL expression to constrain the search; I never use search without this
* `maxHits` - maximum number of hits
* `inactiveConcepts` - whether to search inactive concepts (default, `false`)
Expand Down
3 changes: 1 addition & 2 deletions cmd/com/eldrix/hermes/cmd/server.clj
Original file line number Diff line number Diff line change
Expand Up @@ -204,10 +204,9 @@
(assoc ctx :result {:subsumedBy (hermes/subsumed-by? svc concept-id subsumer-id)})))})

(defn parse-search-params
[{:keys [s fold maxHits isA refset constraint ecl fuzzy fallbackFuzzy inactiveConcepts inactiveDescriptions removeDuplicates]}]
[{:keys [s maxHits isA refset constraint ecl fuzzy fallbackFuzzy inactiveConcepts inactiveDescriptions removeDuplicates]}]
(cond-> {}
s (assoc :s s)
fold (assoc :fold (parse-flag fold))
constraint (assoc :constraint constraint)
ecl (assoc :constraint ecl)
maxHits (assoc :max-hits (Long/parseLong maxHits))
Expand Down
35 changes: 23 additions & 12 deletions src/com/eldrix/hermes/core.clj
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@

(def ^:private expected-manifest
"Defines the current expected manifest."
{:version "lmdb/16"
{:version "lmdb/18"
:store "store.db"
:search "search.db"
:members "members.db"})
Expand Down Expand Up @@ -537,15 +537,15 @@


(defn ^:private make-search-params
[^Svc svc {:keys [s fold constraint accept-language language-refset-ids] :as params}]
[^Svc svc {:keys [s constraint accept-language language-refset-ids] :as params}]
(let [lang-refset-ids (or (seq language-refset-ids)
(when accept-language (match-locale svc accept-language true))
(match-locale svc))]
(cond-> (assoc params :language-refset-ids lang-refset-ids)
fold
(assoc :s* (lang/fold (if (string? fold) fold (first lang-refset-ids)) s))
constraint ;; if there is a constraint, parse
(assoc :query (ecl/parse svc constraint)))))
;; if there is a string, normalize it
s (update :s #(lang/fold (first lang-refset-ids) %))
;; if there is a constraint, parse it into a Lucene query
constraint (assoc :query (ecl/parse svc constraint)))))

(s/fdef search
:args (s/cat :svc ::svc :params ::search-params)
Expand All @@ -565,7 +565,6 @@
| `:fuzzy` | fuzziness (0-2, default 0) |
| `:fallback-fuzzy` | if no results, try fuzzy search (0-2, default 0). |
| `:remove-duplicates?` | remove duplicate results (default, false) |
| `:fold` | fold term / use folded index? (bool/country code) |
| `:accept-language` | locales for preferred synonyms in results |
| `:language-refset-ids` | languages for preferred synonyms in results |
Expand All @@ -577,11 +576,6 @@
```
For autocompletion, it is recommended to use `fuzzy=0`, and `fallback-fuzzy=2`.
If `fold` is true, the search term will be normalised according to the
language preferences (accept-language or language-refset-ids) and the folded
index will be used in addition to the raw term index. To use an explicit
folding strategy, `fold` may also be a country code (e.g. \"sv\").
There are some lower-level search parameters available, but it is usually
more appropriate to use a SNOMED ECL constraint instead of these.
Expand Down Expand Up @@ -1502,3 +1496,20 @@
(crit/bench (extended-concept svc 24700007))
(crit/bench (search svc {:s "multiple sclerosis"})))

(defn ^:private analyse-diacritics
  "Walk the concepts streamed by `stream-all-concepts` (keeping only those with
  a truthy `:active`) and report English-folded synonym terms that a search
  does not resolve back to the originating concept.

  For each concept, every synonym term is folded with `(lang/fold \"en\" term)`.
  Folded forms that are not themselves one of the concept's original terms are
  then searched for. A folded term is counted as missing when `are-any?` is
  false in both directions between the concept ids of the search hits and this
  concept's id.

  Returns a map of:
  - `:n-concepts` - number of concepts with at least one folded term that
                    differs from its original terms
  - `:missing`    - total number of missing folded terms
  - `:results`    - vector of `{:concept-id id :missing terms}` for each concept
                    with at least one missing term

  Takes from the channel with a blocking `<!!` and finishes once the take
  returns nil. NOTE(review): presumably `stream-all-concepts` closes the channel
  when done - confirm, otherwise this never returns."
  [svc]
  (let [ch (a/chan 1 (filter :active))]
    (a/thread (stream-all-concepts svc ch))
    (loop [n-concepts 0, missing 0, results []]
      (if-let [concept (a/<!! ch)]
        (let [concept-id (:id concept)
              terms      (set (map :term (synonyms svc concept-id)))
              folded     (set (map #(lang/fold "en" %) terms))
              diff       (set/difference folded terms)
              ;; run the search once per term and reuse the hit ids for both checks
              found?     (fn [term]
                           (let [hit-ids (set (map :conceptId (search svc {:s term})))]
                             (or (are-any? svc hit-ids [concept-id])
                                 (are-any? svc [concept-id] hit-ids))))
              diff'      (remove found? diff)]
          (recur (if (seq diff) (inc n-concepts) n-concepts)
                 (+ missing (count diff'))
                 (if (seq diff') (conj results {:concept-id concept-id :missing diff'}) results)))
        {:n-concepts n-concepts :missing missing :results results}))))


47 changes: 17 additions & 30 deletions src/com/eldrix/hermes/impl/search.clj
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@
"Turn an extended description into a Lucene document."
[ed]
(let [doc (doto (Document.)
(.add (TextField. "term" (:term ed) Field$Store/YES))
(.add (StoredField. "term" ^String (:term ed)))
(.add (TextField. "nterm" (lang/fold (:languageCode ed) (:term ed)) Field$Store/NO))
(.add (DoubleDocValuesField. "length-boost" (/ 1.0 (Math/sqrt (count (:term ed)))))) ;; add a penalty for longer terms
(.add (LongPoint. "module-id" (long-array [(:moduleId ed)])))
Expand Down Expand Up @@ -270,31 +270,23 @@

(defn- make-search-query
^Query
[{:keys [s s* fuzzy show-fsn? inactive-concepts? inactive-descriptions? concept-refsets properties]
[{:keys [s fuzzy show-fsn? inactive-concepts? inactive-descriptions? concept-refsets properties]
:or {show-fsn? false, inactive-concepts? false, inactive-descriptions? true}}]
(let [query (cond-> (BooleanQuery$Builder.)
(and s s*)
(doto (.add (make-tokens-query "term" s fuzzy) BooleanClause$Occur/SHOULD)
(.add (make-tokens-query "nterm" s* fuzzy) BooleanClause$Occur/SHOULD)
(.setMinimumNumberShouldMatch 1))
s
(.add (make-tokens-query "nterm" s fuzzy) BooleanClause$Occur/MUST)

(and s (not s*))
(.add (make-tokens-query "term" s fuzzy) BooleanClause$Occur/MUST)
(not inactive-concepts?)
(.add (q-concept-active true) BooleanClause$Occur/FILTER)

(and s* (not s))
(.add (make-tokens-query "nterm" s* fuzzy) BooleanClause$Occur/MUST)
(not inactive-descriptions?)
(.add (q-description-active true) BooleanClause$Occur/FILTER)

(not inactive-concepts?)
(.add (q-concept-active true) BooleanClause$Occur/FILTER)
(not show-fsn?)
(.add (q-fsn) BooleanClause$Occur/MUST_NOT)

(not inactive-descriptions?)
(.add (q-description-active true) BooleanClause$Occur/FILTER)

(not show-fsn?)
(.add (q-fsn) BooleanClause$Occur/MUST_NOT)

(seq concept-refsets)
(.add (LongPoint/newSetQuery "concept-refsets" ^Collection concept-refsets) BooleanClause$Occur/FILTER))]
(seq concept-refsets)
(.add (LongPoint/newSetQuery "concept-refsets" ^Collection concept-refsets) BooleanClause$Occur/FILTER))]
(doseq [[k v] properties]
(let [^Collection vv (if (instance? Collection v) v [v])]
(.add query
Expand Down Expand Up @@ -370,8 +362,7 @@ items."
| keyword | description (default) |
|--------------------- |---------------------------------------------------|
| :s | search string to use for term |
| :s* | search string to use for normalised term |
| :s | search string; should be normalized |
| :max-hits | maximum hits (if omitted returns unlimited but |
| | *unsorted* results) |
| :language-refset-ids | ordered priority list of reference set ids |
Expand All @@ -392,10 +383,7 @@ items."
```
(do-search searcher {:s \"neurologist\" :properties {snomed/IsA [14679004]}})
```
A FSN is a fully-specified name and should generally be left out of search.
Normalization relates to text folding, in which characters with diacritics
that do not alter semantics are normalized. "
A FSN is a fully-specified name and should generally be left out of search. "
[^IndexSearcher searcher {:keys [max-hits language-refset-ids fuzzy fallback-fuzzy remove-duplicates?] :as params}]
(let [q1 (make-search-query params)
q2 (if-let [q (:query params)] (q-and [q1 q]) q1)
Expand All @@ -407,8 +395,7 @@ items."
(if remove-duplicates?
(remove-duplicates duplicate-result? results)
results)
(let [fuzzy (or fuzzy 0)
fallback (or fallback-fuzzy 0)]
(let [fuzzy (or fuzzy 0), fallback (or fallback-fuzzy 0)]
(when (and (zero? fuzzy) (pos? fallback)) ; only fallback to fuzzy search if no fuzziness requested first time
(do-search searcher (assoc params :fuzzy fallback)))))))

Expand Down Expand Up @@ -609,10 +596,10 @@ items."
(q-not (MatchAllDocsQuery.) (IntPoint/newRangeQuery field (int (inc maximum)) Integer/MAX_VALUE)))))

(defn q-term [s]
(make-tokens-query "term" s))
(make-tokens-query "nterm" s))

(defn q-wildcard [s]
(WildcardQuery. (Term. "term" ^String s)))
(WildcardQuery. (Term. "nterm" ^String s)))

(defn q-type
  "Create a query that exactly matches the given `type-id` against the
  \"type-id\" field, using `LongPoint/newExactQuery`."
  [type-id]
(LongPoint/newExactQuery "type-id" type-id))
Expand Down
17 changes: 3 additions & 14 deletions test/com/eldrix/hermes/core_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -120,21 +120,10 @@

(deftest ^:live test-term-folding
(testing "Search parameters"
(is (= "hjarta" (:s* (#'hermes/make-search-params *svc* {:s "hjärta" :fold "en"})))
;; make-search-params now folds the search text in place under :s; it no longer sets :s*
(is (= "hjarta" (:s (#'hermes/make-search-params *svc* {:s "hjärta" :accept-language "en"})))
"In English, a search against the folded index should fold ä")
(is (= "hjärta" (:s* (#'hermes/make-search-params *svc* {:s "hjärta" :fold "sv"})))
"In Swedish, a search against rhe folded index should not fold ä"))
(testing "Search results"
(let [no-fold-results (map :term (hermes/search *svc* {:s "Sjögren" :fold false :max-hits 50}))
fold-results (map :term (hermes/search *svc* {:s "Sjögren" :fold true :max-hits 50}))]
(is (every? #(str/includes? % "Sjögren") no-fold-results)
"With no folding, every result should include terms with diacritics")
(is (not (some #(str/includes? % "Sjogren") no-fold-results))
"With no folding, no result should include a term without diacritics")
(is (every? #(or (str/includes? % "Sjögren") (str/includes? % "Sjogren")) fold-results)
"With folding enabled, terms without diacritics should be returned as well")
(is (some #(str/includes? % "Sjogren") fold-results))
(is (some #(str/includes? % "Sjögren") fold-results)))))
(is (= "hjärta" (:s (#'hermes/make-search-params *svc* {:s "hjärta" :language-refset-ids [46011000052107]})))
"In Swedish, a search against the folded index should not fold ä")))

#_(deftest ^:live test-historical-assumptions
(let [counts (#'hermes/historical-association-counts *svc*)]
Expand Down

0 comments on commit 368ff49

Please sign in to comment.