multi-source search

khaledk2 · Sep 1, 2024 · 6c67326 · 6c67326
1 parent 4ea3357
commit 6c67326
Show file tree

Hide file tree

Showing 6 changed files with 94 additions and 18 deletions.
diff --git a/omero_search_engine/api/v1/resources/resource_analyser.py b/omero_search_engine/api/v1/resources/resource_analyser.py
@@ -379,6 +379,7 @@ def prepare_search_results(results, size=0):
             continue
         row = {}
         returned_results.append(row)
+        row["Data Source"]=res["data_source"]
         row["Key"] = res["Attribute"]
         row["Value"] = res["Value"]
         row["Number of %ss" % resource] = res.get("items_in_the_bucket")
@@ -474,7 +475,7 @@ def get_key_values_return_contents(name, resource, data_source, csv):
 
 
 def query_cashed_bucket_part_value_keys(
-    name, value, resource, es_index="key_value_buckets_information"
+    name, value, data_source, resource, es_index="key_value_buckets_information"
 ):
     """
     Search for and obtain the available values for an attribute and part of the
@@ -486,9 +487,14 @@ def query_cashed_bucket_part_value_keys(
     if name:
         name = name.strip()
     value = adjust_value(value)
+    if data_source and data_source.strip() and data_source.lower() != "all":
+        data_source = [itm.strip().lower() for itm in data_source.split(',')]
+    else:
+        data_source = get_data_sources()
+
     if resource != "all":
         query = key_part_values_buckets_template.substitute(
-            name=name, value=value, resource=resource
+            name=name, value=value, resource=resource, data_source=json.dumps(data_source)
         )
         res = search_index_for_values_get_all_buckets(es_index, query)
         returned_results = prepare_search_results_buckets(res)
@@ -501,7 +507,7 @@ def query_cashed_bucket_part_value_keys(
             if table == "image1":
                 continue
             query = key_part_values_buckets_template.substitute(
-                name=name, value=value, resource=table
+                name=name, value=value, resource=table, data_source=json.dumps(data_source)
             )
             res = search_index_for_values_get_all_buckets(es_index, query)
             returned_results[table] = prepare_search_results_buckets(res)
@@ -512,6 +518,11 @@ def query_cashed_bucket(
     name, resource, data_source, es_index="key_value_buckets_information"
 ):
     # returns possible matches for a specific resource
+    if data_source and data_source.strip() and data_source.lower() != "all":
+        data_source = [itm.strip().lower() for itm in data_source.split(',')]
+    else:
+        data_source =get_data_sources()
+
     if name:
         name = name.strip()
     if resource != "all":
@@ -527,7 +538,7 @@ def query_cashed_bucket(
         returned_results = {}
         for table in resource_elasticsearchindex:
             query = key_values_buckets_template.substitute(
-                name=name, resource=table, data_source=json.dumps([data_source])
+                name=name, resource=table, data_source=json.dumps(data_source)
             )
             res = search_index_for_values_get_all_buckets(es_index, query)
             returned_results[table] = prepare_search_results_buckets(res)
@@ -549,6 +560,11 @@ def search_value_for_resource(
     """
     value = adjust_value(value)
 
+    if data_source and data_source.lower() != "all":
+        data_source = [itm.strip().lower() for itm in data_source.split(',')]
+    else:
+        data_source=get_data_sources()
+
     if table_ != "all":
         query = resource_key_values_buckets_template.substitute(
             value=value, resource=table_, data_source=json.dumps(data_source)
@@ -606,16 +622,53 @@ def search_value_for_resource(
 """
 key_part_values_buckets_template = Template(
     """
-{"query":{"bool":{"must":[{"bool":{
-"must":[{"match":{"Attribute.keyrnamenormalize":"$name"}},
-{"wildcard":{"Value.keyvaluenormalize":"*$value*"}}
-]
-}},{
-"bool": {"must": [
-{"match":{"resource.keyresource": "$resource"}}
-]}}]}}}"""
+{
+   "query":{
+      "bool":{
+         "must":[
+            {
+               "bool":{
+                  "must":[
+                     {
+                        "match":{
+                           "Attribute.keyrnamenormalize":"$name"
+                        }
+                     },
+                     {
+                        "wildcard":{
+                           "Value.keyvaluenormalize":"*$value*"
+                        }
+                     }
+                  ]
+               }
+            },
+            {
+               "bool":{
+                  "must":[
+                     {
+                        "match":{
+                           "resource.keyresource":"$resource"
+                        }
+                     }
+                  ]
+               }
+            },
+            {
+               "bool":{
+                  "must":{
+                     "terms":{
+                        "data_source.keyvalue":$data_source
+                     }
+                  }
+               }
+            }
+         ]
+      }
+   }
+}"""
 )
 
+
 # "fields": ["Attribute","Value","items_in_the_bucket",
 # "total_items_in_saved_buckets","total_buckets","total_items"],
 # "_source": false,

diff --git a/omero_search_engine/api/v1/resources/swagger_docs/search_for_any_value.yml b/omero_search_engine/api/v1/resources/swagger_docs/search_for_any_value.yml
@@ -24,6 +24,12 @@ parameters:
     in: query
     type: string
     required: false
+  - name: data_source
+    in: query
+    type: string
+    required: false
+    description: Optional comma-separated list of data source names. If provided, results are restricted to those data sources; if omitted or set to "all", results from all data sources are returned.
+
 definitions:
  data:
     type: object

diff --git a/omero_search_engine/api/v1/resources/swagger_docs/searchbyvalue.yml b/omero_search_engine/api/v1/resources/swagger_docs/searchbyvalue.yml
@@ -32,6 +32,11 @@ parameters:
     description: bookmark is used to call the next page if the number of results is bigger than 1000; it is returned with each result page.
     in: query
     type: integer
+  - name: data_source
+    in: query
+    type: string
+    required: false
+    description: If provided, the search results are restricted to the specified data source; otherwise, results from all data sources are returned.
   - name: return_containers
     in: query
     type: boolean

diff --git a/omero_search_engine/api/v1/resources/swagger_docs/searchvaluesusingkey.yml b/omero_search_engine/api/v1/resources/swagger_docs/searchvaluesusingkey.yml
@@ -47,6 +47,16 @@ parameters:
     in: query
     type: string
     required: true
+  - name: data_source
+    in: query
+    type: string
+    required: false
+    description: If it is provided, it will return the search results for a specific data source, otherwise it will return the results from all the data sources
+  - name: data_source
+    in: query
+    type: string
+    required: false
+    description: If it is provided, it will return the search results for a specific data source, otherwise it will return the results from all the data sources
   - name: csv
     description: a flag to return a CSV file which is created on the fly instead of JSON
     in: query

diff --git a/omero_search_engine/api/v1/resources/urls.py b/omero_search_engine/api/v1/resources/urls.py
@@ -189,6 +189,7 @@ def get_values_using_value(resource_table):
     file: swagger_docs/search_for_any_value.yml
     """
     value = request.args.get("value")
+    data_source = request.args.get("data_source")
     if not value:
         return jsonify(
             build_error_message("Error: {error}".format(error="No value is provided "))
@@ -206,7 +207,7 @@ def get_values_using_value(resource_table):
     if key:
         # If the key is provided it will restrict the search to the provided key.
 
-        return query_cashed_bucket_part_value_keys(key, value, resource_table)
+        return query_cashed_bucket_part_value_keys(key, value,data_source, resource_table)
     bookmark = request.args.get("bookmark")
     if bookmark:
         bookmark = bookmark.split(",")
@@ -237,7 +238,7 @@ def get_values_using_value(resource_table):
                     )
                 )
             )
-    return jsonify(search_value_for_resource(resource_table, value, bookmark))
+    return jsonify(search_value_for_resource(resource_table, value, data_source, bookmark))
 
 
 @resources.route("/<resource_table>/searchvaluesusingkey/", methods=["GET"])
@@ -253,13 +254,14 @@ def search_values_for_a_key(resource_table):
     # default is false
     # if it sets to true, a CSV file content will be sent instead of dict
     csv = request.args.get("csv")
+    data_source = request.args.get("data_source")
     if csv:
         try:
             csv = json.loads(csv.lower())
         except Exception:
             csv = False
 
-    return get_key_values_return_contents(key, resource_table, csv)
+    return get_key_values_return_contents(key, resource_table,data_source, csv)
 
 
 # getannotationkeys==> keys

diff --git a/omero_search_engine/validation/results_validator.py b/omero_search_engine/validation/results_validator.py
@@ -295,13 +295,13 @@ def get_results_searchengine(self, operator=None):
         if self.type == "buckets":
             if self.name:
                 res = get_key_values_return_contents(
-                    self.name, "image", data_source=[self.data_source], csv=False
+                    self.name, "image", data_source=self.data_source, csv=False
                 )
                 self.searchengine_results = json.loads(res.data)
             elif self.value:
 
                 self.searchengine_results = search_value_for_resource(
-                    "image", self.value, [self.data_source]
+                    "image", self.value, self.data_source
                 )
             return
 
@@ -892,7 +892,7 @@ def get_omero_stats():
             for name in names:
                 if name == "name":
                     continue
-                returned_results = query_cashed_bucket(name, resource, [data_source])
+                returned_results = query_cashed_bucket(name, resource, data_source)
                 if resource == "image":
                     data.append(
                         "%s, %s, %s,%s,%s"