Merge branch 'main' into statement_timeout

khaledk2 · Jul 25, 2024 · 6b2469b · 6b2469b
2 parents e572692 + 547bead
commit 6b2469b
Show file tree

Hide file tree

Showing 19 changed files with 1,257 additions and 147 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -48,6 +48,13 @@ jobs:
           python manage.py set_database_configuration -u localhost -s  ${{ job.services.postgres.ports[5432] }} -n postgress -p passwprd
           # configure elasticsearch
           python manage.py set_elasticsearch_configuration -e localhost:${{ job.services.elasticsearch.ports[9200] }}
+          # download and extract the database backup file           
+          wget https://downloads.openmicroscopy.org/images/omero_db_searchengine.zip -P app_data
+          unzip app_data/omero_db_searchengine.zip -d app_data/          
+          # restore the omero database
+          python manage.py restore_postgresql_database
+          # run indexing
+          python manage.py get_index_data_from_database -b False
           # run tests
           python -m unittest discover -s unit_tests
   upload:

diff --git a/app_data/test_index_data.json b/app_data/test_index_data.json
@@ -96,5 +96,27 @@
             "validation screen"
          ]
       ]
-   }
+   },
+    "query_in": {
+       "image": [
+         [
+          "Gene Symbol",
+          [
+             "Duoxa2",
+             "Bach2",
+             "Cxcr2",
+             "Mysm1"
+          ]
+          ],
+          [
+          "Organism",
+          [
+             "homo sapiens",
+             "mus musculus",
+             "mus musculus x mus spretus",
+             "human adenovirus 2"
+          ]
+       ]
+          ]
+    }
 }
diff --git a/configurations/app_config.yml b/configurations/app_config.yml
@@ -15,3 +15,4 @@ ELASTICSEARCH_BACKUP_FOLDER: "path/to/elasticsearch/backup/folder"
 verify_certs: False
 ELASTIC_PASSWORD: elasticsearch_user_password
 STATEMENT_TIMEOUT: 5000
+BASE_FOLDER: /etc/searchengine/
diff --git a/examples/search.py b/examples/search.py
@@ -88,5 +88,9 @@ def call_omero_return_results(url, data=None, method="post"):
         % (len(received_results), total_results, page, total_pages, bookmark)
     )
 
-# 2000 /11686633, page: 1/11687, bookmark: 109600
-# 2000 /12225067, page: 1/12226, bookmark:  109600
+# Another example: use the 'in' operator and send the items inside value as a string;
+# the list items are separated by ','.
+logging.info("Using in operator")
+url = "%s%s?key=Gene Symbol&value=Pdgfc,Rnase10&operator=in" % (base_url, image_search)
+bookmark, total_results, total_pages = call_omero_return_results(url, method="get")
+logging.info("%s,%s" % (total_results, total_pages))
diff --git a/examples/using_in_operator.py b/examples/using_in_operator.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright (C) 2024 University of Dundee & Open Microscopy Environment.
+# All rights reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+from utils import query_the_search_ending, logging
+
+# It is similar to using the 'in' operator in a SQL statement:
+# rather than having multiple 'or' conditions,
+# it uses only a single condition.
+
+# The following example will search for the images which have any of the 'Gene Symbol'
+# values in this list ["Duoxa2", "Bach2", "Cxcr2", "Mysm1"]
+
+# and filters
+
+logging.info("Example of using in operator")
+
+
+values_in = ["Duoxa2", "Bach2", "Cxcr2", "Mysm1"]
+logging.info("Searching for 'Gene Symbol' with values in [%s]" % (",".join(values_in)))
+and_filters = [{"name": "Gene Symbol", "value": values_in, "operator": "in"}]
+
+main_attributes = []
+query = {"and_filters": and_filters}
+#
+recieved_results_data = query_the_search_ending(query, main_attributes)
diff --git a/examples/using_not_in_operator.py b/examples/using_not_in_operator.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright (C) 2024 University of Dundee & Open Microscopy Environment.
+# All rights reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+from utils import query_the_search_ending, logging
+
+# It is similar to using the 'not in' operator in a SQL statement:
+# rather than having multiple conditions with not_equals operators,
+# it uses only a single condition.
+
+# The following example will search for the images whose 'Organism' value is not
+# any of the values in this list
+# ["homo sapiens","mus musculus","mus musculus x mus spretus","human adenovirus 2"]
+
+# and filters
+
+logging.info("Example of using not_in operator")
+
+
+values_not_in = [
+    "homo sapiens",
+    "mus musculus",
+    "mus musculus x mus spretus",
+    "human adenovirus 2",
+]
+logging.info("Searching for 'Organism' with values in [%s]" % (",".join(values_not_in)))
+and_filters = [{"name": "Organism", "value": values_not_in, "operator": "not_in"}]
+
+main_attributes = []
+query = {"and_filters": and_filters}
+#
+received_results_data = query_the_search_ending(query, main_attributes)
diff --git a/manage.py b/manage.py
@@ -114,13 +114,25 @@ def sql_results_to_panda():
     pass
 
 
+@manager.command
+def restore_postgresql_database():
+    from omero_search_engine.database.utils import restore_database
+
+    restore_database()
+
+
 @manager.command
 @manager.option(
     "-r",
     "--resource",
     help="resource name, creating all the indexes for all the resources is the default",  # noqa
 )
-def get_index_data_from_database(resource="all"):
+@manager.option(
+    "-b",
+    "--backup",
+    help="if True, backup will be called ",  # noqa
+)
+def get_index_data_from_database(resource="all", backup="True"):
     """
     insert data in Elasticsearch index for each resource
     It gets the data from postgres database server
@@ -132,7 +144,9 @@ def get_index_data_from_database(resource="all"):
         get_insert_data_to_index,
         save_key_value_buckets,
     )
+    import json
 
+    backup = json.loads(backup.lower())
     if resource != "all":
         sql_st = sqls_resources.get(resource)
         if not sql_st:
@@ -148,7 +162,8 @@ def get_index_data_from_database(resource="all"):
         test_indexing_search_query(deep_check=False, check_studies=True)
 
     # backup the index data
-    backup_elasticsearch_data()
+    if backup:
+        backup_elasticsearch_data()
 
 
 # set configurations
@@ -360,6 +375,44 @@ def restore_elasticsearch_data():
     restore_indices_data()
 
 
+@manager.command
+@manager.option("-s", "--screen_name", help="Screen name, or part of it")
+@manager.option("-p", "--project_name", help="Project name, or part of it")
+def data_validator(screen_name=None, project_name=None):
+    """
+    Checks key-value pairs for trailing and leading (heading) spaces.
+    It also checks for duplicated key-value pairs.
+    It can check all the projects and screens.
+    Also, it can run for a specific project or screen.
+    The output is a collection of CSV files; each check usually generates three files:
+    The main file contains image details (e.g. image id)
+    in addition to the key and the value.
+    one file for screens and one for projects.
+    Each file contains the screen name (project name),
+      the key-value pair which has the issue and the total number of affected
+      images for each row.
+    """
+    from datetime import datetime
+
+    if screen_name and project_name:
+        print("Either screen name or project name is allowed")
+
+    from omero_search_engine.validation.omero_keyvalue_data_validator import (
+        check_for_heading_space,
+        check_for_trailing_space,
+        check_duplicated_keyvalue_pairs,
+    )
+
+    start = datetime.now()
+    check_for_trailing_space(screen_name, project_name)
+    start1 = datetime.now()
+    check_for_heading_space(screen_name, project_name)
+    start2 = datetime.now()
+    check_duplicated_keyvalue_pairs(screen_name, project_name)
+    end = datetime.now()
+    print("start: %s, start1: %s, start2: %s, end: %s" % (start, start1, start2, end))
+
+
 @manager.command
 def test_container_key_value():
     from omero_search_engine.validation.results_validator import (
@@ -370,4 +423,7 @@ def test_container_key_value():
 
 
 if __name__ == "__main__":
+    from flask_script import Command
+
+    Command.capture_all_args = False
     manager.run()
diff --git a/omero_search_engine/api/v1/resources/query_handler.py b/omero_search_engine/api/v1/resources/query_handler.py
@@ -35,6 +35,9 @@
     "screen": {"name": "name", "description": "description"},
 }
 
+res_and_main_attributes = None
+res_or_main_attributes = None
+
 
 def check_get_names(idr_, resource, attribute, return_exact=False):
     # check the idr name and return the resource and possible values
@@ -107,8 +110,10 @@ def adjust_resource(self):
                     )
                     if len(ac_value) == 1:
                         self.value = ac_value[0]
-                    else:
+                    elif len(ac_value) == 0:
                         self.value = -1
+                    else:
+                        self.value = ac_value
                 """
                 pr_names = get_resource_names(self.resource)
                 if not self.value in pr_names:
@@ -337,6 +342,7 @@ def get_image_non_image_query(self):
 
     def run_query(self, query_, resource):
         main_attributes = {}
+
         query = {"and_filters": [], "or_filters": []}
 
         if query_.get("and_filters"):
@@ -398,6 +404,11 @@ def run_query(self, query_, resource):
         # res = search_query(query, resource, bookmark,
         #                    self.raw_elasticsearch_query,
         #                    main_attributes,return_containers=self.return_containers)
+        global res_and_main_attributes, res_or_main_attributes
+        if res_and_main_attributes:
+            main_attributes["and_main_attributes"] = (
+                main_attributes.get("and_main_attributes") + res_and_main_attributes
+            )
         if resource == "image" and self.return_containers:
             res = search_query(
                 query,
@@ -633,6 +644,12 @@ def determine_search_results_(query_, return_columns=False, return_containers=Fa
     and_filters = query_.get("query_details").get("and_filters")
     or_filters = query_.get("query_details").get("or_filters")
     and_query_groups = []
+    main_attributes = query_.get("main_attributes")
+    global res_and_main_attributes, res_or_main_attributes
+    if main_attributes:
+        res_and_main_attributes = main_attributes.get("and_main_attributes")
+        res_or_main_attributes = main_attributes.get("or_main_attributes")
+
     columns_def = query_.get("columns_def")
     or_query_groups = []
     if and_filters and len(and_filters) > 0:
@@ -785,9 +802,9 @@ def add_local_schemas_to(resolver, schema_folder, base_uri, schema_ext=".json"):
 
 
 def query_validator(query):
-    query_schema_file = (
-        "omero_search_engine/api/v1/resources/schemas/query_data.json"  # noqa
-    )
+    print("TRoz", query)
+    main_dir = os.path.abspath(os.path.dirname(__file__))
+    query_schema_file = os.path.join(main_dir, "schemas", "query_data.json")
     base_uri = "file:" + abspath("") + "/"
     with open(query_schema_file, "r") as schema_f:
         query_schema = json.loads(schema_f.read())

diff --git a/omero_search_engine/api/v1/resources/schemas/filter_schema.json b/omero_search_engine/api/v1/resources/schemas/filter_schema.json
@@ -13,12 +13,12 @@
     },
      "value": {
       "name":"value",
-      "type": "string"
+      "type": ["array", "string"]
     },
      "operator": {
       "name": "operator",
       "type": "string",
-      "enum": ["equals", "not_equals", "contains","not_contains"]
+      "enum": ["equals", "not_equals", "contains", "not_contains", "in", "not_in"]
     }
      ,"resource": {
       "name": "resource",

diff --git a/omero_search_engine/api/v1/resources/swagger_docs/search.yml b/omero_search_engine/api/v1/resources/swagger_docs/search.yml
@@ -28,7 +28,7 @@ parameters:
     description: operator, default equals
     in: query
     type: string
-    enum: ['equals', 'not_equals', 'contains', 'not_contains']
+    enum: ['equals', 'not_equals', 'contains', 'not_contains', 'in', 'not_in']
   - name: case_sensitive
     description: case sensitive query, default False
     in: query