diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index e45791cb..e82dd7ea 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -48,6 +48,13 @@ jobs: python manage.py set_database_configuration -u localhost -s ${{ job.services.postgres.ports[5432] }} -n postgress -p passwprd # configure elasticsearch python manage.py set_elasticsearch_configuration -e localhost:${{ job.services.elasticsearch.ports[9200] }} + # download and extract the database backup file + wget https://downloads.openmicroscopy.org/images/omero_db_searchengine.zip -P app_data + unzip app_data/omero_db_searchengine.zip -d app_data/ + # run restore omero database + python manage.py restore_postgresql_database + # run indexing indexing + python manage.py get_index_data_from_database -b False # run tests python -m unittest discover -s unit_tests upload: @@ -85,6 +92,6 @@ jobs: uses: docker/build-push-action@v2 with: context: . - file: deployment/docker/centos/Dockerfile + file: deployment/docker/rockylinux/Dockerfile push: true tags: ${{ join(fromJson(steps.gettags.outputs.tags)) }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a8622a84..6e124301 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,12 +1,12 @@ --- repos: - repo: https://github.com/psf/black - rev: 23.1.0 + rev: 24.4.2 hooks: - id: black args: [--target-version=py35] - repo: https://github.com/PyCQA/flake8 - rev: 6.0.0 + rev: 7.1.0 hooks: - id: flake8 args: [ diff --git a/CHANGELOG.md b/CHANGELOG.md index adf65a54..90eb21b8 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,11 @@ +0.5.4 (March 2024): +-------------------- +- Support Rocky Linux 9 [#93](https://github.com/ome/omero_search_engine/pull/93) + 0.5.3 (September 2023): ----------------------- - Secure the connection with elasticsearch [#92](https://github.com/ome/omero_search_engine/pull/92) - 0.5.2 (June 2023): ------------------ - Return all the available values for a key in a container [#77](https://github.com/ome/omero_search_engine/pull/77) diff --git a/app_data/test_index_data.json b/app_data/test_index_data.json index 658d43a4..d9ba7f9b 100644 --- a/app_data/test_index_data.json +++ b/app_data/test_index_data.json @@ -96,5 +96,27 @@ "validation screen" ] ] - } + }, + "query_in": { + "image": [ + [ + "Gene Symbol", + [ + "Duoxa2", + "Bach2", + "Cxcr2", + "Mysm1" + ] + ], + [ + "Organism", + [ + "homo sapiens", + "mus musculus", + "mus musculus x mus spretus", + "human adenovirus 2" + ] + ] + ] + } } diff --git a/configurations/app_config.yml b/configurations/app_config.yml index c7d52eb2..e1f9ad9f 100644 --- a/configurations/app_config.yml +++ b/configurations/app_config.yml @@ -16,3 +16,4 @@ verify_certs: False ELASTIC_PASSWORD: elasticsearch_user_password SEARCHENGINE_LOGS_FOLDER: path/to/search/logs/folder SEARCHENGINE_ADMIN_PASSWD: "hashed_admin_password" +BASE_FOLDER: /etc/searchengine/ \ No newline at end of file diff --git a/deployment/docker/rockylinux/.dockerignore b/deployment/docker/rockylinux/.dockerignore new file mode 100644 index 00000000..e2bec014 --- /dev/null +++ b/deployment/docker/rockylinux/.dockerignore @@ -0,0 +1,3 @@ +Dockerfile +.git +.gitignore diff --git a/deployment/docker/rockylinux/Dockerfile b/deployment/docker/rockylinux/Dockerfile new file mode 100644 index 00000000..ef577694 --- /dev/null +++ b/deployment/docker/rockylinux/Dockerfile @@ -0,0 +1,23 @@ +#docker build . -t searchengine +# docker build . 
-f deployment/docker/rockylinux/Dockerfile -t searchengine
+FROM rockylinux/rockylinux:9.0
+USER root
+RUN dnf update -y
+RUN dnf groupinstall "Development Tools" -y
+RUN dnf install libpq-devel -y
+RUN dnf install python3-pip -y
+RUN dnf install -y python3-devel.x86_64
+RUN dnf clean all && rm -rf /var/cache/yum
+RUN mkdir /searchengine
+ADD deployment/docker/rockylinux/start_gunicorn_serch_engine.sh /searchengine
+ADD deployment/docker/rockylinux/run_app.sh /searchengine
+ADD . /searchengine
+RUN cd /searchengine
+RUN mkdir /etc/searchengine
+RUN mkdir /etc/searchengine/chachedata
+RUN mkdir /etc/searchengine/logs
+WORKDIR /searchengine
+RUN pip3 install -r requirements.txt
+RUN pip3 install gunicorn
+EXPOSE 5577
+ENTRYPOINT ["bash", "run_app.sh"]
diff --git a/deployment/docker/rockylinux/run_app.sh b/deployment/docker/rockylinux/run_app.sh
new file mode 100644
index 00000000..a1cac35a
--- /dev/null
+++ b/deployment/docker/rockylinux/run_app.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+echo "$@"
+
+# If the configuration file does not exist, copy it from the app configuration folder
+test -f /etc/searchengine/.app_config.yml || cp /searchengine/configurations/app_config.yml /etc/searchengine/.app_config.yml
+
+# Check the script input
+if [[ $@ == run_app* ]] ; then
+  url_prefix=${@/run_app/}
+  echo "using prefix: $url_prefix"
+  bash start_gunicorn_serch_engine.sh $url_prefix
+elif [ -z "$@" ] || [ "$@" = "run_app" ]; then
+  echo "Starting the app"
+  bash start_gunicorn_serch_engine.sh
+else
+  echo "$@"
+  python3 manage.py "$@"
+fi
diff --git a/deployment/docker/rockylinux/start_gunicorn_serch_engine.sh b/deployment/docker/rockylinux/start_gunicorn_serch_engine.sh
new file mode 100644
index 00000000..9b8fd400
--- /dev/null
+++ b/deployment/docker/rockylinux/start_gunicorn_serch_engine.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+NAME="omero_search_engine"
+USER=root
+APPPATH=/searchengine
+SOCKFILE=/etc/searchengine/sock3  # change this to project_dir/sock (a new file will be created)
+echo "Starting $NAME as `whoami`"
+export PATH="$APPPATH:$PATH"
+echo "starting the app"
+# Create the run directory if it doesn't exist
+RUNDIR=$(dirname $SOCKFILE)
+echo "$RUNDIR"
+test -d $RUNDIR || mkdir -p $RUNDIR
+LOGS=/etc/searchengine/logs
+LOGSDIR=$(dirname $LOGS)
+test -d $LOGSDIR || mkdir -p $LOGSDIR
+user=$USER
+echo "Start Gunicorn ...."
+echo "$HOME" +echo pwd +cd $APPPATH +if [ -z "$@" ]; then + exec gunicorn "omero_search_engine:create_app('production')" -b 0.0.0.0:5577 --timeout 0 --name "$NAME" --bind=unix:$SOCKFILE --log-file=$LOGSDIR/logs/engine_gunilog.log --access-logfile=$LOGSDIR/logs/engine_access.log -error-logfile=$LOGSDIR/logs/engine_logs/engine_error.log --workers 4 +else + echo Run with SCRIPT_NAME=$@ + SCRIPT_NAME=/"$@"/ exec gunicorn "omero_search_engine:create_app('production')" -b 0.0.0.0:5577 --timeout 0 --name "$NAME" --bind=unix:$SOCKFILE --log-file=$LOGSDIR/logs/engine_gunilog.log --access-logfile=$LOGSDIR/logs/engine_access.log -error-logfile=$LOGSDIR/logs/engine_logs/engine_error.log --workers 4 +fi diff --git a/examples/search.py b/examples/search.py index fedd9fe6..df5b2a24 100644 --- a/examples/search.py +++ b/examples/search.py @@ -88,5 +88,9 @@ def call_omero_return_results(url, data=None, method="post"): % (len(received_results), total_results, page, total_pages, bookmark) ) -# 2000 /11686633, page: 1/11687, bookmark: 109600 -# 2000 /12225067, page: 1/12226, bookmark: 109600 +# another example using in operators and send items inside value as a string, +# The List items are separated by ',' +logging.info("Using in operator") +url = "%s%s?key=Gene Symbol&value=Pdgfc,Rnase10&operator=in" % (base_url, image_search) +bookmark, total_results, total_pages = call_omero_return_results(url, method="get") +logging.info("%s,%s" % (total_results, total_pages)) diff --git a/examples/using_in_operator.py b/examples/using_in_operator.py new file mode 100644 index 00000000..173222d1 --- /dev/null +++ b/examples/using_in_operator.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright (C) 2024 University of Dundee & Open Microscopy Environment. +# All rights reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + + +from utils import query_the_search_ending, logging + +# It is similar to use the 'in' operator in a sql statement, +# rather than having multiple 'or' conditions, +# it will only use a single condition. + +# The following example will search for the images which have any of the 'Gene Symbol' +# values in this list ["Duoxa2", "Bach2", "Cxcr2", "Mysm1"] + +# and filters + +logging.info("Example of using in operator") + + +values_in = ["Duoxa2", "Bach2", "Cxcr2", "Mysm1"] +logging.info("Searching for 'Gene Symbol' with values in [%s]" % (",".join(values_in))) +and_filters = [{"name": "Gene Symbol", "value": values_in, "operator": "in"}] + +main_attributes = [] +query = {"and_filters": and_filters} +# +recieved_results_data = query_the_search_ending(query, main_attributes) diff --git a/examples/using_not_in_operator.py b/examples/using_not_in_operator.py new file mode 100644 index 00000000..1765a051 --- /dev/null +++ b/examples/using_not_in_operator.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright (C) 2024 University of Dundee & Open Microscopy Environment. 
diff --git a/examples/using_not_in_operator.py b/examples/using_not_in_operator.py
new file mode 100644
index 00000000..1765a051
--- /dev/null
+++ b/examples/using_not_in_operator.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright (C) 2024 University of Dundee & Open Microscopy Environment.
+# All rights reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+from utils import query_the_search_ending, logging
+
+# This is similar to using the 'not in' operator in an SQL statement:
+# rather than having multiple 'or' conditions with not_equals operators,
+# it uses a single condition.
+
+# The following example will search for the images whose 'Organism' value is
+# not in this list
+# ["homo sapiens","mus musculus","mus musculus x mus spretus","human adenovirus 2"]
+
+# and filters
+
+logging.info("Example of using not_in operator")
+
+
+values_not_in = [
+    "homo sapiens",
+    "mus musculus",
+    "mus musculus x mus spretus",
+    "human adenovirus 2",
+]
+logging.info("Searching for 'Organism' with values not in [%s]" % ",".join(values_not_in))
+and_filters = [{"name": "Organism", "value": values_not_in, "operator": "not_in"}]
+
+main_attributes = []
+query = {"and_filters": and_filters}
+#
+received_results_data = query_the_search_ending(query, main_attributes)
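Both example scripts pass the values as a Python list; per the filter_schema.json change later in this diff, the 'value' of an in/not_in filter may equally be a single comma-separated string, which elasticsearch_query_builder in utils.py splits on ',' and strips item by item. A short sketch of the two equivalent forms:

```python
# Two equivalent forms of an "in" filter under the extended schema:
# "value" as a JSON array, or as one string whose items are separated
# by ',' (the query builder splits the string and strips each item).
filter_as_list = {
    "name": "Gene Symbol",
    "value": ["Duoxa2", "Bach2", "Cxcr2", "Mysm1"],
    "operator": "in",
    "resource": "image",
}
filter_as_string = {
    "name": "Gene Symbol",
    "value": "Duoxa2, Bach2, Cxcr2, Mysm1",  # whitespace around items is stripped
    "operator": "in",
    "resource": "image",
}
query_details = {"and_filters": [filter_as_list], "or_filters": []}
```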
diff --git a/manage.py b/manage.py
index 76be9e9c..6c982709 100644
--- a/manage.py
+++ b/manage.py
@@ -114,13 +114,25 @@ def sql_results_to_panda():
     pass


+@manager.command
+def restore_postgresql_database():
+    from omero_search_engine.database.utils import restore_database
+
+    restore_database()
+
+
 @manager.command
 @manager.option(
     "-r",
     "--resource",
     help="resource name, creating all the indexes for all the resources is the default",  # noqa
 )
-def get_index_data_from_database(resource="all"):
+@manager.option(
+    "-b",
+    "--backup",
+    help="if True (the default), backup the Elasticsearch data after indexing",  # noqa
+)
+def get_index_data_from_database(resource="all", backup="True"):
     """
     insert data in Elasticsearch index for each resource
     It gets the data from postgres database server
@@ -132,7 +144,9 @@
         get_insert_data_to_index,
         save_key_value_buckets,
     )
+    import json

+    backup = json.loads(backup.lower())
     if resource != "all":
         sql_st = sqls_resources.get(resource)
         if not sql_st:
@@ -148,7 +162,8 @@
     test_indexing_search_query(deep_check=False, check_studies=True)

     # backup the index data
-    backup_elasticsearch_data()
+    if backup:
+        backup_elasticsearch_data()


 # set configurations
@@ -351,6 +366,44 @@
     restore_indices_data()


+@manager.command
+@manager.option("-s", "--screen_name", help="Screen name, or part of it")
+@manager.option("-p", "--project_name", help="Project name, or part of it")
+def data_validator(screen_name=None, project_name=None):
+    """
+    Check key-value pairs for trailing and heading spaces.
+    It also checks for duplicated key-value pairs.
+    It can check all the projects and screens.
+    Also, it can run for a specific project or screen.
+    The output is a collection of CSV files; each check usually generates three files:
+    The main file contains image details (e.g. image id)
+    in addition to the key and the value.
+    One file for screens and one for projects.
+    Each file contains the screen name (project name),
+    the key-value pair which has the issue and the total number of affected
+    images for each row.
+    """
+    from datetime import datetime
+
+    if screen_name and project_name:
+        print("Either a screen name or a project name is allowed, not both")
+        return
+    from omero_search_engine.validation.omero_keyvalue_data_validator import (
+        check_for_heading_space,
+        check_for_trailing_space,
+        check_duplicated_keyvalue_pairs,
+    )
+
+    start = datetime.now()
+    check_for_trailing_space(screen_name, project_name)
+    start1 = datetime.now()
+    check_for_heading_space(screen_name, project_name)
+    start2 = datetime.now()
+    check_duplicated_keyvalue_pairs(screen_name, project_name)
+    end = datetime.now()
+    print("start: %s, start1: %s, start2: %s, end: %s" % (start, start1, start2, end))
+
+
 @manager.command
 def test_container_key_value():
     from omero_search_engine.validation.results_validator import (
@@ -404,4 +457,7 @@ def set_logs_folder(logs_folder=None):


 if __name__ == "__main__":
+    from flask_script import Command
+
+    Command.capture_all_args = False
     manager.run()
diff --git a/omero_search_engine/api/v1/resources/query_handler.py b/omero_search_engine/api/v1/resources/query_handler.py
index 822b6b7f..18d4d878 100644
--- a/omero_search_engine/api/v1/resources/query_handler.py
+++ b/omero_search_engine/api/v1/resources/query_handler.py
@@ -35,6 +35,9 @@
     "screen": {"name": "name", "description": "description"},
 }

+res_and_main_attributes = None
+res_or_main_attributes = None
+

 def check_get_names(idr_, resource, attribute, return_exact=False):
     # check the idr name and return the resource and possible values
@@ -107,8 +110,10 @@ def adjust_resource(self):
             )
             if len(ac_value) == 1:
                 self.value = ac_value[0]
-            else:
+            elif len(ac_value) == 0:
                 self.value = -1
+            else:
+                self.value = ac_value
             """
             pr_names = get_resource_names(self.resource)
             if not self.value in pr_names:
@@ -337,6 +342,7 @@ def get_image_non_image_query(self):

     def run_query(self, query_, resource):
         main_attributes = {}
+
         query = {"and_filters": [], "or_filters": []}
         if query_.get("and_filters"):
@@ -366,7 +372,7 @@
             for qu in qu_items:
                 if not qu:
                     continue
-                if type(qu) != list:
+                if not isinstance(qu, list):
                     ss.append(qu.__dict__)
                 else:
                     bb = []
@@ -398,6 +404,11 @@
         # res = search_query(query, resource, bookmark,
         # self.raw_elasticsearch_query,
         # main_attributes,return_containers=self.return_containers)
+        global res_and_main_attributes, res_or_main_attributes
+        if res_and_main_attributes:
+            main_attributes["and_main_attributes"] = (
+                main_attributes.get("and_main_attributes") + res_and_main_attributes
+            )
         if resource == "image" and self.return_containers:
             res = search_query(
                 query,
@@ -633,6 +644,12 @@ def determine_search_results_(query_, return_columns=False, return_containers=Fa
     and_filters = query_.get("query_details").get("and_filters")
     or_filters = query_.get("query_details").get("or_filters")
     and_query_groups = []
+    main_attributes = query_.get("main_attributes")
+    global res_and_main_attributes, res_or_main_attributes
+    if main_attributes:
+        res_and_main_attributes = main_attributes.get("and_main_attributes")
+        res_or_main_attributes = main_attributes.get("or_main_attributes")
+
     columns_def = query_.get("columns_def")
     or_query_groups = []
     if and_filters and len(and_filters) > 0:
@@ -785,9 +802,9 @@ def add_local_schemas_to(resolver, 
schema_folder, base_uri, schema_ext=".json"): def query_validator(query): - query_schema_file = ( - "omero_search_engine/api/v1/resources/schemas/query_data.json" # noqa - ) + print("TRoz", query) + main_dir = os.path.abspath(os.path.dirname(__file__)) + query_schema_file = os.path.join(main_dir, "schemas", "query_data.json") base_uri = "file:" + abspath("") + "/" with open(query_schema_file, "r") as schema_f: query_schema = json.loads(schema_f.read()) diff --git a/omero_search_engine/api/v1/resources/schemas/filter_schema.json b/omero_search_engine/api/v1/resources/schemas/filter_schema.json index 3f0df36a..611388ba 100644 --- a/omero_search_engine/api/v1/resources/schemas/filter_schema.json +++ b/omero_search_engine/api/v1/resources/schemas/filter_schema.json @@ -13,12 +13,12 @@ }, "value": { "name":"value", - "type": "string" + "type": ["array", "string"] }, "operator": { "name": "operator", "type": "string", - "enum": ["equals", "not_equals", "contains","not_contains"] + "enum": ["equals", "not_equals", "contains", "not_contains", "in", "not_in"] } ,"resource": { "name": "resource", diff --git a/omero_search_engine/api/v1/resources/swagger_docs/search.yml b/omero_search_engine/api/v1/resources/swagger_docs/search.yml index fdb70676..ad3c861c 100644 --- a/omero_search_engine/api/v1/resources/swagger_docs/search.yml +++ b/omero_search_engine/api/v1/resources/swagger_docs/search.yml @@ -28,7 +28,7 @@ parameters: description: operator, default equals in: query type: string - enum: ['equals', 'not_equals', 'contains', 'not_contains'] + enum: ['equals', 'not_equals', 'contains', 'not_contains', 'in', 'not_in'] - name: case_sensitive description: case sensitive query, default False in: query diff --git a/omero_search_engine/api/v1/resources/utils.py b/omero_search_engine/api/v1/resources/utils.py index 41d7d889..2af31430 100644 --- a/omero_search_engine/api/v1/resources/utils.py +++ b/omero_search_engine/api/v1/resources/utils.py @@ -113,10 +113,31 @@ def get_resource_annotation_table(resource_table): """ {"match": {"key_values.value.keyvaluenormalize":"$value"}}""" ) + +# in operator +case_insensitive_must_in_value_condition_template = Template( + """ +{"terms": {"key_values.value.keyvaluenormalize":$value}}""" +) + case_sensitive_must_value_condition_template = Template( """ {"match": {"key_values.value.keyvalue":"$value"}}""" ) + +nested_query_template_must_must_not = Template( + """ +{"nested": {"path": "key_values", +"query":{"bool": {"must":[$must_part], "must_not":[$must_not_part]}}}}""" +) + +# in opeartor +case_sensitive_must_in_value_condition_template = Template( + """ +{"terms": {"key_values.value.keyvalue":$value}}""" +) + + nested_keyvalue_pair_query_template = Template( """ {"nested": {"path": "key_values", @@ -172,6 +193,8 @@ def get_resource_annotation_table(resource_table): {"name.keyvalue": "$idr"}}}}]}}} """ ) +operators_required_list_data_type = ["in", "not_in"] + def build_error_message(error): """ @@ -291,8 +314,20 @@ def elasticsearch_query_builder( search_omero_app.logger.info("FILTER %s" % filter) try: key = filter["name"].strip() - value = filter["value"].strip() operator = filter["operator"].strip() + if operator in operators_required_list_data_type: + if isinstance(filter["value"], list): + value_ = filter["value"] + else: + # in case of providing it with single query, the values should + # be provided as a string separated the array items by ',' + value_ = filter["value"].split(",") + value = [val.strip() for val in value_] + value = json.dumps(value) + + else: + 
value = filter["value"].strip() + except Exception as e: search_omero_app.logger.info(str(e)) return build_error_message( @@ -333,6 +368,61 @@ def elasticsearch_query_builder( nested=",".join(_nested_must_part) ) ) + + if operator == "in": + if case_sensitive: + _nested_must_part.append( + case_sensitive_must_in_value_condition_template.substitute( # noqa + value=value + ) + ) + _nested_must_part.append( + case_sensitive_must_name_condition_template.substitute(name=key) + ) # noqa + + else: + _nested_must_part.append( + case_insensitive_must_in_value_condition_template.substitute( # noqa + value=value + ) + ) + _nested_must_part.append( + case_insensitive_must_name_condition_template.substitute( # noqa + name=key + ) + ) + + nested_must_part.append( + nested_keyvalue_pair_query_template.substitute( + nested=",".join(_nested_must_part) + ) + ) + + if operator == "not_in": + if case_sensitive: + nested_must_part.append( + nested_query_template_must_must_not.substitute( + must_not_part=case_sensitive_must_in_value_condition_template.substitute( # noqa + value=value + ), + must_part=case_sensitive_must_name_condition_template.substitute( # noqa + name=key + ), + ) + ) + + else: + nested_must_part.append( + nested_query_template_must_must_not.substitute( + must_not_part=case_insensitive_must_in_value_condition_template.substitute( # noqa + value=value + ), + must_part=case_insensitive_must_name_condition_template.substitute( # noqa + name=key + ), + ) + ) + if operator == "contains": value = "*{value}*".format(value=adjust_value(value)) # _nested_must_part.append(must_name_condition_template.substitute(name=key)) # noqa @@ -371,64 +461,50 @@ def elasticsearch_query_builder( value = "*{value}*".format(value=adjust_value(value)) if case_sensitive: nested_must_part.append( - nested_keyvalue_pair_query_template.substitute( - nested=case_sensitive_must_name_condition_template.substitute( # noqa - name=key - ) - ) - ) - nested_must_not_part.append( - nested_keyvalue_pair_query_template.substitute( - nested=case_sensitive_wildcard_value_condition_template.substitute( # noqa + nested_query_template_must_must_not.substitute( + must_not_part=case_sensitive_wildcard_value_condition_template.substitute( # noqa wild_card_value=value - ) + ), + must_part=case_sensitive_must_name_condition_template.substitute( # noqa + name=key + ), ) ) + else: nested_must_part.append( - nested_keyvalue_pair_query_template.substitute( - nested=case_insensitive_must_name_condition_template.substitute( # noqa - name=key - ) - ) - ) - nested_must_not_part.append( - nested_keyvalue_pair_query_template.substitute( - nested=case_insensitive_wildcard_value_condition_template.substitute( # noqa + nested_query_template_must_must_not.substitute( + must_not_part=case_insensitive_wildcard_value_condition_template.substitute( # noqa wild_card_value=value - ) + ), + must_part=case_insensitive_must_name_condition_template.substitute( # noqa + name=key + ), ) ) else: if case_sensitive: nested_must_part.append( - nested_keyvalue_pair_query_template.substitute( - nested=case_sensitive_must_name_condition_template.substitute( # noqa - name=key - ) - ) - ) - nested_must_not_part.append( - nested_keyvalue_pair_query_template.substitute( - nested=case_sensitive_must_value_condition_template.substitute( # noqa + nested_query_template_must_must_not.substitute( + must_not_part=case_sensitive_must_value_condition_template.substitute( # noqa value=value - ) + ), + must_part=case_sensitive_must_name_condition_template.substitute( # noqa + name=key + ), 
) ) + else: nested_must_part.append( - nested_keyvalue_pair_query_template.substitute( - nested=case_insensitive_must_name_condition_template.substitute( # noqa - name=key - ) - ) - ) - nested_must_not_part.append( - nested_keyvalue_pair_query_template.substitute( - nested=case_insensitive_must_value_condition_template.substitute( # noqa + nested_query_template_must_must_not.substitute( + must_not_part=case_insensitive_must_value_condition_template.substitute( # noqa value=value - ) + ), + must_part=case_insensitive_must_name_condition_template.substitute( # noqa + name=key + ), ) ) @@ -618,7 +694,6 @@ def elasticsearch_query_builder( ff = nested_query_template_must_not.substitute(must_not_value=ss) should_part_list_or.append(ff) all_terms = "" - for should_part_list_ in all_should_part_list: if isinstance(should_part_list_, dict): should_part_list = should_part_list_.get("main") diff --git a/omero_search_engine/cache_functions/elasticsearch/transform_data.py b/omero_search_engine/cache_functions/elasticsearch/transform_data.py index 279a7d8e..529e8a8f 100644 --- a/omero_search_engine/cache_functions/elasticsearch/transform_data.py +++ b/omero_search_engine/cache_functions/elasticsearch/transform_data.py @@ -36,7 +36,7 @@ ) from omero_search_engine.validation.psql_templates import ( query_images_in_project_id, - query_images_screen_id, + query_images_in_screen_id, ) from app_data.data_attrs import annotation_resource_link @@ -326,7 +326,7 @@ def handle_file(file_name, es_index, cols, is_image, from_json): actions.append( { "_index": es_index, - "_source": record # , + "_source": record, # , # "_id": record['id'] } ) @@ -686,7 +686,7 @@ def save_key_value_buckets( if resource_table == "project": sql_n = query_images_in_project_id.substitute(project_id=id) elif resource_table == "screen": - sql_n = query_images_screen_id.substitute(screen_id=id) + sql_n = query_images_in_screen_id.substitute(screen_id=id) no_images_co = conn.execute_query(sql_n) res["no_images"] = len(no_images_co) diff --git a/omero_search_engine/database/utils.py b/omero_search_engine/database/utils.py new file mode 100644 index 00000000..8f6e58da --- /dev/null +++ b/omero_search_engine/database/utils.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright (C) 2024 University of Dundee & Open Microscopy Environment. +# All rights reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+
+import os
+import sys
+import subprocess
+
+
+def restore_database():
+    """
+    Restore the database from a database dump file
+    """
+    from omero_search_engine import search_omero_app
+
+    main_dir = os.path.abspath(os.path.dirname(__file__))
+    mm = main_dir.replace("omero_search_engine/database", "")
+    sys.path.append(mm)
+    dat_file_name = os.path.join(mm, "app_data/omero.pgdump")
+    restore_command = "psql --username %s --host %s --port %s -d %s -f %s" % (
+        search_omero_app.config.get("DATABASE_USER"),
+        search_omero_app.config.get("DATABASE_SERVER_URI"),
+        search_omero_app.config.get("DATABASE_PORT"),
+        search_omero_app.config.get("DATABASE_NAME"),
+        dat_file_name,
+    )
+    try:
+        proc = subprocess.Popen(
+            restore_command,
+            shell=True,
+            env={"PGPASSWORD": search_omero_app.config.get("DATABASE_PASSWORD")},
+        )
+        proc.wait()
+    except Exception as e:
+        print("Exception happened during restore: %s" % (e))
diff --git a/omero_search_engine/validation/omero_keyvalue_data_validator.py b/omero_search_engine/validation/omero_keyvalue_data_validator.py
new file mode 100644
index 00000000..d9d203d2
--- /dev/null
+++ b/omero_search_engine/validation/omero_keyvalue_data_validator.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright (C) 2024 University of Dundee & Open Microscopy Environment.
+# All rights reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+ +from omero_search_engine import search_omero_app +from omero_search_engine.validation.psql_templates import ( + trail_space_query, + head_space_query, + duplicated_keyvalue_pairs_query, +) +import os +import pandas as pd + +conn = search_omero_app.config["database_connector"] + + +def prepare_the_sql_statement(sql_template, screen_name, project_name, add_where=""): + """ + customize the sql statement + """ + if not screen_name and project_name: + return sql_template.substitute( + condition=" {add_where} project.name like '%{project_name}%'".format( + add_where=add_where, project_name=project_name + ) + ) + elif not project_name and screen_name: + return sql_template.substitute( + condition=" {add_where} screen.name like '%{screen_name}%'".format( + add_where=add_where, screen_name=screen_name + ) + ) + elif not screen_name and not project_name: + return sql_template.substitute(condition="") + + +def check_for_trailing_space(screen_name, project_name): + search_omero_app.logger.info("Checking for trailing space ...") + sql_statment = prepare_the_sql_statement( + trail_space_query, screen_name, project_name, " and" + ) + tail_space_results = conn.execute_query(sql_statment) + if len(tail_space_results) == 0: + search_omero_app.logger.info("No results is available for trailing space") + return + search_omero_app.logger.info("Generate for trailing space ...") + generate_reports(tail_space_results, "trailing_space", screen_name, project_name) + + +def check_for_heading_space(screen_name, project_name): + search_omero_app.logger.info("Checking for heading space ...") + sql_statment = prepare_the_sql_statement( + head_space_query, screen_name, project_name, " and" + ) + head_space_results = conn.execute_query(sql_statment) + if len(head_space_results) == 0: + search_omero_app.logger.info("No results available for heading space") + return + search_omero_app.logger.info("Generate for head space ...") + generate_reports(head_space_results, "heading_space", screen_name, project_name) + + +def check_duplicated_keyvalue_pairs(screen_name, project_name): + search_omero_app.logger.info("Checking for duplicated key-value pairs...") + sql_statment = prepare_the_sql_statement( + duplicated_keyvalue_pairs_query, screen_name, project_name, "where" + ) + duplicated_keyvalue_pairs_results = conn.execute_query(sql_statment) + if len(duplicated_keyvalue_pairs_results) == 0: + search_omero_app.logger.info( + "No results available for duplicated key-value pairs" + ) + return + search_omero_app.logger.info("Generate reports for duplicated key-value pairs...") + generate_reports( + duplicated_keyvalue_pairs_results, + "duplicated_keyvalue_pairs", + screen_name, + project_name, + ) + + +def generate_reports(results, check_type, screen_name, project_name): + """ + Generate the output CSV files contents and save them + """ + df = pd.DataFrame(results) + base_folder = search_omero_app.config.get("BASE_FOLDER") + if not os.path.isdir(base_folder): + base_folder = os.path.expanduser("~") + + all_fields_file = os.path.join(base_folder, "all_%s.csv" % check_type) + screens_file = os.path.join(base_folder, "screens_%s.csv" % check_type) + projects_file = os.path.join(base_folder, "projects_%s.csv" % check_type) + + with open(all_fields_file, "w") as text_file: + text_file.write(df.to_csv()) + + if (not screen_name and not project_name) or screen_name: + df2 = ( + df.groupby(["screen_name", "name", "value"]) + .size() + .reset_index() + .rename(columns={0: "number of images"}) + ) + with open(screens_file, "w") as 
text_file: + text_file.write(df2.to_csv()) + search_omero_app.logger.info(df2.sum()) + + if (not screen_name and not project_name) or project_name: + df3 = ( + df.groupby(["project_name", "name", "value"]) + .size() + .reset_index() + .rename(columns={0: "number of images"}) + ) + + with open(projects_file, "w") as text_file: + text_file.write(df3.to_csv()) + search_omero_app.logger.info(df3.sum()) diff --git a/omero_search_engine/validation/psql_templates.py b/omero_search_engine/validation/psql_templates.py index fa86ecfe..6209705d 100644 --- a/omero_search_engine/validation/psql_templates.py +++ b/omero_search_engine/validation/psql_templates.py @@ -41,6 +41,39 @@ def substitute(self, **kwargs): return super(SqlSearchEngineTemplate, self).substitute(kwargs) +# get available values for an image key +query_images_available_values_for_key = Template( + """ +Select DISTINCT lower(annotation_mapvalue.value) from image +inner join imageannotationlink on image.id =imageannotationlink.parent +inner join annotation_mapvalue on +annotation_mapvalue.annotation_id=imageannotationlink.child +where lower(annotation_mapvalue.name)='$name' """ +) + +# get any values for an image keys +query_images_any_value = Template( + """ +Select DISTINCT lower(annotation_mapvalue.name), +lower(annotation_mapvalue.value) from image +inner join imageannotationlink on image.id =imageannotationlink.parent +inner join annotation_mapvalue on +annotation_mapvalue.annotation_id=imageannotationlink.child +where lower(annotation_mapvalue.value) like '%$val_part%' """ +) + +# get any values for an image keys +query_images_contains_not_contains = Template( + """ +Select DISTINCT image.id from image +inner join imageannotationlink on image.id =imageannotationlink.parent +inner join annotation_mapvalue on +annotation_mapvalue.annotation_id=imageannotationlink.child +where lower(annotation_mapvalue.name)='$name' +and lower(annotation_mapvalue.value) $operator ('%$value%') """ +) + + # get images satisfy image key-value query query_images_key_value = Template( """ @@ -49,7 +82,7 @@ def substitute(self, **kwargs): inner join annotation_mapvalue on annotation_mapvalue.annotation_id=imageannotationlink.child where lower(annotation_mapvalue.name)='$name' and -lower(annotation_mapvalue.value)=lower('$value')""" +lower(annotation_mapvalue.value)$operator lower('$value')""" ) # Get number of images which satisfy project key-value query @@ -65,11 +98,11 @@ def substitute(self, **kwargs): inner join annotation_mapvalue on annotation_mapvalue.annotation_id=projectannotationlink.child where lower(annotation_mapvalue.name)=lower('$name') -and lower(annotation_mapvalue.value)=lower('$value')""" +and lower(annotation_mapvalue.value) $operator lower('$value')""" ) # Get the number of images using "in" -query_image_or = Template( +query_image_in = Template( """ Select DISTINCT image.id from image inner join imageannotationlink @@ -77,7 +110,7 @@ def substitute(self, **kwargs): inner join annotation_mapvalue on annotation_mapvalue.annotation_id=imageannotationlink.child where lower(annotation_mapvalue.name) in ($names) -and lower(annotation_mapvalue.value) in ($values)""" +and lower(annotation_mapvalue.value) $operator ($values)""" ) # Get the images which satisfy screen key-value query @@ -93,8 +126,8 @@ def substitute(self, **kwargs): on screen.id =screenannotationlink.parent inner join annotation_mapvalue on annotation_mapvalue.annotation_id=screenannotationlink.child -where lower(annotation_mapvalue.name)='$name' -and 
lower(annotation_mapvalue.value)=lower('$value')""" +where lower(annotation_mapvalue.name)= lower('$name') +and lower(annotation_mapvalue.value)$operator lower('$value')""" ) @@ -117,11 +150,11 @@ def substitute(self, **kwargs): inner join dataset on datasetimagelink.parent=dataset.id inner join projectdatasetlink on dataset.id=projectdatasetlink.child inner join project on project.id=projectdatasetlink.parent -where lower(project.name)=lower('$name')""" +where lower (project.name) $operator lower ('$name')""" ) # get images in a screen using id -query_images_screen_id = Template( +query_images_in_screen_id = Template( """ Select DISTINCT image.id from image inner join wellsample on wellsample.image=image.id @@ -141,7 +174,7 @@ def substitute(self, **kwargs): inner join plate on well.plate=plate.id inner join screenplatelink on plate.id=screenplatelink.child inner join screen on screen.id=screenplatelink.parent -where lower(screen.name)=lower('$name')""" +where lower(screen.name)$operator lower('$name')""" ) # get resource id using its name @@ -179,6 +212,70 @@ def substitute(self, **kwargs): and lower(annotation_mapvalue.value) =lower('$value')""" ) +head_space_query = Template( + """ +select image.id as image_id, screen.name as screen_name, project.name as project_name, + annotation_mapvalue.name, annotation_mapvalue.value from image + inner join imageannotationlink on image.id =imageannotationlink.parent + inner join annotation_mapvalue on + annotation_mapvalue.annotation_id=imageannotationlink.child + left join datasetimagelink on datasetimagelink.child=image.id + left join dataset on datasetimagelink.parent=dataset.id + left join projectdatasetlink on dataset.id=projectdatasetlink.child + left join project on project.id=projectdatasetlink.parent + left join wellsample on wellsample.image=image.id + left join well on wellsample.well= well.id left join plate on well.plate=plate.id + left join screenplatelink on plate.id=screenplatelink.child + left join screen on screen.id=screenplatelink.parent + where annotation_mapvalue.value like ' %' $condition + group by project_name, screen_name,image.id, + annotation_mapvalue.name, annotation_mapvalue.value +""" +) + +trail_space_query = Template( + """ +select image.id as image_id, screen.name as screen_name, project.name as project_name, + annotation_mapvalue.name, annotation_mapvalue.value from image + inner join imageannotationlink on image.id =imageannotationlink.parent + inner join annotation_mapvalue on + annotation_mapvalue.annotation_id=imageannotationlink.child + left join datasetimagelink on datasetimagelink.child=image.id + left join dataset on datasetimagelink.parent=dataset.id + left join projectdatasetlink on dataset.id=projectdatasetlink.child + left join project on project.id=projectdatasetlink.parent + left join wellsample on wellsample.image=image.id + left join well on wellsample.well= well.id + left join plate on well.plate=plate.id + left join screenplatelink on plate.id=screenplatelink.child + left join screen on screen.id=screenplatelink.parent + where annotation_mapvalue.value like '% ' $condition + group by project_name, screen_name,image.id, annotation_mapvalue.name, + annotation_mapvalue.value +""" +) + +duplicated_keyvalue_pairs_query = Template( + """ +Select image.id as image_id, project.name as project_name, screen.name as screen_name, + annotation_mapvalue.name, annotation_mapvalue.value, count (*) from image + left join datasetimagelink on datasetimagelink.child=image.id + left join dataset on 
datasetimagelink.parent=dataset.id + left join projectdatasetlink on dataset.id=projectdatasetlink.child + left join project on project.id=projectdatasetlink.parent + left join wellsample on wellsample.image=image.id + left join well on wellsample.well= well.id left join plate on well.plate=plate.id + left join screenplatelink on plate.id=screenplatelink.child + left join screen on screen.id=screenplatelink.parent + inner join imageannotationlink on image.id =imageannotationlink.parent + inner join annotation_mapvalue on + annotation_mapvalue.annotation_id=imageannotationlink.child + $condition + group by project_name, screen_name,image.id, annotation_mapvalue.name, + annotation_mapvalue.value HAVING COUNT(*)>1 + """ +) + project_key_values = Template( """ Select DISTINCT (annotation_mapvalue.value) from image diff --git a/omero_search_engine/validation/results_validator.py b/omero_search_engine/validation/results_validator.py index 1b4a4120..0c13379f 100644 --- a/omero_search_engine/validation/results_validator.py +++ b/omero_search_engine/validation/results_validator.py @@ -18,21 +18,33 @@ # along with this program. If not, see . from omero_search_engine import search_omero_app +import json from datetime import datetime from omero_search_engine.api.v1.resources.query_handler import ( determine_search_results_, query_validator, simple_search, ) + +from omero_search_engine.api.v1.resources.resource_analyser import ( + search_value_for_resource, + get_key_values_return_contents, +) + from omero_search_engine.validation.psql_templates import ( query_images_key_value, query_image_project_meta_data, query_images_screen_key_value, query_images_in_project_name, query_images_screen_name, - query_image_or, + query_image_in, screens_count, projects_count, + query_images_available_values_for_key, + query_images_any_value, + query_images_contains_not_contains, + query_images_in_project_id, + query_images_in_screen_id, ) import os import pandas as pd @@ -43,9 +55,14 @@ "screen": query_images_screen_key_value, "project_name": query_images_in_project_name, "screen_name": query_images_screen_name, - "query_image_or": query_image_or, + "query_image_or": query_image_in, + "in_clause": query_image_in, + "not_in_clause": query_image_in, "screens_count": screens_count, "projects_count": projects_count, + "available_values_for_key": query_images_available_values_for_key, + "search_any_value": query_images_any_value, + "image_contains_not_contains": query_images_contains_not_contains, } @@ -55,8 +72,9 @@ class Validator(object): and from the searchengine """ - def __init__(self, deep_check): + def __init__(self, deep_check=False): self.deep_check = deep_check + self.identical = True def set_simple_query(self, resource, name, value, type="keyvalue"): """ @@ -70,6 +88,30 @@ def set_simple_query(self, resource, name, value, type="keyvalue"): self.sql_statement = query_methods[resource] self.searchengine_results = {} + def set_contains_not_contains_query(self, resource, name, value, type="keyvalue"): + """ + simple query + """ + self.resource = resource + self.type = type + self.name = name + self.value = value + self.postgres_results = [] + self.sql_statement = query_methods["image_contains_not_contains"] + self.searchengine_results = {} + + def set_owner_group(self, owner_id=None, group_id=None): + self.owner_id = owner_id + self.group_id = group_id + + def set_in_query(self, clauses, resource="image", type="in_clause"): + """ + in list query + """ + self.type = type + self.clauses = clauses + self.resource = 
resource + def set_complex_query(self, name, clauses, resource="image", type="complex"): """ complex query @@ -82,6 +124,27 @@ def set_complex_query(self, name, clauses, resource="image", type="complex"): self.postgres_results = [] self.searchengine_results = {} + def get_in_sql(self, clauses, name="in_clause"): + names = "'%s'" % clauses[0].lower() + cases = [c.lower() for c in clauses[1]] + values = "'" + "','".join(cases) + "'" + if name == "in_clause": + sql = query_methods[name].substitute( + names=names, values=values, operator="in" + ) + elif name == "not_in_clause": + sql = query_methods[name].substitute( + names=names, values=values, operator="not in" + ) + # sql = query_methods[name].substitute(names=names, values=values) + conn = search_omero_app.config["database_connector"] + postgres_results = conn.execute_query(sql) + results = [item["id"] for item in postgres_results] + search_omero_app.logger.info( + "results for 'or' received %s" % len(results) + ) # noqa + return results + def get_or_sql(self, clauses, name="query_image_or"): names = "" values = "" @@ -92,12 +155,13 @@ def get_or_sql(self, clauses, name="query_image_or"): else: names = "'%s'" % claus[0].lower() values = "'%s'" % claus[1].lower() - sql = query_methods[name].substitute(names=names, values=values) + # sql = query_methods[name].substitute(names=names, values=values) + sql = query_methods[name].substitute(names=names, values=values, operator="in") conn = search_omero_app.config["database_connector"] postgres_results = conn.execute_query(sql) results = [item["id"] for item in postgres_results] search_omero_app.logger.info( - "results for or received %s" % len(results) + "results for 'or' received %s" % len(results) ) # noqa return results @@ -106,14 +170,15 @@ def get_and_sql(self, clauses): co = 0 for claus in clauses: sql = query_methods["image"].substitute( - name=claus[0].lower(), value=claus[1].lower() + # toz + operator="=", + name=claus[0].lower(), + value=claus[1].lower(), ) conn = search_omero_app.config["database_connector"] postgres_results = conn.execute_query(sql) res = [item["id"] for item in postgres_results] - search_omero_app.logger.info( - "results for and received recived %s" % len(res) - ) + search_omero_app.logger.info("results for 'and' received %s" % len(res)) if co == 0: results = res else: @@ -121,12 +186,30 @@ def get_and_sql(self, clauses): co += 1 return results - def get_results_postgres(self): + def get_results_db(self, operator=None): """ Query the postgresql """ search_omero_app.logger.info("Getting results from postgres") - if self.type == "complex": + if self.type == "buckets": + if self.name: + sql = query_methods["available_values_for_key"].substitute( + name=self.name + ) + conn = search_omero_app.config["database_connector"] + self.postgres_results = conn.execute_query(sql) + elif self.value: + sql = query_methods["search_any_value"].substitute(val_part=self.value) + conn = search_omero_app.config["database_connector"] + self.postgres_results = conn.execute_query(sql) + return + if self.type == "in_clause": + self.postgres_results = self.get_in_sql(self.clauses) + return + elif self.type == "not_in_clause": + self.postgres_results = self.get_in_sql(self.clauses, self.type) + return + elif self.type == "complex": if self.name == "query_image_or": self.postgres_results = self.get_or_sql(self.clauses) elif self.name == "query_image_and": @@ -142,12 +225,30 @@ def get_results_postgres(self): ) return else: + if not operator or operator == "equals": + operator = "=" + elif 
operator == "not_equals": + operator = "!=" + elif operator == "contains": + operator = "like" + elif operator == "not_contains": + operator = "not like" + if self.name != "name": sql = self.sql_statement.substitute( - name=self.name.lower(), value=self.value.lower() + # toz + operator=operator, + name=self.name.lower(), + value=self.value.lower(), ) else: - sql = self.sql_statement.substitute(name=self.value) + sql = self.sql_statement.substitute(name=self.value, operator=operator) + + if hasattr(self, "owner_id") and self.owner_id: + sql = sql + " and %s.owner_id=%s" % (self.resource, self.owner_id) + if hasattr(self, "group_id") and self.group_id: + sql = sql + " and %s.group_id=%s" % (self.resource, self.group_id) + print(sql) # search_omero_app.logger.info ("sql: %s"%sql) conn = search_omero_app.config["database_connector"] postgres_results = conn.execute_query(sql) @@ -156,11 +257,45 @@ def get_results_postgres(self): "results received %s" % len(self.postgres_results) ) # noqa - def get_results_searchengine(self): + def get_results_searchengine(self, operator=None): """ Query the results from the serachengine """ - if self.type == "complex": + if self.type == "buckets": + if self.name: + res = get_key_values_return_contents(self.name, "image", False) + self.searchengine_results = json.loads(res.data) + elif self.value: + self.searchengine_results = search_value_for_resource( + "image", self.value + ) + return + + if self.type == "in_clause": + filters = [] + filters.append( + { + "name": self.clauses[0], + "value": self.clauses[1], + "operator": "in", + "resource": self.resource, + } + ) + query = {"and_filters": filters, "or_filters": []} + + elif self.type == "not_in_clause": + filters = [] + filters.append( + { + "name": self.clauses[0], + "value": self.clauses[1], + "operator": "not_in", + "resource": self.resource, + } + ) + query = {"and_filters": filters, "or_filters": []} + + elif self.type == "complex": filters = [] if self.name != "query_image_and_or": for claus in self.clauses: @@ -195,14 +330,15 @@ def get_results_searchengine(self): "resource": self.resource, } ) - else: + if not operator: + operator = "equals" if self.name != "name": and_filters = [ { "name": self.name.lower(), "value": self.value.lower(), - "operator": "equals", + "operator": operator, "resource": self.resource, } ] @@ -212,11 +348,21 @@ def get_results_searchengine(self): "name": "name", "value": self.value, "resource": "project", - "operator": "equals", + "operator": operator, } ] query = {"and_filters": and_filters, "or_filters": []} - query_data = {"query_details": query} + and_main_attributes = [] + if hasattr(self, "owner_id") and self.owner_id: + and_main_attributes.append( + {"name": "owner_id", "value": self.owner_id, "operator": "equals"} + ) + if hasattr(self, "group_id") and self.group_id: + and_main_attributes.append( + {"name": "group_id", "value": self.group_id, "operator": "equals"} + ) + main_attributes = {"and_main_attributes": and_main_attributes} + query_data = {"query_details": query, "main_attributes": main_attributes} # validate the query syntex query_validation_res = query_validator(query_data) if query_validation_res == "OK": @@ -324,7 +470,10 @@ def get_containers_test_cases(self): None, return_containers=True, ) - if search_engine_results["results"].get("results"): + # print(search_engine_results["results"]) + if search_engine_results.get("results") and search_engine_results[ + "results" + ].get("results"): for item in search_engine_results["results"].get("results"): if 
item["type"] == "screen": if item["name"] in screens_results_idr: @@ -382,42 +531,43 @@ def get_containers_test_cases(self): search_omero_app.logger.info(mes) return mess - def compare_results(self): + def compare_results(self, operator=None): """ - call the results + Get and compare the results between the database and the searchengine """ st_time = datetime.now() - self.get_results_postgres() + self.get_results_db(operator) st2_time = datetime.now() - self.get_results_searchengine() + self.get_results_searchengine(operator) st3_time = datetime.now() sql_time = st2_time - st_time searchengine_time = st3_time - st2_time + if self.type == "bucket": + return if len(self.postgres_results) == self.searchengine_results.get("size"): - ids_in = True is_it_repated = [] serach_ids = [id for id in self.searchengine_results.get("ids")] serach_idsp = [id for id in self.searchengine_results.get("idsp")] if self.deep_check: if sorted(serach_ids) != sorted(self.postgres_results): - ids_in = False + self.identical = False if sorted(serach_idsp) != sorted(serach_ids): - ids_in = False + self.identical = False else: if sorted(serach_idsp) != sorted(serach_ids): - ids_in = False + self.identical = False else: for id in serach_ids: if id in is_it_repated: - ids_in = False + self.identical = False break else: is_it_repated.append(id) if id not in self.postgres_results: - ids_in = False + self.identical = False break - if ids_in: + if self.identical: search_omero_app.logger.info( "No of the retuned results are similar ..." ) @@ -430,32 +580,12 @@ def compare_results(self): searchengine_no = self.searchengine_results.get("size") else: searchengine_no = self.searchengine_results - if not self.deep_check: - return ( - "not equal, database no of the results from server is: %s and\ - the number of results from searchengine (bookmark) is %s?,\ - \ndatabase server query time= %s, searchengine query time= %s" - % ( - len(self.postgres_results), - searchengine_no, - sql_time, - searchengine_time, - ) - ) - else: - return ( - "not equal, database no of the results from server is: %s and\ - the number of results from searchengine (bookmark) is %s?,\ - the number of results from searchengine (pagination) is %s?,\ - \ndatabase server query time= %s, searchengine query time= %s" - % ( - len(self.postgres_results), - searchengine_no, - len(serach_idsp), - sql_time, - searchengine_time, - ) - ) + return ( + "not equal, the number of results from the database server is: %s and" + "the number of results from searchengine is %s?," + "\ndatabase server query time= %s, searchengine query time= %s" + % (len(self.postgres_results), searchengine_no, sql_time, searchengine_time) + ) def validate_queries(json_file, deep_check): @@ -473,6 +603,7 @@ def validate_queries(json_file, deep_check): test_cases = test_data.get("test_cases") complex_test_cases = test_data.get("complex_test_cases") + query_in = test_data.get("query_in") messages = [] from datetime import datetime @@ -482,23 +613,40 @@ def validate_queries(json_file, deep_check): name = case[0] value = case[1] search_omero_app.logger.info( - "Testing %s for name: %s, key: %s" % (resource, name, value) + "Testing (equals) %s for name: %s, key: %s" % (resource, name, value) ) validator = Validator(deep_check) validator.set_simple_query(resource, name, value) if resource == "image": mess = validator.get_containers_test_cases() messages = messages + mess - - res = validator.compare_results() + res = validator.compare_results("equals") elabsed_time = str(datetime.now() - start_time) 
messages.append( - "Results from PostgreSQL and search engine for " - "name '%s', value '%s', are: %s" + "Results form (equals) the database and search engine" + "for name: %s , value: %s are: %s" % (validator.name, validator.value, res) ) search_omero_app.logger.info("Total time=%s" % elabsed_time) + # Not equals + start_time = datetime.now() + search_omero_app.logger.info( + "Testing (not equals) %s for name: %s, key: %s" + % (resource, name, value) + ) + if resource == "image": + not_equals_validator = Validator(deep_check) + not_equals_validator.set_simple_query(resource, name, value) + res = not_equals_validator.compare_results("not_equals") + elabsed_time = str(datetime.now() - start_time) + messages.append( + "Results (not_equals) form PostgreSQL and search engine" + "for name: %s , value: %s are: %s" + % (not_equals_validator.name, not_equals_validator.value, res) + ) + search_omero_app.logger.info("Total time=%s" % elabsed_time) + for name, cases_ in complex_test_cases.items(): for cases in cases_: start_time = datetime.now() @@ -513,6 +661,42 @@ def validate_queries(json_file, deep_check): search_omero_app.logger.info( "Total time=%s" % str(datetime.now() - start_time) ) + + for resource, cases in query_in.items(): + for case in cases: + start_time = datetime.now() + validator_in = Validator(deep_check) + validator_in.set_in_query(case, resource) + res = validator_in.compare_results() + messages.append( + "Results for 'in' from the database and search engine" + "for %s name: %s and value in [%s] are %s" + % ( + validator_in.resource, + validator_in.clauses[0], + ",".join(validator_in.clauses[1]), + res, + ) + ) + end_in = datetime.now() + search_omero_app.logger.info("Total time=%s" % str(end_in - start_time)) + # test the same but change the operator to not in + search_omero_app.logger.info("Total time=%s" % str(end_in - start_time)) + validator_not_in = Validator(deep_check) + validator_not_in.set_in_query(case, resource, type="not_in_clause") + res = validator_not_in.compare_results() + messages.append( + "Results for 'not in' from the database and search engine for %s name: " + "%s and value in [%s] are %s" + % ( + validator_not_in.resource, + validator_not_in.clauses[0], + ",".join(validator_not_in.clauses[1]), + res, + ) + ) + search_omero_app.logger.info("Total time=%s" % str(datetime.now() - end_in)) + search_omero_app.logger.info( "############################################## Check Report ##############################################" # noqa ) @@ -525,7 +709,7 @@ def validate_queries(json_file, deep_check): "###########################################################################################################" # noqa ) # save the check report to a text file - base_folder = "/etc/searchengine/" + base_folder = search_omero_app.config.get("BASE_FOLDER") if not os.path.isdir(base_folder): base_folder = os.path.expanduser("~") @@ -540,6 +724,8 @@ def validate_queries(json_file, deep_check): def test_no_images(): idr_url = search_omero_app.config.get("IDR_TEST_FILE_URL") + if not idr_url: + return if not idr_url: search_omero_app.logger.info("No idr test file is found") @@ -556,7 +742,6 @@ def test_no_images(): headers = lines[0] headers = headers.split("\t") - print(len(headers)) for i in range(len(headers) - 1): print(i, headers[i]) names = {} @@ -570,7 +755,7 @@ def test_no_images(): names[name] = int(study[9]) results = {} - base_folder = "/etc/searchengine/" + base_folder = search_omero_app.config.get("BASE_FOLDER") if not os.path.isdir(base_folder): base_folder 
        base_folder = os.path.expanduser("~")
@@ -633,7 +818,7 @@ def test_no_images():

 def get_omero_stats(base_url=None):
     columns = ["Resource", "Attribute", "No. of unique values", "Attribute's URL"]
-    base_folder = "/etc/searchengine/"
+    base_folder = search_omero_app.config.get("BASE_FOLDER")
     if not os.path.isdir(base_folder):
         base_folder = os.path.expanduser("~")
     metadata_file = os.path.join(base_folder, "metadata.xlsx")
@@ -699,12 +884,107 @@ def get_omero_stats(base_url=None):
     writer.save()


-def get_no_images_sql_containers():
+def check_number_images_sql_containers_using_ids():
+    """
+    This method checks the number of images inside each container
+    (project or screen) in the searchengine index data
+    and compares it with the number of images inside
+    each container in the database server.
+    As container names are not unique, the container id is used
+    to determine the number of images.
+    """
+    from omero_search_engine.api.v1.resources.urls import (
+        get_resource_names,
+    )
+    from omero_search_engine.api.v1.resources.utils import (
+        search_resource_annotation,
+    )
+
+    identical = True  # set to False when any container's counts disagree
+
+    conn = search_omero_app.config["database_connector"]
+    all_names = get_resource_names("all")
+    for resource in all_names:
+        search_omero_app.logger.info(
+            "######################## Checking %s ########################\n" % resource
+        )
+        for res_name_ in all_names.get(resource):
+            res_name = res_name_.get("name")
+            res_id = res_name_.get("id")
+            search_omero_app.logger.info(
+                "Checking %s name: %s, id: %s" % (resource, res_name, res_id)
+            )
+            and_filters = []
+            main_attributes = {
+                "and_main_attributes": [
+                    {
+                        "name": "%s_id" % resource,
+                        "value": res_id,
+                        "operator": "equals",
+                        "resource": "image",
+                    }
+                ]
+            }
+            or_filters = []
+            query = {"and_filters": and_filters, "or_filters": or_filters}

+            query_data = {"query_details": query, "main_attributes": main_attributes}
+
+            returned_results = search_resource_annotation("image", query_data)
+            if returned_results.get("results"):
+                searchengine_results = returned_results["results"].get("size", 0)
+            else:
+                searchengine_results = 0
+            search_omero_app.logger.info(
+                "Number of images returned from searchengine: %s" % searchengine_results
+            )
+            if resource == "project":
+                sql = query_images_in_project_id.substitute(project_id=res_id)
+            elif resource == "screen":
+                sql = query_images_in_screen_id.substitute(screen_id=res_id)
+            results = conn.execute_query(sql)
+            postgres_results = len(results)
+            search_omero_app.logger.info(
+                "Number of images returned from the database: %s" % postgres_results
+            )
+            if searchengine_results != postgres_results:
+                if res_name == "idr0021" and res_id == 872:
+                    # known issue: images 9539 and 9552 belong to
+                    # two different datasets, so the counts differ
+                    continue
+                identical = False
+                if searchengine_results > 0:
+                    test_array = []
+                    for res in returned_results["results"]["results"]:
+                        test_array.append(res.get("id"))
+                    for ress in results:
+                        if ress["id"] not in test_array:
+                            search_omero_app.logger.info(
+                                "Image id %s is missing from the "
+                                "searchengine results" % ress["id"]
+                            )
+                search_omero_app.logger.info("ERROR: Not equal results")
+                search_omero_app.logger.info(
+                    "Error checking %s name: %s, id: %s"
+                    % (resource, res_name, res_id)
+                )
+                # return False
+            else:
+                search_omero_app.logger.info("equal results")
+            search_omero_app.logger.info(
+                "\n-----------------------------------------------------------------------------\n"  # noqa
+            )
+    return identical
+
+
+def get_no_images_sql_containers(write_report=True):
     """
     This method tests the number of images inside each
    container (project or screen) in the searchengine index data
     and compare them with the number of images inside
-    each container in the postgresql database server
+    each container in the database server
     """
     from omero_search_engine.api.v1.resources.urls import (
         get_resource_names,
@@ -748,10 +1028,14 @@ def get_no_images_sql_containers():
             )
             search_omero_app.logger.info(message2)
             messages.append(message2)
-            sql = query_methods["%s_name" % resource].substitute(name=res_name)
+            sql = query_methods["%s_name" % resource].substitute(
+                name=res_name, operator="="
+            )
             results = conn.execute_query(sql)
             postgres_results = len(results)
-            message3 = "No of images returned from postgresql: %s" % seachengine_results
+            message3 = (
+                "Number of images returned from the database: %s" % postgres_results
+            )
             messages.append(message3)
             search_omero_app.logger.info(message3)
             if seachengine_results != postgres_results:
@@ -765,14 +1049,26 @@ def get_no_images_sql_containers():
         messages.append(
             "\n-----------------------------------------------------------------------------\n"  # noqa
         )
-    base_folder = "/etc/searchengine/"
-    if not os.path.isdir(base_folder):
-        base_folder = os.path.expanduser("~")
+    if write_report:
+        base_folder = search_omero_app.config.get("BASE_FOLDER")
+        if not os.path.isdir(base_folder):
+            base_folder = os.path.expanduser("~")
+        report_file = os.path.join(base_folder, "check_containers_report.txt")
+        report = "\n".join(messages)  # noqa
+        with open(report_file, "w") as f:
+            f.write(report)

-    report_file = os.path.join(base_folder, "check_containers_report.txt")
-    report = "\n".join(messages)  # noqa
-    with open(report_file, "w") as f:
-        f.write(report)
+
+"""
+Draft, not used yet:
+def set_ownership(resource, name, value, owner_id=None, group_id=None):
+    sql = query_images_key_value.substitute(name=name, value=value)
+    if owner_id:
+        sql = sql + " %s.owner_id=%s" % (resource, owner_id)
+    if group_id:
+        sql = sql + " %s.group_id=%s" % (resource, group_id)
+"""


 def check_container_keys_vakues():
@@ -820,7 +1116,7 @@
         else:
             scr_searchengine_results = scr_searchengine_results.response
         search_omero_app.logger.info(
-            "Results from PostgreSQL database: %s" % len(screen_results)
+            "Results from the database: %s" % len(screen_results)
         )
         if len(scr_searchengine_results) > 0 and scr_searchengine_results[
             0
@@ -850,7 +1146,7 @@

             pr_searchengine_results = pr_searchengine_results.response
             search_omero_app.logger.info(
-                "Results from PostgreSQL database: %s" % len(project_results)
+                "Results from the database: %s" % len(project_results)
             )
             if len(pr_searchengine_results) > 0 and pr_searchengine_results[
                 0
diff --git a/unit_tests/__init__.py b/unit_tests/__init__.py
new file mode 100644
index 00000000..51828773
--- /dev/null
+++ b/unit_tests/__init__.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright (C) 2024 University of Dundee & Open Microscopy Environment.
+# All rights reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
diff --git a/unit_tests/test_app.py b/unit_tests/test_app.py
index ab697e86..5566739f 100644
--- a/unit_tests/test_app.py
+++ b/unit_tests/test_app.py
@@ -34,6 +34,10 @@
     key_values_resource_cache_template,
 )

+from omero_search_engine.validation.results_validator import (
+    Validator,
+    check_number_images_sql_containers_using_ids,
+)
 from omero_search_engine.cache_functions.elasticsearch.transform_data import (
     delete_es_index,
     create_index,
@@ -46,11 +50,24 @@
     not_valid_and_filters,
     not_valid_or_filters,
     query,
+    query_image_and,
+    query_image_or,
+    query_image_and_or,
+    simple_queries,
+    query_in,
+    images_keys,
+    images_value_parts,
+    contains_not_contains_queries,
+    image_owner,
+    image_group,
+    image_owner_group,
 )

 from omero_search_engine import search_omero_app, create_app

 create_app("testing")
+# TODO: deep_check should be a configuration item
+deep_check = True


 class BasicTestCase(unittest.TestCase):
@@ -131,8 +148,6 @@ def test_add_submit_query_delete_es_index(self):
         es_index_2 = "key_values_resource_cach"
         create_es_index_2 = True
         all_all_indices = get_all_indexes_from_elasticsearch()
-        print(all_all_indices)
-        print(all_all_indices.keys())
         if es_index_2 in all_all_indices.keys():
             create_es_index_2 = False

@@ -143,12 +158,189 @@ def test_add_submit_query_delete_es_index(self):
             create_index(es_index_2, key_values_resource_cache_template)
         )
         res = search_resource_annotation(table, query)
-        print(res)
         assert len(res.get("results")) >= 0
         self.assertTrue(delete_es_index(es_index))
         if create_es_index_2:
             self.assertTrue(delete_es_index(es_index_2))

+    def test_single_query(self):
+        """
+        Query the search engine and compare
+        its results with the results from the database
+        """
+        for resource, cases in simple_queries.items():
+            for case in cases:
+                name = case[0]
+                value = case[1]
+                validator = Validator(deep_check)
+                validator.set_simple_query(resource, name, value)
+                validator.get_results_db("equals")
+                validator.get_results_searchengine("equals")
+                self.assertEqual(
+                    len(validator.postgres_results),
+                    validator.searchengine_results.get("size"),
+                )
+                validator.get_results_db("not_equals")
+                validator.get_results_searchengine("not_equals")
+                self.assertEqual(
+                    len(validator.postgres_results),
+                    validator.searchengine_results.get("size"),
+                )
+                self.assertTrue(validator.identical)
+
+    def test_and_query(self):
+        name = "query_image_and"
+        for cases in query_image_and:
+            validator = Validator(deep_check)
+            validator.set_complex_query(name, cases)
+            validator.compare_results()
+            self.assertEqual(
+                len(validator.postgres_results),
+                validator.searchengine_results.get("size"),
+            )
+            self.assertTrue(validator.identical)
+
+    def test_or_query(self):
+        name = "query_image_or"
+        for cases in query_image_or:
+            validator = Validator(deep_check)
+            validator.set_complex_query(name, cases)
+            validator.compare_results()
+            self.assertEqual(
+                len(validator.postgres_results),
+                validator.searchengine_results.get("size"),
+            )
+            self.assertTrue(validator.identical)
+
+    def test_no_images_containers(self):
+        self.assertTrue(check_number_images_sql_containers_using_ids())
+
+    def test_multi_or_queries(self):
+        pass
+
+    def test_complex_query(self):
+        name = "query_image_and_or"
+        for cases in query_image_and_or:
+            validator = Validator(deep_check)
+            validator.set_complex_query(name, cases)
+            validator.compare_results()
+            self.assertEqual(
+                len(validator.postgres_results),
+                validator.searchengine_results.get("size"),
+            )
+            self.assertTrue(validator.identical)
+
+    def test_in_query(self):
+        for resource, cases in query_in.items():
+            for case in cases:
+                validator = Validator(deep_check)
+                validator.set_in_query(case, resource)
+                validator.compare_results()
+                self.assertEqual(
+                    len(validator.postgres_results),
+                    validator.searchengine_results.get("size"),
+                )
+                self.assertTrue(validator.identical)
+
+    def test_not_in_query(self):
+        for resource, cases in query_in.items():
+            for case in cases:
+                validator = Validator(deep_check)
+                validator.set_in_query(case, resource, type="not_in_clause")
+                validator.compare_results()
+                self.assertEqual(
+                    len(validator.postgres_results),
+                    validator.searchengine_results.get("size"),
+                )
+                self.assertTrue(validator.identical)
+
+    def test_search_for_any_value(self):
+        for part in images_value_parts:
+            validator = Validator(deep_check)
+            validator.set_simple_query("image", None, part, type="buckets")
+            validator.compare_results()
+            self.assertEqual(
+                len(validator.postgres_results),
+                validator.searchengine_results.get("total_number_of_buckets"),
+            )
+
+    def test_available_values_for_key(self):
+        for image_key in images_keys:
+            validator = Validator(deep_check)
+            validator.set_simple_query("image", image_key, None, type="buckets")
+            validator.compare_results()
+            self.assertEqual(
+                len(validator.postgres_results),
+                validator.searchengine_results.get("total_number_of_buckets"),
+            )
+
+    def test_contains_not_contains_queries(self):
+        for resource, cases in contains_not_contains_queries.items():
+            for case in cases:
+                name = case[0]
+                value = case[1]
+                validator = Validator(deep_check)
+                validator.set_contains_not_contains_query(resource, name, value)
+                validator.get_results_db("contains")
+                validator.get_results_searchengine("contains")
+                self.assertEqual(
+                    len(validator.postgres_results),
+                    validator.searchengine_results.get("size"),
+                )
+                validator.get_results_db("not_contains")
+                validator.get_results_searchengine("not_contains")
+                self.assertEqual(
+                    len(validator.postgres_results),
+                    validator.searchengine_results.get("size"),
+                )
+                self.assertTrue(validator.identical)
+
+    def test_owner(self):
+        for resource, cases in image_owner.items():
+            for case in cases:
+                name = case[0]
+                value = case[1]
+                owner_id = case[2]
+                validator = Validator(deep_check)
+                validator.set_simple_query(resource, name, value)
+                validator.set_owner_group(owner_id=owner_id)
+                validator.compare_results()
+                self.assertEqual(
+                    len(validator.postgres_results),
+                    validator.searchengine_results.get("size"),
+                )
+
+    def test_group(self):
+        for resource, cases in image_group.items():
+            for case in cases:
+                name = case[0]
+                value = case[1]
+                group_id = case[2]
+                validator = Validator(deep_check)
+                validator.set_simple_query(resource, name, value)
+                validator.set_owner_group(group_id=group_id)
+                validator.compare_results()
+                self.assertEqual(
+                    len(validator.postgres_results),
+                    validator.searchengine_results.get("size"),
+                )
+
+    def test_owner_group(self):
+        for resource, cases in image_owner_group.items():
+            for case in cases:
+                name = case[0]
+                value = case[1]
+                owner_id = case[2]
+                group_id = case[3]
+                validator = Validator(deep_check)
+                validator.set_simple_query(resource, name, value)
+                validator.set_owner_group(owner_id=owner_id, group_id=group_id)
+                validator.compare_results()
+                self.assertEqual(
+                    len(validator.postgres_results),
+                    validator.searchengine_results.get("size"),
+                )
+
    # def test_add_delete_es_index(self):
     #     '''
     #     test create index in elastic search
diff --git a/unit_tests/test_data.py b/unit_tests/test_data.py
index f081bf94..0a791446 100644
--- a/unit_tests/test_data.py
+++ b/unit_tests/test_data.py
@@ -52,3 +52,54 @@
 #           "operator": "equals", "resource": "image"}]}}

 query = {"query_details": {"and_filters": []}}
+
+query_image_and = [
+    [["Phenotype Annotation Level", "protein"], ["organism", "homo sapiens"]]
+]
+
+query_image_or = [[["Gene Symbol", "CDK5RAP2"], ["Gene Symbol", "cep120"]]]
+
+query_image_and_or = [
+    {
+        "query_image_and": [
+            ["Organism", "homo sapiens"],
+            ["Targeted Protein", "CDK5RAP2"],
+            ["Phenotype Term Accession", "CMPO_0000425"],
+        ],
+        "query_image_or": [
+            ["Phenotype", "protein localized to centrosome"],
+            ["Gene Symbol", "http://www.ebi.ac.uk/cmpo/CMPO_0000425"],
+        ],
+    }
+]
+
+simple_queries = {
+    "image": [
+        ["cell line", "Hela"],
+        ["PBS", "10Mm"],
+        ["Gene Symbol", "CDK5RAP2"],
+        ["organism", "homo sapiens"],
+        ["temperature", "37"],
+    ]
+}
+
+contains_not_contains_queries = {
+    "image": [["cell line", "hel"], ["gene symbol", "cep"]]
+}
+
+query_in = {
+    "image": [
+        ["Gene Symbol", ["pcnt", "cenpj", "cep120", "cdk5rap2"]],
+        ["temperature", ["23 c", "37 c"]],
+    ]
+}
+
+image_owner = {"image": [["cell line", "Hela", 103]]}
+
+image_group = {"image": [["cell line", "Hela", 5]]}
+
+image_owner_group = {"image": [["gene symbol", "cep120", 702, 5]]}
+
+images_keys = ["cell line", "gene symbol"]
+
+images_value_parts = ["he", "pr"]
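
Note: for reference, a minimal sketch of how the fixtures in unit_tests/test_data.py drive the Validator API added in this patch. The names Validator, set_in_query, and compare_results come from the diff above; the app bootstrap mirrors unit_tests/test_app.py, and treating the return value of compare_results as a printable summary is an assumption, not a documented contract.

# sketch_validate_in_queries.py -- illustrative only, not part of the patch
from omero_search_engine import search_omero_app, create_app
from omero_search_engine.validation.results_validator import Validator
from unit_tests.test_data import query_in

create_app("testing")  # same bootstrap as unit_tests/test_app.py

deep_check = True  # assumed to enable the deeper, id-level comparison

for resource, cases in query_in.items():
    for case in cases:
        # case is e.g. ["Gene Symbol", ["pcnt", "cenpj", "cep120", "cdk5rap2"]]
        validator = Validator(deep_check)
        validator.set_in_query(case, resource)
        res = validator.compare_results()  # queries both backends and compares
        search_omero_app.logger.info(
            "'in' check for %s in [%s]: %s" % (case[0], ",".join(case[1]), res)
        )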
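Likewise, the per-container image count performed by check_number_images_sql_containers_using_ids can be reproduced with a hand-built payload. The payload shape is copied from that function; the project id 201 is hypothetical and only used for illustration.

# sketch_count_images_in_container.py -- illustrative only, not part of the patch
from omero_search_engine import create_app
from omero_search_engine.api.v1.resources.utils import search_resource_annotation

create_app("testing")

# a "<resource>_id" main attribute restricts the image search to one container
query_data = {
    "query_details": {"and_filters": [], "or_filters": []},
    "main_attributes": {
        "and_main_attributes": [
            {
                "name": "project_id",  # or "screen_id" for a screen
                "value": 201,  # hypothetical container id
                "operator": "equals",
                "resource": "image",
            }
        ]
    },
}

returned = search_resource_annotation("image", query_data)
# "size" holds the total hit count, as used in the check above
size = (returned.get("results") or {}).get("size", 0)
print("Number of images in project 201:", size)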