Commit

Fix import csv and add support to be used with multi-search
khaledk2 committed Oct 27, 2024
1 parent b6697f5 commit 5a32426
Showing 8 changed files with 335 additions and 183 deletions.
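
The commit lets an entry in the DATA_SOURCES section of the instance configuration be backed either by a PostgreSQL database or by a set of CSV files. A minimal sketch of the resulting YAML, assuming hypothetical source names and paths:

DATA_SOURCES:
  - name: idr
    DATABASE:
      DATABASE_SERVER_URI: localhost
      DATABASE_PORT: 5432
      DATABASE_USER: user
      DATABASE_PASSWORD: secret
      DATABASE_NAME: omero
  - name: idr_csv
    CSV:
      type: CSV
      images_folder: /data/csv/images
      projects_file: /data/csv/projects.csv
      screens_file: /data/csv/screens.csv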
96 changes: 60 additions & 36 deletions configurations/configuration.py
@@ -58,30 +58,41 @@ def set_database_connection_variables(config):
    from omero_search_engine.database.database_connector import DatabaseConnector

    config.database_connectors = {}
    config.FILES = {}
    for source in config.DATA_SOURCES:
        if source.get("DATABASE").get("DATABASE_PORT"):
            address = source.get("DATABASE").get(
                "DATABASE_SERVER_URI"
            ) + ":%s" % source.get("DATABASE").get("DATABASE_PORT")
        else:
            address = source.get("DATABASE").get("DATABASE_SERVER_URI")
        DATABASE_URI = "postgresql://%s:%s@%s/%s" % (
            source.get("DATABASE").get("DATABASE_USER"),
            source.get("DATABASE").get("DATABASE_PASSWORD"),
            address,
            source.get("DATABASE").get("DATABASE_NAME"),
        )
        database_connector = DatabaseConnector(
            source.get("DATABASE").get("DATABASE_NAME"), DATABASE_URI
        )
        config.database_connectors[source.get("name")] = database_connector


def update_config_file(updated_configuration, configure_database=False):
if source.get("DATABASE"):
if source.get("DATABASE").get("DATABASE_PORT"):
address = source.get("DATABASE").get(
"DATABASE_SERVER_URI"
) + ":%s" % source.get("DATABASE").get("DATABASE_PORT")
else:
address = source.get("DATABASE").get("DATABASE_SERVER_URI")
DATABASE_URI = "postgresql://%s:%s@%s/%s" % (
source.get("DATABASE").get("DATABASE_USER"),
source.get("DATABASE").get("DATABASE_PASSWORD"),
address,
source.get("DATABASE").get("DATABASE_NAME"),
)
database_connector = DatabaseConnector(
source.get("DATABASE").get("DATABASE_NAME"), DATABASE_URI
)
config.database_connectors[source.get("name")] = database_connector
elif source.get("CSV"):
csv_config={"Type":"CSV"}
config.FILES [source.get("name")]= csv_config
if source.get("CSV"). get("images_folder"):
csv_config["images_folder"]=source.get("CSV"). get("images_folder")
if source.get("CSV").get("projects_file"):
csv_config["projects_file"] = source.get("CSV").get("projects_file")
if source.get("CSV").get("screens_file"):
csv_config["screens_file"] = source.get("CSV").get("screens_file")


def update_config_file(updated_configuration, data_source=False):
    is_changed = False
    with open(app_config.INSTANCE_CONFIG) as f:
        configuration = yaml.load(f)
    if not configure_database:
    if not data_source:
        found = []
        for key, value in updated_configuration.items():
            if key in configuration:
@@ -98,27 +109,40 @@ def update_config_file(updated_configuration, configure_database=False):
print("%s value is added with value %s " % (key, value))
is_changed = True
else:
is_changed = config_database(configuration, updated_configuration)
is_changed = config_datasource(configuration, updated_configuration)

if is_changed:
with open(app_config.INSTANCE_CONFIG, "w") as f:
yaml.dump(configuration, f)


def config_database(configuration, updated_configuration):
    for data_source in configuration.get("DATA_SOURCES"):
        changed = False
        Found = False
        if data_source["name"].lower() == updated_configuration["name"].lower():
            Found = True
            for k, v in updated_configuration["DATABASE"].items():
                if data_source["DATABASE"][k] != v:
                    data_source["DATABASE"][k] = v
                    changed = True
            break
    if not Found:
        configuration.get("DATA_SOURCES").append(updated_configuration)
        changed = True
def config_datasource(configuration, updated_configuration):
    changed = False
    found = False
    if updated_configuration.get("CSV") and updated_configuration["CSV"].get("type") == "CSV":
        for data_source in configuration.get("DATA_SOURCES"):
            if data_source.get("name").lower() == updated_configuration.get("name").lower():
                found = True
                for k, v in updated_configuration["CSV"].items():
                    if v == "CSV":
                        continue
                    if data_source["CSV"].get(k) != v:
                        data_source["CSV"][k] = v
                        changed = True
        if not found:
            configuration.get("DATA_SOURCES").append(updated_configuration)
            changed = True
    else:
        for data_source in configuration.get("DATA_SOURCES"):
            if data_source["name"].lower() == updated_configuration["name"].lower():
                found = True
                for k, v in updated_configuration["DATABASE"].items():
                    if data_source["DATABASE"][k] != v:
                        data_source["DATABASE"][k] = v
                        changed = True
                break
        if not found:
            configuration.get("DATA_SOURCES").append(updated_configuration)
            changed = True
    return changed
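
For illustration, the two updated_configuration shapes this function distinguishes look roughly like the following; the names and paths are hypothetical:

csv_source = {
    "name": "idr_csv",
    "CSV": {"type": "CSV", "images_folder": "/data/csv/images"},
}
db_source = {
    "name": "idr",
    "DATABASE": {"DATABASE_SERVER_URI": "localhost", "DATABASE_NAME": "omero"},
}
# config_datasource(configuration, csv_source) takes the CSV branch;
# config_datasource(configuration, db_source) falls through to the DATABASE branch.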


60 changes: 58 additions & 2 deletions manage.py
@@ -172,6 +172,7 @@ def get_index_data_from_database(resource="all", source="all", backup="True"):
    # return
    # get_insert_data_to_index(sql_st, resource)
    # else:

    for res, sql_st in sqls_resources.items():
        if resource.lower() != "all" and resource.lower() != res.lower():
            continue
@@ -198,7 +199,6 @@ def get_index_data_from_database(resource="all", source="all", backup="True"):
    if backup:
        backup_elasticsearch_data()


# set configurations
@manager.command
@manager.option("-u", "--url", help="database server url")
@@ -237,7 +237,7 @@ def set_database_configuration(
database_attrs["DATABASE_BACKUP_FILE"] = backup_filename

if len(database_attrs) > 0:
update_config_file(databse_config, configure_database=True)
update_config_file(databse_config, data_source=True)
else:
search_omero_app.logger.info(
"At least one database attribute\
Expand All @@ -246,6 +246,31 @@ def set_database_configuration(
)


@manager.command
@manager.option("-n", "--name", help="data source name")
@manager.option(
    "-i",
    "--images_folder",
    help="path to a folder containing the CSV files which hold the image data",
)
@manager.option(
    "-p", "--projects_file", help="path to a file containing the projects data"
)
@manager.option(
    "-s", "--screens_file", help="path to a file containing the screens data"
)
@manager.option("-d", "--datasource_type", help="data source type; supports CSV")
def set_data_source_files(
    name=None,
    images_folder=None,
    projects_file=None,
    screens_file=None,
    datasource_type="CSV",
):
    source = {}
    if not name:
        print("Source name attribute is missing")
        return
    source["name"] = name
    source_attrs = {}
    source["CSV"] = source_attrs
    source_attrs["type"] = datasource_type
    if images_folder:
        source_attrs["images_folder"] = images_folder
    if projects_file:
        source_attrs["projects_file"] = projects_file
    if screens_file:
        source_attrs["screens_file"] = screens_file

    update_config_file(source, data_source=True)
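
Since flask_script exposes the command under the function name, a hypothetical invocation (the paths are placeholders) would look like:

python manage.py set_data_source_files -n idr_csv -i /data/csv/images -p /data/csv/projects.csv -s /data/csv/screens.csv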

@manager.command
@manager.option("-e", "--elasticsearch_url", help="elasticsearch url")
def set_elasticsearch_configuration(elasticsearch_url=None):
@@ -479,6 +504,37 @@ def test_container_key_value():

    check_container_keys_vakues()

@manager.command
@manager.option(
    "-s",
    "--source",
    help="data source name; indexing all the data sources is the default",  # noqa
)
@manager.option(
    "-f",
    "--folder",
    help="data folder which contains the CSV files",  # noqa
)
@manager.option(
    "-r",
    "--resource",
    help="resource name; creating the indexes for all the resources is the default",  # noqa
)
def get_index_data_from_csv_files(source=None, folder=None, resource="image"):
    from omero_search_engine.cache_functions.elasticsearch.transform_data import (
        insert_resource_data,
        save_key_value_buckets,
    )

    insert_resource_data(
        folder=folder, resource=resource, data_source=source, from_json=False
    )
    save_key_value_buckets(
        resource_table_=resource,
        data_source=source,
        clean_index=False,
        only_values=False,
    )
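
As above, a hypothetical run that indexes a single CSV-backed source (the folder path is a placeholder):

python manage.py get_index_data_from_csv_files -s idr_csv -f /data/csv -r image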


if __name__ == "__main__":
    from flask_script import Command
2 changes: 1 addition & 1 deletion omero_search_engine/api/v1/resources/query_handler.py
@@ -53,7 +53,7 @@ def check_get_names(idr_, resource, attribute, return_exact=False):
            for name in pr_names_
            if name[attribute] and idr_.lower() in name[attribute].lower()
        ]
        print(act_name, data_source)
        # print(act_name, data_source)
        all_act_names = all_act_names + act_name
    else:
        # This should be modified to query specific data source specific
110 changes: 55 additions & 55 deletions omero_search_engine/api/v1/resources/resource_analyser.py
@@ -41,44 +41,11 @@
}}}}}}}}"""
)
key_number_search_template = Template(
r"""
{
"size":0,
"query":{ "bool" : {"must": {
"match":{
"data_source.keyvalue":"$data_source"
}
}
}
},
"aggs":{
"value_search":{
"nested":{
"path":"key_values"
},
"aggs":{
"value_filter":{
"filter":{
"terms":{
"key_values.name.keyword":[
"$key"
]
}
},
"aggs":{
"required_values":{
"cardinality":{
"field":"key_values.value.keyvalue",
"precision_threshold":4000
}
}
}
}
}
}
}
}
"""
"""
{"size":0,"query":{ "bool": {"must": {"match":{"data_source.keyvalue":"$data_source"}}}},
"aggs":{"value_search":{"nested":{"path":"key_values"},"aggs":{"value_filter":{"filter":{"terms":{
"key_values.name.keyword":["$key"]}},"aggs":{"required_values":{"cardinality":{
"field":"key_values.value.keyvalue","precision_threshold":4000}}}}}}}}"""
)

search_by_value_only = Template(
@@ -535,8 +502,8 @@ def query_cashed_bucket(

    if name:
        name = name.strip()
    if resource != "all":

    if resource != "all":
        query = key_values_buckets_template.substitute(
            name=name, resource=resource, data_source=json.dumps(data_source)
        )
@@ -886,6 +853,8 @@ def get_the_results(
        es_index, query
    )  # .search(index=es_index, body=query)
    hits = results_["hits"]["hits"]
    print("===>>> Hits %s" % hits)

    if len(hits) > 0:
        for hit in hits:
            if len(hits) > 0:
@@ -912,6 +881,8 @@
returned_results[hit["_source"]["data_source"]] = [
item for item in hit["_source"]["resourcename"]
]
else:
return returned_results

    # remove container description from the results,
    # should be added again later after cleaning up the description
@@ -921,6 +892,7 @@
    return returned_results



def get_container_values_for_key(
    table_, container_name, csv, ret_data_source=None, key=None
):
@@ -1053,22 +1025,50 @@ def process_container_query(table_, attribute_name, container_id, key, resourse)
{"terms": {"field": "key_values.value.keyvalue","size": 10000}}}}}}}"""
)


"""
Get all the keys bucket"""
container_project_keys_template = {
"keys_search": {
"nested": {"path": "key_values"},
"aggs": {
"required_values": {
"cardinality": {
"field": "key_values.name.keynamenormalize",
"precision_threshold": 4000,
},
},
"uniquesTerms": {
"terms": {"field": "key_values.name.keynamenormalize", "size": 10000}
},
},
}
container_project_keys_template = Template(
    """
    {"keys_search": {"nested": {"path": "key_values"},
    "aggs": {"required_values": {"cardinality": {"field": "key_values.name.keynamenormalize", "precision_threshold": 4000}},
    "uniquesTerms": {"terms": {"field": "key_values.name.keynamenormalize", "size": 10000}}}}}
    """
)
resource_keys_template = Template(
    """
    {"size": 0,
    "query": {"bool": {"must": {"match": {"data_source.keyvalue": "$data_source"}}}},
    "aggs": {"value_search": {"nested": {"path": "key_values"},
    "aggs": {"required_values": {"cardinality": {"field": "key_values.name.keyword", "precision_threshold": 4000}},
    "required_name": {"terms": {"field": "key_values.name.keyword", "size": 9999}}}}}}
    """
)


def get_resource_keys(resource, data_source):
    res_index = resource_elasticsearchindex.get(resource)
    res = search_index_for_value(
        res_index,
        json.loads(resource_keys_template.substitute(data_source=data_source)),
    )
    return res["aggregations"]["value_search"]["required_name"]["buckets"]
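
A sketch of what get_resource_keys returns, assuming the standard Elasticsearch terms-aggregation response shape; the keys and counts below are illustrative:

# get_resource_keys("image", "idr_csv") might return:
# [{"key": "organism", "doc_count": 120000},
#  {"key": "gene symbol", "doc_count": 95000}]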