diff --git a/configurations/configuration.py b/configurations/configuration.py index 49b4f8c..63ebca3 100644 --- a/configurations/configuration.py +++ b/configurations/configuration.py @@ -77,11 +77,11 @@ def set_database_connection_variables(config): source.get("DATABASE").get("DATABASE_NAME"), DATABASE_URI ) config.database_connectors[source.get("name")] = database_connector - elif source.get("CSV"): - csv_config={"Type":"CSV"} - config.FILES [source.get("name")]= csv_config - if source.get("CSV"). get("images_folder"): - csv_config["images_folder"]=source.get("CSV"). get("images_folder") + elif source.get("CSV"): + csv_config = {"Type": "CSV"} + config.FILES[source.get("name")] = csv_config + if source.get("CSV").get("images_folder"): + csv_config["images_folder"] = source.get("CSV").get("images_folder") if source.get("CSV").get("projects_file"): csv_config["projects_file"] = source.get("CSV").get("projects_file") if source.get("CSV").get("screens_file"): @@ -115,15 +115,19 @@ def update_config_file(updated_configuration, data_source=False): with open(app_config.INSTANCE_CONFIG, "w") as f: yaml.dump(configuration, f) + def config_datasource(configuration, updated_configuration): changed = False Found = False - if updated_configuration.get("CSV").get("type") =="CSV": + if updated_configuration.get("CSV").get("type") == "CSV": for data_source in configuration.get("DATA_SOURCES"): - if data_source.get("name").lower()==updated_configuration.get("name").lower(): - Found=True + if ( + data_source.get("name").lower() + == updated_configuration.get("name").lower() + ): + Found = True for k, v in updated_configuration["CSV"].items(): - if v=="CSV": + if v == "CSV": continue if data_source["CSV"].get(k) != v: data_source["CSV"][k] = v diff --git a/manage.py b/manage.py index 4b3dab5..5618482 100644 --- a/manage.py +++ b/manage.py @@ -199,6 +199,7 @@ def get_index_data_from_database(resource="all", source="all", backup="True"): if backup: backup_elasticsearch_data() + # 
set configurations
 @manager.command
 @manager.option("-u", "--url", help="database server url")
@@ -248,29 +249,43 @@ def set_database_configuration(
 @manager.command
 @manager.option("-n", "--name", help="data source name")
-@manager.option("-i", "--images_folder", help="path to a folder contains csv files cwhich contains the image data ")
-@manager.option("-p", "--projects_file", help="path to the a file containing the projects data")
-@manager.option("-s", "--screens_file", help="path to the a file containing the screens data")
+@manager.option(
+    "-i",
+    "--images_folder",
+    help="path to a folder contains csv files which contains the image data ",
+)
+@manager.option(
+    "-p", "--projects_file", help="path to a file containing the projects data"
+)
+@manager.option(
+    "-s", "--screens_file", help="path to a file containing the screens data"
+)
 @manager.option("-d", "--datasource_type", help=" data source type; supports csv")
-
-def set_data_source_files(name=None, images_folder=None, projects_file=None, screens_file=None,datasource_type="CSV"):
-    source={}
+def set_data_source_files(
+    name=None,
+    images_folder=None,
+    projects_file=None,
+    screens_file=None,
+    datasource_type="CSV",
+):
+    source = {}
     if not name:
-        print ("Source name attribute is missing")
+        print("Source name attribute is missing")
         return
-    source["name"]=name
-    source_attrs={}
-    source["CSV"]=source_attrs
-    source_attrs["type"]=datasource_type
+    source["name"] = name
+    source_attrs = {}
+    source["CSV"] = source_attrs
+    source_attrs["type"] = datasource_type
     if images_folder:
-        source_attrs["images_folder"]=images_folder
+        source_attrs["images_folder"] = images_folder
     if projects_file:
-        source_attrs["projects_file"]=projects_file
+        source_attrs["projects_file"] = projects_file
     if screens_file:
-        source_attrs["screens_file"]=screens_file
+        source_attrs["screens_file"] = screens_file
     update_config_file(source, True)
 
+
 @manager.command
 @manager.option("-e", "--elasticsearch_url", 
help="elasticsearch url") def set_elasticsearch_configuration(elasticsearch_url=None): @@ -504,6 +519,7 @@ def test_container_key_value(): check_container_keys_vakues() + @manager.command @manager.option( "-s", @@ -523,8 +539,9 @@ def test_container_key_value(): def get_index_data_from_csv_files(source=None, folder=None, resource="image"): from omero_search_engine.cache_functions.elasticsearch.transform_data import ( insert_resource_data, - save_key_value_buckets + save_key_value_buckets, ) + insert_resource_data( folder=folder, resource=resource, data_source=source, from_json=False ) diff --git a/omero_search_engine/api/v1/resources/resource_analyser.py b/omero_search_engine/api/v1/resources/resource_analyser.py index 8d233a6..b96cbb9 100644 --- a/omero_search_engine/api/v1/resources/resource_analyser.py +++ b/omero_search_engine/api/v1/resources/resource_analyser.py @@ -853,7 +853,7 @@ def get_the_results( es_index, query ) # .search(index=es_index, body=query) hits = results_["hits"]["hits"] - print ("===>>> Hist %s"%hits) + print("===>>> Hist %s" % hits) if len(hits) > 0: for hit in hits: @@ -892,7 +892,6 @@ def get_the_results( return returned_results - def get_container_values_for_key( table_, container_name, csv, ret_data_source=None, key=None ): @@ -1027,15 +1026,15 @@ def process_container_query(table_, attribute_name, container_id, key, resourse) """ Get all the keys bucket""" -container_project_keys_template = Template( +container_project_keys_template = Template( """ {"keys_search": {"nested": {"path": "key_values"}, "aggs": {"required_values": {"cardinality": {"field": "key_values.name.keynamenormalize","precision_threshold": 4000, },},"uniquesTerms": {"terms": {"field": "key_values.name.keynamenormalize", "size": 10000}},},}} """ ) -resource_keys_template= Template( - ''' +resource_keys_template = Template( + """ { "size":0, "query":{ "bool" : {"must": { @@ -1064,11 +1063,14 @@ def process_container_query(table_, attribute_name, container_id, key, 
resourse) } } } -''' +""" ) def get_resource_keys(resource, data_source): res_index = resource_elasticsearchindex.get(resource) - res = search_index_for_value(res_index, json.loads(resource_keys_template.substitute(data_source=data_source))) + res = search_index_for_value( + res_index, + json.loads(resource_keys_template.substitute(data_source=data_source)), + ) return res["aggregations"]["value_search"]["required_name"]["buckets"] diff --git a/omero_search_engine/api/v1/resources/utils.py b/omero_search_engine/api/v1/resources/utils.py index ea7a851..e1d68f9 100644 --- a/omero_search_engine/api/v1/resources/utils.py +++ b/omero_search_engine/api/v1/resources/utils.py @@ -1350,8 +1350,10 @@ def check_empty_string(string_to_check): def get_all_index_data(res_table, data_source): - query_return_all_data = {"query_details": {"and_filters": [], "or_filters": [], "case_sensitive": False}} - res=search_resource_annotation( + query_return_all_data = { + "query_details": {"and_filters": [], "or_filters": [], "case_sensitive": False} + } + res = search_resource_annotation( res_table, query_return_all_data, return_containers=False, @@ -1359,31 +1361,32 @@ def get_all_index_data(res_table, data_source): ) return res + ################## def get_number_image_inside_container(resource, res_id, data_source): and_filters = [] main_attributes = { - "and_main_attributes": [ - { - "name": "%s_id" % resource, - "value": res_id, - "operator": "equals", - "resource": "image", - }, - { - "name": "data_source", - "value": data_source, - "operator": "equals", - "resource": "image", - }, - ] + "and_main_attributes": [ + { + "name": "%s_id" % resource, + "value": res_id, + "operator": "equals", + "resource": "image", + }, + { + "name": "data_source", + "value": data_source, + "operator": "equals", + "resource": "image", + }, + ] } or_filters = [] query = {"and_filters": and_filters, "or_filters": or_filters} query_data = { - "query_details": query, - "main_attributes": main_attributes, + 
"query_details": query, + "main_attributes": main_attributes, } returned_results = search_resource_annotation("image", query_data) @@ -1394,4 +1397,5 @@ def get_number_image_inside_container(resource, res_id, data_source): searchengine_results = 0 return searchengine_results + ##################### diff --git a/omero_search_engine/cache_functions/elasticsearch/transform_data.py b/omero_search_engine/cache_functions/elasticsearch/transform_data.py index cebd488..1d9163d 100644 --- a/omero_search_engine/cache_functions/elasticsearch/transform_data.py +++ b/omero_search_engine/cache_functions/elasticsearch/transform_data.py @@ -453,7 +453,10 @@ def insert_resource_data(folder, resource, data_source, from_json): finally: pool.close() + total_process = 0 + + def get_insert_data_to_index(sql_st, resource, data_source, clean_index=True): """ - Query the postgreSQL database server and get metadata (key-value pair) @@ -661,6 +664,7 @@ def insert_plate_data(folder, plate_file): ] handle_file(file_name, es_index, cols) + def save_key_value_buckets( resource_table_=None, data_source=None, clean_index=False, only_values=False ): @@ -705,20 +709,25 @@ def save_key_value_buckets( ) from omero_search_engine.api.v1.resources.resource_analyser import ( get_resource_keys, - get_resource_names) - from omero_search_engine.api.v1.resources.utils import get_all_index_data,get_number_image_inside_container + get_resource_names, + ) + from omero_search_engine.api.v1.resources.utils import ( + get_all_index_data, + get_number_image_inside_container, + ) + res = get_resource_keys(resource_table, data_source) resource_keys = [res["key"] for res in res] # resource_keys = get_keys(resource_table, data_source) name_results = None if resource_table in ["project", "screen"]: - #sql = "select id, name,description from {resource}".format( + # sql = "select id, name,description from {resource}".format( # resource=resource_table - #) - #conn = search_omero_app.config.database_connectors[data_source] - 
#name_result = conn.execute_query(sql) - #name_result = get_resource_names(resource=resource_table, data_source=json.dumps(data_source)) - #print (name_result) + # ) + # conn = search_omero_app.config.database_connectors[data_source] + # name_result = conn.execute_query(sql) + # name_result = get_resource_names(resource=resource_table, data_source=json.dumps(data_source)) + # print (name_result) # name_results = [res["name"] for res in name_results] # Determine the number of images for each container name_result = get_all_index_data(resource_table, data_source) @@ -726,15 +735,16 @@ def save_key_value_buckets( try: for res in name_result["results"]["results"]: id = res.get("id") - # if resource_table == "project": + # if resource_table == "project": # sql_n = query_images_in_project_id.substitute(project_id=id) - #elif resource_table == "screen": - # sql_n = query_images_in_screen_id.substitute(screen_id=id) - no_images_co = get_number_image_inside_container(resource_table, id, data_source) - #no_images_co = conn.execute_query(sql_n) + # elif resource_table == "screen": + # sql_n = query_images_in_screen_id.substitute(screen_id=id) + no_images_co = get_number_image_inside_container( + resource_table, id, data_source + ) + # no_images_co = conn.execute_query(sql_n) res["no_images"] = no_images_co - name_results = [ { "id": res["id"], @@ -770,8 +780,6 @@ def save_key_value_buckets( (key, resource_table, es_index, len(resource_keys), data_source) ) - - # determine the number of processes inside the process pool no_processors = search_omero_app.config.get("NO_PROCESSES") if not no_processors: