Commit

Fix import csv and add support to be used with multi-search
khaledk2 committed Oct 27, 2024
1 parent b6697f5 commit 5a32426
Showing 8 changed files with 335 additions and 183 deletions.
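
The commit lets an entry in the DATA_SOURCES section of the instance configuration be backed either by a PostgreSQL database or by a set of CSV files. A minimal sketch of the resulting YAML, assuming hypothetical source names and paths:

DATA_SOURCES:
  - name: idr
    DATABASE:
      DATABASE_SERVER_URI: localhost
      DATABASE_PORT: 5432
      DATABASE_USER: user
      DATABASE_PASSWORD: secret
      DATABASE_NAME: omero
  - name: idr_csv
    CSV:
      type: CSV
      images_folder: /data/csv/images
      projects_file: /data/csv/projects.csv
      screens_file: /data/csv/screens.csv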
96 changes: 60 additions & 36 deletions configurations/configuration.py
@@ -58,30 +58,41 @@ def set_database_connection_variables(config):
    from omero_search_engine.database.database_connector import DatabaseConnector

    config.database_connectors = {}
    config.FILES = {}
    for source in config.DATA_SOURCES:
        if source.get("DATABASE").get("DATABASE_PORT"):
            address = source.get("DATABASE").get(
                "DATABASE_SERVER_URI"
            ) + ":%s" % source.get("DATABASE").get("DATABASE_PORT")
        else:
            address = source.get("DATABASE").get("DATABASE_SERVER_URI")
        DATABASE_URI = "postgresql://%s:%s@%s/%s" % (
            source.get("DATABASE").get("DATABASE_USER"),
            source.get("DATABASE").get("DATABASE_PASSWORD"),
            address,
            source.get("DATABASE").get("DATABASE_NAME"),
        )
        database_connector = DatabaseConnector(
            source.get("DATABASE").get("DATABASE_NAME"), DATABASE_URI
        )
        config.database_connectors[source.get("name")] = database_connector


def update_config_file(updated_configuration, configure_database=False):
if source.get("DATABASE"):
if source.get("DATABASE").get("DATABASE_PORT"):
address = source.get("DATABASE").get(
"DATABASE_SERVER_URI"
) + ":%s" % source.get("DATABASE").get("DATABASE_PORT")
else:
address = source.get("DATABASE").get("DATABASE_SERVER_URI")
DATABASE_URI = "postgresql://%s:%s@%s/%s" % (
source.get("DATABASE").get("DATABASE_USER"),
source.get("DATABASE").get("DATABASE_PASSWORD"),
address,
source.get("DATABASE").get("DATABASE_NAME"),
)
database_connector = DatabaseConnector(
source.get("DATABASE").get("DATABASE_NAME"), DATABASE_URI
)
config.database_connectors[source.get("name")] = database_connector
elif source.get("CSV"):
csv_config={"Type":"CSV"}
config.FILES [source.get("name")]= csv_config
if source.get("CSV"). get("images_folder"):
csv_config["images_folder"]=source.get("CSV"). get("images_folder")
if source.get("CSV").get("projects_file"):
csv_config["projects_file"] = source.get("CSV").get("projects_file")
if source.get("CSV").get("screens_file"):
csv_config["screens_file"] = source.get("CSV").get("screens_file")


def update_config_file(updated_configuration, data_source=False):
    is_changed = False
    with open(app_config.INSTANCE_CONFIG) as f:
        configuration = yaml.load(f)
    if not configure_database:
    if not data_source:
        found = []
        for key, value in updated_configuration.items():
            if key in configuration:
@@ -98,27 +109,40 @@ def update_config_file(updated_configuration, configure_database=False):
print("%s value is added with value %s " % (key, value))
is_changed = True
else:
is_changed = config_database(configuration, updated_configuration)
is_changed = config_datasource(configuration, updated_configuration)

if is_changed:
with open(app_config.INSTANCE_CONFIG, "w") as f:
yaml.dump(configuration, f)


def config_database(configuration, updated_configuration):
    for data_source in configuration.get("DATA_SOURCES"):
        changed = False
        Found = False
        if data_source["name"].lower() == updated_configuration["name"].lower():
            Found = True
            for k, v in updated_configuration["DATABASE"].items():
                if data_source["DATABASE"][k] != v:
                    data_source["DATABASE"][k] = v
                    changed = True
            break
    if not Found:
        configuration.get("DATA_SOURCES").append(updated_configuration)
        changed = True
def config_datasource(configuration, updated_configuration):
    changed = False
    found = False
    if updated_configuration.get("CSV") and updated_configuration["CSV"].get("type") == "CSV":
        for data_source in configuration.get("DATA_SOURCES"):
            if data_source.get("name").lower() == updated_configuration.get("name").lower():
                found = True
                for k, v in updated_configuration["CSV"].items():
                    if v == "CSV":
                        continue
                    if data_source["CSV"].get(k) != v:
                        data_source["CSV"][k] = v
                        changed = True
        if not found:
            configuration.get("DATA_SOURCES").append(updated_configuration)
            changed = True
    else:
        for data_source in configuration.get("DATA_SOURCES"):
            if data_source["name"].lower() == updated_configuration["name"].lower():
                found = True
                for k, v in updated_configuration["DATABASE"].items():
                    if data_source["DATABASE"][k] != v:
                        data_source["DATABASE"][k] = v
                        changed = True
                break
        if not found:
            configuration.get("DATA_SOURCES").append(updated_configuration)
            changed = True
    return changed
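
For illustration, the two updated_configuration shapes this function distinguishes look roughly like the following; the names and paths are hypothetical:

csv_source = {
    "name": "idr_csv",
    "CSV": {"type": "CSV", "images_folder": "/data/csv/images"},
}
db_source = {
    "name": "idr",
    "DATABASE": {"DATABASE_SERVER_URI": "localhost", "DATABASE_NAME": "omero"},
}
# config_datasource(configuration, csv_source) takes the CSV branch;
# config_datasource(configuration, db_source) falls through to the DATABASE branch.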


60 changes: 58 additions & 2 deletions manage.py
@@ -172,6 +172,7 @@ def get_index_data_from_database(resource="all", source="all", backup="True"):
    # return
    # get_insert_data_to_index(sql_st, resource)
    # else:

    for res, sql_st in sqls_resources.items():
        if resource.lower() != "all" and resource.lower() != res.lower():
            continue
@@ -198,7 +199,6 @@ def get_index_data_from_database(resource="all", source="all", backup="True"):
    if backup:
        backup_elasticsearch_data()


# set configurations
@manager.command
@manager.option("-u", "--url", help="database server url")
@@ -237,7 +237,7 @@ def set_database_configuration(
database_attrs["DATABASE_BACKUP_FILE"] = backup_filename

if len(database_attrs) > 0:
update_config_file(databse_config, configure_database=True)
update_config_file(databse_config, data_source=True)
else:
search_omero_app.logger.info(
"At least one database attribute\
Expand All @@ -246,6 +246,31 @@ def set_database_configuration(
)


@manager.command
@manager.option("-n", "--name", help="data source name")
@manager.option(
    "-i",
    "--images_folder",
    help="path to a folder containing the CSV files which hold the image data",
)
@manager.option(
    "-p", "--projects_file", help="path to a file containing the projects data"
)
@manager.option(
    "-s", "--screens_file", help="path to a file containing the screens data"
)
@manager.option("-d", "--datasource_type", help="data source type; supports CSV")
def set_data_source_files(
    name=None,
    images_folder=None,
    projects_file=None,
    screens_file=None,
    datasource_type="CSV",
):
    source = {}
    if not name:
        print("Source name attribute is missing")
        return
    source["name"] = name
    source_attrs = {}
    source["CSV"] = source_attrs
    source_attrs["type"] = datasource_type
    if images_folder:
        source_attrs["images_folder"] = images_folder
    if projects_file:
        source_attrs["projects_file"] = projects_file
    if screens_file:
        source_attrs["screens_file"] = screens_file

    update_config_file(source, data_source=True)
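
Since flask_script exposes the command under the function name, a hypothetical invocation (the paths are placeholders) would look like:

python manage.py set_data_source_files -n idr_csv -i /data/csv/images -p /data/csv/projects.csv -s /data/csv/screens.csv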

@manager.command
@manager.option("-e", "--elasticsearch_url", help="elasticsearch url")
def set_elasticsearch_configuration(elasticsearch_url=None):
@@ -479,6 +504,37 @@ def test_container_key_value():

    check_container_keys_vakues()

@manager.command
@manager.option(
    "-s",
    "--source",
    help="data source name; indexing all the data sources is the default",  # noqa
)
@manager.option(
    "-f",
    "--folder",
    help="data folder which contains the CSV files",  # noqa
)
@manager.option(
    "-r",
    "--resource",
    help="resource name; creating the indexes for all the resources is the default",  # noqa
)
def get_index_data_from_csv_files(source=None, folder=None, resource="image"):
    from omero_search_engine.cache_functions.elasticsearch.transform_data import (
        insert_resource_data,
        save_key_value_buckets,
    )

    insert_resource_data(
        folder=folder, resource=resource, data_source=source, from_json=False
    )
    save_key_value_buckets(
        resource_table_=resource,
        data_source=source,
        clean_index=False,
        only_values=False,
    )
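
As above, a hypothetical run that indexes a single CSV-backed source (the folder path is a placeholder):

python manage.py get_index_data_from_csv_files -s idr_csv -f /data/csv -r image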


if __name__ == "__main__":
    from flask_script import Command
2 changes: 1 addition & 1 deletion omero_search_engine/api/v1/resources/query_handler.py
@@ -53,7 +53,7 @@ def check_get_names(idr_, resource, attribute, return_exact=False):
            for name in pr_names_
            if name[attribute] and idr_.lower() in name[attribute].lower()
        ]
        print(act_name, data_source)
        # print(act_name, data_source)
        all_act_names = all_act_names + act_name
    else:
        # This should be modified to query specific data source specific
110 changes: 55 additions & 55 deletions omero_search_engine/api/v1/resources/resource_analyser.py
@@ -41,44 +41,11 @@
}}}}}}}}"""
)
key_number_search_template = Template(
r"""
{
"size":0,
"query":{ "bool" : {"must": {
"match":{
"data_source.keyvalue":"$data_source"
}
}
}
},
"aggs":{
"value_search":{
"nested":{
"path":"key_values"
},
"aggs":{
"value_filter":{
"filter":{
"terms":{
"key_values.name.keyword":[
"$key"
]
}
},
"aggs":{
"required_values":{
"cardinality":{
"field":"key_values.value.keyvalue",
"precision_threshold":4000
}
}
}
}
}
}
}
}
"""
"""
{"size":0,"query":{ "bool": {"must": {"match":{"data_source.keyvalue":"$data_source"}}}},
"aggs":{"value_search":{"nested":{"path":"key_values"},"aggs":{"value_filter":{"filter":{"terms":{
"key_values.name.keyword":["$key"]}},"aggs":{"required_values":{"cardinality":{
"field":"key_values.value.keyvalue","precision_threshold":4000}}}}}}}}"""
)

search_by_value_only = Template(
@@ -535,8 +502,8 @@ def query_cashed_bucket(

    if name:
        name = name.strip()
    if resource != "all":

    if resource != "all":
        query = key_values_buckets_template.substitute(
            name=name, resource=resource, data_source=json.dumps(data_source)
        )
@@ -886,6 +853,8 @@ def get_the_results(
        es_index, query
    )  # .search(index=es_index, body=query)
    hits = results_["hits"]["hits"]
    print("===>>> Hits %s" % hits)

    if len(hits) > 0:
        for hit in hits:
            if len(hits) > 0:
@@ -912,6 +881,8 @@
returned_results[hit["_source"]["data_source"]] = [
item for item in hit["_source"]["resourcename"]
]
else:
return returned_results

    # remove container description from the results,
    # should be added again later after cleaning up the description
@@ -921,6 +892,7 @@
    return returned_results



def get_container_values_for_key(
    table_, container_name, csv, ret_data_source=None, key=None
):
@@ -1053,22 +1025,50 @@ def process_container_query(table_, attribute_name, container_id, key, resourse)
{"terms": {"field": "key_values.value.keyvalue","size": 10000}}}}}}}"""
)


"""
Get all the keys bucket"""
container_project_keys_template = {
"keys_search": {
"nested": {"path": "key_values"},
"aggs": {
"required_values": {
"cardinality": {
"field": "key_values.name.keynamenormalize",
"precision_threshold": 4000,
},
},
"uniquesTerms": {
"terms": {"field": "key_values.name.keynamenormalize", "size": 10000}
},
},
}
container_project_keys_template = Template(
    """
    {"keys_search": {"nested": {"path": "key_values"},
    "aggs": {"required_values": {"cardinality": {"field": "key_values.name.keynamenormalize", "precision_threshold": 4000}},
    "uniquesTerms": {"terms": {"field": "key_values.name.keynamenormalize", "size": 10000}}}}}
    """
)
resource_keys_template = Template(
    """
    {"size": 0,
    "query": {"bool": {"must": {"match": {"data_source.keyvalue": "$data_source"}}}},
    "aggs": {"value_search": {"nested": {"path": "key_values"},
    "aggs": {"required_values": {"cardinality": {"field": "key_values.name.keyword", "precision_threshold": 4000}},
    "required_name": {"terms": {"field": "key_values.name.keyword", "size": 9999}}}}}}
    """
)


def get_resource_keys(resource, data_source):
    res_index = resource_elasticsearchindex.get(resource)
    res = search_index_for_value(
        res_index,
        json.loads(resource_keys_template.substitute(data_source=data_source)),
    )
    return res["aggregations"]["value_search"]["required_name"]["buckets"]
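
A sketch of what get_resource_keys returns, assuming the standard Elasticsearch terms-aggregation response shape; the keys and counts below are illustrative:

# get_resource_keys("image", "idr_csv") might return:
# [{"key": "organism", "doc_count": 120000},
#  {"key": "gene symbol", "doc_count": 95000}]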