From 69dd6ff8d5a3bf835e360b760ce8133973ebefb1 Mon Sep 17 00:00:00 2001
From: khaledk2
Date: Mon, 16 Dec 2024 21:46:12 +0000
Subject: [PATCH] adding data source parsing scripts

---
 .../elasticsearch/utils/extract_BIA_data.py  | 189 ++++++++++++++++
 .../elasticsearch/utils/extract_SSBD_data.py | 201 ++++++++++++++++++
 .../utils/parsing_instructions.md            |  13 ++
 3 files changed, 403 insertions(+)
 create mode 100644 omero_search_engine/cache_functions/elasticsearch/utils/extract_BIA_data.py
 create mode 100644 omero_search_engine/cache_functions/elasticsearch/utils/extract_SSBD_data.py
 create mode 100644 omero_search_engine/cache_functions/elasticsearch/utils/parsing_instructions.md

diff --git a/omero_search_engine/cache_functions/elasticsearch/utils/extract_BIA_data.py b/omero_search_engine/cache_functions/elasticsearch/utils/extract_BIA_data.py
new file mode 100644
index 0000000..f6027bf
--- /dev/null
+++ b/omero_search_engine/cache_functions/elasticsearch/utils/extract_BIA_data.py
@@ -0,0 +1,189 @@
+'''
+This script reads the BIA JSON files and creates CSV files.
+These CSV files are then used to index the data and push it to Elasticsearch.
+'''
+import copy
+import csv
+import hashlib
+import json
+
+# All the paths and file names should be configured for the hosting platform
+bia_image_sample_json_file = "../data/bia-image-export.json"
+bia_study_sample_json_file = "../data/bia-study-metadata.json"
+bia_dataset_sample_json_file = "../data/bia-dataset-metadata.json"
+
+with open(bia_study_sample_json_file) as f:
+    studies = json.load(f)
+
+with open(bia_image_sample_json_file) as f:
+    images = json.load(f)
+
+with open(bia_dataset_sample_json_file) as f:
+    datasets = json.load(f)
+
+print("Number of studies:", len(studies))
+print("Number of datasets:", len(datasets))
+print("Number of images:", len(images))
+
+def uuid_to_int(uuid_string):
+    '''
+    The BIA data has no integer ids, only UUIDs, so this function derives an
+    id that can be used by the search engine from the UUID.
+    :param uuid_string:
+    :return:
+    '''
+    # Hash the UUID so that the same UUID always maps to the same id and
+    # collisions are far less likely than with a small random range.
+    return int(hashlib.sha1(uuid_string.encode("utf-8")).hexdigest()[:15], 16)
+
+projects_keyvalues = ["accession_id", "release_date", "licence", "uuid"]
+projects_main = ["description", "study"]
+projects_data = []
+datasets_projects = {}
+images_data = []
+projects_id_name = {}
+dataset_names_id = {}
+images_without_url = []
+project_id_uuid = {}
+dataset_id_uuid = {}
+values = []
+
+def extract_dataset_projects():
+    for st, dataset in datasets.items():
+        datasets_projects[st] = dataset.get("submitted_in_study").get("uuid")
+        dataset_names_id[st] = dataset.get("title_id")
+        dataset_id_uuid[st] = uuid_to_int(st)
+
+def extract_images_data():
+    global keys
+    for st, imag in images.items():
+        image = {}
+        images_data.append(image)
+        image["id"] = uuid_to_int(imag.get("uuid"))
+        image["name"] = imag.get("uuid")
+        image["description"] = imag.get("description")
+        image["dataset_id"] = imag.get("submission_dataset_uuid")
+        image["dataset_id"] = dataset_id_uuid[image["dataset_id"]]
+        image["project_id"] = project_id_uuid[datasets_projects[imag.get("submission_dataset_uuid")]]
+        image["project_name"] = projects_id_name[datasets_projects[imag.get("submission_dataset_uuid")]]
+        image["dataset_name"] = dataset_names_id[imag.get("submission_dataset_uuid")]
+        image_ = copy.deepcopy(image)
+        # Each metadata item below (uuid, project_uuid, dataset_uuid, organism,
+        # the string attributes and the file representations) becomes its own
+        # row that repeats the image columns and fills the
+        # mapvalue_name/mapvalue_value/mapvalue_index columns, i.e. a long
+        # key-value layout similar to OMERO map annotations, which is what the
+        # search-engine indexer consumes.
+        image_uuid = copy.deepcopy(image_)
+        images_data.append(image_uuid)
+        image_uuid["mapvalue_name"] = "uuid"
+        image_uuid["mapvalue_value"] = imag.get("uuid")
+        image_uuid["mapvalue_index"] = 0
+        if not keys:
+            keys = image_uuid.keys()
+        image_project_uuid = copy.deepcopy(image_)
+        images_data.append(image_project_uuid)
+        image_project_uuid["mapvalue_name"] = "project_uuid"
+        image_project_uuid["mapvalue_value"] = datasets_projects[imag.get("submission_dataset_uuid")]
+        image_project_uuid["mapvalue_index"] = 0
+        image_dataset_uuid = copy.deepcopy(image_)
+        images_data.append(image_dataset_uuid)
+        image_dataset_uuid["mapvalue_name"] = "dataset_uuid"
+        image_dataset_uuid["mapvalue_value"] = imag.get("submission_dataset_uuid")
+        image_dataset_uuid["mapvalue_index"] = 0
+        index = 0
+        for sample in imag["subject"]["sample_of"]:
+            image_org = copy.deepcopy(image_)
+            images_data.append(image_org)
+            image_org["mapvalue_name"] = "organism"
+            image_org["mapvalue_value"] = sample["organism_classification"][0]["scientific_name"]
+            image_org["mapvalue_index"] = index
+            index = index + 1
+        for key, value in imag["attribute"].items():
+            if type(value) is str:
+                if key not in values:
+                    values.append(key)
+                image_attr = copy.deepcopy(image_)
+                images_data.append(image_attr)
+                image_attr["mapvalue_name"] = key
+                image_attr["mapvalue_value"] = value
+                image_attr["mapvalue_index"] = 0
+        index = 0
+        # Each representation carries the image format, the file uri and the
+        # total size in bytes.
+        for file_ in imag["representation"]:
+            image_file = copy.deepcopy(image_)
+            images_data.append(image_file)
+            image_file["mapvalue_name"] = "image_format"
+            image_file["mapvalue_value"] = file_["image_format"]
+            image_file["mapvalue_index"] = index
+            image_file_ = copy.deepcopy(image_)
+            images_data.append(image_file_)
+            image_file_["mapvalue_name"] = "file_uri"
+            image_file_["mapvalue_index"] = index
+            if len(file_["file_uri"]) == 0:
+                images_without_url.append(st)
+                image_file_["mapvalue_value"] = "None"
+            else:
+                image_file_["mapvalue_value"] = file_["file_uri"][0]
+            image_size = copy.deepcopy(image_)
+            images_data.append(image_size)
+            image_size["mapvalue_name"] = "image_size"
+            image_size["mapvalue_index"] = index
+            image_size["mapvalue_value"] = file_.get("total_size_in_bytes")
+            index = index + 1
+
+def extract_projects_data():
+    for st, study in studies.items():
+        project = {}
+        project["id"] = uuid_to_int(study.get("uuid"))
+        project_id_uuid[study.get("uuid")] = project["id"]
+        project["name"] = study.get("title")
+        projects_id_name[study.get("uuid")] = study.get("title")
+        project["description"] = study.get("description")
+        project_ = copy.deepcopy(project)
+        #projects_data.append(project)
+        if study.get("title"):
+            project_title = copy.deepcopy(project_)
+            projects_data.append(project_title)
+            project_title["mapvalue_name"] = "Title"
+            project_title["mapvalue_value"] = study.get("title")
+            project_title["mapvalue_index"] = 0
+        for name in projects_keyvalues:
+            if study.get(name):
+                project_keyvalue = copy.deepcopy(project_)
+                projects_data.append(project_keyvalue)
+                project_keyvalue["mapvalue_name"] = name
+                project_keyvalue["mapvalue_value"] = study.get(name)
+                project_keyvalue["mapvalue_index"] = 0
+        index = 0
+        if study.get("Author"):
+            for author in study.get("Author"):
+                author_name_row = copy.deepcopy(project_)
+                projects_data.append(author_name_row)
+                author_name_row["mapvalue_name"] = "author_name"
+                author_name_row["mapvalue_value"] = author.get("name")
+                author_name_row["mapvalue_index"] = index
+                author_email_row = copy.deepcopy(project_)
+                projects_data.append(author_email_row)
+                author_email_row["mapvalue_name"] = "author_email"
+                author_email_row["mapvalue_value"] = author.get("contact_email")
+                author_email_row["mapvalue_index"] = index
+                index = index + 1
+
+keys = None
+projects_filename = "../data/bia_projects.csv"
+images_filename = "../data/bia_images.csv"
+
+# Driver code: build the project rows first (so that the project ids exist),
+# write the projects CSV, then map the datasets to their projects and build
+# and write the image rows.
+projects_keys = ["id", "name", "description", "mapvalue_value", "mapvalue_name", "mapvalue_index"]
+extract_projects_data()
+with open(projects_filename, 'w', newline='') as output_file:
+    dict_writer = csv.DictWriter(output_file, projects_keys)
+    dict_writer.writeheader()
+    dict_writer.writerows(projects_data)
+
+extract_dataset_projects()
+extract_images_data()
+
+print("datasets_projects:", len(datasets_projects))
+print("images_without_url:", len(images_without_url))
+print("projects:", len(projects_data))
+
+
+with open(images_filename, 'w', newline='') as output_file:
+    dict_writer = csv.DictWriter(output_file, keys)
+    dict_writer.writeheader()
+    dict_writer.writerows(images_data)
+
+print("attribute keys:", len(values))
+print("images:", len(images_data))
diff --git a/omero_search_engine/cache_functions/elasticsearch/utils/extract_SSBD_data.py b/omero_search_engine/cache_functions/elasticsearch/utils/extract_SSBD_data.py
new file mode 100644
index 0000000..e964b2d
--- /dev/null
+++ b/omero_search_engine/cache_functions/elasticsearch/utils/extract_SSBD_data.py
@@ -0,0 +1,201 @@
+import pandas as pd
+import numpy as np
+import requests
+import json
+import copy
+import os
+import csv
+from datetime import datetime
+
+# TODO: dataset-level rows should also be added
+def read_csv_file(file_name):
+    df = pd.read_csv(file_name).replace({np.nan: None})
+    return df
+
+def get_values(data):
+    for index, row in data.iterrows():
+        for col in data.columns:
+            if row[col]:
+                print(col, ":", row[col])
+
+
+dump_for_ome = "../data/SSBD_dump-for-ome.csv"
+df = read_csv_file(dump_for_ome)
+
+images_key_values = ["Organism",
+                     "Organism ontology",
+                     "Strain",
+                     "Strain ontology",
+                     "Cell line",
+                     "Cell line ontology",
+                     "Gene symbols",
+                     "Protein names",
+                     "Protein tags",
+                     "Reporter",
+                     "GO Biological Process",
+                     "GO Biological Process ontology",
+                     "GO Cellular Component",
+                     "GO Cellular Component ontology",
+                     "GO Molecular Function",
+                     "GO Molecular Function ontology"]
+
+project_keyvalues = ["Biological Imaging Method", "Title",
+                     "Biological Imaging Method ontology",
+                     "ssbd_dataset_id"]
+
+
+trouble_links = []
+trouble_datasets = []
+datasets_projects = {}
+added_projects = []
+
+def download_datasets_images():
+    count = 0
+    for index, row in df.iterrows():
+        if row.get("SSBD:OMERO Dataset ID") and row.get("SSBD:OMERO Project ID"):
+            count = count + 1
+            print("processing %s" % count)
+            datasets_projects[int(row.get("SSBD:OMERO Dataset ID"))] = int(row.get("SSBD:OMERO Project ID"))
+
+            st = datetime.now()
+            dataset_id = int(row.get("SSBD:OMERO Dataset ID"))
+            print(dataset_id)
+            if os.path.isfile("../data/ssbd_images/datasets_images_dataset_%s.json" % dataset_id):
+                print("Skipping download, file ../data/ssbd_images/datasets_images_dataset_%s.json already exists" % dataset_id)
+                continue
+            try:
+                url = "https://ssbd.riken.jp/omero/webgateway/dataset/%s/children/" % dataset_id
+                res = requests.get(url)
+                data = json.loads(res.text)
+                with open("../data/ssbd_images/datasets_images_dataset_%s.json" % dataset_id, "w") as outfile:
+                    outfile.write(json.dumps(data, indent=4))
+
+            except Exception as ex:
+                print("error for url %s, error message is: %s" % (url, ex))
+                trouble_links.append(url)
+                trouble_datasets.append(row.get("Dataset ID"))
+                trouble_datasets.append(row.get("SSBD:OMERO Dataset ID"))
+                if len(trouble_links) == 3:
+                    print(trouble_datasets)
+                    print(trouble_links)
+            end = datetime.now()
+            print(st, end)
+
+
+images_data = []
+projects_data = []
+no_image_found_json = []
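+# Build the project and image rows from the SSBD CSV dump together with the
+# per-dataset JSON files cached by download_datasets_images(): one base row
+# per project and per image, plus one key-value row
+# (mapvalue_name/mapvalue_value/mapvalue_index) for every populated metadata
+# column.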
+def extract_images_projects_data():
+    files_counter = 0
+    for index, row in df.iterrows():
+        if not row.get("SSBD:OMERO Dataset ID"):
+            continue
+        files_counter += 1
+        print("processing %s" % files_counter)
+        dataset_id = int(row.get("SSBD:OMERO Dataset ID"))
+        fname = "../data/ssbd_images/datasets_images_dataset_%s.json" % dataset_id
+        if not os.path.isfile(fname):
+            print("No images' file found for dataset: %s" % dataset_id)
+            no_image_found_json.append(dataset_id)
+            continue
+
+        with open(fname) as f:
+            images_json_data = json.load(f)
+        print("Found")
+        if row.get("SSBD:OMERO Project ID") not in added_projects:
+            added_projects.append(row.get("SSBD:OMERO Project ID"))
+            project = {}
+            project["id"] = int(row.get("SSBD:OMERO Project ID"))
+            project["name"] = row.get("Project ID")
+            project["description"] = row.get("Description")
+            project_ = copy.deepcopy(project)
+            projects_data.append(project)
+            for name in project_keyvalues:
+                if row.get(name):
+                    project_keyvalue = copy.deepcopy(project_)
+                    projects_data.append(project_keyvalue)
+                    project_keyvalue["mapvalue_name"] = name
+                    project_keyvalue["mapvalue_value"] = row.get(name)
+                    project_keyvalue["mapvalue_index"] = 0
+
+        for image_ in images_json_data:
+            image = {}
+            images_data.append(image)
+            image["id"] = int(image_.get("id"))
+            image["name"] = image_.get("name")
+            image["description"] = image_.get("description")
+            image["dataset_id"] = dataset_id
+            image["project_id"] = datasets_projects[image["dataset_id"]]
+            image["project_name"] = row.get("Project ID")
+            image["dataset_name"] = row.get("Dataset ID")
+            image_base = copy.deepcopy(image)
+            image["mapvalue_name"] = "thumb_url"
+            image["mapvalue_value"] = "https://ssbd.riken.jp%s" % image_.get("thumb_url")
+            image["mapvalue_index"] = 0
+            image_url_row = copy.deepcopy(image_base)
+            images_data.append(image_url_row)
+            image_url_row["mapvalue_name"] = "image_url"
+            image_url_row["mapvalue_value"] = "https://ssbd.riken.jp/omero/webclient/img_detail/%s" % image["id"]
+            image_url_row["mapvalue_index"] = 0
+
+            dataset_url = copy.deepcopy(image_base)
+            images_data.append(dataset_url)
+            dataset_url["mapvalue_name"] = "dataset_url"
+            dataset_url["mapvalue_value"] = "https://ssbd.riken.jp/omero/webclient/?show=dataset-%s" % dataset_id
+            dataset_url["mapvalue_index"] = 0
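+            # For every metadata column that has a value in this CSV row, emit
+            # one key-value row copied from the base image row; a few organism
+            # and cell line spellings are normalised below (e.g. "h. sapiens"
+            # -> "Homo sapiens") and the plural column names "Gene symbols" /
+            # "Protein names" are mapped to singular key names.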
+            for name in images_key_values:
+                if row.get(name):
+                    image_keyvalue = copy.deepcopy(image_base)
+                    images_data.append(image_keyvalue)
+                    if name == "Gene symbols":
+                        image_keyvalue["mapvalue_name"] = "Gene symbol"
+                    elif name == "Protein names":
+                        image_keyvalue["mapvalue_name"] = "Protein name"
+                    else:
+                        image_keyvalue["mapvalue_name"] = name
+                    if row.get(name).lower() == "h. sapiens" and name.lower() == "organism":
+                        image_keyvalue["mapvalue_value"] = "Homo sapiens"
+                    elif row.get(name).lower() == "hell cell" and name.lower() == "cell line":
+                        image_keyvalue["mapvalue_value"] = "hela"
+                    elif row.get(name).lower() == "m. musculus" and name.lower() == "organism":
+                        image_keyvalue["mapvalue_value"] = "mus musculus"
+                    else:
+                        image_keyvalue["mapvalue_value"] = row.get(name)
+                    image_keyvalue["mapvalue_index"] = 0
+
+projects_filename = "../data/ssbd_images/ssbd_projects.csv"
+images_filename = "../data/ssbd_images/ssbd_images.csv"
+links_filename = "../data/ssbd_images/ssbd_links_error.csv"
+
+download_datasets_images()
+extract_images_projects_data()
+
+keys = images_data[0].keys()
+with open(images_filename, 'w', newline='') as output_file:
+    dict_writer = csv.DictWriter(output_file, keys)
+    dict_writer.writeheader()
+    dict_writer.writerows(images_data)
+
+projects_keys = ["id", "name", "description", "mapvalue_value", "mapvalue_name", "mapvalue_index"]
+
+with open(projects_filename, 'w', newline='') as output_file:
+    dict_writer = csv.DictWriter(output_file, projects_keys)
+    dict_writer.writeheader()
+    dict_writer.writerows(projects_data)
+
+
+print("datasets without an images json file:", len(no_image_found_json))
+
+
+print("failed download links:", len(trouble_links))
+
+with open(links_filename, 'w', newline='') as output_file:
+    output_file.write('\n'.join(trouble_links))
diff --git a/omero_search_engine/cache_functions/elasticsearch/utils/parsing_instructions.md b/omero_search_engine/cache_functions/elasticsearch/utils/parsing_instructions.md
new file mode 100644
index 0000000..97e642e
--- /dev/null
+++ b/omero_search_engine/cache_functions/elasticsearch/utils/parsing_instructions.md
@@ -0,0 +1,13 @@
+* The user must first update the input and output file paths at the top of each script so that they match the hosting platform.
+* Next, the user should run the script to read the files containing the provided data.
+* Each script will generate two CSV files: one for the projects and one for the images.
+* These files are in a format that can be indexed by the search engine.
+* The data source should be added to the search engine before running the indexer. This can be achieved using a method in manage.py, for example:
+
+    python manage.py set_data_source_files -d csv -n bia
+
+* Then the indexer can push the data source to Elasticsearch by running a method in manage.py. This should be run once for each resource:
+
+    python manage.py get_index_data_from_csv_files -s bia -f /path/to/bia_images.csv -r image
+    python manage.py get_index_data_from_csv_files -s bia -f /path/to/bia_projects.csv -r project
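+
+For illustration, both scripts write the same long key-value layout: each project or image appears on several rows, one per metadata key. The sketch below shows the projects CSV columns with made-up example values (the images CSV additionally carries dataset_id, dataset_name, project_id and project_name columns):
+
+    id,name,description,mapvalue_value,mapvalue_name,mapvalue_index
+    123456,Example study title,An example study,S-BIAD000,accession_id,0
+    123456,Example study title,An example study,Jane Doe,author_name,1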