forked from ome/omero_search_engine
Commit
adding data sources parsing scripts
Showing 3 changed files with 403 additions and 0 deletions.
omero_search_engine/cache_functions/elasticsearch/utils/extract_BIA_data.py (189 additions, 0 deletions)
@@ -0,0 +1,189 @@
'''
This script reads the BIA JSON files and creates CSV files.
These CSV files will be used to index the data and push it to Elasticsearch.
'''
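# Illustrative sketch of the key/value CSV layout this script writes (the column
# names come from projects_keys below; the row values are made-up examples, not
# taken from the BIA data):
#   id,name,description,mapvalue_value,mapvalue_name,mapvalue_index
#   123456,"Example study","Example description","S-BIAD000","accession_id",0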
import json
import copy
import csv
import random

# All the paths and file names should be configured for the hosting platform
bia_image_sample_json_file = "../data/bia-image-export.json"
bia_study_sample_json_file = "../data/bia-study-metadata.json"
bia_dataset_sample_json_file = "../data/bia-dataset-metadata.json"

with open(bia_study_sample_json_file) as f:
    studies = json.load(f)

with open(bia_image_sample_json_file) as f:
    images = json.load(f)

with open(bia_dataset_sample_json_file) as f:
    datasets = json.load(f)

print("Number of studies: ", len(studies))
print("Number of datasets: ", len(datasets))
print("Number of images: ", len(images))

def uuid_to_int(uuid_string):
    '''
    The BIA data does not have an id but a uuid, so this function generates
    ids to be used in the search engine
    :param uuid_string:
    :return:
    '''
    # Note: the returned id is a random integer, not derived from the uuid,
    # so collisions are possible
    return random.randint(0, 1000000)

projects_keyvalues = ["accession_id", "release_date", "licence", "uuid"]
projects_main = ["description", "study"]
projects_data = []
datasets_projects = {}
images_data = []
projects_id_name = {}
dataset_names_id = {}
images_without_url = []
project_id_uuid = {}
dataset_id_uuid = {}
values = []

def extract_dataset_projects():
    for st, dataset in datasets.items():
        datasets_projects[st] = dataset.get("submitted_in_study").get("uuid")
        dataset_names_id[st] = dataset.get("title_id")
        dataset_id_uuid[st] = uuid_to_int(st)

def extract_images_data():
    global keys
    for st, imag in images.items():
        image = {}
        images_data.append(image)
        image["id"] = uuid_to_int(imag.get("uuid"))
        image["name"] = imag.get("uuid")
        image["description"] = imag.get("description")
        image["dataset_id"] = imag.get("submission_dataset_uuid")
        image["dataset_id"] = dataset_id_uuid[image["dataset_id"]]
        image["project_id"] = project_id_uuid[datasets_projects[imag.get("submission_dataset_uuid")]]
        image["project_name"] = projects_id_name[datasets_projects[imag.get("submission_dataset_uuid")]]
        image["dataset_name"] = dataset_names_id[imag.get("submission_dataset_uuid")]
        image_ = copy.deepcopy(image)
        image_uid = copy.deepcopy(image_)
        images_data.append(image_uid)
        image_uid["mapvalue_name"] = "uuid"
        image_uid["mapvalue_value"] = imag.get("uuid")
        image_uid["mapvalue_index"] = 0
        image_project_uid = copy.deepcopy(image_)
        images_data.append(image_project_uid)
        image_project_uid["mapvalue_name"] = "project_uuid"
        image_project_uid["mapvalue_value"] = datasets_projects[imag.get("submission_dataset_uuid")]
        image_project_uid["mapvalue_index"] = 0
        image_dataset_uid = copy.deepcopy(image_)
        images_data.append(image_dataset_uid)
        image_dataset_uid["mapvalue_name"] = "dataset_uuid"
        image_dataset_uid["mapvalue_value"] = image["dataset_id"]
        image_dataset_uid["mapvalue_index"] = 0
        index = 0
        for sample in imag["subject"]["sample_of"]:
            image_org = copy.deepcopy(image_)
            images_data.append(image_org)
            image_org["mapvalue_name"] = "organism"
            image_org["mapvalue_value"] = sample["organism_classification"][0]["scientific_name"]
            image_org["mapvalue_index"] = index
            if not keys:
                keys = image_org.keys()
            index = index + 1
        for key, value in imag["attribute"].items():
            if type(value) is str:
                if key not in values:
                    values.append(key)
                image_attr = copy.deepcopy(image_)
                images_data.append(image_attr)
                image_attr["mapvalue_name"] = key
                image_attr["mapvalue_value"] = value
                image_attr["mapvalue_index"] = 0
        index = 0
        # total_size_in_bytes
        for file_ in imag["representation"]:
            image_file = copy.deepcopy(image_)
            images_data.append(image_file)
            image_file["mapvalue_name"] = "image_format"
            image_file["mapvalue_value"] = file_["image_format"]
            image_file["mapvalue_index"] = index
            image_file_ = copy.deepcopy(image_)
            images_data.append(image_file_)
            image_file_["mapvalue_name"] = "file_uri"
            image_file_["mapvalue_index"] = index
            if len(file_["file_uri"]) == 0:
                images_without_url.append(st)
                image_file_["mapvalue_value"] = "None"
            else:
                image_file_["mapvalue_value"] = file_["file_uri"][0]
            image_size = copy.deepcopy(image_)
            images_data.append(image_size)
            image_size["mapvalue_name"] = "image_size"
            image_size["mapvalue_index"] = index
            image_size["mapvalue_value"] = file_.get("total_size_in_bytes")
            index = index + 1

def extract_projects_data():
    for st, study in studies.items():
        project = {}
        project["id"] = uuid_to_int(study.get("uuid"))
        project_id_uuid[study.get("uuid")] = project["id"]
        project["name"] = study.get("title")
        projects_id_name[study.get("uuid")] = study.get("title")
        project["description"] = study.get("description")
        project_ = copy.deepcopy(project)
        # projects_data.append(project)
        if study.get("Title"):
            project_title = copy.deepcopy(project_)
            projects_data.append(project_title)
            project_title["mapvalue_name"] = "Title"
            project_title["mapvalue_value"] = study.get("Title")
            project_title["mapvalue_index"] = 0
        for name in projects_keyvalues:
            if study.get(name):
                project__ = copy.deepcopy(project_)
                projects_data.append(project__)
                project__["mapvalue_name"] = name
                project__["mapvalue_value"] = study.get(name)
                project__["mapvalue_index"] = 0
        index = 0
        if study.get("Author"):
            for author in study.get("Author"):
                index = index + 1
                project__ = copy.deepcopy(project_)
                projects_data.append(project__)
                project__["mapvalue_name"] = "author_name"
                project__["mapvalue_value"] = author.get("name")
                project__["mapvalue_index"] = index
                project___ = copy.deepcopy(project_)
                projects_data.append(project___)
                project___["mapvalue_name"] = "author_email"
                project___["mapvalue_value"] = author.get("contact_email")
                project___["mapvalue_index"] = index

keys = None
projects_filename = "../data/bia_projects.csv"
images_filename = "../data/bia_images.csv"

projects_keys = ["id", "name", "description", "mapvalue_value", "mapvalue_name", "mapvalue_index"]
extract_projects_data()
with open(projects_filename, 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, projects_keys)
    dict_writer.writeheader()
    dict_writer.writerows(projects_data)

extract_dataset_projects()
extract_images_data()

print("datasets_projects", len(datasets_projects))
print("images_without_url", len(images_without_url))
print("projects:", len(projects_data))

with open(images_filename, 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(images_data)

print(len(values))
print("images:", len(images_data))
omero_search_engine/cache_functions/elasticsearch/utils/extract_SSBD_data.py (201 additions, 0 deletions)
@@ -0,0 +1,201 @@
import pandas as pd
import numpy as np
import requests
import json
import copy
import os
import csv

# dataset should be added
def read_csv_file(file_name):
    df = pd.read_csv(file_name).replace({np.nan: None})
    return df

def get_values(data):
    for index, row in data.iterrows():
        for col in data.columns:
            if row[col]:
                print(col, ":", row[col])

dump_for_ome = "../data/SSBD_dump-for-ome.csv"
df = read_csv_file(dump_for_ome)

images_key_values = ["Organism",
                     "Organism ontology",
                     "Strain",
                     "Strain ontology",
                     "Cell line",
                     "Cell line ontology",
                     "Gene symbols",
                     "Protein names",
                     "Protein tags",
                     "Reporter",
                     "GO Biological Process",
                     "GO Biological Process ontology",
                     "GO Cellular Component",
                     "GO Cellular Component ontology",
                     "GO Molecular Function",
                     "GO Molecular Function ontology"]

project_keyvalues = ["Biological Imaging Method", "Title",
                     "Biological Imaging Method ontology",
                     "ssbd_dataset_id"]

trouble_links = []
trouble_datasets = []
datasets_projects = {}
added_projects = []

def download_datasets_images():
    from datetime import datetime
    cou = 0
    for index, row in df.iterrows():
        if row.get("SSBD:OMERO Dataset ID") and row.get("SSBD:OMERO Project ID"):
            cou = cou + 1
            print("processing %s" % cou)
            datasets_projects[int(row.get("SSBD:OMERO Dataset ID"))] = int(row.get("SSBD:OMERO Project ID"))

            st = datetime.now()
            print(int(row.get("SSBD:OMERO Dataset ID")))
            dataset_id = int(row.get("SSBD:OMERO Dataset ID"))
            if os.path.isfile("../data/ssbd_images/datasets_images_dataset_%s.json" % dataset_id):
                print("Skip downloading, file ../data/ssbd_images/datasets_images_dataset_%s.json already exists" % dataset_id)
                continue
            try:
                url = "https://ssbd.riken.jp/omero/webgateway/dataset/%s/children/" % dataset_id
                res = requests.get(url)
                data = json.loads(res.text)
                with open("../data/ssbd_images/datasets_images_dataset_%s.json" % dataset_id, "w") as outfile:
                    outfile.write(json.dumps(data, indent=4))

            except Exception as ex:
                print("error for url %s, error message is: %s" % (url, ex))
                trouble_links.append(url)
                trouble_datasets.append(row.get("Dataset ID"))
                trouble_datasets.append(row.get("SSBD:OMERO Dataset ID"))
                if len(trouble_links) == 3:
                    print(trouble_datasets)
                    print(trouble_links)
            end = datetime.now()
            print(st, end)

images_data = []
projects_data = []
no_image_found_json = []

def extract_images_projects_data():
    files_counter = 0
    for index, row in df.iterrows():
        if not row.get("SSBD:OMERO Dataset ID"):
            continue
        files_counter += 1
        print("processing %s" % files_counter)
        dataset_id = int(row.get("SSBD:OMERO Dataset ID"))
        fname = "../data/ssbd_images/datasets_images_dataset_%s.json" % dataset_id
        if not os.path.isfile(fname):
            print("No images' file found for dataset: %s" % dataset_id)
            no_image_found_json.append(dataset_id)
            continue

        with open(fname) as f:
            images_json_data = json.load(f)
        print("Found")
        if row.get("SSBD:OMERO Project ID") not in added_projects:
            added_projects.append(row.get("SSBD:OMERO Project ID"))
            project = {}
            project["id"] = row.get("SSBD:OMERO Project ID")
            project["name"] = row.get("Project ID")
            project["description"] = row.get("Description")
            project_ = copy.deepcopy(project)
            projects_data.append(project)
            project_title = copy.deepcopy(project_)
            projects_data.append(project_title)
            project_title["mapvalue_name"] = "Title"
            project_title["mapvalue_value"] = row.get("Title")
            project_title["mapvalue_index"] = 0
            for name in project_keyvalues:
                if row.get(name):
                    project__ = copy.deepcopy(project_)
                    projects_data.append(project__)
                    project__["mapvalue_name"] = name
                    project__["mapvalue_value"] = row.get(name)
                    project__["mapvalue_index"] = 0

        for image_ in images_json_data:
            image = {}
            images_data.append(image)
            image["id"] = int(image_.get("id"))
            image["name"] = image_.get("name")
            image["description"] = image_.get("description")
            image["dataset_id"] = dataset_id
            image["project_id"] = datasets_projects[image["dataset_id"]]
            image["project_name"] = row.get("Project ID")
            image["dataset_name"] = row.get("Dataset ID")
            image_name_ = copy.deepcopy(image)
            image["mapvalue_name"] = "thumb_url"
            image["mapvalue_value"] = "https://ssbd.riken.jp%s" % image_.get("thumb_url")
            image["mapvalue_index"] = 0
            image_name_url = copy.deepcopy(image_name_)
            images_data.append(image_name_url)
            image_name_url["mapvalue_name"] = "image_url"
            image_name_url["mapvalue_value"] = "https://ssbd.riken.jp/omero/webclient/img_detail/%s" % image["id"]
            image_name_url["mapvalue_index"] = 0

            dataset_url = copy.deepcopy(image_name_)
            images_data.append(dataset_url)
            dataset_url["mapvalue_name"] = "dataset_url"
            dataset_url["mapvalue_value"] = "https://ssbd.riken.jp/omero/webclient/?show=dataset-%s" % dataset_id
            dataset_url["mapvalue_index"] = 0
            for name in images_key_values:
                if row.get(name):
                    image_name = copy.deepcopy(image_name_)
                    images_data.append(image_name)
                    image_name["mapvalue_index"] = 0
                    # rename "Gene symbols" and "Protein names"
                    if name == "Gene symbols":
                        image_name["mapvalue_name"] = "Gene symbol"
                    elif name == "Protein names":
                        image_name["mapvalue_name"] = "Protein name"
                    else:
                        image_name["mapvalue_name"] = name
                    if row.get(name).lower() == "h. sapiens" and name.lower() == "organism":
                        image_name["mapvalue_value"] = "Homo sapiens"
                    elif row.get(name).lower() == "hell cell" and name.lower() == "cell line":
                        image_name["mapvalue_value"] = "hela"
                    elif row.get(name).lower() == "m. musculus" and name.lower() == "organism":
                        image_name["mapvalue_value"] = "mus musculus"
                    else:
                        image_name["mapvalue_value"] = row.get(name)

projects_filename = "../data/ssbd_images/ssbd_projects.csv"
images_filename = "../data/ssbd_images/ssbd_images.csv"
links_filename = "../data/ssbd_images/ssbd_links_error.csv"

download_datasets_images()
extract_images_projects_data()

keys = images_data[0].keys()
with open(images_filename, 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(images_data)

projects_keys = ["id", "name", "description", "mapvalue_value", "mapvalue_name", "mapvalue_index"]

with open(projects_filename, 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, projects_keys)
    dict_writer.writeheader()
    dict_writer.writerows(projects_data)

print(len(no_image_found_json))

print(len(trouble_links))

with open(links_filename, 'w', newline='') as output_file:
    output_file.write('\n'.join(trouble_links))
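For reference, the webgateway endpoint used by download_datasets_images() can be exercised for a single dataset as below. This is a minimal, illustrative sketch; the dataset id 1 is a placeholder, and the response is assumed to be a JSON list of image records carrying the id, name and thumb_url fields read by extract_images_projects_data().

import json
import requests

# Fetch the children (images) of one SSBD OMERO dataset (illustrative sketch;
# dataset_id 1 is a placeholder, replace it with a real SSBD:OMERO Dataset ID).
dataset_id = 1
url = "https://ssbd.riken.jp/omero/webgateway/dataset/%s/children/" % dataset_id
res = requests.get(url)
images_json_data = json.loads(res.text)

# Each record is expected to carry the fields used by extract_SSBD_data.py.
for image_ in images_json_data[:5]:
    print(image_.get("id"), image_.get("name"), image_.get("thumb_url"))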