Commit

adding data sources parsing scripts
khaledk2 committed Dec 16, 2024
1 parent d09ef57 commit 69dd6ff
Showing 3 changed files with 403 additions and 0 deletions.
@@ -0,0 +1,189 @@
'''
This script reads the BIA JSON files and creates CSV files.
These CSV files will be used to index the data and push it to Elasticsearch.
'''
import json
import copy
import csv
import random

# All the paths and file names should be configured for the hosted platform
bia_image_sample_json_file = "../data/bia-image-export.json"
bia_study_sample_json_file = "../data/bia-study-metadata.json"
bia_dataset_sample_json_file = "../data/bia-dataset-metadata.json"

with open(bia_study_sample_json_file) as f:
    studies = json.load(f)

with open(bia_image_sample_json_file) as f:
    images = json.load(f)

with open(bia_dataset_sample_json_file) as f:
    datasets = json.load(f)

print("Number of studies:", len(studies))
print("Number of datasets:", len(datasets))
print("Number of images:", len(images))

def uuid_to_int(uuid_string):
    '''
    The BIA data has no numeric ids, only UUIDs, so this function generates
    ids to be used in the search engine. Note that the generated ids are
    random, so they are not stable between runs and may collide.
    :param uuid_string:
    :return:
    '''
    return random.randint(0, 1000000)
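
# A possible alternative, not used above: deriving a deterministic id from the
# UUID itself would make ids stable across runs and avoid random collisions.
# This is only a sketch; uuid_to_int_stable is a hypothetical name and the
# 31-bit truncation is an assumption, not part of the original script.
#
# import hashlib
#
# def uuid_to_int_stable(uuid_string):
#     # hash the UUID string and keep the low 31 bits as a positive int id
#     digest = hashlib.sha1(str(uuid_string).encode("utf-8")).hexdigest()
#     return int(digest, 16) % (2 ** 31)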

projects_keyvalues = ["accession_id", "release_date", "licence", "uuid"]
projects_main = ["description", "study"]
projects_data = []
datasets_projects = {}
images_data = []
projects_id_name = {}
dataset_names_id = {}
images_without_url = []
project_id_uuid = {}
dataset_id_uuid = {}
values = []

def extract_dataset_projects():
    for st, dataset in datasets.items():
        datasets_projects[st] = dataset.get("submitted_in_study").get("uuid")
        dataset_names_id[st] = dataset.get("title_id")
        dataset_id_uuid[st] = uuid_to_int(st)

def extract_images_data():
    global keys
    for st, imag in images.items():
        image = {}
        images_data.append(image)
        image["id"] = uuid_to_int(imag.get("uuid"))
        image["name"] = imag.get("uuid")
        image["description"] = imag.get("description")
        image["dataset_id"] = imag.get("submission_dataset_uuid")
        image["dataset_id"] = dataset_id_uuid[image["dataset_id"]]
        image["project_id"] = project_id_uuid[datasets_projects[imag.get("submission_dataset_uuid")]]
        image["project_name"] = projects_id_name[datasets_projects[imag.get("submission_dataset_uuid")]]
        image["dataset_name"] = dataset_names_id[imag.get("submission_dataset_uuid")]
        image_ = copy.deepcopy(image)
        # one row per key-value pair: uuid, project_uuid, dataset_uuid
        image_uid = copy.deepcopy(image_)
        images_data.append(image_uid)
        image_uid["mapvalue_name"] = "uuid"
        image_uid["mapvalue_value"] = imag.get("uuid")
        image_uid["mapvalue_index"] = 0
        image_project_uid = copy.deepcopy(image_)
        images_data.append(image_project_uid)
        image_project_uid["mapvalue_name"] = "project_uuid"
        image_project_uid["mapvalue_value"] = datasets_projects[imag.get("submission_dataset_uuid")]
        image_project_uid["mapvalue_index"] = 0
        image_dataset_uid = copy.deepcopy(image_)
        images_data.append(image_dataset_uid)
        image_dataset_uid["mapvalue_name"] = "dataset_uuid"
        image_dataset_uid["mapvalue_value"] = image["dataset_id"]
        image_dataset_uid["mapvalue_index"] = 0
        index = 0
        for sample in imag["subject"]["sample_of"]:
            image_org = copy.deepcopy(image_)
            images_data.append(image_org)
            image_org["mapvalue_name"] = "organism"
            image_org["mapvalue_value"] = sample["organism_classification"][0]["scientific_name"]
            image_org["mapvalue_index"] = index
            if not keys:
                keys = image_org.keys()
            index = index + 1
        for key, value in imag["attribute"].items():
            if type(value) is str:
                if key not in values:
                    values.append(key)
                image_attr = copy.deepcopy(image_)
                images_data.append(image_attr)
                image_attr["mapvalue_name"] = key
                image_attr["mapvalue_value"] = value
                image_attr["mapvalue_index"] = 0
        index = 0
        # one image_format, file_uri and image_size (total_size_in_bytes)
        # row per representation
        for file_ in imag["representation"]:
            image_file = copy.deepcopy(image_)
            images_data.append(image_file)
            image_file["mapvalue_name"] = "image_format"
            image_file["mapvalue_value"] = file_["image_format"]
            image_file["mapvalue_index"] = index
            image_file_ = copy.deepcopy(image_)
            images_data.append(image_file_)
            image_file_["mapvalue_name"] = "file_uri"
            image_file_["mapvalue_index"] = index
            if len(file_["file_uri"]) == 0:
                images_without_url.append(st)
                image_file_["mapvalue_value"] = "None"
            else:
                image_file_["mapvalue_value"] = file_["file_uri"][0]
            image_size = copy.deepcopy(image_)
            images_data.append(image_size)
            image_size["mapvalue_name"] = "image_size"
            image_size["mapvalue_index"] = index
            image_size["mapvalue_value"] = file_.get("total_size_in_bytes")
            index = index + 1
def extract_projects_data():
    for st, study in studies.items():
        project = {}
        project["id"] = uuid_to_int(study.get("uuid"))
        project_id_uuid[study.get("uuid")] = project["id"]
        project["name"] = study.get("title")
        projects_id_name[study.get("uuid")] = study.get("title")
        project["description"] = study.get("description")
        project_ = copy.deepcopy(project)
        # projects_data.append(project)
        if study.get("Title"):
            project_title = copy.deepcopy(project_)
            projects_data.append(project_title)
            project_title["mapvalue_name"] = "Title"
            project_title["mapvalue_value"] = study.get("Title")
            project_title["mapvalue_index"] = 0
        for name in projects_keyvalues:
            if study.get(name):
                project__ = copy.deepcopy(project_)
                projects_data.append(project__)
                project__["mapvalue_name"] = name
                project__["mapvalue_value"] = study.get(name)
                project__["mapvalue_index"] = 0
        index = 0
        if study.get("Author"):
            for author in study.get("Author"):
                index = index + 1
                project__ = copy.deepcopy(project_)
                projects_data.append(project__)
                project__["mapvalue_name"] = "author_name"
                project__["mapvalue_value"] = author.get("name")
                project__["mapvalue_index"] = index
                project___ = copy.deepcopy(project_)
                projects_data.append(project___)
                project___["mapvalue_name"] = "author_email"
                project___["mapvalue_value"] = author.get("contact_email")
                project___["mapvalue_index"] = index

keys = None
projects_filename = "../data/bia_projects.csv"
images_filename = "../data/bia_images.csv"

projects_keys = ["id", "name", "description", "mapvalue_value", "mapvalue_name", "mapvalue_index"]
extract_projects_data()
with open(projects_filename, 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, projects_keys)
    dict_writer.writeheader()
    dict_writer.writerows(projects_data)

extract_dataset_projects()
extract_images_data()

print ("datasets_projects",len(datasets_projects))
print("images_without_url", len(images_without_url))
print ("projects:",len(projects_filename))


with open(images_filename, 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(images_data)

print("unique attribute keys:", len(values))
print("images:", len(images_data))
@@ -0,0 +1,201 @@
import pandas as pd
import numpy as np
import requests
import json
import copy
import os
import csv

# dataset should be added
def read_csv_file(file_name):
    df = pd.read_csv(file_name).replace({np.nan: None})
    return df

def get_values(data):
    for index, row in data.iterrows():
        for col in data.columns:
            if row[col]:
                print(col, ":", row[col])


dump_for_ome = "../data/SSBD_dump-for-ome.csv"
df = read_csv_file(dump_for_ome)

images_key_values = ["Organism",
                     "Organism ontology",
                     "Strain",
                     "Strain ontology",
                     "Cell line",
                     "Cell line ontology",
                     "Gene symbols",
                     "Protein names",
                     "Protein tags",
                     "Reporter",
                     "GO Biological Process",
                     "GO Biological Process ontology",
                     "GO Cellular Component",
                     "GO Cellular Component ontology",
                     "GO Molecular Function",
                     "GO Molecular Function ontology"]

project_keyvalues = ["Biological Imaging Method", "Title",
                     "Biological Imaging Method ontology",
                     "ssbd_dataset_id"]


trouble_links = []
trouble_datasets = []
datasets_projects = {}
added_projects = []

def download_datasets_images():
    from datetime import datetime
    cou = 0
    for index, row in df.iterrows():
        if row.get("SSBD:OMERO Dataset ID") and row.get("SSBD:OMERO Project ID"):
            cou = cou + 1
            print("processing %s" % cou)
            datasets_projects[int(row.get("SSBD:OMERO Dataset ID"))] = int(row.get("SSBD:OMERO Project ID"))

            st = datetime.now()
            print(int(row.get("SSBD:OMERO Dataset ID")))
            dataset_id = int(row.get("SSBD:OMERO Dataset ID"))
            if os.path.isfile("../data/ssbd_images/datasets_images_dataset_%s.json" % dataset_id):
                print("Skipping download, file ../data/ssbd_images/datasets_images_dataset_%s.json already exists" % dataset_id)
                continue
            try:
                url = "https://ssbd.riken.jp/omero/webgateway/dataset/%s/children/" % dataset_id
                res = requests.get(url)
                data = json.loads(res.text)
                with open("../data/ssbd_images/datasets_images_dataset_%s.json" % dataset_id, "w") as outfile:
                    outfile.write(json.dumps(data, indent=4))

            except Exception as ex:
                print("error for url %s, error message is: %s" % (url, ex))
                trouble_links.append(url)
                trouble_datasets.append(row.get("Dataset ID"))
                trouble_datasets.append(row.get("SSBD:OMERO Dataset ID"))
                if len(trouble_links) == 3:
                    print(trouble_datasets)
                    print(trouble_links)
            end = datetime.now()
            print(st, end)


images_data = []
projects_data = []
no_image_found_json = []
def extract_images_projects_data():
    files_counter = 0
    for index, row in df.iterrows():
        if not row.get("SSBD:OMERO Dataset ID"):
            continue
        files_counter += 1
        print("processing %s" % files_counter)
        dataset_id = int(row.get("SSBD:OMERO Dataset ID"))
        fname = "../data/ssbd_images/datasets_images_dataset_%s.json" % dataset_id
        if not os.path.isfile(fname):
            print("No images file found for dataset: %s" % dataset_id)
            no_image_found_json.append(dataset_id)
            continue

        with open(fname) as f:
            images_json_data = json.load(f)
        print("Found")
        if row.get("SSBD:OMERO Project ID") not in added_projects:
            added_projects.append(row.get("SSBD:OMERO Project ID"))
            project = {}
            project["id"] = row.get("SSBD:OMERO Project ID")
            project["name"] = row.get("Project ID")
            project["description"] = row.get("Description")
            project_ = copy.deepcopy(project)
            projects_data.append(project)
            project_title = copy.deepcopy(project_)
            projects_data.append(project_title)
            project_title["mapvalue_name"] = "Title"
            project_title["mapvalue_value"] = row.get("Title")
            project_title["mapvalue_index"] = 0
            for name in project_keyvalues:
                if row.get(name):
                    project__ = copy.deepcopy(project_)
                    projects_data.append(project__)
                    project__["mapvalue_name"] = name
                    project__["mapvalue_value"] = row.get(name)
                    project__["mapvalue_index"] = 0

        for image_ in images_json_data:
            image = {}
            images_data.append(image)
            image["id"] = int(image_.get("id"))
            image["name"] = image_.get("name")
            image["description"] = image_.get("description")
            image["dataset_id"] = dataset_id
            image["project_id"] = datasets_projects[image["dataset_id"]]
            image["project_name"] = row.get("Project ID")
            image["dataset_name"] = row.get("Dataset ID")
            # keep a clean copy without mapvalue fields to clone for each row
            image_base = copy.deepcopy(image)
            image["mapvalue_name"] = "thumb_url"
            image["mapvalue_value"] = "https://ssbd.riken.jp%s" % image_.get("thumb_url")
            image["mapvalue_index"] = 0
            image_url_row = copy.deepcopy(image_base)
            images_data.append(image_url_row)
            image_url_row["mapvalue_name"] = "image_url"
            image_url_row["mapvalue_value"] = "https://ssbd.riken.jp/omero/webclient/img_detail/%s" % image["id"]
            image_url_row["mapvalue_index"] = 0

            dataset_url = copy.deepcopy(image_base)
            images_data.append(dataset_url)
            dataset_url["mapvalue_name"] = "dataset_url"
            dataset_url["mapvalue_value"] = "https://ssbd.riken.jp/omero/webclient/?show=dataset-%s" % dataset_id
            dataset_url["mapvalue_index"] = 0
            for name in images_key_values:
                if row.get(name):
                    image_kv = copy.deepcopy(image_base)
                    images_data.append(image_kv)
                    # "Gene symbols" and "Protein names" are indexed under
                    # singular key names
                    if name == "Gene symbols":
                        image_kv["mapvalue_name"] = "Gene symbol"
                    elif name == "Protein names":
                        image_kv["mapvalue_name"] = "Protein name"
                    else:
                        image_kv["mapvalue_name"] = name
                    # normalise known spelling variants in the dump
                    if row.get(name).lower() == "h. sapiens" and name.lower() == "organism":
                        image_kv["mapvalue_value"] = "Homo sapiens"
                    elif row.get(name).lower() == "hell cell" and name.lower() == "cell line":
                        image_kv["mapvalue_value"] = "hela"
                    elif row.get(name).lower() == "m. musculus" and name.lower() == "organism":
                        image_kv["mapvalue_value"] = "mus musculus"
                    else:
                        image_kv["mapvalue_value"] = row.get(name)
                    image_kv["mapvalue_index"] = 0

projects_filename = "../data/ssbd_images/ssbd_projects.csv"
images_filename = "../data/ssbd_images/ssbd_images.csv"
links_filename = "../data/ssbd_images/ssbd_links_error.csv"

download_datasets_images()
extract_images_projects_data()

keys = images_data[0].keys()
with open(images_filename, 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(images_data)

projects_keys = ["id", "name", "description", "mapvalue_value", "mapvalue_name", "mapvalue_index"]

with open(projects_filename, 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, projects_keys)
    dict_writer.writeheader()
    dict_writer.writerows(projects_data)


print("datasets without an images JSON file:", len(no_image_found_json))

print("links with download errors:", len(trouble_links))

with open(links_filename, 'w', newline='') as output_file:
    output_file.write('\n'.join(trouble_links))
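
# A quick sanity check, offered as a hedged sketch rather than part of the
# original script: load the generated CSV back with pandas and count rows per
# mapvalue_name to confirm each key-value pair produced a row.
#
# check = pd.read_csv(images_filename)
# print(check["mapvalue_name"].value_counts())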