Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pinnacle experiment data -- scDTI #309

Merged
merged 4 commits into from
Sep 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions tdc/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -927,6 +927,16 @@ def get_task2category():
"pinnacle_protein_embed": "pth",
"pinnacle_labels_dict": "txt",
"panpep": "tab",
"pinnacle_output1": "zip",
"pinnacle_output2": "zip",
"pinnacle_output3": "zip",
"pinnacle_output4": "zip",
"pinnacle_output5": "zip",
"pinnacle_output6": "zip",
"pinnacle_output7": "zip",
"pinnacle_output8": "zip",
"pinnacle_output9": "zip",
"pinnacle_output10": "zip",
}

name2id = {
Expand Down Expand Up @@ -1104,6 +1114,16 @@ def get_task2category():
"pinnacle_protein_embed": 10407128,
"pinnacle_labels_dict": 10409635,
"panpep": 10428565,
"pinnacle_output1": 10431072,
"pinnacle_output2": 10431073,
"pinnacle_output3": 10431078,
"pinnacle_output4": 10431080,
"pinnacle_output5": 10431077,
"pinnacle_output6": 10431076,
"pinnacle_output7": 10431079,
"pinnacle_output8": 10431074,
"pinnacle_output9": 10431075,
"pinnacle_output10": 10431081,
}

oracle2type = {
Expand Down
54 changes: 42 additions & 12 deletions tdc/resource/pinnacle.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from ..utils import general_load
from ..utils.load import download_wrapper, load_json_from_txt_file
from ..utils.load import download_wrapper, load_json_from_txt_file, zip_data_download_wrapper

import pandas as pd
import os
Expand All @@ -9,15 +9,6 @@
class PINNACLE:
"""
PINNACLE is a class for loading and manipulating the PINNACLE networks and embeddings.
@article{
Li2023,
author = "Michelle Li",
title = "{PINNACLE}",
year = "2023",
month = "4",
url = "https://figshare.com/articles/software/AWARE/22708126",
doi = "10.6084/m9.figshare.22708126.v5"
}
"""

def __init__(self, path="./data"):
Expand All @@ -30,7 +21,6 @@ def __init__(self, path="./data"):
"\t") # use tab as names were left with spaces
self.cell_tissue_mg.columns = ["Tissue", "Cell"]
self.embeds_name = "pinnacle_protein_embed"
# self.embeds = resource_dataset_load(self.embeds_name, path, [self.embeds_name])
self.embeds_name = download_wrapper(self.embeds_name, path,
self.embeds_name)
self.embeds = torch.load(os.path.join(path, self.embeds_name + ".pth"))
Expand Down Expand Up @@ -61,7 +51,6 @@ def get_keys(self):
def get_embeds(self):
prots = self.get_keys()
emb = self.get_embeds_raw()
# nemb = {'--'.join(prots.iloc[k]): v for k, v in emb.items()}
x = {}
ctr = 0
for _, v in emb.items():
Expand All @@ -86,3 +75,44 @@ def get_embeds(self):
x), "dims not mantained when translated to pandas. {} vs {}".format(
len(df), len(x))
return df

def get_exp_data(self, seed=1, split="train"):
    """Load one split of the PINNACLE experiment outputs as one DataFrame.

    The zip archive registered as ``pinnacle_output<seed>`` is downloaded
    and unpacked into a private temporary directory, every CSV whose file
    name contains ``_<split>_`` is read, and the rows are concatenated.

    Earlier revisions did this work directly inside ``./data`` and deleted
    every file found there before and after the download.  That wiped
    unrelated cached datasets (including the files ``__init__`` had just
    fetched) and crashed when ``./data`` held a sub-directory.  Working in
    a temporary directory leaves the shared data directory untouched.

    Args:
        seed: Which experiment run to load.  Only ``pinnacle_output1`` ..
            ``pinnacle_output10`` are offered to the downloader, so the
            value must format to one of those names.
        split: One of ``"train"``, ``"val"`` or ``"test"``.

    Returns:
        pandas.DataFrame: rows of all matching CSVs (freshly indexed) with
        two extra columns: ``cell_type_label`` (the last ``_``-separated
        token of the file name, without extension) and ``disease``
        (``"IBD"`` when the file name contains ``"3767"``, else ``"RA"``).

    Raises:
        ValueError: if ``split`` or ``seed`` is not valid, or if the
            archive holds no CSV for the requested split.
        FileNotFoundError: if no CSV file is found after extraction.
    """
    # Function-scope import: this method is the only user of tempfile.
    import tempfile

    if split not in ["train", "val", "test"]:
        raise ValueError("{} not a valid split".format(split))

    dataset_names = ["pinnacle_output{}".format(x) for x in range(1, 11)]
    filename = "pinnacle_output{}".format(seed)
    # Reject unknown seeds up front instead of handing an unregistered
    # name to the downloader.
    if filename not in dataset_names:
        raise ValueError(
            "{} not a valid seed; expected a value from 1 to 10".format(seed))

    split_tag = "_{}_".format(split)
    frames = []
    # NOTE(review): assumes zip_data_download_wrapper unpacks the archive's
    # CSV files directly into the directory it is given -- the same
    # assumption the previous ./data-based implementation relied on.
    with tempfile.TemporaryDirectory(prefix="pinnacle_exp_") as workdir:
        print("downloading pinancle zip data...")
        zip_data_download_wrapper(filename, workdir, dataset_names)
        print("success!")

        # Sorted so the row order of the result does not depend on the
        # arbitrary order returned by os.listdir.
        csv_files = sorted(
            f for f in os.listdir(workdir) if f.endswith(".csv"))
        if not csv_files:
            raise FileNotFoundError(
                "no csv files found after extracting {}".format(filename))

        print("iterating over csv files...")
        for file in csv_files:
            print("got file {}".format(file))
            if split_tag not in file:
                continue
            print("reading into pandas...")
            df = pd.read_csv(os.path.join(workdir, file))
            # File names end in "_<cell type>.csv".
            df["cell_type_label"] = file.split("_")[-1].split(".")[0]
            # NOTE(review): "3767" presumably identifies the IBD task in
            # the file name; every other file is labelled RA -- confirm.
            df["disease"] = "IBD" if "3767" in file else "RA"
            frames.append(df)
    # The temporary directory and all extracted files are gone here.

    if not frames:
        raise ValueError(
            "no csv files for split {} in {}".format(split, filename))
    return pd.concat(frames, axis=0, ignore_index=True)
8 changes: 7 additions & 1 deletion tdc/test/test_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ def test_get_gene_metadata(self):
fmt="pyarrow",
measurement_name="RNA")
print(varpyarrow)
# assert isinstance(varpyarrow, SparseCOOTensor)

def test_get_measurement_matrix(self):
X = self.resource.query_measurement_matrix(
Expand Down Expand Up @@ -112,6 +111,13 @@ def test_embeddings(self):
assert len(set(cells)) == num_cells, "{} vs {} for cell_types".format(
len(cells), num_cells)

def test_exp_data(self):
    """PINNACLE.get_exp_data() must return a non-empty pandas DataFrame."""
    from tdc.resource.pinnacle import PINNACLE

    # Default arguments are used for get_exp_data (no seed/split passed).
    frame = PINNACLE().get_exp_data()
    assert isinstance(frame, DataFrame)
    assert len(frame) > 0, "PINNACLE exp_data is empty"

def tearDown(self):
try:
print(os.getcwd())
Expand Down
Loading