Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
suneelbvs authored Mar 29, 2024
1 parent 6440547 commit df2f1ce
Show file tree
Hide file tree
Showing 5 changed files with 132 additions and 0 deletions.
27 changes: 27 additions & 0 deletions data_loaders/ames.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import numpy as np
import pandas as pd
import os
import ast

from data_loaders.dataset import DataLoader

class Ames(DataLoader):
    """Loader for the Ames dataset using precomputed Mordred fingerprints.

    The fingerprints are read from ``Fingerprints/Ames_Mordred.csv`` (one
    directory above this module); that file was generated with ``Mordred.py``.
    """

    def __init__(self):
        module_dir = os.path.dirname(os.path.abspath(__file__))
        csv_path = module_dir + "/../Fingerprints/Ames_Mordred.csv"
        frame = pd.read_csv(csv_path)

        # Every "fingerprint_Mordred" cell is a string holding a Python
        # literal; parse each one back into an array before stacking.
        parsed_rows = [
            np.array(ast.literal_eval(cell))
            for cell in frame["fingerprint_Mordred"]
        ]

        self.x_values = np.stack(parsed_rows)
        self.y_values = frame["Y"].to_numpy()
        self.name = "Ames"

    def size(self):
        """Return the number of rows in the stacked feature array."""
        features = self.x_values
        return len(features)

    def x(self, dataset_slice_indices: np.ndarray) -> np.ndarray:
        """Return the features selected by ``dataset_slice_indices``."""
        features = self.x_values
        return features[dataset_slice_indices]

    def y(self, dataset_slice_indices):
        """Return the "Y" values selected by ``dataset_slice_indices``."""
        targets = self.y_values
        return targets[dataset_slice_indices]
21 changes: 21 additions & 0 deletions data_loaders/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import numpy as np

# This class is used to load a dataset and perform feature extraction.
# During Bayesian optimization, we call the data loader to get the x and y values for each dataset slice.
class DataLoader:
    """Base interface for dataset loaders.

    A subclass loads its dataset (performing feature extraction and data
    cleaning in ``__init__``), sets ``name``, and overrides ``size``, ``x``
    and ``y``. The accessors here raise ``NotImplementedError`` so that a
    missing override fails loudly instead of silently returning ``None``.
    """

    def __init__(self):
        # Subclasses should perform feature extraction and data cleaning here
        # and replace this placeholder with the dataset's name.
        self.name = ""

    def size(self) -> int:
        """Return the number of samples in the dataset.

        Raises:
            NotImplementedError: if the subclass does not override this method.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement size()"
        )

    def x(self, dataset_slice: np.ndarray) -> np.ndarray:
        """Return the x values (features) for a slice of the dataset.

        ``dataset_slice`` holds indexes into the original dataset, e.g.
        passing ``[1, 2, 9]`` returns the features at index 1, 2 and 9.

        Raises:
            NotImplementedError: if the subclass does not override this method.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement x()"
        )

    def y(self, dataset_slice: np.ndarray) -> np.ndarray:
        """Return the y values for a slice of the dataset.

        ``dataset_slice`` holds indexes into the original dataset, exactly
        as for ``x``.

        Raises:
            NotImplementedError: if the subclass does not override this method.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement y()"
        )
25 changes: 25 additions & 0 deletions data_loaders/halflife.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import numpy as np
import pandas as pd
from data_loaders.dataset import DataLoader
from transformer.fingerprints import FingerprintsTransformer
import os

class halflife(DataLoader):
    """Loader for the cleaned half-life dataset.

    Features come from ``FingerprintsTransformer(data, "Drug", "ECFP")``;
    presumably these are ECFP fingerprints of the "Drug" column (confirm
    against ``transformer.fingerprints``). Targets are the "Y" column.
    """

    def __init__(self):
        self.name = "halflife"

        # The CSV is resolved relative to this module, not the working dir.
        base_dir = os.path.dirname(os.path.abspath(__file__))
        csv_file = base_dir + "/../datasets/cleaned_datasets/halflife_dataset.csv"
        self.data = pd.read_csv(csv_file)

        fingerprinter = FingerprintsTransformer(self.data, "Drug", "ECFP")
        self.x_values = fingerprinter.to_np()
        self.y_values = self.data["Y"].to_numpy()

    def size(self):
        """Return the number of entries in the feature collection."""
        features = self.x_values
        return len(features)

    def x(self, dataset_slice_indices: np.ndarray) -> np.ndarray:
        """Return the features selected by ``dataset_slice_indices``."""
        features = self.x_values
        return features[dataset_slice_indices]

    def y(self, dataset_slice_indices):
        """Return the "Y" values selected by ``dataset_slice_indices``."""
        targets = self.y_values
        return targets[dataset_slice_indices]
33 changes: 33 additions & 0 deletions data_loaders/ld50.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import os
import numpy as np
import pandas as pd
import ast

from data_loaders.dataset import DataLoader
from functools import lru_cache

class LD50(DataLoader):
    """Loader for the LD50 (Zhu) dataset with precomputed fingerprints.

    The ``fingerprint`` class attribute names the fingerprint type. It
    selects both the HDF5 file (``LD50_Zhu_<fingerprint>.h5``) and the
    feature column (``fingerprint_<fingerprint>``), so overriding it in a
    subclass actually switches the data that is loaded.
    """

    fingerprint = 'ECFP'

    def __init__(self):
        # Derive the file and column names from ``fingerprint`` rather than
        # hard-coding "ECFP"; with the default value the strings are
        # identical to the previous hard-coded ones.
        path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            f"../datasets/fingerprint_datasets/LD50_Zhu_{self.fingerprint}.h5"
        )
        self.data = pd.read_hdf(path)
        self.name = "ld50"
        self.x_values = np.stack(self.data[f"fingerprint_{self.fingerprint}"])
        self.y_values = self.data["Y"].to_numpy()

    def size(self):
        """Return the number of rows in the stacked feature array."""
        return len(self.x_values)

    def x(self, dataset_slice: slice | np.ndarray | None = None) -> np.ndarray:
        """Return the features for ``dataset_slice``.

        When ``dataset_slice`` is ``None`` (the default) the full feature
        array is returned.
        """
        if dataset_slice is None:
            return self.x_values
        return self.x_values[dataset_slice]

    def y(self, dataset_slice: slice | np.ndarray | None = None) -> np.ndarray:
        """Return the "Y" values for ``dataset_slice``.

        When ``dataset_slice`` is ``None`` (the default) the full target
        array is returned.
        """
        if dataset_slice is None:
            return self.y_values
        return self.y_values[dataset_slice]
26 changes: 26 additions & 0 deletions data_loaders/tox21.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from tdc.utils import retrieve_label_name_list
from tdc.single_pred import Tox
import numpy as np
from transformer.fingerprints import FingerprintsTransformer
from data_loaders.dataset import DataLoader

# https://tdcommons.ai/single_pred_tasks/tox/#tox21
class Tox21(DataLoader):
    """Loader for the TDC Tox21 dataset, restricted to its first label.

    Data is obtained through ``tdc.single_pred.Tox`` using the first entry
    of ``retrieve_label_name_list('Tox21')``. Features come from
    ``FingerprintsTransformer(data, "Drug", "ECFP")``; targets are the "Y"
    column.
    """

    def __init__(self):
        # Only the first of the available Tox21 label names is loaded.
        available_labels = retrieve_label_name_list('Tox21')
        first_label = available_labels[0]
        frame = Tox(name='Tox21', label_name=first_label).get_data()

        fingerprinter = FingerprintsTransformer(frame, "Drug", "ECFP")

        self.name = "Tox21"
        self.x_values = fingerprinter.to_np()
        self.y_values = frame["Y"].to_numpy()

    def size(self):
        """Return the number of entries in the feature collection."""
        features = self.x_values
        return len(features)

    def x(self, dataset_slice_indices: np.ndarray) -> np.ndarray:
        """Return the features selected by ``dataset_slice_indices``."""
        features = self.x_values
        return features[dataset_slice_indices]

    def y(self, dataset_slice_indices):
        """Return the "Y" values selected by ``dataset_slice_indices``."""
        targets = self.y_values
        return targets[dataset_slice_indices]

0 comments on commit df2f1ce

Please sign in to comment.