Skip to content

Commit

Permalink
implement hdf5 dataset reader op (#356)
Browse files Browse the repository at this point in the history
  • Loading branch information
mosheraboh authored May 31, 2024
1 parent 1810465 commit 7ed012b
Showing 1 changed file with 48 additions and 0 deletions.
48 changes: 48 additions & 0 deletions fuse/data/ops/ops_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from typing import Hashable, List, Optional, Dict, Union
from fuse.utils.file_io.file_io import read_dataframe
import pandas as pd
import h5py

from fuse.data import OpBase
from fuse.utils.ndict import NDict
Expand Down Expand Up @@ -113,3 +114,50 @@ def get_all_keys(self) -> List[Hashable]:
:return: list of dataframe index values
"""
return list(self.data.keys())


class OpReadHDF5(OpBase):
    """
    Op reading data from an hdf5 based dataset.

    The hdf5 file is opened read-only once, in the constructor, and the handle is kept
    on the instance (this class never closes it explicitly). Each call copies one entry
    per selected hdf5 dataset ("column") into the sample dict.
    """

    # NOTE(review): OpBase.__init__ is not invoked by this constructor - confirm that is intentional.
    def __init__(
        self,
        data_filename: Optional[str] = None,
        columns_to_extract: Optional[List[str]] = None,
        rename_columns: Optional[Dict[str, str]] = None,
        key_index: str = "data.sample_id",
        key_column: str = "sample_id",
    ):
        """
        :param data_filename: path to hdf5 file. Must be provided, despite the None default.
        :param columns_to_extract: list of columns to extract - dataset keys to extract. When None (default) all columns are extracted
        :param rename_columns: mapping from a column name in the hdf5 file to the key under which its value is stored in sample_dict.
                               Columns missing from the mapping are stored under their original name.
        :param key_index: name of value in sample_dict which will be used as the key/index into each extracted column
        :param key_column: stored on the instance, but not read anywhere in this class
        :raises ValueError: if data_filename is None, or if there are no columns to extract
        """
        if data_filename is None:
            raise ValueError("OpReadHDF5: data_filename must be provided")

        # store input
        self._data_filename = data_filename
        self._rename_columns = rename_columns if rename_columns is not None else {}
        self._key_index = key_index
        self._key_column = key_column

        self._h5 = h5py.File(self._data_filename, "r")

        # h5py returns a non-subscriptable keys view; materialize it into a list so that
        # it can be indexed below and iterated repeatedly in __call__
        if columns_to_extract is None:
            self._columns_to_extract = list(self._h5.keys())
        else:
            self._columns_to_extract = list(columns_to_extract)

        if not self._columns_to_extract:
            raise ValueError(f"OpReadHDF5: no columns to extract from {self._data_filename}")

        # the number of samples is taken from the first column
        # NOTE(review): assumes all extracted columns have the same length - confirm
        self._num_samples = len(self._h5[self._columns_to_extract[0]])

    def num_samples(self) -> int:
        """
        :return: length of the first extracted column, as measured at construction time
        """
        return self._num_samples

    def __call__(self, sample_dict: NDict) -> Union[None, dict, List[dict]]:
        """
        Read the entry selected by sample_dict[key_index] from every extracted column
        and store it in sample_dict (under the renamed key, when one was configured).

        :param sample_dict: sample dictionary; must contain the key given as key_index
        :return: the same sample_dict, updated in place
        """
        index = sample_dict[self._key_index]
        for column in self._columns_to_extract:
            key_to_store = self._rename_columns.get(column, column)
            sample_dict[key_to_store] = self._h5[column][index]

        return sample_dict

0 comments on commit 7ed012b

Please sign in to comment.