Skip to content

Commit

Permalink
Clean up in h5::array_interface
Browse files Browse the repository at this point in the history
  • Loading branch information
Thoemi09 authored and Wentzell committed Apr 24, 2024
1 parent 4b6228e commit beb8de9
Show file tree
Hide file tree
Showing 10 changed files with 375 additions and 216 deletions.
86 changes: 53 additions & 33 deletions c++/h5/array_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,13 @@ namespace h5::array_interface {
namespace {

// Create an HDF5 memory dataspace.
dataspace make_mem_dspace(h5_array_view const &v) {
dataspace make_mem_dspace(array_view const &v) {
// scalar case
if (v.rank() == 0) return H5Screate(H5S_SCALAR);

// create a dataspace of rank v.rank() and with shape v.L_tot
dataspace dspace = H5Screate_simple(v.slab.rank(), v.L_tot.data(), nullptr);
if (!dspace.is_valid()) throw std::runtime_error("Error in make_mem_dspace: Creating the dataspace for an h5_array_view failed");
// create a dataspace of rank v.rank() and with shape v.parent_shape
dataspace dspace = H5Screate_simple(v.slab.rank(), v.parent_shape.data(), nullptr);
if (!dspace.is_valid()) throw std::runtime_error("Error in make_mem_dspace: Creating the dataspace for an array_view failed");

// select the hyperslab according to v.slab
herr_t err = H5Sselect_hyperslab(dspace, H5S_SELECT_SET, v.slab.offset.data(), v.slab.stride.data(), v.slab.count.data(),
Expand All @@ -57,32 +57,52 @@ namespace h5::array_interface {

} // namespace

std::pair<v_t, v_t> get_L_tot_and_strides_h5(long const *np_strides, int rank, long view_size) {
std::pair<v_t, v_t> get_parent_shape_and_h5_strides(long const *np_strides, int rank, long view_size) {
// scalar case: return empty vectors
if (rank == 0) return {};

// empty nd-array case: return (0,0,0), (1,1,1)
// empty view case: return (0,0,0), (1,1,1)
if (view_size == 0) return {v_t(rank, 0), v_t(rank, 1)};

// general case
v_t Ltot(rank), h5_strides(rank);
// for the general case, we would like to find a parent_shape and h5_strides such that the following equations hold (rank = N):
// 1. np_strides[N - 1] = h5_strides[N - 1]
// 2. np_strides[N - 2] = h5_strides[N - 2] * parent_shape[N - 1]
// 3. np_strides[N - 3] = h5_strides[N - 3] * parent_shape[N - 1] * parent_shape[N - 2]
// ...
// N. np_strides[N - N] = h5_strides[N - N] * parent_shape[N - 1] * parent_shape[N - 2] * ... * parent_shape[1]
//
// note that np_strides[N - i] = m * s[N - 1] * s[N - 2] * ... * s[N - i + 1], where 0 < m < s[N - i] and
// s is the true shape of the parent array

// initialize h5_strides with the np_strides
v_t parent_shape(rank), h5_strides(rank);
for (int u = 0; u < rank; ++u) h5_strides[u] = np_strides[u];
Ltot[0] = view_size;

// from the above equations it follows that parent_shape[0] (size of the slowest varying dimension) is arbitrary
parent_shape[0] = view_size;

// to solve the equations, we use the following algorithm:
// - Eq. 1 is already satisfied
// - parent_shape[N - i] = gcd(np_strides[N - i], np_strides[N - i - 1], ..., np_strides[0])
// - h5_strides[N - 1] = np_strides[N - 1] / parent_shape[N - 1]
// - divide equations i, i+1, ..., N by parent_shape[N - i]
// - repeat until all equations are satisfied
// that way we guarantee that the parent_shape[i] >= s[i] (see note above), which is important to make sure that
// the view elements are not out of bounds when HDF5 selects the hyperslab
for (int u = rank - 2; u >= 0; --u) {
// L[u+1] as gcd of size and stride[u] ... stride[0]
hsize_t L = h5_strides[u];
// L becomes the gcd
for (int v = u - 1; v >= 0; --v) { L = std::gcd(L, h5_strides[v]); }
// divides
for (int v = u; v >= 0; --v) { h5_strides[v] /= L; }
Ltot[u + 1] = L;
// calculate the gcd
hsize_t gcd = h5_strides[u];
for (int v = u - 1; v >= 0; --v) { gcd = std::gcd(gcd, h5_strides[v]); }
// set partent_shape to the gcd
parent_shape[u + 1] = gcd;
// divide the remaining equations by the gcd
for (int v = u; v >= 0; --v) { h5_strides[v] /= gcd; }
}

return {Ltot, h5_strides};
return {parent_shape, h5_strides};
}

h5_lengths_type get_h5_lengths_type(group g, std::string const &name) {
dataset_info get_dataset_info(group g, std::string const &name) {
// open dataset
dataset ds = g.open_dataset(name);

Expand All @@ -97,7 +117,7 @@ namespace h5::array_interface {
return {std::move(dims_out), ty, has_complex_attribute};
}

void write(group g, std::string const &name, h5_array_view const &v, bool compress) {
void write(group g, std::string const &name, array_view const &v, bool compress) {
// unlink the dataset if it already exists
g.unlink(name);

Expand Down Expand Up @@ -141,15 +161,15 @@ namespace h5::array_interface {
if (v.is_complex) h5_write_attribute(ds, "__complex__", "1");
}

void write_slice(group g, std::string const &name, h5_array_view const &v, h5_lengths_type lt, hyperslab sl) {
void write_slice(group g, std::string const &name, array_view const &v, dataset_info ds_info, hyperslab sl) {
// empty hyperslab
if (sl.empty()) return;

// check consistency of input: block shape {1, ..., 1} is assumed
if (v.slab.count != sl.count) throw std::runtime_error("Error in h5::array_interface::write_slice: Memory and file slabs are incompatible");
if (not hdf5_type_equal(v.ty, lt.ty))
if (not hdf5_type_equal(v.ty, ds_info.ty))
throw std::runtime_error("Error in h5::array_interface::write_slice: Incompatible HDF5 types: " + get_name_of_h5_type(v.ty)
+ " != " + get_name_of_h5_type(lt.ty));
+ " != " + get_name_of_h5_type(ds_info.ty));

// open existing dataset, get dataspace and select hyperslab
dataset ds = g.open_dataset(name);
Expand All @@ -169,7 +189,7 @@ namespace h5::array_interface {
}
}

void write_attribute(object obj, std::string const &name, h5_array_view v) {
void write_attribute(object obj, std::string const &name, array_view v) {
// check if the attribute already exists
if (H5LTfind_attribute(obj, name.c_str()) != 0)
throw std::runtime_error("Error in h5::array_interface::write_attribute: Attribute " + name + " already exists");
Expand All @@ -186,7 +206,7 @@ namespace h5::array_interface {
if (err < 0) throw std::runtime_error("Error in h5::array_interface::write_attribute: Writing to the attribute " + name + " failed");
}

void read(group g, std::string const &name, h5_array_view v, h5_lengths_type lt, hyperslab sl) {
void read(group g, std::string const &name, array_view v, dataset_info ds_info, hyperslab sl) {
// open dataset and get dataspace
dataset ds = g.open_dataset(name);
dataspace file_dspace = H5Dget_space(ds);
Expand All @@ -199,20 +219,20 @@ namespace h5::array_interface {
}

// check consistency of input
if (H5Tget_class(v.ty) != H5Tget_class(lt.ty))
if (H5Tget_class(v.ty) != H5Tget_class(ds_info.ty))
throw std::runtime_error("Error in h5::array_interface::read: Incompatible HDF5 types: " + get_name_of_h5_type(v.ty)
+ " != " + get_name_of_h5_type(lt.ty));
+ " != " + get_name_of_h5_type(ds_info.ty));

if (not hdf5_type_equal(v.ty, lt.ty))
std::cerr << "WARNING: HDF5 type mismatch while reading into an h5_array_view: " + get_name_of_h5_type(v.ty)
+ " != " + get_name_of_h5_type(lt.ty) + "\n";
if (not hdf5_type_equal(v.ty, ds_info.ty))
std::cerr << "WARNING: HDF5 type mismatch while reading into an array_view: " + get_name_of_h5_type(v.ty)
+ " != " + get_name_of_h5_type(ds_info.ty) + "\n";

if (lt.rank() != v.rank())
if (ds_info.rank() != v.rank())
throw std::runtime_error("Error in h5::array_interface::read: Incompatible ranks: " + std::to_string(v.rank())
+ " != " + std::to_string(lt.rank()));
+ " != " + std::to_string(ds_info.rank()));

// block shape of {1, ..., 1} is assumed
if (sl.empty() and lt.lengths != v.slab.count) throw std::runtime_error("Error in h5::array_interface::read: Incompatible shapes");
if (sl.empty() and ds_info.lengths != v.slab.count) throw std::runtime_error("Error in h5::array_interface::read: Incompatible shapes");

// memory dataspace
dataspace mem_dspace = make_mem_dspace(v);
Expand All @@ -225,7 +245,7 @@ namespace h5::array_interface {
}
}

void read_attribute(object obj, std::string const &name, h5_array_view v) {
void read_attribute(object obj, std::string const &name, array_view v) {
// open attribute
attribute attr = H5Aopen(obj, name.c_str(), H5P_DEFAULT);
if (!attr.is_valid()) throw std::runtime_error("Error in h5::array_interface::read_attribute: Opening the attribute " + name + " failed");
Expand Down
83 changes: 47 additions & 36 deletions c++/h5/array_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,25 +35,25 @@ namespace h5::array_interface {
* @{
*/

/// Simple struct to store basic information about an n-dimensional array/dataspace.
struct h5_lengths_type {
/// Shape of the array/dataspace.
/// Simple struct to store basic information about an HDF5 dataset.
struct dataset_info {
/// Shape of the dataspace in the dataset.
v_t lengths;

/// h5::datatype stored in the array/dataspace.
/// h5::datatype stored in the dataset.
datatype ty;

/// Whether the stored values are complex.
bool has_complex_attribute;

/// Get the rank of the array/dataspace.
/// Get the rank of the dataspace in the dataset.
[[nodiscard]] int rank() const { return static_cast<int>(lengths.size()); }
};

/**
* @brief Struct representing an HDF5 hyperslab.
*
* @details A hyperslab is used to select elements form an n-dimensional array/dataspace and it is defined
* @details A hyperslab is used to select elements from an n-dimensional array/dataspace and it is defined
* by 4 arrays of the same size as the rank of the underlying dataspace
* (see <a href="https://docs.hdfgroup.org/hdf5/v1_12/group___h5_s.html">HDF5 docs</a>):
* - `offset`: Origin of the hyperslab in the dataspace.
Expand Down Expand Up @@ -129,19 +129,22 @@ namespace h5::array_interface {
/**
* @brief Struct representing a view on an n-dimensional array/dataspace.
*
* @details A view consists of the parent array and of an h5::array_interface::hyperslab
* specifying a selection. The array is defined by a pointer to its data and its shape. If
* the data of the array is complex, its imaginary part is treated as just another dimension.
* @details A view consists of the parent array and of an h5::array_interface::hyperslab specifying a selection.
* The parent array is defined by a pointer to its data and its shape. Note that the shape of the parent array
* does not necessarily have to correspond to the actual shape and size of the underlying memory. It is only used
* to select the correct elements in the hyperslab.
*
* If the data of the array is complex, its imaginary part is treated as just another dimension.
*/
struct h5_array_view {
struct array_view {
/// h5::datatype stored in the array.
datatype ty;

/// Pointer to the data of the array.
void *start;

/// Shape of the (contiguous) parent array.
v_t L_tot;
v_t parent_shape;

/// h5::array_interface::hyperslab specifying the selection of the view.
hyperslab slab;
Expand All @@ -160,9 +163,9 @@ namespace h5::array_interface {
* @param rank Rank of the parent array (excluding the possible added imaginary dimension).
* @param is_complex Whether the data is complex valued.
*/
h5_array_view(datatype ty, void *start, int rank, bool is_complex)
: ty(std::move(ty)), start(start), L_tot(rank + is_complex), slab(rank, is_complex), is_complex(is_complex) {
if (is_complex) L_tot[rank] = 2;
array_view(datatype ty, void *start, int rank, bool is_complex)
: ty(std::move(ty)), start(start), parent_shape(rank + is_complex), slab(rank, is_complex), is_complex(is_complex) {
if (is_complex) parent_shape[rank] = 2;
}

/// Get the rank of the view (including the possible added imaginary dimension).
Expand All @@ -171,31 +174,39 @@ namespace h5::array_interface {

/**
* @brief Given a view on an n-dimensional array (dataspace) by specifying its numpy/nda-style strides and
* its size, calculate the shape of the underlying parent array and the HDF5 strides of the view.
* its size (number of elements in the view), calculate the shape of a possible parent array and the corresponding
* HDF5 strides of the view.
*
* @details The memory layout is assumend to be in C-order. Suppose `parent_shape` is an array containing the shape of
* the n-dimensional parent array, `np_strides` contains the numpy strides and `h5_strides` are the HDF5 strides. Then
* the following equations have to hold (rank = N):
* - `np_strides[N - 1] = h5_strides[N - 1]`
* - `np_strides[N - 2] = h5_strides[N - 2] * parent_shape[N - 1]`
* - `np_strides[N - 3] = h5_strides[N - 3] * parent_shape[N - 1] * parent_shape[N - 2]`
* - `...`
* - `np_strides[N - N] = h5_strides[N - N] * parent_shape[N - 1] * parent_shape[N - 2] * ... * parent_shape[1]`.
*
* @warning This function contains a bug and only works as intended in special cases.
* Note that `np_strides[N - i] = m * s[N - 1] * s[N - 2] * ... * s[N - i + 1]`, where `0 < m < s[N - i]` is an integer
* an `s` is the true shape of the parent array.
*
* @details The memory layout is assumend to be in C-order. Suppose `L` is an array containing the shape of the
* n-dimensional parent array, `np_strides` contains the numpy strides and `h5_strides` are the HDF5 strides. Then
* - `np_strides[n - 1] = h5_strides[n - 1]`,
* - `np_strides[n - i] = L[n - 1] * ... * L[n - i - 1] * h5_strides[n - i]` and
* - `np_strides[0] = L[n - 1] * ... * L[1] * h5_strides[0]`.
* @note The shape of the parent array as well as the HDF5 strides that fulfill the above equations are not unique.
* This is not a problem as long as HDF5 manages to select the correct elements in memory.
*
* @param np_strides Numpy/nda-style strides.
* @param rank Rank of the n-dimensional parent array.
* @param view_size Number of elements in the given view.
* @return std::pair containing the shape of the parent array and the HDF5 strides of the view.
*/
std::pair<v_t, v_t> get_L_tot_and_strides_h5(long const *np_strides, int rank, long view_size);
std::pair<v_t, v_t> get_parent_shape_and_h5_strides(long const *np_strides, int rank, long view_size);

/**
* @brief Retrieve the shape and the h5::datatype from a dataset.
*
* @param g h5::group containing the dataset.
* @param name Name of the dataset.
* @return h5::array_interface::h5_lengths_type containing the shape and HDF5 type of the dataset.
* @return h5::array_interface::dataset_info containing the shape and HDF5 type of the dataset.
*/
h5_lengths_type get_h5_lengths_type(group g, std::string const &name);
dataset_info get_dataset_info(group g, std::string const &name);

/**
* @brief Write an array view to an HDF5 dataset.
Expand All @@ -206,10 +217,10 @@ namespace h5::array_interface {
*
* @param g h5::group in which the dataset is created.
* @param name Name of the dataset
* @param v h5::array_interface::h5_array_view to be written.
* @param v h5::array_interface::array_view to be written.
* @param compress Whether to compress the dataset.
*/
void write(group g, std::string const &name, h5_array_view const &v, bool compress);
void write(group g, std::string const &name, array_view const &v, bool compress);

/**
* @brief Write an array view to a selected hyperslab of an existing HDF5 dataset.
Expand All @@ -221,40 +232,40 @@ namespace h5::array_interface {
*
* @param g h5::group which contains the dataset.
* @param name Name of the dataset.
* @param v h5::array_interface::h5_array_view to be written.
* @param lt h5::array_interface::h5_lengths_type of the file dataset (only used to check the consistency of the input).
* @param v h5::array_interface::array_view to be written.
* @param ds_info h5::array_interface::dataset_info of the file dataset (only used to check the consistency of the input).
* @param sl h5::array_interface::hyperslab specifying the selection to be written to.
*/
void write_slice(group g, std::string const &name, h5_array_view const &v, h5_lengths_type lt, hyperslab sl);
void write_slice(group g, std::string const &name, array_view const &v, dataset_info ds_info, hyperslab sl);

/**
* @brief Write an array view to an HDF5 attribute.
*
* @param obj h5::object to which the attribute is attached.
* @param name Name of the attribute.
* @param v v h5::array_interface::h5_array_view to be written.
* @param v v h5::array_interface::array_view to be written.
*/
void write_attribute(object obj, std::string const &name, h5_array_view v);
void write_attribute(object obj, std::string const &name, array_view v);

/**
* @brief Read a given hyperslab from an HDF5 dataset into an array view.
*
* @param g h5::group which contains the dataset.
* @param name Name of the dataset.
* @param v h5::array_interface::h5_array_view to read into.
* @param lt h5::array_interface::h5_lengths_type of the file dataset (only used to check the consistency of the input).
* @param v h5::array_interface::array_view to read into.
* @param ds_info h5::array_interface::dataset_info of the file dataset (only used to check the consistency of the input).
* @param sl h5::array_interface::hyperslab specifying the selection to read from.
*/
void read(group g, std::string const &name, h5_array_view v, h5_lengths_type lt, hyperslab sl = {});
void read(group g, std::string const &name, array_view v, dataset_info ds_info, hyperslab sl = {});

/**
* @brief Read from an HDF5 attribute into an array view.
*
* @param obj h5::object to which the attribute is attached.
* @param name Name of the attribute.
* @param v h5::array_interface::h5_array_view to read into.
* @param v h5::array_interface::array_view to read into.
*/
void read_attribute(object obj, std::string const &name, h5_array_view v);
void read_attribute(object obj, std::string const &name, array_view v);

/** @} */

Expand Down
Loading

0 comments on commit beb8de9

Please sign in to comment.