From 88d12d497c16c0a399845ad1d492e7e8f4ca87d6 Mon Sep 17 00:00:00 2001 From: Thomas Hahn Date: Tue, 23 Apr 2024 11:54:21 -0400 Subject: [PATCH] Allow arbitrary block sizes in h5::array_interface --- c++/h5/array_interface.cpp | 41 ++++--- c++/h5/array_interface.hpp | 40 +++++-- c++/h5/scalar.hpp | 2 +- c++/h5/stl/array.hpp | 2 +- c++/h5/stl/vector.hpp | 2 +- doc/documentation.md | 2 +- python/h5/h5py_io.cpp | 2 +- test/c++/h5_array_interface.cpp | 205 ++++++++++++++++++-------------- 8 files changed, 170 insertions(+), 126 deletions(-) diff --git a/c++/h5/array_interface.cpp b/c++/h5/array_interface.cpp index bcf3eee..15841fc 100644 --- a/c++/h5/array_interface.cpp +++ b/c++/h5/array_interface.cpp @@ -102,11 +102,8 @@ namespace h5::array_interface { return {parent_shape, h5_strides}; } - dataset_info get_dataset_info(group g, std::string const &name) { - // open dataset - dataset ds = g.open_dataset(name); - - // retrieve shape information + dataset_info get_dataset_info(dataset ds) { + // retrieve shape and datatype information datatype ty = H5Dget_type(ds); bool has_complex_attribute = H5LTfind_attribute(ds, "__complex__"); dataspace dspace = H5Dget_space(ds); @@ -117,10 +114,18 @@ namespace h5::array_interface { return {std::move(dims_out), ty, has_complex_attribute}; } + dataset_info get_dataset_info(group g, std::string const &name) { + dataset ds = g.open_dataset(name); + return get_dataset_info(ds); + } + void write(group g, std::string const &name, array_view const &v, bool compress) { // unlink the dataset if it already exists g.unlink(name); + // shape of the hyperslab in memory + auto hs_shape = v.slab.shape(); + // chunk the dataset and add compression proplist cparms = H5P_DEFAULT; if (compress and (v.rank() != 0)) { @@ -131,7 +136,7 @@ namespace h5::array_interface { for (int i = v.rank() - 1; i >= 0; --i) { H5_ASSERT(max_chunk_size >= chunk_size); hsize_t max_dim = max_chunk_size / chunk_size; - chunk_dims[i] = std::clamp(v.slab.count[i], hsize_t{1}, max_dim); + chunk_dims[i] = std::clamp(hs_shape[i], hsize_t{1}, max_dim); chunk_size *= chunk_dims[i]; } cparms = H5Pcreate(H5P_DATASET_CREATE); @@ -139,8 +144,8 @@ namespace h5::array_interface { H5Pset_deflate(cparms, 1); } - // dataspace for the dataset in the file: v.slab.block shape {1, ..., 1} is assumed - dataspace file_dspace = H5Screate_simple(v.slab.rank(), v.slab.count.data(), nullptr); + // dataspace for the dataset in the file + dataspace file_dspace = H5Screate_simple(v.slab.rank(), hs_shape.data(), nullptr); // create the dataset in the file dataset ds = H5Dcreate2(g, name.c_str(), v.ty, file_dspace, H5P_DEFAULT, cparms, H5P_DEFAULT); @@ -161,12 +166,14 @@ namespace h5::array_interface { if (v.is_complex) h5_write_attribute(ds, "__complex__", "1"); } - void write_slice(group g, std::string const &name, array_view const &v, dataset_info ds_info, hyperslab sl) { + void write_slice(group g, std::string const &name, array_view const &v, hyperslab sl) { // empty hyperslab if (sl.empty()) return; - // check consistency of input: block shape {1, ..., 1} is assumed - if (v.slab.count != sl.count) throw std::runtime_error("Error in h5::array_interface::write_slice: Memory and file slabs are incompatible"); + // check consistency of input + if (v.slab.size() != sl.size()) throw std::runtime_error("Error in h5::array_interface::write_slice: Incompatible sizes"); + + auto ds_info = get_dataset_info(g, name); if (not hdf5_type_equal(v.ty, ds_info.ty)) throw std::runtime_error("Error in h5::array_interface::write_slice: Incompatible HDF5 types: " + get_name_of_h5_type(v.ty) + " != " + get_name_of_h5_type(ds_info.ty)); @@ -206,7 +213,7 @@ namespace h5::array_interface { if (err < 0) throw std::runtime_error("Error in h5::array_interface::write_attribute: Writing to the attribute " + name + " failed"); } - void read(group g, std::string const &name, array_view v, dataset_info ds_info, hyperslab sl) { + void read(group g, std::string const &name, array_view v, hyperslab sl) { // open dataset and get dataspace dataset ds = g.open_dataset(name); dataspace file_dspace = H5Dget_space(ds); @@ -219,6 +226,7 @@ namespace h5::array_interface { } // check consistency of input + auto ds_info = get_dataset_info(g, name); if (H5Tget_class(v.ty) != H5Tget_class(ds_info.ty)) throw std::runtime_error("Error in h5::array_interface::read: Incompatible HDF5 types: " + get_name_of_h5_type(v.ty) + " != " + get_name_of_h5_type(ds_info.ty)); @@ -227,12 +235,9 @@ namespace h5::array_interface { std::cerr << "WARNING: HDF5 type mismatch while reading into an array_view: " + get_name_of_h5_type(v.ty) + " != " + get_name_of_h5_type(ds_info.ty) + "\n"; - if (ds_info.rank() != v.rank()) - throw std::runtime_error("Error in h5::array_interface::read: Incompatible ranks: " + std::to_string(v.rank()) - + " != " + std::to_string(ds_info.rank())); - - // block shape of {1, ..., 1} is assumed - if (sl.empty() and ds_info.lengths != v.slab.count) throw std::runtime_error("Error in h5::array_interface::read: Incompatible shapes"); + auto sl_size = sl.size(); + if (sl.empty()) { sl_size = std::accumulate(ds_info.lengths.begin(), ds_info.lengths.end(), (hsize_t)1, std::multiplies<>()); } + if (sl_size != v.slab.size()) throw std::runtime_error("Error in h5::array_interface::read: Incompatible sizes"); // memory dataspace dataspace mem_dspace = make_mem_dspace(v); diff --git a/c++/h5/array_interface.hpp b/c++/h5/array_interface.hpp index e5164c7..54fc439 100644 --- a/c++/h5/array_interface.hpp +++ b/c++/h5/array_interface.hpp @@ -25,6 +25,8 @@ #include "./group.hpp" #include "./object.hpp" +#include +#include #include #include @@ -124,15 +126,29 @@ namespace h5::array_interface { /// Check whether the hyperslab is empty (has been initialized). [[nodiscard]] bool empty() const { return count.empty(); } + + /// Get the shape of the selected hyperslab. + [[nodiscard]] v_t shape() const { + v_t shape(rank()); + std::transform(count.begin(), count.end(), block.begin(), shape.begin(), std::multiplies<>()); + return shape; + } + + /// Get the total number of elements in the hyperslab. + [[nodiscard]] auto size() const { + auto sh = shape(); + return std::accumulate(sh.begin(), sh.end(), (hsize_t)1, std::multiplies<>()); + } }; /** * @brief Struct representing a view on an n-dimensional array/dataspace. * * @details A view consists of the parent array and of an h5::array_interface::hyperslab specifying a selection. - * The parent array is defined by a pointer to its data and its shape. Note that the shape of the parent array - * does not necessarily have to correspond to the actual shape and size of the underlying memory. It is only used - * to select the correct elements in the hyperslab. + * The parent array is defined by a pointer to its data and its shape. + * + * Note that the shape of the parent array does not necessarily have to correspond to the actual shape and size of + * the underlying memory. It is only used to select the correct elements in the hyperslab. * * If the data of the array is complex, its imaginary part is treated as just another dimension. */ @@ -202,6 +218,14 @@ namespace h5::array_interface { /** * @brief Retrieve the shape and the h5::datatype from a dataset. * + * @param ds h5::dataset. + * @return h5::array_interface::dataset_info containing the shape and HDF5 type of the dataset. + */ + dataset_info get_dataset_info(dataset ds); + + /** + * @brief Retrieve the shape and the h5::datatype from a dataset with a given name in the given group. + * * @param g h5::group containing the dataset. * @param name Name of the dataset. * @return h5::array_interface::dataset_info containing the shape and HDF5 type of the dataset. @@ -213,8 +237,6 @@ namespace h5::array_interface { * * @details If a link with the given name already exists, it is first unlinked. * - * @warning This function only works consistently if the blocks of the hyperslab contain only a single element! - * * @param g h5::group in which the dataset is created. * @param name Name of the dataset * @param v h5::array_interface::array_view to be written. @@ -228,15 +250,12 @@ namespace h5::array_interface { * @details It checks if the number of elements in the view is the same as selected in the hyperslab and if the * datatypes are compatible. Otherwise, an exception is thrown. * - * @warning This function only works consistently if the blocks of both hyperslabs contain only a single element! - * * @param g h5::group which contains the dataset. * @param name Name of the dataset. * @param v h5::array_interface::array_view to be written. - * @param ds_info h5::array_interface::dataset_info of the file dataset (only used to check the consistency of the input). * @param sl h5::array_interface::hyperslab specifying the selection to be written to. */ - void write_slice(group g, std::string const &name, array_view const &v, dataset_info ds_info, hyperslab sl); + void write_slice(group g, std::string const &name, array_view const &v, hyperslab sl); /** * @brief Write an array view to an HDF5 attribute. @@ -253,10 +272,9 @@ namespace h5::array_interface { * @param g h5::group which contains the dataset. * @param name Name of the dataset. * @param v h5::array_interface::array_view to read into. - * @param ds_info h5::array_interface::dataset_info of the file dataset (only used to check the consistency of the input). * @param sl h5::array_interface::hyperslab specifying the selection to read from. */ - void read(group g, std::string const &name, array_view v, dataset_info ds_info, hyperslab sl = {}); + void read(group g, std::string const &name, array_view v, hyperslab sl = {}); /** * @brief Read from an HDF5 attribute into an array view. diff --git a/c++/h5/scalar.hpp b/c++/h5/scalar.hpp index 8daac1c..b85149a 100644 --- a/c++/h5/scalar.hpp +++ b/c++/h5/scalar.hpp @@ -108,7 +108,7 @@ namespace h5 { } // read scalar value - array_interface::read(g, name, array_interface::array_view_from_scalar(x), ds_info); + array_interface::read(g, name, array_interface::array_view_from_scalar(x)); } /** diff --git a/c++/h5/stl/array.hpp b/c++/h5/stl/array.hpp index 2bf4eae..8f960cb 100644 --- a/c++/h5/stl/array.hpp +++ b/c++/h5/stl/array.hpp @@ -118,7 +118,7 @@ namespace h5 { v.slab.count[0] = N; v.slab.stride[0] = 1; v.parent_shape[0] = N; - array_interface::read(g, name, v, ds_info); + array_interface::read(g, name, v); } else { // array of generic type auto g2 = g.open_group(name); diff --git a/c++/h5/stl/vector.hpp b/c++/h5/stl/vector.hpp index e37cfc8..524452b 100644 --- a/c++/h5/stl/vector.hpp +++ b/c++/h5/stl/vector.hpp @@ -162,7 +162,7 @@ namespace h5 { if (ds_info.rank() != 1 + is_complex_v) throw make_runtime_error("Error in h5_read: Reading a vector from an array of rank ", ds_info.rank(), " is not allowed"); v.resize(ds_info.lengths[0]); - array_interface::read(g, name, array_interface::array_view_from_vector(v), ds_info); + array_interface::read(g, name, array_interface::array_view_from_vector(v)); } else if constexpr (std::is_same_v or std::is_same_v>) { // vector of strings or vector of vector of strings char_buf cb; diff --git a/doc/documentation.md b/doc/documentation.md index 52551e9..ad266aa 100644 --- a/doc/documentation.md +++ b/doc/documentation.md @@ -58,7 +58,7 @@ the interface. * @ref rw_variant * @ref rw_vector -The @ref rw_array "array interface" helps with loading and storing n-dimensional arrays. +The @ref rw_arrayinterface "array interface" helps with loading and storing n-dimensional arrays. Furthermore, the generic design of the read/write functionality makes it easily extendible to support custom user types as well. @ref ex2 shows how to make a user defined type HDF5 serializable. diff --git a/python/h5/h5py_io.cpp b/python/h5/h5py_io.cpp index 7b118cc..00bf83f 100644 --- a/python/h5/h5py_io.cpp +++ b/python/h5/h5py_io.cpp @@ -296,7 +296,7 @@ namespace h5 { // in case of allocation error // read from the file - read(g, name, make_av_from_npy((PyArrayObject *)ob), ds_info); + read(g, name, make_av_from_npy((PyArrayObject *)ob)); return ob; } diff --git a/test/c++/h5_array_interface.cpp b/test/c++/h5_array_interface.cpp index 0e6cc5e..4724977 100644 --- a/test/c++/h5_array_interface.cpp +++ b/test/c++/h5_array_interface.cpp @@ -17,6 +17,8 @@ #include #include
+#include + #include #include @@ -123,107 +125,126 @@ TEST(H5, GetParentShapeAndH5Strides3D) { TEST(H5, ArrayInterface1DArray) { // test the array interface for a 1D array h5::file file("1d_array.h5", 'w'); + h5::group group(file); std::vector data(100, 0); std::iota(data.begin(), data.end(), 0); int rank = 1; int size = static_cast(data.size()); // write the full array - h5::array_interface::array_view full_view(h5::hdf5_type(), (void *)data.data(), rank, false); - full_view.slab.count[0] = size; - full_view.parent_shape[0] = size; - h5::array_interface::write(file, "full_view", full_view, true); + h5::array_interface::array_view view_1(h5::hdf5_type(), (void *)data.data(), rank, false); + view_1.slab.count[0] = size; + view_1.parent_shape[0] = size; + h5::array_interface::write(file, "view_1", view_1, true); // read the full array - std::vector full_data_in(data.size(), 0); - h5::array_interface::array_view full_view_in(h5::hdf5_type(), (void *)full_data_in.data(), rank, false); - full_view_in.slab.count[0] = size; - full_view_in.parent_shape[0] = size; - auto full_view_ds_info = h5::array_interface::get_dataset_info(file, "full_view"); - h5::array_interface::read(file, "full_view", full_view_in, full_view_ds_info, full_view.slab); + std::vector data_in_1(data.size(), 0); + h5::array_interface::array_view view_in_1(h5::hdf5_type(), (void *)data_in_1.data(), rank, false); + view_in_1.slab.count[0] = size; + view_in_1.parent_shape[0] = size; + h5::array_interface::read(file, "view_1", view_in_1); + + // check results + for (int i = 0; i < size; ++i) { EXPECT_EQ(data_in_1[i], data[i]); } + + // write blocks of size 10, skipping every other block, starting with the second block + h5::array_interface::array_view view_2(h5::hdf5_type(), (void *)data.data(), rank, false); + view_2.slab.offset[0] = 10; + view_2.slab.stride[0] = 20; + view_2.slab.count[0] = 5; + view_2.slab.block[0] = 10; + view_2.parent_shape[0] = size; + h5::array_interface::write(file, "view_2", view_2, true); + + // read the data written to view_2 into a 1D array + std::vector data_in_2(50, 0); + h5::array_interface::array_view view_in_2(h5::hdf5_type(), (void *)data_in_2.data(), rank, false); + view_in_2.slab.count[0] = 50; + view_in_2.parent_shape[0] = 50; + h5::array_interface::read(file, "view_2", view_in_2); + + // read blocks of size 10, skipping every other block, starting with the second block directly from view_1 + std::vector data_in_3(50, 0); + h5::array_interface::array_view view_in_3(h5::hdf5_type(), (void *)data_in_3.data(), rank, false); + view_in_3.slab.count[0] = 50; + view_in_3.parent_shape[0] = 50; + h5::array_interface::read(file, "view_1", view_in_3, view_2.slab); // check results - for (int i = 0; i < size; ++i) { EXPECT_EQ(full_data_in[i], data[i]); } - - // // write blocks of size 10, skipping every other block, starting with the second block - // h5::array_interface::array_view block_view(h5::hdf5_type(), (void *)data.data(), rank, false); - // block_view.slab.offset[0] = 10; - // block_view.slab.stride[0] = 20; - // block_view.slab.count[0] = 5; - // block_view.slab.block[0] = 10; - // block_view.parent_shape[0] = size; - // h5::array_interface::write(file, "block_view", block_view, true); - - // // read blocks of size 10, skipping every other block, starting with the second block - // std::vector block_data_in(50, 0); - // h5::array_interface::array_view block_view_in(h5::hdf5_type(), (void *)block_data_in.data(), rank, false); - // block_view_in.slab.count[0] = 50; - // block_view_in.parent_shape[0] = 50; - // h5::array_interface::read(file, "full_view", block_view_in, full_view_lt, block_view_in.slab); - - // read the block view - // std::vector block_data_in(50, 0); - // h5::array_interface::array_view block_view_in(h5::hdf5_type(), (void *)block_data_in.data(), rank, false); - // block_view_in.slab.count[0] = 50; - // block_view_in.parent_shape[0] = 50; - // auto block_view_lt = h5::array_interface::get_dataset_info(file, "block_view"); - // h5::array_interface::read(file, "block_view", block_view_in, block_view_lt, block_view_in.slab); + for (int i = 0; i < 50; ++i) { EXPECT_EQ(data_in_2[i], data_in_3[i]); } + + // write the blocks selected in view_2 to a 5x10 2D array + h5::v_t shape_3 = {5, 10}; + h5::dataspace dspace_3 = H5Screate_simple(2, shape_3.data(), nullptr); + std::ignore = group.create_dataset("view_3", h5::hdf5_type(), dspace_3); + h5::array_interface::hyperslab slab_4(2, false); + slab_4.count = {5, 1}; + slab_4.block = {1, 10}; + h5::array_interface::write_slice(file, "view_3", view_2, slab_4); + + // read the 2D view_3 into a 1D array + std::vector data_in_4(50, 0); + h5::array_interface::array_view view_in_4(h5::hdf5_type(), (void *)data_in_4.data(), rank, false); + view_in_4.slab.count[0] = 50; + view_in_4.parent_shape[0] = 50; + h5::array_interface::read(file, "view_3", view_in_4, slab_4); + + for (int i = 0; i < 50; ++i) { EXPECT_EQ(data_in_4[i], data_in_3[i]); } } -// TEST(H5, ArrayInterface2DArray) { -// // write a 5x5 array and read a 5x3 array (every other column) -// std::vector data(25, 0); -// std::iota(data.begin(), data.end(), 0); - -// // create an array_view of rank 2 with dimensions 5x5 of the original data -// int rank = 2; -// int rows_w = 5; -// int cols_w = 5; -// h5::array_interface::array_view view(h5::hdf5_type(), (void *)data.data(), rank, false); -// // view.slab.count[0] = rows_w; -// // view.slab.count[1] = cols_w; -// // view.parent_shape[0] = rows_w; -// // view.parent_shape[1] = cols_w; -// view.slab.count[0] = 1; -// view.slab.count[1] = 1; -// view.slab.block[0] = 5; -// view.slab.block[1] = 5; -// view.parent_shape[0] = rows_w; -// view.parent_shape[1] = cols_w; - -// // create file in read/write mode -// h5::file file("2d_array.h5", 'w'); - -// // write array_view to file -// h5::array_interface::write(file, "view", view, false); - -// // reserve memory for reading -// std::vector data_in(15, 0); - -// // create an array_view or rank 2 with dimensions 5x3 of the read memory -// int rows_in = 5; -// int cols_in = 3; -// h5::array_interface::array_view view_in(h5::hdf5_type(), (void *)data_in.data(), rank, false); -// view_in.slab.count[0] = rows_in; -// view_in.slab.count[1] = cols_in; -// view_in.parent_shape[0] = rows_in; -// view_in.parent_shape[1] = cols_in; - -// // create an hyperslab to select the data to be read from the file (every other column -> stride in second dimension is 2) -// h5::array_interface::hyperslab slab_in(rank, false); -// slab_in.count[0] = rows_in; -// slab_in.count[1] = cols_in; -// slab_in.stride[0] = 1; -// slab_in.stride[1] = 2; - -// // get dataset_info from the dataset in the file -// auto lengths_type = h5::array_interface::get_dataset_info(file, "view"); - -// // read data from file -// h5::array_interface::read(file, "view", view_in, lengths_type, slab_in); - -// // check results -// for (int i = 0; i < rows_in; ++i) { -// for (int j = 0; j < cols_in; ++j) { EXPECT_EQ(data_in[i * cols_in + j], data[i * cols_w + j * 2]); } -// } -// } +TEST(H5, ArrayInterface3DArray) { + // test the array interface for a 3D array + h5::file file("3d_array.h5", 'w'); + h5::group group(file); + std::vector data(27, 0); + std::iota(data.begin(), data.end(), 0); + int rank = 3; + int size = static_cast(data.size()); + + // write the full data as a 3x3x3 array + h5::array_interface::array_view view_1(h5::hdf5_type(), (void *)data.data(), rank, false); + view_1.slab.count = {3, 3, 3}; + view_1.parent_shape = {3, 3, 3}; + h5::array_interface::write(file, "view_1", view_1, true); + + // read the full array as a 1D array + std::vector data_in_1(data.size(), 0); + h5::array_interface::array_view view_in_1(h5::hdf5_type(), (void *)data_in_1.data(), 1, false); + view_in_1.slab.count[0] = size; + view_in_1.parent_shape[0] = size; + h5::array_interface::read(file, "view_1", view_in_1); + + // check results + for (int i = 0; i < size; ++i) { EXPECT_EQ(data_in_1[i], data[i]); } + + // write the 3x3 arrays starting at (0, 0, 0) and (2, 0, 0) into a 6x3 array + h5::array_interface::array_view view_2(h5::hdf5_type(), (void *)data.data(), rank, false); + view_2.slab.offset = {0, 0, 0}; + view_2.slab.stride = {2, 1, 1}; + view_2.slab.count = {2, 1, 1}; + view_2.slab.block = {1, 3, 3}; + view_2.parent_shape = {3, 3, 3}; + h5::v_t shape_2 = {6, 3}; + h5::dataspace dspace_2 = H5Screate_simple(2, shape_2.data(), nullptr); + std::ignore = group.create_dataset("view_2", h5::hdf5_type(), dspace_2); + h5::array_interface::hyperslab slab_2(2, false); + slab_2.count = shape_2; + h5::array_interface::write_slice(file, "view_2", view_2, slab_2); + + // read in the data written to view_2 into a 2D array + std::vector data_in_2(18, 0); + h5::array_interface::array_view view_in_2(h5::hdf5_type(), (void *)data_in_2.data(), 2, false); + view_in_2.slab = slab_2; + view_in_2.parent_shape = shape_2; + h5::array_interface::read(file, "view_2", view_in_2); + + // read in the 3x3 arrays starting at (0, 0, 0) and (2, 0, 0) directly from view_1 + std::vector data_in_3(18, 0); + h5::array_interface::array_view view_in_3(h5::hdf5_type(), (void *)data_in_3.data(), 2, false); + view_in_3.slab = slab_2; + view_in_3.parent_shape = shape_2; + h5::array_interface::read(file, "view_1", view_in_3, view_2.slab); + + // check results + for (int i = 0; i < 18; ++i) { EXPECT_EQ(data_in_2[i], data_in_3[i]); } +}