diff --git a/examples/7_extended_write_serial.cpp b/examples/7_extended_write_serial.cpp
index 63d9cb38ff..579cc8b8e9 100644
--- a/examples/7_extended_write_serial.cpp
+++ b/examples/7_extended_write_serial.cpp
@@ -118,6 +118,15 @@ int main()
         }
       ]
     }
+  },
+  "hdf5": {
+    "dataset": {
+      "chunks": "auto",
+      "permanent_filters": {
+        "id": "fletcher32",
+        "flags": "optional"
+      }
+    }
   }
 })END";
         d.options = datasetConfig;
@@ -144,7 +153,27 @@ int main()
         d = io::Dataset(dtype, mpiDims);
         electrons["positionOffset"]["x"].resetDataset(d);

-        auto dset = io::Dataset(io::determineDatatype<uint64_t>(), {2});
+        auto dset = io::Dataset(
+            io::determineDatatype<uint64_t>(),
+            {2},
+            R"(
+            {
+              "hdf5": {
+                "dataset": {
+                  "chunks": "auto",
+                  "permanent_filters": [
+                    {
+                      "aggression": 5,
+                      "type": "zlib"
+                    },
+                    {
+                      "flags": "MANDATORY",
+                      "id": "shuffle"
+                    }
+                  ]
+                }
+              }
+            })");
         electrons.particlePatches["numParticles"].resetDataset(dset);
         electrons.particlePatches["numParticlesOffset"].resetDataset(dset);

diff --git a/src/IO/HDF5/HDF5IOHandler.cpp b/src/IO/HDF5/HDF5IOHandler.cpp
index 124a65559e..40fd9c113b 100644
--- a/src/IO/HDF5/HDF5IOHandler.cpp
+++ b/src/IO/HDF5/HDF5IOHandler.cpp
@@ -74,6 +74,26 @@ namespace openPMD
     } while (0)
 #endif

+constexpr char const *const init_json_shadow_str = &R"(
+{
+    "dataset": {
+        "chunks": null
+    },
+    "independent_stores": null
+})"[1];
+constexpr char const *dataset_cfg_mask = &R"(
+{
+    "dataset": {
+        "chunks": null,
+        "permanent_filters": null
+    }
+}
+)"[1];
+constexpr char const *const flush_cfg_mask = &R"(
+{
+    "independent_stores": null
+})"[1];
+
 HDF5IOHandlerImpl::HDF5IOHandlerImpl(
     AbstractIOHandler *handler,
     json::TracingJSON config,
@@ -149,23 +169,6 @@ HDF5IOHandlerImpl::HDF5IOHandlerImpl(
     m_config = config["hdf5"];

     {
-        constexpr char const *const init_json_shadow_str = R"(
-        {
-            "dataset": {
-                "chunks": null
-            },
-            "independent_stores": null
-        })";
-        constexpr char const *const dataset_cfg_mask = R"(
-        {
-            "dataset": {
-                "chunks": null
-            }
-        })";
-        constexpr char const *const flush_cfg_mask = R"(
-        {
-            "independent_stores": null
-        })";
         m_global_dataset_config = m_config.json();
         json::filterByTemplate(
             m_global_dataset_config,
@@ -460,72 +463,245 @@ void HDF5IOHandlerImpl::createPath(
         "creation");
 }

-void HDF5IOHandlerImpl::createDataset(
-    Writable *writable, Parameter<Operation::CREATE_DATASET> const &parameters)
+namespace
 {
-    if (access::readOnly(m_handler->m_backendAccess))
-        throw std::runtime_error(
-            "[HDF5] Creating a dataset in a file opened as read only is not "
-            "possible.");
-
-    if (parameters.joinedDimension.has_value())
+    using chunking_t = std::vector<hsize_t>;
+    struct DatasetParams
     {
-        error::throwOperationUnsupportedInBackend(
-            "HDF5", "Joined Arrays currently only supported in ADIOS2");
+        struct ByID
+        {
+            H5Z_filter_t id = 0;
+            unsigned int flags = 0;
+            std::vector<unsigned int> c_values;
+        };
+        struct Zlib
+        {
+            unsigned aggression = 1;
+        };
+        using filter_t = std::variant<
+            // generic
+            ByID,
+            // H5Pset_deflate
+            Zlib>;
+
+        std::optional<chunking_t> chunking;
+        bool resizable = false;
+        std::vector<filter_t> filters;
+    };
+
+    template <typename JSON, typename Accessor>
+    auto parse_filter_by_id(JSON &filter_config, Accessor &&json_accessor)
+        -> DatasetParams::ByID
+    {
+        DatasetParams::ByID byID;
+        if (!json_accessor(filter_config).contains("id"))
+        {
+            throw error::BackendConfigSchema(
+                {"hdf5", "dataset", "permanent_filters", "id"},
+                "Required key for selecting a filter by ID.");
+        }
+        byID.id = [&]() -> H5Z_filter_t {
+            auto const &id_config = json_accessor(filter_config["id"]);
+            using pair_t = std::pair<char const *, H5Z_filter_t>;
+            std::array filter_types{
+                pair_t{"deflate", H5Z_FILTER_DEFLATE},
+                pair_t{"shuffle", H5Z_FILTER_SHUFFLE},
+                pair_t{"fletcher32", H5Z_FILTER_FLETCHER32},
+                pair_t{"szip", H5Z_FILTER_SZIP},
+                pair_t{"nbit", H5Z_FILTER_NBIT},
+                pair_t{"scaleoffset", H5Z_FILTER_SCALEOFFSET}};
+            auto id_error = [&]() {
+                std::stringstream error;
+                error << "Must be either of unsigned integer type or one of:";
+                for (auto const &pair : filter_types)
+                {
+                    error << " '" << pair.first << "'";
+                }
+                error << ".";
+                return error::BackendConfigSchema(
+                    {"hdf5", "dataset", "permanent_filters", "id"},
+                    error.str());
+            };
+            if (id_config.is_number_integer())
+            {
+                return id_config.template get<H5Z_filter_t>();
+            }
+            auto maybe_string = json::asLowerCaseStringDynamic(id_config);
+            if (!maybe_string.has_value())
+            {
+                throw id_error();
+            }
+            for (auto const &[key, res_type] : filter_types)
+            {
+                if (*maybe_string == key)
+                {
+                    return res_type;
+                }
+            }
+            throw id_error();
+        }();
+        byID.flags = [&]() -> unsigned int {
+            if (!json_accessor(filter_config).contains("flags"))
+            {
+                return 0;
+            }
+            auto const &flag_config = json_accessor(filter_config["flags"]);
+            using pair_t = std::pair<char const *, unsigned int>;
+            std::array filter_types{
+                pair_t{"optional", H5Z_FLAG_OPTIONAL},
+                pair_t{"mandatory", H5Z_FLAG_MANDATORY}};
+            auto flag_error = [&]() {
+                std::stringstream error;
+                error << "Must be either of unsigned integer type or one of:";
+                for (auto const &pair : filter_types)
+                {
+                    error << " '" << pair.first << "'";
+                }
+                error << ".";
+                return error::BackendConfigSchema(
+                    {"hdf5", "dataset", "permanent_filters", "flags"},
+                    error.str());
+            };
+            if (flag_config.is_number_integer())
+            {
+                return flag_config.template get<unsigned int>();
+            }
+            auto maybe_string = json::asLowerCaseStringDynamic(flag_config);
+            if (!maybe_string.has_value())
+            {
+                throw flag_error();
+            }
+            for (auto const &[key, res_type] : filter_types)
+            {
+                if (*maybe_string == key)
+                {
+                    return res_type;
+                }
+            }
+            throw flag_error();
+        }();
+        if (json_accessor(filter_config).contains("c_values"))
+        {
+            auto const &c_values_config =
+                json_accessor(filter_config["c_values"]);
+            try
+            {
+
+                byID.c_values =
+                    c_values_config.template get<std::vector<unsigned int>>();
+            }
+            catch (nlohmann::json::type_error const &)
+            {
+                throw error::BackendConfigSchema(
+                    {"hdf5", "dataset", "permanent_filters", "c_values"},
+                    "Must be an array of unsigned integers.");
+            }
+        }
+        return byID;
     }

-    if (!writable->written)
+    template <typename JSON, typename Accessor>
+    auto parse_filter_zlib(JSON &filter_config, Accessor &&json_accessor)
+        -> DatasetParams::Zlib
     {
-        /* Sanitize name */
-        std::string name = parameters.name;
-        if (auxiliary::starts_with(name, '/'))
-            name = auxiliary::replace_first(name, "/", "");
-        if (auxiliary::ends_with(name, '/'))
-            name = auxiliary::replace_last(name, "/", "");
-
-        std::vector<hsize_t> dims;
-        std::uint64_t num_elements = 1u;
-        for (auto const &val : parameters.extent)
+        DatasetParams::Zlib zlib;
+        if (json_accessor(filter_config).contains("aggression"))
         {
-            dims.push_back(static_cast<hsize_t>(val));
-            num_elements *= val;
+            auto const &aggression_config =
+                json_accessor(filter_config["aggression"]);
+            if (!aggression_config.is_number_integer())
+            {
+                throw error::BackendConfigSchema(
+                    {"hdf5", "dataset", "permanent_filters", "aggression"},
+                    "Must be of unsigned integer type.");
+            }
+            zlib.aggression = aggression_config.template get<unsigned>();
         }
+        return zlib;
+    }

-        Datatype d = parameters.dtype;
-        if (d == Datatype::UNDEFINED)
+    template <typename JSON, typename Accessor>
+    auto parse_filter(JSON &filter_config, Accessor &&json_accessor)
+        -> DatasetParams::filter_t
+    {
+        auto filter_error = []() {
+            return error::BackendConfigSchema(
"dataset", "permanent_filters"}, + "Must be either a JSON object or a vector of JSON objects."); + }; + if (!json_accessor(filter_config).is_object()) { - // TODO handle unknown dtype - std::cerr << "[HDF5] Datatype::UNDEFINED caught during dataset " - "creation (serial HDF5)" - << std::endl; - d = Datatype::BOOL; + throw filter_error(); } - json::TracingJSON config = [&]() { - auto parsed_config = json::parseOptions( - parameters.options, /* considerFiles = */ false); - if (auto hdf5_config_it = parsed_config.config.find("hdf5"); - hdf5_config_it != parsed_config.config.end()) + enum class filter_type + { + ByID, + Zlib + }; + + filter_type type = [&]() -> filter_type { + if (json_accessor(filter_config).contains("type")) { - auto copy = m_global_dataset_config; - json::merge(copy, hdf5_config_it.value()); - hdf5_config_it.value() = std::move(copy); + auto res = json::asLowerCaseStringDynamic( + json_accessor(filter_config["type"])); + if (!res.has_value()) + { + throw error::BackendConfigSchema( + {"hdf5", "dataset", "permanent_filters", "type"}, + "Must be of type string."); + } + using pair_t = std::pair; + std::array filter_types{ + pair_t{"by_id", filter_type::ByID}, + pair_t{"zlib", filter_type::Zlib}}; + for (auto const &[key, res_type] : filter_types) + { + if (*res == key) + { + return res_type; + } + } + std::stringstream error; + error << "Must be one of:"; + for (auto const &pair : filter_types) + { + error << " '" << pair.first << "'"; + } + error << "."; + throw error::BackendConfigSchema( + {"hdf5", "dataset", "permanent_filters", "type"}, + error.str()); } else { - parsed_config.config["hdf5"] = m_global_dataset_config; + return filter_type::ByID; } - return parsed_config; }(); + switch (type) + { + case filter_type::ByID: + return parse_filter_by_id(filter_config, json_accessor); + case filter_type::Zlib: + return parse_filter_zlib(filter_config, json_accessor); + } + throw std::runtime_error("Unreachable!"); + } + + auto parse_dataset_config( + json::TracingJSON &config, + std::vector const &dims, + Datatype const d) -> DatasetParams + { + DatasetParams res; + // general - bool is_resizable_dataset = false; if (config.json().contains("resizable")) { - is_resizable_dataset = config["resizable"].json().get(); + res.resizable = config["resizable"].json().get(); } - using chunking_t = std::vector; using compute_chunking_t = std::variant; @@ -583,8 +759,33 @@ void HDF5IOHandlerImpl::createDataset( throw_chunking_error(); } } + + if (datasetConfig.json().contains("permanent_filters")) + { + auto permanent_filters = datasetConfig["permanent_filters"]; + if (permanent_filters.json().is_array()) + { + permanent_filters.declareFullyRead(); + res.filters.reserve(permanent_filters.json().size()); + for (auto const &entry : permanent_filters.json()) + { + res.filters.push_back(parse_filter( + entry, [](auto const &j) -> nlohmann::json const & { + return j; + })); + } + } + else + { + res.filters = {parse_filter( + permanent_filters, + [](auto &&j) -> nlohmann::json const & { + return j.json(); + })}; + } + } } - std::optional chunking = std::visit( + res.chunking = std::visit( auxiliary::overloaded{ [&](chunking_t &&explicitly_specified) -> std::optional { @@ -609,6 +810,71 @@ void HDF5IOHandlerImpl::createDataset( }}, std::move(compute_chunking)); + return res; + } +} // namespace + +void HDF5IOHandlerImpl::createDataset( + Writable *writable, Parameter const ¶meters) +{ + if (access::readOnly(m_handler->m_backendAccess)) + throw std::runtime_error( + "[HDF5] Creating a dataset in a 
+            "[HDF5] Creating a dataset in a file opened as read only is not "
+            "possible.");
+
+    if (parameters.joinedDimension.has_value())
+    {
+        error::throwOperationUnsupportedInBackend(
+            "HDF5", "Joined Arrays currently only supported in ADIOS2");
+    }
+
+    if (!writable->written)
+    {
+        /* Sanitize name */
+        std::string name = parameters.name;
+        if (auxiliary::starts_with(name, '/'))
+            name = auxiliary::replace_first(name, "/", "");
+        if (auxiliary::ends_with(name, '/'))
+            name = auxiliary::replace_last(name, "/", "");
+
+        std::vector<hsize_t> dims;
+        std::uint64_t num_elements = 1u;
+        for (auto const &val : parameters.extent)
+        {
+            dims.push_back(static_cast<hsize_t>(val));
+            num_elements *= val;
+        }
+
+        Datatype d = parameters.dtype;
+        if (d == Datatype::UNDEFINED)
+        {
+            // TODO handle unknown dtype
+            std::cerr << "[HDF5] Datatype::UNDEFINED caught during dataset "
+                         "creation (serial HDF5)"
+                      << std::endl;
+            d = Datatype::BOOL;
+        }
+
+        json::TracingJSON config = [&]() {
+            auto parsed_config = json::parseOptions(
+                parameters.options, /* considerFiles = */ false);
+            if (auto hdf5_config_it = parsed_config.config.find("hdf5");
+                hdf5_config_it != parsed_config.config.end())
+            {
+                auto copy = m_global_dataset_config;
+                json::merge(copy, hdf5_config_it.value());
+                hdf5_config_it.value() = std::move(copy);
+            }
+            else
+            {
+                parsed_config.config["hdf5"] = m_global_dataset_config;
+            }
+            return parsed_config;
+        }();
+
+        auto [chunking, is_resizable_dataset, filters] =
+            parse_dataset_config(config, dims, d);
+
         parameters.warnUnusedParameters(
             config,
             "hdf5",
@@ -693,25 +959,27 @@ void HDF5IOHandlerImpl::createDataset(
         {
             if (chunking->size() != parameters.extent.size())
             {
-                std::string chunking_printed = [&]() {
-                    if (chunking->empty())
-                    {
-                        return std::string("[]");
-                    }
-                    else
-                    {
-                        std::stringstream s;
-                        auto it = chunking->begin();
-                        auto end = chunking->end();
-                        s << '[' << *it++;
-                        for (; it != end; ++it)
+                // captured structured bindings are a C++20 extension
+                std::string chunking_printed =
+                    [&, &captured_chunking = chunking]() {
+                        if (captured_chunking->empty())
                         {
-                            s << ", " << *it;
+                            return std::string("[]");
                         }
-                        s << ']';
-                        return s.str();
-                    }
-                }();
+                        else
+                        {
+                            std::stringstream s;
+                            auto it = captured_chunking->begin();
+                            auto end = captured_chunking->end();
+                            s << '[' << *it++;
+                            for (; it != end; ++it)
+                            {
+                                s << ", " << *it;
+                            }
+                            s << ']';
+                            return s.str();
+                        }
+                    }();
                 std::cerr << "[HDF5] Chunking for dataset '" << name
                           << "' was specified as " << chunking_printed
                           << ", but dataset has dimensionality "
@@ -731,11 +999,29 @@ void HDF5IOHandlerImpl::createDataset(
             }
         }

-        std::string const &compression = ""; // @todo read from JSON
-        if (!compression.empty())
-            std::cerr
-                << "[HDF5] Compression not yet implemented in HDF5 backend."
-                << std::endl;
+        for (auto const &filter : filters)
+        {
+            herr_t status = std::visit(
+                auxiliary::overloaded{
+                    [&](DatasetParams::ByID const &by_id) {
+                        return H5Pset_filter(
+                            datasetCreationProperty,
+                            by_id.id,
+                            by_id.flags,
+                            by_id.c_values.size(),
+                            by_id.c_values.data());
+                    },
+                    [&](DatasetParams::Zlib const &zlib) {
+                        return H5Pset_deflate(
+                            datasetCreationProperty, zlib.aggression);
+                    }},
+                filter);
+            VERIFY(
+                status == 0,
+                "[HDF5] Internal error: Failed to set filter during dataset "
+                "creation");
+        }
+
         /* {
             std::vector< std::string > args = auxiliary::split(compression,
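
Usage sketch (reviewer note, not part of the patch): the new `hdf5.dataset.permanent_filters` key accepts either a single filter object or an array of them. Each entry is dispatched either generically "by_id" (forwarded to `H5Pset_filter`, with optional `flags` and `c_values`) or as `"type": "zlib"` (forwarded to `H5Pset_deflate`); since `H5Pset_filter` appends to the dataset's filter pipeline, filters are applied in the order given. The minimal program below mirrors the example change above; the file name `filters.h5`, mesh name `rho`, and extents are made up for illustration.

    // Hedged sketch, assuming the JSON schema exactly as parsed by
    // parse_dataset_config in this patch.
    #include <openPMD/openPMD.hpp>
    #include <vector>
    namespace io = openPMD;

    int main()
    {
        io::Series series("filters.h5", io::Access::CREATE);
        auto rho = series.iterations[1].meshes["rho"]["x"];
        // One zlib filter plus a generic filter selected by name;
        // "mandatory" maps to H5Z_FLAG_MANDATORY, "optional" to H5Z_FLAG_OPTIONAL.
        io::Dataset d(
            io::Datatype::DOUBLE,
            {100, 100},
            R"({
                "hdf5": {
                    "dataset": {
                        "permanent_filters": [
                            {"type": "zlib", "aggression": 5},
                            {"id": "shuffle", "flags": "mandatory"}
                        ]
                    }
                }
            })");
        rho.resetDataset(d);
        std::vector<double> data(100 * 100, 0.);
        rho.storeChunk(data, {0, 0}, {100, 100});
        series.flush();
    }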