diff --git a/CMakeLists.txt b/CMakeLists.txt index 0f6ebb1..bcc03ac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,7 +14,7 @@ # cmake_minimum_required(VERSION 3.12 FATAL_ERROR) -project(Dml VERSION 0.1.5 LANGUAGES C CXX) +project(Dml VERSION 0.1.6 LANGUAGES C CXX) set(PROJECT_SOVERSION 0) @@ -28,7 +28,6 @@ else() endif() # TODO: Remove all options below -option(LIB_ACCEL_3_2 "Use libaccel-3.2" OFF) option(LOG_HW_INIT "Enables HW initialization log" OFF) option(EFFICIENT_WAIT "Enables usage of umonitor/umwait" OFF) @@ -61,14 +60,13 @@ message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") # TODO: Remove when option is removed if (DML_HW) message(STATUS "HW path: ON") + message(STATUS "Hardware initialization logging: ${LOG_HW_INIT}") endif() # TODO: Remove get_git_revision() add_subdirectory(sources) - -# Testing add_subdirectory(examples) # Install rules diff --git a/CODEOWNERS b/CODEOWNERS index a69cec5..74ce726 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -12,7 +12,3 @@ # express or implied warranties, other than those that are expressly # stated in the License. # - -# Repository Control Files -.gitignore anton.rubtsov@intel.com -CODEOWNERS anton.rubtsov@intel.com diff --git a/Doxyfile b/Doxyfile index 3961778..4e9aeb2 100644 --- a/Doxyfile +++ b/Doxyfile @@ -53,7 +53,7 @@ PROJECT_NAME = "Intel DML Library" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = "v0.1.5-beta" +PROJECT_NUMBER = "v0.1.6-beta" # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -831,8 +831,9 @@ WARN_LOGFILE = # Note: If this tag is empty the current directory is searched. 
INPUT = examples \ - sources \ README.md \ + CONTRIBUTING.md \ + SECURITY.md \ include \ doc/LOW_LEVEL_API_GUIDE.md \ doc/HIGH_LEVEL_API_GUIDE.md \ @@ -922,9 +923,7 @@ RECURSIVE = YES # Note that relative paths are relative to the directory from which doxygen is # run. -EXCLUDE = sources/hw-path/include/libaccel_config.h \ - include/dml/cpp/middle_layer/\ - sources/middle_layer/ +EXCLUDE = include/dml/detail # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -961,15 +960,12 @@ EXCLUDE_SYMBOLS = *_H__ \ DML_CORE_CHECK_* \ DML_BAD_* \ DML_PACKED_STRUCT_DECLARATION* \ - TEST_REGISTER* \ NULL \ DML_FUN \ OWN_API \ OWN_FUN \ OWN_API_INLINE \ - OWN_FUN_INLINE \ - DML_JOB_API_TEST_REGISTER \ - DML_UNIT_TEST_REGISTER + OWN_FUN_INLINE # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include diff --git a/README.md b/README.md index c3c001f..700beec 100644 --- a/README.md +++ b/README.md @@ -68,11 +68,6 @@ Note: cmake -DCMAKE_BUILD_TYPE=Release -DDML_HW=ON ``` -```shell -# Enable libaccel-config-3.2 support for hardware path -cmake -DCMAKE_BUILD_TYPE=Release -DDML_HW=ON -DLIB_ACCEL_3_2=ON -``` - - To enable `-frecord-gcc-switches` flag, use the DML_RECORD_SWITCHES option as follows: ```shell @@ -85,7 +80,7 @@ The resulting library is available in the `/lib` folder. ## Documentation - [Intel DML Reference Manual](./doc/DML_REFERENCE_MANUAL.md) -- [Security Policy](doc/SECURITY.md) +- [Security Policy](./SECURITY.md) To generate full documentation from sources with Doxygen, use the following commands: @@ -100,12 +95,12 @@ To open the generated HTML Reference, open the `/doc/html/index.htm ## How to Contribute -See [Contributing document](CONTRIBUTING.md) for details about contribution process. +See [Contributing document](./CONTRIBUTING.md) for details about contribution process. 
## License The library is licensed under the MIT license. Refer to the -"[LICENSE](LICENSE)" file for the full license text. +"[LICENSE](./LICENSE)" file for the full license text. This distribution includes third party software governed by separate license -terms (see "[THIRD-PARTY-PROGRAMS](THIRD-PARTY-PROGRAMS)"). +terms (see "[THIRD-PARTY-PROGRAMS](./third-party-programs.txt)"). diff --git a/cmake/gnu.cmake b/cmake/gnu.cmake index 7c9e85e..99c83cd 100644 --- a/cmake/gnu.cmake +++ b/cmake/gnu.cmake @@ -26,10 +26,13 @@ list(APPEND DML_SECURITY_DEFINITIONS $<$:_FORTIFY_SOURCE=2>) list(APPEND DML_QUALITY_OPTIONS - $<$:-Werror> -Wall -Wextra -pedantic ) -list(APPEND DML_AVX512_OPTIONS -march=skylake-avx512) # -mavx512dq -mavx512vl -mavx512bw -mclflushopt -mclwb +list(APPEND DML_CPP_PRIVATE_OPTIONS + -fno-exceptions + -fno-rtti + -fno-threadsafe-statics + ) diff --git a/cmake/msvc.cmake b/cmake/msvc.cmake index 77285cc..e345bf7 100644 --- a/cmake/msvc.cmake +++ b/cmake/msvc.cmake @@ -19,9 +19,6 @@ list(APPEND DML_SECURITY_OPTIONS list(APPEND DML_SECURITY_DEFINITIONS) -list(APPEND DML_QUALITY_OPTIONS - $<$:/Wx>> - #/Wall MSVC warning level can be set only via CMAKE_CXX_FLAGS - ) +list(APPEND DML_QUALITY_OPTIONS /W3) -list(APPEND DML_AVX512_OPTIONS /arch:AVX512) +list(APPEND DML_CPP_PRIVATE_OPTIONS) diff --git a/doc/DML_REFERENCE_MANUAL.md b/doc/DML_REFERENCE_MANUAL.md index c82ef1b..73cb4fa 100644 --- a/doc/DML_REFERENCE_MANUAL.md +++ b/doc/DML_REFERENCE_MANUAL.md @@ -119,14 +119,14 @@ Hardware path is required to set up environment to utilize Intel DSA accelerator ### Build Prerequisites - Compiler: - - Linux* OS: gcc 8.2 or higher - - Windows* OS: MSVC 19 or higher - - Libraries: - - Linux: Universally Unique ID library: `uuid-dev` version 2.35.2 or higher. + - Linux* OS: gcc 8.2 or higher + - Windows* OS: MSVC 19 or higher +- Libraries: + - Linux: Universally Unique ID library: `uuid-dev` version 2.35.2 or higher. 
- Cross-platform build tool: CMake* version 3.12 or higher - Make: GNU 'make' (Linux* OS) or 'nmake' (Windows*) -- Documentation generator: - - Doxygen 1.8.17 or higher +- Documentation generator: + - Doxygen 1.8.17 or higher diff --git a/doc/RELEASE_NOTES.md b/doc/RELEASE_NOTES.md index c9ab7a7..7f5f58c 100644 --- a/doc/RELEASE_NOTES.md +++ b/doc/RELEASE_NOTES.md @@ -1,6 +1,31 @@ Intel® Data Mover Library (Intel® DML) Release Notes =============================================================================== +### Intel® DML v0.1.6-beta + +**Date: December 2021** + +**Note**: Release introduces bug fixes and several minor improvements + +**Features**: +* Improved incorrect input checking +* Added check for adjacent buffers for the DIF Strip operation. Status: `DML_STATUS_DIF_STRIP_ADJACENT_ERROR` +* Reworked hardware related statuses for C API +* Added new status to indicate submission failure: + * `DML_STATUS_WORK_QUEUES_NOT_AVAILABLE` for C API + * `dml::status_code::queue_busy` for C++ API +* Removed LIBACCEL_3_2 cmake option. The supported version of accel-config is now 3.2 and higher +* NUMA node id is detected before each submission now, so threads are safe to change nodes at any time + +**Bug fix**: +* Fixed the issue when batch operation doesn't work for buffer not aligned on 64 bytes boundary +* Fixed the issue when current thread NUMA node id is deduced incorrectly +* Fixed crashes when there are no available devices for the current thread NUMA node id +* Removed dependencies on C++ runtime from C API + +**Warnings**: +* As NUMA node id of the current thread is now deduced correctly, ensure that accelerators' configuration is compatible. The library does no cross-socket submissions. If there is no available device for the current NUMA node id, then an error status code is reported. 
+ ### Intel® DML v0.1.5-beta **Date: November 2021** diff --git a/examples/dml_job_api/job_wrapper_launchers.c b/examples/dml_job_api/job_wrapper_launchers.c index 46ab635..3184d12 100644 --- a/examples/dml_job_api/job_wrapper_launchers.c +++ b/examples/dml_job_api/job_wrapper_launchers.c @@ -414,7 +414,8 @@ dml_status_t dif_strip_sample_launcher(dml_job_t *const dml_job_ptr) DIF_SAMPLE_REFERENCE_ARRAY_SIZE, DIF_SAMPLE_BLOCK_SIZE); - uint8_t source_array[DIF_SAMPLE_REFERENCE_ARRAY_SIZE]; + // The same size used because of DML_STATUS_DIF_STRIP_ADJACENT_ERROR + uint8_t source_array[DIF_SAMPLE_PROTECTED_ARRAY_SIZE]; uint8_t destination_array[DIF_SAMPLE_PROTECTED_ARRAY_SIZE]; dml_status_t status = dml_dif_insert_8u(source_array, diff --git a/include/dml/cpp/detail/make_result.hpp b/include/dml/cpp/detail/make_result.hpp deleted file mode 100644 index 49a8f8b..0000000 --- a/include/dml/cpp/detail/make_result.hpp +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright 2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. 
- * - */ - -/** - * @date 05/20/2021 - * @brief Contains internal execute implementation - */ - -#ifndef DML_DETAIL_MAKE_RESULT_HPP -#define DML_DETAIL_MAKE_RESULT_HPP - -#include -#include -#include -#include - -namespace dml::detail -{ - /** - * @todo - */ - inline auto to_own(ml::execution_status status) noexcept - { - switch (status) - { - case ml::execution_status::success: - return status_code::ok; - case ml::execution_status::false_predicate_success: - return status_code::false_predicate; - default: - // Anything else is considered an error temporarily - return status_code::error; - } - } - - /** - * @todo - */ - template - auto make_result(ml::completion_record& record) noexcept - { - if constexpr (std::is_same_v) - { - auto view = ml::views::mem_move_result(record); - - return mem_move_result{ to_own(static_cast(view.status())) }; - } - if constexpr (std::is_same_v) - { - auto view = ml::views::mem_move_result(record); - - return mem_copy_result{ to_own(static_cast(view.status())) }; - } - else if constexpr (std::is_same_v) - { - auto view = ml::views::fill_result(record); - - return fill_result{ to_own(static_cast(view.status())) }; - } - else if constexpr (std::is_same_v) - { - auto view = ml::views::compare_result(record); - - return compare_result{ to_own(static_cast(view.status())), - static_cast(view.result()), - view.bytes_completed() }; - } - else if constexpr (std::is_same_v) - { - auto view = ml::views::create_delta_result(record); - - return create_delta_result{ to_own(static_cast(view.status())), - static_cast(view.result()), - view.bytes_completed(), - view.delta_record_size() }; - } - else if constexpr (std::is_same_v) - { - auto view = ml::views::apply_delta_result(record); - - return apply_delta_result{ to_own(static_cast(view.status())) }; - } - else if constexpr (std::is_same_v) - { - auto view = ml::views::dualcast_result(record); - - return dualcast_result{ to_own(static_cast(view.status())) }; - } - else if constexpr (std::is_same_v) - 
{ - auto view = ml::views::crc_result(record); - - return crc_result{ to_own(static_cast(view.status())), - view.crc_value() }; - } - else if constexpr (std::is_same_v) - { - auto view = ml::views::cache_flush_result(record); - - return cache_flush_result{ to_own(static_cast(view.status())) }; - } - else if constexpr (std::is_same_v) - { - auto view = ml::views::batch_result(record); - - return batch_result{ to_own(static_cast(view.status())), - view.descriptors_completed() }; - } - } - -} // namespace dml::detail - -#endif //DML_DETAIL_MAKE_RESULT_HPP diff --git a/include/dml/cpp/middle_layer/awaiter.hpp b/include/dml/cpp/middle_layer/awaiter.hpp deleted file mode 100644 index ab9431f..0000000 --- a/include/dml/cpp/middle_layer/awaiter.hpp +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright 2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. 
- * - */ - -#ifndef DML_AWAITER_HPP -#define DML_AWAITER_HPP - -#include - -namespace dml::ml -{ - /** - * @brief Class that allows to defer scope exit to the moment when a certain address is changed - */ - class awaiter final - { - public: - /** - * @brief Constructor of the class - * - * @param address pointer to memory that should be asynchronously changed - * @param initial_value value to compare with - * @param period number of clocks between checks - */ - explicit awaiter(volatile void *address, uint8_t initial_value, uint32_t period = 200) noexcept; - - /** - * @brief Destructor that performs actual wait - */ - ~awaiter() noexcept; - - private: - volatile uint8_t *address_ptr_; /** - -#include "awaiter.hpp" -#include "types.hpp" - -namespace dml::ml -{ - /** - * @todo - */ - struct alignas(32u) completion_record - { - byte_t bytes[32u]; /**< @todo */ - }; - - /** - * @todo - */ - inline void wait(volatile completion_record &record) noexcept - { - awaiter wait_for(static_cast(&record), 0); - } - - /** - * @todo - */ - inline bool is_finished(const completion_record &record) noexcept - { - return 0 != record.bytes[0]; - } - -} // namespace dml::ml - -#endif //DML_ML_COMPLETION_RECORD_HPP diff --git a/include/dml/cpp/middle_layer/device.hpp b/include/dml/cpp/middle_layer/device.hpp deleted file mode 100644 index e440d2f..0000000 --- a/include/dml/cpp/middle_layer/device.hpp +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright 2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. 
- * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. - * - */ - -#ifndef DML_ML_DEVICE_HPP -#define DML_ML_DEVICE_HPP - -#include "completion_record.hpp" -#include "core.hpp" -#include "make_descriptor.hpp" -#include "status.hpp" - -namespace dml::ml -{ - /** - * @todo - */ - class device - { - public: - /** - * @todo - */ - device() = default; - - /** - * @todo - */ - virtual ~device() = default; - - /** - * @todo - */ - virtual submission_status submit(descriptor& dsc, completion_record& record) noexcept = 0; - }; - - /** - * @todo - */ - class software: public device - { - public: - /** - * @todo - */ - software() = default; - - /** - * @todo - */ - ~software() override = default; - - /** - * @todo - */ - submission_status submit(descriptor& dsc, completion_record& record) noexcept override - { - views::any_descriptor(dsc).completion_record_address() = reinterpret_cast(&record); - static_cast(core::submit(dsc)); - - return submission_status::success; - } - }; - - /** - * @todo - */ - class hardware: public device - { - public: - /** - * @todo - */ - hardware() = default; - - /** - * @todo - */ - ~hardware() override = default; - -#ifdef DML_HW - /** - * @todo - */ - submission_status submit(descriptor& dsc, completion_record& record) noexcept override; -#else - /** - * @todo - */ - submission_status submit(descriptor& dsc, completion_record& record) noexcept override - { - return submission_status::failure; - } -#endif - }; - -} // namespace dml::ml - -#endif //DML_ML_DEVICE_HPP diff --git a/include/dml/cpp/middle_layer/make_descriptor.hpp b/include/dml/cpp/middle_layer/make_descriptor.hpp deleted file mode 100644 index 8b04163..0000000 --- a/include/dml/cpp/middle_layer/make_descriptor.hpp +++ /dev/null @@ -1,419 +0,0 @@ -/* - * Copyright 2021 Intel Corporation. 
- * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. - * - */ - -#ifndef DML_ML_OPERATION_HPP -#define DML_ML_OPERATION_HPP - -#include "descriptor.hpp" -#include "descriptor_views.hpp" -#include "options.hpp" -#include "types.hpp" -#include "values.hpp" - -namespace dml::ml -{ - /** - * @todo - */ - [[nodiscard]] inline descriptor make_nop_descriptor(const nop_options options) noexcept - { - auto dsc = descriptor(); - auto view = views::nop_descriptor(dsc); - - view.operation() = static_cast(operation::nop); - view.flags() = static_cast(options); - - return dsc; - } - - /** - * @todo - */ - [[nodiscard]] inline descriptor make_drain_descriptor(address_t readback_address_1, - address_t readback_address_2, - const drain_options options, - const drain_additional_options additional_options) noexcept - { - auto dsc = descriptor(); - auto view = views::drain_descriptor(dsc); - - view.operation() = static_cast(operation::drain); - view.readback_address_1() = readback_address_1; - view.readback_address_2() = readback_address_2; - view.flags() = static_cast(options); - view.operation_specific_flags() = static_cast(additional_options); - - return dsc; - } - - /** - * @todo - */ - [[nodiscard]] inline descriptor make_mem_move_descriptor(const byte_t *const src, - byte_t *const dst, - const transfer_size_t size, - const mem_move_options options) noexcept - { - auto dsc = descriptor(); - auto view = views::mem_move_descriptor(dsc); - - view.operation() = 
static_cast(operation::memory_move); - view.source_address() = reinterpret_cast(src); - view.destination_address() = reinterpret_cast(dst); - view.transfer_size() = size; - view.flags() = static_cast(options); - - return dsc; - } - - /** - * @todo - */ - [[nodiscard]] inline descriptor make_fill_descriptor(const uint64_t pattern, - byte_t *const dst, - const transfer_size_t size, - const fill_options options) noexcept - { - auto dsc = descriptor(); - auto view = views::fill_descriptor(dsc); - - view.operation() = static_cast(operation::fill); - view.pattern() = pattern; - view.destination_address() = reinterpret_cast(dst); - view.transfer_size() = size; - view.flags() = static_cast(options); - - return dsc; - } - - /** - * @todo - */ - [[nodiscard]] inline descriptor make_dualcast_descriptor(const byte_t *const src, - byte_t *const dst1, - byte_t *const dst2, - const transfer_size_t size, - const dualcast_options options, - const dualcast_additional_options additional_options) noexcept - { - auto dsc = descriptor(); - auto view = views::dualcast_descriptor(dsc); - - view.operation() = static_cast(operation::dualcast); - view.source_address() = reinterpret_cast(src); - view.destination_1_address() = reinterpret_cast(dst1); - view.destination_2_address() = reinterpret_cast(dst2); - view.transfer_size() = size; - view.flags() = static_cast(options); - view.operation_specific_flags() = static_cast(additional_options); - - return dsc; - } - - /** - * @todo - */ - [[nodiscard]] inline descriptor make_compare_descriptor(const byte_t *const src1, - const byte_t *const src2, - const transfer_size_t size, - const compare_options options, - const compare_expected_result_options expected_result) noexcept - { - auto dsc = descriptor(); - auto view = views::compare_descriptor(dsc); - - view.operation() = static_cast(operation::compare); - view.source_1_address() = reinterpret_cast(src1); - view.source_2_address() = reinterpret_cast(src2); - view.transfer_size() = size; - 
view.flags() = static_cast(options); - view.expected_result() = static_cast(expected_result); - - return dsc; - } - - /** - * @todo - */ - [[nodiscard]] inline descriptor make_compare_pattern_descriptor(const uint64_t pattern, - const byte_t *src, - const transfer_size_t size, - const compare_pattern_options options, - const compare_expected_result_options expected_result) noexcept - { - auto dsc = descriptor(); - auto view = views::compare_pattern_descriptor(dsc); - - view.operation() = static_cast(operation::compare_pattern); - view.pattern() = pattern; - view.source_address() = reinterpret_cast(src); - view.transfer_size() = size; - view.flags() = static_cast(options); - view.expected_result() = static_cast(expected_result); - - return dsc; - } - - /** - * @todo - */ - [[nodiscard]] inline descriptor make_crc_descriptor(const byte_t *const src, - const transfer_size_t size, - const crc_value_t crc_seed, - const crc_options options, - const crc_additional_options additional_options) noexcept - { - auto dsc = descriptor(); - auto view = views::crc_descriptor(dsc); - - view.operation() = static_cast(operation::crc); - view.source_address() = reinterpret_cast(src); - view.transfer_size() = size; - view.flags() = static_cast(options); - view.operation_specific_flags() = static_cast(additional_options); - view.crc_seed() = crc_seed; - - return dsc; - } - - /** - * @todo - */ - [[nodiscard]] inline descriptor make_copy_crc_descriptor(const byte_t *const src, - byte_t *const dst, - const transfer_size_t size, - const crc_value_t crc_seed, - const copy_crc_options options, - const copy_crc_additional_options additional_options) noexcept - { - auto dsc = descriptor(); - auto view = views::copy_crc_descriptor(dsc); - - view.operation() = static_cast(operation::copy_crc); - view.source_address() = reinterpret_cast(src); - view.destination_address() = reinterpret_cast(dst); - view.transfer_size() = size; - view.flags() = static_cast(options); - 
view.operation_specific_flags() = static_cast(additional_options); - view.crc_seed() = crc_seed; - - return dsc; - } - - /** - * @todo - */ - [[nodiscard]] inline descriptor make_create_delta_descriptor(const byte_t *const src1, - const byte_t *const src2, - const transfer_size_t size, - byte_t *const delta_record, - const transfer_size_t delta_max_size, - const create_delta_options options, - const delta_expected_result_options expected_result) noexcept - { - auto dsc = descriptor(); - auto view = views::create_delta_descriptor(dsc); - - view.operation() = static_cast(operation::create_delta); - view.source_1_address() = reinterpret_cast(src1); - view.source_2_address() = reinterpret_cast(src2); - view.delta_record_address() = reinterpret_cast(delta_record); - view.transfer_size() = size; - view.maximum_delta_record_size() = delta_max_size; - view.flags() = static_cast(options); - view.expected_result_mask() = static_cast(expected_result); - - return dsc; - } - - /** - * @todo - */ - [[nodiscard]] inline descriptor make_apply_delta_descriptor(const byte_t *const delta_record, - const transfer_size_t delta_size, - byte_t *const dst, - const transfer_size_t size, - const apply_delta_options options) noexcept - { - auto dsc = descriptor(); - auto view = views::apply_delta_descriptor(dsc); - - view.operation() = static_cast(operation::apply_delta); - view.delta_record_address() = reinterpret_cast(delta_record); - view.destination_address() = reinterpret_cast(dst); - view.transfer_size() = size; - view.delta_record_size() = delta_size; - view.flags() = static_cast(options); - - return dsc; - } - - /** - * @todo - */ - [[nodiscard]] inline descriptor make_cache_flush_descriptor(byte_t *const dst, - const transfer_size_t size, - const cache_flush_options options) noexcept - { - auto dsc = descriptor(); - auto view = views::cache_flush_descriptor(dsc); - - view.operation() = static_cast(operation::cache_flush); - view.destination_address() = reinterpret_cast(dst); - 
view.transfer_size() = size; - view.flags() = static_cast(options); - - return dsc; - } - - /** - * @todo - */ - [[nodiscard]] inline descriptor make_dif_check_descriptor(const byte_t *src, - transfer_size_t transfer_size, - dif_parameters src_parameters, - dif_check_options options, - dif_additional_options additional_options, - dif_additional_src_options additional_src_options) noexcept - { - auto dsc = descriptor(); - auto view = views::dif_check_descriptor(dsc); - - view.operation() = static_cast(operation::dif_check); - view.source_address() = reinterpret_cast(src); - view.transfer_size() = transfer_size; - view.flags() = static_cast(options); - view.dif_flags() = static_cast(additional_options); - view.source_dif_flags() = static_cast(additional_src_options); - view.source_ref_tag() = src_parameters.ref_tag_seed; - view.source_app_tag() = src_parameters.app_tag_seed; - view.source_app_tag_mask() = src_parameters.app_tag_mask; - - return dsc; - } - - /** - * @todo - */ - [[nodiscard]] inline descriptor make_dif_insert_descriptor(const byte_t *src, - byte_t *dst, - transfer_size_t transfer_size, - dif_parameters dst_parameters, - dif_insert_options options, - dif_additional_options additional_options, - dif_additional_dst_options additional_dst_options) noexcept - { - auto dsc = descriptor(); - auto view = views::dif_insert_descriptor(dsc); - - view.operation() = static_cast(operation::dif_insert); - view.source_address() = reinterpret_cast(src); - view.destination_address() = reinterpret_cast(dst); - view.transfer_size() = transfer_size; - view.flags() = static_cast(options); - view.dif_flags() = static_cast(additional_options); - view.destination_dif_flags() = static_cast(additional_dst_options); - view.destination_ref_tag() = dst_parameters.ref_tag_seed; - view.destination_app_tag() = dst_parameters.app_tag_seed; - view.destination_app_tag_mask() = dst_parameters.app_tag_mask; - - return dsc; - } - - /** - * @todo - */ - [[nodiscard]] inline descriptor 
make_dif_strip_descriptor(const byte_t *src, - byte_t *dst, - transfer_size_t transfer_size, - dif_parameters src_parameters, - dif_strip_options options, - dif_additional_options additional_options, - dif_additional_src_options additional_src_options) noexcept - { - auto dsc = descriptor(); - auto view = views::dif_strip_descriptor(dsc); - - view.operation() = static_cast(operation::dif_strip); - view.source_address() = reinterpret_cast(src); - view.destination_address() = reinterpret_cast(dst); - view.transfer_size() = transfer_size; - view.flags() = static_cast(options); - view.dif_flags() = static_cast(additional_options); - view.source_dif_flags() = static_cast(additional_src_options); - view.source_ref_tag() = src_parameters.ref_tag_seed; - view.source_app_tag() = src_parameters.app_tag_seed; - view.source_app_tag_mask() = src_parameters.app_tag_mask; - - return dsc; - } - - /** - * @todo - */ - [[nodiscard]] inline descriptor make_dif_update_descriptor(const byte_t *src, - byte_t *dst, - transfer_size_t transfer_size, - dif_parameters src_parameters, - dif_parameters dst_parameters, - dif_update_options options, - dif_additional_options additional_options, - dif_additional_src_options additional_src_options, - dif_additional_dst_options additional_dst_options) noexcept - { - auto dsc = descriptor(); - auto view = views::dif_update_descriptor(dsc); - - view.operation() = static_cast(operation::dif_update); - view.source_address() = reinterpret_cast(src); - view.destination_address() = reinterpret_cast(dst); - view.transfer_size() = transfer_size; - view.flags() = static_cast(options); - view.dif_flags() = static_cast(additional_options); - - view.source_dif_flags() = static_cast(additional_src_options); - view.source_ref_tag() = src_parameters.ref_tag_seed; - view.source_app_tag() = src_parameters.app_tag_seed; - view.source_app_tag_mask() = src_parameters.app_tag_mask; - - view.destination_dif_flags() = static_cast(additional_dst_options); - 
view.destination_ref_tag() = dst_parameters.ref_tag_seed; - view.destination_app_tag() = dst_parameters.app_tag_seed; - view.destination_app_tag_mask() = dst_parameters.app_tag_mask; - - return dsc; - } - - /** - * @todo - */ - [[nodiscard]] inline descriptor make_batch_descriptor(const descriptor *const src, - const transfer_size_t length, - const batch_options options) noexcept - { - auto dsc = descriptor(); - auto view = views::batch_descriptor(dsc); - - view.operation() = static_cast(operation::batch); - view.descriptor_list_address() = reinterpret_cast(src); - view.descriptors_count() = length; - view.flags() = static_cast(options); - - return dsc; - } -} // namespace dml::ml - -#endif //DML_ML_OPERATION_HPP diff --git a/include/dml/cpp/middle_layer/option_types.hpp b/include/dml/cpp/middle_layer/option_types.hpp deleted file mode 100644 index e58e910..0000000 --- a/include/dml/cpp/middle_layer/option_types.hpp +++ /dev/null @@ -1,590 +0,0 @@ -/* - * Copyright 2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. 
- * - */ - -#ifndef DML_COMMON_OPTION_TYPES_HPP -#define DML_COMMON_OPTION_TYPES_HPP - -#include -#include - -#include "types.hpp" -#include "values.hpp" - -namespace dml::ml -{ - /** - * @todo - */ - template - class options_interface - { - public: - /** - * @todo - */ - explicit constexpr options_interface(underlying_type_t flags = 0u) noexcept: flags_(flags) - { - } - - /** - * @todo - */ - constexpr explicit operator underlying_type_t() const noexcept - { - return flags_; - } - - /** - * @todo - */ - constexpr bool operator==(const options_interface &rhs) const - { - return flags_ == rhs.flags_; - } - - /** - * @todo - */ - constexpr bool operator!=(const options_interface &rhs) const - { - return rhs != *this; - } - - /** - * @todo - */ - [[nodiscard]] constexpr bool contains(const options_interface &rhs) const noexcept - { - return (this->flags_ & rhs.flags_) == rhs.flags_; - } - - /** - * @todo - */ - - protected: - underlying_type_t flags_; /**< @todo */ - }; - - /** - * @todo - */ - class options: public options_interface - { - public: - using options_interface::options_interface; - - constexpr options(flag value): options_interface(static_cast>(value)) - { - } - }; - - /** - * @todo - */ - class nop_options: public options - { - public: - using options::options; - - /** - * @todo - */ - constexpr nop_options operator|(const nop_options &rhs) const noexcept - { - return nop_options(this->flags_ | rhs.flags_); - } - }; - - /** - * @todo - */ - class drain_options: public options - { - public: - using options::options; - - /** - * @todo - */ - constexpr drain_options operator|(const drain_options &rhs) const noexcept - { - return drain_options(this->flags_ | rhs.flags_); - } - }; - - /** - * @todo - */ - class mem_move_options: public options - { - public: - using options::options; - - /** - * @todo - */ - constexpr mem_move_options operator|(const mem_move_options &rhs) const noexcept - { - return mem_move_options(this->flags_ | rhs.flags_); - } - }; - - 
/** - * @todo - */ - class fill_options: public options - { - public: - using options::options; - - /** - * @todo - */ - constexpr fill_options operator|(const fill_options &rhs) const noexcept - { - return fill_options(this->flags_ | rhs.flags_); - } - }; - - /** - * @todo - */ - class dualcast_options: public options - { - public: - using options::options; - - /** - * @todo - */ - constexpr dualcast_options operator|(const dualcast_options &rhs) const noexcept - { - return dualcast_options(this->flags_ | rhs.flags_); - } - }; - - /** - * @todo - */ - class compare_options: public options - { - public: - using options::options; - - /** - * @todo - */ - constexpr compare_options operator|(const compare_options &rhs) const noexcept - { - return compare_options(this->flags_ | rhs.flags_); - } - }; - - /** - * @todo - */ - class compare_pattern_options: public options - { - public: - using options::options; - - /** - * @todo - */ - constexpr compare_pattern_options operator|(const compare_pattern_options &rhs) const noexcept - { - return compare_pattern_options(this->flags_ | rhs.flags_); - } - }; - - /** -* @todo -*/ - class crc_options: public options - { - public: - using options::options; - - /** - * @todo - */ - constexpr crc_options operator|(const crc_options &rhs) const noexcept - { - return crc_options(this->flags_ | rhs.flags_); - } - }; - - /** - * @todo - */ - class copy_crc_options: public options - { - public: - using options::options; - - /** - * @todo - */ - constexpr copy_crc_options operator|(const copy_crc_options &rhs) const noexcept - { - return copy_crc_options(this->flags_ | rhs.flags_); - } - }; - - /** - * @todo - */ - class create_delta_options: public options - { - public: - using options::options; - - /** - * @todo - */ - constexpr create_delta_options operator|(const create_delta_options &rhs) const noexcept - { - return create_delta_options(this->flags_ | rhs.flags_); - } - }; - - /** - * @todo - */ - class apply_delta_options: public 
options - { - public: - using options::options; - - /** - * @todo - */ - constexpr apply_delta_options operator|(const apply_delta_options &rhs) const noexcept - { - return apply_delta_options(this->flags_ | rhs.flags_); - } - }; - - /** - * @todo - */ - class cache_flush_options: public options - { - public: - using options::options; - - /** - * @todo - */ - constexpr cache_flush_options operator|(const cache_flush_options &rhs) const noexcept - { - return cache_flush_options(this->flags_ | rhs.flags_); - } - }; - - /** - * @todo - */ - class dif_check_options: public options - { - public: - using options::options; - - /** - * @todo - */ - constexpr dif_check_options operator|(const dif_check_options &rhs) const noexcept - { - return dif_check_options(this->flags_ | rhs.flags_); - } - }; - - /** - * @todo - */ - class dif_insert_options: public options - { - public: - using options::options; - - /** - * @todo - */ - constexpr dif_insert_options operator|(const dif_insert_options &rhs) const noexcept - { - return dif_insert_options(this->flags_ | rhs.flags_); - } - }; - - /** - * @todo - */ - class dif_strip_options: public options - { - public: - using options::options; - - /** - * @todo - */ - constexpr dif_strip_options operator|(const dif_strip_options &rhs) const noexcept - { - return dif_strip_options(this->flags_ | rhs.flags_); - } - }; - - /** - * @todo - */ - class dif_update_options: public options - { - public: - using options::options; - - /** - * @todo - */ - constexpr dif_update_options operator|(const dif_update_options &rhs) const noexcept - { - return dif_update_options(this->flags_ | rhs.flags_); - } - }; - - /** - * @todo - */ - class batch_options: public options - { - public: - using options::options; - - /** - * @todo - */ - constexpr batch_options operator|(const batch_options &rhs) const noexcept - { - return batch_options(this->flags_ | rhs.flags_); - } - }; - - /** - * @todo - */ - class additional_options: public options_interface - { - 
public: - using options_interface::options_interface; - }; - - /** - * @todo - */ - class compare_expected_result_options: public options_interface - { - public: - using options_interface::options_interface; - - /** - * @todo - */ - constexpr compare_expected_result_options operator|(const compare_expected_result_options &rhs) const noexcept - { - return compare_expected_result_options(this->flags_ | rhs.flags_); - } - }; - - /** - * @todo - */ - class delta_expected_result_options: public options_interface - { - public: - using options_interface::options_interface; - - /** - * @todo - */ - constexpr delta_expected_result_options operator|(const delta_expected_result_options &rhs) const noexcept - { - return delta_expected_result_options(this->flags_ | rhs.flags_); - } - }; - - /** - * @todo - */ - class drain_additional_options: public additional_options - { - public: - using additional_options::additional_options; - - constexpr drain_additional_options() = default; - - constexpr drain_additional_options(drain_flag value): additional_options(static_cast>(value)) - { - } - - /** - * @todo - */ - constexpr drain_additional_options operator|(const drain_additional_options &rhs) const noexcept - { - return drain_additional_options(this->flags_ | rhs.flags_); - } - }; - - /** - * @todo - */ - class crc_additional_options: public additional_options - { - public: - using additional_options::additional_options; - - constexpr crc_additional_options() = default; - - constexpr crc_additional_options(crc_flag value): additional_options(static_cast>(value)) - { - } - - /** - * @todo - */ - constexpr crc_additional_options operator|(const crc_additional_options &rhs) const noexcept - { - return crc_additional_options(this->flags_ | rhs.flags_); - } - }; - - /** - * @todo - */ - class copy_crc_additional_options: public additional_options - { - public: - using additional_options::additional_options; - - constexpr copy_crc_additional_options() = default; - - constexpr 
copy_crc_additional_options(crc_flag value): additional_options(static_cast>(value)) - { - } - - /** - * @todo - */ - constexpr copy_crc_additional_options operator|(const copy_crc_additional_options &rhs) const noexcept - { - return copy_crc_additional_options(this->flags_ | rhs.flags_); - } - }; - - /** - * @todo - */ - class dualcast_additional_options: public additional_options - { - public: - using additional_options::additional_options; - - constexpr dualcast_additional_options() = default; - - constexpr dualcast_additional_options(dualcast_flag value): additional_options(static_cast>(value)) - { - } - - /** - * @todo - */ - constexpr dualcast_additional_options operator|(const dualcast_additional_options &rhs) const noexcept - { - return dualcast_additional_options(this->flags_ | rhs.flags_); - } - }; - - /** - * @todo - */ - class dif_additional_options: public additional_options - { - public: - using additional_options::additional_options; - - constexpr dif_additional_options() = default; - - constexpr dif_additional_options(dif_flag value): additional_options(static_cast>(value)) - { - } - - /** - * @todo - */ - constexpr dif_additional_options operator|(const dif_additional_options &rhs) const noexcept - { - return dif_additional_options(this->flags_ | rhs.flags_); - } - }; - - /** - * @todo - */ - class dif_additional_src_options: public additional_options - { - public: - using additional_options::additional_options; - - constexpr dif_additional_src_options() = default; - - constexpr dif_additional_src_options(dif_src_flag value): additional_options(static_cast>(value)) - { - } - - /** - * @todo - */ - constexpr dif_additional_src_options operator|(const dif_additional_src_options &rhs) const noexcept - { - return dif_additional_src_options(this->flags_ | rhs.flags_); - } - }; - - /** - * @todo - */ - class dif_additional_dst_options: public additional_options - { - public: - using additional_options::additional_options; - - constexpr 
dif_additional_dst_options() = default; - - constexpr dif_additional_dst_options(dif_dst_flag value): additional_options(static_cast>(value)) - { - } - - /** - * @todo - */ - constexpr dif_additional_dst_options operator|(const dif_additional_dst_options &rhs) const noexcept - { - return dif_additional_dst_options(this->flags_ | rhs.flags_); - } - }; - -} // namespace dml::ml - -#endif //DML_COMMON_OPTION_TYPES_HPP diff --git a/include/dml/cpp/middle_layer/options.hpp b/include/dml/cpp/middle_layer/options.hpp deleted file mode 100644 index 9bb3ca4..0000000 --- a/include/dml/cpp/middle_layer/options.hpp +++ /dev/null @@ -1,416 +0,0 @@ -/* - * Copyright 2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. 
- * - */ - -#ifndef DML_COMMON_OPTIONS_HPP -#define DML_COMMON_OPTIONS_HPP - -#include "option_types.hpp" -#include "values.hpp" - -namespace dml::ml -{ - /** - * @todo - */ - struct nop_option - { - using type = nop_options; - - static constexpr type none = type(); - static constexpr type fence = flag::fence; - }; - - /** - * @todo - */ - struct drain_option - { - using type = drain_options; - - static constexpr type none = type(); - static constexpr type address_1_tc_selector = flag::address_1_tc_selector; - static constexpr type address_2_tc_selector = flag::address_2_tc_selector; - }; - - /** - * @todo - */ - struct mem_move_option - { - using type = mem_move_options; - - static constexpr type none = type(); - static constexpr type fence = flag::fence; - static constexpr type block_on_fault = flag::block_on_fault; - static constexpr type cache_control = flag::cache_control; - static constexpr type address_1_tc_selector = flag::address_1_tc_selector; - static constexpr type address_2_tc_selector = flag::address_2_tc_selector; - static constexpr type strict_ordering = flag::strict_ordering; - static constexpr type destination_readback = flag::destination_readback; - static constexpr type destination_steering_tag_selector = flag::destination_steering_tag_selector; - }; - - /** - * @todo - */ - struct fill_option - { - using type = fill_options; - - static constexpr type none = type(); - static constexpr type fence = flag::fence; - static constexpr type block_on_fault = flag::block_on_fault; - static constexpr type cache_control = flag::cache_control; - static constexpr type address_2_tc_selector = flag::address_2_tc_selector; - static constexpr type strict_ordering = flag::strict_ordering; - static constexpr type destination_readback = flag::destination_readback; - static constexpr type destination_steering_tag_selector = flag::destination_steering_tag_selector; - }; - - /** - * @todo - */ - struct dualcast_option - { - using type = dualcast_options; - - static 
constexpr type none = type(); - static constexpr type fence = flag::fence; - static constexpr type block_on_fault = flag::block_on_fault; - static constexpr type cache_control = flag::cache_control; - static constexpr type address_1_tc_selector = flag::address_1_tc_selector; - static constexpr type address_2_tc_selector = flag::address_2_tc_selector; - static constexpr type address_3_tc_selector = flag::address_3_tc_selector; - static constexpr type strict_ordering = flag::strict_ordering; - static constexpr type destination_readback = flag::destination_readback; - static constexpr type destination_steering_tag_selector = flag::destination_steering_tag_selector; - }; - - /** - * @todo - */ - struct compare_option - { - using type = compare_options; - - static constexpr type none = type(); - static constexpr type fence = flag::fence; - static constexpr type block_on_fault = flag::block_on_fault; - static constexpr type check_result = flag::check_result; - static constexpr type address_1_tc_selector = flag::address_1_tc_selector; - static constexpr type address_2_tc_selector = flag::address_2_tc_selector; - }; - - /** - * @todo - */ - struct compare_pattern_option - { - using type = compare_pattern_options; - - static constexpr type none = type(); - static constexpr type fence = flag::fence; - static constexpr type block_on_fault = flag::block_on_fault; - static constexpr type check_result = flag::check_result; - static constexpr type address_1_tc_selector = flag::address_1_tc_selector; - }; - - /** - * @todo - */ - struct crc_option - { - using type = crc_options; - - static constexpr type none = type(); - static constexpr type fence = flag::fence; - static constexpr type block_on_fault = flag::block_on_fault; - static constexpr type address_1_tc_selector = flag::address_1_tc_selector; - static constexpr type address_3_tc_selector = flag::address_3_tc_selector; - }; - - /** - * @todo - */ - struct copy_crc_option - { - using type = copy_crc_options; - - static 
constexpr type none = type(); - static constexpr type fence = flag::fence; - static constexpr type block_on_fault = flag::block_on_fault; - static constexpr type cache_control = flag::cache_control; - static constexpr type address_1_tc_selector = flag::address_1_tc_selector; - static constexpr type address_2_tc_selector = flag::address_2_tc_selector; - static constexpr type address_3_tc_selector = flag::address_3_tc_selector; - static constexpr type strict_ordering = flag::strict_ordering; - static constexpr type destination_readback = flag::destination_readback; - static constexpr type destination_steering_tag_selector = flag::destination_steering_tag_selector; - }; - - /** - * @todo - */ - struct create_delta_option - { - using type = create_delta_options; - - static constexpr type none = type(); - static constexpr type fence = flag::fence; - static constexpr type block_on_fault = flag::block_on_fault; - static constexpr type check_result = flag::check_result; - static constexpr type cache_control = flag::cache_control; - static constexpr type address_1_tc_selector = flag::address_1_tc_selector; - static constexpr type address_2_tc_selector = flag::address_2_tc_selector; - static constexpr type address_3_tc_selector = flag::address_3_tc_selector; - static constexpr type strict_ordering = flag::strict_ordering; - static constexpr type destination_readback = flag::destination_readback; - static constexpr type destination_steering_tag_selector = flag::destination_steering_tag_selector; - }; - - /** - * @todo - */ - struct apply_delta_option - { - using type = apply_delta_options; - - static constexpr type none = type(); - static constexpr type fence = flag::fence; - static constexpr type block_on_fault = flag::block_on_fault; - static constexpr type cache_control = flag::cache_control; - static constexpr type address_1_tc_selector = flag::address_1_tc_selector; - static constexpr type address_2_tc_selector = flag::address_2_tc_selector; - static constexpr type 
strict_ordering = flag::strict_ordering; - static constexpr type destination_readback = flag::destination_readback; - static constexpr type destination_steering_tag_selector = flag::destination_steering_tag_selector; - }; - - /** - * @todo - */ - struct cache_flush_option - { - using type = cache_flush_options; - - static constexpr type none = type(); - static constexpr type fence = flag::fence; - static constexpr type block_on_fault = flag::block_on_fault; - static constexpr type cache_control = flag::cache_control; - static constexpr type address_2_tc_selector = flag::address_2_tc_selector; - static constexpr type strict_ordering = flag::strict_ordering; - static constexpr type destination_readback = flag::destination_readback; - static constexpr type destination_steering_tag_selector = flag::destination_steering_tag_selector; - }; - - /** - * @todo - */ - struct batch_option - { - using type = batch_options; - - static constexpr type none = type(); - static constexpr type address_1_tc_selector = flag::address_1_tc_selector; - }; - - /** - * @todo - */ - struct dif_check_option - { - using type = dif_check_options; - - static constexpr type none = type(0); /**< @todo */ - static constexpr type fence = flag::fence; /**< @todo */ - static constexpr type block_on_fault = flag::block_on_fault; /**< @todo */ - static constexpr type address_1_tc_selector = flag::address_1_tc_selector; /**< @todo */ - }; - - /** - * @todo - */ - struct dif_insert_option - { - using type = dif_insert_options; - - static constexpr type none = type(0); /**< @todo */ - static constexpr type fence = flag::fence; /**< @todo */ - static constexpr type block_on_fault = flag::block_on_fault; /**< @todo */ - static constexpr type cache_control = flag::cache_control; /**< @todo */ - static constexpr type address_1_tc_selector = flag::address_1_tc_selector; /**< @todo */ - static constexpr type address_2_tc_selector = flag::address_2_tc_selector; /**< @todo */ - static constexpr type 
strict_ordering = flag::strict_ordering; /**< @todo */ - static constexpr type destination_readback = flag::destination_readback; /**< @todo */ - static constexpr type destination_steering_tag_selector = flag::destination_steering_tag_selector; /**< @todo */ - }; - - /** - * @todo - */ - struct dif_strip_option - { - using type = dif_strip_options; - - static constexpr type none = type(0); /**< @todo */ - static constexpr type fence = flag::fence; /**< @todo */ - static constexpr type block_on_fault = flag::block_on_fault; /**< @todo */ - static constexpr type cache_control = flag::cache_control; /**< @todo */ - static constexpr type address_1_tc_selector = flag::address_1_tc_selector; /**< @todo */ - static constexpr type address_2_tc_selector = flag::address_2_tc_selector; /**< @todo */ - static constexpr type strict_ordering = flag::strict_ordering; /**< @todo */ - static constexpr type destination_readback = flag::destination_readback; /**< @todo */ - static constexpr type destination_steering_tag_selector = flag::destination_steering_tag_selector; /**< @todo */ - }; - - /** - * @todo - */ - struct dif_update_option - { - using type = dif_update_options; - - static constexpr type none = type(0); /**< @todo */ - static constexpr type fence = flag::fence; /**< @todo */ - static constexpr type block_on_fault = flag::block_on_fault; /**< @todo */ - static constexpr type cache_control = flag::cache_control; /**< @todo */ - static constexpr type address_1_tc_selector = flag::address_1_tc_selector; /**< @todo */ - static constexpr type address_2_tc_selector = flag::address_2_tc_selector; /**< @todo */ - static constexpr type strict_ordering = flag::strict_ordering; /**< @todo */ - static constexpr type destination_readback = flag::destination_readback; /**< @todo */ - static constexpr type destination_steering_tag_selector = flag::destination_steering_tag_selector; /**< @todo */ - }; - - /** - * @todo - */ - struct compare_expected_result_option - { - static constexpr 
auto expect_equal = compare_expected_result_options(0); - static constexpr auto expect_not_equal = compare_expected_result_options(1); - }; - - /** - * @todo - */ - struct delta_expected_result_option - { - static constexpr auto expect_equal = delta_expected_result_options(1); - static constexpr auto expect_not_equal = delta_expected_result_options(2); - static constexpr auto expect_overflow = delta_expected_result_options(4); - }; - - /** - * @todo - */ - struct drain_additional_option - { - using type = drain_additional_options; - - static constexpr type none = type(); - static constexpr type readback_address_1_valid = drain_flag::readback_address_1_valid; - static constexpr type readback_address_2_valid = drain_flag::readback_address_2_valid; - static constexpr type suppress_tc_a_implicit_readback = drain_flag::suppress_tc_a_implicit_readback; - static constexpr type suppress_tc_b_implicit_readback = drain_flag::suppress_tc_b_implicit_readback; - }; - - /** - * @todo - */ - struct crc_additional_option - { - using type = crc_additional_options; - - static constexpr type none = type(); - static constexpr type read_crc_seed = crc_flag::read_crc_seed; - static constexpr type bypass_reflection = crc_flag::bypass_crc_inversion_and_reflection; - static constexpr type bypass_data_reflection = crc_flag::bypass_data_reflection; - }; - - /** - * @todo - */ - struct copy_crc_additional_option - { - using type = copy_crc_additional_options; - - static constexpr type none = type(); - static constexpr type read_crc_seed = crc_flag::read_crc_seed; - static constexpr type bypass_reflection = crc_flag::bypass_crc_inversion_and_reflection; - static constexpr type bypass_data_reflection = crc_flag::bypass_data_reflection; - }; - - /** - * @todo - */ - struct dualcast_additional_option - { - using type = dualcast_additional_options; - - static constexpr type none = type(); - static constexpr type destination_2_steering_tag_selector = 
dualcast_flag::destination_2_steering_tag_selector; - }; - - /** - * @todo - */ - struct dif_additional_option - { - using type = dif_additional_options; - - static constexpr type block_size_512 = dif_flag::block_size_512; /**< @todo */ - static constexpr type block_size_520 = dif_flag::block_size_520; /**< @todo */ - static constexpr type block_size_4096 = dif_flag::block_size_4096; /**< @todo */ - static constexpr type block_size_4104 = dif_flag::block_size_4104; /**< @todo */ - static constexpr type invert_crc_seed = dif_flag::invert_crc_seed; /**< @todo */ - static constexpr type invert_crc_result = dif_flag::invert_crc_result; /**< @todo */ - }; - - /** - * @todo - */ - struct dif_additional_src_option - { - using type = dif_additional_src_options; - - static constexpr type none = type(0); /**< @todo */ - static constexpr type enable_all_f_detect_error = dif_src_flag::enable_all_f_detect_error; /**< @todo */ - static constexpr type all_f_detect = dif_src_flag::all_f_detect; /**< @todo */ - static constexpr type app_tag_f_detect = dif_src_flag::app_tag_f_detect; /**< @todo */ - static constexpr type app_and_ref_tag_f_detect = dif_src_flag::app_and_ref_tag_f_detect; /**< @todo */ - static constexpr type incrementing_app_tag_type = dif_src_flag::incrementing_app_tag_type; /**< @todo */ - static constexpr type guard_check_disable = dif_src_flag::guard_check_disable; /**< @todo */ - static constexpr type ref_tag_check_disable = dif_src_flag::ref_tag_check_disable; /**< @todo */ - static constexpr type fixed_ref_tag_type = dif_src_flag::fixed_ref_tag_type; /**< @todo */ - }; - - /** - * @todo - */ - struct dif_additional_dst_option - { - using type = dif_additional_dst_options; - - static constexpr type none = type(0); /**< @todo */ - static constexpr type app_tag_pass_through = dif_dst_flag::app_tag_pass_through; /**< @todo */ - static constexpr type incrementing_app_tag_type = dif_dst_flag::incrementing_app_tag_type; /**< @todo */ - static constexpr type 
guard_field_pass_through = dif_dst_flag::guard_field_pass_through; /**< @todo */ - static constexpr type ref_tag_pass_through = dif_dst_flag::ref_tag_pass_through; /**< @todo */ - static constexpr type fixed_ref_tag_type = dif_dst_flag::fixed_ref_tag_type; /**< @todo */ - }; -} // namespace dml::ml - -#endif //DML_COMMON_OPTIONS_HPP diff --git a/include/dml/cpp/middle_layer/values.hpp b/include/dml/cpp/middle_layer/values.hpp deleted file mode 100644 index de17930..0000000 --- a/include/dml/cpp/middle_layer/values.hpp +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright 2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. 
- * - */ - -#ifndef DML_ML_VALUES_HPP -#define DML_ML_VALUES_HPP - -#include "types.hpp" - -namespace dml::ml -{ - enum class operation : operation_t - { - nop = 0x00, - batch = 0x01, - drain = 0x02, - memory_move = 0x03, - fill = 0x04, - compare = 0x05, - compare_pattern = 0x06, - create_delta = 0x07, - apply_delta = 0x08, - dualcast = 0x09, - crc = 0x10, - copy_crc = 0x11, - dif_check = 0x12, - dif_insert = 0x13, - dif_strip = 0x14, - dif_update = 0x15, - cache_flush = 0x20 - }; - - enum class flag : flags_t - { - fence = 0b0000000000000001, - block_on_fault = 0b0000000000000010, - completion_record_address_valid = 0b0000000000000100, - request_completion_record = 0b0000000000001000, - request_completion_interrupt = 0b0000000000010000, - completion_record_steering_tag_selector = 0b0000000000100000, - check_result = 0b0000000010000000, - cache_control = 0b0000000100000000, - address_1_tc_selector = 0b0000001000000000, - address_2_tc_selector = 0b0000010000000000, - address_3_tc_selector = 0b0000100000000000, - completion_record_tc_selector = 0b0001000000000000, - strict_ordering = 0b0010000000000000, - destination_readback = 0b0100000000000000, - destination_steering_tag_selector = 0b1000000000000000 - }; - - enum class drain_flag : operation_specific_flags_t - { - readback_address_1_valid = 0b0001, - readback_address_2_valid = 0b0010, - suppress_tc_a_implicit_readback = 0b0100, - suppress_tc_b_implicit_readback = 0b1000 - }; - - enum class dualcast_flag : operation_specific_flags_t - { - destination_2_steering_tag_selector = 0b1 - }; - - enum class crc_flag : operation_specific_flags_t - { - read_crc_seed = 0b001, - bypass_crc_inversion_and_reflection = 0b010, - bypass_data_reflection = 0b100, - }; - - enum class dif_flag : operation_specific_flags_t - { - block_size_512 = 0b0000, - block_size_520 = 0b0001, - block_size_4096 = 0b0010, - block_size_4104 = 0b0011, - invert_crc_seed = 0b0100, - invert_crc_result = 0b1000, - }; - - enum class dif_src_flag : 
operation_specific_flags_t - { - enable_all_f_detect_error = 0b00000001, - all_f_detect = 0b00000010, - app_tag_f_detect = 0b00000100, - app_and_ref_tag_f_detect = 0b00001000, - incrementing_app_tag_type = 0b00010000, - guard_check_disable = 0b00100000, - ref_tag_check_disable = 0b01000000, - fixed_ref_tag_type = 0b10000000 - }; - - enum class dif_dst_flag : operation_specific_flags_t - { - app_tag_pass_through = 0b00001000, - incrementing_app_tag_type = 0b00010000, - guard_field_pass_through = 0b00100000, - ref_tag_pass_through = 0b01000000, - fixed_ref_tag_type = 0b10000000 - }; - - enum class dif_status : dif_status_t - { - guard_mismatch = 0x01, - app_tag_mismatch = 0x02, - ref_tag_mismatch = 0x04, - all_f_detect_error = 0x08 - }; -} // namespace dml::ml - -#endif //DML_ML_VALUES_HPP diff --git a/include/dml/detail/common/flags.hpp b/include/dml/detail/common/flags.hpp new file mode 100644 index 0000000..c8b707b --- /dev/null +++ b/include/dml/detail/common/flags.hpp @@ -0,0 +1,216 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#ifndef DML_COMMON_FLAGS_HPP +#define DML_COMMON_FLAGS_HPP + +#include +#include +#include + +namespace dml::detail +{ + enum class flag : flags_t + { + fence = 0b0000000000000001, + block_on_fault = 0b0000000000000010, + completion_record_address_valid = 0b0000000000000100, + request_completion_record = 0b0000000000001000, + request_completion_interrupt = 0b0000000000010000, + completion_record_steering_tag_selector = 0b0000000000100000, + check_result = 0b0000000010000000, + cache_control = 0b0000000100000000, + address_1_tc_selector = 0b0000001000000000, + address_2_tc_selector = 0b0000010000000000, + address_3_tc_selector = 0b0000100000000000, + completion_record_tc_selector = 0b0001000000000000, + strict_ordering = 0b0010000000000000, + destination_readback = 0b0100000000000000, + destination_steering_tag_selector = 0b1000000000000000 + }; + + enum class nop_flag : std::underlying_type_t + { + fence = to_underlying(flag::fence) + }; + + enum class batch_flag : std::underlying_type_t + { + address_1_tc_selector = to_underlying(flag::address_1_tc_selector) + }; + + enum class drain_flag : std::underlying_type_t + { + address_1_tc_selector = to_underlying(flag::address_1_tc_selector), + address_2_tc_selector = to_underlying(flag::address_2_tc_selector) + }; + + enum class mem_move_flag : std::underlying_type_t + { + fence = to_underlying(flag::fence), + block_on_fault = to_underlying(flag::block_on_fault), + cache_control = to_underlying(flag::cache_control), + address_1_tc_selector = to_underlying(flag::address_1_tc_selector), + address_2_tc_selector = to_underlying(flag::address_2_tc_selector), + strict_ordering = to_underlying(flag::strict_ordering), + destination_readback = to_underlying(flag::destination_readback), + destination_steering_tag_selector = to_underlying(flag::destination_steering_tag_selector) + }; + + enum class fill_flag : std::underlying_type_t + { + fence = to_underlying(flag::fence), + block_on_fault = 
to_underlying(flag::block_on_fault), + cache_control = to_underlying(flag::cache_control), + address_2_tc_selector = to_underlying(flag::address_2_tc_selector), + strict_ordering = to_underlying(flag::strict_ordering), + destination_readback = to_underlying(flag::destination_readback), + destination_steering_tag_selector = to_underlying(flag::destination_steering_tag_selector) + }; + + enum class compare_flag : std::underlying_type_t + { + fence = to_underlying(flag::fence), + block_on_fault = to_underlying(flag::block_on_fault), + check_result = to_underlying(flag::check_result), + address_1_tc_selector = to_underlying(flag::address_1_tc_selector), + address_2_tc_selector = to_underlying(flag::address_2_tc_selector) + }; + + enum class compare_pattern_flag : std::underlying_type_t + { + fence = to_underlying(flag::fence), + block_on_fault = to_underlying(flag::block_on_fault), + check_result = to_underlying(flag::check_result), + address_1_tc_selector = to_underlying(flag::address_1_tc_selector) + }; + + enum class create_delta_flag : std::underlying_type_t + { + fence = to_underlying(flag::fence), + block_on_fault = to_underlying(flag::block_on_fault), + check_result = to_underlying(flag::check_result), + cache_control = to_underlying(flag::cache_control), + address_1_tc_selector = to_underlying(flag::address_1_tc_selector), + address_2_tc_selector = to_underlying(flag::address_2_tc_selector), + address_3_tc_selector = to_underlying(flag::address_3_tc_selector), + strict_ordering = to_underlying(flag::strict_ordering), + destination_readback = to_underlying(flag::destination_readback), + destination_steering_tag_selector = to_underlying(flag::destination_steering_tag_selector) + }; + + enum class apply_delta_flag : std::underlying_type_t + { + fence = to_underlying(flag::fence), + block_on_fault = to_underlying(flag::block_on_fault), + cache_control = to_underlying(flag::cache_control), + address_1_tc_selector = to_underlying(flag::address_1_tc_selector), + 
address_2_tc_selector = to_underlying(flag::address_2_tc_selector), + strict_ordering = to_underlying(flag::strict_ordering), + destination_readback = to_underlying(flag::destination_readback), + destination_steering_tag_selector = to_underlying(flag::destination_steering_tag_selector) + }; + + enum class dualcast_flag : std::underlying_type_t + { + fence = to_underlying(flag::fence), + block_on_fault = to_underlying(flag::block_on_fault), + cache_control = to_underlying(flag::cache_control), + address_1_tc_selector = to_underlying(flag::address_1_tc_selector), + address_2_tc_selector = to_underlying(flag::address_2_tc_selector), + address_3_tc_selector = to_underlying(flag::address_3_tc_selector), + strict_ordering = to_underlying(flag::strict_ordering), + destination_readback = to_underlying(flag::destination_readback), + destination_steering_tag_selector = to_underlying(flag::destination_steering_tag_selector) + }; + + enum class crc_flag : std::underlying_type_t + { + fence = to_underlying(flag::fence), + block_on_fault = to_underlying(flag::block_on_fault), + address_1_tc_selector = to_underlying(flag::address_1_tc_selector), + address_3_tc_selector = to_underlying(flag::address_3_tc_selector) + }; + + enum class copy_crc_flag : std::underlying_type_t + { + fence = to_underlying(flag::fence), + block_on_fault = to_underlying(flag::block_on_fault), + cache_control = to_underlying(flag::cache_control), + address_1_tc_selector = to_underlying(flag::address_1_tc_selector), + address_2_tc_selector = to_underlying(flag::address_2_tc_selector), + address_3_tc_selector = to_underlying(flag::address_3_tc_selector), + strict_ordering = to_underlying(flag::strict_ordering), + destination_readback = to_underlying(flag::destination_readback), + destination_steering_tag_selector = to_underlying(flag::destination_steering_tag_selector) + }; + + enum class dif_check_flag : std::underlying_type_t + { + fence = to_underlying(flag::fence), + block_on_fault = 
to_underlying(flag::block_on_fault), + address_1_tc_selector = to_underlying(flag::address_1_tc_selector) + }; + + enum class dif_insert_flag : std::underlying_type_t + { + fence = to_underlying(flag::fence), + block_on_fault = to_underlying(flag::block_on_fault), + cache_control = to_underlying(flag::cache_control), + address_1_tc_selector = to_underlying(flag::address_1_tc_selector), + address_2_tc_selector = to_underlying(flag::address_2_tc_selector), + strict_ordering = to_underlying(flag::strict_ordering), + destination_readback = to_underlying(flag::destination_readback), + destination_steering_tag_selector = to_underlying(flag::destination_steering_tag_selector) + }; + + enum class dif_strip_flag : std::underlying_type_t + { + fence = to_underlying(flag::fence), + block_on_fault = to_underlying(flag::block_on_fault), + cache_control = to_underlying(flag::cache_control), + address_1_tc_selector = to_underlying(flag::address_1_tc_selector), + address_2_tc_selector = to_underlying(flag::address_2_tc_selector), + strict_ordering = to_underlying(flag::strict_ordering), + destination_readback = to_underlying(flag::destination_readback), + destination_steering_tag_selector = to_underlying(flag::destination_steering_tag_selector) + }; + + enum class dif_update_flag : std::underlying_type_t + { + fence = to_underlying(flag::fence), + block_on_fault = to_underlying(flag::block_on_fault), + cache_control = to_underlying(flag::cache_control), + address_1_tc_selector = to_underlying(flag::address_1_tc_selector), + address_2_tc_selector = to_underlying(flag::address_2_tc_selector), + strict_ordering = to_underlying(flag::strict_ordering), + destination_readback = to_underlying(flag::destination_readback), + destination_steering_tag_selector = to_underlying(flag::destination_steering_tag_selector) + }; + + enum class cache_flush_flag : std::underlying_type_t + { + fence = to_underlying(flag::fence), + block_on_fault = to_underlying(flag::block_on_fault), + cache_control = 
to_underlying(flag::cache_control), + address_2_tc_selector = to_underlying(flag::address_2_tc_selector), + strict_ordering = to_underlying(flag::strict_ordering), + destination_readback = to_underlying(flag::destination_readback), + destination_steering_tag_selector = to_underlying(flag::destination_steering_tag_selector) + }; +} // namespace dml::detail + +#endif //DML_COMMON_FLAGS_HPP diff --git a/include/dml/detail/common/specific_flags.hpp b/include/dml/detail/common/specific_flags.hpp new file mode 100644 index 0000000..aa13862 --- /dev/null +++ b/include/dml/detail/common/specific_flags.hpp @@ -0,0 +1,90 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#ifndef DML_COMMON_SPECIFIC_FLAGS_HPP +#define DML_COMMON_SPECIFIC_FLAGS_HPP + +#include +#include + +namespace dml::detail +{ + enum class compare_result : result_t + { + equal = 0, + not_equal = 1 + }; + + enum class create_delta_result : result_t + { + equal = 1, + not_equal = 2, + overflow = 4 + }; + + enum class drain_specific_flag : operation_specific_flags_t + { + readback_address_1_valid = 0b0001, + readback_address_2_valid = 0b0010, + suppress_tc_a_implicit_readback = 0b0100, + suppress_tc_b_implicit_readback = 0b1000 + }; + + enum class dualcast_specific_flag : operation_specific_flags_t + { + destination_2_steering_tag_selector = 0b1 + }; + + enum class crc_specific_flag : operation_specific_flags_t + { + read_crc_seed = 0b001, + bypass_crc_inversion_and_reflection = 0b010, + bypass_data_reflection = 0b100 + }; + + enum class dif_specific_flag : operation_specific_flags_t + { + block_size_512 = 0b0000, + block_size_520 = 0b0001, + block_size_4096 = 0b0010, + block_size_4104 = 0b0011, + invert_crc_seed = 0b0100, + invert_crc_result = 0b1000 + }; + + enum class dif_source_flag : operation_specific_flags_t + { + enable_all_f_detect_error = 0b00000001, + all_f_detect = 0b00000010, + app_tag_f_detect = 0b00000100, + app_and_ref_tag_f_detect = 0b00001000, + incrementing_app_tag_type = 0b00010000, + guard_check_disable = 0b00100000, + ref_tag_check_disable = 0b01000000, + fixed_ref_tag_type = 0b10000000 + }; + + enum class dif_destination_flag : operation_specific_flags_t + { + app_tag_pass_through = 0b00001000, + incrementing_app_tag_type = 0b00010000, + guard_field_pass_through = 0b00100000, + ref_tag_pass_through = 0b01000000, + fixed_ref_tag_type = 0b10000000 + }; +} // namespace dml::detail + +#endif //DML_COMMON_SPECIFIC_FLAGS_HPP diff --git a/include/dml/cpp/middle_layer/status.hpp b/include/dml/detail/common/status.hpp similarity index 85% rename from include/dml/cpp/middle_layer/status.hpp rename to include/dml/detail/common/status.hpp 
index e3d4c8a..84b3332 100644 --- a/include/dml/cpp/middle_layer/status.hpp +++ b/include/dml/detail/common/status.hpp @@ -14,39 +14,40 @@ * */ -#ifndef DML_ML_STATUS_HPP -#define DML_ML_STATUS_HPP +#ifndef DML_DETAIL_COMMON_STATUS_HPP +#define DML_DETAIL_COMMON_STATUS_HPP -#include -#include +#include -namespace dml::ml +namespace dml::detail { enum class validation_status { success, - address_is_null, - size_is_null, - buffers_overlap, - address_is_misaligned, - delta_size_is_wrong, - delta_input_size_is_wrong, - delta_input_size_overflow, - delta_record_size_is_wrong, - dif_size_is_wrong, - dualcast_address_is_wrong, - batch_size_is_wrong, + null_address, + null_size, + large_size, + overlapping, + misalignment, + dif_strip_adjacent, + wrong_size, + wrong_delta_size, + wrong_dif_size, + wrong_dualcast_address, + wrong_batch_size, unsupported_operation }; enum class submission_status { success, + queue_busy, failure }; - enum class execution_status : std::uint8_t + enum class execution_status : status_t { + processing = 0x00u, /**< Descriptor is still being processed */ success = 0x01u, /**< Success. */ false_predicate_success = 0x02u, /**< Success with false predicate. */ page_fault_during_processing = 0x03u, /**< Partial completion due to page fault. 
*/ @@ -75,9 +76,7 @@ namespace dml::ml operation_readback_timeout = 0x20u, /**< The operation failed due to a hardware error other than a completion timeout or unsuccessful */ hardware_timeout = 0x21u, /**< Hardware error (completion timeout or unsuccessful completion status) */ address_translation_error = 0x22u, /**< An error occurred during address translation */ - - unexpected = std::numeric_limits::max() /**< Unexpected error code */ }; -} // namespace dml::ml +} // namespace dml::detail -#endif //DML_ML_STATUS_HPP +#endif //DML_DETAIL_COMMON_STATUS_HPP diff --git a/include/dml/cpp/middle_layer/types.hpp b/include/dml/detail/common/types.hpp similarity index 70% rename from include/dml/cpp/middle_layer/types.hpp rename to include/dml/detail/common/types.hpp index e654214..8ca7d69 100644 --- a/include/dml/cpp/middle_layer/types.hpp +++ b/include/dml/detail/common/types.hpp @@ -14,88 +14,46 @@ * */ -#ifndef DML_ML_TYPES_HPP -#define DML_ML_TYPES_HPP +#ifndef DML_DETAIL_COMMON_TYPES_HPP +#define DML_DETAIL_COMMON_TYPES_HPP +#include #include -#include "status.hpp" - -namespace dml::ml +namespace dml::detail { - /** - * @todo - */ using byte_t = std::uint8_t; - /** - * @todo - */ + using size_t = std::size_t; + + using transfer_size_t = std::uint32_t; + using operation_t = std::uint8_t; - /** - * @todo - */ using status_t = std::uint8_t; - /** - * @todo - */ using flags_t = std::uint16_t; - /** - * @todo - */ using operation_specific_flags_t = std::uint8_t; - /** - * @todo - */ using completion_interrupt_handle_t = std::uint16_t; - /** - * @todo - */ using transfer_size_t = std::uint32_t; - /** - * @todo - */ using address_t = uint64_t; - /** - * @todo - */ using pattern_t = uint64_t; - /** - * @todo - */ using result_t = uint8_t; - /** - * @todo - */ using crc_value_t = uint32_t; - /** - * @todo - */ using dif_flags_t = uint8_t; - /** - * @todo - */ using dif_status_t = uint8_t; - /** - * @todo - */ using dif_ref_tag_t = uint32_t; - /** - * @todo - */ using 
dif_app_tag_t = uint16_t; struct dif_parameters @@ -104,7 +62,6 @@ namespace dml::ml dif_app_tag_t app_tag_mask; dif_app_tag_t app_tag_seed; }; +} // namespace dml::detail -} // namespace dml::ml - -#endif //DML_ML_TYPES_HPP +#endif //DML_DETAIL_COMMON_TYPES_HPP diff --git a/include/dml/detail/common/utils/enum.hpp b/include/dml/detail/common/utils/enum.hpp new file mode 100644 index 0000000..21057ac --- /dev/null +++ b/include/dml/detail/common/utils/enum.hpp @@ -0,0 +1,37 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#ifndef DML_COMMON_UTILS_ENUM_HPP +#define DML_COMMON_UTILS_ENUM_HPP + +#include + +namespace dml::detail +{ + template + [[nodiscard]] constexpr auto to_underlying(const enum_t enum_value) noexcept + { + return static_cast>(enum_value); + } + + template + [[nodiscard]] constexpr bool intersects(const std::underlying_type_t lhs, const enum_t rhs) noexcept + { + return (lhs & to_underlying(rhs)) == to_underlying(rhs); + } +} // namespace dml::detail + +#endif //DML_COMMON_UTILS_ENUM_HPP diff --git a/include/dml/detail/ml/execution_path.hpp b/include/dml/detail/ml/execution_path.hpp new file mode 100644 index 0000000..29673e6 --- /dev/null +++ b/include/dml/detail/ml/execution_path.hpp @@ -0,0 +1,37 @@ +/* + * Copyright 2021 Intel Corporation. 
+ * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#ifndef DML_ML_EXECUTION_PATH +#define DML_ML_EXECUTION_PATH + +#include +#include +#include + +namespace dml::detail::ml::execution_path +{ + struct software + { + [[nodiscard]] static submission_status submit(operation& op, result& res) noexcept; + }; + + struct hardware + { + [[nodiscard]] static submission_status submit(operation& op, result& res) noexcept; + }; +} // namespace dml::detail::ml::execution_path + +#endif //DML_ML_EXECUTION_PATH diff --git a/include/dml/detail/ml/operation.hpp b/include/dml/detail/ml/operation.hpp new file mode 100644 index 0000000..527c016 --- /dev/null +++ b/include/dml/detail/ml/operation.hpp @@ -0,0 +1,128 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#ifndef DML_ML_OPERATION_HPP +#define DML_ML_OPERATION_HPP + +#include +#include + +namespace dml::detail::ml +{ + struct alignas(64u) operation + { + byte_t bytes[64u]; + }; + + [[nodiscard]] operation make_nop_operation(nop_options options) noexcept; + + [[nodiscard]] operation make_drain_operation(address_t readback_address_1, + address_t readback_address_2, + drain_options options, + drain_specific_options specific_options) noexcept; + + [[nodiscard]] operation make_mem_move_operation(const byte_t *src, + byte_t *dst, + transfer_size_t size, + mem_move_options options) noexcept; + + [[nodiscard]] operation make_fill_operation(uint64_t pattern, byte_t *dst, transfer_size_t size, fill_options options) noexcept; + + [[nodiscard]] operation make_dualcast_operation(const byte_t *src, + byte_t *dst1, + byte_t *dst2, + transfer_size_t size, + dualcast_options options, + dualcast_specific_options specific_options) noexcept; + + [[nodiscard]] operation make_compare_operation(const byte_t *src1, + const byte_t *src2, + transfer_size_t size, + compare_options options, + compare_result expected_result) noexcept; + + [[nodiscard]] operation make_compare_pattern_operation(uint64_t pattern, + const byte_t *src, + transfer_size_t size, + compare_pattern_options options, + compare_result expected_result) noexcept; + + [[nodiscard]] operation make_crc_operation(const byte_t *src, + transfer_size_t size, + crc_value_t crc_seed, + crc_options options, + crc_specific_options specific_options) noexcept; + + [[nodiscard]] operation make_copy_crc_operation(const byte_t *src, + byte_t *dst, + transfer_size_t size, + crc_value_t crc_seed, + copy_crc_options options, + copy_crc_specific_options specific_options) noexcept; + + [[nodiscard]] operation make_create_delta_operation(const byte_t *src1, + const byte_t *src2, + transfer_size_t size, + byte_t *delta_record, + transfer_size_t delta_max_size, + create_delta_options options, + create_delta_result expected_result) 
noexcept; + + [[nodiscard]] operation make_apply_delta_operation(const byte_t *delta_record, + transfer_size_t delta_size, + byte_t *dst, + transfer_size_t size, + apply_delta_options options) noexcept; + + [[nodiscard]] operation make_cache_flush_operation(byte_t *dst, transfer_size_t size, cache_flush_options options) noexcept; + + [[nodiscard]] operation make_dif_check_operation(const byte_t *src, + transfer_size_t transfer_size, + dif_parameters src_parameters, + dif_check_options options, + dif_specific_options specific_options, + dif_source_options source_options) noexcept; + + [[nodiscard]] operation make_dif_insert_operation(const byte_t *src, + byte_t *dst, + transfer_size_t transfer_size, + dif_parameters dst_parameters, + dif_insert_options options, + dif_specific_options specific_options, + dif_destination_options destination_options) noexcept; + + [[nodiscard]] operation make_dif_strip_operation(const byte_t *src, + byte_t *dst, + transfer_size_t transfer_size, + dif_parameters src_parameters, + dif_strip_options options, + dif_specific_options specific_options, + dif_source_options source_options) noexcept; + + [[nodiscard]] operation make_dif_update_operation(const byte_t *src, + byte_t *dst, + transfer_size_t transfer_size, + dif_parameters src_parameters, + dif_parameters dst_parameters, + dif_update_options options, + dif_specific_options specific_options, + dif_source_options source_options, + dif_destination_options destination_options) noexcept; + + [[nodiscard]] operation make_batch_operation(const operation *src, transfer_size_t length, batch_options options) noexcept; +} // namespace dml::detail::ml + +#endif //DML_ML_OPERATION_HPP diff --git a/include/dml/detail/ml/options.hpp b/include/dml/detail/ml/options.hpp new file mode 100644 index 0000000..5558b6f --- /dev/null +++ b/include/dml/detail/ml/options.hpp @@ -0,0 +1,105 @@ +/* + * Copyright 2021 Intel Corporation. 
+ * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#ifndef DML_ML_OPTIONS_HPP +#define DML_ML_OPTIONS_HPP + +#include +#include +#include + +namespace dml::detail::ml +{ + template + class options final + { + using value_t = std::underlying_type_t; + + public: + constexpr options() noexcept = default; + + template + [[nodiscard]] constexpr auto enable() const noexcept + { + return options(bit_mask_ | to_underlying(option)); + } + + constexpr explicit operator value_t() const noexcept + { + return bit_mask_; + } + + // TODO: Should be private, but job API batch implementation is clunky + + public: + constexpr explicit options(const value_t bit_mask) noexcept: bit_mask_(bit_mask) + { + } + + private: + value_t bit_mask_{}; + }; + + using nop_options = options; + + using batch_options = options; + + using drain_options = options; + + using mem_move_options = options; + + using fill_options = options; + + using compare_options = options; + + using compare_pattern_options = options; + + using create_delta_options = options; + + using apply_delta_options = options; + + using dualcast_options = options; + + using crc_options = options; + + using copy_crc_options = options; + + using dif_check_options = options; + + using dif_insert_options = options; + + using dif_strip_options = options; + + using dif_update_options = options; + + using cache_flush_options = options; + + using drain_specific_options = options; + + using dualcast_specific_options 
= options; + + using crc_specific_options = options; + + using copy_crc_specific_options = options; + + using dif_specific_options = options; + + using dif_source_options = options; + + using dif_destination_options = options; +} // namespace dml::detail::ml + +#endif //DML_ML_OPTIONS_HPP diff --git a/include/dml/detail/ml/result.hpp b/include/dml/detail/ml/result.hpp new file mode 100644 index 0000000..463580a --- /dev/null +++ b/include/dml/detail/ml/result.hpp @@ -0,0 +1,51 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#ifndef DML_ML_RESULT_HPP +#define DML_ML_RESULT_HPP + +#include +#include +#include + +namespace dml::detail::ml +{ + struct alignas(32) result + { + byte_t bytes[32]; + }; + + void bind(operation& op, result& res) noexcept; + + void wait(result& res) noexcept; + + [[nodiscard]] detail::execution_status get_status(result& res) noexcept; + + [[nodiscard]] detail::result_t get_result(result& res) noexcept; + + [[nodiscard]] detail::transfer_size_t get_bytes_completed(result& res) noexcept; + + [[nodiscard]] detail::transfer_size_t get_delta_record_size(result& res) noexcept; + + [[nodiscard]] detail::transfer_size_t get_crc_value(result& res) noexcept; + + [[nodiscard]] inline bool is_finished(const volatile result& res) noexcept + { + return 0 != res.bytes[0]; + } +} // namespace dml::detail::ml + +#endif //DML_ML_RESULT_HPP diff --git a/sources/cores/include/core_api.h b/include/dml/detail/ml/validation.hpp similarity index 58% rename from sources/cores/include/core_api.h rename to include/dml/detail/ml/validation.hpp index 3fa01e8..4fb485c 100644 --- a/sources/cores/include/core_api.h +++ b/include/dml/detail/ml/validation.hpp @@ -1,5 +1,5 @@ /* - * Copyright 2020-2021 Intel Corporation. + * Copyright 2021 Intel Corporation. 
* * This software and the related documents are Intel copyrighted materials, * and your use of them is governed by the express license under which they @@ -14,27 +14,15 @@ * */ -/** - * @file - * @brief - * @date 2/10/2020 - * - * @defgroup core_public_features Public Functions - * @ingroup core_public - * @{ - * - * @brief Public Intel DML core features - * - */ - -#ifndef KERNEL_API_H__ -#define KERNEL_API_H__ +#ifndef DML_ML_VALIDATION +#define DML_ML_VALIDATION -#include "core_compare.h" -#include "core_memory.h" -#include "core_cpu_features.h" -#include "core_hash_functions.h" +#include +#include -#endif //KERNEL_API_H__ +namespace dml::detail::ml +{ + [[nodiscard]] validation_status validate(operation& op) noexcept; +} // namespace dml::detail::ml -/** @} */ +#endif //DML_ML_VALIDATION diff --git a/include/dml/dml.h b/include/dml/dml.h index c2e30cb..c8987e2 100644 --- a/include/dml/dml.h +++ b/include/dml/dml.h @@ -94,8 +94,8 @@ dml_status_t dml_get_job_size(dml_path_t dml_path, uint32_t *job_size_ptr); * - @ref DML_STATUS_OK * - @ref DML_STATUS_PATH_ERROR * - @ref DML_STATUS_NULL_POINTER_ERROR - * - @ref DML_STATUS_DRIVER_NOT_FOUND - * - @ref DML_STATUS_HARDWARE_CONNECTION_ERROR + * - @ref DML_STATUS_LIBACCEL_NOT_FOUND + * - @ref DML_STATUS_LIBACCEL_ERROR * */ dml_status_t dml_init_job(dml_path_t path, dml_job_t *dml_job_ptr); @@ -109,7 +109,6 @@ dml_status_t dml_init_job(dml_path_t path, dml_job_t *dml_job_ptr); * * @return The follow statuses; * - @ref DML_STATUS_OK; - * - @ref DML_STATUS_HARDWARE_DISCONNECTION_ERROR * */ dml_status_t dml_finalize_job(dml_job_t *dml_job_ptr); diff --git a/include/dml/dml.hpp b/include/dml/dml.hpp index 635ca1b..f81ec22 100644 --- a/include/dml/dml.hpp +++ b/include/dml/dml.hpp @@ -55,12 +55,12 @@ namespace dml { } -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include #endif //DML_DML_HPP diff --git a/include/dml/dmldefs.h 
b/include/dml/dmldefs.h index 7dabaa4..00a68d5 100644 --- a/include/dml/dmldefs.h +++ b/include/dml/dmldefs.h @@ -269,8 +269,6 @@ typedef enum * @brief All possible return values of the Intel DML Library functions. * * @note All general statuses are described here. - * @note All driver errors described in the @ref DRIVER_STATUSES - * @note All device errors described in the @ref hw_status_t enumeration */ typedef enum { @@ -307,20 +305,12 @@ typedef enum DML_STATUS_BATCH_SIZE_ERROR = 29u, /**< The desired batch size is bigger than the possible one */ DML_STATUS_DRAIN_PAGE_FAULT_ERROR = 30u, /**< A page fault occured while translating a Readback Addres in a Drain descriptor */ DML_STATUS_UNKNOWN_CACHE_SIZE_ERROR = 31u, /**< Max cache size can't be calculated */ + DML_STATUS_DIF_STRIP_ADJACENT_ERROR = 32u, /**< SRC Address for DIF Strip operation should be greater than (DST Address + SRC Size) */ - // Initialisation Errors - DML_STATUS_DRIVER_NOT_FOUND = (DML_BASE_DRIVER_ERROR + 0u), /**< Unable to initialize job because hardware driver was not found */ - DML_STATUS_DRIVER_ERROR = (DML_BASE_DRIVER_ERROR + 1u), /**< Unable to initialize job because hardware driver API is incompatible */ - DML_STATUS_HARDWARE_CONNECTION_ERROR = (DML_BASE_DRIVER_ERROR + 2u), /**< Cannot connect to hardware to complete initialization */ - DML_STATUS_HARDWARE_DISCONNECTION_ERROR = (DML_BASE_DRIVER_ERROR + 3u), /**< Cannot disconnect hardware */ - DML_STATUS_QUEUE_IS_BUSY = (DML_BASE_DRIVER_ERROR + 4u), /**< Descriptor can't be submitted into filled work queue*/ - DML_STATUS_INSTANCE_NOT_FOUND = (DML_BASE_DRIVER_ERROR + 5u), /**< Accelerator instance can not be found */ - DML_STATUS_VERSION_DETECTION_ERROR = (DML_BASE_DRIVER_ERROR + 6u), /**< Accelerator version can not be determined */ - DML_STATUS_DEVICES_NOT_AVAILABLE = (DML_BASE_DRIVER_ERROR + 7u), /**< Enabled devices are not found */ - DML_STATUS_WORK_QUEUES_NOT_AVAILABLE = (DML_BASE_DRIVER_ERROR + 8u), /**< Enabled work queues are 
not found */ - DML_STATUS_INCORRECT_WORK_QUEUE_ID = (DML_BASE_DRIVER_ERROR + 9u), /**< Work Queue ID is wrong */ - DML_STATUS_WORK_QUEUE_CONNECTION_ERROR = (DML_BASE_DRIVER_ERROR + 10u), /**< Work Queue can not be connected */ - DML_STATUS_PORTAL_CREATION_ERROR = (DML_BASE_DRIVER_ERROR + 11u), /**< Portal can not be created */ + // Initialization Errors + DML_STATUS_LIBACCEL_NOT_FOUND = (DML_BASE_DRIVER_ERROR + 0u), /**< Unable to initialize job because hardware driver was not found */ + DML_STATUS_LIBACCEL_ERROR = (DML_BASE_DRIVER_ERROR + 1u), /**< Unable to initialize job because hardware driver API is incompatible */ + DML_STATUS_WORK_QUEUES_NOT_AVAILABLE = (DML_BASE_DRIVER_ERROR + 2u), /**< Enabled work queues are not found */ } dml_status_t; diff --git a/include/dml/cpp/data_view.hpp b/include/dml/hl/data_view.hpp similarity index 90% rename from include/dml/cpp/data_view.hpp rename to include/dml/hl/data_view.hpp index d37fddf..38187df 100644 --- a/include/dml/cpp/data_view.hpp +++ b/include/dml/hl/data_view.hpp @@ -23,7 +23,7 @@ */ #include -#include +#include #include #include @@ -46,7 +46,9 @@ namespace dml * @param[in] data_ptr Pointer to the data for the view * @param[in] size Byte size of the data for the view */ - data_view(byte_t *const data_ptr, const size_t size) noexcept: data_ptr_(data_ptr), size_(size) { } + data_view(byte_t *const data_ptr, const size_t size) noexcept: data_ptr_(data_ptr), size_(size) + { + } /** * @brief Explicitly deleted conversion constructor from @ref const_data_view @@ -84,14 +86,20 @@ namespace dml * * @return Pointer to the viewed data */ - [[nodiscard]] auto data() noexcept { return data_ptr_; } + [[nodiscard]] auto data() noexcept + { + return data_ptr_; + } /** * @brief Getter for the size of the viewed data * * @return Size of the viewed data */ - [[nodiscard]] auto size() const noexcept { return size_; } + [[nodiscard]] auto size() const noexcept + { + return size_; + } private: byte_t *const data_ptr_; /**< Pointer 
to the viewed data */ @@ -113,7 +121,9 @@ namespace dml * @param[in] data_ptr Pointer to the data for the view * @param[in] size Byte size of the data for the view */ - const_data_view(const byte_t *const data_ptr, const size_t size) noexcept: data_ptr_(data_ptr), size_(size) { } + const_data_view(const byte_t *const data_ptr, const size_t size) noexcept: data_ptr_(data_ptr), size_(size) + { + } /** * @brief Constructor @ref data_view @@ -122,7 +132,9 @@ namespace dml * * @param[in] other Instance of @ref data_view */ - const_data_view(data_view other) noexcept: data_ptr_(other.data()), size_(other.size()) { } + const_data_view(data_view other) noexcept: data_ptr_(other.data()), size_(other.size()) + { + } /** * @brief Default copy constructor @@ -153,14 +165,20 @@ namespace dml * * @return Pointer to the viewed data */ - [[nodiscard]] auto data() const noexcept { return data_ptr_; } + [[nodiscard]] auto data() const noexcept + { + return data_ptr_; + } /** * @brief Getter for the size of the viewed data * * @return Size of the viewed data */ - [[nodiscard]] auto size() const noexcept { return size_; } + [[nodiscard]] auto size() const noexcept + { + return size_; + } private: const byte_t *const data_ptr_; /**< Pointer to the viewed immutable data */ @@ -217,12 +235,10 @@ namespace dml * @return Constructed @ref data_view */ template - inline auto make_view(iterator_t begin, - iterator_t end) noexcept(noexcept(&*begin) &&noexcept(std::distance(begin, end))) + inline auto make_view(iterator_t begin, iterator_t end) noexcept(noexcept(&*begin) &&noexcept(std::distance(begin, end))) { using iterator_category = typename std::iterator_traits::iterator_category; - static_assert(std::is_same_v, - "Only random access iterators are supported."); + static_assert(std::is_same_v, "Only random access iterators are supported."); return make_view(&*begin, std::distance(begin, end)); } diff --git a/include/dml/cpp/detail/buffer.hpp b/include/dml/hl/detail/buffer.hpp similarity 
index 90% rename from include/dml/cpp/detail/buffer.hpp rename to include/dml/hl/detail/buffer.hpp index d7c5ed0..07136ce 100644 --- a/include/dml/cpp/detail/buffer.hpp +++ b/include/dml/hl/detail/buffer.hpp @@ -22,8 +22,7 @@ #ifndef DML_DETAIL_BUFFER_HPP #define DML_DETAIL_BUFFER_HPP -#include - +#include #include namespace dml::detail @@ -137,7 +136,10 @@ namespace dml::detail * * @return Reference to the element */ - [[nodiscard]] auto &get() noexcept { return *aligned_data_; } + [[nodiscard]] auto &get() noexcept + { + return *aligned_data_; + } /** * @brief Returns reference to the element (const version) @@ -146,11 +148,14 @@ namespace dml::detail * * @return Const reference to the element */ - [[nodiscard]] const auto &get() const noexcept { return *aligned_data_; } + [[nodiscard]] const auto &get() const noexcept + { + return *aligned_data_; + } private: - elem_t * data_{}; /**< Pointer to allocated memory */ - elem_t * aligned_data_{}; /**< Pointer to the element */ + elem_t *data_{}; /**< Pointer to allocated memory */ + elem_t *aligned_data_{}; /**< Pointer to the element */ own_alloc_t allocator_{}; /**< Allocator instance */ }; @@ -190,8 +195,7 @@ namespace dml::detail * @param count Number of elements * @param allocator Instance of allocator */ - buffer_array(size_t count, allocator_t allocator): - count_(count), data_(nullptr), aligned_data_(nullptr), allocator_(allocator) + buffer_array(size_t count, allocator_t allocator): count_(count), data_(nullptr), aligned_data_(nullptr), allocator_(allocator) { data_ = own_traits_t::allocate(allocator_, (count_ * memory_size) + alignment); @@ -268,7 +272,10 @@ namespace dml::detail * * @return Number of elements */ - [[nodiscard]] size_t get_count() const noexcept { return count_; } + [[nodiscard]] size_t get_count() const noexcept + { + return count_; + } /** * @brief Returns reference to the element by index @@ -277,7 +284,10 @@ namespace dml::detail * * @return Reference to the element */ - [[nodiscard]] 
auto &get(size_t index) noexcept { return aligned_data_[index]; } + [[nodiscard]] auto &get(size_t index) noexcept + { + return aligned_data_[index]; + } /** * @brief Returns reference to the element by index (const version) @@ -286,12 +296,15 @@ namespace dml::detail * * @return Const reference to the element */ - [[nodiscard]] const auto &get(size_t index) const noexcept { return aligned_data_[index]; } + [[nodiscard]] const auto &get(size_t index) const noexcept + { + return aligned_data_[index]; + } private: size_t count_{}; /**< Number of elements in the array */ - elem_t * data_{}; /**< Pointer to the allocated memory */ - elem_t * aligned_data_{}; /**< Pointer to the array */ + elem_t *data_{}; /**< Pointer to the allocated memory */ + elem_t *aligned_data_{}; /**< Pointer to the array */ own_alloc_t allocator_{}; /**< Allocator instance */ }; } // namespace dml::detail diff --git a/include/dml/cpp/detail/execute.hpp b/include/dml/hl/detail/execute.hpp similarity index 81% rename from include/dml/cpp/detail/execute.hpp rename to include/dml/hl/detail/execute.hpp index f85b508..3296dc3 100644 --- a/include/dml/cpp/detail/execute.hpp +++ b/include/dml/hl/detail/execute.hpp @@ -22,12 +22,12 @@ #ifndef DML_DETAIL_EXECUTE_HPP #define DML_DETAIL_EXECUTE_HPP -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include #include "make_result.hpp" @@ -56,25 +56,23 @@ namespace dml::detail auto descriptor = make_operation(); - if (auto status = ml::validate(descriptor); status != ml::validation_status::success) + if (auto status = ml::validate(descriptor); status != detail::validation_status::success) { return typename operation::result_type{ detail::to_own(status) }; } - auto record = ml::completion_record(); + auto record = detail::ml::result(); // If execution_path::run returns status code auto status = execution_path()(descriptor, record); - if (status != ml::submission_status::success) + if (status != 
detail::submission_status::success) { return typename operation::result_type{ status_code::error }; } -#ifdef DML_HW if constexpr (std::is_same_v) { - ml::wait(record); + detail::ml::wait(record); } -#endif return make_result(record); } diff --git a/include/dml/cpp/detail/handler.hpp b/include/dml/hl/detail/handler.hpp similarity index 90% rename from include/dml/cpp/detail/handler.hpp rename to include/dml/hl/detail/handler.hpp index b5dea06..2479c4c 100644 --- a/include/dml/cpp/detail/handler.hpp +++ b/include/dml/hl/detail/handler.hpp @@ -22,7 +22,7 @@ #ifndef DML_DETAIL_HANDLER_HPP #define DML_DETAIL_HANDLER_HPP -#include +#include namespace dml { @@ -41,7 +41,7 @@ namespace dml * @return Middle Layer result object */ template - ml::completion_record &get_ml_result(handler &h) noexcept + detail::ml::result &get_ml_result(handler &h) noexcept { return h.record_.get(); } diff --git a/include/dml/hl/detail/make_result.hpp b/include/dml/hl/detail/make_result.hpp new file mode 100644 index 0000000..46ef6ae --- /dev/null +++ b/include/dml/hl/detail/make_result.hpp @@ -0,0 +1,103 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +/** + * @date 05/20/2021 + * @brief Contains internal execute implementation + */ + +#ifndef DML_DETAIL_MAKE_RESULT_HPP +#define DML_DETAIL_MAKE_RESULT_HPP + +#include +#include +#include + +namespace dml::detail +{ + /** + * @todo + */ + inline auto to_own(detail::execution_status status) noexcept + { + switch (status) + { + case detail::execution_status::success: + return status_code::ok; + case detail::execution_status::false_predicate_success: + return status_code::false_predicate; + default: + // Anything else is considered an error temporarily + return status_code::error; + } + } + + /** + * @todo + */ + template + auto make_result(detail::ml::result& result) noexcept + { + if constexpr (std::is_same_v) + { + return mem_move_result{ to_own(detail::ml::get_status(result)) }; + } + if constexpr (std::is_same_v) + { + return mem_copy_result{ to_own(detail::ml::get_status(result)) }; + } + else if constexpr (std::is_same_v) + { + return fill_result{ to_own(detail::ml::get_status(result)) }; + } + else if constexpr (std::is_same_v) + { + return dml::compare_result{ to_own(detail::ml::get_status(result)), + static_cast(detail::ml::get_result(result)), + detail::ml::get_bytes_completed(result) }; + } + else if constexpr (std::is_same_v) + { + return dml::create_delta_result{ to_own(detail::ml::get_status(result)), + static_cast(detail::ml::get_result(result)), + detail::ml::get_bytes_completed(result), + detail::ml::get_delta_record_size(result) }; + } + else if constexpr (std::is_same_v) + { + return apply_delta_result{ to_own(detail::ml::get_status(result)) }; + } + else if constexpr (std::is_same_v) + { + return dualcast_result{ to_own(detail::ml::get_status(result)) }; + } + else if constexpr (std::is_same_v) + { + return crc_result{ to_own(detail::ml::get_status(result)), detail::ml::get_crc_value(result) }; + } + else if constexpr (std::is_same_v) + { + return cache_flush_result{ to_own(detail::ml::get_status(result)) }; + } + else if constexpr 
(std::is_same_v) + { + return batch_result{ to_own(detail::ml::get_status(result)), detail::ml::get_bytes_completed(result) }; + } + } + +} // namespace dml::detail + +#endif //DML_DETAIL_MAKE_RESULT_HPP diff --git a/include/dml/cpp/detail/submit.hpp b/include/dml/hl/detail/submit.hpp similarity index 82% rename from include/dml/cpp/detail/submit.hpp rename to include/dml/hl/detail/submit.hpp index 24b14a4..c32c54c 100644 --- a/include/dml/cpp/detail/submit.hpp +++ b/include/dml/hl/detail/submit.hpp @@ -22,10 +22,10 @@ #ifndef DML_DETAIL_SUBMIT_HPP #define DML_DETAIL_SUBMIT_HPP -#include -#include -#include -#include +#include +#include +#include +#include namespace dml::detail { @@ -59,7 +59,7 @@ namespace dml::detail auto operation = make_operation(); - auto op_handler = executor.template make_handler(detail::to_own(ml::validate(operation))); + auto op_handler = executor.template make_handler(detail::to_own(detail::ml::validate(operation))); if (!op_handler.valid()) { @@ -67,7 +67,6 @@ namespace dml::detail } // If execution_path{} returns status code (hw path) -#ifdef DML_HW if constexpr (std::is_same_v) { auto& result = detail::get_ml_result(op_handler); @@ -77,27 +76,31 @@ namespace dml::detail return execution_path{}(operation, result); }); - if (status != ml::submission_status::success) + if (status != detail::submission_status::success) { - return executor.template make_handler(status_code::error); + if(status == detail::submission_status::queue_busy) + { + return executor.template make_handler(status_code::queue_busy); + } + else + { + return executor.template make_handler(status_code::error); + } } } else { -#endif auto& result = detail::get_ml_result(op_handler); executor.execute( [operation, &result]() mutable { auto status = execution_path{}(operation, result); - if (status != ml::submission_status::success) + if (status != detail::submission_status::success) { result.bytes[0] = 0xFF; // Temporary } }); -#ifdef DML_HW } -#endif return op_handler; } 
diff --git a/include/dml/cpp/detail/utils.hpp b/include/dml/hl/detail/utils.hpp similarity index 63% rename from include/dml/cpp/detail/utils.hpp rename to include/dml/hl/detail/utils.hpp index 9cd4106..39c3d07 100644 --- a/include/dml/cpp/detail/utils.hpp +++ b/include/dml/hl/detail/utils.hpp @@ -22,8 +22,8 @@ #ifndef DML_DETAIL_UTILS_HPP #define DML_DETAIL_UTILS_HPP -#include -#include +#include +#include /** * @brief Checks whether two sizes are the same @@ -39,6 +39,11 @@ namespace dml::detail */ struct always_success { + /** + * @brief Functor which always return success status + * + * @return status_code::ok + */ auto operator()() const noexcept { return status_code::ok; @@ -46,39 +51,37 @@ namespace dml::detail }; /** - * @brief Converts Middle Layer's @ref dml::ml::validation_status to @ref dml::status_code + * @brief Converts Middle Layer status code to @ref dml::status_code * * @param status Status for conversion * - * @return dml::status_code that represents dml::ml::validation_status + * @return dml::status_code */ - [[nodiscard]] static constexpr auto to_own(ml::validation_status status) noexcept + [[nodiscard]] static constexpr auto to_own(detail::validation_status status) noexcept { switch (status) { - case ml::validation_status::success: + case detail::validation_status::success: return status_code::ok; - case ml::validation_status::address_is_null: + case detail::validation_status::null_address: return status_code::nullptr_error; - case ml::validation_status::size_is_null: + case detail::validation_status::null_size: return status_code::bad_size; - case ml::validation_status::delta_size_is_wrong: + case detail::validation_status::wrong_size: return status_code::bad_size; - case ml::validation_status::delta_input_size_is_wrong: + case detail::validation_status::large_size: return status_code::bad_size; - case ml::validation_status::delta_input_size_overflow: - return status_code::bad_size; - case ml::validation_status::buffers_overlap: + case 
detail::validation_status::overlapping: return status_code::buffers_overlapping; - case ml::validation_status::address_is_misaligned: + case detail::validation_status::misalignment: return status_code::bad_alignment; - case ml::validation_status::delta_record_size_is_wrong: + case detail::validation_status::wrong_delta_size: return status_code::delta_bad_size; - case ml::validation_status::dualcast_address_is_wrong: + case detail::validation_status::wrong_dualcast_address: return status_code::dualcast_bad_padding; - case ml::validation_status::batch_size_is_wrong: + case detail::validation_status::wrong_batch_size: return status_code::bad_length; - case ml::validation_status::unsupported_operation: + case detail::validation_status::unsupported_operation: return status_code::unsupported_operation; default: return status_code::error; diff --git a/include/dml/cpp/execute.hpp b/include/dml/hl/execute.hpp similarity index 81% rename from include/dml/cpp/execute.hpp rename to include/dml/hl/execute.hpp index 79c8e56..d549657 100644 --- a/include/dml/cpp/execute.hpp +++ b/include/dml/hl/execute.hpp @@ -22,12 +22,12 @@ #ifndef DML_EXECUTE_HPP #define DML_EXECUTE_HPP -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include namespace dml { @@ -63,7 +63,7 @@ namespace dml return detail::execute( [&]() { - return ml::make_batch_descriptor(seq.data(), seq.length(), operation.get_options()); + return detail::ml::make_batch_operation(seq.data(), seq.length(), operation.get_options()); }); } @@ -94,7 +94,7 @@ namespace dml return detail::execute( [&]() { - return ml::make_mem_move_descriptor(src_view.data(), dst_view.data(), src_view.size(), operation.get_options()); + return detail::ml::make_mem_move_operation(src_view.data(), dst_view.data(), src_view.size(), operation.get_options()); }, [&]() { @@ -130,7 +130,7 @@ namespace dml return detail::execute( [&]() { - return ml::make_mem_move_descriptor(src_view.data(), 
dst_view.data(), src_view.size(), operation.get_options()); + return detail::ml::make_mem_move_operation(src_view.data(), dst_view.data(), src_view.size(), operation.get_options()); }, [&]() { @@ -166,7 +166,7 @@ namespace dml return detail::execute( [&]() { - return ml::make_fill_descriptor(pattern, dst_view.data(), dst_view.size(), operation.get_options()); + return detail::ml::make_fill_operation(pattern, dst_view.data(), dst_view.size(), operation.get_options()); }); } @@ -198,12 +198,12 @@ namespace dml return detail::execute( [&]() { - return ml::make_dualcast_descriptor(src_view.data(), - dst1_view.data(), - dst2_view.data(), - src_view.size(), - operation.get_options(), - operation.get_additional_options()); + return detail::ml::make_dualcast_operation(src_view.data(), + dst1_view.data(), + dst2_view.data(), + src_view.size(), + operation.get_options(), + operation.get_specific_options()); }, [&]() { @@ -240,11 +240,11 @@ namespace dml return detail::execute( [&]() { - return ml::make_compare_descriptor(src1_view.data(), - src2_view.data(), - src1_view.size(), - operation.get_options(), - operation.get_expected_result()); + return detail::ml::make_compare_operation(src1_view.data(), + src2_view.data(), + src1_view.size(), + operation.get_options(), + operation.get_expected_result()); }, [&]() { @@ -280,11 +280,11 @@ namespace dml return detail::execute( [&]() { - return ml::make_compare_pattern_descriptor(pattern, - src_view.data(), - src_view.size(), - operation.get_options(), - operation.get_expected_result()); + return detail::ml::make_compare_pattern_operation(pattern, + src_view.data(), + src_view.size(), + operation.get_options(), + operation.get_expected_result()); }); } @@ -316,13 +316,13 @@ namespace dml return detail::execute( [&]() { - return ml::make_create_delta_descriptor(src1_view.data(), - src2_view.data(), - src1_view.size(), - delta_view.data(), - delta_view.size(), - operation.get_options(), - operation.get_expected_result()); + return 
detail::ml::make_create_delta_operation(src1_view.data(), + src2_view.data(), + src1_view.size(), + delta_view.data(), + delta_view.size(), + operation.get_options(), + operation.get_expected_result()); }, [&]() { @@ -359,11 +359,11 @@ namespace dml return detail::execute( [&]() { - return ml::make_apply_delta_descriptor(delta_view.data(), - delta_result.delta_record_size, - dst_view.data(), - dst_view.size(), - operation.get_options()); + return detail::ml::make_apply_delta_operation(delta_view.data(), + delta_result.delta_record_size, + dst_view.data(), + dst_view.size(), + operation.get_options()); }, [&]() { @@ -402,11 +402,11 @@ namespace dml return detail::execute( [&]() { - return ml::make_crc_descriptor(src_view.data(), - src_view.size(), - crc_seed, - operation.get_options(), - operation.get_additional_options()); + return detail::ml::make_crc_operation(src_view.data(), + src_view.size(), + crc_seed, + operation.get_options(), + operation.get_specific_options()); }); } @@ -438,12 +438,12 @@ namespace dml return detail::execute( [&]() { - return ml::make_copy_crc_descriptor(src_view.data(), - dst_view.data(), - src_view.size(), - crc_seed, - operation.get_options(), - operation.get_additional_options()); + return detail::ml::make_copy_crc_operation(src_view.data(), + dst_view.data(), + src_view.size(), + crc_seed, + operation.get_options(), + operation.get_specific_options()); }, [&]() { @@ -478,7 +478,7 @@ namespace dml return detail::execute( [&]() { - return ml::make_cache_flush_descriptor(dst_view.data(), dst_view.size(), operation.get_options()); + return detail::ml::make_cache_flush_operation(dst_view.data(), dst_view.size(), operation.get_options()); }); } diff --git a/include/dml/cpp/execution_interface.hpp b/include/dml/hl/execution_interface.hpp similarity index 90% rename from include/dml/cpp/execution_interface.hpp rename to include/dml/hl/execution_interface.hpp index b7babd3..48a9a58 100644 --- a/include/dml/cpp/execution_interface.hpp +++ 
b/include/dml/hl/execution_interface.hpp @@ -22,7 +22,7 @@ #ifndef DML_EXECUTION_INTERFACE_HPP #define DML_EXECUTION_INTERFACE_HPP -#include +#include namespace dml { @@ -65,7 +65,8 @@ namespace dml * @param allocator Instance of allocator */ explicit execution_interface(executor_t executor = executor_t(), allocator_t allocator = allocator_t()): - executor_(executor), allocator_(allocator) + executor_(executor), + allocator_(allocator) { } @@ -78,8 +79,7 @@ namespace dml * @return Executor return value (if present) */ template - auto execute(task_t &&task) const - noexcept(noexcept(std::declval()(std::forward(task)))) + auto execute(task_t &&task) const noexcept(noexcept(std::declval()(std::forward(task)))) { return executor_(std::forward(task)); } @@ -125,8 +125,8 @@ namespace dml * @tparam execution_path Type of execution path */ template - using default_execution_interface = execution_interface; + using default_execution_interface = + execution_interface; } // namespace dml #endif //DML_EXECUTION_INTERFACE_HPP diff --git a/include/dml/cpp/execution_path.hpp b/include/dml/hl/execution_path.hpp similarity index 89% rename from include/dml/cpp/execution_path.hpp rename to include/dml/hl/execution_path.hpp index 9b7503c..8399296 100644 --- a/include/dml/cpp/execution_path.hpp +++ b/include/dml/hl/execution_path.hpp @@ -22,8 +22,7 @@ #ifndef DML_EXECUTION_PATH_HPP #define DML_EXECUTION_PATH_HPP -#include -#include +#include #include namespace dml @@ -72,13 +71,12 @@ namespace dml * @param op Instance of Middle Layer operation * @param res Instance of Middle Layer result */ - auto operator()(ml::descriptor &op, ml::completion_record &res) const noexcept + auto operator()(detail::ml::operation &op, detail::ml::result &res) const noexcept { - return ml::software().submit(op, res); + return detail::ml::execution_path::software::submit(op, res); } }; -#ifdef DML_HW /** * @brief Represent hardware execution path * @@ -120,12 +118,11 @@ namespace dml * * @return @ref 
status_code::ok if submission was a success, error code otherwise */ - [[nodiscard]] auto operator()(ml::descriptor& dsc, ml::completion_record &record) const noexcept + [[nodiscard]] auto operator()(detail::ml::operation &op, detail::ml::result &res) const noexcept { - return ml::hardware().submit(dsc, record); + return detail::ml::execution_path::hardware::submit(op, res); } }; -#endif /** * @} diff --git a/include/dml/cpp/handler.hpp b/include/dml/hl/handler.hpp similarity index 82% rename from include/dml/cpp/handler.hpp rename to include/dml/hl/handler.hpp index 4a2db0b..9c0153c 100644 --- a/include/dml/cpp/handler.hpp +++ b/include/dml/hl/handler.hpp @@ -22,9 +22,9 @@ #ifndef DML_HANDLER_HPP #define DML_HANDLER_HPP -#include -#include -#include +#include +#include +#include namespace dml { @@ -41,7 +41,7 @@ namespace dml /** * @brief Internal buffer type for a result */ - using buffer_type = detail::buffer; + using buffer_type = detail::buffer; /** * @brief Actual operation's result type @@ -59,8 +59,7 @@ namespace dml * * @param allocator Memory allocator to use */ - explicit handler(allocator_t allocator = allocator_t()) noexcept: - record_(allocator, false), status_(status_code::error) + explicit handler(allocator_t allocator = allocator_t()) noexcept: record_(allocator, false), status_(status_code::error) { } @@ -72,7 +71,10 @@ namespace dml * * @return True if hanlder is valid, false otherwise */ - [[nodiscard]] bool valid() const noexcept { return status_ == status_code::ok; } + [[nodiscard]] bool valid() const noexcept + { + return status_ == status_code::ok; + } /** * @brief Get result for a submitted operation @@ -87,14 +89,14 @@ namespace dml { if (status_ == status_code::ok) { - ml::wait(record_.get()); + detail::ml::wait(record_.get()); return detail::make_result(record_.get()); } else { // Aggregate initialization ensures only first element initialized - return result_type{status_}; + return result_type{ status_ }; } } @@ -107,7 +109,7 @@ 
namespace dml { if (status_ == status_code::ok) { - return ml::is_finished(record_.get()); + return detail::ml::is_finished(record_.get()); } else { @@ -124,12 +126,12 @@ namespace dml * @param status Initial status * @param allocator Instance of memory allocator */ - explicit handler(status_code status, allocator_t allocator): - record_(allocator, status == status_code::ok), status_(status) + explicit handler(status_code status, allocator_t allocator): record_(allocator, status == status_code::ok), status_(status) { } - friend ml::completion_record &detail::get_ml_result<>(handler &h) noexcept; + template + friend detail::ml::result &detail::get_ml_result(handler &h) noexcept; private: buffer_type record_; /**< Memory buffer for a result */ diff --git a/include/dml/cpp/operations.hpp b/include/dml/hl/operations.hpp similarity index 78% rename from include/dml/cpp/operations.hpp rename to include/dml/hl/operations.hpp index 1da4dca..ee0318e 100644 --- a/include/dml/cpp/operations.hpp +++ b/include/dml/hl/operations.hpp @@ -22,7 +22,8 @@ #ifndef DML_OPERATIONS_MEM_MOVE_HPP #define DML_OPERATIONS_MEM_MOVE_HPP -#include +#include +#include namespace dml { @@ -40,7 +41,7 @@ namespace dml /** * @brief Constructs the operation */ - constexpr mem_move_operation() noexcept: options_(ml::mem_move_option::cache_control) + constexpr mem_move_operation() noexcept: options_(detail::ml::mem_move_options().enable()) { } @@ -60,7 +61,7 @@ namespace dml } private: - ml::mem_move_options options_; /**< @todo */ + detail::ml::mem_move_options options_; /**< @todo */ }; /** @@ -87,7 +88,7 @@ namespace dml /** * @brief Constructs the operation */ - constexpr mem_copy_operation() noexcept: options_(ml::mem_move_option::cache_control) + constexpr mem_copy_operation() noexcept: options_(detail::ml::mem_move_options().enable()) { } @@ -107,7 +108,7 @@ namespace dml } private: - ml::mem_move_options options_; /**< @todo */ + detail::ml::mem_move_options options_; /**< @todo */ }; /** @@ 
-132,7 +133,7 @@ namespace dml /** * @brief Constructs the operation */ - constexpr fill_operation() noexcept: options_(ml::fill_option::cache_control) + constexpr fill_operation() noexcept: options_(detail::ml::fill_options().enable()) { } @@ -152,7 +153,7 @@ namespace dml } private: - ml::fill_options options_; /**< @todo */ + detail::ml::fill_options options_; /**< @todo */ }; /** @@ -177,7 +178,11 @@ namespace dml /** * @brief Constructs the operation */ - constexpr dualcast_operation() = default; + constexpr dualcast_operation() noexcept: + options_(detail::ml::dualcast_options().enable()), + specific_options_() + { + } /** * @brief Result type for this operation @@ -197,14 +202,14 @@ namespace dml /** * @todo */ - [[nodiscard]] constexpr auto get_additional_options() const noexcept + [[nodiscard]] constexpr auto get_specific_options() const noexcept { - return additional_options_; + return specific_options_; } private: - ml::dualcast_options options_; /**< @todo */ - ml::dualcast_additional_options additional_options_; /**< @todo */ + detail::ml::dualcast_options options_; /**< @todo */ + detail::ml::dualcast_specific_options specific_options_; /**< @todo */ }; /** @@ -258,7 +263,7 @@ namespace dml */ [[nodiscard]] constexpr auto expect_equal() const noexcept { - return compare_operation(ml::compare_option::check_result, ml::compare_expected_result_option::expect_equal); + return compare_operation(options_.enable(), detail::compare_result::equal); } /** @@ -268,7 +273,7 @@ namespace dml */ [[nodiscard]] constexpr auto expect_not_equal() const noexcept { - return compare_operation(ml::compare_option::check_result, ml::compare_expected_result_option::expect_not_equal); + return compare_operation(options_.enable(), detail::compare_result::not_equal); } /** @@ -284,7 +289,7 @@ namespace dml * * @return Expected result */ - [[nodiscard]] ml::compare_expected_result_options get_expected_result() const + [[nodiscard]] detail::compare_result get_expected_result() 
const { return expected_; } @@ -293,15 +298,15 @@ namespace dml /** * @brief Constructs the operation with specified options and expected result */ - constexpr compare_operation(ml::compare_options options, ml::compare_expected_result_options expected) noexcept: + constexpr compare_operation(detail::ml::compare_options options, detail::compare_result expected) noexcept: options_(options), expected_(expected) { } private: - ml::compare_options options_{}; /**< @todo */ - ml::compare_expected_result_options expected_{}; /**< @todo */ + detail::ml::compare_options options_{}; /**< @todo */ + detail::compare_result expected_{}; /**< @todo */ }; /** @@ -346,7 +351,7 @@ namespace dml * * See @ref compare_result */ - using result_type = compare_result; + using result_type = dml::compare_result; /** * @brief Returns a new instance of the operation with "equal" expected result @@ -355,7 +360,7 @@ namespace dml */ [[nodiscard]] constexpr auto expect_equal() const noexcept { - return compare_pattern_operation(ml::compare_pattern_option::check_result, ml::compare_expected_result_option::expect_equal); + return compare_pattern_operation(options_.enable(), detail::compare_result::equal); } /** @@ -365,8 +370,8 @@ namespace dml */ [[nodiscard]] constexpr auto expect_not_equal() const noexcept { - return compare_pattern_operation(ml::compare_pattern_option::check_result, - ml::compare_expected_result_option::expect_not_equal); + return compare_pattern_operation(options_.enable(), + detail::compare_result::not_equal); } /** @@ -382,7 +387,7 @@ namespace dml * * @return Expected result */ - [[nodiscard]] ml::compare_expected_result_options get_expected_result() const + [[nodiscard]] detail::compare_result get_expected_result() const noexcept { return expected_; } @@ -391,15 +396,15 @@ namespace dml /** * @brief Constructs the operation with specified options and expected result */ - constexpr compare_pattern_operation(ml::compare_pattern_options options, 
ml::compare_expected_result_options expected) noexcept: + constexpr compare_pattern_operation(detail::ml::compare_pattern_options options, detail::compare_result expected) noexcept: options_(options), expected_(expected) { } private: - ml::compare_pattern_options options_{}; /**< @todo */ - ml::compare_expected_result_options expected_{}; /**< @todo */ + detail::ml::compare_pattern_options options_{}; /**< @todo */ + detail::compare_result expected_{}; /**< @todo */ }; /** @@ -429,14 +434,18 @@ namespace dml /** * @brief Constructs the operation */ - constexpr create_delta_operation() = default; + constexpr create_delta_operation() noexcept: + options_(detail::ml::create_delta_options().enable()), + expected_() + { + } /** * @brief Result type for this operation * * See @ref create_delta_result */ - using result_type = create_delta_result; + using result_type = dml::create_delta_result; /** * @brief Returns a new instance of the operation with "equal" expected result @@ -445,7 +454,7 @@ namespace dml */ [[nodiscard]] constexpr auto expect_equal() const noexcept { - return create_delta_operation(ml::create_delta_option::check_result, ml::delta_expected_result_option::expect_equal); + return create_delta_operation(options_.enable(), detail::create_delta_result::equal); } /** @@ -455,7 +464,8 @@ namespace dml */ [[nodiscard]] constexpr auto expect_not_equal() const noexcept { - return create_delta_operation(ml::create_delta_option::check_result, ml::delta_expected_result_option::expect_not_equal); + return create_delta_operation(options_.enable(), + detail::create_delta_result::not_equal); } /** @@ -471,7 +481,7 @@ namespace dml * * @return Expected result */ - [[nodiscard]] ml::delta_expected_result_options get_expected_result() const + [[nodiscard]] detail::create_delta_result get_expected_result() const noexcept { return expected_; } @@ -480,15 +490,15 @@ namespace dml /** * @brief Constructs the operation with specified options and expected result */ - constexpr 
create_delta_operation(ml::create_delta_options options, ml::delta_expected_result_options expected) noexcept: + constexpr create_delta_operation(detail::ml::create_delta_options options, detail::create_delta_result expected) noexcept: options_(options), expected_(expected) { } private: - ml::create_delta_options options_; /**< @todo */ - ml::delta_expected_result_options expected_{}; /**< @todo */ + detail::ml::create_delta_options options_; /**< @todo */ + detail::create_delta_result expected_{}; /**< @todo */ }; /** @@ -515,7 +525,10 @@ namespace dml /** * @brief Constructs the operation */ - constexpr apply_delta_operation() = default; + constexpr apply_delta_operation() noexcept: + options_(detail::ml::apply_delta_options().enable()) + { + } /** * @brief Result type for this operation @@ -533,7 +546,7 @@ namespace dml } private: - ml::apply_delta_options options_; /**< @todo */ + detail::ml::apply_delta_options options_; /**< @todo */ }; /** @@ -581,7 +594,7 @@ namespace dml */ [[nodiscard]] constexpr auto bypass_reflection() const noexcept { - return crc_operation({}, this->additional_options_ | ml::crc_additional_option::bypass_reflection); + return crc_operation(options_, specific_options_.enable()); } /** @@ -591,7 +604,7 @@ namespace dml */ [[nodiscard]] constexpr auto bypass_data_reflection() const noexcept { - return crc_operation({}, this->additional_options_ | ml::crc_additional_option::bypass_data_reflection); + return crc_operation(options_, specific_options_.enable()); } /** @@ -605,24 +618,24 @@ namespace dml /** * @todo */ - [[nodiscard]] constexpr auto get_additional_options() const noexcept + [[nodiscard]] constexpr auto get_specific_options() const noexcept { - return additional_options_; + return specific_options_; } private: /** * @brief Constructs the operation with specified parameters */ - constexpr crc_operation(ml::crc_options options, ml::crc_additional_options additional_options) noexcept: + constexpr 
crc_operation(detail::ml::crc_options options, detail::ml::crc_specific_options specific_options) noexcept: options_(options), - additional_options_(additional_options) + specific_options_(specific_options) { } private: - ml::crc_options options_; /**< @todo */ - ml::crc_additional_options additional_options_; /**< @todo */ + detail::ml::crc_options options_; /**< @todo */ + detail::ml::crc_specific_options specific_options_; /**< @todo */ }; /** @@ -662,7 +675,11 @@ namespace dml * * Reflection and data reflection are enabled. */ - constexpr copy_crc_operation() noexcept = default; + constexpr copy_crc_operation() noexcept: + options_(detail::ml::copy_crc_options().enable()), + specific_options_() + { + } /** * @brief Returns a new instance of the operation with bypass_reflection option enabled. @@ -671,7 +688,7 @@ namespace dml */ [[nodiscard]] constexpr auto bypass_reflection() const noexcept { - return copy_crc_operation({}, this->additional_options_ | ml::copy_crc_additional_option::bypass_reflection); + return copy_crc_operation(options_, specific_options_.enable()); } /** @@ -681,7 +698,7 @@ namespace dml */ [[nodiscard]] constexpr auto bypass_data_reflection() const noexcept { - return copy_crc_operation({}, this->additional_options_ | ml::copy_crc_additional_option::bypass_data_reflection); + return copy_crc_operation(options_, specific_options_.enable()); } /** @@ -695,24 +712,24 @@ namespace dml /** * @todo */ - [[nodiscard]] constexpr auto get_additional_options() const noexcept + [[nodiscard]] constexpr auto get_specific_options() const noexcept { - return additional_options_; + return specific_options_; } private: /** * @brief Constructs the operation with specified parameters */ - constexpr copy_crc_operation(ml::copy_crc_options options, ml::copy_crc_additional_options additional_options) noexcept: + constexpr copy_crc_operation(detail::ml::copy_crc_options options, detail::ml::copy_crc_specific_options specific_options) noexcept: options_(options), 
- additional_options_(additional_options) + specific_options_(specific_options) { } private: - ml::copy_crc_options options_; /**< @todo */ - ml::copy_crc_additional_options additional_options_; /**< @todo */ + detail::ml::copy_crc_options options_; /**< @todo */ + detail::ml::copy_crc_specific_options specific_options_; /**< @todo */ }; /** @@ -758,7 +775,7 @@ namespace dml */ [[nodiscard]] constexpr auto dont_invalidate_cache() const noexcept { - return cache_flush_operation(ml::cache_flush_option::cache_control); + return cache_flush_operation(options_.enable()); } /** @@ -773,12 +790,12 @@ namespace dml /** * @brief Constructs the operation with specified parameter */ - constexpr explicit cache_flush_operation(ml::cache_flush_options options) noexcept: options_(options) + constexpr explicit cache_flush_operation(detail::ml::cache_flush_options options) noexcept: options_(options) { } private: - ml::cache_flush_options options_; /**< @todo */ + detail::ml::cache_flush_options options_; /**< @todo */ }; /** @@ -821,7 +838,7 @@ namespace dml } private: - ml::batch_options options_; /**< @todo */ + detail::ml::batch_options options_; /**< @todo */ }; /** diff --git a/include/dml/cpp/result.hpp b/include/dml/hl/result.hpp similarity index 56% rename from include/dml/cpp/result.hpp rename to include/dml/hl/result.hpp index bfed222..5ffdf8c 100644 --- a/include/dml/cpp/result.hpp +++ b/include/dml/hl/result.hpp @@ -22,10 +22,9 @@ * @brief Contains definitions of result types */ -#include -#include - #include +#include +#include namespace dml { @@ -41,7 +40,7 @@ namespace dml */ struct mem_move_result { - status_code status{status_code::error}; /**< Status of operation execution */ + status_code status{ status_code::error }; /**< Status of operation execution */ }; /** @@ -49,7 +48,7 @@ namespace dml */ struct mem_copy_result { - status_code status{status_code::error}; /**< Status of operation execution */ + status_code status{ status_code::error }; /**< Status of 
operation execution */ }; /** @@ -57,7 +56,7 @@ namespace dml */ struct fill_result { - status_code status{status_code::error}; /**< Status of operation execution */ + status_code status{ status_code::error }; /**< Status of operation execution */ }; /** @@ -65,7 +64,7 @@ namespace dml */ struct dualcast_result { - status_code status{status_code::error}; /**< Status of operation execution */ + status_code status{ status_code::error }; /**< Status of operation execution */ }; /** @@ -73,9 +72,9 @@ namespace dml */ struct compare_result { - status_code status{status_code::error}; /**< Status of operation execution */ - comparison_result result{}; /**< Comparison result */ - size_t mismatch{}; /**< First mismatch byte position */ + status_code status{ status_code::error }; /**< Status of operation execution */ + comparison_result result{}; /**< Comparison result */ + size_t mismatch{}; /**< First mismatch byte position */ }; /** @@ -83,10 +82,10 @@ namespace dml */ struct create_delta_result { - status_code status{status_code::error}; /**< Status of operation execution */ - comparison_result result{}; /**< Comparison result */ - size_t bytes_completed{}; /**< Bytes completed, before delta overflowed */ - size_t delta_record_size{}; /**< Delta record written size */ + status_code status{ status_code::error }; /**< Status of operation execution */ + comparison_result result{}; /**< Comparison result */ + size_t bytes_completed{}; /**< Bytes completed, before delta overflowed */ + size_t delta_record_size{}; /**< Delta record written size */ }; /** @@ -94,7 +93,7 @@ namespace dml */ struct apply_delta_result { - status_code status{status_code::error}; /**< Status of operation execution */ + status_code status{ status_code::error }; /**< Status of operation execution */ }; /** @@ -102,8 +101,8 @@ namespace dml */ struct crc_result { - status_code status{status_code::error}; /**< Status of operation execution */ - uint32_t crc_value{}; /**< Calculated CRC value */ + 
status_code status{ status_code::error }; /**< Status of operation execution */ + uint32_t crc_value{}; /**< Calculated CRC value */ }; /** @@ -111,7 +110,7 @@ namespace dml */ struct cache_flush_result { - status_code status{status_code::error}; /**< Status of operation execution */ + status_code status{ status_code::error }; /**< Status of operation execution */ }; /** @@ -119,8 +118,8 @@ namespace dml */ struct batch_result { - status_code status{status_code::error}; /**< Status of operation execution */ - size_t operations_completed{}; /**< Number of operation successfully completed */ + status_code status{ status_code::error }; /**< Status of operation execution */ + size_t operations_completed{}; /**< Number of operation successfully completed */ }; /** diff --git a/include/dml/cpp/sequence.hpp b/include/dml/hl/sequence.hpp similarity index 80% rename from include/dml/cpp/sequence.hpp rename to include/dml/hl/sequence.hpp index cf5817a..34e2652 100644 --- a/include/dml/cpp/sequence.hpp +++ b/include/dml/hl/sequence.hpp @@ -22,11 +22,14 @@ #ifndef DML_SEQUENCE_HPP #define DML_SEQUENCE_HPP -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include namespace dml { @@ -45,12 +48,12 @@ namespace dml /** * @brief Type of buffer for Middle Layer operations */ - using op_buffer_t = detail::buffer_array; + using op_buffer_t = detail::buffer_array; /** * @brief Type of buffer for Middle Layer results */ - using res_buffer_t = detail::buffer_array; + using res_buffer_t = detail::buffer_array; public: /** @@ -61,7 +64,7 @@ namespace dml */ explicit sequence(size_t length, allocator_t allocator = allocator_t()): operations_(length, allocator), - records_(length, allocator), + results_(length, allocator), current_length_(0u) { } @@ -330,26 +333,27 @@ namespace dml /** * @todo */ - inline status_code add(ml::descriptor operation) noexcept + inline status_code add(detail::ml::operation operation) noexcept 
{ if (current_length_ == operations_.get_count()) { return status_code::batch_overflow; } - if (auto status = ml::validate(operation); status != ml::validation_status::success) + if (auto status = detail::ml::validate(operation); status != detail::validation_status::success) { return detail::to_own(status); } operations_.get(current_length_) = operation; + detail::ml::bind(operations_.get(current_length_), results_.get(current_length_)); current_length_++; return status_code::ok; } private: op_buffer_t operations_; /**< Buffer for operations array */ - res_buffer_t records_; /**< Buffer for results array */ + res_buffer_t results_; /**< Buffer for results array */ size_t current_length_; /**< Current number of operation stored in the sequence */ }; @@ -358,7 +362,7 @@ namespace dml { DML_VALIDATE_SIZE_CONSISTENCY(src_view.size(), dst_view.size()); - return add(ml::make_mem_move_descriptor(src_view.data(), dst_view.data(), src_view.size(), operation.get_options())); + return add(detail::ml::make_mem_move_operation(src_view.data(), dst_view.data(), src_view.size(), operation.get_options())); } template @@ -366,13 +370,13 @@ namespace dml { DML_VALIDATE_SIZE_CONSISTENCY(src_view.size(), dst_view.size()); - return add(ml::make_mem_move_descriptor(src_view.data(), dst_view.data(), src_view.size(), operation.get_options())); + return add(detail::ml::make_mem_move_operation(src_view.data(), dst_view.data(), src_view.size(), operation.get_options())); } template inline status_code sequence::add(fill_operation operation, uint64_t pattern, data_view dst_view) { - return add(ml::make_fill_descriptor(pattern, dst_view.data(), dst_view.size(), operation.get_options())); + return add(detail::ml::make_fill_operation(pattern, dst_view.data(), dst_view.size(), operation.get_options())); } template @@ -384,12 +388,12 @@ namespace dml DML_VALIDATE_SIZE_CONSISTENCY(src_view.size(), dst1_view.size()); DML_VALIDATE_SIZE_CONSISTENCY(src_view.size(), dst2_view.size()); - return 
add(ml::make_dualcast_descriptor(src_view.data(), - dst1_view.data(), - dst2_view.data(), - src_view.size(), - operation.get_options(), - operation.get_additional_options())); + return add(detail::ml::make_dualcast_operation(src_view.data(), + dst1_view.data(), + dst2_view.data(), + src_view.size(), + operation.get_options(), + operation.get_specific_options())); } template @@ -397,21 +401,21 @@ namespace dml { DML_VALIDATE_SIZE_CONSISTENCY(src1_view.size(), src2_view.size()); - return add(ml::make_compare_descriptor(src1_view.data(), - src2_view.data(), - src1_view.size(), - operation.get_options(), - operation.get_expected_result())); + return add(detail::ml::make_compare_operation(src1_view.data(), + src2_view.data(), + src1_view.size(), + operation.get_options(), + operation.get_expected_result())); } template inline status_code sequence::add(compare_pattern_operation operation, uint64_t pattern, const_data_view src_view) { - return add(ml::make_compare_pattern_descriptor(pattern, - src_view.data(), - src_view.size(), - operation.get_options(), - operation.get_expected_result())); + return add(detail::ml::make_compare_pattern_operation(pattern, + src_view.data(), + src_view.size(), + operation.get_options(), + operation.get_expected_result())); } template @@ -422,13 +426,13 @@ namespace dml { DML_VALIDATE_SIZE_CONSISTENCY(src1_view.size(), src2_view.size()); - return add(ml::make_create_delta_descriptor(src1_view.data(), - src2_view.data(), - src1_view.size(), - delta_view.data(), - delta_view.size(), - operation.get_options(), - operation.get_expected_result())); + return add(detail::ml::make_create_delta_operation(src1_view.data(), + src2_view.data(), + src1_view.size(), + delta_view.data(), + delta_view.size(), + operation.get_options(), + operation.get_expected_result())); } template @@ -441,21 +445,21 @@ namespace dml { return status_code::delta_delta_empty; } - return add(ml::make_apply_delta_descriptor(delta_view.data(), - delta_result.delta_record_size, 
- dst_view.data(), - dst_view.size(), - operation.get_options())); + return add(detail::ml::make_apply_delta_operation(delta_view.data(), + delta_result.delta_record_size, + dst_view.data(), + dst_view.size(), + operation.get_options())); } template inline status_code sequence::add(crc_operation operation, const_data_view src_view, uint32_t crc_seed) { - return add(ml::make_crc_descriptor(src_view.data(), - src_view.size(), - crc_seed, - operation.get_options(), - operation.get_additional_options())); + return add(detail::ml::make_crc_operation(src_view.data(), + src_view.size(), + crc_seed, + operation.get_options(), + operation.get_specific_options())); } template @@ -465,18 +469,18 @@ namespace dml uint32_t crc_seed) { DML_VALIDATE_SIZE_CONSISTENCY(src_view.size(), dst_view.size()); - return add(ml::make_copy_crc_descriptor(src_view.data(), - dst_view.data(), - src_view.size(), - crc_seed, - operation.get_options(), - operation.get_additional_options())); + return add(detail::ml::make_copy_crc_operation(src_view.data(), + dst_view.data(), + src_view.size(), + crc_seed, + operation.get_options(), + operation.get_specific_options())); } template inline status_code sequence::add(cache_flush_operation operation, data_view dst_view) { - return add(ml::make_cache_flush_descriptor(dst_view.data(), dst_view.size(), operation.get_options())); + return add(detail::ml::make_cache_flush_operation(dst_view.data(), dst_view.size(), operation.get_options())); } } // namespace dml diff --git a/include/dml/cpp/status_code.hpp b/include/dml/hl/status_code.hpp similarity index 95% rename from include/dml/cpp/status_code.hpp rename to include/dml/hl/status_code.hpp index 77603d4..0639ec7 100644 --- a/include/dml/cpp/status_code.hpp +++ b/include/dml/hl/status_code.hpp @@ -22,7 +22,7 @@ * @brief Contains definition of status type */ -#include +#include namespace dml { @@ -46,6 +46,7 @@ namespace dml batch_overflow, /**< Batch is full */ execution_failed, /**< Unknown execution error 
*/ unsupported_operation, /**< Unknown execution error */ + queue_busy, /**< Enqueue failed to one or several queues */ error /**< Internal library error occurred */ }; } // namespace dml diff --git a/include/dml/cpp/submit.hpp b/include/dml/hl/submit.hpp similarity index 88% rename from include/dml/cpp/submit.hpp rename to include/dml/hl/submit.hpp index b624d9b..651a0ce 100644 --- a/include/dml/cpp/submit.hpp +++ b/include/dml/hl/submit.hpp @@ -22,13 +22,13 @@ #ifndef DML_SUBMIT_HPP #define DML_SUBMIT_HPP -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include namespace dml { @@ -81,7 +81,7 @@ namespace dml executor, [&]() { - return ml::make_batch_descriptor(seq.data(), seq.length(), operation.get_options()); + return detail::ml::make_batch_operation(seq.data(), seq.length(), operation.get_options()); }); } @@ -128,7 +128,7 @@ namespace dml executor, [&]() { - return ml::make_mem_move_descriptor(src_view.data(), dst_view.data(), src_view.size(), operation.get_options()); + return detail::ml::make_mem_move_operation(src_view.data(), dst_view.data(), src_view.size(), operation.get_options()); }, [&]() { @@ -180,7 +180,7 @@ namespace dml executor, [&]() { - return ml::make_mem_move_descriptor(src_view.data(), dst_view.data(), src_view.size(), operation.get_options()); + return detail::ml::make_mem_move_operation(src_view.data(), dst_view.data(), src_view.size(), operation.get_options()); }, [&]() { @@ -232,7 +232,7 @@ namespace dml executor, [&]() { - return ml::make_fill_descriptor(pattern, dst_view.data(), dst_view.size(), operation.get_options()); + return detail::ml::make_fill_operation(pattern, dst_view.data(), dst_view.size(), operation.get_options()); }); } @@ -281,12 +281,12 @@ namespace dml executor, [&]() { - return ml::make_dualcast_descriptor(src_view.data(), - dst1_view.data(), - dst2_view.data(), - src_view.size(), - operation.get_options(), - 
operation.get_additional_options()); + return detail::ml::make_dualcast_operation(src_view.data(), + dst1_view.data(), + dst2_view.data(), + src_view.size(), + operation.get_options(), + operation.get_specific_options()); }, [&]() { @@ -339,11 +339,11 @@ namespace dml executor, [&]() { - return ml::make_compare_descriptor(src1_view.data(), - src2_view.data(), - src1_view.size(), - operation.get_options(), - operation.get_expected_result()); + return detail::ml::make_compare_operation(src1_view.data(), + src2_view.data(), + src1_view.size(), + operation.get_options(), + operation.get_expected_result()); }, [&]() { @@ -394,7 +394,7 @@ namespace dml return detail::submit(executor, [&]() { - return ml::make_compare_pattern_descriptor( + return detail::ml::make_compare_pattern_operation( pattern, src_view.data(), src_view.size(), @@ -448,13 +448,13 @@ namespace dml executor, [&]() { - return ml::make_create_delta_descriptor(src1_view.data(), - src2_view.data(), - src1_view.size(), - delta_view.data(), - delta_view.size(), - operation.get_options(), - operation.get_expected_result()); + return detail::ml::make_create_delta_operation(src1_view.data(), + src2_view.data(), + src1_view.size(), + delta_view.data(), + delta_view.size(), + operation.get_options(), + operation.get_expected_result()); }, [&]() { @@ -508,11 +508,11 @@ namespace dml executor, [&]() { - return ml::make_apply_delta_descriptor(delta_view.data(), - delta_result.delta_record_size, - dst_view.data(), - dst_view.size(), - operation.get_options()); + return detail::ml::make_apply_delta_operation(delta_view.data(), + delta_result.delta_record_size, + dst_view.data(), + dst_view.size(), + operation.get_options()); }, [&]() { @@ -566,11 +566,11 @@ namespace dml return detail::submit(executor, [&]() { - return ml::make_crc_descriptor(src_view.data(), - src_view.size(), - crc_seed, - operation.get_options(), - operation.get_additional_options()); + return detail::ml::make_crc_operation(src_view.data(), + 
src_view.size(), + crc_seed, + operation.get_options(), + operation.get_specific_options()); }); } @@ -619,12 +619,12 @@ namespace dml executor, [&]() { - return ml::make_copy_crc_descriptor(src_view.data(), - dst_view.data(), - src_view.size(), - crc_seed, - operation.get_options(), - operation.get_additional_options()); + return detail::ml::make_copy_crc_operation(src_view.data(), + dst_view.data(), + src_view.size(), + crc_seed, + operation.get_options(), + operation.get_specific_options()); }, [&]() { @@ -672,7 +672,7 @@ namespace dml executor, [&]() { - return ml::make_cache_flush_descriptor(dst_view.data(), dst_view.size(), operation.get_options()); + return detail::ml::make_cache_flush_operation(dst_view.data(), dst_view.size(), operation.get_options()); }); } } // namespace dml diff --git a/include/dml/cpp/types.hpp b/include/dml/hl/types.hpp similarity index 100% rename from include/dml/cpp/types.hpp rename to include/dml/hl/types.hpp diff --git a/sources/CMakeLists.txt b/sources/CMakeLists.txt index b144762..b794365 100644 --- a/sources/CMakeLists.txt +++ b/sources/CMakeLists.txt @@ -13,9 +13,7 @@ # stated in the License. 
# -# TODO: Move to middle_layer -add_subdirectory(cores) - +add_subdirectory(core) add_subdirectory(middle_layer) add_subdirectory(c_api) add_subdirectory(cpp_api) diff --git a/sources/c_api/CMakeLists.txt b/sources/c_api/CMakeLists.txt index 3d0d0e5..6523224 100644 --- a/sources/c_api/CMakeLists.txt +++ b/sources/c_api/CMakeLists.txt @@ -14,46 +14,47 @@ # add_library(dml STATIC - dml.cpp - dml_batch.cpp - dml_get_library_version.cpp - - $ - $ - ) - -target_compile_features(dml PUBLIC c_std_11) -target_compile_features(dml PRIVATE cxx_std_17) - -# TODO: target_compile_options(dml PRIVATE ${DML_QUALITY_OPTIONS}) + dml.cpp + dml_batch.cpp + dml_get_library_version.cpp + ) target_include_directories(dml - PUBLIC $ - PUBLIC $ - PRIVATE include - PRIVATE ../cores/include # TODO: Remove - ) + PUBLIC $ + PUBLIC $ + PRIVATE include + PRIVATE ../include + ) +target_sources(dml + PRIVATE $ + PRIVATE $ + ) +target_compile_features(dml + PUBLIC c_std_11 + PRIVATE cxx_std_17 + ) +target_compile_options(dml + PRIVATE ${DML_QUALITY_OPTIONS} + PRIVATE ${DML_CPP_PRIVATE_OPTIONS} + ) + +if(DML_HW) + target_link_libraries(dml PRIVATE ${CMAKE_DL_LIBS}) +endif() # Pass git revision to get_library_version source file get_git_revision() set_property( - SOURCE dml_get_library_version.cpp APPEND - PROPERTY COMPILE_DEFINITIONS DML_GIT_REVISION="${GIT_REV}") + SOURCE dml_get_library_version.cpp APPEND + PROPERTY COMPILE_DEFINITIONS DML_GIT_REVISION="${GIT_REV}") set_target_properties(dml PROPERTIES - VERSION ${PROJECT_VERSION} - SOVERSION ${PROJECT_SOVERSION}) + VERSION ${PROJECT_VERSION} + SOVERSION ${PROJECT_SOVERSION}) install(TARGETS dml - EXPORT ${PROJECT_NAME}Targets - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) - -target_include_directories(dml PRIVATE sw-path/include) - -if (DML_HW) - target_compile_definitions(dml PRIVATE DML_HW) - target_link_libraries(dml PRIVATE ${CMAKE_DL_LIBS}) -endif() 
+ EXPORT ${PROJECT_NAME}Targets + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) diff --git a/sources/c_api/dml_batch.cpp b/sources/c_api/dml_batch.cpp index 9aa6121..db3b5a7 100644 --- a/sources/c_api/dml_batch.cpp +++ b/sources/c_api/dml_batch.cpp @@ -14,67 +14,63 @@ * */ -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include #include "dml/dml.h" #include "macros.hpp" #include "range_check.hpp" #include "status.hpp" +#include "utils.hpp" namespace dml { + constexpr auto get_task_size() noexcept -> uint32_t { - return sizeof(dml::ml::descriptor) + sizeof(dml::ml::completion_record); + return sizeof(dml::detail::ml::operation) + sizeof(dml::detail::ml::result); } class batch { public: - batch(uint8_t *batch_data, uint32_t tasks_count) noexcept: batch_data_(batch_data), tasks_count_(tasks_count) + batch(uint8_t *batch_data, uint32_t tasks_count) noexcept: batch_data_(dml::align(batch_data)), tasks_count_(tasks_count) { } - template - void add_by_index(uint32_t index, make_callback_t &&make) const noexcept + template + void add_by_index(uint32_t index, make_operation &&make) const noexcept { - const auto descriptors = reinterpret_cast(batch_data_); - - descriptors[index] = make(); + const auto operations = reinterpret_cast(batch_data_); - const auto records = reinterpret_cast(descriptors + tasks_count_); + operations[index] = make(); - auto view = dml::ml::views::any_descriptor(descriptors[index]); + const auto results = reinterpret_cast(operations + tasks_count_); - view.completion_record_address() = reinterpret_cast(records + index); - view.flags() |= static_cast(dml::ml::flag::request_completion_record) | - static_cast(dml::ml::flag::completion_record_address_valid); + dml::detail::ml::bind(operations[index], results[index]); } [[nodiscard]] auto get_status(uint32_t index) const noexcept { - const auto 
descriptors = reinterpret_cast(batch_data_); + const auto operations = reinterpret_cast(batch_data_); - const auto records = reinterpret_cast(descriptors + tasks_count_); + const auto records = reinterpret_cast(operations + tasks_count_); - auto view = dml::ml::views::any_result(records[index]); - - return to_own_status(static_cast(view.status())); + return to_own_status(dml::detail::ml::get_status(records[index])); } [[nodiscard]] auto get_result(uint32_t index) const noexcept { - const auto descriptors = reinterpret_cast(batch_data_); - - const auto records = reinterpret_cast(descriptors + tasks_count_); + const auto operations = reinterpret_cast(batch_data_); - auto view = dml::ml::views::any_result(records[index]); + const auto records = reinterpret_cast(operations + tasks_count_); - return view.result(); + return dml::detail::ml::get_result(records[index]); } private: @@ -92,7 +88,9 @@ extern "C" dml_status_t dml_get_batch_size(const dml_job_t *dml_job_ptr, uint32_ return DML_STATUS_BATCH_SIZE_ERROR; } - *byte_size_ptr = dml::get_task_size() * task_count; + const uint32_t required_alignment = 64u; + + *byte_size_ptr = dml::get_task_size() * task_count + required_alignment; return DML_STATUS_OK; } @@ -112,7 +110,7 @@ extern "C" dml_status_t dml_batch_set_nop_by_index(dml_job_t *dml_job_ptr, uint3 .add_by_index(task_index, [&] { - return dml::ml::make_nop_descriptor(dml::ml::nop_options(static_cast(flags & 0xFFFF))); + return dml::detail::ml::make_nop_operation(dml::detail::ml::nop_options(static_cast(flags & 0xFFFF))); }); return DML_STATUS_OK; @@ -140,12 +138,12 @@ extern "C" dml_status_t dml_batch_set_mem_move_by_index(dml_job_t *dm return status; } - auto descriptor = dml::ml::make_mem_move_descriptor(source_ptr, - destination_ptr, - byte_length, - dml::ml::mem_move_options(static_cast(flags & 0xFFFF))); + auto operation = dml::detail::ml::make_mem_move_operation(source_ptr, + destination_ptr, + byte_length, + 
dml::detail::ml::mem_move_options(static_cast(flags & 0xFFFF))); - status = dml::to_own_status(dml::ml::validate(descriptor)); + status = dml::to_own_status(dml::detail::ml::validate(operation)); if (status != DML_STATUS_OK) { return status; @@ -155,7 +153,7 @@ extern "C" dml_status_t dml_batch_set_mem_move_by_index(dml_job_t *dm .add_by_index(task_index, [&] { - return descriptor; + return operation; }); return DML_STATUS_OK; @@ -178,14 +176,15 @@ extern "C" dml_status_t dml_batch_set_dualcast_by_index(dml_job_t *dm return DML_STATUS_BATCH_TASK_INDEX_OVERFLOW; } - auto descriptor = dml::ml::make_dualcast_descriptor(source_ptr, - destination_first_ptr, - destination_second_ptr, - byte_length, - dml::ml::dualcast_options(static_cast(flags & 0xFFFF)), - dml::ml::dualcast_additional_options(static_cast((flags >> 16) & 0xFF))); + auto operation = + dml::detail::ml::make_dualcast_operation(source_ptr, + destination_first_ptr, + destination_second_ptr, + byte_length, + dml::detail::ml::dualcast_options(static_cast(flags & 0xFFFF)), + dml::detail::ml::dualcast_specific_options(static_cast((flags >> 16) & 0xFF))); - auto status = dml::to_own_status(dml::ml::validate(descriptor)); + auto status = dml::to_own_status(dml::detail::ml::validate(operation)); if (status != DML_STATUS_OK) { return status; @@ -195,7 +194,7 @@ extern "C" dml_status_t dml_batch_set_dualcast_by_index(dml_job_t *dm .add_by_index(task_index, [&] { - return descriptor; + return operation; }); return DML_STATUS_OK; @@ -218,12 +217,12 @@ extern "C" dml_status_t dml_batch_set_compare_by_index(dml_job_t *dml return DML_STATUS_BATCH_TASK_INDEX_OVERFLOW; } - auto descriptor = dml::ml::make_compare_descriptor(source_first_ptr, - source_second_ptr, - byte_length, - dml::ml::compare_options(static_cast(flags & 0xFFFF)), - dml::ml::compare_expected_result_options(expected_result)); - auto status = dml::to_own_status(dml::ml::validate(descriptor)); + auto operation = make_compare_operation(source_first_ptr, + 
source_second_ptr, + byte_length, + dml::detail::ml::compare_options(static_cast(flags & 0xFFFF)), + dml::detail::compare_result(expected_result)); + auto status = dml::to_own_status(dml::detail::ml::validate(operation)); if (status != DML_STATUS_OK) { return status; @@ -233,7 +232,7 @@ extern "C" dml_status_t dml_batch_set_compare_by_index(dml_job_t *dml .add_by_index(task_index, [&] { - return descriptor; + return operation; }); return DML_STATUS_OK; @@ -257,12 +256,13 @@ extern "C" dml_status_t dml_batch_set_compare_pattern_by_index(dml_job_t return DML_STATUS_BATCH_TASK_INDEX_OVERFLOW; } - auto descriptor = dml::ml::make_compare_pattern_descriptor(*reinterpret_cast(pattern_ptr), - source_ptr, - byte_length, - dml::ml::compare_pattern_options(static_cast(flags & 0xFFFF)), - dml::ml::compare_expected_result_options(expected_result)); - auto status = dml::to_own_status(dml::ml::validate(descriptor)); + auto operation = + dml::detail::ml::make_compare_pattern_operation(*reinterpret_cast(pattern_ptr), + source_ptr, + byte_length, + dml::detail::ml::compare_pattern_options(static_cast(flags & 0xFFFF)), + dml::detail::compare_result(expected_result)); + auto status = dml::to_own_status(dml::detail::ml::validate(operation)); if (status != DML_STATUS_OK) { return status; @@ -272,7 +272,7 @@ extern "C" dml_status_t dml_batch_set_compare_pattern_by_index(dml_job_t .add_by_index(task_index, [&] { - return descriptor; + return operation; }); return DML_STATUS_OK; @@ -301,12 +301,12 @@ extern "C" dml_status_t dml_batch_set_crc_by_index(dml_job_t *dml_job return status; } - auto descriptor = dml::ml::make_crc_descriptor(source_ptr, - byte_length, - *crc_seed_ptr, - dml::ml::crc_options(static_cast(flags & 0xFFFF)), - dml::ml::crc_additional_options(static_cast((flags >> 16) & 0xFF))); - status = dml::to_own_status(dml::ml::validate(descriptor)); + auto operation = dml::detail::ml::make_crc_operation(source_ptr, + byte_length, + *crc_seed_ptr, + 
dml::detail::ml::crc_options(static_cast(flags & 0xFFFF)), + dml::detail::ml::crc_specific_options(static_cast((flags >> 16) & 0xFF))); + status = dml::to_own_status(dml::detail::ml::validate(operation)); if (status != DML_STATUS_OK) { return status; @@ -316,7 +316,7 @@ extern "C" dml_status_t dml_batch_set_crc_by_index(dml_job_t *dml_job .add_by_index(task_index, [&] { - return descriptor; + return operation; }); return DML_STATUS_OK; @@ -346,13 +346,14 @@ extern "C" dml_status_t dml_batch_set_copy_crc_by_index(dml_job_t *dm return status; } - auto descriptor = dml::ml::make_copy_crc_descriptor(source_ptr, - destination_ptr, - byte_length, - *crc_seed_ptr, - dml::ml::copy_crc_options(static_cast(flags & 0xFFFF)), - dml::ml::copy_crc_additional_options(static_cast((flags >> 16) & 0xFF))); - status = dml::to_own_status(dml::ml::validate(descriptor)); + auto operation = + dml::detail::ml::make_copy_crc_operation(source_ptr, + destination_ptr, + byte_length, + *crc_seed_ptr, + dml::detail::ml::copy_crc_options(static_cast(flags & 0xFFFF)), + dml::detail::ml::copy_crc_specific_options(static_cast((flags >> 16) & 0xFF))); + status = dml::to_own_status(dml::detail::ml::validate(operation)); if (status != DML_STATUS_OK) { return status; @@ -362,7 +363,7 @@ extern "C" dml_status_t dml_batch_set_copy_crc_by_index(dml_job_t *dm .add_by_index(task_index, [&] { - return descriptor; + return operation; }); return DML_STATUS_OK; @@ -385,11 +386,11 @@ extern "C" dml_status_t dml_batch_set_fill_by_index(dml_job_t *dml_jo return DML_STATUS_BATCH_TASK_INDEX_OVERFLOW; } - auto descriptor = dml::ml::make_fill_descriptor(*reinterpret_cast(pattern_ptr), - destination_ptr, - byte_length, - dml::ml::fill_options(static_cast(flags & 0xFFFF))); - auto status = dml::to_own_status(dml::ml::validate(descriptor)); + auto operation = dml::detail::ml::make_fill_operation(*reinterpret_cast(pattern_ptr), + destination_ptr, + byte_length, + dml::detail::ml::fill_options(static_cast(flags & 
0xFFFF))); + auto status = dml::to_own_status(dml::detail::ml::validate(operation)); if (status != DML_STATUS_OK) { return status; @@ -399,7 +400,7 @@ extern "C" dml_status_t dml_batch_set_fill_by_index(dml_job_t *dml_jo .add_by_index(task_index, [&] { - return descriptor; + return operation; }); return DML_STATUS_OK; @@ -420,10 +421,11 @@ extern "C" dml_status_t dml_batch_set_cache_flush_by_index(dml_job_t return DML_STATUS_BATCH_TASK_INDEX_OVERFLOW; } - auto descriptor = dml::ml::make_cache_flush_descriptor(destination_ptr, - byte_length, - dml::ml::cache_flush_options(static_cast(flags & 0xFFFF))); - auto status = dml::to_own_status(dml::ml::validate(descriptor)); + auto operation = + dml::detail::ml::make_cache_flush_operation(destination_ptr, + byte_length, + dml::detail::ml::cache_flush_options(static_cast(flags & 0xFFFF))); + auto status = dml::to_own_status(dml::detail::ml::validate(operation)); if (status != DML_STATUS_OK) { return status; @@ -433,7 +435,7 @@ extern "C" dml_status_t dml_batch_set_cache_flush_by_index(dml_job_t .add_by_index(task_index, [&] { - return descriptor; + return operation; }); return DML_STATUS_OK; @@ -458,14 +460,15 @@ extern "C" dml_status_t dml_batch_set_delta_create_by_index(dml_job_t return DML_STATUS_BATCH_TASK_INDEX_OVERFLOW; } - auto descriptor = dml::ml::make_create_delta_descriptor(source_ptr, - reference_ptr, - compare_length, - delta_record_ptr, - delta_record_length, - dml::ml::create_delta_options(static_cast(flags & 0xFFFF)), - dml::ml::delta_expected_result_options(expected_result)); - auto status = dml::to_own_status(dml::ml::validate(descriptor)); + auto operation = + dml::detail::ml::make_create_delta_operation(source_ptr, + reference_ptr, + compare_length, + delta_record_ptr, + delta_record_length, + dml::detail::ml::create_delta_options(static_cast(flags & 0xFFFF)), + dml::detail::create_delta_result(expected_result)); + auto status = dml::to_own_status(dml::detail::ml::validate(operation)); if (status != 
DML_STATUS_OK) { return status; @@ -475,7 +478,7 @@ extern "C" dml_status_t dml_batch_set_delta_create_by_index(dml_job_t .add_by_index(task_index, [&] { - return descriptor; + return operation; }); return DML_STATUS_OK; @@ -498,12 +501,13 @@ extern "C" dml_status_t dml_batch_set_delta_apply_by_index(dml_job_t return DML_STATUS_BATCH_TASK_INDEX_OVERFLOW; } - auto descriptor = dml::ml::make_apply_delta_descriptor(delta_record_ptr, - delta_record_length, - destination_ptr, - destination_length, - dml::ml::apply_delta_options(static_cast(flags & 0xFFFF))); - auto status = dml::to_own_status(dml::ml::validate(descriptor)); + auto operation = + dml::detail::ml::make_apply_delta_operation(delta_record_ptr, + delta_record_length, + destination_ptr, + destination_length, + dml::detail::ml::apply_delta_options(static_cast(flags & 0xFFFF))); + auto status = dml::to_own_status(dml::detail::ml::validate(operation)); if (status != DML_STATUS_OK) { return status; @@ -513,7 +517,7 @@ extern "C" dml_status_t dml_batch_set_delta_apply_by_index(dml_job_t .add_by_index(task_index, [&] { - return descriptor; + return operation; }); return DML_STATUS_OK; @@ -526,8 +530,6 @@ extern "C" dml_status_t dml_batch_set_dif_check_by_index(dml_job_t const dml_dif_config_t *dif_config_ptr, dml_operation_flags_t flags) { - constexpr uint32_t dif_block_sizes[4] = { 512u, 520u, 4096u, 4104u }; - CHECK_NULL(dml_job_ptr); CHECK_NULL(dml_job_ptr->destination_length); CHECK_NULL(source_ptr); @@ -539,16 +541,16 @@ extern "C" dml_status_t dml_batch_set_dif_check_by_index(dml_job_t return DML_STATUS_BATCH_TASK_INDEX_OVERFLOW; } - auto descriptor = dml::ml::make_dif_check_descriptor( + auto operation = dml::detail::ml::make_dif_check_operation( source_ptr, source_length, { dif_config_ptr->source_reference_tag_seed, dif_config_ptr->source_application_tag_mask, dif_config_ptr->source_application_tag_seed }, - dml::ml::dif_check_options(static_cast(flags & 0xFFFF)), - 
dml::ml::dif_additional_options(static_cast(((dif_config_ptr->flags >> 16) & 0xFF) | dif_config_ptr->block_size)), - dml::ml::dif_additional_src_options(static_cast(dif_config_ptr->flags & 0xFF))); - auto status = dml::to_own_status(dml::ml::validate(descriptor)); + dml::detail::ml::dif_check_options(static_cast(flags & 0xFFFF)), + dml::detail::ml::dif_specific_options(static_cast(((dif_config_ptr->flags >> 16) & 0xFF) | dif_config_ptr->block_size)), + dml::detail::ml::dif_source_options(static_cast(dif_config_ptr->flags & 0xFF))); + auto status = dml::to_own_status(dml::detail::ml::validate(operation)); if (status != DML_STATUS_OK) { return status; @@ -558,7 +560,7 @@ extern "C" dml_status_t dml_batch_set_dif_check_by_index(dml_job_t .add_by_index(task_index, [&] { - return descriptor; + return operation; }); return DML_STATUS_OK; @@ -573,7 +575,7 @@ extern "C" dml_status_t dml_batch_set_dif_update_by_index(dml_job_t uint32_t destination_length, dml_operation_flags_t flags) { - constexpr uint32_t dif_block_sizes[4] = { 512u, 520u, 4096u, 4104u }; + static_cast(destination_length); CHECK_NULL(dml_job_ptr); CHECK_NULL(dml_job_ptr->destination_length); @@ -587,7 +589,7 @@ extern "C" dml_status_t dml_batch_set_dif_update_by_index(dml_job_t return DML_STATUS_BATCH_TASK_INDEX_OVERFLOW; } - auto descriptor = dml::ml::make_dif_update_descriptor( + auto operation = dml::detail::ml::make_dif_update_operation( source_ptr, destination_ptr, source_length, @@ -597,11 +599,11 @@ extern "C" dml_status_t dml_batch_set_dif_update_by_index(dml_job_t { dif_config_ptr->destination_reference_tag_seed, dif_config_ptr->destination_application_tag_mask, dif_config_ptr->destination_application_tag_seed }, - dml::ml::dif_update_options(static_cast(flags & 0xFFFF)), - dml::ml::dif_additional_options(static_cast(((dif_config_ptr->flags >> 16) & 0xFF) | dif_config_ptr->block_size)), - dml::ml::dif_additional_src_options(static_cast(dif_config_ptr->flags & 0xFF)), - 
dml::ml::dif_additional_dst_options(static_cast((dif_config_ptr->flags >> 8) & 0xFF))); - auto status = dml::to_own_status(dml::ml::validate(descriptor)); + dml::detail::ml::dif_update_options(static_cast(flags & 0xFFFF)), + dml::detail::ml::dif_specific_options(static_cast(((dif_config_ptr->flags >> 16) & 0xFF) | dif_config_ptr->block_size)), + dml::detail::ml::dif_source_options(static_cast(dif_config_ptr->flags & 0xFF)), + dml::detail::ml::dif_destination_options(static_cast((dif_config_ptr->flags >> 8) & 0xFF))); + auto status = dml::to_own_status(dml::detail::ml::validate(operation)); if (status != DML_STATUS_OK) { return status; @@ -611,7 +613,7 @@ extern "C" dml_status_t dml_batch_set_dif_update_by_index(dml_job_t .add_by_index(task_index, [&] { - return descriptor; + return operation; }); return DML_STATUS_OK; @@ -626,7 +628,7 @@ extern "C" dml_status_t dml_batch_set_dif_insert_by_index(dml_job_t uint32_t destination_length, dml_operation_flags_t flags) { - constexpr uint32_t dif_block_sizes[4] = { 512u, 520u, 4096u, 4104u }; + static_cast(destination_length); CHECK_NULL(dml_job_ptr); CHECK_NULL(dml_job_ptr->destination_length); @@ -640,17 +642,17 @@ extern "C" dml_status_t dml_batch_set_dif_insert_by_index(dml_job_t return DML_STATUS_BATCH_TASK_INDEX_OVERFLOW; } - auto descriptor = dml::ml::make_dif_insert_descriptor( + auto operation = dml::detail::ml::make_dif_insert_operation( source_ptr, destination_ptr, source_length, { dif_config_ptr->destination_reference_tag_seed, dif_config_ptr->destination_application_tag_mask, dif_config_ptr->destination_application_tag_seed }, - dml::ml::dif_insert_options(static_cast(flags & 0xFFFF)), - dml::ml::dif_additional_options(static_cast(((dif_config_ptr->flags >> 16) & 0xFF) | dif_config_ptr->block_size)), - dml::ml::dif_additional_dst_options(static_cast((dif_config_ptr->flags >> 8) & 0xFF))); - auto status = dml::to_own_status(dml::ml::validate(descriptor)); + dml::detail::ml::dif_insert_options(static_cast(flags & 
0xFFFF)), + dml::detail::ml::dif_specific_options(static_cast(((dif_config_ptr->flags >> 16) & 0xFF) | dif_config_ptr->block_size)), + dml::detail::ml::dif_destination_options(static_cast((dif_config_ptr->flags >> 8) & 0xFF))); + auto status = dml::to_own_status(dml::detail::ml::validate(operation)); if (status != DML_STATUS_OK) { return status; @@ -660,7 +662,7 @@ extern "C" dml_status_t dml_batch_set_dif_insert_by_index(dml_job_t .add_by_index(task_index, [&] { - return descriptor; + return operation; }); return DML_STATUS_OK; @@ -675,7 +677,7 @@ extern "C" dml_status_t dml_batch_set_dif_strip_by_index(dml_job_t uint32_t destination_length, dml_operation_flags_t flags) { - constexpr uint32_t dif_block_sizes[4] = { 512u, 520u, 4096u, 4104u }; + static_cast(destination_length); CHECK_NULL(dml_job_ptr); CHECK_NULL(dml_job_ptr->destination_length); @@ -689,18 +691,18 @@ extern "C" dml_status_t dml_batch_set_dif_strip_by_index(dml_job_t return DML_STATUS_BATCH_TASK_INDEX_OVERFLOW; } - auto descriptor = dml::ml::make_dif_strip_descriptor( + auto operation = dml::detail::ml::make_dif_strip_operation( source_ptr, destination_ptr, source_length, { dif_config_ptr->source_reference_tag_seed, dif_config_ptr->source_application_tag_mask, dif_config_ptr->source_application_tag_seed }, - dml::ml::dif_strip_options(static_cast(flags & 0xFFFF)), - dml::ml::dif_additional_options(static_cast(((dif_config_ptr->flags >> 16) & 0xFF) | dif_config_ptr->block_size)), - dml::ml::dif_additional_src_options(static_cast(dif_config_ptr->flags & 0xFF))); + dml::detail::ml::dif_strip_options(static_cast(flags & 0xFFFF)), + dml::detail::ml::dif_specific_options(static_cast(((dif_config_ptr->flags >> 16) & 0xFF) | dif_config_ptr->block_size)), + dml::detail::ml::dif_source_options(static_cast(dif_config_ptr->flags & 0xFF))); - auto status = dml::to_own_status(dml::ml::validate(descriptor)); + auto status = dml::to_own_status(dml::detail::ml::validate(operation)); if (status != DML_STATUS_OK) { 
return status; @@ -710,7 +712,7 @@ extern "C" dml_status_t dml_batch_set_dif_strip_by_index(dml_job_t .add_by_index(task_index, [&] { - return descriptor; + return operation; }); return DML_STATUS_OK; @@ -743,7 +745,7 @@ extern "C" dml_status_t dml_batch_get_status(const dml_job_t *dml_job_ptr, uint3 return DML_STATUS_BATCH_TASK_INDEX_OVERFLOW; } - *status_ptr = dml::batch(dml_job_ptr->destination_first_ptr, task_count).get_status(task_index); + *status_ptr = ::dml::batch(dml_job_ptr->destination_first_ptr, task_count).get_status(task_index); return DML_STATUS_OK; } diff --git a/sources/c_api/dml_get_library_version.cpp b/sources/c_api/dml_get_library_version.cpp index 2a917e6..f61e9a2 100644 --- a/sources/c_api/dml_get_library_version.cpp +++ b/sources/c_api/dml_get_library_version.cpp @@ -35,7 +35,7 @@ #define DML_LIBRARY_MINOR_VERSION 1u /** Minor version of the library*/ -#define DML_LIBRARY_PATCH 5u +#define DML_LIBRARY_PATCH 6u /** Supported CPU ISA */ #define DML_LIBRARY_MINIMAL_CPU_ISA "N/A" @@ -45,15 +45,9 @@ extern "C" const dml_library_version_t* dml_get_library_version() { - static const dml_library_version_t library_version = { DML_LIBRARY_NAME, - DML_LIBRARY_MINIMAL_CPU_ISA, - __DATE__, - DML_LIBRARY_VERSION, - DML_LIBRARY_MIN_HW_VERSION, - DML_GIT_REVISION, - DML_LIBRARY_MAJOR_VERSION, - DML_LIBRARY_MINOR_VERSION, - DML_LIBRARY_PATCH }; + static const dml_library_version_t library_version = { DML_LIBRARY_NAME, DML_LIBRARY_MINIMAL_CPU_ISA, __DATE__, + DML_LIBRARY_VERSION, DML_LIBRARY_MIN_HW_VERSION, DML_GIT_REVISION, + DML_LIBRARY_MAJOR_VERSION, DML_LIBRARY_MINOR_VERSION, DML_LIBRARY_PATCH }; return &library_version; } diff --git a/sources/c_api/include/impl.hpp b/sources/c_api/include/impl.hpp index ef50f71..dd70938 100644 --- a/sources/c_api/include/impl.hpp +++ b/sources/c_api/include/impl.hpp @@ -17,8 +17,8 @@ #ifndef DML_IMPL_HPP #define DML_IMPL_HPP -#include -#include +#include +#include #include "job_view.hpp" #include "range_check.hpp" @@ 
-29,7 +29,7 @@ namespace dml { inline dml_status_t wait(job_view job) noexcept { - ml::wait(job.state().record); + detail::ml::wait(job.state().record); // Extract result return write_result(job); @@ -37,7 +37,7 @@ namespace dml inline dml_status_t check(job_view job) noexcept { - if (ml::is_finished(job.state().record)) + if (detail::ml::is_finished(job.state().record)) { // Extract result return write_result(job); @@ -59,18 +59,18 @@ namespace dml write_descriptor(job); // Middle Layer checks - if (auto status = to_own_status(ml::validate(job.state().dsc)); status != DML_STATUS_OK) + if (auto status = to_own_status(detail::ml::validate(job.state().dsc)); status != DML_STATUS_OK) { return status; } if (job.state().path == DML_PATH_HW) { - return to_own_status(ml::hardware().submit(job.state().dsc, job.state().record)); + return to_own_status(detail::ml::execution_path::hardware::submit(job.state().dsc, job.state().record)); } else { - return to_own_status(ml::software().submit(job.state().dsc, job.state().record)); + return to_own_status(detail::ml::execution_path::software::submit(job.state().dsc, job.state().record)); } } diff --git a/sources/c_api/include/job_view.hpp b/sources/c_api/include/job_view.hpp index 0fb93bf..43723c6 100644 --- a/sources/c_api/include/job_view.hpp +++ b/sources/c_api/include/job_view.hpp @@ -19,16 +19,16 @@ #include -#include "state.hpp" - #include +#include "state.hpp" + namespace dml { constexpr auto get_job_size() noexcept { - constexpr auto alignment = 64u; - constexpr auto job_size = sizeof(dml_job_t); + constexpr auto alignment = 64u; + constexpr auto job_size = sizeof(dml_job_t); constexpr auto state_size = sizeof(dml::state); // Enough size for job, state, and to align state to default boundary @@ -187,4 +187,4 @@ namespace dml }; } // namespace dml -#endif // DML_JOB_VIEW_HPP +#endif // DML_JOB_VIEW_HPP diff --git a/sources/c_api/include/macros.hpp b/sources/c_api/include/macros.hpp index 639d043..ef26c19 100644 --- 
a/sources/c_api/include/macros.hpp +++ b/sources/c_api/include/macros.hpp @@ -23,8 +23,8 @@ if (!(p)) \ return DML_STATUS_NULL_POINTER_ERROR -#define CHECK_PATH(p) \ +#define CHECK_PATH(p) \ if (DML_PATH_AUTO != (p) && DML_PATH_SW != (p) && DML_PATH_HW != (p)) \ return DML_STATUS_NULL_POINTER_ERROR -#endif // DML_MACROS_HPP +#endif // DML_MACROS_HPP diff --git a/sources/c_api/include/range_check.hpp b/sources/c_api/include/range_check.hpp index ee19e26..11a4790 100644 --- a/sources/c_api/include/range_check.hpp +++ b/sources/c_api/include/range_check.hpp @@ -85,6 +85,8 @@ namespace dml inline dml_status_t range_check_crc(const uint8_t* src1, const uint32_t* crc_ptr, const uint32_t src_size) noexcept { + static_cast(src1); + static_cast(src_size); if (crc_ptr == nullptr) { return DML_STATUS_NULL_POINTER_ERROR; @@ -100,6 +102,9 @@ namespace dml const uint32_t* crc_ptr, const uint32_t src_size) noexcept { + static_cast(src1); + static_cast(dst1); + static_cast(src_size); if (crc_ptr == nullptr) { return DML_STATUS_NULL_POINTER_ERROR; diff --git a/sources/c_api/include/state.hpp b/sources/c_api/include/state.hpp index 6b4a67c..6ae568d 100644 --- a/sources/c_api/include/state.hpp +++ b/sources/c_api/include/state.hpp @@ -18,17 +18,16 @@ #define DML_STATE_HPP #include -#include -#include +#include namespace dml { struct state { - ml::descriptor dsc; - ml::completion_record record; - dml_path_t path; + dml::detail::ml::operation dsc; + dml::detail::ml::result record; + dml_path_t path; }; } // namespace dml -#endif // DML_STATE_HPP +#endif // DML_STATE_HPP diff --git a/sources/c_api/include/status.hpp b/sources/c_api/include/status.hpp index a791432..51e8dc1 100644 --- a/sources/c_api/include/status.hpp +++ b/sources/c_api/include/status.hpp @@ -19,123 +19,123 @@ #include -#include +#include namespace dml { - inline dml_status_t to_own_status(ml::execution_status status) noexcept + inline dml_status_t to_own_status(detail::execution_status status) noexcept { switch 
(status) { - case ml::execution_status::success: + case detail::execution_status::success: { return DML_STATUS_OK; } - case ml::execution_status::false_predicate_success: + case detail::execution_status::false_predicate_success: { return DML_STATUS_FALSE_PREDICATE_OK; } - case ml::execution_status::page_fault_during_processing: + case detail::execution_status::page_fault_during_processing: { return DML_STATUS_PAGE_FAULT_ERROR; } - case ml::execution_status::page_response_error: + case detail::execution_status::page_response_error: { return DML_STATUS_INTERNAL_ERROR; } - case ml::execution_status::batch_error: + case detail::execution_status::batch_error: { return DML_STATUS_BATCH_ERROR; } - case ml::execution_status::batch_page_fault_error: + case detail::execution_status::batch_page_fault_error: { return DML_STATUS_PAGE_FAULT_ERROR; } - case ml::execution_status::offset_order_error: + case detail::execution_status::offset_order_error: { return DML_STATUS_DELTA_ASCENDT_ERROR; } - case ml::execution_status::offset_overflow: + case detail::execution_status::offset_overflow: { return DML_STATUS_DELTA_OFFSET_ERROR; } - case ml::execution_status::dif_control_error: + case detail::execution_status::dif_control_error: { return DML_STATUS_DIF_CHECK_ERROR; } - case ml::execution_status::operation_error: + case detail::execution_status::operation_error: { return DML_STATUS_JOB_OPERATION_ERROR; } - case ml::execution_status::flag_error: + case detail::execution_status::flag_error: { return DML_STATUS_JOB_FLAGS_ERROR; } - case ml::execution_status::non_zero_reserved_field_error: + case detail::execution_status::non_zero_reserved_field_error: { return DML_STATUS_INTERNAL_ERROR; } - case ml::execution_status::invalid_transfer_size_error: + case detail::execution_status::invalid_transfer_size_error: { return DML_STATUS_JOB_LENGTH_ERROR; } - case ml::execution_status::descriptor_count_error: + case detail::execution_status::descriptor_count_error: { return 
DML_STATUS_BATCH_SIZE_ERROR; } - case ml::execution_status::delta_size_error: + case detail::execution_status::delta_size_error: { return DML_STATUS_DELTA_INPUT_SIZE_ERROR; } - case ml::execution_status::buffers_overlap: + case detail::execution_status::buffers_overlap: { return DML_STATUS_OVERLAPPING_BUFFER_ERROR; } - case ml::execution_status::dualcast_misalign_error: + case detail::execution_status::dualcast_misalign_error: { return DML_STATUS_DUALCAST_ALIGN_ERROR; } - case ml::execution_status::descriptor_list_align_error: + case detail::execution_status::descriptor_list_align_error: { return DML_STATUS_INTERNAL_ERROR; } - case ml::execution_status::invalid_interrupt_handle: + case detail::execution_status::invalid_interrupt_handle: { return DML_STATUS_INTERNAL_ERROR; } - case ml::execution_status::page_fault_on_translation: + case detail::execution_status::page_fault_on_translation: { return DML_STATUS_INTERNAL_ERROR; } - case ml::execution_status::completion_record_align_error: + case detail::execution_status::completion_record_align_error: { return DML_STATUS_INTERNAL_ERROR; } - case ml::execution_status::misalign_address_error: + case detail::execution_status::misalign_address_error: { return DML_STATUS_INTERNAL_ERROR; } - case ml::execution_status::privilege_error: + case detail::execution_status::privilege_error: { return DML_STATUS_INTERNAL_ERROR; } - case ml::execution_status::traffic_class_error: + case detail::execution_status::traffic_class_error: { return DML_STATUS_INTERNAL_ERROR; } - case ml::execution_status::readback_translation_error: + case detail::execution_status::readback_translation_error: { return DML_STATUS_INTERNAL_ERROR; } - case ml::execution_status::operation_readback_timeout: + case detail::execution_status::operation_readback_timeout: { return DML_STATUS_INTERNAL_ERROR; } - case ml::execution_status::hardware_timeout: + case detail::execution_status::hardware_timeout: { return DML_STATUS_INTERNAL_ERROR; } - case 
ml::execution_status::address_translation_error: + case detail::execution_status::address_translation_error: { return DML_STATUS_INTERNAL_ERROR; } @@ -146,48 +146,50 @@ namespace dml } } - inline dml_status_t to_own_status(ml::submission_status status) noexcept + inline dml_status_t to_own_status(detail::submission_status status) noexcept { switch (status) { - case ml::submission_status::success: + case detail::submission_status::success: return DML_STATUS_OK; - case ml::submission_status::failure: - return DML_STATUS_INSTANCE_NOT_FOUND; + case detail::submission_status::queue_busy: + return DML_STATUS_WORK_QUEUE_OVERFLOW_ERROR; + case detail::submission_status::failure: + return DML_STATUS_WORK_QUEUES_NOT_AVAILABLE; default: return DML_STATUS_INTERNAL_ERROR; } } - inline dml_status_t to_own_status(ml::validation_status status) noexcept + inline dml_status_t to_own_status(detail::validation_status status) noexcept { switch (status) { - case ml::validation_status::success: + case detail::validation_status::success: return DML_STATUS_OK; - case ml::validation_status::address_is_null: + case detail::validation_status::null_address: return DML_STATUS_NULL_POINTER_ERROR; - case ml::validation_status::size_is_null: + case detail::validation_status::null_size: return DML_STATUS_JOB_LENGTH_ERROR; - case ml::validation_status::delta_size_is_wrong: + case detail::validation_status::wrong_size: return DML_STATUS_DELTA_INPUT_SIZE_ERROR; - case ml::validation_status::buffers_overlap: + case detail::validation_status::overlapping: return DML_STATUS_OVERLAPPING_BUFFER_ERROR; - case ml::validation_status::address_is_misaligned: + case detail::validation_status::misalignment: return DML_STATUS_DELTA_ALIGN_ERROR; - case ml::validation_status::delta_input_size_is_wrong: - return DML_STATUS_DELTA_ALIGN_ERROR; - case ml::validation_status::delta_input_size_overflow: + case detail::validation_status::large_size: return DML_STATUS_DELTA_INPUT_SIZE_ERROR; - case 
ml::validation_status::delta_record_size_is_wrong: + case detail::validation_status::wrong_delta_size: return DML_STATUS_DELTA_RECORD_SIZE_ERROR; - case ml::validation_status::dualcast_address_is_wrong: + case detail::validation_status::wrong_dualcast_address: return DML_STATUS_DUALCAST_ALIGN_ERROR; - case ml::validation_status::batch_size_is_wrong: + case detail::validation_status::wrong_batch_size: return DML_STATUS_BATCH_SIZE_ERROR; - case ml::validation_status::dif_size_is_wrong: + case detail::validation_status::wrong_dif_size: return DML_STATUS_JOB_LENGTH_ERROR; - case ml::validation_status::unsupported_operation: + case detail::validation_status::dif_strip_adjacent: + return DML_STATUS_DIF_STRIP_ADJACENT_ERROR; + case detail::validation_status::unsupported_operation: return DML_STATUS_JOB_OPERATION_ERROR; default: return DML_STATUS_INTERNAL_ERROR; diff --git a/sources/c_api/include/utils.hpp b/sources/c_api/include/utils.hpp index b942afd..e0fbbb8 100644 --- a/sources/c_api/include/utils.hpp +++ b/sources/c_api/include/utils.hpp @@ -33,4 +33,4 @@ namespace dml } } // namespace dml -#endif // DML_UTILS_HPP +#endif // DML_UTILS_HPP diff --git a/sources/c_api/include/write_descriptor.hpp b/sources/c_api/include/write_descriptor.hpp index a634531..b4ce4f7 100644 --- a/sources/c_api/include/write_descriptor.hpp +++ b/sources/c_api/include/write_descriptor.hpp @@ -17,7 +17,9 @@ #ifndef DML_MAKE_DESCRIPTOR_HPP #define DML_MAKE_DESCRIPTOR_HPP -#include +#include + +#include "utils.hpp" namespace dml { @@ -26,157 +28,157 @@ namespace dml switch (job.operation()) { case DML_OP_NOP: - job.state().dsc = ml::make_nop_descriptor(ml::nop_options(job.flags())); + job.state().dsc = detail::ml::make_nop_operation(detail::ml::nop_options(job.flags())); break; case DML_OP_BATCH: - job.state().dsc = - ml::make_batch_descriptor(reinterpret_cast(job.destination_first()), - job.destination_length() / (sizeof(dml::ml::descriptor) + sizeof(dml::ml::completion_record)), - 
ml::batch_options(job.flags())); + job.state().dsc = detail::ml::make_batch_operation( + reinterpret_cast(dml::align(job.destination_first())), + job.destination_length() / (sizeof(dml::detail::ml::operation) + sizeof(dml::detail::ml::result)), + detail::ml::batch_options(job.flags())); break; case DML_OP_DRAIN: - job.state().dsc = ml::make_drain_descriptor(reinterpret_cast(job.destination_first()), - reinterpret_cast(job.destination_second()), - ml::drain_options(job.flags()), - ml::drain_additional_options(job.specific_flags())); + job.state().dsc = detail::ml::make_drain_operation(reinterpret_cast(job.destination_first()), + reinterpret_cast(job.destination_second()), + detail::ml::drain_options(job.flags()), + detail::ml::drain_specific_options(job.specific_flags())); break; case DML_OP_MEM_MOVE: - job.state().dsc = ml::make_mem_move_descriptor(job.source_first(), - job.destination_first(), - job.source_length(), - ml::mem_move_options(job.flags())); + job.state().dsc = detail::ml::make_mem_move_operation(job.source_first(), + job.destination_first(), + job.source_length(), + detail::ml::mem_move_options(job.flags())); break; case DML_OP_FILL: - job.state().dsc = ml::make_fill_descriptor(job.pattern(), - job.destination_first(), - job.destination_length(), - ml::fill_options(job.flags())); + job.state().dsc = detail::ml::make_fill_operation(job.pattern(), + job.destination_first(), + job.destination_length(), + detail::ml::fill_options(job.flags())); break; case DML_OP_DUALCAST: - job.state().dsc = ml::make_dualcast_descriptor(job.source_first(), - job.destination_first(), - job.destination_second(), - job.source_length(), - ml::dualcast_options(job.flags()), - ml::dualcast_additional_options(job.specific_flags())); + job.state().dsc = detail::ml::make_dualcast_operation(job.source_first(), + job.destination_first(), + job.destination_second(), + job.source_length(), + detail::ml::dualcast_options(job.flags()), + 
detail::ml::dualcast_specific_options(job.specific_flags())); break; case DML_OP_COMPARE: - job.state().dsc = ml::make_compare_descriptor(job.source_first(), - job.source_second(), - job.source_length(), - ml::compare_options(job.flags()), - ml::compare_expected_result_options(job.expected_result())); + job.state().dsc = detail::ml::make_compare_operation(job.source_first(), + job.source_second(), + job.source_length(), + detail::ml::compare_options(job.flags()), + detail::compare_result(job.expected_result())); break; case DML_OP_COMPARE_PATTERN: - job.state().dsc = ml::make_compare_pattern_descriptor(job.pattern(), - job.source_first(), - job.source_length(), - ml::compare_pattern_options(job.flags()), - ml::compare_expected_result_options(job.expected_result())); + job.state().dsc = detail::ml::make_compare_pattern_operation(job.pattern(), + job.source_first(), + job.source_length(), + detail::ml::compare_pattern_options(job.flags()), + detail::compare_result(job.expected_result())); break; case DML_OP_CRC: { - uint32_t crc_seed = 0; - auto crc_options = ml::crc_additional_options(job.specific_flags()); + uint32_t crc_seed = 0; + auto crc_options = detail::ml::crc_specific_options(job.specific_flags()); - if (crc_options.contains(ml::crc_additional_option::read_crc_seed)) + if (intersects(job.specific_flags(), detail::crc_specific_flag::read_crc_seed)) { crc_seed = *job.crc_ptr(); } // Erase read_crc_seed flag from enabled options - uint8_t mask = 0xFF ^ static_cast(ml::crc_additional_option::read_crc_seed); - crc_options = ml::crc_additional_options(static_cast(crc_options) & mask); + uint8_t mask = 0xFF ^ to_underlying(detail::crc_specific_flag::read_crc_seed); + crc_options = detail::ml::crc_specific_options(static_cast(crc_options) & mask); - job.state().dsc = ml::make_crc_descriptor(job.source_first(), - job.source_length(), - crc_seed, - ml::crc_options(job.flags()), - crc_options); + job.state().dsc = detail::ml::make_crc_operation(job.source_first(), + 
job.source_length(), + crc_seed, + detail::ml::crc_options(job.flags()), + crc_options); break; } case DML_OP_COPY_CRC: { - uint32_t crc_seed = 0; - auto crc_options = ml::copy_crc_additional_options(job.specific_flags()); + uint32_t crc_seed = 0; + auto crc_options = detail::ml::copy_crc_specific_options(job.specific_flags()); - if (crc_options.contains(ml::crc_additional_option::read_crc_seed)) + if (intersects(job.specific_flags(), detail::crc_specific_flag::read_crc_seed)) { crc_seed = *job.crc_ptr(); } // Erase read_crc_seed flag from enabled options - uint8_t mask = 0xFF ^ static_cast(ml::crc_additional_option::read_crc_seed); - crc_options = ml::copy_crc_additional_options(static_cast(crc_options) & mask); + uint8_t mask = 0xFF ^ to_underlying(detail::crc_specific_flag::read_crc_seed); + crc_options = detail::ml::copy_crc_specific_options(static_cast(crc_options) & mask); - job.state().dsc = ml::make_copy_crc_descriptor(job.source_first(), - job.destination_first(), - job.source_length(), - crc_seed, - ml::copy_crc_options(job.flags()), - crc_options); + job.state().dsc = detail::ml::make_copy_crc_operation(job.source_first(), + job.destination_first(), + job.source_length(), + crc_seed, + detail::ml::copy_crc_options(job.flags()), + crc_options); } break; case DML_OP_DELTA_CREATE: - job.state().dsc = ml::make_create_delta_descriptor(job.source_first(), - job.source_second(), - job.source_length(), - job.destination_first(), - job.destination_length(), - ml::create_delta_options(job.flags()), - ml::delta_expected_result_options(job.expected_result())); + job.state().dsc = detail::ml::make_create_delta_operation(job.source_first(), + job.source_second(), + job.source_length(), + job.destination_first(), + job.destination_length(), + detail::ml::create_delta_options(job.flags()), + detail::create_delta_result(job.expected_result())); break; case DML_OP_DELTA_APPLY: - job.state().dsc = ml::make_apply_delta_descriptor(job.source_first(), - job.source_length(), - 
job.destination_first(), - job.destination_length(), - ml::apply_delta_options(job.flags())); + job.state().dsc = detail::ml::make_apply_delta_operation(job.source_first(), + job.source_length(), + job.destination_first(), + job.destination_length(), + detail::ml::apply_delta_options(job.flags())); break; case DML_OP_DIF_CHECK: - job.state().dsc = ml::make_dif_check_descriptor(job.source_first(), - job.source_length(), - { job.src_ref_tag(), job.src_app_tag_mask(), job.src_app_tag() }, - ml::dif_check_options(job.flags()), - ml::dif_additional_options(job.dif_flags()), - ml::dif_additional_src_options(job.dif_src_flags())); + job.state().dsc = detail::ml::make_dif_check_operation(job.source_first(), + job.source_length(), + { job.src_ref_tag(), job.src_app_tag_mask(), job.src_app_tag() }, + detail::ml::dif_check_options(job.flags()), + detail::ml::dif_specific_options(job.dif_flags()), + detail::ml::dif_source_options(job.dif_src_flags())); break; case DML_OP_DIF_INSERT: - job.state().dsc = ml::make_dif_insert_descriptor(job.source_first(), - job.destination_first(), - job.source_length(), - { job.dst_ref_tag(), job.dst_app_tag_mask(), job.dst_app_tag() }, - ml::dif_insert_options(job.flags()), - ml::dif_additional_options(job.dif_flags()), - ml::dif_additional_dst_options(job.dif_dst_flags())); + job.state().dsc = detail::ml::make_dif_insert_operation(job.source_first(), + job.destination_first(), + job.source_length(), + { job.dst_ref_tag(), job.dst_app_tag_mask(), job.dst_app_tag() }, + detail::ml::dif_insert_options(job.flags()), + detail::ml::dif_specific_options(job.dif_flags()), + detail::ml::dif_destination_options(job.dif_dst_flags())); break; case DML_OP_DIF_STRIP: - job.state().dsc = ml::make_dif_strip_descriptor(job.source_first(), - job.destination_first(), - job.source_length(), - { job.src_ref_tag(), job.src_app_tag_mask(), job.src_app_tag() }, - ml::dif_strip_options(job.flags()), - ml::dif_additional_options(job.dif_flags()), - 
ml::dif_additional_src_options(job.dif_src_flags())); + job.state().dsc = detail::ml::make_dif_strip_operation(job.source_first(), + job.destination_first(), + job.source_length(), + { job.src_ref_tag(), job.src_app_tag_mask(), job.src_app_tag() }, + detail::ml::dif_strip_options(job.flags()), + detail::ml::dif_specific_options(job.dif_flags()), + detail::ml::dif_source_options(job.dif_src_flags())); break; case DML_OP_DIF_UPDATE: - job.state().dsc = ml::make_dif_update_descriptor(job.source_first(), - job.destination_first(), - job.source_length(), - { job.src_ref_tag(), job.src_app_tag_mask(), job.src_app_tag() }, - { job.dst_ref_tag(), job.dst_app_tag_mask(), job.dst_app_tag() }, - ml::dif_update_options(job.flags()), - ml::dif_additional_options(job.dif_flags()), - ml::dif_additional_src_options(job.dif_src_flags()), - ml::dif_additional_dst_options(job.dif_dst_flags())); + job.state().dsc = detail::ml::make_dif_update_operation(job.source_first(), + job.destination_first(), + job.source_length(), + { job.src_ref_tag(), job.src_app_tag_mask(), job.src_app_tag() }, + { job.dst_ref_tag(), job.dst_app_tag_mask(), job.dst_app_tag() }, + detail::ml::dif_update_options(job.flags()), + detail::ml::dif_specific_options(job.dif_flags()), + detail::ml::dif_source_options(job.dif_src_flags()), + detail::ml::dif_destination_options(job.dif_dst_flags())); break; case DML_OP_CACHE_FLUSH: - job.state().dsc = ml::make_cache_flush_descriptor(job.destination_first(), - job.destination_length(), - ml::cache_flush_options(job.flags())); + job.state().dsc = detail::ml::make_cache_flush_operation(job.destination_first(), + job.destination_length(), + detail::ml::cache_flush_options(job.flags())); break; default: - job.state().dsc = ml::descriptor{}; + job.state().dsc = dml::detail::ml::operation{}; } } } // namespace dml diff --git a/sources/c_api/include/write_result.hpp b/sources/c_api/include/write_result.hpp index c389174..9c63faf 100644 --- 
a/sources/c_api/include/write_result.hpp +++ b/sources/c_api/include/write_result.hpp @@ -17,8 +17,6 @@ #ifndef DML_WRITE_RESULT_HPP #define DML_WRITE_RESULT_HPP -#include - #include "status.hpp" namespace dml @@ -98,130 +96,100 @@ namespace dml inline dml_status_t write_result_nop(job_view job) noexcept { - auto result_view = ml::views::nop_result(job.state().record); - - return to_own_status(static_cast(result_view.status())); + return to_own_status(dml::detail::ml::get_status(job.state().record)); } inline dml_status_t write_result_batch(job_view job) noexcept { - auto result_view = ml::views::batch_result(job.state().record); - - return to_own_status(static_cast(result_view.status())); + return to_own_status(dml::detail::ml::get_status(job.state().record)); } inline dml_status_t write_result_drain(job_view job) noexcept { - auto result_view = ml::views::drain_result(job.state().record); - - return to_own_status(static_cast(result_view.status())); + return to_own_status(dml::detail::ml::get_status(job.state().record)); } inline dml_status_t write_result_mem_move(job_view job) noexcept { - auto result_view = ml::views::mem_move_result(job.state().record); + job.set_result(detail::ml::get_result(job.state().record)); - job.set_result(result_view.result()); - - return to_own_status(static_cast(result_view.status())); + return to_own_status(dml::detail::ml::get_status(job.state().record)); } inline dml_status_t write_result_fill(job_view job) noexcept { - auto result_view = ml::views::fill_result(job.state().record); - - return to_own_status(static_cast(result_view.status())); + return to_own_status(dml::detail::ml::get_status(job.state().record)); } inline dml_status_t write_result_dualcast(job_view job) noexcept { - auto result_view = ml::views::fill_result(job.state().record); - - return to_own_status(static_cast(result_view.status())); + return to_own_status(dml::detail::ml::get_status(job.state().record)); } inline dml_status_t write_result_crc(job_view job) 
noexcept { - auto result_view = ml::views::crc_result(job.state().record); - - job.set_crc(result_view.crc_value()); + job.set_crc(dml::detail::ml::get_crc_value(job.state().record)); - return to_own_status(static_cast(result_view.status())); + return to_own_status(dml::detail::ml::get_status(job.state().record)); } inline dml_status_t write_result_compare(job_view job) noexcept { - auto result_view = ml::views::compare_result(job.state().record); + job.set_offset(dml::detail::ml::get_bytes_completed(job.state().record)); + job.set_result(dml::detail::ml::get_result(job.state().record)); - job.set_offset(result_view.bytes_completed()); - job.set_result(result_view.result()); - - return to_own_status(static_cast(result_view.status())); + return to_own_status(dml::detail::ml::get_status(job.state().record)); } inline dml_status_t write_result_create_delta(job_view job) noexcept { - auto result_view = ml::views::create_delta_result(job.state().record); - //job.set_offset(); - job.set_destination_length(result_view.delta_record_size()); - job.set_result(result_view.result()); + job.set_destination_length(dml::detail::ml::get_delta_record_size(job.state().record)); + job.set_result(dml::detail::ml::get_result(job.state().record)); job.set_offset(*reinterpret_cast(job.destination_first())); - return to_own_status(static_cast(result_view.status())); + return to_own_status(dml::detail::ml::get_status(job.state().record)); } inline dml_status_t write_result_apply_delta(job_view job) noexcept { - auto result_view = ml::views::apply_delta_result(job.state().record); - - return to_own_status(static_cast(result_view.status())); + return to_own_status(dml::detail::ml::get_status(job.state().record)); } inline dml_status_t write_result_dif_check(job_view job) noexcept { - auto result_view = ml::views::dif_check_result(job.state().record); - - job.set_offset(result_view.bytes_completed()); - job.set_result(result_view.dif_status()); + 
job.set_offset(dml::detail::ml::get_bytes_completed(job.state().record)); + job.set_result(dml::detail::ml::get_result(job.state().record)); - return to_own_status(static_cast(result_view.status())); + return to_own_status(dml::detail::ml::get_status(job.state().record)); } inline dml_status_t write_result_dif_insert(job_view job) noexcept { - auto result_view = ml::views::dif_insert_result(job.state().record); + job.set_offset(dml::detail::ml::get_bytes_completed(job.state().record)); - job.set_offset(result_view.bytes_completed()); - - return to_own_status(static_cast(result_view.status())); + return to_own_status(dml::detail::ml::get_status(job.state().record)); } inline dml_status_t write_result_dif_strip(job_view job) noexcept { - auto result_view = ml::views::dif_strip_result(job.state().record); - - job.set_offset(result_view.bytes_completed()); - job.set_result(result_view.dif_status()); + job.set_offset(dml::detail::ml::get_bytes_completed(job.state().record)); + job.set_result(dml::detail::ml::get_result(job.state().record)); - return to_own_status(static_cast(result_view.status())); + return to_own_status(dml::detail::ml::get_status(job.state().record)); } inline dml_status_t write_result_dif_update(job_view job) noexcept { - auto result_view = ml::views::dif_update_result(job.state().record); + job.set_offset(dml::detail::ml::get_bytes_completed(job.state().record)); + job.set_result(dml::detail::ml::get_result(job.state().record)); - job.set_offset(result_view.bytes_completed()); - job.set_result(result_view.dif_status()); - - return to_own_status(static_cast(result_view.status())); + return to_own_status(dml::detail::ml::get_status(job.state().record)); } inline dml_status_t write_result_cache_flush(job_view job) noexcept { - auto result_view = ml::views::cache_flush_result(job.state().record); - - return to_own_status(static_cast(result_view.status())); + return to_own_status(dml::detail::ml::get_status(job.state().record)); } } // namespace dml diff 
--git a/sources/core/CMakeLists.txt b/sources/core/CMakeLists.txt new file mode 100644 index 0000000..ac16653 --- /dev/null +++ b/sources/core/CMakeLists.txt @@ -0,0 +1,84 @@ +# +# Copyright 2020-2021 Intel Corporation. +# +# This software and the related documents are Intel copyrighted materials, +# and your use of them is governed by the express license under which they +# were provided to you ("License"). Unless the License provides otherwise, +# you may not use, modify, copy, publish, distribute, disclose or transmit +# this software or the related documents without Intel's prior written +# permission. +# +# This software and the related documents are provided as is, with no +# express or implied warranties, other than those that are expressly +# stated in the License. +# + +add_library(dml_core OBJECT + # Core sources + src/software_device.cpp + src/hardware_device.cpp + src/nop.cpp + src/batch.cpp + src/drain.cpp + src/mem_move.cpp + src/fill.cpp + src/compare.cpp + src/compare_pattern.cpp + src/create_delta.cpp + src/apply_delta.cpp + src/dualcast.cpp + src/crc.cpp + src/copy_crc.cpp + src/dif_check.cpp + src/dif_insert.cpp + src/dif_strip.cpp + src/dif_update.cpp + src/cache_flush.cpp + src/kernels.hpp + src/validation.cpp + + include/core/operations.hpp + include/core/descriptor_views.hpp + include/core/completion_record_views.hpp + include/core/validation.hpp + include/core/device.hpp + include/core/types.hpp + ) + +target_link_libraries(dml_core + PRIVATE dml_hw_dispatcher + PRIVATE dml_sw_dispatcher + PRIVATE dml_dif_impl + ) +target_include_directories(dml_core + PUBLIC include + PUBLIC ../../include + ) +target_sources(dml_core + PUBLIC $ + PUBLIC $ + + PUBLIC $ + PUBLIC $ + + PUBLIC $ + PUBLIC $ + ) +target_compile_features(dml_core + PUBLIC cxx_std_17 + ) +target_compile_options(dml_core + PRIVATE ${DML_QUALITY_OPTIONS} + PRIVATE ${DML_CPP_PRIVATE_OPTIONS} + ) +target_compile_definitions(dml_core + PRIVATE $ + ) + +if (DML_HW) + 
target_compile_definitions(dml_core PRIVATE DML_HW) +endif () + +add_subdirectory(src/sw_dispatcher) +add_subdirectory(src/hw_dispatcher) +add_subdirectory(src/dif_impl) diff --git a/include/dml/cpp/middle_layer/result_views.hpp b/sources/core/include/core/completion_record_views.hpp similarity index 64% rename from include/dml/cpp/middle_layer/result_views.hpp rename to sources/core/include/core/completion_record_views.hpp index 91da013..d1237e9 100644 --- a/include/dml/cpp/middle_layer/result_views.hpp +++ b/sources/core/include/core/completion_record_views.hpp @@ -1,33 +1,27 @@ /* -* Copyright 2021 Intel Corporation. -* -* This software and the related documents are Intel copyrighted materials, -* and your use of them is governed by the express license under which they -* were provided to you ("License"). Unless the License provides otherwise, -* you may not use, modify, copy, publish, distribute, disclose or transmit -* this software or the related documents without Intel's prior written -* permission. -* -* This software and the related documents are provided as is, with no -* express or implied warranties, other than those that are expressly -* stated in the License. -* -*/ - -/** - * @date 05/19/2021 - * @brief Contains definitions of @ref dml::ml::completion_record type + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * */ -#ifndef DML_ML_RESULT_VIEWS_HPP -#define DML_ML_RESULT_VIEWS_HPP +#ifndef DML_CORE_COMPLETION_RECORD_VIEWS_HPP +#define DML_CORE_COMPLETION_RECORD_VIEWS_HPP -#include "completion_record.hpp" -#include "types.hpp" +#include -namespace dml::ml::views +namespace dml::core { - class any_result + class any_completion_record { private: struct offsets @@ -39,7 +33,7 @@ namespace dml::ml::views }; public: - explicit any_result(completion_record& record): record_(record) + explicit any_completion_record(completion_record& record): record_(record) { } @@ -67,21 +61,21 @@ namespace dml::ml::views completion_record& record_; }; - class nop_result: public any_result + class nop_completion_record: public any_completion_record { public: - using any_result::any_result; + using any_completion_record::any_completion_record; private: - using any_result::result; - using any_result::bytes_completed; - using any_result::fault_address; + using any_completion_record::result; + using any_completion_record::bytes_completed; + using any_completion_record::fault_address; }; - class batch_result: public any_result + class batch_completion_record: public any_completion_record { public: - using any_result::any_result; + using any_completion_record::any_completion_record; [[nodiscard]] transfer_size_t& descriptors_completed() const noexcept { @@ -89,48 +83,48 @@ namespace dml::ml::views } private: - using any_result::result; + using any_completion_record::result; }; - class drain_result: public any_result + class drain_completion_record: public any_completion_record { public: - using any_result::any_result; + using any_completion_record::any_completion_record; private: - using any_result::result; - using any_result::bytes_completed; - using any_result::fault_address; + using any_completion_record::result; + using any_completion_record::bytes_completed; + using any_completion_record::fault_address; }; - class mem_move_result: public any_result + class mem_move_completion_record: public 
any_completion_record { public: - using any_result::any_result; + using any_completion_record::any_completion_record; }; - class fill_result: public any_result + class fill_completion_record: public any_completion_record { public: - using any_result::any_result; + using any_completion_record::any_completion_record; private: - using any_result::result; + using any_completion_record::result; }; - class compare_result: public any_result + class compare_completion_record: public any_completion_record { public: - using any_result::any_result; + using any_completion_record::any_completion_record; }; - class compare_pattern_result: public any_result + class compare_pattern_completion_record: public any_completion_record { public: - using any_result::any_result; + using any_completion_record::any_completion_record; }; - class create_delta_result: public any_result + class create_delta_completion_record: public any_completion_record { private: struct offsets @@ -139,7 +133,7 @@ namespace dml::ml::views }; public: - using any_result::any_result; + using any_completion_record::any_completion_record; [[nodiscard]] transfer_size_t& delta_record_size() const noexcept { @@ -147,25 +141,25 @@ namespace dml::ml::views } }; - class apply_delta_result: public any_result + class apply_delta_completion_record: public any_completion_record { public: - using any_result::any_result; + using any_completion_record::any_completion_record; private: - using any_result::result; + using any_completion_record::result; }; - class dualcast_result: public any_result + class dualcast_completion_record: public any_completion_record { public: - using any_result::any_result; + using any_completion_record::any_completion_record; private: - using any_result::result; + using any_completion_record::result; }; - class crc_result: public any_result + class crc_completion_record: public any_completion_record { private: struct offsets @@ -174,7 +168,7 @@ namespace dml::ml::views }; public: - using 
any_result::any_result; + using any_completion_record::any_completion_record; [[nodiscard]] crc_value_t& crc_value() const noexcept { @@ -182,10 +176,10 @@ namespace dml::ml::views } private: - using any_result::result; + using any_completion_record::result; }; - class dif_check_result: public any_result + class dif_check_completion_record: public any_completion_record { private: struct offsets @@ -196,7 +190,7 @@ namespace dml::ml::views }; public: - using any_result::any_result; + using any_completion_record::any_completion_record; [[nodiscard]] dif_status_t& dif_status() const noexcept { @@ -219,10 +213,10 @@ namespace dml::ml::views } private: - using any_result::result; + using any_completion_record::result; }; - class dif_insert_result: public any_result + class dif_insert_completion_record: public any_completion_record { private: struct offsets @@ -233,7 +227,7 @@ namespace dml::ml::views }; public: - using any_result::any_result; + using any_completion_record::any_completion_record; [[nodiscard]] dif_ref_tag_t& destination_ref_tag() const noexcept { @@ -251,10 +245,10 @@ namespace dml::ml::views } private: - using any_result::result; + using any_completion_record::result; }; - class dif_strip_result: public any_result + class dif_strip_completion_record: public any_completion_record { private: struct offsets @@ -265,7 +259,7 @@ namespace dml::ml::views }; public: - using any_result::any_result; + using any_completion_record::any_completion_record; [[nodiscard]] dif_status_t& dif_status() const noexcept { @@ -288,10 +282,10 @@ namespace dml::ml::views } private: - using any_result::result; + using any_completion_record::result; }; - class dif_update_result: public any_result + class dif_update_completion_record: public any_completion_record { private: struct offsets @@ -305,7 +299,7 @@ namespace dml::ml::views }; public: - using any_result::any_result; + using any_completion_record::any_completion_record; [[nodiscard]] dif_status_t& dif_status() const 
noexcept { @@ -343,17 +337,17 @@ namespace dml::ml::views } private: - using any_result::result; + using any_completion_record::result; }; - class cache_flush_result: public any_result + class cache_flush_completion_record: public any_completion_record { public: - using any_result::any_result; + using any_completion_record::any_completion_record; private: - using any_result::result; + using any_completion_record::result; }; -} // namespace dml::ml::views +} // namespace dml::core -#endif //DML_ML_RESULT_VIEWS_HPP +#endif //DML_CORE_COMPLETION_RECORD_VIEWS_HPP diff --git a/include/dml/cpp/middle_layer/descriptor_views.hpp b/sources/core/include/core/descriptor_views.hpp similarity index 98% rename from include/dml/cpp/middle_layer/descriptor_views.hpp rename to sources/core/include/core/descriptor_views.hpp index 3047e5a..3d78cc6 100644 --- a/include/dml/cpp/middle_layer/descriptor_views.hpp +++ b/sources/core/include/core/descriptor_views.hpp @@ -14,13 +14,12 @@ * */ -#ifndef DML_ML_DESCRIPTOR_VIEWS_HPP -#define DML_ML_DESCRIPTOR_VIEWS_HPP +#ifndef DML_CORE_DESCRIPTOR_VIEW_HPP +#define DML_CORE_DESCRIPTOR_VIEW_HPP -#include "descriptor.hpp" -#include "types.hpp" +#include -namespace dml::ml::views +namespace dml::core { class any_descriptor { @@ -554,6 +553,6 @@ namespace dml::ml::views private: using any_descriptor::source_address; }; -} // namespace dml::ml::views +} // namespace dml::core -#endif //DML_ML_DESCRIPTOR_VIEWS_HPP +#endif //DML_CORE_DESCRIPTOR_VIEW_HPP diff --git a/sources/core/include/core/device.hpp b/sources/core/include/core/device.hpp new file mode 100644 index 0000000..21593f6 --- /dev/null +++ b/sources/core/include/core/device.hpp @@ -0,0 +1,38 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). 
Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#ifndef DML_CORE_EXECUTION_DEVICE_HPP +#define DML_CORE_EXECUTION_DEVICE_HPP + +#include +#include + +namespace dml::core +{ + class software_device + { + public: + [[nodiscard]] dml::detail::submission_status submit(descriptor& dsc, completion_record& completion_record) noexcept; + }; + + class hardware_device + { + public: + [[nodiscard]] dml::detail::submission_status submit(descriptor& descriptor, completion_record& completion_record) noexcept; + }; +} // namespace dml::core + +#endif //DML_CORE_EXECUTION_DEVICE_HPP diff --git a/sources/core/include/core/operations.hpp b/sources/core/include/core/operations.hpp new file mode 100644 index 0000000..fbfd6ac --- /dev/null +++ b/sources/core/include/core/operations.hpp @@ -0,0 +1,47 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#ifndef DML_CORE_OPERATIONS_HPP +#define DML_CORE_OPERATIONS_HPP + +#include +#include + +namespace dml::core +{ + enum class operation : operation_t + { + nop = 0x00, + batch = 0x01, + drain = 0x02, + memory_move = 0x03, + fill = 0x04, + compare = 0x05, + compare_pattern = 0x06, + create_delta = 0x07, + apply_delta = 0x08, + dualcast = 0x09, + crc = 0x10, + copy_crc = 0x11, + dif_check = 0x12, + dif_insert = 0x13, + dif_strip = 0x14, + dif_update = 0x15, + cache_flush = 0x20 + }; +} // namespace dml::core + +#endif //DML_CORE_OPERATIONS_HPP diff --git a/sources/core/include/core/types.hpp b/sources/core/include/core/types.hpp new file mode 100644 index 0000000..9ff00d2 --- /dev/null +++ b/sources/core/include/core/types.hpp @@ -0,0 +1,67 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#ifndef DML_CORE_TYPES_HPP +#define DML_CORE_TYPES_HPP + +#include + +namespace dml::core +{ + using byte_t = dml::detail::byte_t; + + struct alignas(64u) descriptor + { + byte_t bytes[64u]{}; + }; + + struct alignas(32u) completion_record + { + byte_t bytes[32u]{}; + }; + + using status_t = dml::detail::status_t; + + using transfer_size_t = dml::detail::transfer_size_t; + + using operation_t = dml::detail::operation_t; + + using flags_t = dml::detail::flags_t; + + using operation_specific_flags_t = dml::detail::operation_specific_flags_t; + + using completion_interrupt_handle_t = std::uint16_t; + + using transfer_size_t = dml::detail::transfer_size_t; + + using address_t = uint64_t; + + using pattern_t = dml::detail::pattern_t; + + using result_t = dml::detail::result_t; + + using crc_value_t = dml::detail::crc_value_t; + + using dif_flags_t = dml::detail::dif_flags_t; + + using dif_status_t = dml::detail::dif_status_t; + + using dif_ref_tag_t = dml::detail::dif_ref_tag_t; + + using dif_app_tag_t = dml::detail::dif_app_tag_t; +} // namespace dml::core + +#endif //DML_CORE_TYPES_HPP diff --git a/sources/core/include/core/validation.hpp b/sources/core/include/core/validation.hpp new file mode 100644 index 0000000..8705848 --- /dev/null +++ b/sources/core/include/core/validation.hpp @@ -0,0 +1,28 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#ifndef DML_CORE_VALIDATION_HPP +#define DML_CORE_VALIDATION_HPP + +#include +#include + +namespace dml::core +{ + [[nodiscard]] dml::detail::validation_status validate(descriptor &dsc) noexcept; +} // namespace dml::core + +#endif //DML_CORE_VALIDATION_HPP diff --git a/sources/core/src/apply_delta.cpp b/sources/core/src/apply_delta.cpp new file mode 100644 index 0000000..e21d06f --- /dev/null +++ b/sources/core/src/apply_delta.cpp @@ -0,0 +1,37 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#include +#include +#include + +#include "immintrin.h" +#include "kernels.hpp" + +namespace dml::core::kernels +{ + void apply_delta(apply_delta_descriptor dsc, apply_delta_completion_record record) noexcept + { + const auto dst = reinterpret_cast(dsc.destination_address()); + const auto delta_record = reinterpret_cast(dsc.delta_record_address()); + const auto delta_record_size = dsc.delta_record_size(); + + dispatch::apply_delta(delta_record, dst, delta_record_size); + + _mm_mfence(); + record.status() = to_underlying(dml::detail::execution_status::success); + } +} // namespace dml::core::kernels diff --git a/sources/core/src/batch.cpp b/sources/core/src/batch.cpp new file mode 100644 index 0000000..e48cf81 --- /dev/null +++ b/sources/core/src/batch.cpp @@ -0,0 +1,114 @@ +/* + * Copyright 2021 Intel Corporation. 
+ * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#include +#include +#include +#include +#include + +#include "immintrin.h" +#include "kernels.hpp" + +namespace dml::core::kernels +{ + void batch(batch_descriptor dsc, batch_completion_record record) noexcept + { + const auto operations = reinterpret_cast(dsc.descriptor_list_address()); + const auto descriptors_count = dsc.descriptors_count(); + + auto status = dml::detail::execution_status::processing; + auto index = size_t(0); + + while (index < descriptors_count && status == dml::detail::execution_status::processing) + { + auto ¤t_dsc = operations[index]; + auto ¤t_record = *reinterpret_cast(any_descriptor(current_dsc).completion_record_address()); + + auto op = operation(any_descriptor(current_dsc).operation()); + + switch (op) + { + case operation::nop: + kernels::nop(nop_descriptor(current_dsc), nop_completion_record(current_record)); + break; + case operation::memory_move: + kernels::mem_move(mem_move_descriptor(current_dsc), mem_move_completion_record(current_record)); + break; + case operation::fill: + kernels::fill(fill_descriptor(current_dsc), fill_completion_record(current_record)); + break; + case operation::compare: + kernels::compare(compare_descriptor(current_dsc), compare_completion_record(current_record)); + break; + case operation::compare_pattern: + kernels::compare_pattern(compare_pattern_descriptor(current_dsc), compare_pattern_completion_record(current_record)); + 
break; + case operation::create_delta: + kernels::create_delta(create_delta_descriptor(current_dsc), create_delta_completion_record(current_record)); + break; + case operation::apply_delta: + kernels::apply_delta(apply_delta_descriptor(current_dsc), apply_delta_completion_record(current_record)); + break; + case operation::dualcast: + kernels::dualcast(dualcast_descriptor(current_dsc), dualcast_completion_record(current_record)); + break; + case operation::crc: + kernels::crc(crc_descriptor(current_dsc), crc_completion_record(current_record)); + break; + case operation::copy_crc: + kernels::copy_crc(copy_crc_descriptor(current_dsc), crc_completion_record(current_record)); + break; + case operation::dif_check: + kernels::dif_check(dif_check_descriptor(current_dsc), dif_check_completion_record(current_record)); + break; + case operation::dif_insert: + kernels::dif_insert(dif_insert_descriptor(current_dsc), dif_insert_completion_record(current_record)); + break; + case operation::dif_strip: + kernels::dif_strip(dif_strip_descriptor(current_dsc), dif_strip_completion_record(current_record)); + break; + case operation::dif_update: + kernels::dif_update(dif_update_descriptor(current_dsc), dif_update_completion_record(current_record)); + break; + case operation::cache_flush: + kernels::cache_flush(cache_flush_descriptor(current_dsc), cache_flush_completion_record(current_record)); + break; + default: + status = dml::detail::execution_status::batch_error; + } + + if (any_completion_record(current_record).status() != to_underlying(dml::detail::execution_status::success)) + { + status = dml::detail::execution_status::batch_error; + } + else + { + ++index; + } + } + + if (index == descriptors_count) + { + status = dml::detail::execution_status::success; + } + + record.descriptors_completed() = static_cast(index); + + _mm_mfence(); + record.status() = to_underlying(status); + } +} // namespace dml::core::kernels diff --git a/sources/core/src/cache_flush.cpp 
b/sources/core/src/cache_flush.cpp new file mode 100644 index 0000000..4df5737 --- /dev/null +++ b/sources/core/src/cache_flush.cpp @@ -0,0 +1,41 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#include +#include +#include +#include + +#include "immintrin.h" +#include "kernels.hpp" + +namespace dml::core::kernels +{ + void cache_flush(cache_flush_descriptor dsc, cache_flush_completion_record record) noexcept + { + const auto dst = reinterpret_cast(dsc.destination_address()); + const auto transfer_size = dsc.transfer_size(); + const auto invalidate_cache = intersects(dsc.flags(), dml::detail::cache_flush_flag::cache_control); + + if (invalidate_cache) + dispatch::cache_flush(dst, transfer_size); + else + dispatch::cache_write_back(dst, transfer_size); + + _mm_mfence(); + record.status() = to_underlying(dml::detail::execution_status::success); + } +} // namespace dml::core::kernels diff --git a/sources/core/src/compare.cpp b/sources/core/src/compare.cpp new file mode 100644 index 0000000..bd21148 --- /dev/null +++ b/sources/core/src/compare.cpp @@ -0,0 +1,45 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). 
Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "immintrin.h" +#include "kernels.hpp" + +namespace dml::core::kernels +{ + void compare(compare_descriptor dsc, compare_completion_record record) noexcept + { + const auto src1 = reinterpret_cast(dsc.source_1_address()); + const auto src2 = reinterpret_cast(dsc.source_2_address()); + const auto transfer_size = dsc.transfer_size(); + const auto expected_result = dsc.expected_result(); + const auto check_result = intersects(dsc.flags(), dml::detail::compare_flag::check_result); + + std::tie(record.bytes_completed(), record.result()) = dispatch::compare(src1, src2, transfer_size); + + _mm_mfence(); + record.status() = + to_underlying(check_result ? (expected_result == record.result()) ? dml::detail::execution_status::success + : dml::detail::execution_status::false_predicate_success + : dml::detail::execution_status::success); + } +} // namespace dml::core::kernels diff --git a/sources/core/src/compare_pattern.cpp b/sources/core/src/compare_pattern.cpp new file mode 100644 index 0000000..fd456a4 --- /dev/null +++ b/sources/core/src/compare_pattern.cpp @@ -0,0 +1,45 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. 
+ * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "immintrin.h" +#include "kernels.hpp" + +namespace dml::core::kernels +{ + void compare_pattern(compare_pattern_descriptor dsc, compare_pattern_completion_record record) noexcept + { + const auto pattern = dsc.pattern(); + const auto src = reinterpret_cast(dsc.source_address()); + const auto transfer_size = dsc.transfer_size(); + const auto expected_result = dsc.expected_result(); + const auto check_result = intersects(dsc.flags(), dml::detail::compare_flag::check_result); + + std::tie(record.bytes_completed(), record.result()) = dispatch::compare_pattern(pattern, src, transfer_size); + + _mm_mfence(); + record.status() = + to_underlying(check_result ? (expected_result == record.result()) ? dml::detail::execution_status::success + : dml::detail::execution_status::false_predicate_success + : dml::detail::execution_status::success); + } +} // namespace dml::core::kernels diff --git a/sources/core/src/copy_crc.cpp b/sources/core/src/copy_crc.cpp new file mode 100644 index 0000000..b120251 --- /dev/null +++ b/sources/core/src/copy_crc.cpp @@ -0,0 +1,76 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#include +#include +#include +#include + +#include "immintrin.h" +#include "kernels.hpp" + +namespace dml::core::kernels +{ + void copy_crc(copy_crc_descriptor dsc, crc_completion_record record) noexcept + { + const auto src = reinterpret_cast(dsc.source_address()); + const auto dst = reinterpret_cast(dsc.destination_address()); + const auto transfer_size = dsc.transfer_size(); + const auto crc_seed = dsc.crc_seed(); + const auto bypass_reflection = + intersects(dsc.operation_specific_flags(), dml::detail::crc_specific_flag::bypass_crc_inversion_and_reflection); + const auto bypass_data_reflection = + intersects(dsc.operation_specific_flags(), dml::detail::crc_specific_flag::bypass_data_reflection); + + dispatch::mem_move(src, dst, transfer_size); + + auto reverse = [](uint32_t value) + { + value = (value & 0x55555555u) << 1u | (value & 0xAAAAAAAAu) >> 1u; + value = (value & 0x33333333u) << 2u | (value & 0xCCCCCCCCu) >> 2u; + value = (value & 0x0F0F0F0Fu) << 4u | (value & 0xF0F0F0F0u) >> 4u; + value = (value & 0x00FF00FFu) << 8u | (value & 0xFF00FF00u) >> 8u; + value = (value & 0x0000FFFFu) << 16u | (value & 0xFFFF0000u) >> 16u; + + return value; + }; + + auto crc_value = crc_seed; + + // Bypass inversion and use reverse bit order for CRC completion_record + if (!bypass_reflection) + { + crc_value = ~(crc_value); + crc_value = reverse(crc_value); + } + + // Bypass Data Reflection in case if DML_FLAG_DATA_REFLECTION set + crc_value = + !bypass_data_reflection ? 
dispatch::crc_reflected(src, transfer_size, crc_value) : dispatch::crc(src, transfer_size, crc_value); + + // Bypass inversion and use reverse bit order for CRC completion_record + if (!bypass_reflection) + { + crc_value = reverse(crc_value); + crc_value = ~(crc_value); + } + + record.crc_value() = crc_value; + + _mm_mfence(); + record.status() = to_underlying(dml::detail::execution_status::success); + } +} // namespace dml::core::kernels diff --git a/sources/core/src/crc.cpp b/sources/core/src/crc.cpp new file mode 100644 index 0000000..95d3f06 --- /dev/null +++ b/sources/core/src/crc.cpp @@ -0,0 +1,73 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#include +#include +#include +#include + +#include "immintrin.h" +#include "kernels.hpp" + +namespace dml::core::kernels +{ + void crc(crc_descriptor dsc, crc_completion_record record) noexcept + { + const auto src = reinterpret_cast(dsc.source_address()); + const auto transfer_size = dsc.transfer_size(); + const auto crc_seed = dsc.crc_seed(); + const auto bypass_reflection = + intersects(dsc.operation_specific_flags(), dml::detail::crc_specific_flag::bypass_crc_inversion_and_reflection); + const auto bypass_data_reflection = + intersects(dsc.operation_specific_flags(), dml::detail::crc_specific_flag::bypass_data_reflection); + + auto reverse = [](uint32_t value) + { + value = (value & 0x55555555u) << 1u | (value & 0xAAAAAAAAu) >> 1u; + value = (value & 0x33333333u) << 2u | (value & 0xCCCCCCCCu) >> 2u; + value = (value & 0x0F0F0F0Fu) << 4u | (value & 0xF0F0F0F0u) >> 4u; + value = (value & 0x00FF00FFu) << 8u | (value & 0xFF00FF00u) >> 8u; + value = (value & 0x0000FFFFu) << 16u | (value & 0xFFFF0000u) >> 16u; + + return value; + }; + + auto crc_value = crc_seed; + + // Bypass inversion and use reverse bit order for CRC completion_record + if (!bypass_reflection) + { + crc_value = ~(crc_value); + crc_value = reverse(crc_value); + } + + // Bypass Data Reflection in case if DML_FLAG_DATA_REFLECTION set + crc_value = + !bypass_data_reflection ? 
dispatch::crc_reflected(src, transfer_size, crc_value) : dispatch::crc(src, transfer_size, crc_value); + + // Bypass inversion and use reverse bit order for CRC completion_record + if (!bypass_reflection) + { + crc_value = reverse(crc_value); + crc_value = ~(crc_value); + } + + record.crc_value() = crc_value; + + _mm_mfence(); + record.status() = to_underlying(dml::detail::execution_status::success); + } +} // namespace dml::core::kernels diff --git a/sources/core/src/create_delta.cpp b/sources/core/src/create_delta.cpp new file mode 100644 index 0000000..d42c2ce --- /dev/null +++ b/sources/core/src/create_delta.cpp @@ -0,0 +1,47 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#include +#include +#include +#include +#include + +#include "immintrin.h" +#include "kernels.hpp" + +namespace dml::core::kernels +{ + void create_delta(create_delta_descriptor dsc, create_delta_completion_record record) noexcept + { + const auto src1 = reinterpret_cast(dsc.source_1_address()); + const auto src2 = reinterpret_cast(dsc.source_2_address()); + const auto delta_record = reinterpret_cast(dsc.delta_record_address()); + const auto delta_max_size = dsc.maximum_delta_record_size(); + const auto transfer_size = dsc.transfer_size(); + const auto expected_result = dsc.expected_result_mask(); + const auto check_result = intersects(dsc.flags(), dml::detail::create_delta_flag::check_result); + + std::tie(record.delta_record_size(), record.result()) = + dispatch::create_delta(src1, src2, transfer_size, delta_record, delta_max_size); + + _mm_mfence(); + record.status() = to_underlying(check_result ? ((expected_result >> 1) == record.result()) + ? dml::detail::execution_status::success + : dml::detail::execution_status::false_predicate_success + : dml::detail::execution_status::success); + } +} // namespace dml::core::kernels diff --git a/sources/core/src/dif_check.cpp b/sources/core/src/dif_check.cpp new file mode 100644 index 0000000..f21be4b --- /dev/null +++ b/sources/core/src/dif_check.cpp @@ -0,0 +1,66 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#include + +#include +#include +#include + +#include "../../../../include/dml/dmldefs.h" +#include "immintrin.h" +#include "kernels.hpp" + +namespace dml::core::kernels +{ + void dif_check(dif_check_descriptor dsc, dif_check_completion_record record) noexcept + { + const auto src = reinterpret_cast(dsc.source_address()); + const auto transfer_size = dsc.transfer_size(); + const auto options = dsc.flags(); + const auto dif_options = dsc.dif_flags(); + const auto dif_src_options = dsc.source_dif_flags(); + const auto src_app_tag_mask = dsc.source_app_tag_mask(); + + const auto src_ref_tag = dsc.source_ref_tag(); + const auto src_app_tag = dsc.source_app_tag(); + + dml_job_t job; + memset(&job, 0, sizeof(dml_job_t)); + job.source_first_ptr = src; + job.source_length = transfer_size; + job.operation = DML_OP_DIF_CHECK; + job.dif_config.source_reference_tag_seed = src_ref_tag; + job.dif_config.source_application_tag_seed = src_app_tag; + job.dif_config.source_application_tag_mask = src_app_tag_mask; + job.dif_config.block_size = static_cast(dif_options & 0b11); + + // Job API composes DIF flags into one 64-bit value via shifting, check dmldefs.h + job.dif_config.flags = (uint64_t(dif_options) << 16) | dif_src_options; + + job.flags = options; + + auto status = dml_legacy_dif_check(&job); + + record.dif_status() = job.result; + record.bytes_completed() = job.offset; + // TODO: Tags should be written + + _mm_mfence(); + record.status() = to_underlying((status == DML_STATUS_OK) ? dml::detail::execution_status::success + : dml::detail::execution_status::dif_control_error); + } +} // namespace dml::core::kernels diff --git a/sources/core/src/dif_impl/CMakeLists.txt b/sources/core/src/dif_impl/CMakeLists.txt new file mode 100644 index 0000000..1ff8040 --- /dev/null +++ b/sources/core/src/dif_impl/CMakeLists.txt @@ -0,0 +1,34 @@ +# +# Copyright 2021 Intel Corporation. 
+# +# This software and the related documents are Intel copyrighted materials, +# and your use of them is governed by the express license under which they +# were provided to you ("License"). Unless the License provides otherwise, +# you may not use, modify, copy, publish, distribute, disclose or transmit +# this software or the related documents without Intel's prior written +# permission. +# +# This software and the related documents are provided as is, with no +# express or implied warranties, other than those that are expressly +# stated in the License. +# + +add_library(dml_dif_impl OBJECT + # DIFs + dml_dif.h + dml_dif.c + ) + +target_link_libraries(dml_dif_impl + PRIVATE dml_sw_dispatcher + ) +target_include_directories(dml_dif_impl + PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} + PRIVATE ../../../../include + ) +target_compile_features(dml_dif_impl + PUBLIC c_std_11 + ) +target_compile_options(dml_dif_impl + PRIVATE ${DML_QUALITY_OPTIONS} + ) diff --git a/sources/middle_layer/sw_path_legacy/dif.c b/sources/core/src/dif_impl/dml_dif.c similarity index 82% rename from sources/middle_layer/sw_path_legacy/dif.c rename to sources/core/src/dif_impl/dml_dif.c index df2da1a..7b6a238 100644 --- a/sources/middle_layer/sw_path_legacy/dif.c +++ b/sources/core/src/dif_impl/dml_dif.c @@ -14,9 +14,11 @@ * */ -#include "dif.h" +#include "dml_dif.h" -#include +#include +#include +#include #define OWN_DIF_CRC_POLYNOMIAL 0x8BB7u /**< CRC16 T10 polynomial */ @@ -80,17 +82,6 @@ static const uint32_t own_dif_block_sizes[4] = { 512u, 520u, 4096u, 4104u }; #define OWN_F_APPLICATION_TAG_DETECTED(dif_ptr, flags) \ (DML_DIF_FLAG_SRC_F_DETECT_APP_TAG & dif_flags && ((dif_ptr)->application_tag == DML_MAX_16U)) -static inline uint32_t bit_reverse_32u(uint32_t value) -{ - value = (value & 0x55555555u) << 1u | (value & 0xAAAAAAAAu) >> 1u; - value = (value & 0x33333333u) << 2u | (value & 0xCCCCCCCCu) >> 2u; - value = (value & 0x0F0F0F0Fu) << 4u | (value & 0xF0F0F0F0u) >> 4u; - value = (value & 
0x00FF00FFu) << 8u | (value & 0xFF00FF00u) >> 8u; - value = (value & 0x0000FFFFu) << 16u | (value & 0xFFFF0000u) >> 16u; - - return value; -} - static inline uint16_t reverse_bytes_16u(uint16_t value) { union @@ -113,20 +104,49 @@ static inline uint32_t reverse_bytes_32u(uint32_t value) { uint32_t value; uint8_t bytes[4]; - } received_value, reverced_value; + } received_value, reversed_value; received_value.value = value; - reverced_value.bytes[0] = received_value.bytes[3]; - reverced_value.bytes[1] = received_value.bytes[2]; - reverced_value.bytes[2] = received_value.bytes[1]; - reverced_value.bytes[3] = received_value.bytes[0]; + reversed_value.bytes[0] = received_value.bytes[3]; + reversed_value.bytes[1] = received_value.bytes[2]; + reversed_value.bytes[2] = received_value.bytes[1]; + reversed_value.bytes[3] = received_value.bytes[0]; + + return reversed_value.value; +} + +static inline uint16_t calculate_crc_16u(uint16_t crc_value, const uint8_t data, const uint16_t polynomial) +{ + const size_t byte_width = 8; + const size_t crc_bit_count = sizeof(crc_value) * byte_width; + const size_t crc_byte_shift = crc_bit_count - byte_width; + const uint16_t high_bit_mask = 1 << (crc_bit_count - 1); + + crc_value ^= (data << crc_byte_shift); + + for (size_t bit = 0u; bit < byte_width; ++bit) + { + crc_value = (crc_value & high_bit_mask) ? 
((crc_value << 1) ^ polynomial) : (crc_value << 1); + } + + return crc_value; +} + +static uint16_t crc_16u(const uint8_t *src, const uint32_t transfer_size, uint16_t crc_value, const uint16_t polynomial) +{ + for (size_t byte = 0; byte < transfer_size; ++byte) + { + crc_value = calculate_crc_16u(crc_value, src[byte], polynomial); + } - return reverced_value.value; + return crc_value; } -dml_status_t dml_legacy_dif_check(dml_job_t *dml_job_ptr) +int dml_legacy_dif_check(void *dml_job_ptr_) { + dml_job_t *dml_job_ptr = (dml_job_t *)dml_job_ptr_; + // General constants const uint32_t dif_flags = dml_job_ptr->dif_config.flags; const uint32_t block_size = own_dif_block_sizes[dml_job_ptr->dif_config.block_size]; @@ -173,8 +193,8 @@ dml_status_t dml_legacy_dif_check(dml_job_t *dml_job_ptr) if (check_guard) { uint16_t crc = crc_seed; - dmlc_calculate_crc_16u(source_ptr, block_size, &crc, OWN_DIF_CRC_POLYNOMIAL); - crc = reverse_bytes_16u((invert_crc_result) ? ~crc : crc); + crc = crc_16u(source_ptr, block_size, crc, OWN_DIF_CRC_POLYNOMIAL); + crc = reverse_bytes_16u((invert_crc_result) ? 
~crc : crc); if (crc != dif_ptr->guard_tag) { @@ -216,8 +236,10 @@ dml_status_t dml_legacy_dif_check(dml_job_t *dml_job_ptr) return DML_STATUS_OK; } -dml_status_t dml_legacy_dif_insert(dml_job_t *dml_job_ptr) +int dml_legacy_dif_insert(void *dml_job_ptr_) { + dml_job_t *dml_job_ptr = (dml_job_t *)dml_job_ptr_; + const uint32_t dif_flags = dml_job_ptr->dif_config.flags; const uint32_t block_size = own_dif_block_sizes[dml_job_ptr->dif_config.block_size]; const uint32_t block_count = dml_job_ptr->source_length / block_size; @@ -245,10 +267,10 @@ dml_status_t dml_legacy_dif_insert(dml_job_t *dml_job_ptr) uint16_t crc = crc_seed; // Copy - dmlc_copy_8u(source_ptr, destination_ptr, block_size); + dml_ref_mem_move(source_ptr, destination_ptr, block_size); // Calculate CRC - dmlc_calculate_crc_16u(destination_ptr, block_size, &crc, OWN_DIF_CRC_POLYNOMIAL); + crc = crc_16u(destination_ptr, block_size, crc, OWN_DIF_CRC_POLYNOMIAL); // Write data integrity field dif_ptr->application_tag = reverse_bytes_16u(application_tag & application_tag_mask); @@ -265,8 +287,10 @@ dml_status_t dml_legacy_dif_insert(dml_job_t *dml_job_ptr) return DML_STATUS_OK; } -dml_status_t dml_legacy_dif_strip(dml_job_t *dml_job_ptr) +int dml_legacy_dif_strip(void *dml_job_ptr_) { + dml_job_t *dml_job_ptr = (dml_job_t *)dml_job_ptr_; + // General constants const uint32_t block_size = own_dif_block_sizes[dml_job_ptr->dif_config.block_size]; const uint32_t source_step = block_size + sizeof(own_dif_t); @@ -277,7 +301,7 @@ dml_status_t dml_legacy_dif_strip(dml_job_t *dml_job_ptr) uint8_t *destination_ptr = dml_job_ptr->destination_first_ptr; // Check source data - dml_status_t status = dml_legacy_dif_check(dml_job_ptr); + int status = dml_legacy_dif_check(dml_job_ptr); if (status != DML_STATUS_OK) { return status; @@ -286,7 +310,7 @@ dml_status_t dml_legacy_dif_strip(dml_job_t *dml_job_ptr) // Process data for (uint32_t block = 0; block < block_count; block++) { - dmlc_copy_8u(source_ptr, destination_ptr, 
block_size); + dml_ref_mem_move(source_ptr, destination_ptr, block_size); source_ptr += source_step; destination_ptr += block_size; @@ -295,8 +319,10 @@ dml_status_t dml_legacy_dif_strip(dml_job_t *dml_job_ptr) return DML_STATUS_OK; } -dml_status_t dml_legacy_dif_update(dml_job_t * dml_job_ptr) +int dml_legacy_dif_update(void *dml_job_ptr_) { + dml_job_t *dml_job_ptr = (dml_job_t *)dml_job_ptr_; + // General constants const uint32_t dif_flags = dml_job_ptr->dif_config.flags; const uint32_t block_size = own_dif_block_sizes[dml_job_ptr->dif_config.block_size]; @@ -324,7 +350,7 @@ dml_status_t dml_legacy_dif_update(dml_job_t * dml_job_ptr) uint32_t reference_tag = dml_job_ptr->dif_config.destination_reference_tag_seed; // Check Source - const dml_status_t status = dml_legacy_dif_check(dml_job_ptr); + const int status = dml_legacy_dif_check(dml_job_ptr); if (status != DML_STATUS_OK) { return status; @@ -335,13 +361,13 @@ dml_status_t dml_legacy_dif_update(dml_job_t * dml_job_ptr) { own_dif_t *const destination_dif_ptr = (own_dif_t *)(destination_ptr + block_size); - dmlc_copy_8u(source_ptr, destination_ptr, step); + dml_ref_mem_move(source_ptr, destination_ptr, step); // Update DIF if (calculate_crc) { - uint16_t crc = crc_seed; - dmlc_calculate_crc_16u(destination_ptr, block_size, &crc, OWN_DIF_CRC_POLYNOMIAL); + uint16_t crc = crc_seed; + crc = crc_16u(destination_ptr, block_size, crc, OWN_DIF_CRC_POLYNOMIAL); destination_dif_ptr->guard_tag = reverse_bytes_16u((invert_crc_result) ? 
~crc : crc); } diff --git a/sources/middle_layer/sw_path_legacy/dif.h b/sources/core/src/dif_impl/dml_dif.h similarity index 75% rename from sources/middle_layer/sw_path_legacy/dif.h rename to sources/core/src/dif_impl/dml_dif.h index 7fae4c1..cd8a2c9 100644 --- a/sources/middle_layer/sw_path_legacy/dif.h +++ b/sources/core/src/dif_impl/dml_dif.h @@ -17,21 +17,24 @@ #ifndef DML_ML_SW_PATH_LEGACY_DIF_H #define DML_ML_SW_PATH_LEGACY_DIF_H -#include +#include #ifdef __cplusplus extern "C" { #endif -dml_status_t dml_legacy_dif_check(dml_job_t* dml_job_ptr); +typedef void* job_t; -dml_status_t dml_legacy_dif_insert(dml_job_t* dml_job_ptr); +int dml_legacy_dif_check(job_t dml_job_ptr); -dml_status_t dml_legacy_dif_strip(dml_job_t* dml_job_ptr); +int dml_legacy_dif_insert(job_t dml_job_ptr); -dml_status_t dml_legacy_dif_update(dml_job_t* dml_job_ptr); +int dml_legacy_dif_strip(job_t dml_job_ptr); + +int dml_legacy_dif_update(job_t dml_job_ptr); #ifdef __cplusplus } #endif + #endif // DML_ML_SW_PATH_LEGACY_DIF_H diff --git a/sources/core/src/dif_insert.cpp b/sources/core/src/dif_insert.cpp new file mode 100644 index 0000000..5a48484 --- /dev/null +++ b/sources/core/src/dif_insert.cpp @@ -0,0 +1,67 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#include + +#include +#include +#include + +#include "../../../../include/dml/dmldefs.h" +#include "immintrin.h" +#include "kernels.hpp" + +namespace dml::core::kernels +{ + void dif_insert(dif_insert_descriptor dsc, dif_insert_completion_record record) noexcept + { + const auto src = reinterpret_cast(dsc.source_address()); + const auto dst = reinterpret_cast(dsc.destination_address()); + const auto transfer_size = dsc.transfer_size(); + const auto options = dsc.flags(); + const auto dif_options = dsc.dif_flags(); + const auto dif_dst_options = dsc.destination_dif_flags(); + const auto dst_app_tag_mask = dsc.destination_app_tag_mask(); + + auto dst_ref_tag = dsc.destination_ref_tag(); + auto dst_app_tag = dsc.destination_app_tag(); + + dml_job_t job; + memset(&job, 0, sizeof(dml_job_t)); + job.source_first_ptr = src; + job.destination_first_ptr = dst; + job.source_length = transfer_size; + job.operation = DML_OP_DIF_INSERT; + job.dif_config.destination_reference_tag_seed = dst_ref_tag; + job.dif_config.destination_application_tag_seed = dst_app_tag; + job.dif_config.destination_application_tag_mask = dst_app_tag_mask; + job.dif_config.block_size = static_cast(dif_options & 0b11); + + // Job API composes DIF flags into one 64-bit value via shifting, check dmldefs.h + job.dif_config.flags = (uint64_t(dif_options) << 16) | (uint64_t(dif_dst_options) << 8); + + job.flags = options; + + auto status = dml_legacy_dif_insert(&job); + + record.bytes_completed() = job.offset; + // TODO: Tags should be written + + _mm_mfence(); + record.status() = to_underlying((status == DML_STATUS_OK) ? dml::detail::execution_status::success + : dml::detail::execution_status::dif_control_error); + } +} // namespace dml::core::kernels diff --git a/sources/core/src/dif_strip.cpp b/sources/core/src/dif_strip.cpp new file mode 100644 index 0000000..8461a76 --- /dev/null +++ b/sources/core/src/dif_strip.cpp @@ -0,0 +1,68 @@ +/* + * Copyright 2021 Intel Corporation. 
+ * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#include + +#include +#include +#include + +#include "../../../../include/dml/dmldefs.h" +#include "immintrin.h" +#include "kernels.hpp" + +namespace dml::core::kernels +{ + void dif_strip(dif_strip_descriptor dsc, dif_strip_completion_record record) noexcept + { + const auto src = reinterpret_cast(dsc.source_address()); + const auto dst = reinterpret_cast(dsc.destination_address()); + const auto transfer_size = dsc.transfer_size(); + const auto options = dsc.flags(); + const auto dif_options = dsc.dif_flags(); + const auto dif_src_options = dsc.source_dif_flags(); + const auto src_app_tag_mask = dsc.source_app_tag_mask(); + + auto src_ref_tag = dsc.source_ref_tag(); + auto src_app_tag = dsc.source_app_tag(); + + dml_job_t job; + memset(&job, 0, sizeof(dml_job_t)); + job.source_first_ptr = src; + job.destination_first_ptr = dst; + job.source_length = transfer_size; + job.operation = DML_OP_DIF_STRIP; + job.dif_config.source_reference_tag_seed = src_ref_tag; + job.dif_config.source_application_tag_seed = src_app_tag; + job.dif_config.source_application_tag_mask = src_app_tag_mask; + job.dif_config.block_size = static_cast(dif_options & 0b11); + + // Job API composes DIF flags into one 64-bit value via shifting, check dmldefs.h + job.dif_config.flags = (uint64_t(dif_options) << 16) | dif_src_options; + + job.flags = options; + + auto status = dml_legacy_dif_strip(&job); + + 
record.dif_status() = job.result; + record.bytes_completed() = job.offset; + // TODO: Tags should be written + + _mm_mfence(); + record.status() = to_underlying((status == DML_STATUS_OK) ? dml::detail::execution_status::success + : dml::detail::execution_status::dif_control_error); + } +} // namespace dml::core::kernels diff --git a/sources/core/src/dif_update.cpp b/sources/core/src/dif_update.cpp new file mode 100644 index 0000000..4faa051 --- /dev/null +++ b/sources/core/src/dif_update.cpp @@ -0,0 +1,75 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#include + +#include +#include +#include + +#include "../../../../include/dml/dmldefs.h" +#include "immintrin.h" +#include "kernels.hpp" + +namespace dml::core::kernels +{ + void dif_update(dif_update_descriptor dsc, dif_update_completion_record record) noexcept + { + const auto src = reinterpret_cast(dsc.source_address()); + const auto dst = reinterpret_cast(dsc.destination_address()); + const auto transfer_size = dsc.transfer_size(); + const auto options = dsc.flags(); + const auto dif_options = dsc.dif_flags(); + const auto dif_src_options = dsc.source_dif_flags(); + const auto dif_dst_options = dsc.destination_dif_flags(); + const auto src_app_tag_mask = dsc.source_app_tag_mask(); + const auto dst_app_tag_mask = dsc.destination_app_tag_mask(); + + auto src_ref_tag = dsc.source_ref_tag(); + auto dst_ref_tag = dsc.destination_ref_tag(); + auto src_app_tag = dsc.source_app_tag(); + auto dst_app_tag = dsc.destination_app_tag(); + + dml_job_t job; + memset(&job, 0, sizeof(dml_job_t)); + job.source_first_ptr = src; + job.destination_first_ptr = dst; + job.source_length = transfer_size; + job.operation = DML_OP_DIF_UPDATE; + job.dif_config.source_reference_tag_seed = src_ref_tag; + job.dif_config.source_application_tag_seed = src_app_tag; + job.dif_config.source_application_tag_mask = src_app_tag_mask; + job.dif_config.destination_reference_tag_seed = dst_ref_tag; + job.dif_config.destination_application_tag_seed = dst_app_tag; + job.dif_config.destination_application_tag_mask = dst_app_tag_mask; + job.dif_config.block_size = static_cast(dif_options & 0b11); + + // Job API composes DIF flags into one 64-bit value via shifting, check dmldefs.h + job.dif_config.flags = (uint64_t(dif_options) << 16) | (uint64_t(dif_dst_options) << 8) | dif_src_options; + + job.flags = static_cast(options); + + auto status = dml_legacy_dif_update(&job); + + record.dif_status() = job.result; + record.bytes_completed() = job.offset; + // TODO: Tags should be written + + 
_mm_mfence(); + record.status() = to_underlying((status == DML_STATUS_OK) ? dml::detail::execution_status::success + : dml::detail::execution_status::dif_control_error); + } +} // namespace dml::core::kernels diff --git a/sources/core/src/drain.cpp b/sources/core/src/drain.cpp new file mode 100644 index 0000000..f1a13a1 --- /dev/null +++ b/sources/core/src/drain.cpp @@ -0,0 +1,28 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#include + +#include "kernels.hpp" + +namespace dml::core::kernels +{ + void drain(drain_descriptor dsc, drain_completion_record record) noexcept + { + static_cast(dsc); + record.status() = static_cast(dml::detail::execution_status::success); + } +} // namespace dml::core::kernels diff --git a/sources/core/src/dualcast.cpp b/sources/core/src/dualcast.cpp new file mode 100644 index 0000000..c9e7024 --- /dev/null +++ b/sources/core/src/dualcast.cpp @@ -0,0 +1,38 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. 
+ * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#include +#include +#include + +#include "immintrin.h" +#include "kernels.hpp" + +namespace dml::core::kernels +{ + void dualcast(dualcast_descriptor dsc, dualcast_completion_record record) noexcept + { + const auto src = reinterpret_cast(dsc.source_address()); + const auto dst1 = reinterpret_cast(dsc.destination_1_address()); + const auto dst2 = reinterpret_cast(dsc.destination_2_address()); + const auto transfer_size = dsc.transfer_size(); + + dispatch::dualcast(src, dst1, dst2, transfer_size); + + _mm_mfence(); + record.status() = to_underlying(dml::detail::execution_status::success); + } +} // namespace dml::core::kernels diff --git a/sources/core/src/fill.cpp b/sources/core/src/fill.cpp new file mode 100644 index 0000000..9d79786 --- /dev/null +++ b/sources/core/src/fill.cpp @@ -0,0 +1,37 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#include +#include +#include + +#include "immintrin.h" +#include "kernels.hpp" + +namespace dml::core::kernels +{ + void fill(fill_descriptor dsc, fill_completion_record record) noexcept + { + const auto pattern = dsc.pattern(); + const auto dst = reinterpret_cast(dsc.destination_address()); + const auto transfer_size = dsc.transfer_size(); + + dispatch::fill(pattern, dst, transfer_size); + + _mm_mfence(); + record.status() = to_underlying(dml::detail::execution_status::success); + } +} // namespace dml::core::kernels diff --git a/sources/core/src/hardware_device.cpp b/sources/core/src/hardware_device.cpp new file mode 100644 index 0000000..4ecba38 --- /dev/null +++ b/sources/core/src/hardware_device.cpp @@ -0,0 +1,95 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#include +#include +#include +#include + +#include "core/device.hpp" +#include "hw_dispatcher/hw_dispatcher.hpp" +#include "hw_dispatcher/numa.hpp" + +namespace dml::core +{ +#ifdef DML_HW + static inline auto enqueue(const dispatcher::hw_device &device, descriptor &dsc, completion_record &record) noexcept + { + auto view = any_descriptor(dsc); + view.flags() |= static_cast(dml::detail::flag::completion_record_address_valid) | + static_cast(dml::detail::flag::request_completion_record); + + // Use BlockOnFault on hardware, until page fault handling is implemented in software side + if (view.operation() != static_cast(operation::batch) && + view.operation() != static_cast(operation::drain) && view.operation() != static_cast(operation::nop)) + { + view.flags() |= static_cast(dml::detail::flag::block_on_fault); + } + + view.completion_record_address() = reinterpret_cast(&record); + record.bytes[0] = 0; + + auto status = device.enqueue_descriptor(reinterpret_cast(&dsc)); + + return status == DML_STATUS_OK ? 
dml::detail::submission_status::success : dml::detail::submission_status::failure; + } +#endif + + dml::detail::submission_status hardware_device::submit(descriptor &dsc, completion_record &completion_record) noexcept + { +#ifdef DML_HW + auto &dispatcher = dispatcher::hw_dispatcher::get_instance(); + + if (dispatcher.is_hw_support()) + { + static thread_local auto current_device_idx = 0u; + + auto device_count = std::distance(dispatcher.begin(), dispatcher.end()); + auto tried_devices = 0u; + + while (tried_devices < device_count) + { + auto &current_device = *(dispatcher.begin() + current_device_idx); + current_device_idx = (current_device_idx + 1) % device_count; + + if (util::get_numa_id() != current_device.numa_id()) + { + tried_devices++; + continue; + } + + auto status = enqueue(current_device, dsc, completion_record); + + if (status != detail::submission_status::success) + { + tried_devices++; + } + else + { + return status; + } + } + + return detail::submission_status::queue_busy; + } +#else + static_cast(dsc); + static_cast(completion_record); +#endif + + return dml::detail::submission_status::failure; + } +} // namespace dml::core diff --git a/sources/core/src/hw_dispatcher/CMakeLists.txt b/sources/core/src/hw_dispatcher/CMakeLists.txt new file mode 100644 index 0000000..b406202 --- /dev/null +++ b/sources/core/src/hw_dispatcher/CMakeLists.txt @@ -0,0 +1,51 @@ +# +# Copyright 2021 Intel Corporation. +# +# This software and the related documents are Intel copyrighted materials, +# and your use of them is governed by the express license under which they +# were provided to you ("License"). Unless the License provides otherwise, +# you may not use, modify, copy, publish, distribute, disclose or transmit +# this software or the related documents without Intel's prior written +# permission. +# +# This software and the related documents are provided as is, with no +# express or implied warranties, other than those that are expressly +# stated in the License. 
+# + +add_library(dml_hw_dispatcher OBJECT + hw_device.cpp + hw_device.hpp + hw_dispatcher.cpp + hw_dispatcher.hpp + hw_queue.cpp + hw_queue.hpp + numa.cpp + numa.hpp + + hw_configuration_driver.c + + legacy_headers/hardware_configuration_driver.h + legacy_headers/hardware_api.h + legacy_headers/hardware_completion_records_api.h + legacy_headers/hardware_limits.h + legacy_headers/hardware_descriptors_api.h + legacy_headers/own_dsa_accel_constants.h + legacy_headers/hardware_definitions.h + legacy_headers/libaccel_config.h + ) + +target_compile_features(dml_hw_dispatcher PRIVATE cxx_std_17 c_std_11) + +target_compile_options(dml_hw_dispatcher + PRIVATE $<$:${DML_CPP_PRIVATE_OPTIONS}> + ) + +target_include_directories(dml_hw_dispatcher PUBLIC ../../../../include) + +if (DML_HW) + target_compile_definitions(dml_hw_dispatcher + PRIVATE DML_HW + PUBLIC $<$:LOG_HW_INIT> + ) +endif () diff --git a/sources/core/src/hw_dispatcher/hw_configuration_driver.c b/sources/core/src/hw_dispatcher/hw_configuration_driver.c new file mode 100644 index 0000000..6bd3abd --- /dev/null +++ b/sources/core/src/hw_dispatcher/hw_configuration_driver.c @@ -0,0 +1,399 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#include + +#include "legacy_headers/hardware_configuration_driver.h" + +#if defined(linux) + +#include +#include + +const static char *accelerator_configuration_driver_name = "/usr/lib64/libaccel-config.so.1"; + +typedef int (*accfg_new_ptr)(struct accfg_ctx **ctx); + +typedef struct accfg_device *(*accfg_device_get_first_ptr)(struct accfg_ctx *ctx); + +typedef const char *(*accfg_device_get_devname_ptr)(struct accfg_device *device); + +typedef struct accfg_device *(*accfg_device_get_next_ptr)(struct accfg_device *device); + +typedef struct accfg_wq *(*accfg_wq_get_first_ptr)(struct accfg_device *device); + +typedef struct accfg_wq *(*accfg_wq_get_next_ptr)(struct accfg_wq *wq); + +typedef enum accfg_wq_state (*accfg_wq_get_state_ptr)(struct accfg_wq *wq); + +typedef unsigned int (*accfg_device_get_version_ptr)(struct accfg_device *device); + +typedef const char * (*accfg_wq_get_devname_ptr)(struct accfg_wq *wq); + +typedef enum accfg_device_state (*accfg_device_get_state_ptr)(struct accfg_device *device); + +typedef struct accfg_ctx *(*accfg_unref_ptr)(struct accfg_ctx *ctx); + +typedef enum accfg_wq_mode (*accfg_wq_get_mode_ptr)(struct accfg_wq *wq); + +typedef unsigned long (*accfg_device_get_gen_cap_ptr)(struct accfg_device *device); + +typedef int (*accfg_group_get_traffic_class_ptr)(struct accfg_group *group); + +typedef struct accfg_group *(*accfg_group_get_first_ptr)(struct accfg_device *device); + +typedef struct accfg_group *(*accfg_group_get_next_ptr)(struct accfg_group *group); + +typedef struct accfg_group *(*accfg_wq_get_group_ptr)(struct accfg_wq *wq); + +typedef int (*accfg_wq_get_group_id_ptr)(struct accfg_wq *wq); + +typedef int (*accfg_group_get_id_ptr)(struct accfg_group *group); + +typedef int (*accfg_wq_get_user_dev_path_ptr)(struct accfg_wq *wq, char *buf, size_t size); + +typedef int (*accfg_wq_get_priority_ptr)(struct accfg_wq *wq); + +/** + * @brief Table with functions required from accelerator configuration library + */ 
+static dsa_desc_t functions_table[] = { { NULL, "accfg_new" }, + { NULL, "accfg_device_get_first" }, + { NULL, "accfg_device_get_devname" }, + { NULL, "accfg_device_get_next" }, + { NULL, "accfg_wq_get_first" }, + { NULL, "accfg_wq_get_next" }, + { NULL, "accfg_wq_get_state" }, + { NULL, "accfg_wq_get_mode" }, + { NULL, "accfg_device_get_version" }, + { NULL, "accfg_wq_get_devname" }, + { NULL, "accfg_device_get_state" }, + { NULL, "accfg_unref" }, + { NULL, "accfg_device_get_gen_cap" }, + { NULL, "accfg_device_get_numa_node" }, + { NULL, "accfg_wq_get_priority" }, + { NULL, "accfg_group_get_first" }, + { NULL, "accfg_group_get_next" }, + { NULL, "accfg_group_get_traffic_class_a" }, + { NULL, "accfg_group_get_traffic_class_b" }, + { NULL, "accfg_wq_get_group" }, + { NULL, "accfg_wq_get_group_id" }, + { NULL, "accfg_group_get_id" }, + { NULL, "accfg_wq_get_user_dev_path" }, + // Terminate list/init + { NULL, NULL } }; + +static inline dsahw_status_t own_load_accelerator_configuration_driver(void **driver_instance_pptr); + +static inline bool own_load_configuration_functions(void *driver_instance_ptr); + +#endif + +dsahw_status_t DML_HW_API(initialize_accelerator_driver)(hw_driver_t *driver_ptr) +{ +#if defined(linux) + // Variables + driver_ptr->driver_instance_ptr = NULL; + + // Load DLL + dsahw_status_t status = own_load_accelerator_configuration_driver(&driver_ptr->driver_instance_ptr); + + // If DLL is loaded successfully + if (DML_STATUS_OK != status || !driver_ptr->driver_instance_ptr || !own_load_configuration_functions(driver_ptr->driver_instance_ptr)) + { + // Free DLL + if (driver_ptr->driver_instance_ptr) + { + dlclose(driver_ptr->driver_instance_ptr); + } + + driver_ptr->driver_instance_ptr = NULL; + } + + return status; +#else + return DML_STATUS_LIBACCEL_NOT_FOUND; +#endif +} + +void DML_HW_API(finalize_accelerator_driver)(hw_driver_t *driver_ptr) +{ +#if defined(linux) + if (driver_ptr->driver_instance_ptr) + { + 
dlclose(driver_ptr->driver_instance_ptr); + } + + driver_ptr->driver_instance_ptr = NULL; +#endif +} + +int32_t DML_HW_API(driver_new_context)(struct accfg_ctx **ctx) +{ +#if defined(linux) + return ((accfg_new_ptr)functions_table[0].function)(ctx); +#else + return DML_STATUS_LIBACCEL_NOT_FOUND; +#endif +} + +struct accfg_device *DML_HW_API(context_get_first_device)(struct accfg_ctx *ctx) +{ +#if defined(linux) + return ((accfg_device_get_first_ptr)functions_table[1].function)(ctx); +#else + return NULL; +#endif +} + +const char *DML_HW_API(device_get_name)(struct accfg_device *device) +{ +#if defined(linux) + return ((accfg_device_get_devname_ptr)functions_table[2].function)(device); +#else + return NULL; +#endif +} + +struct accfg_device *DML_HW_API(device_get_next)(struct accfg_device *device) +{ +#if defined(linux) + return ((accfg_device_get_next_ptr)functions_table[3].function)(device); +#else + return NULL; +#endif +} + +struct accfg_wq *DML_HW_API(get_first_work_queue)(struct accfg_device *device) +{ +#if defined(linux) + return ((accfg_wq_get_first_ptr)functions_table[4].function)(device); +#else + return NULL; +#endif +} + +struct accfg_wq *DML_HW_API(work_queue_get_next)(struct accfg_wq *wq) +{ +#if defined(linux) + return ((accfg_wq_get_next_ptr)functions_table[5].function)(wq); +#else + return NULL; +#endif +} + +enum accfg_wq_state DML_HW_API(work_queue_get_state)(struct accfg_wq *wq) +{ +#if defined(linux) + return ((accfg_wq_get_state_ptr)functions_table[6].function)(wq); +#else + return -1; +#endif +} + +enum accfg_wq_mode DML_HW_API(work_queue_get_mode)(struct accfg_wq *wq) +{ +#if defined(linux) + return ((accfg_wq_get_mode_ptr)functions_table[7].function)(wq); +#else + return 2; +#endif +} + +uint32_t DML_HW_API(device_get_version)(struct accfg_device *device) +{ +#if defined(linux) + return ((accfg_device_get_version_ptr) functions_table[8].function)(device); +#else + return -1; +#endif +} + +const char * 
DML_HW_API(work_queue_get_device_name)(struct accfg_wq *wq) +{ +#if defined(linux) + return ((accfg_wq_get_devname_ptr) functions_table[9].function)(wq); +#else + return NULL; +#endif +} + +enum accfg_device_state DML_HW_API(device_get_state)(struct accfg_device *device) +{ +#if defined(linux) + return ((accfg_device_get_state_ptr)functions_table[10].function)(device); +#else + return -1; +#endif +} + +struct accfg_ctx *DML_HW_API(context_close)(struct accfg_ctx *ctx) +{ +#if defined(linux) + return ((accfg_unref_ptr)functions_table[11].function)(ctx); +#else + return NULL; +#endif +} + +uint64_t DML_HW_API(device_get_gen_cap_register)(struct accfg_device *device) +{ +#if defined(linux) + return ((accfg_device_get_gen_cap_ptr)functions_table[12].function)(device); +#else + return 0; +#endif +} + +uint64_t DML_HW_API(device_get_numa_node)(struct accfg_device *device) +{ +#if defined(linux) + return ((accfg_device_get_gen_cap_ptr)functions_table[13].function)(device); +#else + return -1; +#endif +} + +int32_t DML_HW_API(work_queue_get_priority)(struct accfg_wq *wq) +{ +#if defined(linux) + return ((accfg_wq_get_priority_ptr) functions_table[14].function)(wq); +#else + return -1; +#endif +} + +struct accfg_group *DML_HW_API(group_get_first)(struct accfg_device *device) +{ +#if defined(linux) + return ((accfg_group_get_first_ptr)functions_table[15].function)(device); +#else + return NULL; +#endif +} + +struct accfg_group *DML_HW_API(group_get_next)(struct accfg_group *group) +{ +#if defined(linux) + return ((accfg_group_get_next_ptr)functions_table[16].function)(group); +#else + return NULL; +#endif +} + +int DML_HW_API(group_get_traffic_class_a)(struct accfg_group *group) +{ +#if defined(linux) + return ((accfg_group_get_traffic_class_ptr)functions_table[17].function)(group); +#else + return 0; +#endif +} + +int DML_HW_API(group_get_traffic_class_b)(struct accfg_group *group) +{ +#if defined(linux) + return 
((accfg_group_get_traffic_class_ptr)functions_table[18].function)(group); +#else + return 0; +#endif +} + +struct accfg_group *DML_HW_API(work_queue_get_group)(struct accfg_wq *wq) +{ +#if defined(linux) + return ((accfg_wq_get_group_ptr)functions_table[19].function)(wq); +#else + return NULL; +#endif +} + +int DML_HW_API(work_queue_get_group_id)(struct accfg_wq *wq) +{ +#if defined(linux) + return ((accfg_wq_get_group_id_ptr)functions_table[20].function)(wq); +#else + return -1; +#endif +} + +int DML_HW_API(group_get_id)(struct accfg_group *group) +{ +#if defined(linux) + return ((accfg_group_get_id_ptr)functions_table[21].function)(group); +#else + return -1; +#endif +} + +int DML_HW_API(work_queue_get_device_path)(struct accfg_wq *wq, char *buf, size_t size) +{ +#if defined(linux) + return ((accfg_wq_get_user_dev_path_ptr)functions_table[22].function)(wq, buf, size); +#else + return -1; +#endif +} + +#if defined(linux) + +/* ------ Internal functions implementation ------ */ + +bool own_load_configuration_functions(void *driver_instance_ptr) +{ + uint32_t i = 0u; + + // Clear error log + (void)dlerror(); + while (functions_table[i].function_name) + { + functions_table[i].function = (library_function)dlsym(driver_instance_ptr, functions_table[i].function_name); + + char *err_message = dlerror(); + + if (err_message || !functions_table[i].function) + { + return false; + } + + i++; + } + + return true; +} + +dsahw_status_t own_load_accelerator_configuration_driver(void **driver_instance_pptr) +{ + DIAG("loading driver: %s\n", accelerator_configuration_driver_name); + // Try to load the user interface library for IAX/DSA kernel driver + void *driver_instance_ptr = dlopen(accelerator_configuration_driver_name, RTLD_LAZY); + + if (!driver_instance_ptr) + { + // This is needed for error handle. We need to call dlerror + // for emptying error message. 
Otherwise we will receive error + // message during loading symbols from another library + dlerror(); + + return DML_STATUS_LIBACCEL_NOT_FOUND; + } + + *driver_instance_pptr = driver_instance_ptr; + + return DML_STATUS_OK; +} + +#endif diff --git a/sources/core/src/hw_dispatcher/hw_device.cpp b/sources/core/src/hw_dispatcher/hw_device.cpp new file mode 100644 index 0000000..f503e64 --- /dev/null +++ b/sources/core/src/hw_dispatcher/hw_device.cpp @@ -0,0 +1,256 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#ifdef DML_HW + +#include "hw_device.hpp" + +#include + +#include "legacy_headers/hardware_configuration_driver.h" +#include "legacy_headers/own_dsa_accel_constants.h" + +static inline bool own_search_device_name(const char *src_ptr, const uint32_t name, const uint32_t name_size) noexcept +{ + const uint8_t null_terminator = '\0'; + + for (size_t symbol_idx = 0u; null_terminator != src_ptr[symbol_idx + name_size]; symbol_idx++) + { + const auto *candidate_ptr = reinterpret_cast(src_ptr + symbol_idx); + + // Convert the first 3 bytes to lower case and make the 4th 0xff + if (name == (*candidate_ptr | CHAR_MSK)) + { + return true; + } + } + + return false; +} + +namespace dml::core::dispatcher +{ + + void hw_device::fill_hw_context(dsahw_context_t *const hw_context_ptr) const noexcept + { + // Restore device properties + hw_context_ptr->gen_cap.block_on_fault_support = hw_device::block_on_fault_support(); + hw_context_ptr->gen_cap.overlapping_copy_support = hw_device::overlapping_copy_support(); + hw_context_ptr->gen_cap.memory_cache_control_support = hw_device::memory_cache_control_support(); + hw_context_ptr->gen_cap.flush_cache_control_support = hw_device::flush_cache_control_support(); + hw_context_ptr->gen_cap.destination_readback_support = hw_device::destination_readback_support(); + hw_context_ptr->gen_cap.descriptor_readback_support = hw_device::descriptor_readback_support(); + hw_context_ptr->gen_cap.max_transfer_size = hw_device::max_transfer_size(); + hw_context_ptr->gen_cap.max_batch_size = hw_device::max_batch_size(); + hw_context_ptr->gen_cap.message_size = hw_device::message_size(); + hw_context_ptr->gen_cap.configuration_support = hw_device::configuration_support(); + } + + auto hw_device::enqueue_descriptor(const dsahw_descriptor_t *desc_ptr) const noexcept -> dsahw_status_t + { + const auto n_queues = std::distance(this->begin(), this->end()); + + // Initially set to "end" index + static thread_local auto last_wq_idx = 
std::atomic(n_queues); + + // Loop FROM the queue after the one used for last submit + for (auto idx = last_wq_idx.load() + 1; idx < n_queues; ++idx) + { + auto &queue = *(this->begin() + idx); + auto status = queue.enqueue_descriptor(desc_ptr); + + if (DML_STATUS_OK == status) + { + last_wq_idx = idx; + return DML_STATUS_OK; + } + } + + // If the loop before didn't submit descriptor, then loop UNTIL the queue that was used for last submit + for (auto idx = 0; idx <= last_wq_idx; ++idx) + { + auto &queue = *(this->begin() + idx); + auto status = queue.enqueue_descriptor(desc_ptr); + + if (DML_STATUS_OK == status) + { + last_wq_idx = idx; + return DML_STATUS_OK; + } + } + + return DML_STATUS_WORK_QUEUE_OVERFLOW_ERROR; + } + + auto hw_device::block_on_fault_support() const noexcept -> uint8_t + { + return GC_BLOCK_ON_FAULT(gen_cap_register_); + } + + auto hw_device::overlapping_copy_support() const noexcept -> uint8_t + { + return GC_OVERLAPPING(gen_cap_register_); + } + + auto hw_device::memory_cache_control_support() const noexcept -> uint8_t + { + return GC_CACHE_WRITE(gen_cap_register_); + } + + auto hw_device::flush_cache_control_support() const noexcept -> uint8_t + { + return GC_CACHE_FLUSH(gen_cap_register_); + } + + auto hw_device::destination_readback_support() const noexcept -> uint8_t + { + return GC_DST_READBACK(gen_cap_register_); + } + + auto hw_device::descriptor_readback_support() const noexcept -> uint8_t + { + return GC_DRAIN_READBACK(gen_cap_register_); + } + + auto hw_device::max_transfer_size() const noexcept -> uint32_t + { + return GC_MAX_TRANSFER_SIZE(gen_cap_register_); + } + + auto hw_device::max_batch_size() const noexcept -> uint32_t + { + return GC_MAX_BATCH_SIZE(gen_cap_register_); + } + + auto hw_device::message_size() const noexcept -> uint16_t + { + return GC_INTERRUPT_STORAGE(gen_cap_register_); + } + + auto hw_device::configuration_support() const noexcept -> uint8_t + { + return GC_CONF_SUPPORT(gen_cap_register_); + } + + auto 
hw_device::initialize_new_device(descriptor_t *device_descriptor_ptr) noexcept -> dsahw_status_t + { +#if defined(linux) + // Device initialization stage + auto *device_ptr = reinterpret_cast(device_descriptor_ptr); + const auto *name_ptr = dsa_device_get_name(device_ptr); + const bool is_dsa_device = own_search_device_name(name_ptr, DSA_DEVICE_ID, DEVICE_NAME_LENGTH); + + version_major_ = dsa_device_get_version(device_ptr)>>8u; + version_minor_ = dsa_device_get_version(device_ptr)&0xFF; + + DIAG("%5s: ", name_ptr); + if (!is_dsa_device || version_major_ != 1) + { + DIAGA("UNSUPPORTED\n"); + return DML_STATUS_WORK_QUEUES_NOT_AVAILABLE; + } + + if (ACCFG_DEVICE_ENABLED != dsa_device_get_state(device_ptr)) { + DIAGA("DISABLED\n"); + return DML_STATUS_WORK_QUEUES_NOT_AVAILABLE; + } + DIAGA("\n"); + + gen_cap_register_ = dsa_device_get_gen_cap_register(device_ptr); + numa_node_id_ = dsa_device_get_numa_node(device_ptr); + + DIAG("%5s: version: %d.%d\n", name_ptr, version_major_, version_minor_); + DIAG("%5s: numa: %lu\n", name_ptr, numa_node_id_); + DIAG("%5s: GENCAP: 0x%016lX\n", name_ptr, gen_cap_register_); + DIAG("%5s: GENCAP: block on fault support: %d\n", name_ptr, block_on_fault_support()); + DIAG("%5s: GENCAP: overlapping copy support: %d\n", name_ptr, overlapping_copy_support()); + DIAG("%5s: GENCAP: cache control support (memory): %d\n", name_ptr, memory_cache_control_support()); + DIAG("%5s: GENCAP: cache control support (cache flush): %d\n", name_ptr, flush_cache_control_support()); + DIAG("%5s: GENCAP: maximum supported transfer size: %u\n", name_ptr, max_transfer_size()); + DIAG("%5s: GENCAP: maximum supported batch size: %u\n", name_ptr, max_batch_size()); + + // Working queues initialization stage + auto *wq_ptr = dsa_get_first_work_queue(device_ptr); + auto wq_it = working_queues_.begin(); + + DIAG("%5s: getting device WQs\n", name_ptr); + while (nullptr != wq_ptr) + { + if (DML_STATUS_OK == wq_it->initialize_new_queue(wq_ptr)) + { + wq_it++; + + 
std::push_heap(working_queues_.begin(), + wq_it, + [](const hw_queue &a, const hw_queue &b) -> bool + { + return a.priority() < b.priority(); + }); + } + + wq_ptr = dsa_work_queue_get_next(wq_ptr); + } + + // Check number of working queues + queue_count_ = std::distance(working_queues_.begin(), wq_it); + + if (queue_count_ > 1) + { + auto begin = working_queues_.begin(); + auto end = begin + queue_count_; + + std::sort_heap(begin, + end, + [](const hw_queue &a, const hw_queue &b) -> bool + { + return a.priority() < b.priority(); + }); + } + + if (queue_count_ == 0) + { + return DML_STATUS_WORK_QUEUES_NOT_AVAILABLE; + } + + return DML_STATUS_OK; +#else + return DML_STATUS_WORK_QUEUES_NOT_AVAILABLE; +#endif + } + + auto hw_device::size() const noexcept -> size_t + { + return queue_count_; + } + + auto hw_device::numa_id() const noexcept -> uint64_t + { + return numa_node_id_; + } + + auto hw_device::begin() const noexcept -> queues_container_t::const_iterator + { + return working_queues_.cbegin(); + } + + auto hw_device::end() const noexcept -> queues_container_t::const_iterator + { + return working_queues_.cbegin() + queue_count_; + } + +} // namespace dml::core::dispatcher + +#endif diff --git a/sources/core/src/hw_dispatcher/hw_device.hpp b/sources/core/src/hw_dispatcher/hw_device.hpp new file mode 100644 index 0000000..81722d3 --- /dev/null +++ b/sources/core/src/hw_dispatcher/hw_device.hpp @@ -0,0 +1,90 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. 
+ *
+ * This software and the related documents are provided as is, with no
+ * express or implied warranties, other than those that are expressly
+ * stated in the License.
+ *
+ */
+
+#ifndef DML_MIDDLE_LAYER_DISPATCHER_HW_DEVICE_HPP_
+#define DML_MIDDLE_LAYER_DISPATCHER_HW_DEVICE_HPP_
+
+#include
+
+#include "dml/dmldefs.h"
+#include "hw_queue.hpp"
+
+#ifdef DML_HW
+#include "legacy_headers/hardware_definitions.h"
+#include "legacy_headers/own_dsa_accel_constants.h"
+
+namespace dml::core::dispatcher
+{
+
+    /**
+     * @brief One accelerator device together with the work queues initialized for it
+     *
+     * The queues live in a fixed-size container; only the first queue_count_ entries are
+     * exposed through begin()/end() and size().
+     */
+    class hw_device final
+    {
+        /** Capacity of the queue container */
+        static constexpr uint32_t max_working_queues = MAX_WORK_QUEUE_COUNT;
+
+        using queues_container_t = std::array;
+
+    public:
+        /** Opaque handle type accepted by initialize_new_device() */
+        using descriptor_t = void;
+
+        hw_device() noexcept = default;
+
+        /** @brief Writes the capabilities decoded from the GENCAP register into hw_context_ptr->gen_cap */
+        void fill_hw_context(dsahw_context_t *hw_context_ptr) const noexcept;
+
+        /**
+         * @brief Submits the descriptor to one of the device's work queues
+         *
+         * @return DML_STATUS_OK when a queue accepted it, an error status otherwise
+         */
+        [[nodiscard]] auto enqueue_descriptor(const dsahw_descriptor_t *desc_ptr) const noexcept -> dsahw_status_t;
+
+        /**
+         * @brief Reads the device properties and initializes its work queues
+         *
+         * @return DML_STATUS_OK when at least one queue is usable,
+         *         DML_STATUS_WORK_QUEUES_NOT_AVAILABLE otherwise
+         */
+        [[nodiscard]] auto initialize_new_device(descriptor_t *device_descriptor_ptr) noexcept -> dsahw_status_t;
+
+        /** @brief Number of initialized work queues */
+        [[nodiscard]] auto size() const noexcept -> size_t;
+
+        /** @brief NUMA node id stored by initialize_new_device() */
+        [[nodiscard]] auto numa_id() const noexcept -> uint64_t;
+
+        /** @brief Iterator to the first initialized work queue */
+        [[nodiscard]] auto begin() const noexcept -> queues_container_t::const_iterator;
+
+        /** @brief Iterator one past the last initialized work queue */
+        [[nodiscard]] auto end() const noexcept -> queues_container_t::const_iterator;
+
+    protected:
+        // Each getter below decodes one field of gen_cap_register_ with the matching GC_* macro
+        auto block_on_fault_support() const noexcept -> uint8_t;
+
+        auto overlapping_copy_support() const noexcept -> uint8_t;
+
+        auto memory_cache_control_support() const noexcept -> uint8_t;
+
+        auto flush_cache_control_support() const noexcept -> uint8_t;
+
+        auto destination_readback_support() const noexcept -> uint8_t;
+
+        auto descriptor_readback_support() const noexcept -> uint8_t;
+
+        auto max_transfer_size() const noexcept -> uint32_t;
+
+        auto max_batch_size() const noexcept -> uint32_t;
+
+        auto message_size() const noexcept -> uint16_t;
+
+        auto configuration_support() const noexcept -> uint8_t;
+
+    private:
+        queues_container_t working_queues_ = {}; /**< Set of available HW working queues */
+        uint32_t queue_count_ = 0u; /**< Number of working queues that are available */
+        uint64_t gen_cap_register_ = 0u; /**< GENCAP register content */
+        uint64_t numa_node_id_ = 0u; /**< NUMA node id of the device */
+        uint32_t version_major_ = 0u; /**< Major version of discovered device */
+        uint32_t version_minor_ = 0u; /**< Minor version of discovered device */
+    };
+
+} // namespace dml::core::dispatcher
+
+#endif
+#endif //DML_MIDDLE_LAYER_DISPATCHER_HW_DEVICE_HPP_
diff --git a/sources/core/src/hw_dispatcher/hw_dispatcher.cpp b/sources/core/src/hw_dispatcher/hw_dispatcher.cpp
new file mode 100644
index 0000000..5b2e521
--- /dev/null
+++ b/sources/core/src/hw_dispatcher/hw_dispatcher.cpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright 2021 Intel Corporation.
+ *
+ * This software and the related documents are Intel copyrighted materials,
+ * and your use of them is governed by the express license under which they
+ * were provided to you ("License"). Unless the License provides otherwise,
+ * you may not use, modify, copy, publish, distribute, disclose or transmit
+ * this software or the related documents without Intel's prior written
+ * permission.
+ *
+ * This software and the related documents are provided as is, with no
+ * express or implied warranties, other than those that are expressly
+ * stated in the License.
+ * + */ + +#include "hw_dispatcher.hpp" + +#if defined(DML_HW) && defined(linux) + +#include "legacy_headers/libaccel_config.h" + +#endif + +// TODO should be removed at all +#define DML_HWSTS_RET(expr, err_code) \ + { \ + if (expr) \ + { \ + return (err_code); \ + } \ + } + +namespace dml::core::dispatcher +{ + static hw_dispatcher instance{}; + + hw_dispatcher::hw_dispatcher() noexcept + { +#ifdef DML_HW + hw_init_status_ = hw_dispatcher::initialize_hw(); + hw_support_ = hw_init_status_ == DML_STATUS_OK; +#else + hw_support_ = false; +#endif + } + +#ifdef DML_HW + + auto hw_dispatcher::initialize_hw() noexcept -> dsahw_status_t + { + accfg_ctx *ctx_ptr = nullptr; + + DIAG("DML version %s\n", "TODO"); + DIAG("Struct size: %lu B\n", sizeof(device_container_t)); + + dsahw_status_t status = dsa_initialize_accelerator_driver(&hw_driver_); + DML_HWSTS_RET(status != DML_STATUS_OK, status); + + DIAG("creating context\n"); + int32_t context_creation_status = dsa_driver_new_context(&ctx_ptr); + DML_HWSTS_RET(0u != context_creation_status, DML_STATUS_LIBACCEL_ERROR); + + // Retrieve first device in the system given the passed in context + DIAG("enumerating devices\n"); + auto *dev_tmp_ptr = dsa_context_get_first_device(ctx_ptr); + auto device_it = devices_.begin(); + + while (nullptr != dev_tmp_ptr) + { + if (DML_STATUS_OK == device_it->initialize_new_device(dev_tmp_ptr)) + { + device_it++; + } + + // Retrieve the "next" device in the system based on given device + dev_tmp_ptr = dsa_device_get_next(dev_tmp_ptr); + } + + device_count_ = std::distance(devices_.begin(), device_it); + + if (device_count_ <= 0) + { + return DML_STATUS_WORK_QUEUES_NOT_AVAILABLE; + } + + hw_context_.set_driver_context_ptr(ctx_ptr); + + return DML_STATUS_OK; + } +#endif + + hw_dispatcher::~hw_dispatcher() noexcept + { +#ifdef DML_HW + // Variables + auto *context_ptr = hw_context_.get_driver_context_ptr(); + + if (context_ptr != nullptr) + { + dsa_context_close(context_ptr); + } + + 
dsa_finalize_accelerator_driver(&hw_driver_); + + // Zeroing values + hw_context_.set_driver_context_ptr(nullptr); +#endif + } + + auto hw_dispatcher::get_instance() noexcept -> hw_dispatcher & + { + return instance; + } + + auto hw_dispatcher::is_hw_support() const noexcept -> bool + { + return hw_support_; + } + +#ifdef DML_HW + + void hw_dispatcher::fill_hw_context(dsahw_context_t *const hw_context_ptr) noexcept + { +#if defined(linux) + // Restore context + hw_context_ptr->dsa_context_ptr = hw_context_.get_driver_context_ptr(); + + // Restore device properties + // We take the first one as all configurations across the platform should be the same for all devices + devices_[0].fill_hw_context(hw_context_ptr); +#endif + } + + auto hw_dispatcher::get_hw_init_status() const noexcept -> dsahw_status_t + { + return hw_init_status_; + } + +#ifdef DML_HW + + auto hw_dispatcher::begin() const noexcept -> device_container_t::const_iterator + { + return devices_.cbegin(); + } + + auto hw_dispatcher::end() const noexcept -> device_container_t::const_iterator + { + return devices_.cbegin() + device_count_; + } + + void hw_dispatcher::hw_context::set_driver_context_ptr(accfg_ctx *driver_context_ptr) noexcept + { + driver_context_ptr_ = driver_context_ptr; + } + + [[nodiscard]] auto hw_dispatcher::hw_context::get_driver_context_ptr() noexcept -> accfg_ctx * + { + return driver_context_ptr_; + } + +#endif + +#endif +} // namespace dml::core::dispatcher diff --git a/sources/core/src/hw_dispatcher/hw_dispatcher.hpp b/sources/core/src/hw_dispatcher/hw_dispatcher.hpp new file mode 100644 index 0000000..4dd611b --- /dev/null +++ b/sources/core/src/hw_dispatcher/hw_dispatcher.hpp @@ -0,0 +1,96 @@ +/* + * + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). 
Unless the License provides otherwise,
+ * you may not use, modify, copy, publish, distribute, disclose or transmit
+ * this software or the related documents without Intel's prior written
+ * permission.
+ *
+ * This software and the related documents are provided as is, with no
+ * express or implied warranties, other than those that are expressly
+ * stated in the License.
+ *
+ */
+
+#ifndef DML_MIDDLE_LAYER_DISPATCHER_HW_DISPATCHER_HPP_
+#define DML_MIDDLE_LAYER_DISPATCHER_HW_DISPATCHER_HPP_
+
+#include
+#include
+
+#include "dml/dmldefs.h"
+#include "hw_device.hpp"
+
+#ifdef DML_HW
+#include "legacy_headers/hardware_configuration_driver.h"
+#include "legacy_headers/hardware_definitions.h"
+#include "legacy_headers/own_dsa_accel_constants.h"
+#endif
+
+namespace dml::core::dispatcher
+{
+
+    /**
+     * @brief Owner of the hardware path state: configuration driver, driver context and devices
+     *
+     * Hardware discovery runs in the constructor; a single instance is reachable through
+     * get_instance(). Without DML_HW only the hw_support_ flag remains.
+     */
+    class hw_dispatcher final
+    {
+#ifdef DML_HW
+
+        /** Capacity of the device container */
+        static constexpr uint32_t max_devices = MAX_DEVICE_COUNT;
+
+        using device_container_t = std::array;
+
+        /** @brief Holder of the accel-config driver context pointer */
+        class hw_context final
+        {
+        public:
+            void set_driver_context_ptr(accfg_ctx *driver_context_ptr) noexcept;
+
+            [[nodiscard]] auto get_driver_context_ptr() noexcept -> accfg_ctx *;
+
+        private:
+            accfg_ctx *driver_context_ptr_ = nullptr; /**< DSA driver context */
+        };
+
+#endif
+
+    public:
+        /** @brief Returns the dispatcher instance defined in hw_dispatcher.cpp */
+        static auto get_instance() noexcept -> hw_dispatcher &;
+
+        /** @brief True when hardware initialization finished with DML_STATUS_OK */
+        [[nodiscard]] auto is_hw_support() const noexcept -> bool;
+
+#ifdef DML_HW
+
+        /** @brief Status returned by initialize_hw() during construction */
+        [[nodiscard]] auto get_hw_init_status() const noexcept -> dsahw_status_t;
+
+        /** @brief Fills the legacy context with the driver context and the first device's capabilities */
+        void fill_hw_context(dsahw_context_t *hw_context_ptr) noexcept;
+
+        /** @brief Iterator to the first initialized device */
+        [[nodiscard]] auto begin() const noexcept -> device_container_t::const_iterator;
+
+        /** @brief Iterator one past the last initialized device */
+        [[nodiscard]] auto end() const noexcept -> device_container_t::const_iterator;
+
+#endif
+
+        /** @brief Closes the driver context and unloads the configuration driver (DML_HW builds) */
+        ~hw_dispatcher() noexcept;
+
+        /** @brief Runs hardware initialization (DML_HW builds) and sets hw_support_ */
+        hw_dispatcher() noexcept;
+
+    protected:
+#ifdef DML_HW
+        /** @brief Loads the driver, creates the context and initializes the devices */
+        auto initialize_hw() noexcept -> dsahw_status_t;
+
+    private:
+        hw_context hw_context_;
+        hw_driver_t hw_driver_{};
+        device_container_t devices_{};
+        size_t device_count_ = 0;
+#endif
+
+        // Both members below have no default initializer; the constructor assigns them.
+        // Without DML_HW the `private:` label above is compiled out, so hw_support_ is protected.
+        bool hw_support_;
+#ifdef DML_HW
+        dsahw_status_t hw_init_status_;
+#endif
+    };
+
+} // namespace dml::core::dispatcher
+#endif //DML_MIDDLE_LAYER_DISPATCHER_HW_DISPATCHER_HPP_
diff --git a/sources/core/src/hw_dispatcher/hw_queue.cpp b/sources/core/src/hw_dispatcher/hw_queue.cpp
new file mode 100644
index 0000000..caa960e
--- /dev/null
+++ b/sources/core/src/hw_dispatcher/hw_queue.cpp
@@ -0,0 +1,201 @@
+/*
+ * Copyright 2021 Intel Corporation.
+ *
+ * This software and the related documents are Intel copyrighted materials,
+ * and your use of them is governed by the express license under which they
+ * were provided to you ("License"). Unless the License provides otherwise,
+ * you may not use, modify, copy, publish, distribute, disclose or transmit
+ * this software or the related documents without Intel's prior written
+ * permission.
+ *
+ * This software and the related documents are provided as is, with no
+ * express or implied warranties, other than those that are expressly
+ * stated in the License.
+ * + */ + +#ifdef DML_HW + +#include + +#if defined(linux) + +#include + +#endif + +#include "hw_queue.hpp" +#include "legacy_headers/hardware_configuration_driver.h" +#include "legacy_headers/own_dsa_accel_constants.h" + +#define DML_HWSTS_RET(expr, err_code) \ + { \ + if (expr) \ + { \ + return (err_code); \ + } \ + } + +namespace dml::core::dispatcher +{ + hw_queue::hw_queue(hw_queue &&other) noexcept + { + version_ = other.version_; + priority_ = other.priority_; + portal_mask_ = other.portal_mask_; + portal_ptr_ = other.portal_ptr_; + portal_offset_ = 0; + + other.portal_ptr_ = nullptr; + } + + auto hw_queue::operator=(hw_queue &&other) noexcept -> hw_queue & + { + version_ = other.version_; + priority_ = other.priority_; + portal_mask_ = other.portal_mask_; + portal_ptr_ = other.portal_ptr_; + portal_offset_ = 0; + + other.portal_ptr_ = nullptr; + + return *this; + } + + hw_queue::~hw_queue() + { +#if defined(linux) + // Freeing resources + if (portal_ptr_ != nullptr) + { + munmap(portal_ptr_, 0x1000u); + + portal_ptr_ = nullptr; + } +#endif + } + + void hw_queue::set_portal_ptr(void *value_ptr) noexcept + { + portal_offset_ = reinterpret_cast(value_ptr) & OWN_PAGE_MASK; + portal_mask_ = reinterpret_cast(value_ptr) & (~OWN_PAGE_MASK); + portal_ptr_ = value_ptr; + } + + auto hw_queue::get_portal_ptr() const noexcept -> void * + { + uint64_t offset = portal_offset_++; + offset = (offset << 6) & OWN_PAGE_MASK; + return reinterpret_cast(offset | portal_mask_); + } + + auto hw_queue::enqueue_descriptor(const dsahw_descriptor_t *desc_ptr) const noexcept -> dsahw_status_t + { +#if defined(linux) + uint8_t retry = 0u; + + void *current_place_ptr = get_portal_ptr(); + asm volatile("sfence\t\n" + ".byte 0xf2, 0x0f, 0x38, 0xf8, 0x02\t\n" + "setz %0\t\n" + : "=r"(retry) + : "a"(current_place_ptr), "d"(desc_ptr)); + + return static_cast(retry); +#else + return DML_STATUS_WORK_QUEUES_NOT_AVAILABLE; +#endif + } + + auto hw_queue::initialize_new_queue(void 
*wq_descriptor_ptr) noexcept -> dsahw_status_t + { +#if defined(linux) + auto *work_queue_ptr = reinterpret_cast(wq_descriptor_ptr); + char path[64]; +#ifdef LOG_HW_INIT + auto work_queue_dev_name = dsa_work_queue_get_device_name(work_queue_ptr); +#endif + + if (ACCFG_WQ_ENABLED != dsa_work_queue_get_state(work_queue_ptr)) + { + DIAG(" %7s: DISABLED\n", work_queue_dev_name); + return DML_STATUS_WORK_QUEUES_NOT_AVAILABLE; + } + + if (ACCFG_WQ_SHARED != dsa_work_queue_get_mode(work_queue_ptr)) + { + DIAG(" %7s: UNSUPPORTED\n", work_queue_dev_name); + return DML_STATUS_WORK_QUEUES_NOT_AVAILABLE; + } + + DIAG(" %7s:\n", work_queue_dev_name); + auto status = dsa_work_queue_get_device_path(work_queue_ptr, path, 64 - 1); + + DML_HWSTS_RET((0 > status), DML_STATUS_LIBACCEL_ERROR); + + DIAG(" %7s: opening descriptor %s", work_queue_dev_name, path); + auto fd = open(path, O_RDWR); + if(0 >= fd) + { + DIAGA(", access denied\n"); + return DML_STATUS_LIBACCEL_ERROR; + } + + // Map portal for enqcmd + auto *region_ptr = mmap(nullptr, 0x1000u, PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, 0u); + close(fd); + if(MAP_FAILED == region_ptr) + { + DIAGA(", limited MSI-X mapping failed\n"); + return DML_STATUS_LIBACCEL_ERROR; + } + DIAGA("\n"); + + auto *group_ptr = dsa_work_queue_get_group(work_queue_ptr); + if (group_ptr == nullptr) { + return DML_STATUS_LIBACCEL_ERROR; + } + + priority_ = dsa_work_queue_get_priority(work_queue_ptr); + memory_type_ = dsa_group_get_traffic_class_b(group_ptr) ? 
supported_memory_type::durable + : supported_memory_type::non_durable; + +#if 0 + DIAG(" %7s: size: %d\n", work_queue_dev_name, accfg_wq_get_size(work_queue_ptr)); + DIAG(" %7s: threshold: %d\n", work_queue_dev_name, accfg_wq_get_threshold(work_queue_ptr)); + DIAG(" %7s: priority: %d\n", work_queue_dev_name, priority_); + DIAG(" %7s: group: %d\n", work_queue_dev_name, group_id); + DIAG(" %7s: memtype: %d\n", work_queue_dev_name, memory_type_); + + for(struct accfg_engine *engine = accfg_engine_get_first(device_ptr); + engine != NULL; engine = accfg_engine_get_next(engine)) + { + if(accfg_engine_get_group_id(engine) == group_id) + DIAG(" %s\n", accfg_engine_get_devname(engine)); + } +#else + DIAG(" %7s: priority: %d\n", work_queue_dev_name, priority_); + DIAG(" %7s: memtype: %d\n", work_queue_dev_name, static_cast(memory_type_)); +#endif + + hw_queue::set_portal_ptr(region_ptr); + + return DML_STATUS_OK; +#else + return DML_STATUS_WORK_QUEUES_NOT_AVAILABLE; +#endif + } + + auto hw_queue::priority() const noexcept -> int32_t + { + return priority_; + } + + auto hw_queue::memory_type() const noexcept -> hw_queue::supported_memory_type + { + return memory_type_; + } + +} // namespace dml::core::dispatcher + +#endif diff --git a/sources/core/src/hw_dispatcher/hw_queue.hpp b/sources/core/src/hw_dispatcher/hw_queue.hpp new file mode 100644 index 0000000..5a5dd02 --- /dev/null +++ b/sources/core/src/hw_dispatcher/hw_queue.hpp @@ -0,0 +1,78 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. 
+ * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#ifndef DML_MIDDLE_LAYER_DISPATCHER_HW_QUEUE_HPP_ +#define DML_MIDDLE_LAYER_DISPATCHER_HW_QUEUE_HPP_ + +#include + +#include "dml/dmldefs.h" + +#ifdef DML_HW + +#include "legacy_headers/hardware_definitions.h" + +namespace dml::core::dispatcher +{ + + class hw_queue + { + public: + enum class supported_memory_type + { + durable, + non_durable + }; + + using descriptor_t = void; + + hw_queue() noexcept = default; + + hw_queue(const hw_queue &) noexcept = delete; + + auto operator=(const hw_queue &other) noexcept -> hw_queue & = delete; + + hw_queue(hw_queue &&other) noexcept; + + auto operator=(hw_queue &&other) noexcept -> hw_queue &; + + auto initialize_new_queue(descriptor_t *wq_descriptor_ptr) noexcept -> dsahw_status_t; + + [[nodiscard]] auto get_portal_ptr() const noexcept -> void *; + + [[nodiscard]] auto enqueue_descriptor(const dsahw_descriptor_t *desc_ptr) const noexcept -> dsahw_status_t; + + [[nodiscard]] auto priority() const noexcept -> int32_t; + + [[nodiscard]] auto memory_type() const noexcept -> supported_memory_type; + + void set_portal_ptr(void *portal_ptr) noexcept; + + ~hw_queue() noexcept; + + private: + uint32_t version_ = 0u; + int32_t priority_ = 0u; + supported_memory_type memory_type_ = supported_memory_type::non_durable; + uint64_t portal_mask_ = 0u; /**< Mask for incrementing portals */ + mutable void *portal_ptr_ = nullptr; + mutable std::atomic portal_offset_ = 0u; /**< Portal for enqcmd (mod page size)*/ + }; + +} // namespace dml::core::dispatcher +#endif + +#endif //DML_MIDDLE_LAYER_DISPATCHER_HW_QUEUE_HPP_ diff --git a/sources/hw-path/include/hardware_api.h b/sources/core/src/hw_dispatcher/legacy_headers/hardware_api.h similarity index 88% rename from sources/hw-path/include/hardware_api.h rename to 
sources/core/src/hw_dispatcher/legacy_headers/hardware_api.h index bc3c67c..9977170 100644 --- a/sources/hw-path/include/hardware_api.h +++ b/sources/core/src/hw_dispatcher/legacy_headers/hardware_api.h @@ -22,11 +22,10 @@ * @{ * @brief Contains general hardware function declarations */ -#include "hardware_limits.h" +#include "hardware_completion_records_api.h" #include "hardware_definitions.h" #include "hardware_descriptors_api.h" -#include "hardware_completion_records_api.h" - +#include "hardware_limits.h" #ifndef DML_OWN_HW_API_H__ #define DML_OWN_HW_API_H__ @@ -43,7 +42,7 @@ extern "C" { * @return Follow statuses: * - @todo Add statuses */ -dml_status_t DML_HW_API(get_context)(dsahw_context_t ** hw_context_ptr); +dml_status_t DML_HW_API(get_context)(dsahw_context_t **hw_context_ptr); /** * @brief Calls an operation, which is implemented with DSA hardware @@ -55,9 +54,7 @@ dml_status_t DML_HW_API(get_context)(dsahw_context_t ** hw_context_ptr); * @return @ref dml_status_t in according to specified DSA operation in @ref dml_job_t * */ -dml_status_t DML_HW_API(submit)(const dsahw_context_t *hw_state_ptr, - const dsahw_descriptor_t *descriptor_ptr, - dml_operation_flags_t flags); +dml_status_t DML_HW_API(submit)(const dsahw_context_t *hw_state_ptr, const dsahw_descriptor_t *descriptor_ptr, dml_operation_flags_t flags); /** * @brief Closes connection with hardware @@ -66,12 +63,10 @@ dml_status_t DML_HW_API(submit)(const dsahw_context_t *hw_state_ptr, * * @return The one of the follow statuses: * - @ref DML_STATUS_OK; - * - @ref DML_STATUS_HARDWARE_DISCONNECTION_ERROR. 
* */ dml_status_t DML_HW_API(finalize)(dsahw_context_t *hw_context_ptr); - /** * @brief Returns value of OverlappingCopySupport from GENCAP * @@ -82,11 +77,10 @@ dml_status_t DML_HW_API(finalize)(dsahw_context_t *hw_context_ptr); */ int DML_HW_API(get_overlapping_copy_support)(dsahw_context_t *hw_context_ptr); - #ifdef __cplusplus } #endif -#endif //DML_OWN_HW_API_H__ +#endif //DML_OWN_HW_API_H__ /** @} */ diff --git a/sources/hw-path/include/hardware_completion_records_api.h b/sources/core/src/hw_dispatcher/legacy_headers/hardware_completion_records_api.h similarity index 87% rename from sources/hw-path/include/hardware_completion_records_api.h rename to sources/core/src/hw_dispatcher/legacy_headers/hardware_completion_records_api.h index c2447a8..3a41113 100644 --- a/sources/hw-path/include/hardware_completion_records_api.h +++ b/sources/core/src/hw_dispatcher/legacy_headers/hardware_completion_records_api.h @@ -24,7 +24,6 @@ */ #include "hardware_definitions.h" - #ifndef DML_HW_COMPLETION_RECORDS_API_H__ #define DML_HW_COMPLETION_RECORDS_API_H__ @@ -32,7 +31,6 @@ extern "C" { #endif - /** * @brief Extracts operation results from @ref dsahw_completion_record_t * and fills the @ref dml_job_t structure in accordance with the @ref DML_OP_MEM_MOVE operation @@ -50,9 +48,8 @@ extern "C" { * */ dsahw_status_t DML_HW_API(get_mem_move_result)(const dsahw_completion_record_t *completion_record_ptr, - dml_meta_result_t *result_ptr, - uint32_t *elements_processed_ptr); - + dml_meta_result_t *result_ptr, + uint32_t *elements_processed_ptr); /** * @brief Extracts operation results from @ref dsahw_completion_record_t @@ -67,9 +64,7 @@ dsahw_status_t DML_HW_API(get_mem_move_result)(const dsahw_completion_record_t * * - @ref DML_STATUS_PAGE_FAULT_ERROR; * */ -dsahw_status_t DML_HW_API(get_fill_result)(const dsahw_completion_record_t *completion_record_ptr, - uint32_t *elements_processed_ptr); - +dsahw_status_t DML_HW_API(get_fill_result)(const dsahw_completion_record_t 
*completion_record_ptr, uint32_t *elements_processed_ptr); /** * @brief Extracts operation results from @ref dsahw_completion_record_t @@ -88,9 +83,8 @@ dsahw_status_t DML_HW_API(get_fill_result)(const dsahw_completion_record_t *comp * */ dsahw_status_t DML_HW_API(get_compare_result)(const dsahw_completion_record_t *completion_record_ptr, - dml_meta_result_t *result_ptr, - uint32_t *elements_processed_ptr); - + dml_meta_result_t *result_ptr, + uint32_t *elements_processed_ptr); /** * @brief Extracts operation results from @ref dsahw_completion_record_t @@ -113,11 +107,10 @@ dsahw_status_t DML_HW_API(get_compare_result)(const dsahw_completion_record_t *c * */ dsahw_status_t DML_HW_API(get_delta_create_result)(const dsahw_completion_record_t *completion_record_ptr, - const uint8_t *destination_ptr, - uint32_t *delta_record_length_ptr, - dml_meta_result_t *result_ptr, - uint32_t *elements_processed_ptr); - + const uint8_t *destination_ptr, + uint32_t *delta_record_length_ptr, + dml_meta_result_t *result_ptr, + uint32_t *elements_processed_ptr); /** * @brief Extracts operation results from @ref dsahw_completion_record_t @@ -135,9 +128,7 @@ dsahw_status_t DML_HW_API(get_delta_create_result)(const dsahw_completion_record * - @ref DML_STATUS_OVERLAPPING_BUFFER_ERROR. 
* */ -dsahw_status_t DML_HW_API(get_delta_apply_result)(const dsahw_completion_record_t *completion_record_ptr, - uint32_t *elements_processed_ptr); - +dsahw_status_t DML_HW_API(get_delta_apply_result)(const dsahw_completion_record_t *completion_record_ptr, uint32_t *elements_processed_ptr); /** * @brief Extracts operation results from @ref dsahw_completion_record_t @@ -154,9 +145,7 @@ dsahw_status_t DML_HW_API(get_delta_apply_result)(const dsahw_completion_record_ * - @ref DML_STATUS_OVERLAPPING_BUFFER_ERROR * */ -dsahw_status_t DML_HW_API(get_dualcast_result)(const dsahw_completion_record_t *completion_record_ptr, - uint32_t *elements_processed_ptr); - +dsahw_status_t DML_HW_API(get_dualcast_result)(const dsahw_completion_record_t *completion_record_ptr, uint32_t *elements_processed_ptr); /** * @brief Extracts operation results from @ref dsahw_completion_record_t @@ -174,9 +163,8 @@ dsahw_status_t DML_HW_API(get_dualcast_result)(const dsahw_completion_record_t * * */ dsahw_status_t DML_HW_API(get_crc_result)(const dsahw_completion_record_t *completion_record_ptr, - uint32_t *crc_result_ptr, - uint32_t *elements_processed_ptr); - + uint32_t *crc_result_ptr, + uint32_t *elements_processed_ptr); /** * @brief Extracts operation results from @ref dsahw_completion_record_t @@ -195,9 +183,8 @@ dsahw_status_t DML_HW_API(get_crc_result)(const dsahw_completion_record_t *compl * */ dsahw_status_t DML_HW_API(get_crc_copy_result)(const dsahw_completion_record_t *completion_record_ptr, - uint32_t *crc_result_ptr, - uint32_t *elements_processed_ptr); - + uint32_t *crc_result_ptr, + uint32_t *elements_processed_ptr); /** * @brief Extracts operation results from @ref dsahw_completion_record_t @@ -212,9 +199,7 @@ dsahw_status_t DML_HW_API(get_crc_copy_result)(const dsahw_completion_record_t * * - @ref DML_STATUS_PAGE_FAULT_ERROR; * */ -dsahw_status_t DML_HW_API(get_cache_flush_result)(const dsahw_completion_record_t *completion_record_ptr, - uint32_t *elements_processed_ptr); - 
+dsahw_status_t DML_HW_API(get_cache_flush_result)(const dsahw_completion_record_t *completion_record_ptr, uint32_t *elements_processed_ptr); /** * @brief Extracts operation results from @ref dsahw_completion_record_t @@ -233,9 +218,9 @@ dsahw_status_t DML_HW_API(get_cache_flush_result)(const dsahw_completion_record_ * */ dsahw_status_t DML_HW_API(get_check_dif_result)(const dsahw_completion_record_t *completion_record_ptr, - dml_meta_result_t *result_ptr, - dml_dif_config_t *dif_config_ptr, - uint32_t *elements_processed_ptr); + dml_meta_result_t *result_ptr, + dml_dif_config_t *dif_config_ptr, + uint32_t *elements_processed_ptr); /** * @brief Extracts operation results from @ref dsahw_completion_record_t @@ -254,9 +239,8 @@ dsahw_status_t DML_HW_API(get_check_dif_result)(const dsahw_completion_record_t * */ dsahw_status_t DML_HW_API(get_insert_dif_result)(const dsahw_completion_record_t *completion_record_ptr, - dml_dif_config_t *dif_config_ptr, - uint32_t *elements_processed_ptr); - + dml_dif_config_t *dif_config_ptr, + uint32_t *elements_processed_ptr); /** * @brief Extracts operation results from @ref dsahw_completion_record_t @@ -275,9 +259,8 @@ dsahw_status_t DML_HW_API(get_insert_dif_result)(const dsahw_completion_record_t * */ dsahw_status_t DML_HW_API(get_strip_dif_result)(const dsahw_completion_record_t *completion_record_ptr, - dml_dif_config_t *dif_config_ptr, - uint32_t *elements_processed_ptr); - + dml_dif_config_t *dif_config_ptr, + uint32_t *elements_processed_ptr); /** * @brief Extracts operation results from @ref dsahw_completion_record_t @@ -297,10 +280,9 @@ dsahw_status_t DML_HW_API(get_strip_dif_result)(const dsahw_completion_record_t * */ dsahw_status_t DML_HW_API(get_update_dif_result)(const dsahw_completion_record_t *completion_record_ptr, - dml_dif_config_t *dif_config_ptr, - dml_meta_result_t *result_ptr, - uint32_t *elements_processed_ptr); - + dml_dif_config_t *dif_config_ptr, + dml_meta_result_t *result_ptr, + uint32_t 
*elements_processed_ptr); /** * @brief Extracts operation results from @ref dsahw_completion_record_t @@ -315,7 +297,6 @@ dsahw_status_t DML_HW_API(get_update_dif_result)(const dsahw_completion_record_t */ dsahw_status_t DML_HW_API(get_nop_result)(const dsahw_completion_record_t *completion_record_ptr); - /** * @brief Extracts operation results from @ref dsahw_completion_record_t * and fills the @ref dml_job_t structure in accordance with the @ref DML_OP_BATCH operation @@ -329,9 +310,7 @@ dsahw_status_t DML_HW_API(get_nop_result)(const dsahw_completion_record_t *compl * - @ref DML_STATUS_BATCH_ERROR; * */ -dsahw_status_t DML_HW_API(get_batch_result)(const dsahw_completion_record_t *completion_record_ptr, - uint32_t *descriptors_processed_ptr); - +dsahw_status_t DML_HW_API(get_batch_result)(const dsahw_completion_record_t *completion_record_ptr, uint32_t *descriptors_processed_ptr); /** * @brief Extracts operation results from @ref dsahw_completion_record_t @@ -346,11 +325,10 @@ dsahw_status_t DML_HW_API(get_batch_result)(const dsahw_completion_record_t *com */ dsahw_status_t DML_HW_API(get_drain_result)(const dsahw_completion_record_t *completion_record_ptr); - #ifdef __cplusplus } #endif -#endif //DML_HW_COMPLETION_RECORDS_API_H__ +#endif //DML_HW_COMPLETION_RECORDS_API_H__ /** @} */ diff --git a/sources/hw-path/include/hardware_configuration_driver.h b/sources/core/src/hw_dispatcher/legacy_headers/hardware_configuration_driver.h similarity index 60% rename from sources/hw-path/include/hardware_configuration_driver.h rename to sources/core/src/hw_dispatcher/legacy_headers/hardware_configuration_driver.h index 2861910..c7f05c9 100644 --- a/sources/hw-path/include/hardware_configuration_driver.h +++ b/sources/core/src/hw_dispatcher/legacy_headers/hardware_configuration_driver.h @@ -1,18 +1,18 @@ /* - * Copyright 2021 Intel Corporation. 
- * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. - * - */ +* Copyright 2021 Intel Corporation. +* +* This software and the related documents are Intel copyrighted materials, +* and your use of them is governed by the express license under which they +* were provided to you ("License"). Unless the License provides otherwise, +* you may not use, modify, copy, publish, distribute, disclose or transmit +* this software or the related documents without Intel's prior written +* permission. +* +* This software and the related documents are provided as is, with no +* express or implied warranties, other than those that are expressly +* stated in the License. 
+* +*/ #ifndef DML_SOURCES_HW_PATH_INCLUDE_HW_CONFIGURATION_DRIVER_H_ #define DML_SOURCES_HW_PATH_INCLUDE_HW_CONFIGURATION_DRIVER_H_ @@ -30,33 +30,33 @@ extern "C" { #endif /** - * @brief Type of function that should be loaded from accelerator configuration driver - */ +* @brief Type of function that should be loaded from accelerator configuration driver +*/ typedef int (*library_function)(); /** - * @brief Structure that maps function implementation to its name - */ +* @brief Structure that maps function implementation to its name +*/ typedef struct { - library_function function; /**< Function address */ - const char *function_name; /**< Function name */ + library_function function; /**< Function address */ + const char *function_name; /**< Function name */ } dsa_desc_t; /** - * @brief Structure represents configuration driver used for access to accelerator instances and their properties - */ +* @brief Structure represents configuration driver used for access to accelerator instances and their properties +*/ typedef struct { - void *driver_instance_ptr; /**< Pointer to a loaded driver */ + void *driver_instance_ptr; /**< Pointer to a loaded driver */ } hw_driver_t; /** - * @brief Initializes driver functions - * - * @note Should be called only once - * - * @return status of initialization - */ +* @brief Initializes driver functions +* +* @note Should be called only once +* +* @return status of initialization +*/ dsahw_status_t DML_HW_API(initialize_accelerator_driver)(hw_driver_t *driver_ptr); void DML_HW_API(finalize_accelerator_driver)(hw_driver_t *driver_ptr); @@ -71,7 +71,7 @@ const char *DML_HW_API(device_get_name)(struct accfg_device *device); struct accfg_device *DML_HW_API(device_get_next)(struct accfg_device *device); -uint32_t DML_HW_API(device_get_major_version)(struct accfg_device *device); +uint32_t DML_HW_API(device_get_version)(struct accfg_device *device); enum accfg_device_state DML_HW_API(device_get_state)(struct accfg_device *device); @@ -83,7 
+83,7 @@ struct accfg_wq *DML_HW_API(get_first_work_queue)(struct accfg_device *device); struct accfg_wq *DML_HW_API(work_queue_get_next)(struct accfg_wq *wq); -int32_t DML_HW_API(work_queue_get_minor_version)(struct accfg_wq *wq); +const char * DML_HW_API(work_queue_get_device_name)(struct accfg_wq *wq); int32_t DML_HW_API(work_queue_get_priority)(struct accfg_wq *wq); @@ -111,4 +111,4 @@ int DML_HW_API(work_queue_get_device_path)(struct accfg_wq *wq, char *buf, size_ } #endif -#endif //DML_SOURCES_HW_PATH_INCLUDE_HW_CONFIGURATION_DRIVER_H_ +#endif //DML_SOURCES_HW_PATH_INCLUDE_HW_CONFIGURATION_DRIVER_H_ \ No newline at end of file diff --git a/sources/hw-path/include/hardware_definitions.h b/sources/core/src/hw_dispatcher/legacy_headers/hardware_definitions.h similarity index 66% rename from sources/hw-path/include/hardware_definitions.h rename to sources/core/src/hw_dispatcher/legacy_headers/hardware_definitions.h index 12f95fe..71334b7 100644 --- a/sources/hw-path/include/hardware_definitions.h +++ b/sources/core/src/hw_dispatcher/legacy_headers/hardware_definitions.h @@ -30,17 +30,24 @@ #ifndef DSA_HW_DSA_DEFINES_H__ #define DSA_HW_DSA_DEFINES_H__ +#ifdef LOG_HW_INIT +#include +#define DIAGA(...) printf(__VA_ARGS__); fflush(stdout) /**< Diagnostic printer for appending to line */ +#define DIAG(...) printf("dml-diag: " __VA_ARGS__); fflush(stdout) /**< Diagnostic printer */ +#else +#define DIAGA(...) /**< Diagnostic printer for appending to line */ +#define DIAG(...) 
/**< Diagnostic printer */ +#endif #ifdef __cplusplus extern "C" { #endif - /* ------ Definitions ------ */ -#if defined( _WIN32 ) || defined ( _WIN64 ) -#define DML_HW_STDCALL __stdcall -#define DML_HW_CDECL __cdecl +#if defined(_WIN32) || defined(_WIN64) +#define DML_HW_STDCALL __stdcall +#define DML_HW_CDECL __cdecl #else #define DML_HW_STDCALL #define DML_HW_CDECL @@ -48,74 +55,67 @@ extern "C" { /* ------ Macros ------ */ -#if !defined( DML_HW_API ) +#if !defined(DML_HW_API) #define DML_HW_API(name) DML_HW_STDCALL dsa_##name /**< Declaration macros to manipulate function name */ #endif - /* ------ Statuses ------ */ typedef dml_status_t dsahw_status_t; /**< Redefinition of @ref dml_status_t for core functions */ - /* ------ State ------ */ #if defined(__GNUC__) - /** +/** * @brief Packs a structure byte by byte */ - #define DML_HW_BYTE_PACKED_STRUCTURE_BEGIN \ - typedef struct __attribute__ ((__packed__)) +#define DML_HW_BYTE_PACKED_STRUCTURE_BEGIN typedef struct __attribute__((__packed__)) - /** +/** * @brief Pops a previous structure pack property */ - #define DML_HW_BYTE_PACKED_STRUCTURE_END -#elif(_MSC_VER) - /** +#define DML_HW_BYTE_PACKED_STRUCTURE_END +#elif (_MSC_VER) +/** * @brief Packs a structure byte by byte */ - #define DML_HW_BYTE_PACKED_STRUCTURE_BEGIN \ - __pragma(pack(push, 1)) \ - typedef struct +#define DML_HW_BYTE_PACKED_STRUCTURE_BEGIN __pragma(pack(push, 1)) typedef struct - /** +/** * @brief Pops a previous structure pack property */ - #define DML_HW_BYTE_PACKED_STRUCTURE_END \ - __pragma(pack(pop)) +#define DML_HW_BYTE_PACKED_STRUCTURE_END __pragma(pack(pop)) #else - #error Compiler not supported +#error Compiler not supported #endif - /* ################# DSA DESCRIPTOR ################# */ -#define DSA_HW_DESCRIPTOR_SIZE (64u) /**< DSA hardware descriptor byte size */ +#define DSA_HW_DESCRIPTOR_SIZE (64u) /**< DSA hardware descriptor byte size */ /** * @brief Defines a common type of the DSA hardware descriptor */ 
DML_HW_BYTE_PACKED_STRUCTURE_BEGIN { - uint8_t bytes[DSA_HW_DESCRIPTOR_SIZE]; /**< Allocation memory for an abstract DSA descriptor*/ -} dsahw_descriptor_t; + uint8_t bytes[DSA_HW_DESCRIPTOR_SIZE]; /**< Allocation memory for an abstract DSA descriptor*/ +} +dsahw_descriptor_t; DML_HW_BYTE_PACKED_STRUCTURE_END - /* ################# DSA COMPLETION RECORD ################# */ -#define DSA_HW_COMPLETION_RECORD_SIZE (32u) /**< DSA hardware completion record byte size */ - +#define DSA_HW_COMPLETION_RECORD_SIZE (32u) /**< DSA hardware completion record byte size */ /** * @brief Defines an abstract type of the DSA hardware completion record */ DML_HW_BYTE_PACKED_STRUCTURE_BEGIN { - uint8_t status; /**< DSA completion status field */ - uint8_t bytes[DSA_HW_COMPLETION_RECORD_SIZE - 1u]; /**< Allocation memory for others fields*/ -} dsahw_completion_record_t; + uint8_t status; /**< DSA completion status field */ + uint8_t bytes[DSA_HW_COMPLETION_RECORD_SIZE - 1u]; /**< Allocation memory for others fields*/ +} +dsahw_completion_record_t; DML_HW_BYTE_PACKED_STRUCTURE_END /** @@ -135,12 +135,11 @@ typedef uint8_t portal_t; /**< Define portal type */ */ typedef struct { - portal_t *portals_ptr; /**< Pointer to memory, which is mapped as DSA Portals */ - uint32_t current_portal; /**< Current available portal to enqueue a descriptor */ - uint32_t portal_count; /**< Maximal count of portals in the portal table */ + portal_t *portals_ptr; /**< Pointer to memory, which is mapped as DSA Portals */ + uint32_t current_portal; /**< Current available portal to enqueue a descriptor */ + uint32_t portal_count; /**< Maximal count of portals in the portal table */ } own_hw_portal_information_t; - /** * @brief Contain information about TC-A and TC-B portals */ @@ -150,7 +149,6 @@ typedef struct own_hw_portal_information_t tc_b_portals; /**< WQs working with TC-B */ } own_hw_portal_table_t; - /** * @brief Contains information from General Capabilities Register (GENCAP) */ @@ -164,28 +162,25 @@ 
typedef struct uint8_t destination_readback_support; /**< The Destination Readback flag in descriptors is supported/unsupported */ uint8_t flush_cache_control_support; /**< Cache control for cache flush operations is supported/unsupported */ uint8_t overlapping_copy_support; /**< Memory overlapping supported/unsupported for the memory move operation */ - uint8_t interrupt_handle_request; /**< Used to determine the interrupt handle to use in descriptors */ uint8_t block_on_fault_support; /**< Block of fault supported/unsupported */ uint8_t configuration_support; /**< Group configuration and WQ configuration are read-write/read-only */ - uint8_t max_descriptors; /**< The maximum number of descriptors that can be in progress in each engine */ } own_hw_gen_cap_t; - /** * @brief Contains specific information about Hardware Path */ -typedef struct { - own_dml_structure_id_t guard; /**< Structure guard */ - own_hw_portal_table_t portal_table; /**< Contains information about open portals */ - own_hw_gen_cap_t gen_cap; /**< General Capabilities Register fields */ +typedef struct +{ + own_dml_structure_id_t guard; /**< Structure guard */ + own_hw_portal_table_t portal_table; /**< Contains information about open portals */ + own_hw_gen_cap_t gen_cap; /**< General Capabilities Register fields */ #if defined(linux) - struct accfg_ctx *dsa_context_ptr; /**< @todo */ + struct accfg_ctx *dsa_context_ptr; /**< @todo */ #endif } dsahw_context_t; - #ifdef __cplusplus } #endif -#endif // DSA_HW_DSA_DEFINES_H__ +#endif // DSA_HW_DSA_DEFINES_H__ diff --git a/sources/hw-path/include/hardware_descriptors_api.h b/sources/core/src/hw_dispatcher/legacy_headers/hardware_descriptors_api.h similarity index 78% rename from sources/hw-path/include/hardware_descriptors_api.h rename to sources/core/src/hw_dispatcher/legacy_headers/hardware_descriptors_api.h index a0ef2b5..77e5f3a 100644 --- a/sources/hw-path/include/hardware_descriptors_api.h +++ 
b/sources/core/src/hw_dispatcher/legacy_headers/hardware_descriptors_api.h @@ -24,7 +24,6 @@ */ #include "hardware_definitions.h" - #ifndef DSA_HARDWARE_DESCRIPTORS_API_H__ #define DSA_HARDWARE_DESCRIPTORS_API_H__ @@ -32,7 +31,6 @@ extern "C" { #endif - /** * @brief Fills a descriptor for the @ref DML_OP_MEM_MOVE operation * @@ -46,14 +44,13 @@ extern "C" { * @return The following statuses: * @todo add return statuses */ -dsahw_status_t DML_HW_API(init_mem_move_descriptor)(dsahw_descriptor_t *descriptor_ptr, - const uint8_t *source_ptr, - uint32_t source_length, - uint8_t *destination_ptr, - dml_operation_flags_t flags, +dsahw_status_t DML_HW_API(init_mem_move_descriptor)(dsahw_descriptor_t *descriptor_ptr, + const uint8_t *source_ptr, + uint32_t source_length, + uint8_t *destination_ptr, + dml_operation_flags_t flags, dsahw_completion_record_t *result_ptr); - /** * @brief Fills a descriptor for the @ref DML_OP_FILL operation * @@ -68,14 +65,13 @@ dsahw_status_t DML_HW_API(init_mem_move_descriptor)(dsahw_descriptor_t *descript * - @ref DML_STATUS_OK; * - @ref DML_STATUS_NULL_POINTER_ERROR. */ -dsahw_status_t DML_HW_API(init_fill_descriptor)(dsahw_descriptor_t *descriptor_ptr, - uint64_t *pattern_ptr, - uint8_t *destination_ptr, - uint32_t destination_length, - dml_operation_flags_t flags, +dsahw_status_t DML_HW_API(init_fill_descriptor)(dsahw_descriptor_t *descriptor_ptr, + uint64_t *pattern_ptr, + uint8_t *destination_ptr, + uint32_t destination_length, + dml_operation_flags_t flags, dsahw_completion_record_t *result_ptr); - /** * @brief Fills a descriptor for the @ref DML_OP_COMPARE operation * @@ -91,15 +87,14 @@ dsahw_status_t DML_HW_API(init_fill_descriptor)(dsahw_descriptor_t *descriptor_p * - @ref DML_STATUS_OK; * - @ref DML_STATUS_NULL_POINTER_ERROR. 
*/ -dsahw_status_t DML_HW_API(init_compare_descriptor)(dsahw_descriptor_t *descriptor_ptr, - const uint8_t *source_first_ptr, - const uint8_t *source_second_ptr, - uint32_t source_length, - dml_meta_result_t expected_result, - dml_operation_flags_t flags, +dsahw_status_t DML_HW_API(init_compare_descriptor)(dsahw_descriptor_t *descriptor_ptr, + const uint8_t *source_first_ptr, + const uint8_t *source_second_ptr, + uint32_t source_length, + dml_meta_result_t expected_result, + dml_operation_flags_t flags, dsahw_completion_record_t *result_ptr); - /** * @brief Fills a descriptor for the @ref DML_OP_COMPARE_PATTERN operation * @@ -115,15 +110,14 @@ dsahw_status_t DML_HW_API(init_compare_descriptor)(dsahw_descriptor_t *descripto * - @ref DML_STATUS_OK; * - @ref DML_STATUS_NULL_POINTER_ERROR. */ -dsahw_status_t DML_HW_API(init_compare_pattern_descriptor)(dsahw_descriptor_t *descriptor_ptr, - const uint8_t *source_ptr, - uint32_t source_length, - const uint64_t *pattern_ptr, - dml_meta_result_t expected_result, - dml_operation_flags_t flags, +dsahw_status_t DML_HW_API(init_compare_pattern_descriptor)(dsahw_descriptor_t *descriptor_ptr, + const uint8_t *source_ptr, + uint32_t source_length, + const uint64_t *pattern_ptr, + dml_meta_result_t expected_result, + dml_operation_flags_t flags, dsahw_completion_record_t *result_ptr); - /** * @brief Fills a descriptor for the @ref DML_OP_DELTA_CREATE operation * @@ -141,17 +135,16 @@ dsahw_status_t DML_HW_API(init_compare_pattern_descriptor)(dsahw_descriptor_t *d * - @ref DML_STATUS_OK; * - @ref DML_STATUS_NULL_POINTER_ERROR. 
*/ -dsahw_status_t DML_HW_API(init_delta_create_descriptor)(dsahw_descriptor_t *descriptor_ptr, - uint8_t *source_first_ptr, - uint8_t *source_second_ptr, - uint32_t source_length, - uint8_t *delta_record_ptr, - uint32_t max_delta_record_length, - dml_meta_result_t expected_result, - dml_operation_flags_t flags, +dsahw_status_t DML_HW_API(init_delta_create_descriptor)(dsahw_descriptor_t *descriptor_ptr, + uint8_t *source_first_ptr, + uint8_t *source_second_ptr, + uint32_t source_length, + uint8_t *delta_record_ptr, + uint32_t max_delta_record_length, + dml_meta_result_t expected_result, + dml_operation_flags_t flags, dsahw_completion_record_t *result_ptr); - /** * @brief Fills a descriptor for the @ref DML_OP_DELTA_APPLY operation * @@ -167,15 +160,14 @@ dsahw_status_t DML_HW_API(init_delta_create_descriptor)(dsahw_descriptor_t *desc * - @ref DML_STATUS_OK; * - @ref DML_STATUS_NULL_POINTER_ERROR. */ -dsahw_status_t DML_HW_API(init_delta_apply_descriptor)(dsahw_descriptor_t *descriptor_ptr, - uint8_t *delta_record_ptr, - uint32_t delta_record_length, - uint8_t *destination_ptr, - uint32_t destination_length, - dml_operation_flags_t flags, +dsahw_status_t DML_HW_API(init_delta_apply_descriptor)(dsahw_descriptor_t *descriptor_ptr, + uint8_t *delta_record_ptr, + uint32_t delta_record_length, + uint8_t *destination_ptr, + uint32_t destination_length, + dml_operation_flags_t flags, dsahw_completion_record_t *result_ptr); - /** * @brief Fills a descriptor for the @ref DML_OP_DUALCAST operation * @@ -191,15 +183,14 @@ dsahw_status_t DML_HW_API(init_delta_apply_descriptor)(dsahw_descriptor_t *descr * - @ref DML_STATUS_OK; * - @ref DML_STATUS_NULL_POINTER_ERROR. 
*/ -dsahw_status_t DML_HW_API(init_dualcast_descriptor)(dsahw_descriptor_t *descriptor_ptr, - const uint8_t *source_ptr, - uint32_t source_length, - uint8_t *destination_first_ptr, - uint8_t *destination_second_ptr, - dml_operation_flags_t flags, +dsahw_status_t DML_HW_API(init_dualcast_descriptor)(dsahw_descriptor_t *descriptor_ptr, + const uint8_t *source_ptr, + uint32_t source_length, + uint8_t *destination_first_ptr, + uint8_t *destination_second_ptr, + dml_operation_flags_t flags, dsahw_completion_record_t *result_ptr); - /** * @brief Fills a descriptor for @ref DML_OP_CRC operation * @@ -214,14 +205,13 @@ dsahw_status_t DML_HW_API(init_dualcast_descriptor)(dsahw_descriptor_t *descript * - @ref DML_STATUS_OK; * - @ref DML_STATUS_NULL_POINTER_ERROR. */ -dsahw_status_t DML_HW_API(init_crc_descriptor)(dsahw_descriptor_t *descriptor_ptr, - uint8_t *source_ptr, - uint32_t source_length, - uint32_t *crc_checksum_ptr, - dml_operation_flags_t flags, +dsahw_status_t DML_HW_API(init_crc_descriptor)(dsahw_descriptor_t *descriptor_ptr, + uint8_t *source_ptr, + uint32_t source_length, + uint32_t *crc_checksum_ptr, + dml_operation_flags_t flags, dsahw_completion_record_t *result_ptr); - /** * @brief Fills a descriptor for @ref DML_OP_COPY_CRC operation * @@ -237,15 +227,14 @@ dsahw_status_t DML_HW_API(init_crc_descriptor)(dsahw_descriptor_t *descriptor_pt * - @ref DML_STATUS_OK; * - @ref DML_STATUS_NULL_POINTER_ERROR. 
*/ -dsahw_status_t DML_HW_API(init_crc_copy_descriptor)(dsahw_descriptor_t *descriptor_ptr, - uint8_t *source_ptr, - uint32_t source_length, - uint32_t *crc_checksum_ptr, - uint8_t *destination_ptr, - dml_operation_flags_t flags, +dsahw_status_t DML_HW_API(init_crc_copy_descriptor)(dsahw_descriptor_t *descriptor_ptr, + uint8_t *source_ptr, + uint32_t source_length, + uint32_t *crc_checksum_ptr, + uint8_t *destination_ptr, + dml_operation_flags_t flags, dsahw_completion_record_t *result_ptr); - /** * @brief Fills a descriptor for @ref DML_OP_CACHE_FLUSH operation * @@ -259,13 +248,12 @@ dsahw_status_t DML_HW_API(init_crc_copy_descriptor)(dsahw_descriptor_t *descript * - @ref DML_STATUS_OK; * - @ref DML_STATUS_NULL_POINTER_ERROR. */ -dsahw_status_t DML_HW_API(init_cache_flush_descriptor)(dsahw_descriptor_t *descriptor_ptr, - uint8_t *memory_region_ptr, - uint32_t memory_region_length, - dml_operation_flags_t flags, +dsahw_status_t DML_HW_API(init_cache_flush_descriptor)(dsahw_descriptor_t *descriptor_ptr, + uint8_t *memory_region_ptr, + uint32_t memory_region_length, + dml_operation_flags_t flags, dsahw_completion_record_t *result_ptr); - /** * @brief Fills a descriptor for the @ref DML_OP_DIF_CHECK operation * @@ -281,14 +269,13 @@ dsahw_status_t DML_HW_API(init_cache_flush_descriptor)(dsahw_descriptor_t *descr * - @ref DML_STATUS_JOB_LENGTH_ERROR; * - @ref DML_STATUS_NULL_POINTER_ERROR. 
*/ -dsahw_status_t DML_HW_API(init_check_dif_descriptor)(dsahw_descriptor_t *descriptor_ptr, - uint8_t *source_ptr, - uint32_t source_length, - const dml_dif_config_t *dif_config_ptr, - dml_operation_flags_t flags, +dsahw_status_t DML_HW_API(init_check_dif_descriptor)(dsahw_descriptor_t *descriptor_ptr, + uint8_t *source_ptr, + uint32_t source_length, + const dml_dif_config_t *dif_config_ptr, + dml_operation_flags_t flags, dsahw_completion_record_t *result_ptr); - /** * @brief Fills a descriptor for the @ref DML_OP_DIF_INSERT operation * @@ -304,15 +291,14 @@ dsahw_status_t DML_HW_API(init_check_dif_descriptor)(dsahw_descriptor_t *descrip * - @ref DML_STATUS_OK; * - @ref DML_STATUS_JOB_LENGTH_ERROR; */ -dsahw_status_t DML_HW_API(init_insert_dif_descriptor)(dsahw_descriptor_t *descriptor_ptr, - uint8_t *source_ptr, - uint32_t source_length, - const dml_dif_config_t *dif_config_ptr, - uint8_t *destination_ptr, - dml_operation_flags_t flags, +dsahw_status_t DML_HW_API(init_insert_dif_descriptor)(dsahw_descriptor_t *descriptor_ptr, + uint8_t *source_ptr, + uint32_t source_length, + const dml_dif_config_t *dif_config_ptr, + uint8_t *destination_ptr, + dml_operation_flags_t flags, dsahw_completion_record_t *result_ptr); - /** * @brief Fills a descriptor for the @ref DML_OP_DIF_STRIP operation * @@ -328,15 +314,14 @@ dsahw_status_t DML_HW_API(init_insert_dif_descriptor)(dsahw_descriptor_t *descri * - @ref DML_STATUS_OK; * - @ref DML_STATUS_JOB_LENGTH_ERROR; */ -dsahw_status_t DML_HW_API(init_strip_dif_descriptor)(dsahw_descriptor_t *descriptor_ptr, - uint8_t *source_ptr, - uint32_t source_length, - const dml_dif_config_t *dif_config_ptr, - uint8_t *destination_ptr, - dml_operation_flags_t flags, +dsahw_status_t DML_HW_API(init_strip_dif_descriptor)(dsahw_descriptor_t *descriptor_ptr, + uint8_t *source_ptr, + uint32_t source_length, + const dml_dif_config_t *dif_config_ptr, + uint8_t *destination_ptr, + dml_operation_flags_t flags, dsahw_completion_record_t *result_ptr); - 
/** * @brief Fills a descriptor for the @ref DML_OP_DIF_UPDATE operation * @@ -352,15 +337,14 @@ dsahw_status_t DML_HW_API(init_strip_dif_descriptor)(dsahw_descriptor_t *descrip * - @ref DML_STATUS_OK; * - @ref DML_STATUS_JOB_LENGTH_ERROR; */ -dsahw_status_t DML_HW_API(init_update_dif_descriptor)(dsahw_descriptor_t *descriptor_ptr, - uint8_t *source_ptr, - uint32_t source_length, - const dml_dif_config_t *dif_config_ptr, - uint8_t *destination_ptr, - dml_operation_flags_t flags, +dsahw_status_t DML_HW_API(init_update_dif_descriptor)(dsahw_descriptor_t *descriptor_ptr, + uint8_t *source_ptr, + uint32_t source_length, + const dml_dif_config_t *dif_config_ptr, + uint8_t *destination_ptr, + dml_operation_flags_t flags, dsahw_completion_record_t *result_ptr); - /** * @brief Fills a descriptor for the @ref DML_OP_NOP operation * @@ -372,11 +356,10 @@ dsahw_status_t DML_HW_API(init_update_dif_descriptor)(dsahw_descriptor_t *descri * - @ref DML_STATUS_OK; * - @ref DML_STATUS_NULL_POINTER_ERROR. */ -dsahw_status_t DML_HW_API(init_nop_descriptor)(dsahw_descriptor_t *descriptor_ptr, - dml_operation_flags_t flags, +dsahw_status_t DML_HW_API(init_nop_descriptor)(dsahw_descriptor_t *descriptor_ptr, + dml_operation_flags_t flags, dsahw_completion_record_t *result_ptr); - /** * @brief Fills a descriptor for the @ref DML_OP_BATCH operation * @@ -390,13 +373,12 @@ dsahw_status_t DML_HW_API(init_nop_descriptor)(dsahw_descriptor_t *descriptor_pt * - @ref DML_STATUS_OK; * - @ref DML_STATUS_NULL_POINTER_ERROR. 
*/ -dsahw_status_t DML_HW_API(init_batch_descriptor)(dsahw_descriptor_t *descriptor_ptr, - const dsahw_descriptor_t *internal_descriptors_ptr, - uint32_t internal_descriptors_count, - dml_operation_flags_t flags, +dsahw_status_t DML_HW_API(init_batch_descriptor)(dsahw_descriptor_t *descriptor_ptr, + const dsahw_descriptor_t *internal_descriptors_ptr, + uint32_t internal_descriptors_count, + dml_operation_flags_t flags, dsahw_completion_record_t *result_ptr); - /** * @brief Fills a descriptor for the @ref DML_OP_DRAIN operation * @@ -410,16 +392,16 @@ dsahw_status_t DML_HW_API(init_batch_descriptor)(dsahw_descriptor_t *descriptor_ * - @ref DML_STATUS_OK; * - @ref DML_STATUS_NULL_POINTER_ERROR. */ -dsahw_status_t DML_HW_API(init_drain_descriptor)(dsahw_descriptor_t *descriptor_ptr, - uint64_t *readback_address1_ptr, - uint64_t *readback_address2_ptr, - dml_operation_flags_t flags, +dsahw_status_t DML_HW_API(init_drain_descriptor)(dsahw_descriptor_t *descriptor_ptr, + uint64_t *readback_address1_ptr, + uint64_t *readback_address2_ptr, + dml_operation_flags_t flags, dsahw_completion_record_t *result_ptr); #ifdef __cplusplus } #endif -#endif //DSA_HARDWARE_DESCRIPTORS_API_H__ +#endif //DSA_HARDWARE_DESCRIPTORS_API_H__ /** @} */ diff --git a/sources/hw-path/include/hardware_limits.h b/sources/core/src/hw_dispatcher/legacy_headers/hardware_limits.h similarity index 97% rename from sources/hw-path/include/hardware_limits.h rename to sources/core/src/hw_dispatcher/legacy_headers/hardware_limits.h index 78121ad..87e75a9 100644 --- a/sources/hw-path/include/hardware_limits.h +++ b/sources/core/src/hw_dispatcher/legacy_headers/hardware_limits.h @@ -15,22 +15,19 @@ */ /** - * + * * @brief Contains public hardware limits * */ #include "hardware_definitions.h" - #ifndef DML_HW_DSA_LIMITS_H__ #define DML_HW_DSA_LIMITS_H__ - #ifdef __cplusplus extern "C" { #endif - /** * @brief Maximum number of the descriptors/completion records that can be launched in the single batch */ @@ -56,9 
+53,8 @@ extern "C" { */ #define DSA_HW_MAX_RESULT_ALIGNMENT (32u) - #ifdef __cplusplus } #endif -#endif // DML_HW_DSA_LIMITS_H__ +#endif // DML_HW_DSA_LIMITS_H__ diff --git a/sources/hw-path/include/libaccel_config.h b/sources/core/src/hw_dispatcher/legacy_headers/libaccel_config.h similarity index 65% rename from sources/hw-path/include/libaccel_config.h rename to sources/core/src/hw_dispatcher/legacy_headers/libaccel_config.h index 747251e..3a1181b 100644 --- a/sources/hw-path/include/libaccel_config.h +++ b/sources/core/src/hw_dispatcher/legacy_headers/libaccel_config.h @@ -14,9 +14,8 @@ * */ -/* - * SPDX-License-Identifier: LGPL-2.0 - */ +// SPDX-License-Identifier: LGPL-2.1 +/* Copyright(c) 2019 Intel Corporation. All rights reserved. */ #ifndef _LIBACCFG_H_ #define _LIBACCFG_H_ @@ -45,73 +44,92 @@ extern "C" { #define UUID_ZERO "00000000-0000-0000-0000-000000000000" /* no need to save device state */ +enum accfg_device_type { + ACCFG_DEVICE_DSA = 0, + ACCFG_DEVICE_IAX = 1, + ACCFG_DEVICE_TYPE_UNKNOWN = -1, +}; + enum accfg_device_state { - ACCFG_DEVICE_DISABLED = 0, - ACCFG_DEVICE_ENABLED = 1, - ACCFG_DEVICE_UNKNOWN = -1, + ACCFG_DEVICE_DISABLED = 0, + ACCFG_DEVICE_ENABLED = 1, + ACCFG_DEVICE_UNKNOWN = -1, }; enum accfg_wq_mode { - ACCFG_WQ_SHARED = 0, - ACCFG_WQ_DEDICATED, - ACCFG_WQ_MODE_UNKNOWN, + ACCFG_WQ_SHARED = 0, + ACCFG_WQ_DEDICATED, + ACCFG_WQ_MODE_UNKNOWN, }; enum accfg_wq_state { - ACCFG_WQ_DISABLED = 0, - ACCFG_WQ_ENABLED = 1, - ACCFG_WQ_QUIESCING = 2, - ACCFG_WQ_UNKNOWN = -1, + ACCFG_WQ_DISABLED, + ACCFG_WQ_ENABLED, + ACCFG_WQ_QUIESCING, + ACCFG_WQ_LOCKED, + ACCFG_WQ_UNKNOWN = -1, }; enum accfg_wq_type { - ACCFG_WQT_NONE = 0, - ACCFG_WQT_KERNEL, - ACCFG_WQT_USER, - ACCFG_WQT_MDEV, + ACCFG_WQT_NONE = 0, + ACCFG_WQT_KERNEL, + ACCFG_WQT_USER, + ACCFG_WQT_MDEV, }; enum accfg_control_flag { - ACCFG_DEVICE_DISABLE = 0, - ACCFG_DEVICE_ENABLE, - ACCFG_WQ_ENABLE, - ACCFG_WQ_DISABLE, + ACCFG_DEVICE_DISABLE = 0, + ACCFG_DEVICE_ENABLE, + ACCFG_WQ_ENABLE, + 
ACCFG_WQ_DISABLE, +}; + +enum accfg_mdev_type { + ACCFG_MDEV_TYPE_1_DWQ, + ACCFG_MDEV_TYPE_1_SWQ, + ACCFG_MDEV_TYPE_UNKNOWN, }; /* no need to save device error */ struct accfg_error { - uint64_t val[4]; + uint64_t val[4]; +}; + +struct accfg_op_cap { + uint64_t bits[4]; }; /* parameters read from sysfs of accfg driver */ struct dev_parameters { - unsigned int token_limit; + unsigned int token_limit; }; extern char *accfg_basenames[]; +extern char *accfg_mdev_basenames[]; struct group_parameters { - unsigned int tokens_reserved; - unsigned int tokens_allowed; - unsigned int use_token_limit; - int traffic_class_a; - int traffic_class_b; + unsigned int tokens_reserved; + unsigned int tokens_allowed; + unsigned int use_token_limit; + int traffic_class_a; + int traffic_class_b; }; struct wq_parameters { - int group_id; - unsigned int wq_size; - unsigned int threshold; - unsigned int priority; - int block_on_fault; - const char *mode; - const char *type; - const char *name; - const char *uuid_str; + int group_id; + unsigned int wq_size; + unsigned int threshold; + unsigned int priority; + int block_on_fault; + unsigned int max_batch_size; + uint64_t max_transfer_size; + const char *mode; + const char *type; + const char *name; }; struct engine_parameters { - int group_id; + int group_id; }; struct accfg_ctx; @@ -133,11 +151,11 @@ int accfg_new(struct accfg_ctx **ctx); /* override default log routine */ void accfg_set_log_fn(struct accfg_ctx *ctx, -void (*log_fn) (struct accfg_ctx * ctx, - int priority, const char *file, - int line, const char *fn, - const char *format, - va_list args)); + void (*log_fn)(struct accfg_ctx *ctx, + int priority, const char *file, + int line, const char *fn, + const char *format, + va_list args)); /* libaccfg function for device */ struct accfg_device; @@ -149,17 +167,19 @@ int accfg_device_disable(struct accfg_device *device, bool force); struct accfg_device *accfg_device_get_first(struct accfg_ctx *ctx); struct accfg_device 
*accfg_device_get_next(struct accfg_device *device); #define accfg_device_foreach(ctx, device) \ - for (device = accfg_device_get_first(ctx); \ - device != NULL; \ - device = accfg_device_get_next(device)) + for (device = accfg_device_get_first(ctx); \ + device != NULL; \ + device = accfg_device_get_next(device)) struct accfg_ctx *accfg_device_get_ctx(struct accfg_device *); const char *accfg_device_get_devname(struct accfg_device *device); int accfg_device_type_validate(const char *dev_name); +enum accfg_device_type accfg_device_get_type(struct accfg_device *device); +char *accfg_device_get_type_str(struct accfg_device *device); int accfg_device_get_id(struct accfg_device *device); struct accfg_device *accfg_ctx_device_get_by_id(struct accfg_ctx *ctx, - int id); + int id); struct accfg_device *accfg_ctx_device_get_by_name(struct accfg_ctx *ctx, - const char *dev_name); + const char *dev_name); unsigned int accfg_device_get_max_groups(struct accfg_device *device); unsigned int accfg_device_get_max_work_queues(struct accfg_device *device); unsigned int accfg_device_get_max_engines(struct accfg_device *device); @@ -167,36 +187,55 @@ unsigned int accfg_device_get_max_work_queues_size(struct accfg_device *device); int accfg_device_get_numa_node(struct accfg_device *device); unsigned int accfg_device_get_ims_size(struct accfg_device *device); unsigned int accfg_device_get_max_batch_size(struct accfg_device *device); -unsigned long accfg_device_get_max_transfer_size(struct accfg_device *device); -unsigned long accfg_device_get_op_cap(struct accfg_device *device); -unsigned long accfg_device_get_gen_cap(struct accfg_device *device); +uint64_t accfg_device_get_max_transfer_size(struct accfg_device *device); +int accfg_device_get_op_cap(struct accfg_device *device, + struct accfg_op_cap *op_cap); +uint64_t accfg_device_get_gen_cap(struct accfg_device *device); unsigned int accfg_device_get_configurable(struct accfg_device *device); bool accfg_device_get_pasid_enabled(struct 
accfg_device *device); +bool accfg_device_get_mdev_enabled(struct accfg_device *device); int accfg_device_get_errors(struct accfg_device *device, struct accfg_error *error); enum accfg_device_state accfg_device_get_state(struct accfg_device *device); unsigned int accfg_device_get_max_tokens(struct accfg_device *device); unsigned int accfg_device_get_max_batch_size(struct accfg_device *device); unsigned int accfg_device_get_token_limit(struct accfg_device *device); unsigned int accfg_device_get_cdev_major(struct accfg_device *device); +unsigned int accfg_device_get_version(struct accfg_device *device); int accfg_device_get_clients(struct accfg_device *device); int accfg_device_set_token_limit(struct accfg_device *dev, int val); int accfg_device_is_active(struct accfg_device *device); +int accfg_device_get_cmd_status(struct accfg_device *device); +const char *accfg_device_get_cmd_status_str(struct accfg_device *device); + +struct accfg_device_mdev; +struct accfg_device_mdev *accfg_device_first_mdev(struct accfg_device *device); +struct accfg_device_mdev *accfg_device_next_mdev(struct accfg_device_mdev *mdev); +void accfg_mdev_get_uuid(struct accfg_device_mdev *mdev, uuid_t uuid); +enum accfg_mdev_type accfg_mdev_get_type(struct accfg_device_mdev *mdev); +int accfg_create_mdev(struct accfg_device *device, enum accfg_mdev_type type, + uuid_t uuid); +int accfg_remove_mdev(struct accfg_device *device, uuid_t uuid); + +#define accfg_device_mdev_foreach(device, mdev) \ + for (mdev = accfg_device_first_mdev(device); \ + mdev != NULL; \ + mdev = accfg_device_next_mdev(mdev)) /* libaccfg function for group */ struct accfg_group; struct accfg_group *accfg_group_get_first(struct accfg_device *device); struct accfg_group *accfg_group_get_next(struct accfg_group *group); #define accfg_group_foreach(device, group) \ - for (group = accfg_group_get_first(device); \ - group != NULL; \ - group = accfg_group_get_next(group)) + for (group = accfg_group_get_first(device); \ + group != 
NULL; \ + group = accfg_group_get_next(group)) int accfg_group_get_id(struct accfg_group *group); struct accfg_group *accfg_device_group_get_by_id(struct accfg_device *device, - int id); + int id); int accfg_group_get_device_id(struct accfg_group *group); const char *accfg_group_get_devname(struct accfg_group *group); -unsigned long accfg_group_get_size(struct accfg_group *group); -unsigned long accfg_group_get_available_size(struct accfg_group *group); +uint64_t accfg_group_get_size(struct accfg_group *group); +uint64_t accfg_group_get_available_size(struct accfg_group *group); struct accfg_device *accfg_group_get_device(struct accfg_group *group); struct accfg_ctx *accfg_group_get_ctx(struct accfg_group *group); int accfg_group_get_tokens_reserved(struct accfg_group *group); @@ -214,29 +253,22 @@ int accfg_group_set_traffic_class_b(struct accfg_group *group, int val); struct accfg_wq; struct accfg_wq *accfg_wq_get_first(struct accfg_device *device); struct accfg_wq *accfg_wq_get_next(struct accfg_wq *wq); -uuid_t *accfg_wq_first_uuid(struct accfg_wq *wq); -uuid_t *accfg_wq_next_uuid(struct accfg_wq *wq); #define accfg_wq_foreach(device, wq) \ - for (wq = accfg_wq_get_first(device); \ - wq != NULL; \ - wq = accfg_wq_get_next(wq)) - -#define accfg_wq_uuid_foreach(wq, uuid) \ - for (uuid = accfg_wq_first_uuid(wq); \ - uuid != NULL; \ - uuid = accfg_wq_next_uuid(wq)) + for (wq = accfg_wq_get_first(device); \ + wq != NULL; \ + wq = accfg_wq_get_next(wq)) struct accfg_ctx *accfg_wq_get_ctx(struct accfg_wq *wq); struct accfg_device *accfg_wq_get_device(struct accfg_wq *wq); struct accfg_group *accfg_wq_get_group(struct accfg_wq *wq); int accfg_wq_get_id(struct accfg_wq *wq); struct accfg_wq *accfg_device_wq_get_by_id(struct accfg_device *device, - int id); + int id); const char *accfg_wq_get_devname(struct accfg_wq *wq); enum accfg_wq_mode accfg_wq_get_mode(struct accfg_wq *wq); -unsigned long accfg_wq_get_size(struct accfg_wq *wq); +uint64_t accfg_wq_get_size(struct 
accfg_wq *wq); int accfg_wq_get_group_id(struct accfg_wq *wq); int accfg_wq_get_priority(struct accfg_wq *wq); unsigned int accfg_wq_get_priv(struct accfg_wq *wq); @@ -245,6 +277,8 @@ enum accfg_wq_state accfg_wq_get_state(struct accfg_wq *wq); int accfg_wq_get_cdev_minor(struct accfg_wq *wq); const char *accfg_wq_get_type_name(struct accfg_wq *wq); enum accfg_wq_type accfg_wq_get_type(struct accfg_wq *wq); +unsigned int accfg_wq_get_max_batch_size(struct accfg_wq *wq); +uint64_t accfg_wq_get_max_transfer_size(struct accfg_wq *wq); int accfg_wq_get_threshold(struct accfg_wq *wq); int accfg_wq_get_clients(struct accfg_wq *wq); int accfg_wq_is_enabled(struct accfg_wq *wq); @@ -253,35 +287,36 @@ int accfg_wq_set_priority(struct accfg_wq *wq, int val); int accfg_wq_set_group_id(struct accfg_wq *wq, int val); int accfg_wq_set_threshold(struct accfg_wq *wq, int val); int accfg_wq_set_block_on_fault(struct accfg_wq *wq, int val); -int accfg_wq_set_str_mode(struct accfg_wq *wq, const char* val); +int accfg_wq_set_max_batch_size(struct accfg_wq *wq, int val); +int accfg_wq_set_max_transfer_size(struct accfg_wq *wq, uint64_t val); +int accfg_wq_set_str_mode(struct accfg_wq *wq, const char *val); int accfg_wq_set_mode(struct accfg_wq *wq, enum accfg_wq_mode mode); -int accfg_wq_set_str_type(struct accfg_wq *wq, const char* val); +int accfg_wq_set_str_type(struct accfg_wq *wq, const char *val); int accfg_wq_set_str_name(struct accfg_wq *wq, const char *val); int accfg_wq_enable(struct accfg_wq *wq); int accfg_wq_disable(struct accfg_wq *wq, bool force); int accfg_wq_priority_boundary(struct accfg_wq *wq); int accfg_wq_size_boundary(struct accfg_device *device, int wq_num); -int accfg_wq_create_mdev(struct accfg_wq *wq, uuid_t uuid); -int accfg_wq_remove_mdev(struct accfg_wq *wq, uuid_t uuid); +int accfg_wq_get_user_dev_path(struct accfg_wq *wq, char *buf, size_t size); /* libaccfg function for engine */ struct accfg_engine; struct accfg_engine *accfg_engine_get_first(struct 
accfg_device *device); struct accfg_engine *accfg_engine_get_next(struct accfg_engine *engine); #define accfg_engine_foreach(device, engine) \ - for (engine = accfg_engine_get_first(device); \ - engine != NULL; \ - engine = accfg_engine_get_next(engine)) + for (engine = accfg_engine_get_first(device); \ + engine != NULL; \ + engine = accfg_engine_get_next(engine)) struct accfg_ctx *accfg_engine_get_ctx(struct accfg_engine *engine); struct accfg_device *accfg_engine_get_device(struct accfg_engine *engine); struct accfg_group *accfg_engine_get_group(struct accfg_engine *engine); int accfg_engine_get_group_id(struct accfg_engine *engine); int accfg_engine_get_id(struct accfg_engine *engine); struct accfg_engine *accfg_device_engine_get_by_id(struct accfg_device *device, - int id); + int id); const char *accfg_engine_get_devname(struct accfg_engine *engine); int accfg_engine_set_group_id(struct accfg_engine *engine, int val); #ifdef __cplusplus } /* extern "C" */ #endif -#endif +#endif \ No newline at end of file diff --git a/sources/core/src/hw_dispatcher/legacy_headers/own_dsa_accel_constants.h b/sources/core/src/hw_dispatcher/legacy_headers/own_dsa_accel_constants.h new file mode 100644 index 0000000..1c037f0 --- /dev/null +++ b/sources/core/src/hw_dispatcher/legacy_headers/own_dsa_accel_constants.h @@ -0,0 +1,59 @@ +/* + * Copyright 2020-2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +/** + * @brief Contains constants that are used to connect to the hardware + * @date 3/23/2020 + * + */ + +#include + +#ifndef DML_DSA_ACCEL_CONFIG_H__ +#define DML_DSA_ACCEL_CONFIG_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#define DSA_DEVICE_ID ((uint32_t)(((uint32_t)0xFF << 24u) | ((uint32_t)('a') << 16u) | ((uint32_t)('s') << 8u) | (uint32_t)('d'))) + +#define CHAR_MSK 0xFF202020 +#define MAX_DEVICE_COUNT 8u +#define MAX_WORK_QUEUE_COUNT 8u +#define OWN_PAGE_MASK 0x0FFFllu /**< Defines page mask for portal incrementing */ + +// General Capabilities Register unwrappers +#define GC_BLOCK_ON_FAULT(GENCAP) (((GENCAP)) &0x01) /**< GENCAP bit 0 - block on fault support */ +#define GC_OVERLAPPING(GENCAP) (((GENCAP)>>1) &0x01) /**< GENCAP bit 1 - overlapping copy support */ +#define GC_CACHE_WRITE(GENCAP) (((GENCAP)>>2) &0x01) /**< GENCAP bit 2 - cache control support (memory) */ +#define GC_CACHE_FLUSH(GENCAP) (((GENCAP)>>3) &0x01) /**< GENCAP bit 3 - cache control support (cache flush) */ +#define GC_COM_CAP(GENCAP) (((GENCAP)>>4) &0x01) /**< GENCAP bit 4 - command capabilities support */ +#define GC_DST_READBACK(GENCAP) (((GENCAP)>>8) &0x01) /**< GENCAP bit 8 - destination readback support */ +#define GC_DRAIN_READBACK(GENCAP) (((GENCAP)>>9) &0x01) /**< GENCAP bit 9 - drain descriptor readback address support */ +#define GC_MAX_TRANSFER_SIZE(GENCAP) (1u << (((GENCAP)>>16)&0x1F)) /**< GENCAP 20-16 bits - maximum supported transfer size (unsigned one: the 5-bit field may be 31, and 1 << 31 overflows int) */ +#define GC_MAX_BATCH_SIZE(GENCAP) (1u << (((GENCAP)>>21)&0x0F)) /**< GENCAP 24-21 bits - maximum supported batch size */ +#define GC_INTERRUPT_STORAGE(GENCAP) ((((GENCAP)>>25)&0x3F)*256u) /**< GENCAP 30-25 bits - interrupt message storage size */ +#define GC_CONF_SUPPORT(GENCAP) (((GENCAP)>>31)&0x01) /**< GENCAP bit 31 - configuration support */ + +static const char DEVICE_NAME[] = "dsa"; +static const uint32_t DEVICE_NAME_LENGTH = sizeof(DEVICE_NAME) - 2u; //sizeof returns 4 (terminating 0 at index 3), so this evaluates to 2 - NOTE(review): confirm 2, not strlen("dsa") == 3, is what callers expect +
+#ifdef __cplusplus +} +#endif + +#endif //DML_DSA_ACCEL_CONFIG_H__ diff --git a/include/dml/cpp/middle_layer/descriptor.hpp b/sources/core/src/hw_dispatcher/numa.cpp similarity index 59% rename from include/dml/cpp/middle_layer/descriptor.hpp rename to sources/core/src/hw_dispatcher/numa.cpp index 28ef3fb..f686429 100644 --- a/include/dml/cpp/middle_layer/descriptor.hpp +++ b/sources/core/src/hw_dispatcher/numa.cpp @@ -14,26 +14,27 @@ * */ -/** - * @date 05/19/2021 - * @defgroup dmlml DML Middle Layer - * @brief Middle Layer for Intel(R) Data Mover Library (Intel® DML) - */ - -#ifndef DML_ML_DESCRIPTOR_HPP -#define DML_ML_DESCRIPTOR_HPP +#if defined(__linux__) +#include +#endif -#include "types.hpp" +#include "numa.hpp" -namespace dml::ml +namespace dml::core::util { - /** - * @todo - */ - struct alignas(64u) descriptor + uint32_t get_numa_id() noexcept { - byte_t bytes[64u]{}; /**< Underlying data array */ - }; -} // namespace dml::ml +#if defined(__linux__) + uint32_t tsc_aux = 0; + + __rdtscp(&tsc_aux); + + // Linux encodes NUMA node into bits [31:12] of the 32-bit TSC_AUX value + return tsc_aux >> 12; +#else + // Not supported on non-Linux platforms (e.g. Windows) yet, always report node 0 + return 0; +#endif + } -#endif //DML_ML_DESCRIPTOR_HPP +} // namespace dml::core::util diff --git a/sources/middle_layer/dispatcher/numa.hpp b/sources/core/src/hw_dispatcher/numa.hpp similarity index 85% rename from sources/middle_layer/dispatcher/numa.hpp rename to sources/core/src/hw_dispatcher/numa.hpp index 2072573..93bf73c 100644 --- a/sources/middle_layer/dispatcher/numa.hpp +++ b/sources/core/src/hw_dispatcher/numa.hpp @@ -19,10 +19,9 @@ #include -namespace dml::ml::util { - -int32_t get_numa_id() noexcept; - +namespace dml::core::util +{ + [[nodiscard]] uint32_t get_numa_id() noexcept; } -#endif //DML_MIDDLE_LAYER_DISPATCHER_NUMA_HPP_ +#endif //DML_MIDDLE_LAYER_DISPATCHER_NUMA_HPP_ diff --git a/sources/core/src/kernels.hpp b/sources/core/src/kernels.hpp new file mode 100644 index 0000000..fa776f5 --- /dev/null +++
b/sources/core/src/kernels.hpp @@ -0,0 +1,60 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#ifndef DML_CORE_OWN_KERNELS_HPP +#define DML_CORE_OWN_KERNELS_HPP + +#include +#include + +namespace dml::core::kernels +{ + void nop(nop_descriptor dsc, nop_completion_record record) noexcept; + + void batch(batch_descriptor dsc, batch_completion_record record) noexcept; + + void drain(drain_descriptor dsc, drain_completion_record record) noexcept; + + void mem_move(mem_move_descriptor dsc, mem_move_completion_record record) noexcept; + + void fill(fill_descriptor dsc, fill_completion_record record) noexcept; + + void compare(compare_descriptor dsc, compare_completion_record record) noexcept; + + void compare_pattern(compare_pattern_descriptor dsc, compare_pattern_completion_record record) noexcept; + + void create_delta(create_delta_descriptor dsc, create_delta_completion_record record) noexcept; + + void apply_delta(apply_delta_descriptor dsc, apply_delta_completion_record record) noexcept; + + void dualcast(dualcast_descriptor dsc, dualcast_completion_record record) noexcept; + + void crc(crc_descriptor dsc, crc_completion_record record) noexcept; + + void copy_crc(copy_crc_descriptor dsc, crc_completion_record record) noexcept; + + void dif_check(dif_check_descriptor dsc, dif_check_completion_record record) noexcept; + + void dif_insert(dif_insert_descriptor dsc, dif_insert_completion_record 
record) noexcept; + + void dif_strip(dif_strip_descriptor dsc, dif_strip_completion_record record) noexcept; + + void dif_update(dif_update_descriptor dsc, dif_update_completion_record record) noexcept; + + void cache_flush(cache_flush_descriptor dsc, cache_flush_completion_record record) noexcept; +} // namespace dml::core::kernels + +#endif //DML_CORE_OWN_KERNELS_HPP diff --git a/sources/core/src/mem_move.cpp b/sources/core/src/mem_move.cpp new file mode 100644 index 0000000..913f10e --- /dev/null +++ b/sources/core/src/mem_move.cpp @@ -0,0 +1,37 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#include +#include + +#include "immintrin.h" +#include "kernels.hpp" +#include "optimization_dispatcher.hpp" + +namespace dml::core::kernels +{ + void mem_move(mem_move_descriptor dsc, mem_move_completion_record record) noexcept + { + const auto src = reinterpret_cast(dsc.source_address()); + const auto dst = reinterpret_cast(dsc.destination_address()); + const auto transfer_size = dsc.transfer_size(); + + dispatch::mem_move(src, dst, transfer_size); + + _mm_mfence(); + record.status() = to_underlying(dml::detail::execution_status::success); + } +} // namespace dml::core::kernels diff --git a/sources/core/src/nop.cpp b/sources/core/src/nop.cpp new file mode 100644 index 0000000..f166419 --- /dev/null +++ b/sources/core/src/nop.cpp @@ -0,0 +1,29 @@ +/* + * Copyright 2021 Intel Corporation. 
+ * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#include + +#include "kernels.hpp" + +namespace dml::core::kernels +{ + void nop(nop_descriptor dsc, nop_completion_record record) noexcept + { + static_cast(dsc); + record.status() = static_cast(dml::detail::execution_status::success); + } + +} // namespace dml::core::kernels diff --git a/sources/core/src/software_device.cpp b/sources/core/src/software_device.cpp new file mode 100644 index 0000000..7614986 --- /dev/null +++ b/sources/core/src/software_device.cpp @@ -0,0 +1,91 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#include +#include +#include +#include + +#include "core/device.hpp" +#include "kernels.hpp" + +namespace dml::core +{ + dml::detail::submission_status software_device::submit(descriptor& dsc, completion_record& completion_record) noexcept + { + auto dsc_view = any_descriptor(dsc); + auto op = operation(dsc_view.operation()); + + switch (op) + { + case operation::nop: + kernels::nop(nop_descriptor(dsc), nop_completion_record(completion_record)); + break; + case operation::batch: + kernels::batch(batch_descriptor(dsc), batch_completion_record(completion_record)); + break; + case operation::drain: + kernels::drain(drain_descriptor(dsc), drain_completion_record(completion_record)); + break; + case operation::memory_move: + kernels::mem_move(mem_move_descriptor(dsc), mem_move_completion_record(completion_record)); + break; + case operation::fill: + kernels::fill(fill_descriptor(dsc), fill_completion_record(completion_record)); + break; + case operation::compare: + kernels::compare(compare_descriptor(dsc), compare_completion_record(completion_record)); + break; + case operation::compare_pattern: + kernels::compare_pattern(compare_pattern_descriptor(dsc), compare_pattern_completion_record(completion_record)); + break; + case operation::create_delta: + kernels::create_delta(create_delta_descriptor(dsc), create_delta_completion_record(completion_record)); + break; + case operation::apply_delta: + kernels::apply_delta(apply_delta_descriptor(dsc), apply_delta_completion_record(completion_record)); + break; + case operation::dualcast: + kernels::dualcast(dualcast_descriptor(dsc), dualcast_completion_record(completion_record)); + break; + case operation::crc: + kernels::crc(crc_descriptor(dsc), crc_completion_record(completion_record)); + break; + case operation::copy_crc: + kernels::copy_crc(copy_crc_descriptor(dsc), crc_completion_record(completion_record)); + break; + case operation::dif_check: + kernels::dif_check(dif_check_descriptor(dsc), 
dif_check_completion_record(completion_record)); + break; + case operation::dif_insert: + kernels::dif_insert(dif_insert_descriptor(dsc), dif_insert_completion_record(completion_record)); + break; + case operation::dif_strip: + kernels::dif_strip(dif_strip_descriptor(dsc), dif_strip_completion_record(completion_record)); + break; + case operation::dif_update: + kernels::dif_update(dif_update_descriptor(dsc), dif_update_completion_record(completion_record)); + break; + case operation::cache_flush: + kernels::cache_flush(cache_flush_descriptor(dsc), cache_flush_completion_record(completion_record)); + break; + default: + return dml::detail::submission_status::failure; + } + + return dml::detail::submission_status::success; + } +} // namespace dml::core diff --git a/sources/core/src/sw_dispatcher/CMakeLists.txt b/sources/core/src/sw_dispatcher/CMakeLists.txt new file mode 100644 index 0000000..5e0f8de --- /dev/null +++ b/sources/core/src/sw_dispatcher/CMakeLists.txt @@ -0,0 +1,48 @@ +# +# Copyright 2021 Intel Corporation. +# +# This software and the related documents are Intel copyrighted materials, +# and your use of them is governed by the express license under which they +# were provided to you ("License"). Unless the License provides otherwise, +# you may not use, modify, copy, publish, distribute, disclose or transmit +# this software or the related documents without Intel's prior written +# permission. +# +# This software and the related documents are provided as is, with no +# express or implied warranties, other than those that are expressly +# stated in the License. 
+# + +add_library(dml_sw_dispatcher OBJECT + dml_cpuid.h + dml_kernels.h + optimization_dispatcher.hpp + + optimization_dispatcher.cpp + dml_cpuid.c + ) + +target_include_directories(dml_sw_dispatcher + PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} + ) +target_compile_features(dml_sw_dispatcher + PUBLIC cxx_std_17 + PUBLIC c_std_11 + ) +target_sources(dml_sw_dispatcher + PUBLIC $ + PUBLIC $ + PUBLIC $ + ) +target_compile_options(dml_sw_dispatcher + PRIVATE ${DML_QUALITY_OPTIONS} + PRIVATE $<$:${DML_CPP_PRIVATE_OPTIONS}> + ) + +if (DML_ARCH STREQUAL avx512) + target_compile_definitions(dml_sw_dispatcher PRIVATE DML_AVX512) +endif () + +add_subdirectory(ref) +add_subdirectory(avx512) +add_subdirectory(cache_flush) diff --git a/sources/core/src/sw_dispatcher/avx512/CMakeLists.txt b/sources/core/src/sw_dispatcher/avx512/CMakeLists.txt new file mode 100644 index 0000000..2739e92 --- /dev/null +++ b/sources/core/src/sw_dispatcher/avx512/CMakeLists.txt @@ -0,0 +1,34 @@ +# +# Copyright 2021 Intel Corporation. +# +# This software and the related documents are Intel copyrighted materials, +# and your use of them is governed by the express license under which they +# were provided to you ("License"). Unless the License provides otherwise, +# you may not use, modify, copy, publish, distribute, disclose or transmit +# this software or the related documents without Intel's prior written +# permission. +# +# This software and the related documents are provided as is, with no +# express or implied warranties, other than those that are expressly +# stated in the License. 
+# + +add_library(dml_kernels_avx512 OBJECT + mem_move.c + fill.c + compare.c + compare_pattern.c + crc.c + ) + +target_compile_features(dml_kernels_avx512 PRIVATE c_std_11) + +target_compile_options(dml_kernels_avx512 PRIVATE ${DML_QUALITY_OPTIONS}) + +if (CMAKE_C_COMPILER_ID MATCHES GNU) + target_compile_options(dml_kernels_avx512 PRIVATE -march=skylake-avx512) +endif () + +if (CMAKE_C_COMPILER_ID MATCHES MSVC) + target_compile_options(dml_kernels_avx512 PRIVATE /arch:AVX512) +endif () diff --git a/sources/core/src/sw_dispatcher/avx512/compare.c b/sources/core/src/sw_dispatcher/avx512/compare.c new file mode 100644 index 0000000..70fe8d0 --- /dev/null +++ b/sources/core/src/sw_dispatcher/avx512/compare.c @@ -0,0 +1,59 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ *
+ */
+
+#include "../dml_kernels.h"
+
+#if defined(_MSC_BUILD)
+#include <intrin.h>
+#elif defined(__GNUC__)
+#include <x86intrin.h>
+#else
+#error "Unsupported compiler"
+#endif
+
+/**
+ * @brief Compares two memory regions byte by byte, 64 bytes per AVX-512 vector.
+ *
+ * @param[in]  src1           first region to compare
+ * @param[in]  src2           second region to compare
+ * @param[in]  transfer_size  number of bytes to compare
+ * @param[out] result         set to 0x0 when the regions are equal, to 0x1 otherwise
+ *
+ * @return offset of the first mismatching byte, or 0 when the regions are equal.
+ *         A mismatch in the very first byte also returns 0, so callers must use
+ *         @p result to tell the two cases apart.
+ */
+uint32_t dml_avx512_compare(const uint8_t* src1, const uint8_t* src2, uint32_t transfer_size, uint8_t* result)
+{
+    const uint8_t  equal       = 0x0;
+    const uint8_t  not_equal   = 0x1;
+    const uint32_t vector_size = 64u;
+
+    // Number of bytes covered by whole vectors. Bounding the loop by this value
+    // (instead of testing i + 64 <= transfer_size) keeps the 32-bit index from
+    // wrapping around when transfer_size is within 64 bytes of UINT32_MAX.
+    const uint32_t body_size = transfer_size & ~(vector_size - 1u);
+
+    uint32_t  i;
+    __mmask64 msk64 = (__mmask64)0;
+    for (i = 0u; i < body_size; i += vector_size)
+    {
+        msk64 =
+            _mm512_cmp_epi8_mask(_mm512_loadu_si512((void const*)(src1 + i)), _mm512_loadu_si512((void const*)(src2 + i)), _MM_CMPINT_NE);
+        if (msk64)
+        {
+            // Lowest set bit of the mask is the first differing byte of this vector
+            *result = not_equal;
+            return i + (uint32_t)_tzcnt_u64((uint64_t)msk64);
+        }
+    }
+    {
+        // Remaining 0..63 bytes: load only the bytes selected by the mask, so nothing
+        // past the end of the regions is compared (an empty mask selects no bytes)
+        uint64_t tail = transfer_size & 63;
+        msk64         = ((uint64_t)1 << tail) - (uint64_t)1;
+        msk64         = _mm512_cmp_epi8_mask(_mm512_maskz_loadu_epi8(msk64, (void const*)(src1 + i)),
+                                     _mm512_maskz_loadu_epi8(msk64, (void const*)(src2 + i)),
+                                     _MM_CMPINT_NE);
+        if (msk64)
+        {
+            *result = not_equal;
+            return i + (uint32_t)_tzcnt_u64((uint64_t)msk64);
+        }
+    }
+
+    *result = equal;
+    return 0;
+}
\ No newline at end of file
diff --git a/sources/core/src/sw_dispatcher/avx512/compare_pattern.c b/sources/core/src/sw_dispatcher/avx512/compare_pattern.c
new file mode 100644
index 0000000..14c9bc5
--- /dev/null
+++ b/sources/core/src/sw_dispatcher/avx512/compare_pattern.c
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2021 Intel Corporation.
+ *
+ * This software and the related documents are Intel copyrighted materials,
+ * and your use of them is governed by the express license under which they
+ * were provided to you ("License"). Unless the License provides otherwise,
+ * you may not use, modify, copy, publish, distribute, disclose or transmit
+ * this software or the related documents without Intel's prior written
+ * permission.
+ *
+ * This software and the related documents are provided as is, with no
+ * express or implied warranties, other than those that are expressly
+ * stated in the License.
+ *
+ */
+
+#include "../dml_kernels.h"
+
+#if defined(_MSC_BUILD)
+#include <intrin.h>
+#elif defined(__GNUC__)
+#include <x86intrin.h>
+#else
+#error "Unsupported compiler"
+#endif
+
+/**
+ * @brief Checks that a memory region consists of a repeated 64-bit pattern.
+ *
+ * The region is treated as a sequence of 8-byte chunks followed by up to 7 tail bytes.
+ * Chunks are compared eight at a time with AVX-512; tail bytes are compared one by one
+ * against the low-order bytes of @p pattern.
+ *
+ * @param[in]  pattern        64-bit pattern the region is expected to repeat
+ * @param[in]  src            region to check
+ * @param[in]  transfer_size  size of the region in bytes
+ * @param[out] result         set to 0x0 when the region matches, to 0x1 otherwise
+ *
+ * @return 0 when the region matches. On a mismatch inside a whole chunk, the byte offset
+ *         of the start of the first mismatching 8-byte chunk; on a mismatch inside the
+ *         tail, the offset of the mismatching byte itself. A mismatch in the first chunk
+ *         also returns 0, so callers must use @p result to tell the cases apart.
+ */
+uint32_t dml_avx512_compare_pattern(uint64_t pattern, const uint8_t* src, uint32_t transfer_size, uint8_t* result)
+{
+    const uint8_t equal     = 0x0;
+    const uint8_t not_equal = 0x1;
+
+    const uint32_t        pattern_chunk_count = transfer_size >> 3;
+    const uint64_t        tail_bytes_count    = transfer_size & 7;
+    const uint64_t* const pattern_region_ptr  = (const uint64_t*)src;
+
+    __m512i  x_pattern = _mm512_set1_epi64(pattern);
+    uint32_t i;
+
+    // Whole vectors: eight 8-byte chunks per iteration
+    for (i = 0u; (i + 8) <= pattern_chunk_count; i += 8)
+    {
+        __mmask8 msk8 = _mm512_cmp_epi64_mask(_mm512_loadu_si512((void const*)(pattern_region_ptr + i)), x_pattern, _MM_CMPINT_NE);
+        if (msk8)
+        {
+            // Chunk index -> byte offset
+            *result = not_equal;
+            return (i + (uint32_t)_tzcnt_u32((uint32_t)msk8)) << 3u;
+        }
+    }
+    {
+        // Remaining 1..7 whole chunks: masked load and masked compare
+        uint64_t tail = pattern_chunk_count & 7;
+        if (tail)
+        {
+            __mmask8 msk8 = (__mmask8)((1 << tail) - 1);
+            msk8          = _mm512_mask_cmp_epi64_mask(msk8,
+                                              _mm512_maskz_loadu_epi64(msk8, (void const*)(pattern_region_ptr + i)),
+                                              x_pattern,
+                                              _MM_CMPINT_NE);
+            if (msk8)
+            {
+                *result = not_equal;
+                return (i + (uint32_t)_tzcnt_u32((uint32_t)msk8)) << 3u;
+            }
+        }
+    }
+    if (tail_bytes_count)
+    {
+        src += transfer_size - tail_bytes_count;
+        uint64_t byte_pattern = pattern;
+        // Compare tail: consume the pattern one byte at a time, lowest byte first
+        for (i = 0; i < tail_bytes_count; i++)
+        {
+            if (src[i] != (uint8_t)byte_pattern)
+            {
+                *result = not_equal;
+                return (pattern_chunk_count << 3) + i;
+            }
+            const size_t byte_width = 8;
+            byte_pattern >>= byte_width;
+        }
+    }
+
+    *result = equal;
+    return 0;
+}
diff --git a/sources/core/src/sw_dispatcher/avx512/crc.c b/sources/core/src/sw_dispatcher/avx512/crc.c
new file mode 100644
index 0000000..f9ac9b3
--- /dev/null
+++ b/sources/core/src/sw_dispatcher/avx512/crc.c
@@ -0,0 +1,628 @@
+/*
+ * Copyright 2021 Intel Corporation.
+ * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#include "../dml_kernels.h" + +#if defined(_MSC_BUILD) +#include +#elif defined(__GNUC__) +#include +#else +#error "Unsupported compiler" +#endif + +#define DML_DISABLE_OPTIMIZATION_ + +/** +* @todo +*/ +static inline uint32_t getCRCSize(uint64_t poly) +{ + uint32_t crcSize; + crcSize = 63 - (uint32_t)_lzcnt_u64(poly); + return crcSize; +} + +/** +* @todo +*/ + +static void own_CRC_8u_opt_k0(const uint8_t* src_ptr, uint32_t init_crc, int len0, int crc_size, uint32_t* koeff_ptr, uint32_t* crc_ptr); + +/** +* @todo +*/ +static void own_CRC_8u_k0(const uint8_t* src_ptr, uint32_t len, uint64_t poly, const uint8_t optPoly[128], uint32_t init, uint32_t* crc_ptr) +{ + int crc_size = getCRCSize(poly); + own_CRC_8u_opt_k0(src_ptr, init, len, crc_size, (uint32_t*)optPoly, crc_ptr); +} + +/** +* @todo +*/ +static void poly1x64_32_div(uint64_t poly, uint64_t* quit_ptr, uint32_t* tail_ptr) +{ + int i, j; + uint64_t tail = 0; + uint64_t quot = 0; + uint8_t byte = 0x01; + for (i = 0; i < 9; i++) + { + uint8_t bit; + uint64_t hbit; + for (j = 0; j < 8; j++) + { + bit = (byte & 0x80) >> 7; + byte <<= 1; + hbit = tail & 0x80000000; + tail <<= 1; + tail |= bit; + quot <<= 1; + if (hbit) + { + tail = tail ^ poly; + quot |= 1; + } + tail = tail & 0xffffffff; + } + } + *quit_ptr = quot; + *tail_ptr = (uint32_t)tail; + return; +} + +/** +* @todo +*/ +static inline void own_gen_crc_opt_poly_8u(uint64_t poly, 
uint8_t optPoly[128]) +{ + uint64_t u; + uint32_t i, k, crc_size; + uint32_t t; + + crc_size = getCRCSize(poly); + uint32_t* opt = (uint32_t*)optPoly; + *(uint64_t*)opt = poly; /*copy poly*/ + uint64_t poly32 = poly << (32 - crc_size); + poly1x64_32_div(poly32, &u, &t); /*for 1^64 and U*/ + *(uint64_t*)(opt + 2) = u; + int bits[] = { 64, 96, 160, 224, 288, 352, 416, 480, 544, 608, 672, 736, 800, 864, 928, 992, 1056, 2016, 2080 }; + uint32_t tail = 0; + uint32_t poly_32 = (uint32_t)poly; + int j; + + k = bits[0] + 8; + tail = poly_32; + for (j = 40; (uint32_t)j < k; j++) + { + uint32_t mask; + mask = (tail & 0x80000000) ? poly_32 : 0; + tail += tail; + tail ^= mask; + } + opt[4 + 0] = (uint32_t)tail; + + for (i = 1; i < ((sizeof(bits) / sizeof(bits[0])) - 2); i++) + { + k = bits[i] + 8; + for (; (uint32_t)j < k; j++) + { + uint32_t mask; + mask = (tail & 0x80000000) ? poly_32 : 0; + tail += tail; + tail ^= mask; + } + opt[4 + i] = (uint32_t)tail; + } +} + +static inline void dmlc_own_calculate_crc_32u(const uint8_t* const memory_region_ptr, + uint32_t bytes_to_hash, + uint32_t* const crc_ptr, + uint32_t polynomial) +{ + uint64_t poly = (uint64_t)polynomial | ((uint64_t)1u << (uint64_t)32u); + uint8_t opt_poly_ptr[128]; + + own_gen_crc_opt_poly_8u(poly, opt_poly_ptr); + own_CRC_8u_k0(memory_region_ptr, bytes_to_hash, poly, opt_poly_ptr, *crc_ptr, crc_ptr); +} + +#if defined(_MSC_VER) +#pragma optimize("", off) +#pragma optimize("O3", on) +#endif + +/** +* @todo +*/ +#define _MM_XOR_PS(A, B) _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(A), _mm_castsi128_ps(B))) +/** +* @todo +*/ +#define arg1_low32 ecx +/** +* @todo +*/ +static void own_CRC_8u_opt_k0(const uint8_t* src_ptr, uint32_t init_crc, int len0, int crc_size, uint32_t* koeff, uint32_t* crc_ptr) +{ + uint64_t pshufb_shf_table[] = { 0x8786858483828100, 0x8f8e8d8c8b8a8988, 0x0706050403020100, 0x000e0d0c0b0a0908 }; + + int len = len0; + uint8_t ttt[128]; + uint8_t* r11 = ttt; + uint8_t* ptr; + + __m128i xmm0, xmm1, 
xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm12, xmm13; +#ifndef DML_DISABLE_OPTIMIZATION_ + __m128i xmm11; +#endif // DML_DISABLE_OPTIMIZATION_ + + int eax, ecx, r9; + __m128i ENDIA_SHUF_MASK = _mm_set_epi8(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); + __m128i mask2 = _mm_set_epi64x(0x00000000FFFFFFFF, 0xFFFFFFFFFFFFFFFF); + __m128i mask1 = _mm_set_epi64x(0x8080808080808080, 0x8080808080808080); + + uint64_t q = *(uint64_t*)(koeff + 0); + q <<= (32 - crc_size); + uint64_t u = *(uint64_t*)(koeff + 2); + uint64_t k_64 = ((uint64_t)koeff[4]) << 32; + uint64_t k_96 = ((uint64_t)koeff[5]) << 32; + uint64_t k_160 = ((uint64_t)koeff[6]) << 32; + uint64_t k_224 = ((uint64_t)koeff[7]) << 32; + uint64_t k_288 = ((uint64_t)koeff[8]) << 32; + uint64_t k_352 = ((uint64_t)koeff[9]) << 32; + uint64_t k_416 = ((uint64_t)koeff[10]) << 32; + uint64_t k_480 = ((uint64_t)koeff[11]) << 32; + uint64_t k_544 = ((uint64_t)koeff[12]) << 32; + uint64_t k_608 = ((uint64_t)koeff[13]) << 32; + uint64_t k_672 = ((uint64_t)koeff[14]) << 32; + uint64_t k_736 = ((uint64_t)koeff[15]) << 32; + uint64_t k_800 = ((uint64_t)koeff[16]) << 32; + uint64_t k_864 = ((uint64_t)koeff[17]) << 32; + uint64_t k_928 = ((uint64_t)koeff[18]) << 32; + uint64_t k_992 = ((uint64_t)koeff[19]) << 32; + uint64_t k_1056 = ((uint64_t)koeff[20]) << 32; + + ecx = init_crc; + //crc16_t10dif_01: + ecx = ecx << (32 - crc_size); +#ifndef DML_DISABLE_OPTIMIZATION_ + if (len < 256) + { + goto _less_than_256; + } +#endif // DML_DISABLE_OPTIMIZATION_ + //; load the initial crc value + xmm10 = _mm_cvtsi32_si128(arg1_low32); //movd xmm10, arg1_low32; initial crc + //; crc value does not need to be byte - reflected, but it needs to be moved to the high part of the register. + //; because data will be byte - reflected and will align with initial crc at correct place. 
+ xmm10 = _mm_slli_si128(xmm10, 12); + //; receive the initial 128B data, xor the initial crc value + xmm0 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 0)); + xmm1 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 1)); + xmm2 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 2)); + xmm3 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 3)); + xmm4 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 4)); + xmm5 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 5)); + xmm6 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 6)); + xmm7 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 7)); + xmm0 = _mm_shuffle_epi8(xmm0, ENDIA_SHUF_MASK); + //; XOR the initial_crc value + xmm0 = _mm_xor_si128(xmm0, xmm10); + xmm1 = _mm_shuffle_epi8(xmm1, ENDIA_SHUF_MASK); + xmm2 = _mm_shuffle_epi8(xmm2, ENDIA_SHUF_MASK); + xmm3 = _mm_shuffle_epi8(xmm3, ENDIA_SHUF_MASK); + xmm4 = _mm_shuffle_epi8(xmm4, ENDIA_SHUF_MASK); + xmm5 = _mm_shuffle_epi8(xmm5, ENDIA_SHUF_MASK); + xmm6 = _mm_shuffle_epi8(xmm6, ENDIA_SHUF_MASK); + xmm7 = _mm_shuffle_epi8(xmm7, ENDIA_SHUF_MASK); + xmm10 = _mm_set_epi64x(k_1056 /*rk4*/, k_992 /*rk3*/); + //; imm value of pclmulqdq instruction will determine which constant to use + //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + //; we subtract 256 instead of 128 to save one instruction from the loop + len -= 256; +// +//; at this section of the code, there is 128 * x + y(0 <= y < 128) bytes of buffer.The _fold_128_B_loop +//; loop will fold 128B at a time until we have 128 + y Bytes of buffer +// +// +//; fold 128B at a time.This section of the code folds 8 xmm registers in parallel +_fold_128_B_loop: + // + //; update the buffer pointer + src_ptr += 128; + xmm9 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 0)); + xmm12 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 1)); + xmm9 = _mm_shuffle_epi8(xmm9, ENDIA_SHUF_MASK); + xmm12 = _mm_shuffle_epi8(xmm12, ENDIA_SHUF_MASK); + xmm8 = xmm0; + xmm13 = xmm1; + xmm0 = 
_mm_clmulepi64_si128(xmm0, xmm10, 0x0); + xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x11); + xmm1 = _mm_clmulepi64_si128(xmm1, xmm10, 0x0); + xmm13 = _mm_clmulepi64_si128(xmm13, xmm10, 0x11); + xmm0 = _mm_xor_si128(xmm0, xmm9); + xmm0 = _MM_XOR_PS(xmm0, xmm8); + xmm1 = _mm_xor_si128(xmm1, xmm12); + xmm1 = _MM_XOR_PS(xmm1, xmm13); + xmm9 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 2)); + xmm12 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 3)); + xmm9 = _mm_shuffle_epi8(xmm9, ENDIA_SHUF_MASK); + xmm12 = _mm_shuffle_epi8(xmm12, ENDIA_SHUF_MASK); + xmm8 = xmm2; + xmm13 = xmm3; + xmm2 = _mm_clmulepi64_si128(xmm2, xmm10, 0x0); + xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x11); + xmm3 = _mm_clmulepi64_si128(xmm3, xmm10, 0x0); + xmm13 = _mm_clmulepi64_si128(xmm13, xmm10, 0x11); + xmm2 = _mm_xor_si128(xmm2, xmm9); + xmm2 = _MM_XOR_PS(xmm2, xmm8); + xmm3 = _mm_xor_si128(xmm3, xmm12); + xmm3 = _MM_XOR_PS(xmm3, xmm13); + + xmm9 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 4)); + xmm12 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 5)); + xmm9 = _mm_shuffle_epi8(xmm9, ENDIA_SHUF_MASK); + xmm12 = _mm_shuffle_epi8(xmm12, ENDIA_SHUF_MASK); + xmm8 = xmm4; + xmm13 = xmm5; + xmm4 = _mm_clmulepi64_si128(xmm4, xmm10, 0x0); + xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x11); + xmm5 = _mm_clmulepi64_si128(xmm5, xmm10, 0x0); + xmm13 = _mm_clmulepi64_si128(xmm13, xmm10, 0x11); + xmm4 = _mm_xor_si128(xmm4, xmm9); + xmm4 = _MM_XOR_PS(xmm4, xmm8); + xmm5 = _mm_xor_si128(xmm5, xmm12); + xmm5 = _MM_XOR_PS(xmm5, xmm13); + + xmm9 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 6)); + xmm12 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 7)); + xmm9 = _mm_shuffle_epi8(xmm9, ENDIA_SHUF_MASK); + xmm12 = _mm_shuffle_epi8(xmm12, ENDIA_SHUF_MASK); + xmm8 = xmm6; + xmm13 = xmm7; + xmm6 = _mm_clmulepi64_si128(xmm6, xmm10, 0x0); + xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x11); + xmm7 = _mm_clmulepi64_si128(xmm7, xmm10, 0x0); + xmm13 = _mm_clmulepi64_si128(xmm13, xmm10, 0x11); + xmm6 = 
_mm_xor_si128(xmm6, xmm9); + xmm6 = _MM_XOR_PS(xmm6, xmm8); + xmm7 = _mm_xor_si128(xmm7, xmm12); + xmm7 = _MM_XOR_PS(xmm7, xmm13); + + len -= 128; + //; check if there is another 128B in the buffer to be able to fold + if (len >= 0) + goto _fold_128_B_loop; //jge _fold_128_B_loop + //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + src_ptr += 128; + //; at this point, the buffer pointer is pointing at the last y Bytes of the buffer + //; fold the 8 xmm registers to 1 xmm register with different constants + // + xmm10 = _mm_set_epi64x(k_928 /*rk10*/, k_864 /*rk9*/); + xmm8 = xmm0; + xmm0 = _mm_clmulepi64_si128(xmm0, xmm10, 0x11); + xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x0); + xmm7 = _mm_xor_si128(xmm7, xmm8); + xmm7 = _MM_XOR_PS(xmm7, xmm0); + + xmm10 = _mm_set_epi64x(k_800 /*rk12*/, k_736 /*rk11*/); + xmm8 = xmm1; + xmm1 = _mm_clmulepi64_si128(xmm1, xmm10, 0x11); + xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x0); + xmm7 = _mm_xor_si128(xmm7, xmm8); + xmm7 = _MM_XOR_PS(xmm7, xmm1); + xmm10 = _mm_set_epi64x(k_672 /*rk14*/, k_608 /*rk13*/); + xmm8 = xmm2; + xmm2 = _mm_clmulepi64_si128(xmm2, xmm10, 0x11); + xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x0); + xmm7 = _mm_xor_si128(xmm7, xmm8); + xmm7 = _mm_xor_si128(xmm7, xmm2); + xmm10 = _mm_set_epi64x(k_544 /*rk16*/, k_480 /*rk15*/); + xmm8 = xmm3; + xmm3 = _mm_clmulepi64_si128(xmm3, xmm10, 0x11); + xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x0); + xmm7 = _mm_xor_si128(xmm7, xmm8); + xmm7 = _MM_XOR_PS(xmm7, xmm3); + xmm10 = _mm_set_epi64x(k_416 /*rk18*/, k_352 /*rk17*/); + xmm8 = xmm4; + xmm4 = _mm_clmulepi64_si128(xmm4, xmm10, 0x11); + xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x0); + xmm7 = _mm_xor_si128(xmm7, xmm8); + xmm7 = _mm_xor_si128(xmm7, xmm4); + xmm10 = _mm_set_epi64x(k_288 /*rk20*/, k_224 /*rk19*/); + xmm8 = xmm5; + xmm5 = _mm_clmulepi64_si128(xmm5, xmm10, 0x11); + xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x0); + xmm7 = _mm_xor_si128(xmm7, xmm8); + xmm7 = _MM_XOR_PS(xmm7, xmm5); + xmm10 = 
_mm_set_epi64x(k_160 /*rk2*/, k_96 /*rk1*/); + // ; imm value of pclmulqdq instruction will determine which constant to use + xmm8 = xmm6; + xmm6 = _mm_clmulepi64_si128(xmm6, xmm10, 0x11); + xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x0); + xmm7 = _mm_xor_si128(xmm7, xmm8); + xmm7 = _mm_xor_si128(xmm7, xmm6); + // + // ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop + // ; instead of a cmp instruction, we use the negative flag with the jl instruction + len += (128 - 16); + if (len < 0) + goto _final_reduction_for_128; // jl _final_reduction_for_128 +// ; now we have 16 + y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory +// ; we can fold 16 bytes at a time if y >= 16 +// ; continue folding 16B at a time +_16B_reduction_loop: + xmm8 = xmm7; + xmm7 = _mm_clmulepi64_si128(xmm7, xmm10, 0x11); + xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x0); + xmm7 = _mm_xor_si128(xmm7, xmm8); + xmm0 = _mm_loadu_si128((const __m128i*)src_ptr); + xmm0 = _mm_shuffle_epi8(xmm0, ENDIA_SHUF_MASK); + xmm7 = _mm_xor_si128(xmm7, xmm0); + src_ptr += 16; + len -= 16; + // ; instead of a cmp instruction, we utilize the flags with the jge instruction + // ; equivalent of : cmp arg3, 16 - 16 + // ; check if there is any more 16B in the buffer to be able to fold + + if (len >= 0) + goto _16B_reduction_loop; // jge _16B_reduction_loop +// ; now we have 16 + z bytes left to reduce, where 0 <= z < 16. +// ; first, we reduce the data in the xmm7 register +_final_reduction_for_128: + // ; check if any more data to fold.If not, compute the CRC of the final 128 bits + len += 16; + if (len == 0) + goto _128_done; // je _128_done + // ; here we are getting data that is less than 16 bytes. + // ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes. + // ; after that the registers need to be adjusted. 
+#ifndef DML_DISABLE_OPTIMIZATION_ +_get_last_two_xmms: +#endif // DML_DISABLE_OPTIMIZATION_ + xmm2 = xmm7; + xmm1 = _mm_loadu_si128((const __m128i*)(src_ptr - 16 + len)); + xmm1 = _mm_shuffle_epi8(xmm1, ENDIA_SHUF_MASK); + // ; get rid of the extra data that was loaded before + // ; load the shift constant + // lea rax, [pshufb_shf_table + 16] + // sub rax, arg3 + ptr = (uint8_t*)pshufb_shf_table + 16 - len; + xmm0 = _mm_loadu_si128((const __m128i*)ptr); + // + // ; shift xmm2 to the left by arg3 bytes + xmm2 = _mm_shuffle_epi8(xmm2, xmm0); + xmm0 = _mm_xor_si128(xmm0, mask1); + xmm7 = _mm_shuffle_epi8(xmm7, xmm0); + xmm1 = _mm_blendv_epi8(xmm1, xmm2, xmm0); + // ; fold 16 Bytes + xmm2 = xmm1; + xmm8 = xmm7; + xmm7 = _mm_clmulepi64_si128(xmm7, xmm10, 0x11); + xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x0); + xmm7 = _mm_xor_si128(xmm7, xmm8); + xmm7 = _mm_xor_si128(xmm7, xmm2); +_128_done: + // ; compute crc of a 128 - bit value + xmm10 = _mm_set_epi64x(k_64 /*rk6*/, k_96 /*rk5*/); + xmm0 = xmm7; + // ; 64b fold + xmm7 = _mm_clmulepi64_si128(xmm7, xmm10, 0x1); + xmm0 = _mm_slli_si128(xmm0, 8); + xmm7 = _mm_xor_si128(xmm7, xmm0); + // ; 32b fold + xmm0 = xmm7; + xmm0 = _mm_and_si128(xmm0, mask2); + xmm7 = _mm_srli_si128(xmm7, 12); + xmm7 = _mm_clmulepi64_si128(xmm7, xmm10, 0x10); + xmm7 = _mm_xor_si128(xmm7, xmm0); +// ; barrett reduction +_barrett: + xmm10 = _mm_set_epi64x(q /*rk8*/, u /*rk7*/); + xmm0 = xmm7; + xmm7 = _mm_clmulepi64_si128(xmm7, xmm10, 0x01); + xmm7 = _mm_slli_si128(xmm7, 4); + xmm7 = _mm_clmulepi64_si128(xmm7, xmm10, 0x11); + + xmm7 = _mm_slli_si128(xmm7, 4); + xmm7 = _mm_xor_si128(xmm7, xmm0); + eax = _mm_extract_epi32(xmm7, 1); +#ifndef DML_DISABLE_OPTIMIZATION_ +_cleanup: +#endif // DML_DISABLE_OPTIMIZATION_ + // ; scale the result back to 16 bits + eax = ((uint32_t)eax) >> (32 - crc_size); + *crc_ptr = eax; + return; // ret + + //align 16 +#ifndef DML_DISABLE_OPTIMIZATION_ +_less_than_256: + // + //; check if there is enough buffer to be able 
to fold 16B at a time + //cmp arg3, 32 + //jl _less_than_32 + if (len < 32) + { + goto _less_than_32; + } + xmm11 = ENDIA_SHUF_MASK; + //; if there is, load the constants + xmm10 = _mm_set_epi64x(k_160 /*rk2*/, k_96 /*rk1*/); + xmm0 = _mm_cvtsi32_si128(arg1_low32); + xmm0 = _mm_slli_si128(xmm0, 12); + xmm7 = _mm_loadu_si128((const __m128i*)src_ptr); + xmm7 = _mm_shuffle_epi8(xmm7, ENDIA_SHUF_MASK); + xmm7 = _mm_xor_si128(xmm7, xmm0); + // + //; update the buffer pointer + src_ptr += 16; + // + //; update the counter.subtract 32 instead of 16 to save one instruction from the loop + len -= 32; + goto _16B_reduction_loop; //jmp _16B_reduction_loop + +//align 16 +_less_than_32: + //; mov initial crc to the return value. this is necessary for zero - length buffers. + eax = arg1_low32; //mov eax, arg1_low32 + //test arg3, arg3 + if (len == 0) + goto _cleanup; //je _cleanup + // + xmm11 = ENDIA_SHUF_MASK; + + xmm0 = _mm_cvtsi32_si128(arg1_low32); + xmm0 = _mm_slli_si128(xmm0, 12); + //cmp arg3, 16 + if (len == 16) + goto _exact_16_left; //je _exact_16_left + if (len < 16) + goto _less_than_16_left; //jl _less_than_16_left + + xmm7 = _mm_loadu_si128((const __m128i*)src_ptr); + xmm7 = _mm_shuffle_epi8(xmm7, ENDIA_SHUF_MASK); + xmm7 = _mm_xor_si128(xmm7, xmm0); + src_ptr += 16; + len -= 16; + xmm10 = _mm_set_epi64x(k_160 /*rk2*/, k_96 /*rk1*/); + goto _get_last_two_xmms; //jmp _get_last_two_xmms +//align 16 +_less_than_16_left: +#endif // DML_DISABLE_OPTIMIZATION_ + //; use stack space to load data less than 16 bytes, zero - out the 16B in memory first. 
+ // + xmm1 = _mm_setzero_si128(); + + _mm_storeu_si128((__m128i*)r11, xmm1); + //cmp arg3, 4 + if (len < 4) + goto _only_less_than_4; //jl _only_less_than_4 + //; backup the counter value + r9 = len; + //cmp arg3, 8 + if (len < 8) + goto _less_than_8_left; //jl _less_than_8_left + //; load 8 Bytes + *(int64_t*)r11 = *(int64_t*)src_ptr; + r11 += 8; + len -= 8; + src_ptr += 8; +_less_than_8_left: + //cmp arg3, 4 + if (len < 4) + goto _less_than_4_left; //jl _less_than_4_left + //; load 4 Bytes + *(int*)r11 = *(int*)src_ptr; + r11 += 4; + len -= 4; + src_ptr += 4; //add arg2, 4 +_less_than_4_left: + // + //cmp arg3, 2 + if (len < 2) + goto _less_than_2_left; //jl _less_than_2_left + // + //; load 2 Bytes + *(short*)r11 = *(short*)src_ptr; + r11 += 2; + len -= 2; + src_ptr += 2; +_less_than_2_left: + //cmp arg3, 1 + if (len < 1) + goto _zero_left; //jl _zero_left + //; load 1 Byte + *r11 = *src_ptr; +_zero_left: + xmm7 = _mm_loadu_si128((const __m128i*)ttt); + xmm7 = _mm_shuffle_epi8(xmm7, ENDIA_SHUF_MASK); + xmm7 = _mm_xor_si128(xmm7, xmm0); + + ptr = (uint8_t*)pshufb_shf_table + 16 - r9; + xmm0 = _mm_loadu_si128((const __m128i*)ptr); + xmm0 = _mm_xor_si128(xmm0, mask1); + // + xmm7 = _mm_shuffle_epi8(xmm7, xmm0); + goto _128_done; //jmp _128_done + //align 16 +#ifndef DML_DISABLE_OPTIMIZATION_ +_exact_16_left: +#endif // DML_DISABLE_OPTIMIZATION_ + xmm7 = _mm_loadu_si128((const __m128i*)src_ptr); + xmm7 = _mm_shuffle_epi8(xmm7, ENDIA_SHUF_MASK); + xmm7 = _mm_xor_si128(xmm7, xmm0); + + goto _128_done; //jmp _128_done +_only_less_than_4: + //cmp arg3, 3 + if (len < 3) + goto _only_less_than_3; //jl _only_less_than_3 + //; load 3 Bytes + r11[0] = src_ptr[0]; + + r11[1] = src_ptr[1]; + + r11[2] = src_ptr[2]; + xmm7 = _mm_loadu_si128((const __m128i*)r11); + xmm7 = _mm_shuffle_epi8(xmm7, ENDIA_SHUF_MASK); + xmm7 = _mm_xor_si128(xmm7, xmm0); + xmm7 = _mm_srli_si128(xmm7, 5); + goto _barrett; //jmp _barrett +_only_less_than_3: + //cmp arg3, 2 + if (len < 2) + goto 
_only_less_than_2; //jl _only_less_than_2 + //; load 2 Bytes + r11[0] = src_ptr[0]; + + r11[1] = src_ptr[1]; + xmm7 = _mm_loadu_si128((const __m128i*)r11); + xmm7 = _mm_shuffle_epi8(xmm7, ENDIA_SHUF_MASK); + xmm7 = _mm_xor_si128(xmm7, xmm0); + xmm7 = _mm_srli_si128(xmm7, 6); + goto _barrett; //jmp _barrett +_only_less_than_2: + // + //; load 1 Byte + eax = src_ptr[0]; + r11[0] = eax; + + xmm7 = _mm_loadu_si128((const __m128i*)r11); + xmm7 = _mm_shuffle_epi8(xmm7, ENDIA_SHUF_MASK); + xmm7 = _mm_xor_si128(xmm7, xmm0); + + xmm7 = _mm_srli_si128(xmm7, 7); + + goto _barrett; //jmp _barrett +} + +uint32_t dml_avx512_crc_u32(const uint8_t* src, uint32_t transfer_size, uint32_t crc_value, uint32_t polynomial) +{ + const size_t optimization_border = 256; + + if (transfer_size < optimization_border) + { + return dml_ref_crc_32u(src, transfer_size, crc_value, polynomial); + } + + dmlc_own_calculate_crc_32u(src, transfer_size, &crc_value, polynomial); + + return crc_value; +} diff --git a/sources/core/src/sw_dispatcher/avx512/fill.c b/sources/core/src/sw_dispatcher/avx512/fill.c new file mode 100644 index 0000000..4431a61 --- /dev/null +++ b/sources/core/src/sw_dispatcher/avx512/fill.c @@ -0,0 +1,111 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ *
+ */
+
+#include "../dml_kernels.h"
+
+#if defined(_MSC_BUILD)
+#include <intrin.h>
+#elif defined(__GNUC__)
+#include <x86intrin.h>
+#else
+#error "Unsupported compiler"
+#endif
+
+// Disable optimization for MSVC, because it doesn't support _load_maskXX intrinsic
+#ifndef _MSC_BUILD
+/**
+ * @brief Fills a region of at least 64 bytes with a repeated 64-bit pattern.
+ *
+ * The bytes up to the next 64-byte boundary are written with one masked store, the
+ * 64-byte aligned middle with aligned full-vector stores, and the remaining bytes with
+ * a final masked store.
+ *
+ * @param[in]  pattern        64-bit pattern; its lowest byte goes to the first destination byte
+ * @param[out] src            destination region (despite the name)
+ * @param[in]  transfer_size  number of bytes to fill; the only caller in this file passes
+ *                            values >= 64, which is what keeps the unaligned prefix
+ *                            (at most 63 bytes) inside the region
+ */
+static inline void fill_big_avx512(uint64_t pattern, uint8_t *const src, uint32_t transfer_size)
+{
+    // Check pointer alignment
+    const size_t unaligned_bytes     = (uintptr_t)src % 64u;
+    const size_t unaligned_part_size = (0u == unaligned_bytes) ? 0u : 64u - unaligned_bytes;
+
+    __m512i zmm_pattern = _mm512_set1_epi64(pattern);
+
+    // Fill unaligned part of destination
+    if (0u != unaligned_part_size)
+    {
+        unsigned long long mask  = 0xFFFFFFFFFFFFFFFFu >> unaligned_bytes;
+        __mmask64          mmask = _load_mask64(&mask);
+
+        _mm512_mask_storeu_epi8(src, mmask, zmm_pattern);
+
+        // The prefix consumed unaligned_part_size pattern bytes, so the aligned part has
+        // to continue from that phase: rotate the pattern left by (unaligned_bytes % 8)
+        // bytes. The shift count is reduced modulo the pattern width and a zero rotation
+        // is skipped, because shifting a 64-bit value by 64 bits or more is undefined.
+        const unsigned int rotate_bits = (unsigned int)(unaligned_bytes % 8u) * 8u;
+        if (0u != rotate_bits)
+        {
+            pattern = (pattern << rotate_bits) | (pattern >> (64u - rotate_bits));
+        }
+        zmm_pattern = _mm512_set1_epi64(pattern);
+    }
+
+    // Fill aligned part of destination
+    const size_t aligned_part_size = transfer_size - unaligned_part_size;
+    const size_t head_size         = aligned_part_size / sizeof(__m512i);
+    const size_t tail_size         = aligned_part_size % sizeof(__m512i);
+
+    uint8_t *const aligned_src = src + unaligned_part_size;
+    __m512i       *head_ptr    = (__m512i *)aligned_src;
+    __m512i       *tail_ptr    = (__m512i *)head_ptr + head_size;
+
+    // Fill head part
+    if (0u != head_size)
+    {
+        while (head_ptr != tail_ptr)
+        {
+            _mm512_store_si512(head_ptr, zmm_pattern);
+            head_ptr++;
+        }
+    }
+
+    // Fill tail part
+    if (0u != tail_size)
+    {
+        unsigned long long mask  = ~(0xFFFFFFFFFFFFFFFFu << tail_size);
+        __mmask64          mmask = _load_mask64(&mask);
+        _mm512_mask_storeu_epi8(tail_ptr, mmask, zmm_pattern);
+    }
+}
+
+/**
+ * @brief Fills fewer than 64 bytes with a repeated 64-bit pattern.
+ *
+ * @param[in]  pattern        64-bit pattern; its lowest byte goes to the first destination byte
+ * @param[out] src            destination region (despite the name)
+ * @param[in]  transfer_size  number of bytes to fill; only its value modulo 64 is used
+ */
+static inline void fill_small_avx512(uint64_t pattern, uint8_t *const src, uint32_t transfer_size)
+{
+    // Fill 0-63 bytes with 64bit pattern via two _mm256_mask_storeu_epi8 calls.
+
+    const size_t tail_size = transfer_size % sizeof(__m512i);
+
+    __m256i ymm1_pattern = _mm256_set1_epi64x(pattern);
+
+    // One bit per byte to write; the two 32-bit halves of this value are loaded as the
+    // masks for bytes 0..31 and 32..63 respectively
+    unsigned long long mask_value = ~(0xFFFFFFFFFFFFFFFFu << tail_size);
+    __mmask32          mask_first = _load_mask32((uint32_t *)&mask_value);
+    _mm256_mask_storeu_epi8(src, mask_first, ymm1_pattern);
+
+    __mmask32 mask_second = _load_mask32((uint32_t *)&mask_value + 1u);
+    _mm256_mask_storeu_epi8(src + 32u, mask_second, ymm1_pattern);
+}
+
+/**
+ * @brief Fills a memory region with a repeated 64-bit pattern (AVX-512 implementation).
+ *
+ * @param[in]  pattern        64-bit pattern to repeat
+ * @param[out] dst            destination region
+ * @param[in]  transfer_size  number of bytes to fill
+ */
+void dml_avx512_fill_u64(uint64_t pattern, uint8_t *dst, uint32_t transfer_size)
+{
+    const size_t small_bound = 64u;
+
+    if (transfer_size < small_bound)
+    {
+        fill_small_avx512(pattern, dst, transfer_size);
+    }
+    else
+    {
+        fill_big_avx512(pattern, dst, transfer_size);
+    }
+}
+#else
+/**
+ * @brief MSVC build: forwards to the reference fill implementation.
+ */
+void dml_avx512_fill_u64(uint64_t pattern, uint8_t *dst, uint32_t transfer_size)
+{
+    dml_ref_fill_u64(pattern, dst, transfer_size);
+}
+#endif
diff --git a/sources/core/src/sw_dispatcher/avx512/mem_move.c b/sources/core/src/sw_dispatcher/avx512/mem_move.c
new file mode 100644
index 0000000..1501cff
--- /dev/null
+++ b/sources/core/src/sw_dispatcher/avx512/mem_move.c
@@ -0,0 +1,862 @@
+/*
+ * Copyright 2021 Intel Corporation.
+ *
+ * This software and the related documents are Intel copyrighted materials,
+ * and your use of them is governed by the express license under which they
+ * were provided to you ("License"). Unless the License provides otherwise,
+ * you may not use, modify, copy, publish, distribute, disclose or transmit
+ * this software or the related documents without Intel's prior written
+ * permission.
+ *
+ * This software and the related documents are provided as is, with no
+ * express or implied warranties, other than those that are expressly
+ * stated in the License.
+ * + */ + +#include "../dml_cpuid.h" +#include "../dml_kernels.h" + +#if defined(_MSC_BUILD) +#include +#elif defined(__GNUC__) +#include +#else +#error "Unsupported compiler" +#endif + +#if defined(_MSC_VER) +#define OWN_ALIGNED_64_ARRAY(array_declaration) __declspec(align(64u)) array_declaration +#elif defined(__GNUC__) +#define OWN_ALIGNED_64_ARRAY(array_declaration) array_declaration __attribute__((aligned(64u))) +#endif + +static inline void dmlc_own_px_copy_8u_unrolled(const uint8_t *src, uint8_t *dst, uint32_t length) +{ + uint32_t align_dst = 64u - ((uint64_t)dst & 0x3F); + uint32_t align_src = 64u - ((uint64_t)src & 0x3F); + + if (align_dst < 64u) + { + if (length < align_dst) + { + align_dst = length; + for (uint32_t i = 0u; i < align_dst; ++i) + { + dst[i] = src[i]; + } + return; + } + for (uint32_t i = 0u; i < align_dst; ++i) + { + dst[i] = src[i]; + } + length -= align_dst; + src += align_dst; + dst += align_dst; + } + + if (align_dst == align_src) + { + const uint64_t *src_64u_ptr = (uint64_t *)src; + uint64_t *dst_64u_ptr = (uint64_t *)dst; + + uint32_t length_64u = length / sizeof(uint64_t); + uint32_t tail_start = length_64u * sizeof(uint64_t); + + while (length_64u > 3u) + { + dst_64u_ptr[0] = src_64u_ptr[0]; + dst_64u_ptr[1] = src_64u_ptr[1]; + dst_64u_ptr[2] = src_64u_ptr[2]; + dst_64u_ptr[3] = src_64u_ptr[3]; + dst_64u_ptr += 4u; + src_64u_ptr += 4u; + length_64u -= 4u; + } + + for (uint32_t i = 0u; i < length_64u; ++i) + { + dst_64u_ptr[i] = src_64u_ptr[i]; + } + + for (uint32_t i = tail_start; i < length; ++i) + { + dst[i] = src[i]; + } + } + else + { + while (length > 7u) + { + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; + dst[4] = src[4]; + dst[5] = src[5]; + dst[6] = src[6]; + dst[7] = src[7]; + + dst += 8u; + src += 8u; + length -= 8; + } + + for (uint32_t i = 0u; i < length; ++i) + { + dst[i] = src[i]; + } + } +} + +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_2u[32]) = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_4u[32]) = { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_6u[32]) = { 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_8u[32]) = { 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_10u[32]) = { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_12u[32]) = { 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_14u[32]) = { 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_16u[32]) = { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_18u[32]) = { 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_20u[32]) = { 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_22u[32]) = { 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 
40, 41, 42 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_24u[32]) = { 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_26u[32]) = { 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_28u[32]) = { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_30u[32]) = { 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_32u[32]) = { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_34u[32]) = { 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_36u[32]) = { 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_38u[32]) = { 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_40u[32]) = { 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, + 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_42u[32]) = { 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, + 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52 }; 
+OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_44u[32]) = { 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_46u[32]) = { 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_48u[32]) = { 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_50u[32]) = { 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_52u[32]) = { 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_54u[32]) = { 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, + 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_56u[32]) = { 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_58u[32]) = { 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_60u[32]) = { 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61 }; +OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_62u[32]) = { 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, + 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62 }; + +static uint16_t 
*permutex_idx_pptr[31] = { permutex_idx_2u, permutex_idx_4u, permutex_idx_6u, permutex_idx_8u, permutex_idx_10u, + permutex_idx_12u, permutex_idx_14u, permutex_idx_16u, permutex_idx_18u, permutex_idx_20u, + permutex_idx_22u, permutex_idx_24u, permutex_idx_26u, permutex_idx_28u, permutex_idx_30u, + permutex_idx_32u, permutex_idx_34u, permutex_idx_36u, permutex_idx_38u, permutex_idx_40u, + permutex_idx_42u, permutex_idx_44u, permutex_idx_46u, permutex_idx_48u, permutex_idx_50u, + permutex_idx_52u, permutex_idx_54u, permutex_idx_56u, permutex_idx_58u, permutex_idx_60u, + permutex_idx_62u }; + +static inline __m512i dmlc_own_mm512_bsrli_epi128(__m512i a, uint32_t shift) +{ + switch (shift) + { + case 1: + { + return _mm512_bsrli_epi128(a, 1); + } + case 2: + { + return _mm512_bsrli_epi128(a, 2); + } + case 3: + { + return _mm512_bsrli_epi128(a, 3); + } + case 4: + { + return _mm512_bsrli_epi128(a, 4); + } + case 5: + { + return _mm512_bsrli_epi128(a, 5); + } + case 6: + { + return _mm512_bsrli_epi128(a, 6); + } + case 7: + { + return _mm512_bsrli_epi128(a, 7); + } + case 8: + { + return _mm512_bsrli_epi128(a, 8); + } + case 9: + { + return _mm512_bsrli_epi128(a, 9); + } + case 10: + { + return _mm512_bsrli_epi128(a, 10); + } + case 11: + { + return _mm512_bsrli_epi128(a, 11); + } + case 12: + { + return _mm512_bsrli_epi128(a, 12); + } + case 13: + { + return _mm512_bsrli_epi128(a, 13); + } + case 14: + { + return _mm512_bsrli_epi128(a, 14); + } + case 15: + { + return _mm512_bsrli_epi128(a, 15); + } + default: + return _mm512_setzero_si512(); + } +} + +static inline __m512i dmlc_own_mm512_bslli_epi128(__m512i a, uint32_t shift) +{ + switch (shift) + { + case 1: + { + return _mm512_bslli_epi128(a, 1); + } + case 2: + { + return _mm512_bslli_epi128(a, 2); + } + case 3: + { + return _mm512_bslli_epi128(a, 3); + } + case 4: + { + return _mm512_bslli_epi128(a, 4); + } + case 5: + { + return _mm512_bslli_epi128(a, 5); + } + case 6: + { + return _mm512_bslli_epi128(a, 6); + } 
+ case 7: + { + return _mm512_bslli_epi128(a, 7); + } + case 8: + { + return _mm512_bslli_epi128(a, 8); + } + case 9: + { + return _mm512_bslli_epi128(a, 9); + } + case 10: + { + return _mm512_bslli_epi128(a, 10); + } + case 11: + { + return _mm512_bslli_epi128(a, 11); + } + case 12: + { + return _mm512_bslli_epi128(a, 12); + } + case 13: + { + return _mm512_bslli_epi128(a, 13); + } + case 14: + { + return _mm512_bslli_epi128(a, 14); + } + case 15: + { + return _mm512_bslli_epi128(a, 15); + } + default: + return _mm512_setzero_si512(); + } +} + +static inline __m512i dmlc_own_mm512_alignr_epi8(__m512i a, __m512i b, uint32_t shift) +{ + switch (shift) + { + case 0: + { + return b; + } + case 4: + { + return _mm512_alignr_epi32(a, b, 1); + } + case 8: + { + return _mm512_alignr_epi32(a, b, 2); + } + case 12: + { + return _mm512_alignr_epi32(a, b, 3); + } + case 16: + { + return _mm512_alignr_epi32(a, b, 4); + } + case 20: + { + return _mm512_alignr_epi32(a, b, 5); + } + case 24: + { + return _mm512_alignr_epi32(a, b, 6); + } + case 28: + { + return _mm512_alignr_epi32(a, b, 7); + } + case 32: + { + return _mm512_alignr_epi32(a, b, 8); + } + case 36: + { + return _mm512_alignr_epi32(a, b, 9); + } + case 40: + { + return _mm512_alignr_epi32(a, b, 10); + } + case 44: + { + return _mm512_alignr_epi32(a, b, 11); + } + case 48: + { + return _mm512_alignr_epi32(a, b, 12); + } + case 52: + { + return _mm512_alignr_epi32(a, b, 13); + } + case 56: + { + return _mm512_alignr_epi32(a, b, 14); + } + case 60: + { + return _mm512_alignr_epi32(a, b, 15); + } + default: + return _mm512_setzero_si512(); + } +} + +static void copy_avx512(const uint8_t *src, uint8_t *dst, uint32_t transfer_size) +{ + const size_t kilobyte = 1024; + + if (transfer_size < kilobyte) + { + dmlc_own_px_copy_8u_unrolled(src, dst, transfer_size); + return; + } + + if (transfer_size > (32 * kilobyte)) + { + size_t cache_size = dml_core_get_cache_size(); + if ((cache_size > 0) && (transfer_size > cache_size)) + 
{ + dmlc_own_px_copy_8u_unrolled(src, dst, transfer_size); + return; + } + } + + const uint32_t alignment = 64; + const uint32_t magic_number = 0x3f; + + uint32_t align_dst = alignment - ((uintptr_t)dst & magic_number); + uint32_t align_src = alignment - ((uintptr_t)src & magic_number); + + if (align_dst < alignment) + { + if (transfer_size < 4 * kilobyte) + { + dmlc_own_px_copy_8u_unrolled(src, dst, transfer_size); + return; + } + + dmlc_own_px_copy_8u_unrolled(src, dst, align_dst); + + transfer_size -= align_dst; + dst += align_dst; + src += align_dst; + + uint32_t length_512u = transfer_size / sizeof(__m512i); + uint32_t tail = transfer_size % sizeof(__m512i); + + if (0u != ((align_src - align_dst) & 15u)) + { + uint32_t shift = (align_dst > align_src) ? (align_dst - align_src) : (64u + align_dst - align_src); + + if (0u == (shift & 3u)) + { + src -= shift; + __mmask64 skip_mask = ~((1llu << shift) - 1u); + __m512i zmm0 = _mm512_maskz_loadu_epi8(skip_mask, (const __m512i *)src); + src += 64u; + + while (length_512u > 4u) + { + __m512i zmm1 = _mm512_load_si512((const __m512i *)src); + __m512i zmm2 = dmlc_own_mm512_alignr_epi8(zmm1, zmm0, shift); + __m512i zmm3 = _mm512_load_si512((const __m512i *)(src + 64u)); + __m512i zmm4 = dmlc_own_mm512_alignr_epi8(zmm3, zmm1, shift); + __m512i zmm5 = _mm512_load_si512((const __m512i *)(src + 128u)); + __m512i zmm6 = dmlc_own_mm512_alignr_epi8(zmm5, zmm3, shift); + zmm0 = _mm512_load_si512((const __m512i *)(src + 192u)); + __m512i zmm7 = dmlc_own_mm512_alignr_epi8(zmm0, zmm5, shift); + _mm512_store_si512((__m512i *)dst, zmm2); + _mm512_store_si512((__m512i *)(dst + 64u), zmm4); + _mm512_store_si512((__m512i *)(dst + 128u), zmm6); + _mm512_store_si512((__m512i *)(dst + 192u), zmm7); + src += 256u; + dst += 256u; + length_512u -= 4u; + } + + src -= 64u - shift; + } + else if (0u == (shift & 1u)) + { + src -= shift; + __mmask64 skip_mask = ~((1llu << shift) - 1u); + __m512i zmm0 = _mm512_maskz_loadu_epi8(skip_mask, (const 
__m512i *)src); + src += 64u; + + __m512i permutex_idxmm = _mm512_load_si512(permutex_idx_pptr[(shift - 2) / 2]); + + while (length_512u > 4u) + { + __m512i zmm1 = _mm512_load_si512((const __m512i *)src); + __m512i zmm2 = _mm512_permutex2var_epi16(zmm0, permutex_idxmm, zmm1); + __m512i zmm3 = _mm512_load_si512((const __m512i *)(src + 64u)); + __m512i zmm4 = _mm512_permutex2var_epi16(zmm1, permutex_idxmm, zmm3); + __m512i zmm5 = _mm512_load_si512((const __m512i *)(src + 128u)); + __m512i zmm6 = _mm512_permutex2var_epi16(zmm3, permutex_idxmm, zmm5); + zmm0 = _mm512_load_si512((const __m512i *)(src + 192u)); + __m512i zmm7 = _mm512_permutex2var_epi16(zmm5, permutex_idxmm, zmm0); + _mm512_store_si512((__m512i *)dst, zmm2); + _mm512_store_si512((__m512i *)(dst + 64u), zmm4); + _mm512_store_si512((__m512i *)(dst + 128u), zmm6); + _mm512_store_si512((__m512i *)(dst + 192u), zmm7); + src += 256u; + dst += 256u; + length_512u -= 4u; + } + + src -= 64u - shift; + } + else if (shift < 16u) + { + if (transfer_size < 16 * kilobyte) + { + dmlc_own_px_copy_8u_unrolled(src, dst, transfer_size); + return; + } + + src -= shift; + __mmask64 skip_mask = ~((1llu << shift) - 1u); + __m512i zmm0 = _mm512_maskz_loadu_epi8(skip_mask, (const __m512i *)src); + src += 64u; + + __m512i permutex_idxmm_higher = _mm512_load_si512(permutex_idx_pptr[(shift - 1) / 2]); + + while (length_512u > 4u) + { + __m512i zmm1 = _mm512_load_si512((const __m512i *)src); + __m512i zmm2_lower = dmlc_own_mm512_bsrli_epi128(zmm0, shift); + __m512i zmm2_higher = _mm512_permutex2var_epi16(zmm0, permutex_idxmm_higher, zmm1); + zmm2_higher = dmlc_own_mm512_bslli_epi128(zmm2_higher, 1u); + zmm2_higher = _mm512_mask_mov_epi8(zmm2_higher, 0x0001000100010001, zmm2_lower); + __m512i zmm3 = _mm512_load_si512((const __m512i *)(src + 64u)); + __m512i zmm4_lower = dmlc_own_mm512_bsrli_epi128(zmm1, shift); + __m512i zmm4_higher = _mm512_permutex2var_epi16(zmm1, permutex_idxmm_higher, zmm3); + zmm4_higher = 
dmlc_own_mm512_bslli_epi128(zmm4_higher, 1u); + zmm4_higher = _mm512_mask_mov_epi8(zmm4_higher, 0x0001000100010001, zmm4_lower); + __m512i zmm5 = _mm512_load_si512((const __m512i *)(src + 128u)); + __m512i zmm6_lower = dmlc_own_mm512_bsrli_epi128(zmm3, shift); + __m512i zmm6_higher = _mm512_permutex2var_epi16(zmm3, permutex_idxmm_higher, zmm5); + zmm6_higher = dmlc_own_mm512_bslli_epi128(zmm6_higher, 1u); + zmm6_higher = _mm512_mask_mov_epi8(zmm6_higher, 0x0001000100010001, zmm6_lower); + zmm0 = _mm512_load_si512((const __m512i *)(src + 192u)); + __m512i zmm7_lower = dmlc_own_mm512_bsrli_epi128(zmm5, shift); + __m512i zmm7_higher = _mm512_permutex2var_epi16(zmm5, permutex_idxmm_higher, zmm0); + zmm7_higher = dmlc_own_mm512_bslli_epi128(zmm7_higher, 1u); + zmm7_higher = _mm512_mask_mov_epi8(zmm7_higher, 0x0001000100010001, zmm7_lower); + _mm512_store_si512((__m512i *)dst, zmm2_higher); + _mm512_store_si512((__m512i *)(dst + 64u), zmm4_higher); + _mm512_store_si512((__m512i *)(dst + 128u), zmm6_higher); + _mm512_store_si512((__m512i *)(dst + 192u), zmm7_higher); + src += 256u; + dst += 256u; + length_512u -= 4u; + } + + src -= 64u - shift; + } + else if (shift > 48u) + { + if (transfer_size < 16 * kilobyte) + { + dmlc_own_px_copy_8u_unrolled(src, dst, transfer_size); + return; + } + + src -= shift; + __mmask64 skip_mask = ~((1llu << shift) - 1u); + __m512i zmm0 = _mm512_maskz_loadu_epi8(skip_mask, (const __m512i *)src); + src += 64u; + + __m512i permutex_idxmm_lower = _mm512_load_si512(permutex_idx_pptr[(shift - 3) / 2]); + uint32_t shift_higher = 64u - shift; + + while (length_512u > 4u) + { + __m512i zmm1 = _mm512_load_si512((const __m512i *)src); + __m512i zmm2_lower = _mm512_permutex2var_epi16(zmm0, permutex_idxmm_lower, zmm1); + zmm2_lower = dmlc_own_mm512_bsrli_epi128(zmm2_lower, 1u); + __m512i zmm2_higher = dmlc_own_mm512_bslli_epi128(zmm1, shift_higher); + zmm2_higher = _mm512_mask_mov_epi8(zmm2_higher, 0x7FFF7FFF7FFF7FFF, zmm2_lower); + __m512i zmm3 = 
_mm512_load_si512((const __m512i *)(src + 64u)); + __m512i zmm4_lower = _mm512_permutex2var_epi16(zmm1, permutex_idxmm_lower, zmm3); + zmm4_lower = dmlc_own_mm512_bsrli_epi128(zmm4_lower, 1u); + __m512i zmm4_higher = dmlc_own_mm512_bslli_epi128(zmm3, shift_higher); + zmm4_higher = _mm512_mask_mov_epi8(zmm4_higher, 0x7FFF7FFF7FFF7FFF, zmm4_lower); + __m512i zmm5 = _mm512_load_si512((const __m512i *)(src + 128u)); + __m512i zmm6_lower = _mm512_permutex2var_epi16(zmm3, permutex_idxmm_lower, zmm5); + zmm6_lower = dmlc_own_mm512_bsrli_epi128(zmm6_lower, 1u); + __m512i zmm6_higher = dmlc_own_mm512_bslli_epi128(zmm5, shift_higher); + zmm6_higher = _mm512_mask_mov_epi8(zmm6_higher, 0x7FFF7FFF7FFF7FFF, zmm6_lower); + zmm0 = _mm512_load_si512((const __m512i *)(src + 192u)); + __m512i zmm7_lower = _mm512_permutex2var_epi16(zmm5, permutex_idxmm_lower, zmm0); + zmm7_lower = dmlc_own_mm512_bsrli_epi128(zmm7_lower, 1u); + __m512i zmm7_higher = dmlc_own_mm512_bslli_epi128(zmm0, shift_higher); + zmm7_higher = _mm512_mask_mov_epi8(zmm7_higher, 0x7FFF7FFF7FFF7FFF, zmm7_lower); + _mm512_store_si512((__m512i *)dst, zmm2_higher); + _mm512_store_si512((__m512i *)(dst + 64u), zmm4_higher); + _mm512_store_si512((__m512i *)(dst + 128u), zmm6_higher); + _mm512_store_si512((__m512i *)(dst + 192u), zmm7_higher); + src += 256u; + dst += 256u; + length_512u -= 4u; + } + + src -= 64u - shift; + } + else + { + dmlc_own_px_copy_8u_unrolled(src, dst, transfer_size); + return; + } + } + else + { + while (length_512u > 3u) + { + __m512i zmm0 = _mm512_loadu_si512((const __m512i *)src); + __m512i zmm1 = _mm512_loadu_si512((const __m512i *)(src + 64u)); + __m512i zmm2 = _mm512_loadu_si512((const __m512i *)(src + 128u)); + __m512i zmm3 = _mm512_loadu_si512((const __m512i *)(src + 192u)); + _mm512_store_si512((__m512i *)dst, zmm0); + _mm512_store_si512((__m512i *)(dst + 64u), zmm1); + _mm512_store_si512((__m512i *)(dst + 128u), zmm2); + _mm512_store_si512((__m512i *)(dst + 192u), zmm3); + src += 256u; + 
dst += 256u; + length_512u -= 4; + } + } + while (length_512u > 0u) + { + __m512i zmm0 = _mm512_loadu_si512((const __m512i *)src); + _mm512_store_si512((__m512i *)dst, zmm0); + src += 64u; + dst += 64u; + --length_512u; + } + + dmlc_own_px_copy_8u_unrolled(src, dst, tail); + + return; + } + + uint32_t length_512u = transfer_size / sizeof(__m512i); + uint32_t tail = transfer_size % sizeof(__m512i); + + if (align_src < 64u) + { + if (transfer_size < 32 * kilobyte) + { + dmlc_own_px_copy_8u_unrolled(src, dst, transfer_size); + return; + } + + uint32_t shift = 64 - align_src; + + if (0u == (shift & 3u)) + { + src -= shift; + __mmask64 skip_mask = ~((1llu << shift) - 1u); + __m512i zmm0 = _mm512_maskz_loadu_epi8(skip_mask, (const __m512i *)src); + src += 64u; + + while (length_512u > 4u) + { + __m512i zmm1 = _mm512_load_si512((const __m512i *)src); + __m512i zmm2 = dmlc_own_mm512_alignr_epi8(zmm1, zmm0, shift); + __m512i zmm3 = _mm512_load_si512((const __m512i *)(src + 64u)); + __m512i zmm4 = dmlc_own_mm512_alignr_epi8(zmm3, zmm1, shift); + __m512i zmm5 = _mm512_load_si512((const __m512i *)(src + 128u)); + __m512i zmm6 = dmlc_own_mm512_alignr_epi8(zmm5, zmm3, shift); + zmm0 = _mm512_load_si512((const __m512i *)(src + 192u)); + __m512i zmm7 = dmlc_own_mm512_alignr_epi8(zmm0, zmm5, shift); + _mm512_store_si512((__m512i *)dst, zmm2); + _mm512_store_si512((__m512i *)(dst + 64u), zmm4); + _mm512_store_si512((__m512i *)(dst + 128u), zmm6); + _mm512_store_si512((__m512i *)(dst + 192u), zmm7); + src += 256u; + dst += 256u; + length_512u -= 4u; + } + + src -= 64u - shift; + } + else if (0u == (shift & 1u)) + { + src -= shift; + __mmask64 skip_mask = ~((1llu << shift) - 1u); + __m512i zmm0 = _mm512_maskz_loadu_epi8(skip_mask, (const __m512i *)src); + src += 64u; + + __m512i permutex_idxmm = _mm512_load_si512(permutex_idx_pptr[(shift - 2) / 2]); + + while (length_512u > 4u) + { + __m512i zmm1 = _mm512_load_si512((const __m512i *)src); + __m512i zmm2 = 
_mm512_permutex2var_epi16(zmm0, permutex_idxmm, zmm1); + __m512i zmm3 = _mm512_load_si512((const __m512i *)(src + 64u)); + __m512i zmm4 = _mm512_permutex2var_epi16(zmm1, permutex_idxmm, zmm3); + __m512i zmm5 = _mm512_load_si512((const __m512i *)(src + 128u)); + __m512i zmm6 = _mm512_permutex2var_epi16(zmm3, permutex_idxmm, zmm5); + zmm0 = _mm512_load_si512((const __m512i *)(src + 192u)); + __m512i zmm7 = _mm512_permutex2var_epi16(zmm5, permutex_idxmm, zmm0); + _mm512_store_si512((__m512i *)dst, zmm2); + _mm512_store_si512((__m512i *)(dst + 64u), zmm4); + _mm512_store_si512((__m512i *)(dst + 128u), zmm6); + _mm512_store_si512((__m512i *)(dst + 192u), zmm7); + src += 256u; + dst += 256u; + length_512u -= 4u; + } + + src -= 64u - shift; + } + else if (shift < 16u) + { + src -= shift; + __mmask64 skip_mask = ~((1llu << shift) - 1u); + __m512i zmm0 = _mm512_maskz_loadu_epi8(skip_mask, (const __m512i *)src); + src += 64u; + + __m512i permutex_idxmm_higher = _mm512_load_si512(permutex_idx_pptr[(shift - 1) / 2]); + + while (length_512u > 4u) + { + __m512i zmm1 = _mm512_load_si512((const __m512i *)src); + __m512i zmm2_lower = dmlc_own_mm512_bsrli_epi128(zmm0, shift); + __m512i zmm2_higher = _mm512_permutex2var_epi16(zmm0, permutex_idxmm_higher, zmm1); + zmm2_higher = dmlc_own_mm512_bslli_epi128(zmm2_higher, 1u); + zmm2_higher = _mm512_mask_mov_epi8(zmm2_higher, 0x0001000100010001, zmm2_lower); + __m512i zmm3 = _mm512_load_si512((const __m512i *)(src + 64u)); + __m512i zmm4_lower = dmlc_own_mm512_bsrli_epi128(zmm1, shift); + __m512i zmm4_higher = _mm512_permutex2var_epi16(zmm1, permutex_idxmm_higher, zmm3); + zmm4_higher = dmlc_own_mm512_bslli_epi128(zmm4_higher, 1u); + zmm4_higher = _mm512_mask_mov_epi8(zmm4_higher, 0x0001000100010001, zmm4_lower); + __m512i zmm5 = _mm512_load_si512((const __m512i *)(src + 128u)); + __m512i zmm6_lower = dmlc_own_mm512_bsrli_epi128(zmm3, shift); + __m512i zmm6_higher = _mm512_permutex2var_epi16(zmm3, permutex_idxmm_higher, zmm5); + 
zmm6_higher = dmlc_own_mm512_bslli_epi128(zmm6_higher, 1u); + zmm6_higher = _mm512_mask_mov_epi8(zmm6_higher, 0x0001000100010001, zmm6_lower); + zmm0 = _mm512_load_si512((const __m512i *)(src + 192u)); + __m512i zmm7_lower = dmlc_own_mm512_bsrli_epi128(zmm5, shift); + __m512i zmm7_higher = _mm512_permutex2var_epi16(zmm5, permutex_idxmm_higher, zmm0); + zmm7_higher = dmlc_own_mm512_bslli_epi128(zmm7_higher, 1u); + zmm7_higher = _mm512_mask_mov_epi8(zmm7_higher, 0x0001000100010001, zmm7_lower); + _mm512_store_si512((__m512i *)dst, zmm2_higher); + _mm512_store_si512((__m512i *)(dst + 64u), zmm4_higher); + _mm512_store_si512((__m512i *)(dst + 128u), zmm6_higher); + _mm512_store_si512((__m512i *)(dst + 192u), zmm7_higher); + src += 256u; + dst += 256u; + length_512u -= 4u; + } + + src -= 64u - shift; + } + else if (shift > 48u) + { + src -= shift; + __mmask64 skip_mask = ~((1llu << shift) - 1u); + __m512i zmm0 = _mm512_maskz_loadu_epi8(skip_mask, (const __m512i *)src); + src += 64u; + + __m512i permutex_idxmm_lower = _mm512_load_si512(permutex_idx_pptr[(shift - 3) / 2]); + uint32_t shift_higher = 64u - shift; + + while (length_512u > 4u) + { + __m512i zmm1 = _mm512_load_si512((const __m512i *)src); + __m512i zmm2_lower = _mm512_permutex2var_epi16(zmm0, permutex_idxmm_lower, zmm1); + zmm2_lower = dmlc_own_mm512_bsrli_epi128(zmm2_lower, 1u); + __m512i zmm2_higher = dmlc_own_mm512_bslli_epi128(zmm1, shift_higher); + zmm2_higher = _mm512_mask_mov_epi8(zmm2_higher, 0x7FFF7FFF7FFF7FFF, zmm2_lower); + __m512i zmm3 = _mm512_load_si512((const __m512i *)(src + 64u)); + __m512i zmm4_lower = _mm512_permutex2var_epi16(zmm1, permutex_idxmm_lower, zmm3); + zmm4_lower = dmlc_own_mm512_bsrli_epi128(zmm4_lower, 1u); + __m512i zmm4_higher = dmlc_own_mm512_bslli_epi128(zmm3, shift_higher); + zmm4_higher = _mm512_mask_mov_epi8(zmm4_higher, 0x7FFF7FFF7FFF7FFF, zmm4_lower); + __m512i zmm5 = _mm512_load_si512((const __m512i *)(src + 128u)); + __m512i zmm6_lower = 
_mm512_permutex2var_epi16(zmm3, permutex_idxmm_lower, zmm5); + zmm6_lower = dmlc_own_mm512_bsrli_epi128(zmm6_lower, 1u); + __m512i zmm6_higher = dmlc_own_mm512_bslli_epi128(zmm5, shift_higher); + zmm6_higher = _mm512_mask_mov_epi8(zmm6_higher, 0x7FFF7FFF7FFF7FFF, zmm6_lower); + zmm0 = _mm512_load_si512((const __m512i *)(src + 192u)); + __m512i zmm7_lower = _mm512_permutex2var_epi16(zmm5, permutex_idxmm_lower, zmm0); + zmm7_lower = dmlc_own_mm512_bsrli_epi128(zmm7_lower, 1u); + __m512i zmm7_higher = dmlc_own_mm512_bslli_epi128(zmm0, shift_higher); + zmm7_higher = _mm512_mask_mov_epi8(zmm7_higher, 0x7FFF7FFF7FFF7FFF, zmm7_lower); + _mm512_store_si512((__m512i *)dst, zmm2_higher); + _mm512_store_si512((__m512i *)(dst + 64u), zmm4_higher); + _mm512_store_si512((__m512i *)(dst + 128u), zmm6_higher); + _mm512_store_si512((__m512i *)(dst + 192u), zmm7_higher); + src += 256u; + dst += 256u; + length_512u -= 4u; + } + + src -= 64u - shift; + } + else + { + dmlc_own_px_copy_8u_unrolled(src, dst, transfer_size); + return; + } + } + else + { + if (((12 * kilobyte) < transfer_size) && (transfer_size < (32 * kilobyte))) + { + dmlc_own_px_copy_8u_unrolled(src, dst, transfer_size); + return; + } + while (length_512u > 3u) + { + __m512i zmm0 = _mm512_load_si512((const __m512i *)src); + __m512i zmm1 = _mm512_load_si512((const __m512i *)(src + 64u)); + __m512i zmm2 = _mm512_load_si512((const __m512i *)(src + 128u)); + __m512i zmm3 = _mm512_load_si512((const __m512i *)(src + 192u)); + _mm512_store_si512((__m512i *)dst, zmm0); + _mm512_store_si512((__m512i *)(dst + 64u), zmm1); + _mm512_store_si512((__m512i *)(dst + 128u), zmm2); + _mm512_store_si512((__m512i *)(dst + 192u), zmm3); + src += 256u; + dst += 256u; + length_512u -= 4; + } + } + + while (length_512u > 0u) + { + __m512i zmm0 = _mm512_loadu_si512((const __m512i *)src); + _mm512_store_si512((__m512i *)dst, zmm0); + src += 64u; + dst += 64u; + --length_512u; + } + + dmlc_own_px_copy_8u_unrolled(src, dst, tail); +} + +void 
dml_avx512_mem_move(const uint8_t *src, uint8_t *dst, uint32_t transfer_size) +{ + const uint8_t *const src_begin = src; + const uint8_t *const src_end = src + transfer_size; + const uint8_t *const dst_begin = dst; + const uint8_t *const dst_end = dst + transfer_size; + + /* + * Either: + * src: |-------| + * dst: |-------| + * + * OR: + * src: |-------| + * dst: |-------| + * + * Assume ranges are exclusive, then equality is taken into account + * + * Copy is safe + */ + if (src_end <= dst_begin || src_begin >= dst_end) + { + copy_avx512(src, dst, transfer_size); + } + /* + * Fallback to move + */ + else + { + dml_ref_mem_move(src, dst, transfer_size); + } +} diff --git a/sources/core/src/sw_dispatcher/cache_flush/CMakeLists.txt b/sources/core/src/sw_dispatcher/cache_flush/CMakeLists.txt new file mode 100644 index 0000000..526052e --- /dev/null +++ b/sources/core/src/sw_dispatcher/cache_flush/CMakeLists.txt @@ -0,0 +1,26 @@ +# +# Copyright 2021 Intel Corporation. +# +# This software and the related documents are Intel copyrighted materials, +# and your use of them is governed by the express license under which they +# were provided to you ("License"). Unless the License provides otherwise, +# you may not use, modify, copy, publish, distribute, disclose or transmit +# this software or the related documents without Intel's prior written +# permission. +# +# This software and the related documents are provided as is, with no +# express or implied warranties, other than those that are expressly +# stated in the License. 
+#
+
+# Object library with the cache flush kernels (clflush / clflushopt / clwb)
+add_library(dml_kernels_cache_flush OBJECT
+        cache_flush.c
+        )
+
+target_compile_features(dml_kernels_cache_flush PRIVATE c_std_11)
+
+target_compile_options(dml_kernels_cache_flush PRIVATE ${DML_QUALITY_OPTIONS})
+
+# The clflushopt/clwb intrinsics are only usable when the matching target
+# features are enabled. Clang needs the same switches as GCC, and
+# cache_flush.c accepts both compilers through its __GNUC__ check.
+if (CMAKE_C_COMPILER_ID MATCHES "GNU|Clang")
+    target_compile_options(dml_kernels_cache_flush PRIVATE -mclflushopt -mclwb)
+endif ()
diff --git a/sources/core/src/sw_dispatcher/cache_flush/cache_flush.c b/sources/core/src/sw_dispatcher/cache_flush/cache_flush.c
new file mode 100644
index 0000000..ea58990
--- /dev/null
+++ b/sources/core/src/sw_dispatcher/cache_flush/cache_flush.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2021 Intel Corporation.
+ *
+ * This software and the related documents are Intel copyrighted materials,
+ * and your use of them is governed by the express license under which they
+ * were provided to you ("License"). Unless the License provides otherwise,
+ * you may not use, modify, copy, publish, distribute, disclose or transmit
+ * this software or the related documents without Intel's prior written
+ * permission.
+ *
+ * This software and the related documents are provided as is, with no
+ * express or implied warranties, other than those that are expressly
+ * stated in the License.
+ * + */ + +#include "../dml_kernels.h" + +#if defined(_MSC_BUILD) +#include +#elif defined(__GNUC__) +#include +#else +#error "Unsupported compiler" +#endif + +static const size_t cache_line_size = 64u; + +void dml_clflushopt(uint8_t *dst, uint32_t transfer_size) +{ + const size_t cache_line_count = transfer_size / cache_line_size; + + _mm_mfence(); + for (size_t cache_line_index = 0; cache_line_index < cache_line_count; ++cache_line_index) + { + uint8_t *cache_line = dst + (cache_line_size * cache_line_index); + + _mm_clflushopt(cache_line); + } + _mm_mfence(); +} + +void dml_clflush(uint8_t *dst, uint32_t transfer_size) +{ + const size_t cache_line_count = transfer_size / cache_line_size; + + _mm_mfence(); + for (size_t cache_line_index = 0; cache_line_index < cache_line_count; ++cache_line_index) + { + uint8_t *cache_line = dst + (cache_line_size * cache_line_index); + + _mm_clflush(cache_line); + } + _mm_mfence(); +} + +void dml_clwb(uint8_t *dst, uint32_t transfer_size) +{ + const size_t cache_line_count = transfer_size / cache_line_size; + + _mm_mfence(); + for (size_t cache_line_index = 0; cache_line_index < cache_line_count; ++cache_line_index) + { + uint8_t *cache_line = dst + (cache_line_size * cache_line_index); + + _mm_clwb(cache_line); + } + _mm_mfence(); +} + +void dml_clwb_unsupported(uint8_t *dst, uint32_t transfer_size) +{ + (void)dst; + (void)transfer_size; +} \ No newline at end of file diff --git a/sources/core/src/sw_dispatcher/dml_cpuid.c b/sources/core/src/sw_dispatcher/dml_cpuid.c new file mode 100644 index 0000000..9f2184b --- /dev/null +++ b/sources/core/src/sw_dispatcher/dml_cpuid.c @@ -0,0 +1,111 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). 
Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#include "dml_cpuid.h" + +#ifdef _WIN32 +#include "intrin.h" +typedef int dml_register_t; +#else +#include +typedef unsigned dml_register_t; +#endif + +dml_core_registers dml_core_cpuidex(dml_register_t leaf, dml_register_t sub_leaf) +{ +#ifdef _WIN32 + const dml_register_t eax = 0; + const dml_register_t ebx = 1; + const dml_register_t ecx = 2; + const dml_register_t edx = 3; + + dml_register_t regs[4]; + __cpuidex(regs, leaf, sub_leaf); + + dml_core_registers registers = { regs[eax], regs[ebx], regs[ecx], regs[edx] }; + + return registers; +#else + dml_core_registers registers; + __cpuid_count(leaf, sub_leaf, registers.eax, registers.ebx, registers.ecx, registers.edx); + + return registers; +#endif +} + +dml_core_registers dml_core_cpuid(dml_register_t leaf) +{ + return dml_core_cpuidex(leaf, 0x0); +} + +size_t dml_core_get_cache_size() +{ + static size_t cache_size = 0u; + + if (cache_size > 0) + { + return cache_size; + } + + const size_t max_cache_types = 32; + + dml_register_t max_cache_size = 0; + + for (dml_register_t cache_type = 0; cache_type < max_cache_types; cache_type++) + { + const dml_register_t cache_parameters = 0x4; + + const dml_core_registers registers = dml_core_cpuidex(cache_parameters, cache_type); + + const dml_register_t none_type = 0x1f; + const dml_register_t instruction_cache = 0x2; + if ((registers.eax & none_type) == none_type) + { + break; + } + if ((registers.eax & 0x1f) != instruction_cache) + { + // Sets = ECX + const dml_register_t sets = registers.ecx; + + // Line_Size = EBX[11:0] + const dml_register_t line_size_mask = 0xfff; + const 
dml_register_t line_size = (registers.ebx & line_size_mask); + + // Partitions = EBX[21:12] + const dml_register_t partitions_mask = 0x3ff; + const dml_register_t partitions_offset = 12; + const dml_register_t partitions = ((registers.ebx >> partitions_offset) & partitions_mask) + 1; + + // Ways = EBX[31:22] + const dml_register_t ways_mask = 0x3ff; + const dml_register_t ways_offset = 22; + const dml_register_t ways = ((registers.ebx >> ways_offset) & ways_mask) + 1; + + // This cache size in bytes + const dml_register_t this_cache_size = (ways + 1) * (partitions + 1) * (line_size + 1) * (sets + 1); + + if (this_cache_size > (dml_register_t)max_cache_size) + { + max_cache_size = this_cache_size; + } + } + } + + cache_size = (size_t)max_cache_size; + + return cache_size; +} diff --git a/sources/core/src/sw_dispatcher/dml_cpuid.h b/sources/core/src/sw_dispatcher/dml_cpuid.h new file mode 100644 index 0000000..a5e4ff6 --- /dev/null +++ b/sources/core/src/sw_dispatcher/dml_cpuid.h @@ -0,0 +1,63 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#ifndef DML_CORE_OWN_KERNELS_CPUID_H +#define DML_CORE_OWN_KERNELS_CPUID_H + +#ifdef _WIN32 +typedef int dml_register_t; +#else +typedef unsigned dml_register_t; +#endif + +#include + +#define DML_CPUID_EXTENSIONS 0x7 + +#define DML_AVX512F (1 << 16) +#define DML_AVX512DQ (1 << 17) +#define DML_AVX512CD (1 << 28) +#define DML_AVX512BW (1 << 30) +#define DML_AVX512VL (1u << 31) + +#define DML_AVX512_MASK (DML_AVX512F | DML_AVX512DQ | DML_AVX512CD | DML_AVX512BW | DML_AVX512VL) + +#define DML_CLFLUSHOPT (1 << 23) +#define DML_CLWB (1 << 24) + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct +{ + dml_register_t eax; + dml_register_t ebx; + dml_register_t ecx; + dml_register_t edx; +} dml_core_registers; + +dml_core_registers dml_core_cpuidex(dml_register_t leaf, dml_register_t sub_leaf); + +dml_core_registers dml_core_cpuid(dml_register_t leaf); + +size_t dml_core_get_cache_size(); + +#ifdef __cplusplus +} +#endif + +#endif // DML_CORE_OWN_KERNELS_CPUID_H diff --git a/sources/core/src/sw_dispatcher/dml_kernels.h b/sources/core/src/sw_dispatcher/dml_kernels.h new file mode 100644 index 0000000..72e1b36 --- /dev/null +++ b/sources/core/src/sw_dispatcher/dml_kernels.h @@ -0,0 +1,72 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#ifndef DML_CORE_OWN_KERNELS_DEFS_H +#define DML_CORE_OWN_KERNELS_DEFS_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void dml_ref_mem_move(const uint8_t *src, uint8_t *dst, uint32_t transfer_size); + +void dml_avx512_mem_move(const uint8_t *src, uint8_t *dst, uint32_t transfer_size); + +void dml_ref_fill_u64(uint64_t pattern, uint8_t *dst, uint32_t transfer_size); + +void dml_avx512_fill_u64(uint64_t pattern, uint8_t *dst, uint32_t transfer_size); + +uint32_t dml_ref_compare(const uint8_t *src1, const uint8_t *src2, uint32_t transfer_size, uint8_t *result); + +uint32_t dml_avx512_compare(const uint8_t *src1, const uint8_t *src2, uint32_t transfer_size, uint8_t *result); + +uint32_t dml_ref_compare_pattern(uint64_t pattern, const uint8_t *src, uint32_t transfer_size, uint8_t *result); + +uint32_t dml_avx512_compare_pattern(uint64_t pattern, const uint8_t *src, uint32_t transfer_size, uint8_t *result); + +uint32_t dml_ref_create_delta(const uint8_t *src1, + const uint8_t *src2, + uint32_t transfer_size, + uint8_t *delta_record, + uint32_t max_delta_record_size, + uint8_t *result); + +void dml_ref_apply_delta(const uint8_t *delta_record, uint8_t *dst, uint32_t delta_record_size); + +void dml_ref_dualcast(const uint8_t *src, uint8_t *dst1, uint8_t *dst2, uint32_t transfer_size); + +uint32_t dml_ref_crc_32u(const uint8_t *src, uint32_t transfer_size, uint32_t crc_value, uint32_t polynomial); + +uint32_t dml_avx512_crc_u32(const uint8_t *src, uint32_t transfer_size, uint32_t crc_value, uint32_t polynomial); + +uint32_t dml_ref_crc_reflected_u32(const uint8_t *src, uint32_t transfer_size, uint32_t crc_value, uint32_t polynomial); + +void dml_clflushopt(uint8_t *dst, uint32_t transfer_size); + +void dml_clflush(uint8_t *dst, uint32_t transfer_size); + +void dml_clwb(uint8_t *dst, uint32_t transfer_size); + +void dml_clwb_unsupported(uint8_t *dst, uint32_t transfer_size); + +#ifdef __cplusplus +} +#endif + +#endif // 
DML_CORE_OWN_KERNELS_DEFS_H diff --git a/sources/core/src/sw_dispatcher/optimization_dispatcher.cpp b/sources/core/src/sw_dispatcher/optimization_dispatcher.cpp new file mode 100644 index 0000000..e3bdf32 --- /dev/null +++ b/sources/core/src/sw_dispatcher/optimization_dispatcher.cpp @@ -0,0 +1,148 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#include "optimization_dispatcher.hpp" + +#include + +#include "dml_cpuid.h" +#include "dml_kernels.h" + +namespace dml::core::dispatch +{ + static auto gs_mem_move = dml_ref_mem_move; + static auto gs_fill_u64 = dml_ref_fill_u64; + static auto gs_compare = dml_ref_compare; + static auto gs_compare_pattern = dml_ref_compare_pattern; + static auto gs_create_delta = dml_ref_create_delta; + static auto gs_apply_delta = dml_ref_apply_delta; + static auto gs_dualcast = dml_ref_dualcast; + static auto gs_crc_u32 = dml_ref_crc_32u; + static auto gs_crc_reflected_u32 = dml_ref_crc_reflected_u32; + static auto gs_cache_flush = dml_clflush; + static auto gs_cache_write_back = dml_clwb_unsupported; + + class dispatcher + { + public: + dispatcher() noexcept + { +#ifdef DML_AVX512 + gs_mem_move = dml_avx512_mem_move; + gs_fill_u64 = dml_avx512_fill_u64; + gs_compare = dml_avx512_compare; + gs_compare_pattern = dml_avx512_compare_pattern; + gs_crc_u32 = dml_avx512_crc_u32; + gs_cache_flush = dml_clflushopt; + gs_cache_write_back = dml_clwb; +#endif + + // Disable 
software dispatcher to preserve previous behavior +#if 0 + auto registers = dml_core_cpuid(DML_CPUID_EXTENSIONS); + + if ((registers.ebx & DML_AVX512_MASK) == DML_AVX512_MASK) + { + gs_mem_move = dml_avx512_mem_move; + gs_fill_u64 = dml_avx512_fill_u64; + gs_compare = dml_avx512_compare; + gs_compare_pattern = dml_avx512_compare_pattern; + gs_crc_u32 = dml_avx512_crc_u32; + } + + if ((registers.ebx & DML_CLFLUSHOPT) == DML_CLFLUSHOPT) + { + gs_cache_flush = dml_clflushopt; + } + + if ((registers.ebx & DML_CLWB) == DML_CLWB) + { + gs_cache_write_back = dml_clwb; + } +#endif + } + }; + + [[maybe_unused]] static auto gs_dispatcher = dispatcher(); + + void mem_move(const uint8_t* src, uint8_t* dst, uint32_t transfer_size) noexcept + { + gs_mem_move(src, dst, transfer_size); + } + + void fill(uint64_t pattern, uint8_t* dst, uint32_t transfer_size) noexcept + { + gs_fill_u64(pattern, dst, transfer_size); + } + + std::tuple compare(const uint8_t* src1, const uint8_t* src2, uint32_t transfer_size) noexcept + { + uint8_t result = 0; + auto mismatch = gs_compare(src1, src2, transfer_size, &result); + + return { mismatch, result }; + } + + std::tuple compare_pattern(uint64_t pattern, const uint8_t* src, uint32_t transfer_size) noexcept + { + uint8_t result = 0; + auto mismatch = gs_compare_pattern(pattern, src, transfer_size, &result); + + return { mismatch, result }; + } + + std::tuple create_delta(const uint8_t* src1, + const uint8_t* src2, + uint32_t transfer_size, + uint8_t* delta_record, + uint32_t delta_max_size) noexcept + { + uint8_t result = 0; + auto delta_record_size = gs_create_delta(src1, src2, transfer_size, delta_record, delta_max_size, &result); + + return { delta_record_size, result }; + } + + void apply_delta(const uint8_t* delta_record, uint8_t* dst, uint32_t transfer_size) noexcept + { + gs_apply_delta(delta_record, dst, transfer_size); + } + + void dualcast(const uint8_t* src, uint8_t* dst1, uint8_t* dst2, uint32_t transfer_size) noexcept + { + 
gs_dualcast(src, dst1, dst2, transfer_size); + } + + uint32_t crc(const uint8_t* src, uint32_t transfer_size, uint32_t crc_seed, uint32_t polynomial) noexcept + { + return gs_crc_u32(src, transfer_size, crc_seed, polynomial); + } + + uint32_t crc_reflected(const uint8_t* src, uint32_t transfer_size, uint32_t crc_seed, uint32_t polynomial) noexcept + { + return gs_crc_reflected_u32(src, transfer_size, crc_seed, polynomial); + } + + void cache_flush(uint8_t* dst, uint32_t transfer_size) noexcept + { + gs_cache_flush(dst, transfer_size); + } + + void cache_write_back(uint8_t* dst, uint32_t transfer_size) noexcept + { + gs_cache_write_back(dst, transfer_size); + } +} // namespace dml::core::dispatch diff --git a/sources/core/src/sw_dispatcher/optimization_dispatcher.hpp b/sources/core/src/sw_dispatcher/optimization_dispatcher.hpp new file mode 100644 index 0000000..db023f7 --- /dev/null +++ b/sources/core/src/sw_dispatcher/optimization_dispatcher.hpp @@ -0,0 +1,52 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#ifndef DML_CORE_OWN_KERNELS_OPTIMIZATION_DISPATCHER_HPP +#define DML_CORE_OWN_KERNELS_OPTIMIZATION_DISPATCHER_HPP + +#include +#include + +namespace dml::core::dispatch +{ + void mem_move(const uint8_t* src, uint8_t* dst, uint32_t transfer_size) noexcept; + + void fill(uint64_t pattern, uint8_t* dst, uint32_t transfer_size) noexcept; + + std::tuple compare(const uint8_t* src1, const uint8_t* src2, uint32_t transfer_size) noexcept; + + std::tuple compare_pattern(uint64_t pattern, const uint8_t* src, uint32_t transfer_size) noexcept; + + std::tuple create_delta(const uint8_t* src1, + const uint8_t* src2, + uint32_t transfer_size, + uint8_t* delta_record, + uint32_t delta_max_size) noexcept; + + void apply_delta(const uint8_t* delta_record, uint8_t* dst, uint32_t transfer_size) noexcept; + + void dualcast(const uint8_t* src, uint8_t* dst1, uint8_t* dst2, uint32_t transfer_size) noexcept; + + uint32_t crc(const uint8_t* src, uint32_t transfer_size, uint32_t crc_seed, uint32_t polynomial = 0x1EDC6F41u) noexcept; + + uint32_t crc_reflected(const uint8_t* src, uint32_t transfer_size, uint32_t crc_seed, uint32_t polynomial = 0x1EDC6F41u) noexcept; + + void cache_flush(uint8_t* dst, uint32_t transfer_size) noexcept; + + void cache_write_back(uint8_t* dst, uint32_t transfer_size) noexcept; +} // namespace dml::core::dispatch + +#endif //DML_CORE_OWN_KERNELS_OPTIMIZATION_DISPATCHER_HPP diff --git a/sources/core/src/sw_dispatcher/ref/CMakeLists.txt b/sources/core/src/sw_dispatcher/ref/CMakeLists.txt new file mode 100644 index 0000000..39fdc5b --- /dev/null +++ b/sources/core/src/sw_dispatcher/ref/CMakeLists.txt @@ -0,0 +1,37 @@ +# +# Copyright 2021 Intel Corporation. +# +# This software and the related documents are Intel copyrighted materials, +# and your use of them is governed by the express license under which they +# were provided to you ("License"). 
Unless the License provides otherwise, +# you may not use, modify, copy, publish, distribute, disclose or transmit +# this software or the related documents without Intel's prior written +# permission. +# +# This software and the related documents are provided as is, with no +# express or implied warranties, other than those that are expressly +# stated in the License. +# + +add_library(dml_kernels_ref OBJECT + mem_move.c + fill.c + compare.c + compare_pattern.c + create_delta.c + apply_delta.c + dualcast.c + crc.c + ) + +target_compile_features(dml_kernels_ref PRIVATE c_std_11) + +target_compile_options(dml_kernels_ref PRIVATE ${DML_QUALITY_OPTIONS}) + +if (CMAKE_C_COMPILER_ID MATCHES GNU) + target_compile_options(dml_kernels_ref PRIVATE -mavx2) +endif () + +if (CMAKE_C_COMPILER_ID MATCHES MSVC) + target_compile_options(dml_kernels_ref PRIVATE /arch:AVX2) +endif () diff --git a/sources/core/src/sw_dispatcher/ref/apply_delta.c b/sources/core/src/sw_dispatcher/ref/apply_delta.c new file mode 100644 index 0000000..f48b809 --- /dev/null +++ b/sources/core/src/sw_dispatcher/ref/apply_delta.c @@ -0,0 +1,40 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#include + +#include "../dml_kernels.h" + +void dml_ref_apply_delta(const uint8_t *delta_record, uint8_t *dst, uint32_t delta_record_size) +{ + typedef uint64_t block_t; + typedef uint16_t offset_t; + + const size_t delta_note_size = sizeof(block_t) + sizeof(offset_t); + const size_t delta_notes_count = delta_record_size / delta_note_size; + + block_t *const dst_u64 = (block_t *)dst; + + for (size_t index = 0; index < delta_notes_count; ++index) + { + const uint8_t *const delta_note_position = delta_record + (delta_note_size * index); + + const offset_t offset = *(offset_t *)delta_note_position; + const block_t data = *(block_t *)(delta_note_position + sizeof(offset_t)); + + dst_u64[offset] = data; + } +} diff --git a/sources/core/src/sw_dispatcher/ref/compare.c b/sources/core/src/sw_dispatcher/ref/compare.c new file mode 100644 index 0000000..5964a88 --- /dev/null +++ b/sources/core/src/sw_dispatcher/ref/compare.c @@ -0,0 +1,37 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#include + +#include "../dml_kernels.h" + +uint32_t dml_ref_compare(const uint8_t* src1, const uint8_t* src2, uint32_t transfer_size, uint8_t* result) +{ + const uint8_t equal = 0x0; + const uint8_t not_equal = 0x1; + + for (size_t index = 0; index < transfer_size; ++index) + { + if (src1[index] != src2[index]) + { + *result = not_equal; + return (uint32_t)index; + } + } + + *result = equal; + return 0; +} diff --git a/sources/core/src/sw_dispatcher/ref/compare_pattern.c b/sources/core/src/sw_dispatcher/ref/compare_pattern.c new file mode 100644 index 0000000..34ba7b3 --- /dev/null +++ b/sources/core/src/sw_dispatcher/ref/compare_pattern.c @@ -0,0 +1,56 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#include + +#include "../dml_kernels.h" + +uint32_t dml_ref_compare_pattern(uint64_t pattern, const uint8_t *src, uint32_t transfer_size, uint8_t *result) +{ + const uint8_t equal = 0x0; + const uint8_t not_equal = 0x1; + + const size_t chunk_size = sizeof(pattern); + const size_t head_size = transfer_size / chunk_size; + const size_t tail_size = transfer_size % chunk_size; + + const uint64_t *const head = (const uint64_t *)src; + const uint8_t *const tail = src + chunk_size * head_size; + + for (size_t index = 0; index < head_size; ++index) + { + if (head[index] != pattern) + { + *result = not_equal; + return (uint32_t)(index * chunk_size); + } + } + + const uint8_t *const pattern_u8 = (uint8_t *)&pattern; + + for (size_t index = 0; index < tail_size; ++index) + { + // No overflow for pattern. See tail_size calculation. + if (tail[index] != pattern_u8[index]) + { + *result = not_equal; + return (uint32_t)(chunk_size * head_size + index); + } + } + + *result = equal; + return 0; +} diff --git a/sources/core/src/sw_dispatcher/ref/crc.c b/sources/core/src/sw_dispatcher/ref/crc.c new file mode 100644 index 0000000..dfc5e97 --- /dev/null +++ b/sources/core/src/sw_dispatcher/ref/crc.c @@ -0,0 +1,65 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#include + +#include "../dml_kernels.h" + +static inline uint8_t reverse(uint8_t byte) +{ + byte = ((byte & 0x55u) << 1u) | ((byte & 0xAAu) >> 1u); + byte = ((byte & 0x33u) << 2u) | ((byte & 0xCCu) >> 2u); + byte = ((byte & 0x0Fu) << 4u) | ((byte & 0xF0u) >> 4u); + + return byte; +} + +static inline uint32_t calculate_crc_32u(uint32_t crc_value, uint8_t data, uint32_t polynomial) +{ + const size_t byte_width = 8; + const size_t crc_bit_count = sizeof(crc_value) * byte_width; + const size_t crc_byte_shift = crc_bit_count - byte_width; + const uint32_t high_bit_mask = 1 << (crc_bit_count - 1); + + crc_value ^= (data << crc_byte_shift); + + for (size_t bit = 0u; bit < byte_width; ++bit) + { + crc_value = (crc_value & high_bit_mask) ? ((crc_value << 1) ^ polynomial) : (crc_value << 1); + } + + return crc_value; +} + +uint32_t dml_ref_crc_32u(const uint8_t *src, uint32_t transfer_size, uint32_t crc_value, uint32_t polynomial) +{ + for (size_t byte = 0; byte < transfer_size; ++byte) + { + crc_value = calculate_crc_32u(crc_value, src[byte], polynomial); + } + + return crc_value; +} + +uint32_t dml_ref_crc_reflected_u32(const uint8_t *src, uint32_t transfer_size, uint32_t crc_value, uint32_t polynomial) +{ + for (size_t byte = 0; byte < transfer_size; ++byte) + { + crc_value = calculate_crc_32u(crc_value, reverse(src[byte]), polynomial); + } + + return crc_value; +} diff --git a/sources/core/src/sw_dispatcher/ref/create_delta.c b/sources/core/src/sw_dispatcher/ref/create_delta.c new file mode 100644 index 0000000..75249cb --- /dev/null +++ b/sources/core/src/sw_dispatcher/ref/create_delta.c @@ -0,0 +1,68 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). 
Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#include + +#include "../dml_kernels.h" + +uint32_t dml_ref_create_delta(const uint8_t *src1, + const uint8_t *src2, + uint32_t transfer_size, + uint8_t *delta_record, + uint32_t delta_record_max_size, + uint8_t *result) +{ + typedef uint64_t block_t; + typedef uint16_t offset_t; + + const size_t delta_note_size = sizeof(block_t) + sizeof(offset_t); + const size_t block_count = transfer_size / sizeof(block_t); + + uint32_t delta_record_size = 0; + + for (size_t index = 0; index < block_count; ++index) + { + const block_t block1 = *(((block_t *)src1) + index); + const block_t block2 = *(((block_t *)src2) + index); + + if (block1 != block2) + { + if ((delta_record_size + delta_note_size) > delta_record_max_size) + { + const uint8_t overflow = 0x2; + + *result = overflow; + return delta_record_size; + } + + uint8_t *const delta_position = delta_record + delta_record_size; + offset_t *const offset = (offset_t *)delta_position; + block_t *const data = (block_t *)(delta_position + sizeof(offset_t)); + + *offset = (offset_t)index; + *data = block2; + + delta_record_size += (uint32_t)delta_note_size; + } + } + + const uint8_t equal = 0x0; + const uint8_t not_equal = 0x1; + + *result = delta_record_size ? 
not_equal : equal; + + return delta_record_size; +} diff --git a/include/dml/cpp/middle_layer/core.hpp b/sources/core/src/sw_dispatcher/ref/dualcast.c similarity index 68% rename from include/dml/cpp/middle_layer/core.hpp rename to sources/core/src/sw_dispatcher/ref/dualcast.c index 864c3a2..94767ce 100644 --- a/include/dml/cpp/middle_layer/core.hpp +++ b/sources/core/src/sw_dispatcher/ref/dualcast.c @@ -14,15 +14,15 @@ * */ -#ifndef DML_ML_CORE_HPP -#define DML_ML_CORE_HPP +#include -#include "descriptor_views.hpp" -#include "result_views.hpp" +#include "../dml_kernels.h" -namespace dml::ml::core +void dml_ref_dualcast(const uint8_t *src, uint8_t *dst1, uint8_t *dst2, uint32_t transfer_size) { - execution_status submit(descriptor& dsc) noexcept; -} // namespace dml::ml::core - -#endif //DML_ML_CORE_HPP + for (size_t index = 0; index < transfer_size; ++index) + { + dst1[index] = src[index]; + dst2[index] = src[index]; + } +} diff --git a/sources/core/src/sw_dispatcher/ref/fill.c b/sources/core/src/sw_dispatcher/ref/fill.c new file mode 100644 index 0000000..d00c90c --- /dev/null +++ b/sources/core/src/sw_dispatcher/ref/fill.c @@ -0,0 +1,29 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#include + +#include "../dml_kernels.h" + +void dml_ref_fill_u64(uint64_t pattern, uint8_t *dst, uint32_t transfer_size) +{ + const uint8_t *const pattern_bytes = (const uint8_t *)&pattern; + + for (size_t index = 0; index < transfer_size; ++index) + { + dst[index] = pattern_bytes[index % sizeof(pattern)]; + } +} diff --git a/sources/core/src/sw_dispatcher/ref/mem_move.c b/sources/core/src/sw_dispatcher/ref/mem_move.c new file mode 100644 index 0000000..80ce2f5 --- /dev/null +++ b/sources/core/src/sw_dispatcher/ref/mem_move.c @@ -0,0 +1,96 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#include + +#include "../dml_kernels.h" + +static inline void own_copy_forward(const uint8_t *const src, uint8_t *const dst, uint32_t transfer_size) +{ + for (size_t index = 0; index < transfer_size; ++index) + { + dst[index] = src[index]; + } +} + +static inline void own_copy_backward(const uint8_t *const src, uint8_t *const dst, uint32_t transfer_size) +{ + const size_t last_index = transfer_size - 1; + const size_t first_index = 0; + + for (size_t index = last_index; index != first_index; --index) + { + dst[index] = src[index]; + } + + dst[first_index] = src[first_index]; +} + +void dml_ref_mem_move(const uint8_t *src, uint8_t *dst, uint32_t transfer_size) +{ + const uint8_t *const src_begin = src; + const uint8_t *const src_end = src + transfer_size; + const uint8_t *const dst_begin = dst; + const uint8_t *const dst_end = dst + transfer_size; + + /* + * Either: + * src: |-------| + * dst: |-------| + * + * OR: + * src: |-------| + * dst: |-------| + * + * Assume ranges are exclusive, then equality is taken into account + * + * Any copy is safe + */ + if (src_end <= dst_begin || src_begin >= dst_end) + { + own_copy_forward(src, dst, transfer_size); + } + /* + * src: |-------| + * dst: |-------| + * + * Only forward copy is applicable + */ + else if (src_begin < dst_end && src_end > dst_end) + { + own_copy_forward(src, dst, transfer_size); + } + /* + * src: |-------| + * dst: |-------| + * + * Only backward copy is applicable + */ + else if (src_begin < dst_begin && src_end > dst_begin) + { + own_copy_backward(src, dst, transfer_size); + } + /* + * src: |-------| + * dst: |-------| + * + * The same memory regions + */ + else + { + // Do nothing + } +} diff --git a/sources/middle_layer/utils.hpp b/sources/core/src/utils.hpp similarity index 81% rename from sources/middle_layer/utils.hpp rename to sources/core/src/utils.hpp index b689a59..2616258 100644 --- a/sources/middle_layer/utils.hpp +++ b/sources/core/src/utils.hpp @@ -17,14 +17,14 @@ #ifndef 
DML_ML_OWN_UTILS_HPP #define DML_ML_OWN_UTILS_HPP -#include +#include #include #define RETURN_STATUS_IF(expr, status) \ if ((expr)) \ return (status) -namespace dml::ml +namespace dml::core { template bool any_equal_zero(args_t... args) noexcept @@ -46,6 +46,12 @@ namespace dml::ml return ((lhs <= rhs) && ((lhs + lhs_size) > rhs)) || ((rhs <= lhs) && ((rhs + rhs_size) > lhs)); } + bool adjacent(address_t lhs, transfer_size_t lhs_size, address_t rhs) noexcept + { + // If end of lhs is the same as begin of rhs + return (lhs + lhs_size) == rhs; + } + template bool any_misaligned(args_t... args) noexcept { @@ -55,14 +61,6 @@ namespace dml::ml }; return (is_misaligned(args) || ...); } - -// template -// constexpr elem_t reverse_bytes(elem_t v) noexcept { -// constexpr auto byte_size = elem_t(8); -// constexpr auto mask = ~0 ^ (sizeof(v) * byte_size); -// v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); -// return (v >> 16) | (v << 16); -// } -} // namespace dml::ml +} // namespace dml::core #endif // DML_ML_OWN_UTILS_HPP diff --git a/sources/core/src/validation.cpp b/sources/core/src/validation.cpp new file mode 100644 index 0000000..f1c92ad --- /dev/null +++ b/sources/core/src/validation.cpp @@ -0,0 +1,333 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. 
+ * + */ + +#include +#include +#include +#include + +#include "utils.hpp" + +namespace dml::core +{ + static constexpr uint32_t dif_block_sizes[4] = { 512u, 520u, 4096u, 4104u }; + + static dml::detail::validation_status validate(nop_descriptor nop) noexcept; + + static dml::detail::validation_status validate(batch_descriptor batch) noexcept; + + static dml::detail::validation_status validate(drain_descriptor drain) noexcept; + + static dml::detail::validation_status validate(mem_move_descriptor mem_move) noexcept; + + static dml::detail::validation_status validate(fill_descriptor fill) noexcept; + + static dml::detail::validation_status validate(compare_descriptor compare) noexcept; + + static dml::detail::validation_status validate(compare_pattern_descriptor compare_pattern) noexcept; + + static dml::detail::validation_status validate(create_delta_descriptor create_delta) noexcept; + + static dml::detail::validation_status validate(apply_delta_descriptor apply_delta) noexcept; + + static dml::detail::validation_status validate(dualcast_descriptor dualcast) noexcept; + + static dml::detail::validation_status validate(crc_descriptor crc) noexcept; + + static dml::detail::validation_status validate(copy_crc_descriptor copy_crc) noexcept; + + static dml::detail::validation_status validate(dif_check_descriptor dif_check) noexcept; + + static dml::detail::validation_status validate(dif_insert_descriptor dif_insert) noexcept; + + static dml::detail::validation_status validate(dif_strip_descriptor dif_strip) noexcept; + + static dml::detail::validation_status validate(dif_update_descriptor dif_update) noexcept; + + static dml::detail::validation_status validate(cache_flush_descriptor cache_flush) noexcept; + + dml::detail::validation_status validate(descriptor &dsc) noexcept + { + auto view = any_descriptor(dsc); + + switch (static_cast(view.operation())) + { + case operation::nop: + return validate(nop_descriptor(dsc)); + case operation::batch: + return 
validate(batch_descriptor(dsc)); + case operation::drain: + return validate(drain_descriptor(dsc)); + case operation::memory_move: + return validate(mem_move_descriptor(dsc)); + case operation::fill: + return validate(fill_descriptor(dsc)); + case operation::compare: + return validate(compare_descriptor(dsc)); + case operation::compare_pattern: + return validate(compare_pattern_descriptor(dsc)); + case operation::create_delta: + return validate(create_delta_descriptor(dsc)); + case operation::apply_delta: + return validate(apply_delta_descriptor(dsc)); + case operation::dualcast: + return validate(dualcast_descriptor(dsc)); + case operation::crc: + return validate(crc_descriptor(dsc)); + case operation::copy_crc: + return validate(copy_crc_descriptor(dsc)); + case operation::dif_check: + return validate(dif_check_descriptor(dsc)); + case operation::dif_insert: + return validate(dif_insert_descriptor(dsc)); + case operation::dif_strip: + return validate(dif_strip_descriptor(dsc)); + case operation::dif_update: + return validate(dif_update_descriptor(dsc)); + case operation::cache_flush: + return validate(cache_flush_descriptor(dsc)); + default: + return dml::detail::validation_status::unsupported_operation; + } + } + + static dml::detail::validation_status validate(nop_descriptor nop) noexcept + { + static_cast(nop); + + return dml::detail::validation_status::success; + } + + static dml::detail::validation_status validate(drain_descriptor drain) noexcept + { + static_cast(drain); + + return dml::detail::validation_status::success; + } + + static dml::detail::validation_status validate(mem_move_descriptor mem_move) noexcept + { + RETURN_STATUS_IF(any_equal_zero(mem_move.source_address(), mem_move.destination_address()), + dml::detail::validation_status::null_address); + RETURN_STATUS_IF(any_equal_zero(mem_move.transfer_size()), dml::detail::validation_status::null_size); + + return dml::detail::validation_status::success; + } + + static dml::detail::validation_status 
validate(fill_descriptor fill) noexcept + { + RETURN_STATUS_IF(any_equal_zero(fill.destination_address()), dml::detail::validation_status::null_address); + RETURN_STATUS_IF(any_equal_zero(fill.transfer_size()), dml::detail::validation_status::null_size); + + return dml::detail::validation_status::success; + } + + static dml::detail::validation_status validate(compare_descriptor compare) noexcept + { + RETURN_STATUS_IF(any_equal_zero(compare.source_1_address(), compare.source_2_address()), + dml::detail::validation_status::null_address); + RETURN_STATUS_IF(any_equal_zero(compare.transfer_size()), dml::detail::validation_status::null_size); + + return dml::detail::validation_status::success; + } + + static dml::detail::validation_status validate(compare_pattern_descriptor compare_pattern) noexcept + { + RETURN_STATUS_IF(any_equal_zero(compare_pattern.source_address()), dml::detail::validation_status::null_address); + RETURN_STATUS_IF(any_equal_zero(compare_pattern.transfer_size()), dml::detail::validation_status::null_size); + + return dml::detail::validation_status::success; + } + + static dml::detail::validation_status validate(create_delta_descriptor create_delta) noexcept + { + constexpr auto max_size = 0x80000; + + RETURN_STATUS_IF( + any_equal_zero(create_delta.source_1_address(), create_delta.source_2_address(), create_delta.delta_record_address()), + dml::detail::validation_status::null_address); + + RETURN_STATUS_IF(any_equal_zero(create_delta.transfer_size(), create_delta.maximum_delta_record_size()), + dml::detail::validation_status::null_size); + + RETURN_STATUS_IF( + any_misaligned<8u>(create_delta.source_1_address(), create_delta.source_2_address(), create_delta.delta_record_address()), + dml::detail::validation_status::misalignment); + + RETURN_STATUS_IF(create_delta.transfer_size() % 8 != 0, dml::detail::validation_status::wrong_size); + + RETURN_STATUS_IF(create_delta.transfer_size() > max_size, dml::detail::validation_status::large_size); + + 
RETURN_STATUS_IF(create_delta.maximum_delta_record_size() % 10 != 0 || create_delta.maximum_delta_record_size() < 80, + dml::detail::validation_status::wrong_delta_size); + + return dml::detail::validation_status::success; + } + + static dml::detail::validation_status validate(apply_delta_descriptor apply_delta) noexcept + { + constexpr auto max_size = 0x80000; + + RETURN_STATUS_IF(any_equal_zero(apply_delta.destination_address(), apply_delta.delta_record_address()), + dml::detail::validation_status::null_address); + RETURN_STATUS_IF(any_equal_zero(apply_delta.transfer_size(), apply_delta.delta_record_size()), + dml::detail::validation_status::null_size); + + RETURN_STATUS_IF(overlaps(apply_delta.delta_record_address(), + apply_delta.delta_record_size(), + apply_delta.destination_address(), + apply_delta.transfer_size()), + dml::detail::validation_status::overlapping); + + RETURN_STATUS_IF(any_misaligned<8u>(apply_delta.destination_address(), apply_delta.delta_record_address()), + dml::detail::validation_status::misalignment); + + RETURN_STATUS_IF(apply_delta.transfer_size() % 8 != 0, dml::detail::validation_status::wrong_size); + + RETURN_STATUS_IF(apply_delta.transfer_size() > max_size, dml::detail::validation_status::large_size); + + RETURN_STATUS_IF(apply_delta.delta_record_size() % 10 != 0, dml::detail::validation_status::wrong_delta_size); + + return dml::detail::validation_status::success; + } + + static dml::detail::validation_status validate(dualcast_descriptor dualcast) noexcept + { + RETURN_STATUS_IF(any_equal_zero(dualcast.source_address(), dualcast.destination_1_address(), dualcast.destination_2_address()), + dml::detail::validation_status::null_address); + RETURN_STATUS_IF(any_equal_zero(dualcast.transfer_size()), dml::detail::validation_status::null_size); + + RETURN_STATUS_IF((dualcast.destination_1_address() & 0xFFFu) != (dualcast.destination_2_address() & 0xFFFu), + dml::detail::validation_status::wrong_dualcast_address); + + 
RETURN_STATUS_IF(overlaps(dualcast.source_address(), dualcast.destination_1_address(), dualcast.transfer_size()), + dml::detail::validation_status::overlapping); + + RETURN_STATUS_IF(overlaps(dualcast.source_address(), dualcast.destination_2_address(), dualcast.transfer_size()), + dml::detail::validation_status::overlapping); + + RETURN_STATUS_IF(overlaps(dualcast.destination_1_address(), dualcast.destination_2_address(), dualcast.transfer_size()), + dml::detail::validation_status::overlapping); + + return dml::detail::validation_status::success; + } + + static dml::detail::validation_status validate(crc_descriptor crc) noexcept + { + RETURN_STATUS_IF(any_equal_zero(crc.source_address()), dml::detail::validation_status::null_address); + RETURN_STATUS_IF(any_equal_zero(crc.transfer_size()), dml::detail::validation_status::null_size); + + return dml::detail::validation_status::success; + } + + static dml::detail::validation_status validate(copy_crc_descriptor copy_crc) noexcept + { + RETURN_STATUS_IF(any_equal_zero(copy_crc.source_address(), copy_crc.destination_address()), + dml::detail::validation_status::null_address); + RETURN_STATUS_IF(any_equal_zero(copy_crc.transfer_size()), dml::detail::validation_status::null_size); + + RETURN_STATUS_IF(overlaps(copy_crc.source_address(), copy_crc.destination_address(), copy_crc.transfer_size()), + dml::detail::validation_status::overlapping); + + return dml::detail::validation_status::success; + } + + static dml::detail::validation_status validate(cache_flush_descriptor cache_flush) noexcept + { + RETURN_STATUS_IF(any_equal_zero(cache_flush.destination_address()), dml::detail::validation_status::null_address); + RETURN_STATUS_IF(any_equal_zero(cache_flush.transfer_size()), dml::detail::validation_status::null_size); + + return dml::detail::validation_status::success; + } + + static dml::detail::validation_status validate(dif_check_descriptor dif_check) noexcept + { + const auto block_size = 
dif_block_sizes[dif_check.dif_flags() & 0b11]; + + RETURN_STATUS_IF(any_equal_zero(dif_check.source_address()), dml::detail::validation_status::null_address); + RETURN_STATUS_IF(any_equal_zero(dif_check.transfer_size()), dml::detail::validation_status::null_size); + RETURN_STATUS_IF(dif_check.transfer_size() % (block_size + sizeof(uint64_t)) != 0, dml::detail::validation_status::wrong_dif_size); + + return dml::detail::validation_status::success; + } + + static dml::detail::validation_status validate(dif_insert_descriptor dif_insert) noexcept + { + const auto block_size = dif_block_sizes[dif_insert.dif_flags() & 0b11]; + + RETURN_STATUS_IF(any_equal_zero(dif_insert.source_address(), dif_insert.destination_address()), + dml::detail::validation_status::null_address); + RETURN_STATUS_IF(any_equal_zero(dif_insert.transfer_size()), dml::detail::validation_status::null_size); + RETURN_STATUS_IF(dif_insert.transfer_size() % block_size != 0, dml::detail::validation_status::wrong_dif_size); + + const auto src_size = dif_insert.transfer_size(); + const auto dst_size = src_size + ((src_size / block_size) * 8); + RETURN_STATUS_IF(overlaps(dif_insert.source_address(), src_size, dif_insert.destination_address(), dst_size), + dml::detail::validation_status::overlapping); + + return dml::detail::validation_status::success; + } + + static dml::detail::validation_status validate(dif_strip_descriptor dif_strip) noexcept + { + const auto block_size = dif_block_sizes[dif_strip.dif_flags() & 0b11]; + + RETURN_STATUS_IF(any_equal_zero(dif_strip.source_address(), dif_strip.destination_address()), + dml::detail::validation_status::null_address); + RETURN_STATUS_IF(any_equal_zero(dif_strip.transfer_size()), dml::detail::validation_status::null_size); + RETURN_STATUS_IF(dif_strip.transfer_size() % (block_size + sizeof(uint64_t)) != 0, dml::detail::validation_status::wrong_dif_size); + + const auto src_size = dif_strip.transfer_size(); + const auto dst_size = src_size - ((src_size / 
block_size) * 8); + RETURN_STATUS_IF(overlaps(dif_strip.source_address(), src_size, dif_strip.destination_address(), dst_size), + dml::detail::validation_status::overlapping); + + // Hardware bug workaround + if (dif_strip.destination_address() < dif_strip.source_address()) + { + if ((dif_strip.destination_address() + dst_size) <= dif_strip.source_address() && + dif_strip.source_address() <= (dif_strip.destination_address() + src_size)) + { + return dml::detail::validation_status::dif_strip_adjacent; + } + } + + return dml::detail::validation_status::success; + } + + static dml::detail::validation_status validate(dif_update_descriptor dif_update) noexcept + { + const auto block_size = dif_block_sizes[dif_update.dif_flags() & 0b11]; + + RETURN_STATUS_IF(any_equal_zero(dif_update.source_address(), dif_update.destination_address()), + dml::detail::validation_status::null_address); + RETURN_STATUS_IF(any_equal_zero(dif_update.transfer_size()), dml::detail::validation_status::null_size); + RETURN_STATUS_IF(dif_update.transfer_size() % (block_size + sizeof(uint64_t)) != 0, dml::detail::validation_status::wrong_dif_size); + RETURN_STATUS_IF(overlaps(dif_update.source_address(), dif_update.destination_address(), dif_update.transfer_size()), + dml::detail::validation_status::overlapping); + + return dml::detail::validation_status::success; + } + + static dml::detail::validation_status validate(batch_descriptor batch) noexcept + { + RETURN_STATUS_IF(any_equal_zero(batch.descriptor_list_address()), dml::detail::validation_status::null_address); + RETURN_STATUS_IF(batch.descriptors_count() < 4, dml::detail::validation_status::wrong_batch_size); + + return dml::detail::validation_status::success; + } + +} // namespace dml::core diff --git a/sources/cores/CMakeLists.txt b/sources/cores/CMakeLists.txt deleted file mode 100644 index 8ac5538..0000000 --- a/sources/cores/CMakeLists.txt +++ /dev/null @@ -1,43 +0,0 @@ -# -# Copyright 2020-2021 Intel Corporation. 
-# -# This software and the related documents are Intel copyrighted materials, -# and your use of them is governed by the express license under which they -# were provided to you ("License"). Unless the License provides otherwise, -# you may not use, modify, copy, publish, distribute, disclose or transmit -# this software or the related documents without Intel's prior written -# permission. -# -# This software and the related documents are provided as is, with no -# express or implied warranties, other than those that are expressly -# stated in the License. -# - -add_library(dml_core OBJECT - src/dmlc_fill_8u.c - src/dmlc_delta_record_8u.c - src/dmlc_crc_16u_32u.c - src/dmlc_copy_8u.c - src/dmlc_compare_8u.c - src/dmlc_cache_8u.c - ) - -target_include_directories(dml_core - PUBLIC include - PRIVATE ../../include - PRIVATE src/include - ) - -target_compile_features(dml_core PRIVATE c_std_11) - -# TODO: target_compile_options(dml_core PRIVATE ${DML_QUALITY_OPTIONS}) - -target_compile_definitions(dml_core PRIVATE DML_CORES_BADARG_CHECK) - -# TODO: Remove -if ("${DML_ARCH}" STREQUAL "avx512") - target_compile_options(dml_core PRIVATE ${DML_AVX512_OPTIONS}) - target_compile_definitions(dml_core PRIVATE AVX512) -else() - target_compile_definitions(dml_core PRIVATE PX) -endif() diff --git a/sources/cores/include/core_compare.h b/sources/cores/include/core_compare.h deleted file mode 100644 index 778b16c..0000000 --- a/sources/cores/include/core_compare.h +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. 
- * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. - * - */ - -/** - * @file - * @date 2/10/2020 - * - * @defgroup core_public_compare Compare Features - * @ingroup core_public_features - * @{ - * - * @brief Features to compare memory region with another one or with a pattern. - * - * @details Compare group contains optimized cores, which perform the following tasks: - * - Comparing between vectors; - * - Comparing between vector and a pattern; - * - Comparing vector values with some key/range to create a bit mask; - * - Creating delta between two memory regions. - * - */ - -#include "core_definitions.h" - -#ifndef DML_KERNEL_COMPARE_H__ -#define DML_KERNEL_COMPARE_H__ - -#ifdef __cplusplus -extern "C" { -#endif - -/* ------ Kernel Compare Defines ------ */ - -#define DML_COMPARE_STATUS_EQ DML_STATUS_OK /**< Redefinition of default status in context of Compare functions */ -#define DML_COMPARE_STATUS_NE DML_STATUS_FALSE_PREDICATE_OK /**< Redefinition of default status in context of Compare functions */ - -typedef uint64_t pattern_t; /**< Special type for 8-byte pattern */ -#define DML_SIZE_PATTERN_T 64 /**< pattern_t size in bits */ - - -/* ------ Kernel Compare Functions ------ */ - -/** - * @brief Compares specified memory regions. - * - * @param[in] first_vector_ptr pointer to the reference vector - * @param[in] second_vector_ptr pointer to the vector to compare - * @param[in] size number of bytes to compare - * @param[out] mismatch_offset_ptr first mismatch offset - * - * @note No memory alignment required. - * @note After function execution mismatch_offset_ptr contains the first miss_match offset - * if vectors are not equal. - * - * @return - * - @ref DML_COMPARE_STATUS_EQ; - * - @ref DML_COMPARE_STATUS_NE; - * - @ref DML_STATUS_NULL_POINTER_ERROR. 
- */ -DML_CORE_API(dmlc_status_t, compare_8u, (const uint8_t *first_vector_ptr, - const uint8_t *second_vector_ptr, - const uint32_t size, - uint32_t *const mismatch_offset_ptr)); - - -/** - * @brief Compares specified memory region with 8-byte pattern. - * - * @param[in] memory_region_ptr pointer to the base vector - * @param[in] pattern expected 8-byte memory pattern - * @param[in] size number of bytes to compare - * @param[out] mismatch_offset_ptr first mismatch offset - * - * @note After function execution mismatch_offset_ptr contains the first miss_match offset - * if vectors are not equal. - * @note Mismatch_offset_ptr may not be the exact byte location, - * but it is guaranteed to be no greater than the first difference. - * - * @return - * - @ref DML_COMPARE_STATUS_EQ; - * - @ref DML_COMPARE_STATUS_NE; - * - @ref DML_STATUS_NULL_POINTER_ERROR. - */ -DML_CORE_API(dmlc_status_t, compare_with_pattern_8u, (const uint8_t *memory_region_ptr, - const pattern_t pattern, - const uint32_t size, - uint32_t *const mismatch_offset_ptr)); - -/** - * @brief Creates delta record if vectors are not equal - * - * @param[in] reference_vector_ptr pointer to the base vector - * @param[in] second_vector_ptr pointer to the delta that is written into the delta record - * @param[in] compared_bytes number of bytes to compare - * @param[in] delta_record_max_size maximal delta record size - * @param[out] delta_record_ptr pointer to the delta record - * @param[out] record_size_ptr created delta record size - * - * @warning: Compared vectors addresses must be aligned to a multiple of 8. - * @warning: Number of bytes to compare must be multiple of 8. - * @warning: Number of bytes to compare must be less or equal to the maximum supported offset, - * which is 524,280 bytes (0x7FFF8). - * @warning: Number of available bytes in delta record must be multiple of 10. 
- * - * @return - * - @ref DML_STATUS_OK; - * - @ref DML_STATUS_NULL_POINTER_ERROR; - * - @ref DML_STATUS_DELTA_ALIGN_ERROR in case if vector address is not aligned to be a - * multiple of 8; - * - @ref DML_STATUS_DELTA_INPUT_SIZE_ERROR in case if input vector size is not multiple of 8, or - * in case if input vector size is greater then max delta offset supported; - * - @ref DML_STATUS_DELTA_RECORD_SIZE_ERROR in case if max_delta_record_size is not sufficient - * for delta record creation, or max_delta_record_size is not a multiple of 10u - * - */ -DML_CORE_API(dmlc_status_t, create_delta_record_8u, (const uint8_t *reference_vector_ptr, - const uint8_t *second_vector_ptr, - const uint32_t compared_bytes, - const uint32_t delta_record_max_size, - uint8_t* delta_record_ptr, - uint32_t *const record_size_ptr)); - -/** - * @brief Applies delta record to the contents of memory at destination address - * - * @param[out] memory_region_ptr pointer to a memory region that is updated with a delta - * @param[in] delta_record_ptr pointer to a delta record - * @param[in] memory_region_size destination size - * @param[in] delta_record_size delta record size - * - * @warning Memory region byte size must be multiply of 8. - * @warning Delta record byte size must be multiply of 10. - * @warning Function does not support vectors' overlap. - * @warning Maximal supported offset is 524,280 bytes (0x7FFF8). 
- * - * @return - * - @ref DML_STATUS_OK; - * - @ref DML_STATUS_NULL_POINTER_ERROR; - * - @ref DML_STATUS_DELTA_ALIGN_ERROR in case if memory_region_ptr address is not aligned a - * multiple of 8; - * - @ref DML_STATUS_DELTA_INPUT_SIZE_ERROR in case if memory region size is not multiple of 8; - * - @ref DML_STATUS_DELTA_RECORD_SIZE_ERROR in case if delta record size is not multiple of 10; - * - @ref DML_STATUS_OVERLAPPING_BUFFER_ERROR in case if vectors overlap - * - @ref DML_STATUS_MEMORY_OVERFLOW_ERROR in case if offset is greater than memory region size - */ -DML_CORE_API(dmlc_status_t, apply_delta_record_8u, (uint8_t * memory_region_ptr, - const uint8_t *delta_record_ptr, - const uint32_t memory_region_size, - const uint32_t delta_record_size)); - -#ifdef __cplusplus -} -#endif - -#endif //DML_KERNEL_COMPARE_H__ -/** @} */ diff --git a/sources/cores/include/core_cpu_features.h b/sources/cores/include/core_cpu_features.h deleted file mode 100644 index dc113ab..0000000 --- a/sources/cores/include/core_cpu_features.h +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. - * - */ - -/** - * @date 2/25/2020 - * - * @defgroup core_public_cpu CPU Features - * @ingroup core_public_features - * @{ - * @brief Wrappers of CPU features. 
- * - * @details CPU features group includes the following functions: - * - Functions that get CPU info; - * - Wrappers for cache manipulation instructions; - * - etc. - * - */ - -#include "core_definitions.h" - -#ifndef DML_KERNEL_CPU_FEATURES_H__ -#define DML_KERNEL_CPU_FEATURES_H__ - -#ifdef __cplusplus -extern "C" { -#endif - - - -#ifdef _WIN32 - -#include "intrin.h" - -/** - * @brief Return informantion about CPU based on arguments - * - * @param[out] info - 4-int buffer containing result of CPUID (registers EAX, EBX, ECX, EDX) - * @param[in] info_type - value of EAX register, setting type of resulting information - * @param[in] info_subtype - value of ECX register, setting subtype of resulting information - * - * @return - * Nothing - * - */ -DML_CORE_OWN_INLINE(void, cpuid, (int info[4], int info_type, int info_subtype)) -{ - __cpuidex(info, info_type, info_subtype); -} -#else - -// GCC Intrinsics -#include -#include - -/** - * @brief Return informantion about CPU based on arguments - * - * @param[out] info - 4-int buffer containing result of CPUID (registers EAX, EBX, ECX, EDX) - * @param[in] info_type - value of EAX register, setting type of resulting information - * @param[in] info_subtype - value of ECX register, setting subtype of resulting information - * - * @return - * Nothing - * - */ -DML_CORE_OWN_INLINE(void, cpuid, (int info[4], int info_type, int info_subtype)) -{ - __cpuid_count(info_type, info_subtype, info[0], info[1], info[2], info[3]); -} -#endif - -/** - * @brief Flushes the processor caches at the destination address with сache line invalidation from all cache hierarchy. 
- * - * @param[in] memory_region_ptr memory region address to update from cache - * @param[in] bytes_to_flush memory region size, in bytes, to flush - * - * @return - * - @ref DML_STATUS_OK; - * - @ref DML_STATUS_NULL_POINTER_ERROR; - * - */ -DML_CORE_API(dmlc_status_t, move_cache_to_memory_8u, (const uint8_t *memory_region_ptr, - const uint32_t bytes_to_flush)); - -/** - * @brief Flushes the processor caches at the destination address without cache line invalidation. - * - * @param[in] memory_region_ptr - memory region address to update from cache - * @param[in] bytes_to_flush - memory region size, in bytes, to flush - * - * @return - * - @ref DML_STATUS_OK; - * - @ref DML_STATUS_NULL_POINTER_ERROR; - * - */ -DML_CORE_API(dmlc_status_t, copy_cache_to_memory_8u, (const uint8_t *memory_region_ptr, - const uint32_t bytes_to_flush)); - -/** - * @brief Maximum cache size - */ -static int32_t max_cache_size = -1; - -/** - * @brief Returns max available cache size - * - * @param[out] size - pointer on resulting max cache size - * - * @return - * - @ref DML_STATUS_OK; - * - @ref DML_STATUS_UNKNOWN_CACHE_SIZE_ERROR; - * - */ -DML_CORE_OWN_INLINE(dml_status_t, get_max_cache_size, (int32_t * size)) -{ - if (max_cache_size > 0) - { - *size = max_cache_size; - return DML_STATUS_OK; - } - int32_t tmp_max_size = 0; - int32_t info[4] = {0, 0, 0, 0}; - - for (int32_t n = 0; n < 32; n++) { - dmlc_own_cpuid(info, 4, n); - if (!(info[0] & 0x1f)) - { - break; - } - if ((info[0] & 0x1f) != 2) - { - // Cache Size in Bytes = (Ways + 1) * (Partitions + 1) * (Line_Size + 1) * (Sets + 1) - int32_t tmp_cache_size = info[2] + 1; // Sets = ECX - tmp_cache_size *= (info[1] & 0xfff) + 1; // Line_Size = EBX[11:0] - tmp_cache_size *= ((info[1] >> 12) & 0x3ff) + 1; // Partitions = EBX[21:12] - tmp_cache_size *= ((info[1] >> 22) & 0x3ff) + 1; // Ways = EBX[31:22] - if (tmp_cache_size > tmp_max_size) { tmp_max_size = tmp_cache_size; } - } - } - if (tmp_max_size) - { - max_cache_size = tmp_max_size; - 
*size = tmp_max_size; - return DML_STATUS_OK; - } - else - { - *size = 0; - return DML_STATUS_UNKNOWN_CACHE_SIZE_ERROR; - } -} - -#ifdef __cplusplus -} -#endif - -#endif //DML_KERNEL_CPU_FEATURES_H__ - -/** @} */ diff --git a/sources/cores/include/core_definitions.h b/sources/cores/include/core_definitions.h deleted file mode 100644 index 2b279cd..0000000 --- a/sources/cores/include/core_definitions.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. - * - */ - -/** - * @file - * @date 2/10/2020 - * - * @defgroup core_src Kernel Layer - * @brief Intel(R) Data Mover Library (Intel® DML) Core functions - * - * @details DML Kernels Layer is a software path run on the user CPU, which must implement of the base DML features. - * The kernel is an atomic function without any dependencies on the upper layers of abstraction - * - * @defgroup core_public Public API - * @ingroup core_src - * - * @defgroup core_public_definitions Public Definitions - * @ingroup core_public - * @{ - * - * @brief Contains general definitions for public use in Intel® Data Mover Library (Intel® DML) Cores. 
- * - */ - -#include -#include "dml/dmldefs.h" - -#ifndef KERNEL_DEFINITIONS_H__ -#define KERNEL_DEFINITIONS_H__ - -#ifdef __cplusplus -extern "C" { -#endif - -/* ------ Definitions ------ */ - -#if defined( _WIN32 ) || defined ( _WIN64 ) -#define DML_CORE_STDCALL __stdcall -#define DML_CORE_CDECL __cdecl -#else -#define DML_CORE_STDCALL -#define DML_CORE_CDECL -#endif - -/* ------ Macros ------ */ -/** - * @brief Defines an internal function declared in the file scope - */ -#define DML_CORE_OWN_INLINE(type, name, arg) type static inline dmlc_own_##name arg - -#if !defined( DML_CORE_API ) -#define DML_CORE_API(type, name, arg) type DML_CORE_STDCALL dmlc_##name arg /**< Declaration macros to manipulate function name */ -#endif - -/* ------ Statuses ------ */ - -typedef dml_status_t dmlc_status_t; /**< Redefinition of @ref dml_status_t for core functions */ - -#ifdef __cplusplus -} -#endif - -#endif //KERNEL_DEFINITIONS_H__ - -/** @} */ diff --git a/sources/cores/include/core_hash_functions.h b/sources/cores/include/core_hash_functions.h deleted file mode 100644 index cf038a7..0000000 --- a/sources/cores/include/core_hash_functions.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. 
- * - */ - -/** - * @date 3/5/2020 - * - * @defgroup core_public_hash Hash Features - * @ingroup core_public_features - * @{ - * - * @brief Features to calculate CRC - * - * @details Hash group contains optimized functions that calculate a hash value using - * different hash algorithms. - * - */ - - -#include "core_definitions.h" - -#ifndef DML_KERNEL_HASH_H__ -#define DML_KERNEL_HASH_H__ - -#ifdef __cplusplus -extern "C" { -#endif - - -/** - * @brief Shift value for extracting next byte for the CRC16 function - */ -#define OWN_CRC16_BYTE_SHIFT ( 8u ) - -/** - * @brief Shift value for extracting next byte the for CRC32 function - */ -#define OWN_CRC32_BYTE_SHIFT ( 24u ) - - -/** - * @brief Calculates CRC16 hash/checksum for a signified memory region - * - * @param[in] memory_region_ptr address of memory region to hash - * @param[in] bytes_to_hash memory region size, in bytes, to hash - * @param[in,out] crc_ptr CRC seed / result - * @param[in] polynomial polynomial to XORing - * - * @note No memory alignment is required; - * @note crc_ptr is the initial seed for CRC16 calculation and result storing - * - * @return - * - @ref DML_STATUS_OK; - * - @ref DML_STATUS_NULL_POINTER_ERROR. - */ -DML_CORE_API(dmlc_status_t, calculate_crc_16u, (const uint8_t *const memory_region_ptr, - uint32_t bytes_to_hash, - uint16_t *const crc_ptr, - uint16_t polynomial)); - - -/** - * @brief Calculates CRC32 hash/checksum for a signified memory region - * - * @param[in] memory_region_ptr address of memory region to hash - * @param[in] bytes_to_hash memory region size, in bytes, to hash - * @param[in,out] crc_ptr CRC seed / result - * @param[in] polynomial polynomial to XORing - * - * @note No memory alignment is required; - * @note crc_ptr is the initial seed for CRC32 calculation and result storing - * - * @return - * - @ref DML_STATUS_OK; - * - @ref DML_STATUS_NULL_POINTER_ERROR. 
- */ -DML_CORE_API(dmlc_status_t, calculate_crc_32u, (const uint8_t *const memory_region_ptr, - uint32_t bytes_to_hash, - uint32_t *const crc_ptr, - uint32_t polynomial)); - - -/** - * @brief Calculates CRC32 hash/checksum for a signified memory region with reversed bytes bits - * - * @param[in] memory_region_ptr address of memory region to hash - * @param[in] bytes_to_hash memory region size, in bytes, to hash - * @param[in,out] crc_ptr CRC seed / result - * @param[in] polynomial polynomial to XORing - * - * @note No memory alignment is required; - * @note crc_ptr is the initial seed for CRC32 calculation and result storing - * - * @return - * - @ref DML_STATUS_OK; - * - @ref DML_STATUS_NULL_POINTER_ERROR. - */ -DML_CORE_API(dmlc_status_t, calculate_crc_reflected_32u, (const uint8_t *const memory_region_ptr, - uint32_t bytes_to_hash, - uint32_t *const crc_ptr, - uint32_t polynomial ) ); - - -#ifdef __cplusplus -} -#endif - -#endif // DML_KERNEL_HASH_H__ -/** @} */ diff --git a/sources/cores/include/core_memory.h b/sources/cores/include/core_memory.h deleted file mode 100644 index a2932a9..0000000 --- a/sources/cores/include/core_memory.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. 
- * - */ - -/** - * @date 2/20/2020 - * - * @defgroup core_public_memory Memory Features - * @ingroup core_public_features - * @{ - * - * @brief Features to move or copy memory region or to fill it with a pattern. - * - * @details Memory group contains optimized functions that perform the following tasks: - * - Copying the data from source to destination; - * - Movement the data from one memory region into another; - * - Filling vectors with some value, pattern. - * - */ - - - #include "core_definitions.h" - -#ifndef DML_KERNEL_MEMORY_H__ -#define DML_KERNEL_MEMORY_H__ - -#ifdef __cplusplus -extern "C" { -#endif - -/** - * @brief Copies bytes from vector to another vector. - * - * @param[in] source_ptr pointer to source start - * @param[out] destination_ptr pointer to destination start - * @param[in] bytes_to_process number of bytes to process - * - * @note No memory alignment is required. - * - * @return - * - @ref DML_STATUS_OK; - */ -DML_CORE_API(dmlc_status_t, copy_8u, ( const uint8_t *const source_ptr, - uint8_t *const destination_ptr, - uint32_t bytes_to_process ) ); - - -/** - * @brief Moves bytes from vector to another vector. - * - * @param[in] source_ptr pointer to source start - * @param[out] destination_ptr pointer to destination start - * @param[in] bytes_to_process count of bytes to process - * - * @note No memory alignment is required. - * - * @return - * - @ref DML_STATUS_OK; - */ -DML_CORE_API(dmlc_status_t, move_8u, (const uint8_t *const source_ptr, - uint8_t *const destination_ptr, - uint32_t bytes_to_process)); - - -/** - * @brief Copies bytes from vector to two vectors. - * - * @param[in] source_ptr pointer to source start - * @param[out] first_destination_ptr pointer to first destination start - * @param[out] second_destination_ptr pointer to second destination start - * @param[in] bytes_to_process number of bytes to process - * - * @warning 0:11 bits in destination_first_ptr and destination_second_ptr must be equal. 
- * @warning Function does not support vectors' overlap. - * - * @return - * - @ref DML_STATUS_OK; - */ - DML_CORE_API(dmlc_status_t, dualcast_copy_8u, (const uint8_t *const source_ptr, - uint8_t *const first_destination_ptr, - uint8_t *const second_destination_ptr, - uint32_t bytes_to_process)); - - -/** - * @brief Fills the source vector with the value in the pattern field. - * - * @param[in] pattern 64-bit pattern to fill - * @param[out] memory_region_ptr memory region address - * @param[in] bytes_to_process count of bytes to process - * - * @note No memory alignment is required. - * - * @return - * - @ref DML_STATUS_OK; - * - @ref DML_STATUS_NULL_POINTER_ERROR. - */ -DML_CORE_API(dmlc_status_t, fill_with_pattern_8u, (uint64_t pattern, - uint8_t *const memory_region_ptr, - uint32_t bytes_to_process)); - - -#ifdef __cplusplus -} -#endif - -#endif // DML_KERNEL_MEMORY_H__ -/** @} */ diff --git a/sources/cores/src/avx512/dmlc_compare_8u_k0.cxx b/sources/cores/src/avx512/dmlc_compare_8u_k0.cxx deleted file mode 100644 index bc39a33..0000000 --- a/sources/cores/src/avx512/dmlc_compare_8u_k0.cxx +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. 
- * - */ - - /** - * @brief Contain default implementation of the follow functions: - * - @ref dmlc_own_compare_8u() - * - @ref dmlc_own_compare_with_pattern_8u() - * - * @date 07/06/2021 - * - */ - - -DML_CORE_OWN_INLINE(dmlc_status_t, compare_8u, (const uint8_t* first_vector_ptr, - const uint8_t* second_vector_ptr, - const uint32_t size, - uint32_t* const mismatch_offset_ptr)) -{ - uint32_t i; - __mmask64 msk64 = (__mmask64)0; - for (i = 0u; (i + 64) <= size; i += 64) { - msk64 = _mm512_cmp_epi8_mask( - _mm512_loadu_si512((void const*)(first_vector_ptr + i)), - _mm512_loadu_si512((void const*)(second_vector_ptr + i)), - _MM_CMPINT_NE); - if (msk64) { - *mismatch_offset_ptr = i + (uint32_t)_tzcnt_u64((uint64_t)msk64); - return DML_COMPARE_STATUS_NE; - } - } - { - uint64_t tail = size & 63; - msk64 = ((uint64_t)1 << tail) - (uint64_t)1; - msk64 = _mm512_cmp_epi8_mask( - _mm512_maskz_loadu_epi8(msk64, (void const*)(first_vector_ptr + i)), - _mm512_maskz_loadu_epi8(msk64, (void const*)(second_vector_ptr + i)), - _MM_CMPINT_NE); - if (msk64) { - *mismatch_offset_ptr = i + (uint32_t)_tzcnt_u64((uint64_t)msk64); - return DML_COMPARE_STATUS_NE; - } - } - return DML_COMPARE_STATUS_EQ; -} - -DML_CORE_OWN_INLINE(dmlc_status_t, compare_with_pattern_8u, (const uint8_t* memory_region_ptr, - const pattern_t pattern, - const uint32_t size, - uint32_t* const mismatch_offset_ptr)) -{ -#if (DML_SIZE_PATTERN_T == 64) - const uint32_t pattern_chunk_count = size >> 3; - const uint64_t tail_bytes_count = size & 7; - const uint64_t* const pattern_region_ptr = (uint64_t*)memory_region_ptr; - - __m512i x_pattern = _mm512_set1_epi64(pattern); - uint32_t i; - __mmask8 msk8 = (__mmask8)0; - - for (i = 0u; (i + 8) <= pattern_chunk_count; i += 8) { - msk8 = _mm512_cmp_epi64_mask(_mm512_loadu_si512((void const*)(pattern_region_ptr + i)), - x_pattern, - _MM_CMPINT_NE); - if (msk8) { - *mismatch_offset_ptr = (i + (uint32_t)_tzcnt_u32((uint32_t)msk8)) << 3u; - return DML_COMPARE_STATUS_NE; - } - } 
- { - uint64_t tail = pattern_chunk_count & 7; - if (tail) { - msk8 = (__mmask8)((1 << tail) - 1); - msk8 = _mm512_mask_cmp_epi64_mask(msk8, - _mm512_maskz_loadu_epi64(msk8, (void const*)(pattern_region_ptr + i)), - x_pattern, - _MM_CMPINT_NE); - if (msk8) { - *mismatch_offset_ptr = (i + (uint32_t)_tzcnt_u32((uint32_t)msk8)) << 3u; - return DML_COMPARE_STATUS_NE; - } - } - } - if (tail_bytes_count) { - memory_region_ptr += size - tail_bytes_count; - pattern_t byte_pattern = pattern; - // Compare tail - for (uint32_t i = 0; i < tail_bytes_count; i++) - { - if (memory_region_ptr[i] != (uint8_t)byte_pattern) - { - *mismatch_offset_ptr = (pattern_chunk_count << 3) + i; - - return DML_COMPARE_STATUS_NE; - } - byte_pattern >>= OWN_BYTE_BIT_LENGTH; - } - } - return DML_COMPARE_STATUS_EQ; -#else - //Constants - const uint32_t pattern_size = sizeof(pattern_t); - const uint32_t pattern_chunk_count = size / pattern_size; - const uint64_t tail_bytes_count = size % pattern_size; - const uint64_t* const pattern_region_ptr = (uint64_t*)memory_region_ptr; - - // Compare by pattern chunks - for (uint32_t i = 0u; i < pattern_chunk_count; i++) - { - if (pattern_region_ptr[i] != pattern) - { - *mismatch_offset_ptr = i * pattern_size; - - return DML_COMPARE_STATUS_NE; - } - } - - memory_region_ptr += size - tail_bytes_count; - pattern_t byte_pattern = pattern; - - // Compare tail - for (uint32_t i = 0; i < tail_bytes_count; i++) - { - if (memory_region_ptr[i] != (uint8_t)byte_pattern) - { - *mismatch_offset_ptr = pattern_chunk_count * pattern_size + i; - - return DML_COMPARE_STATUS_NE; - } - - byte_pattern >>= OWN_BYTE_BIT_LENGTH; - } - return DML_COMPARE_STATUS_EQ; -#endif -} - - diff --git a/sources/cores/src/avx512/dmlc_copy_8u_k0.cxx b/sources/cores/src/avx512/dmlc_copy_8u_k0.cxx deleted file mode 100644 index 2d56260..0000000 --- a/sources/cores/src/avx512/dmlc_copy_8u_k0.cxx +++ /dev/null @@ -1,779 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. 
- * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. - * - */ - - /** - * @brief Contain optimized AVX512 implementation of the follow functions: - * - @ref dmlc_copy_8u() - * - @ref dmlc_move_8u() - * - @ref dmlc_dualcast_copy_8u() - * - * @date 5/26/2021 - * - */ - -#include "core_cpu_features.h" - -#if defined(_MSC_VER) -#define OWN_ALIGNED_64_ARRAY(array_declaration) __declspec(align(64u)) array_declaration -#elif defined(__GNUC__) -#define OWN_ALIGNED_64_ARRAY(array_declaration) array_declaration __attribute__((aligned(64u))) -#endif - -DML_CORE_OWN_INLINE(void, px_copy_8u_unrolled, (const uint8_t *src_ptr, uint8_t *dst_ptr, uint32_t length)) { - uint32_t align_dst = 64u - ((uint64_t)dst_ptr & 0x3F); - uint32_t align_src = 64u - ((uint64_t)src_ptr & 0x3F); - - if (align_dst < 64u) { - if (length < align_dst) { - align_dst = length; - for (uint32_t i = 0u; i < align_dst; ++i) { - dst_ptr[i] = src_ptr[i]; - } - return; - } - for (uint32_t i = 0u; i < align_dst; ++i) { - dst_ptr[i] = src_ptr[i]; - } - length -= align_dst; - src_ptr += align_dst; - dst_ptr += align_dst; - } - - if (align_dst == align_src) { - const uint64_t *src_64u_ptr = (uint64_t *)src_ptr; - uint64_t *dst_64u_ptr = (uint64_t *)dst_ptr; - - uint32_t length_64u = length / sizeof(uint64_t); - uint32_t tail_start = length_64u * sizeof(uint64_t); - - while (length_64u > 3u) { - dst_64u_ptr[0] = src_64u_ptr[0]; - dst_64u_ptr[1] = src_64u_ptr[1]; - dst_64u_ptr[2] = src_64u_ptr[2]; - 
dst_64u_ptr[3] = src_64u_ptr[3]; - dst_64u_ptr += 4u; - src_64u_ptr += 4u; - length_64u -= 4u; - } - - for (uint32_t i = 0u; i < length_64u; ++i) { - dst_64u_ptr[i] = src_64u_ptr[i]; - } - - for (uint32_t i = tail_start; i < length; ++i) { - dst_ptr[i] = src_ptr[i]; - } - } - else { - while (length > 7u) { - dst_ptr[0] = src_ptr[0]; - dst_ptr[1] = src_ptr[1]; - dst_ptr[2] = src_ptr[2]; - dst_ptr[3] = src_ptr[3]; - dst_ptr[4] = src_ptr[4]; - dst_ptr[5] = src_ptr[5]; - dst_ptr[6] = src_ptr[6]; - dst_ptr[7] = src_ptr[7]; - - dst_ptr += 8u; - src_ptr += 8u; - length -= 8; - } - - for (uint32_t i = 0u; i < length; ++i) { - dst_ptr[i] = src_ptr[i]; - } - } -} - -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_2u[32]) = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_4u[32]) = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_6u[32]) = {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_8u[32]) = {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_10u[32]) = {5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_12u[32]) = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_14u[32]) = {7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38}; -OWN_ALIGNED_64_ARRAY(static uint16_t 
permutex_idx_16u[32]) = {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_18u[32]) = {9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_20u[32]) = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_22u[32]) = {11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_24u[32]) = {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_26u[32]) = {13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_28u[32]) = {14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_30u[32]) = {15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_32u[32]) = {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_34u[32]) = {17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_36u[32]) = {18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_38u[32]) = {19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_40u[32]) = {20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_42u[32]) = {21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_44u[32]) = {22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_46u[32]) = {23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_48u[32]) = {24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_50u[32]) = {25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_52u[32]) = {26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_54u[32]) = {27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_56u[32]) = {28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59}; 
-OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_58u[32]) = {29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_60u[32]) = {30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61}; -OWN_ALIGNED_64_ARRAY(static uint16_t permutex_idx_62u[32]) = {31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62}; - -static uint16_t *permutex_idx_pptr[31] = { - permutex_idx_2u, permutex_idx_4u, permutex_idx_6u, permutex_idx_8u, permutex_idx_10u, permutex_idx_12u, permutex_idx_14u, permutex_idx_16u, - permutex_idx_18u, permutex_idx_20u, permutex_idx_22u, permutex_idx_24u, permutex_idx_26u, permutex_idx_28u, permutex_idx_30u, permutex_idx_32u, - permutex_idx_34u, permutex_idx_36u, permutex_idx_38u, permutex_idx_40u, permutex_idx_42u, permutex_idx_44u, permutex_idx_46u, permutex_idx_48u, - permutex_idx_50u, permutex_idx_52u, permutex_idx_54u, permutex_idx_56u, permutex_idx_58u, permutex_idx_60u, permutex_idx_62u}; - -DML_CORE_OWN_INLINE(__m512i, mm512_bsrli_epi128, (__m512i a, uint32_t shift)) -{ - switch (shift) - { - case 1: { - return _mm512_bsrli_epi128(a, 1); - } - case 2: { - return _mm512_bsrli_epi128(a, 2); - } - case 3: { - return _mm512_bsrli_epi128(a, 3); - } - case 4: { - return _mm512_bsrli_epi128(a, 4); - } - case 5: { - return _mm512_bsrli_epi128(a, 5); - } - case 6: { - return _mm512_bsrli_epi128(a, 6); - } - case 7: { - return _mm512_bsrli_epi128(a, 7); - } - case 8: { - return _mm512_bsrli_epi128(a, 8); - } - case 9: { - return _mm512_bsrli_epi128(a, 9); - } - case 10: { - return _mm512_bsrli_epi128(a, 10); - } - case 11: { - return _mm512_bsrli_epi128(a, 11); - } - case 12: { - return _mm512_bsrli_epi128(a, 12); - } - case 13: { - return _mm512_bsrli_epi128(a, 13); - } - case 14: { - return 
_mm512_bsrli_epi128(a, 14); - } - case 15: { - return _mm512_bsrli_epi128(a, 15); - } - default: - return _mm512_setzero_si512(); - } -} - -DML_CORE_OWN_INLINE(__m512i, mm512_bslli_epi128, (__m512i a, uint32_t shift)) -{ - switch (shift) - { - case 1: { - return _mm512_bslli_epi128(a, 1); - } - case 2: { - return _mm512_bslli_epi128(a, 2); - } - case 3: { - return _mm512_bslli_epi128(a, 3); - } - case 4: { - return _mm512_bslli_epi128(a, 4); - } - case 5: { - return _mm512_bslli_epi128(a, 5); - } - case 6: { - return _mm512_bslli_epi128(a, 6); - } - case 7: { - return _mm512_bslli_epi128(a, 7); - } - case 8: { - return _mm512_bslli_epi128(a, 8); - } - case 9: { - return _mm512_bslli_epi128(a, 9); - } - case 10: { - return _mm512_bslli_epi128(a, 10); - } - case 11: { - return _mm512_bslli_epi128(a, 11); - } - case 12: { - return _mm512_bslli_epi128(a, 12); - } - case 13: { - return _mm512_bslli_epi128(a, 13); - } - case 14: { - return _mm512_bslli_epi128(a, 14); - } - case 15: { - return _mm512_bslli_epi128(a, 15); - } - default: - return _mm512_setzero_si512(); - } -} - -DML_CORE_OWN_INLINE(__m512i, mm512_alignr_epi8, (__m512i a, __m512i b, uint32_t shift)) -{ - switch (shift) - { - case 0: { - return b; - } - case 4: { - return _mm512_alignr_epi32(a, b, 1); - } - case 8: { - return _mm512_alignr_epi32(a, b, 2); - } - case 12: { - return _mm512_alignr_epi32(a, b, 3); - } - case 16: { - return _mm512_alignr_epi32(a, b, 4); - } - case 20: { - return _mm512_alignr_epi32(a, b, 5); - } - case 24: { - return _mm512_alignr_epi32(a, b, 6); - } - case 28: { - return _mm512_alignr_epi32(a, b, 7); - } - case 32: { - return _mm512_alignr_epi32(a, b, 8); - } - case 36: { - return _mm512_alignr_epi32(a, b, 9); - } - case 40: { - return _mm512_alignr_epi32(a, b, 10); - } - case 44: { - return _mm512_alignr_epi32(a, b, 11); - } - case 48: { - return _mm512_alignr_epi32(a, b, 12); - } - case 52: { - return _mm512_alignr_epi32(a, b, 13); - } - case 56: { - return 
_mm512_alignr_epi32(a, b, 14); - } - case 60: { - return _mm512_alignr_epi32(a, b, 15); - } - default: - return _mm512_setzero_si512(); - } -} - -DML_CORE_OWN_INLINE(void, copy_8u, (const uint8_t *src_ptr, - uint8_t *dst_ptr, - uint32_t length)) -{ - if (length < 1024u) { - dmlc_own_px_copy_8u_unrolled(src_ptr, dst_ptr, length); - return; - } - - if (length > 32000) { - int32_t size = 0u; - dmlc_own_get_max_cache_size(&size); - if ((size > 0) && (length > (uint32_t)size)) { - dmlc_own_px_copy_8u_unrolled(src_ptr, dst_ptr, length); - return; - } - } - - uint32_t align_dst = 64u - ((uint64_t)dst_ptr & 0x3F); - uint32_t align_src = 64u - ((uint64_t)src_ptr & 0x3F); - if (align_dst < 64u) - { - if (length < 4000u) { - dmlc_own_px_copy_8u_unrolled(src_ptr, dst_ptr, length); - return; - } - dmlc_own_px_copy_8u_unrolled(src_ptr, dst_ptr, align_dst); - length -= align_dst; - dst_ptr += align_dst; - src_ptr += align_dst; - uint32_t length512u = length / sizeof(__m512i); - uint32_t tail = length % sizeof(__m512i); - - if (0u != ((align_src - align_dst) & 15u)) - { - uint32_t shift = (align_dst > align_src)? 
(align_dst - align_src) : (64u + align_dst - align_src); - - if (0u == (shift & 3u)) { - src_ptr -= shift; - __mmask64 skip_mask = ~((1llu << shift) - 1u); - __m512i zmm0 = _mm512_maskz_loadu_epi8(skip_mask, (const __m512i *)src_ptr); - src_ptr += 64u; - - while (length512u > 4u) { - __m512i zmm1 = _mm512_load_si512((const __m512i *)src_ptr); - __m512i zmm2 = dmlc_own_mm512_alignr_epi8(zmm1, zmm0, shift); - __m512i zmm3 = _mm512_load_si512((const __m512i *)(src_ptr + 64u)); - __m512i zmm4 = dmlc_own_mm512_alignr_epi8(zmm3, zmm1, shift); - __m512i zmm5 = _mm512_load_si512((const __m512i *)(src_ptr + 128u)); - __m512i zmm6 = dmlc_own_mm512_alignr_epi8(zmm5, zmm3, shift); - zmm0 = _mm512_load_si512((const __m512i *)(src_ptr + 192u)); - __m512i zmm7 = dmlc_own_mm512_alignr_epi8(zmm0, zmm5, shift); - _mm512_store_si512((__m512i *)dst_ptr, zmm2); - _mm512_store_si512((__m512i *)(dst_ptr + 64u), zmm4); - _mm512_store_si512((__m512i *)(dst_ptr + 128u), zmm6); - _mm512_store_si512((__m512i *)(dst_ptr + 192u), zmm7); - src_ptr += 256u; - dst_ptr += 256u; - length512u -= 4u; - } - - src_ptr -= 64u - shift; - } - else if (0u == (shift & 1u)) { - src_ptr -= shift; - __mmask64 skip_mask = ~((1llu << shift) - 1u); - __m512i zmm0 = _mm512_maskz_loadu_epi8(skip_mask, (const __m512i *)src_ptr); - src_ptr += 64u; - - __m512i permutex_idxmm = _mm512_load_si512(permutex_idx_pptr[(shift - 2) / 2]); - - while (length512u > 4u) { - __m512i zmm1 = _mm512_load_si512((const __m512i *)src_ptr); - __m512i zmm2 = _mm512_permutex2var_epi16(zmm0, permutex_idxmm, zmm1); - __m512i zmm3 = _mm512_load_si512((const __m512i *)(src_ptr + 64u)); - __m512i zmm4 = _mm512_permutex2var_epi16(zmm1, permutex_idxmm, zmm3); - __m512i zmm5 = _mm512_load_si512((const __m512i *)(src_ptr + 128u)); - __m512i zmm6 = _mm512_permutex2var_epi16(zmm3, permutex_idxmm, zmm5); - zmm0 = _mm512_load_si512((const __m512i *)(src_ptr + 192u)); - __m512i zmm7 = _mm512_permutex2var_epi16(zmm5, permutex_idxmm, zmm0); - 
_mm512_store_si512((__m512i *)dst_ptr, zmm2); - _mm512_store_si512((__m512i *)(dst_ptr + 64u), zmm4); - _mm512_store_si512((__m512i *)(dst_ptr + 128u), zmm6); - _mm512_store_si512((__m512i *)(dst_ptr + 192u), zmm7); - src_ptr += 256u; - dst_ptr += 256u; - length512u -= 4u; - } - - src_ptr -= 64u - shift; - } - else if (shift < 16u) { - if (length < 16000u) { - dmlc_own_px_copy_8u_unrolled(src_ptr, dst_ptr, length); - return; - } - src_ptr -= shift; - __mmask64 skip_mask = ~((1llu << shift) - 1u); - __m512i zmm0 = _mm512_maskz_loadu_epi8(skip_mask, (const __m512i *)src_ptr); - src_ptr += 64u; - - __m512i permutex_idxmm_higher = _mm512_load_si512(permutex_idx_pptr[(shift - 1) / 2]); - - while (length512u > 4u) { - __m512i zmm1 = _mm512_load_si512((const __m512i *)src_ptr); - __m512i zmm2_lower = dmlc_own_mm512_bsrli_epi128(zmm0, shift); - __m512i zmm2_higher = _mm512_permutex2var_epi16(zmm0, permutex_idxmm_higher, zmm1); - zmm2_higher = dmlc_own_mm512_bslli_epi128(zmm2_higher, 1u); - zmm2_higher = _mm512_mask_mov_epi8(zmm2_higher, 0x0001000100010001, zmm2_lower); - __m512i zmm3 = _mm512_load_si512((const __m512i *)(src_ptr + 64u)); - __m512i zmm4_lower = dmlc_own_mm512_bsrli_epi128(zmm1, shift); - __m512i zmm4_higher = _mm512_permutex2var_epi16(zmm1, permutex_idxmm_higher, zmm3); - zmm4_higher = dmlc_own_mm512_bslli_epi128(zmm4_higher, 1u); - zmm4_higher = _mm512_mask_mov_epi8(zmm4_higher, 0x0001000100010001, zmm4_lower); - __m512i zmm5 = _mm512_load_si512((const __m512i *)(src_ptr + 128u)); - __m512i zmm6_lower = dmlc_own_mm512_bsrli_epi128(zmm3, shift); - __m512i zmm6_higher = _mm512_permutex2var_epi16(zmm3, permutex_idxmm_higher, zmm5); - zmm6_higher = dmlc_own_mm512_bslli_epi128(zmm6_higher, 1u); - zmm6_higher = _mm512_mask_mov_epi8(zmm6_higher, 0x0001000100010001, zmm6_lower); - zmm0 = _mm512_load_si512((const __m512i *)(src_ptr + 192u)); - __m512i zmm7_lower = dmlc_own_mm512_bsrli_epi128(zmm5, shift); - __m512i zmm7_higher = _mm512_permutex2var_epi16(zmm5, 
permutex_idxmm_higher, zmm0); - zmm7_higher = dmlc_own_mm512_bslli_epi128(zmm7_higher, 1u); - zmm7_higher = _mm512_mask_mov_epi8(zmm7_higher, 0x0001000100010001, zmm7_lower); - _mm512_store_si512((__m512i *)dst_ptr, zmm2_higher); - _mm512_store_si512((__m512i *)(dst_ptr + 64u), zmm4_higher); - _mm512_store_si512((__m512i *)(dst_ptr + 128u), zmm6_higher); - _mm512_store_si512((__m512i *)(dst_ptr + 192u), zmm7_higher); - src_ptr += 256u; - dst_ptr += 256u; - length512u -= 4u; - } - - src_ptr -= 64u - shift; - } - else if (shift > 48u) { - if (length < 16000u) { - dmlc_own_px_copy_8u_unrolled(src_ptr, dst_ptr, length); - return; - } - src_ptr -= shift; - __mmask64 skip_mask = ~((1llu << shift) - 1u); - __m512i zmm0 = _mm512_maskz_loadu_epi8(skip_mask, (const __m512i *)src_ptr); - src_ptr += 64u; - - __m512i permutex_idxmm_lower = _mm512_load_si512(permutex_idx_pptr[(shift - 3) / 2]); - uint32_t shift_higher = 64u - shift; - - while (length512u > 4u) { - __m512i zmm1 = _mm512_load_si512((const __m512i *)src_ptr); - __m512i zmm2_lower = _mm512_permutex2var_epi16(zmm0, permutex_idxmm_lower, zmm1); - zmm2_lower = dmlc_own_mm512_bsrli_epi128(zmm2_lower, 1u); - __m512i zmm2_higher = dmlc_own_mm512_bslli_epi128(zmm1, shift_higher); - zmm2_higher = _mm512_mask_mov_epi8(zmm2_higher, 0x7FFF7FFF7FFF7FFF, zmm2_lower); - __m512i zmm3 = _mm512_load_si512((const __m512i *)(src_ptr + 64u)); - __m512i zmm4_lower = _mm512_permutex2var_epi16(zmm1, permutex_idxmm_lower, zmm3); - zmm4_lower = dmlc_own_mm512_bsrli_epi128(zmm4_lower, 1u); - __m512i zmm4_higher = dmlc_own_mm512_bslli_epi128(zmm3, shift_higher); - zmm4_higher = _mm512_mask_mov_epi8(zmm4_higher, 0x7FFF7FFF7FFF7FFF, zmm4_lower); - __m512i zmm5 = _mm512_load_si512((const __m512i *)(src_ptr + 128u)); - __m512i zmm6_lower = _mm512_permutex2var_epi16(zmm3, permutex_idxmm_lower, zmm5); - zmm6_lower = dmlc_own_mm512_bsrli_epi128(zmm6_lower, 1u); - __m512i zmm6_higher = dmlc_own_mm512_bslli_epi128(zmm5, shift_higher); - zmm6_higher = 
_mm512_mask_mov_epi8(zmm6_higher, 0x7FFF7FFF7FFF7FFF, zmm6_lower); - zmm0 = _mm512_load_si512((const __m512i *)(src_ptr + 192u)); - __m512i zmm7_lower = _mm512_permutex2var_epi16(zmm5, permutex_idxmm_lower, zmm0); - zmm7_lower = dmlc_own_mm512_bsrli_epi128(zmm7_lower, 1u); - __m512i zmm7_higher = dmlc_own_mm512_bslli_epi128(zmm0, shift_higher); - zmm7_higher = _mm512_mask_mov_epi8(zmm7_higher, 0x7FFF7FFF7FFF7FFF, zmm7_lower); - _mm512_store_si512((__m512i *)dst_ptr, zmm2_higher); - _mm512_store_si512((__m512i *)(dst_ptr + 64u), zmm4_higher); - _mm512_store_si512((__m512i *)(dst_ptr + 128u), zmm6_higher); - _mm512_store_si512((__m512i *)(dst_ptr + 192u), zmm7_higher); - src_ptr += 256u; - dst_ptr += 256u; - length512u -= 4u; - } - - src_ptr -= 64u - shift; - } - else { - dmlc_own_px_copy_8u_unrolled(src_ptr, dst_ptr, length); - return; - } - } - else - { - while (length512u > 3u) { - __m512i zmm0 = _mm512_loadu_si512((const __m512i *)src_ptr); - __m512i zmm1 = _mm512_loadu_si512((const __m512i *)(src_ptr + 64u)); - __m512i zmm2 = _mm512_loadu_si512((const __m512i *)(src_ptr + 128u)); - __m512i zmm3 = _mm512_loadu_si512((const __m512i *)(src_ptr + 192u)); - _mm512_store_si512((__m512i *)dst_ptr, zmm0); - _mm512_store_si512((__m512i *)(dst_ptr + 64u), zmm1); - _mm512_store_si512((__m512i *)(dst_ptr + 128u), zmm2); - _mm512_store_si512((__m512i *)(dst_ptr + 192u), zmm3); - src_ptr += 256u; - dst_ptr += 256u; - length512u -= 4; - } - } - while (length512u > 0u) { - __m512i zmm0 = _mm512_loadu_si512((const __m512i *)src_ptr); - _mm512_store_si512((__m512i *)dst_ptr, zmm0); - src_ptr += 64u; - dst_ptr += 64u; - --length512u; - } - - dmlc_own_px_copy_8u_unrolled(src_ptr, dst_ptr, tail); - - return; - } - - uint32_t length512u = length / sizeof(__m512i); - uint32_t tail = length % sizeof(__m512i); - - if (align_src < 64u) - { - if (length < 32000u) { - dmlc_own_px_copy_8u_unrolled(src_ptr, dst_ptr, length); - return; - } - uint32_t shift = 64 - align_src; - - if (0u == 
(shift & 3u)) { - src_ptr -= shift; - __mmask64 skip_mask = ~((1llu << shift) - 1u); - __m512i zmm0 = _mm512_maskz_loadu_epi8(skip_mask, (const __m512i *)src_ptr); - src_ptr += 64u; - - while (length512u > 4u) { - __m512i zmm1 = _mm512_load_si512((const __m512i *)src_ptr); - __m512i zmm2 = dmlc_own_mm512_alignr_epi8(zmm1, zmm0, shift); - __m512i zmm3 = _mm512_load_si512((const __m512i *)(src_ptr + 64u)); - __m512i zmm4 = dmlc_own_mm512_alignr_epi8(zmm3, zmm1, shift); - __m512i zmm5 = _mm512_load_si512((const __m512i *)(src_ptr + 128u)); - __m512i zmm6 = dmlc_own_mm512_alignr_epi8(zmm5, zmm3, shift); - zmm0 = _mm512_load_si512((const __m512i *)(src_ptr + 192u)); - __m512i zmm7 = dmlc_own_mm512_alignr_epi8(zmm0, zmm5, shift); - _mm512_store_si512((__m512i *)dst_ptr, zmm2); - _mm512_store_si512((__m512i *)(dst_ptr + 64u), zmm4); - _mm512_store_si512((__m512i *)(dst_ptr + 128u), zmm6); - _mm512_store_si512((__m512i *)(dst_ptr + 192u), zmm7); - src_ptr += 256u; - dst_ptr += 256u; - length512u -= 4u; - } - - src_ptr -= 64u - shift; - } - else if (0u == (shift & 1u)) { - src_ptr -= shift; - __mmask64 skip_mask = ~((1llu << shift) - 1u); - __m512i zmm0 = _mm512_maskz_loadu_epi8(skip_mask, (const __m512i *)src_ptr); - src_ptr += 64u; - - __m512i permutex_idxmm = _mm512_load_si512(permutex_idx_pptr[(shift - 2) / 2]); - - while (length512u > 4u) { - __m512i zmm1 = _mm512_load_si512((const __m512i *)src_ptr); - __m512i zmm2 = _mm512_permutex2var_epi16(zmm0, permutex_idxmm, zmm1); - __m512i zmm3 = _mm512_load_si512((const __m512i *)(src_ptr + 64u)); - __m512i zmm4 = _mm512_permutex2var_epi16(zmm1, permutex_idxmm, zmm3); - __m512i zmm5 = _mm512_load_si512((const __m512i *)(src_ptr + 128u)); - __m512i zmm6 = _mm512_permutex2var_epi16(zmm3, permutex_idxmm, zmm5); - zmm0 = _mm512_load_si512((const __m512i *)(src_ptr + 192u)); - __m512i zmm7 = _mm512_permutex2var_epi16(zmm5, permutex_idxmm, zmm0); - _mm512_store_si512((__m512i *)dst_ptr, zmm2); - _mm512_store_si512((__m512i 
*)(dst_ptr + 64u), zmm4); - _mm512_store_si512((__m512i *)(dst_ptr + 128u), zmm6); - _mm512_store_si512((__m512i *)(dst_ptr + 192u), zmm7); - src_ptr += 256u; - dst_ptr += 256u; - length512u -= 4u; - } - - src_ptr -= 64u - shift; - } - else if (shift < 16u) { - src_ptr -= shift; - __mmask64 skip_mask = ~((1llu << shift) - 1u); - __m512i zmm0 = _mm512_maskz_loadu_epi8(skip_mask, (const __m512i *)src_ptr); - src_ptr += 64u; - - __m512i permutex_idxmm_higher = _mm512_load_si512(permutex_idx_pptr[(shift - 1) / 2]); - - while (length512u > 4u) { - __m512i zmm1 = _mm512_load_si512((const __m512i *)src_ptr); - __m512i zmm2_lower = dmlc_own_mm512_bsrli_epi128(zmm0, shift); - __m512i zmm2_higher = _mm512_permutex2var_epi16(zmm0, permutex_idxmm_higher, zmm1); - zmm2_higher = dmlc_own_mm512_bslli_epi128(zmm2_higher, 1u); - zmm2_higher = _mm512_mask_mov_epi8(zmm2_higher, 0x0001000100010001, zmm2_lower); - __m512i zmm3 = _mm512_load_si512((const __m512i *)(src_ptr + 64u)); - __m512i zmm4_lower = dmlc_own_mm512_bsrli_epi128(zmm1, shift); - __m512i zmm4_higher = _mm512_permutex2var_epi16(zmm1, permutex_idxmm_higher, zmm3); - zmm4_higher = dmlc_own_mm512_bslli_epi128(zmm4_higher, 1u); - zmm4_higher = _mm512_mask_mov_epi8(zmm4_higher, 0x0001000100010001, zmm4_lower); - __m512i zmm5 = _mm512_load_si512((const __m512i *)(src_ptr + 128u)); - __m512i zmm6_lower = dmlc_own_mm512_bsrli_epi128(zmm3, shift); - __m512i zmm6_higher = _mm512_permutex2var_epi16(zmm3, permutex_idxmm_higher, zmm5); - zmm6_higher = dmlc_own_mm512_bslli_epi128(zmm6_higher, 1u); - zmm6_higher = _mm512_mask_mov_epi8(zmm6_higher, 0x0001000100010001, zmm6_lower); - zmm0 = _mm512_load_si512((const __m512i *)(src_ptr + 192u)); - __m512i zmm7_lower = dmlc_own_mm512_bsrli_epi128(zmm5, shift); - __m512i zmm7_higher = _mm512_permutex2var_epi16(zmm5, permutex_idxmm_higher, zmm0); - zmm7_higher = dmlc_own_mm512_bslli_epi128(zmm7_higher, 1u); - zmm7_higher = _mm512_mask_mov_epi8(zmm7_higher, 0x0001000100010001, zmm7_lower); - 
_mm512_store_si512((__m512i *)dst_ptr, zmm2_higher); - _mm512_store_si512((__m512i *)(dst_ptr + 64u), zmm4_higher); - _mm512_store_si512((__m512i *)(dst_ptr + 128u), zmm6_higher); - _mm512_store_si512((__m512i *)(dst_ptr + 192u), zmm7_higher); - src_ptr += 256u; - dst_ptr += 256u; - length512u -= 4u; - } - - src_ptr -= 64u - shift; - } - else if (shift > 48u) { - src_ptr -= shift; - __mmask64 skip_mask = ~((1llu << shift) - 1u); - __m512i zmm0 = _mm512_maskz_loadu_epi8(skip_mask, (const __m512i *)src_ptr); - src_ptr += 64u; - - __m512i permutex_idxmm_lower = _mm512_load_si512(permutex_idx_pptr[(shift - 3) / 2]); - uint32_t shift_higher = 64u - shift; - - while (length512u > 4u) { - __m512i zmm1 = _mm512_load_si512((const __m512i *)src_ptr); - __m512i zmm2_lower = _mm512_permutex2var_epi16(zmm0, permutex_idxmm_lower, zmm1); - zmm2_lower = dmlc_own_mm512_bsrli_epi128(zmm2_lower, 1u); - __m512i zmm2_higher = dmlc_own_mm512_bslli_epi128(zmm1, shift_higher); - zmm2_higher = _mm512_mask_mov_epi8(zmm2_higher, 0x7FFF7FFF7FFF7FFF, zmm2_lower); - __m512i zmm3 = _mm512_load_si512((const __m512i *)(src_ptr + 64u)); - __m512i zmm4_lower = _mm512_permutex2var_epi16(zmm1, permutex_idxmm_lower, zmm3); - zmm4_lower = dmlc_own_mm512_bsrli_epi128(zmm4_lower, 1u); - __m512i zmm4_higher = dmlc_own_mm512_bslli_epi128(zmm3, shift_higher); - zmm4_higher = _mm512_mask_mov_epi8(zmm4_higher, 0x7FFF7FFF7FFF7FFF, zmm4_lower); - __m512i zmm5 = _mm512_load_si512((const __m512i *)(src_ptr + 128u)); - __m512i zmm6_lower = _mm512_permutex2var_epi16(zmm3, permutex_idxmm_lower, zmm5); - zmm6_lower = dmlc_own_mm512_bsrli_epi128(zmm6_lower, 1u); - __m512i zmm6_higher = dmlc_own_mm512_bslli_epi128(zmm5, shift_higher); - zmm6_higher = _mm512_mask_mov_epi8(zmm6_higher, 0x7FFF7FFF7FFF7FFF, zmm6_lower); - zmm0 = _mm512_load_si512((const __m512i *)(src_ptr + 192u)); - __m512i zmm7_lower = _mm512_permutex2var_epi16(zmm5, permutex_idxmm_lower, zmm0); - zmm7_lower = dmlc_own_mm512_bsrli_epi128(zmm7_lower, 1u); 
- __m512i zmm7_higher = dmlc_own_mm512_bslli_epi128(zmm0, shift_higher); - zmm7_higher = _mm512_mask_mov_epi8(zmm7_higher, 0x7FFF7FFF7FFF7FFF, zmm7_lower); - _mm512_store_si512((__m512i *)dst_ptr, zmm2_higher); - _mm512_store_si512((__m512i *)(dst_ptr + 64u), zmm4_higher); - _mm512_store_si512((__m512i *)(dst_ptr + 128u), zmm6_higher); - _mm512_store_si512((__m512i *)(dst_ptr + 192u), zmm7_higher); - src_ptr += 256u; - dst_ptr += 256u; - length512u -= 4u; - } - - src_ptr -= 64u - shift; - } - else { - dmlc_own_px_copy_8u_unrolled(src_ptr, dst_ptr, length); - return; - } - } - else - { - if ((12000 < length) && (length < 32000)) { - dmlc_own_px_copy_8u_unrolled(src_ptr, dst_ptr, length); - return; - } - while (length512u > 3u) { - __m512i zmm0 = _mm512_load_si512((const __m512i *)src_ptr); - __m512i zmm1 = _mm512_load_si512((const __m512i *)(src_ptr + 64u)); - __m512i zmm2 = _mm512_load_si512((const __m512i *)(src_ptr + 128u)); - __m512i zmm3 = _mm512_load_si512((const __m512i *)(src_ptr + 192u)); - _mm512_store_si512((__m512i *)dst_ptr, zmm0); - _mm512_store_si512((__m512i *)(dst_ptr + 64u), zmm1); - _mm512_store_si512((__m512i *)(dst_ptr + 128u), zmm2); - _mm512_store_si512((__m512i *)(dst_ptr + 192u), zmm3); - src_ptr += 256u; - dst_ptr += 256u; - length512u -= 4; - } - } - - while (length512u > 0u) { - __m512i zmm0 = _mm512_loadu_si512((const __m512i *)src_ptr); - _mm512_store_si512((__m512i *)dst_ptr, zmm0); - src_ptr += 64u; - dst_ptr += 64u; - --length512u; - } - - dmlc_own_px_copy_8u_unrolled(src_ptr, dst_ptr, tail); -} - -DML_CORE_OWN_INLINE(void, px_copy_8u_not_unrolled, (const uint8_t *src_ptr, uint8_t *dst_ptr, uint32_t length)) { - const uint64_t *src_64u_ptr = (uint64_t *)src_ptr; - uint64_t *dst_64u_ptr = (uint64_t *)dst_ptr; - - uint32_t length_64u = length / sizeof(uint64_t); - uint32_t tail_start = length_64u * sizeof(uint64_t); - - for (uint32_t i = 0u; i < length_64u; ++i) { - dst_64u_ptr[i] = src_64u_ptr[i]; - } - - for (uint32_t i = tail_start; 
i < length; ++i) { - dst_ptr[i] = src_ptr[i]; - } -} - -DML_CORE_OWN_INLINE(void, move_8u, (const uint8_t *const source_ptr, - uint8_t *const destination_ptr, - uint32_t bytes_to_process)) -{ - // Current position in source vector - const uint8_t *source_current_ptr = (const uint8_t *)(source_ptr + bytes_to_process); - - // Current position in destination vector - uint8_t *destination_current_ptr = (uint8_t *)(destination_ptr + bytes_to_process); - - while (0u < bytes_to_process) - { - // Shift position in destination vector - destination_current_ptr--; - - // Shift position in source vector - source_current_ptr--; - - // Copy 1 byte - (*destination_current_ptr) = (*source_current_ptr); - - // Decrease bytes counter - bytes_to_process -= sizeof(uint8_t); - } -} - - -DML_CORE_OWN_INLINE(void, dualcast_copy_8u, (const uint8_t *const source_ptr, - uint8_t *const first_destination_ptr, - uint8_t *const second_destination_ptr, - uint32_t bytes_to_process)) -{ - // Current position in source vector 64u - const uint8_t *source_current_ptr = (const uint8_t *)source_ptr; - - // Current position in first destination vector 64u - uint8_t *first_destination_current_ptr = (uint8_t *)first_destination_ptr; - - // Current position in second destination vector 64u - uint8_t *second_destination_current_ptr = (uint8_t *)second_destination_ptr; - - while (0 < bytes_to_process) - { - // Copy 1 byte to first destination vector - (*first_destination_current_ptr) = (*source_current_ptr); - - // Copy 1 byte to second destination vector - (*second_destination_current_ptr) = (*source_current_ptr); - - // Shift position in first destination vector - first_destination_current_ptr++; - - // Shift position in second destination vector - second_destination_current_ptr++; - - // Shift position in source vector - source_current_ptr++; - - // Decrease bytes counter - bytes_to_process -= sizeof(uint8_t); - } -} diff --git a/sources/cores/src/avx512/dmlc_crc_16u_32u_k0.cxx 
b/sources/cores/src/avx512/dmlc_crc_16u_32u_k0.cxx deleted file mode 100644 index 17cc6d8..0000000 --- a/sources/cores/src/avx512/dmlc_crc_16u_32u_k0.cxx +++ /dev/null @@ -1,618 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. - * - */ - -/** - * @brief Contain implementation of the follow functions: - * - @ref dmlc_calculate_crc_32u() - * - * @date 7/20/2021 - * - */ - -#define DML_DISABLE_OPTIMIZATION_ - -/** -* @todo -*/ -static inline uint32_t getCRCSize(uint64_t poly) -{ - uint32_t crcSize; - crcSize = 63 - (uint32_t)_lzcnt_u64(poly); - return crcSize; -} - -/** -* @todo -*/ - -static void own_CRC_8u_opt_k0(const uint8_t* src_ptr, uint32_t init_crc, int len0, int crc_size, uint32_t* koeff_ptr, uint32_t* crc_ptr); - -/** -* @todo -*/ -static void own_CRC_8u_k0(const uint8_t* src_ptr, uint32_t len, uint64_t poly, const uint8_t optPoly[128], uint32_t init, uint32_t* crc_ptr) -{ - int crc_size = getCRCSize(poly); - own_CRC_8u_opt_k0(src_ptr, init, len, crc_size, (uint32_t*)optPoly, crc_ptr); -} - -/** -* @todo -*/ -static void poly1x64_32_div(uint64_t poly, uint64_t* quit_ptr, uint32_t* tail_ptr) -{ - int i, j; - uint64_t tail = 0; - uint64_t quot = 0; - uint8_t byte = 0x01; - for (i = 0; i < 9; i++) { - uint8_t bit; - uint64_t hbit; - for (j = 0; j < 8; j++) { - bit = (byte & 0x80) >> 7; - byte <<= 1; - hbit = tail & 0x80000000; - tail <<= 1; - tail |= bit; - quot <<= 1; - if (hbit) { - tail = 
tail ^ poly; - quot |= 1; - } - tail = tail & 0xffffffff; - } - } - *quit_ptr = quot; - *tail_ptr = (uint32_t)tail; - return; -} - -/** -* @todo -*/ -static inline void own_gen_crc_opt_poly_8u(uint64_t poly, uint8_t optPoly[128]) -{ - uint64_t u; - uint32_t i, k, crc_size; - uint32_t t; - - crc_size = getCRCSize(poly); - uint32_t* opt = (uint32_t*)optPoly; - *(uint64_t*)opt = poly; /*copy poly*/ - uint64_t poly32 = poly << (32 - crc_size); - poly1x64_32_div(poly32, &u, &t); /*for 1^64 and U*/ - *(uint64_t*)(opt + 2) = u; - int bits[] = {64, 96, 160, 224, 288, 352, 416, 480, 544, 608, 672, 736, 800, 864, 928, 992, 1056, 2016, 2080}; - uint32_t tail = 0; - uint32_t poly_32 = (uint32_t)poly; - int j; - - k = bits[0] + 8; - tail = poly_32; - for (j = 40; (uint32_t)j < k; j++) { - uint32_t mask; - mask = (tail & 0x80000000) ? poly_32 : 0; - tail += tail; - tail ^= mask; - } - opt[4 + 0] = (uint32_t)tail; - - for (i = 1; i < ((sizeof(bits) / sizeof(bits[0])) - 2); i++) { - k = bits[i] + 8; - for (; (uint32_t)j < k; j++) { - uint32_t mask; - mask = (tail & 0x80000000) ? 
poly_32 : 0; - tail += tail; - tail ^= mask; - } - opt[4 + i] = (uint32_t)tail; - } -} - -DML_CORE_OWN_INLINE(dmlc_status_t, calculate_crc_32u, (const uint8_t* const memory_region_ptr, - uint32_t bytes_to_hash, - uint32_t* const crc_ptr, - uint32_t polynomial)) -{ - uint64_t poly = (uint64_t)polynomial | ((uint64_t)1u << (uint64_t)32u); - uint8_t opt_poly_ptr[128]; - - own_gen_crc_opt_poly_8u(poly, opt_poly_ptr); - own_CRC_8u_k0(memory_region_ptr, bytes_to_hash, poly, opt_poly_ptr, *crc_ptr, crc_ptr); - return DML_STATUS_OK; -} - -#if defined(_MSC_VER) -#pragma optimize("", off) -#pragma optimize("O3", on) -#endif - -/** -* @todo -*/ -#define _MM_XOR_PS(A,B) _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(A),_mm_castsi128_ps(B))) -/** -* @todo -*/ -#define arg1_low32 ecx -/** -* @todo -*/ -static void own_CRC_8u_opt_k0(const uint8_t* src_ptr, uint32_t init_crc, int len0, int crc_size, uint32_t* koeff, uint32_t* crc_ptr) -{ - uint64_t pshufb_shf_table[] = { - 0x8786858483828100, 0x8f8e8d8c8b8a8988, - 0x0706050403020100, 0x000e0d0c0b0a0908}; - - int len = len0; - uint8_t ttt[128]; - uint8_t* r11 = ttt; - uint8_t* ptr; - - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm12, xmm13; -#ifndef DML_DISABLE_OPTIMIZATION_ - __m128i xmm11; -#endif // DML_DISABLE_OPTIMIZATION_ - - int eax, ecx, r9; - __m128i ENDIA_SHUF_MASK = _mm_set_epi8(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); - __m128i mask2 = _mm_set_epi64x(0x00000000FFFFFFFF, 0xFFFFFFFFFFFFFFFF); - __m128i mask1 = _mm_set_epi64x(0x8080808080808080, 0x8080808080808080); - - uint64_t q = *(uint64_t*)(koeff + 0); - q <<= (32 - crc_size); - uint64_t u = *(uint64_t*)(koeff + 2); - uint64_t k_64 = ((uint64_t)koeff[4]) << 32; - uint64_t k_96 = ((uint64_t)koeff[5]) << 32; - uint64_t k_160 = ((uint64_t)koeff[6]) << 32; - uint64_t k_224 = ((uint64_t)koeff[7]) << 32; - uint64_t k_288 = ((uint64_t)koeff[8]) << 32; - uint64_t k_352 = 
((uint64_t)koeff[9]) << 32; - uint64_t k_416 = ((uint64_t)koeff[10]) << 32; - uint64_t k_480 = ((uint64_t)koeff[11]) << 32; - uint64_t k_544 = ((uint64_t)koeff[12]) << 32; - uint64_t k_608 = ((uint64_t)koeff[13]) << 32; - uint64_t k_672 = ((uint64_t)koeff[14]) << 32; - uint64_t k_736 = ((uint64_t)koeff[15]) << 32; - uint64_t k_800 = ((uint64_t)koeff[16]) << 32; - uint64_t k_864 = ((uint64_t)koeff[17]) << 32; - uint64_t k_928 = ((uint64_t)koeff[18]) << 32; - uint64_t k_992 = ((uint64_t)koeff[19]) << 32; - uint64_t k_1056 = ((uint64_t)koeff[20]) << 32; - - - ecx = init_crc; - //crc16_t10dif_01: - ecx = ecx << (32 - crc_size); -#ifndef DML_DISABLE_OPTIMIZATION_ - if (len < 256) { - goto _less_than_256; - } -#endif // DML_DISABLE_OPTIMIZATION_ - //; load the initial crc value - xmm10 = _mm_cvtsi32_si128(arg1_low32); //movd xmm10, arg1_low32; initial crc - //; crc value does not need to be byte - reflected, but it needs to be moved to the high part of the register. - //; because data will be byte - reflected and will align with initial crc at correct place. 
- xmm10 = _mm_slli_si128(xmm10, 12); - //; receive the initial 128B data, xor the initial crc value - xmm0 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 0)); - xmm1 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 1)); - xmm2 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 2)); - xmm3 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 3)); - xmm4 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 4)); - xmm5 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 5)); - xmm6 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 6)); - xmm7 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 7)); - xmm0 = _mm_shuffle_epi8(xmm0, ENDIA_SHUF_MASK); - //; XOR the initial_crc value - xmm0 = _mm_xor_si128(xmm0, xmm10); - xmm1 = _mm_shuffle_epi8(xmm1, ENDIA_SHUF_MASK); - xmm2 = _mm_shuffle_epi8(xmm2, ENDIA_SHUF_MASK); - xmm3 = _mm_shuffle_epi8(xmm3, ENDIA_SHUF_MASK); - xmm4 = _mm_shuffle_epi8(xmm4, ENDIA_SHUF_MASK); - xmm5 = _mm_shuffle_epi8(xmm5, ENDIA_SHUF_MASK); - xmm6 = _mm_shuffle_epi8(xmm6, ENDIA_SHUF_MASK); - xmm7 = _mm_shuffle_epi8(xmm7, ENDIA_SHUF_MASK); - xmm10 = _mm_set_epi64x(k_1056/*rk4*/, k_992/*rk3*/); - //; imm value of pclmulqdq instruction will determine which constant to use - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - //; we subtract 256 instead of 128 to save one instruction from the loop - len -= 256; - // - //; at this section of the code, there is 128 * x + y(0 <= y < 128) bytes of buffer.The _fold_128_B_loop - //; loop will fold 128B at a time until we have 128 + y Bytes of buffer - // - // - //; fold 128B at a time.This section of the code folds 8 xmm registers in parallel - _fold_128_B_loop: - // - //; update the buffer pointer - src_ptr += 128; - xmm9 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 0)); - xmm12 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 1)); - xmm9 = _mm_shuffle_epi8(xmm9, ENDIA_SHUF_MASK); - xmm12 = _mm_shuffle_epi8(xmm12, ENDIA_SHUF_MASK); - xmm8 = xmm0; - xmm13 = xmm1; - xmm0 = 
_mm_clmulepi64_si128(xmm0, xmm10, 0x0); - xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x11); - xmm1 = _mm_clmulepi64_si128(xmm1, xmm10, 0x0); - xmm13 = _mm_clmulepi64_si128(xmm13, xmm10, 0x11); - xmm0 = _mm_xor_si128(xmm0, xmm9); - xmm0 = _MM_XOR_PS(xmm0, xmm8); - xmm1 = _mm_xor_si128(xmm1, xmm12); - xmm1 = _MM_XOR_PS(xmm1, xmm13); - xmm9 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 2)); - xmm12 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 3)); - xmm9 = _mm_shuffle_epi8(xmm9, ENDIA_SHUF_MASK); - xmm12 = _mm_shuffle_epi8(xmm12, ENDIA_SHUF_MASK); - xmm8 = xmm2; - xmm13 = xmm3; - xmm2 = _mm_clmulepi64_si128(xmm2, xmm10, 0x0); - xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x11); - xmm3 = _mm_clmulepi64_si128(xmm3, xmm10, 0x0); - xmm13 = _mm_clmulepi64_si128(xmm13, xmm10, 0x11); - xmm2 = _mm_xor_si128(xmm2, xmm9); - xmm2 = _MM_XOR_PS(xmm2, xmm8); - xmm3 = _mm_xor_si128(xmm3, xmm12); - xmm3 = _MM_XOR_PS(xmm3, xmm13); - - xmm9 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 4)); - xmm12 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 5)); - xmm9 = _mm_shuffle_epi8(xmm9, ENDIA_SHUF_MASK); - xmm12 = _mm_shuffle_epi8(xmm12, ENDIA_SHUF_MASK); - xmm8 = xmm4; - xmm13 = xmm5; - xmm4 = _mm_clmulepi64_si128(xmm4, xmm10, 0x0); - xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x11); - xmm5 = _mm_clmulepi64_si128(xmm5, xmm10, 0x0); - xmm13 = _mm_clmulepi64_si128(xmm13, xmm10, 0x11); - xmm4 = _mm_xor_si128(xmm4, xmm9); - xmm4 = _MM_XOR_PS(xmm4, xmm8); - xmm5 = _mm_xor_si128(xmm5, xmm12); - xmm5 = _MM_XOR_PS(xmm5, xmm13); - - xmm9 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 6)); - xmm12 = _mm_loadu_si128((const __m128i*)(src_ptr + 16 * 7)); - xmm9 = _mm_shuffle_epi8(xmm9, ENDIA_SHUF_MASK); - xmm12 = _mm_shuffle_epi8(xmm12, ENDIA_SHUF_MASK); - xmm8 = xmm6; - xmm13 = xmm7; - xmm6 = _mm_clmulepi64_si128(xmm6, xmm10, 0x0); - xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x11); - xmm7 = _mm_clmulepi64_si128(xmm7, xmm10, 0x0); - xmm13 = _mm_clmulepi64_si128(xmm13, xmm10, 0x11); - xmm6 = 
_mm_xor_si128(xmm6, xmm9); - xmm6 = _MM_XOR_PS(xmm6, xmm8); - xmm7 = _mm_xor_si128(xmm7, xmm12); - xmm7 = _MM_XOR_PS(xmm7, xmm13); - - len -= 128; - //; check if there is another 128B in the buffer to be able to fold - if (len >= 0) goto _fold_128_B_loop; //jge _fold_128_B_loop - //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - src_ptr += 128; - //; at this point, the buffer pointer is pointing at the last y Bytes of the buffer - //; fold the 8 xmm registers to 1 xmm register with different constants - // - xmm10 = _mm_set_epi64x(k_928/*rk10*/, k_864/*rk9*/); - xmm8 = xmm0; - xmm0 = _mm_clmulepi64_si128(xmm0, xmm10, 0x11); - xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x0); - xmm7 = _mm_xor_si128(xmm7, xmm8); - xmm7 = _MM_XOR_PS(xmm7, xmm0); - - xmm10 = _mm_set_epi64x(k_800/*rk12*/, k_736/*rk11*/); - xmm8 = xmm1; - xmm1 = _mm_clmulepi64_si128(xmm1, xmm10, 0x11); - xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x0); - xmm7 = _mm_xor_si128(xmm7, xmm8); - xmm7 = _MM_XOR_PS(xmm7, xmm1); - xmm10 = _mm_set_epi64x(k_672/*rk14*/, k_608/*rk13*/); - xmm8 = xmm2; - xmm2 = _mm_clmulepi64_si128(xmm2, xmm10, 0x11); - xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x0); - xmm7 = _mm_xor_si128(xmm7, xmm8); - xmm7 = _mm_xor_si128(xmm7, xmm2); - xmm10 = _mm_set_epi64x(k_544/*rk16*/, k_480/*rk15*/); - xmm8 = xmm3; - xmm3 = _mm_clmulepi64_si128(xmm3, xmm10, 0x11); - xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x0); - xmm7 = _mm_xor_si128(xmm7, xmm8); - xmm7 = _MM_XOR_PS(xmm7, xmm3); - xmm10 = _mm_set_epi64x(k_416/*rk18*/, k_352/*rk17*/); - xmm8 = xmm4; - xmm4 = _mm_clmulepi64_si128(xmm4, xmm10, 0x11); - xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x0); - xmm7 = _mm_xor_si128(xmm7, xmm8); - xmm7 = _mm_xor_si128(xmm7, xmm4); - xmm10 = _mm_set_epi64x(k_288/*rk20*/, k_224/*rk19*/); - xmm8 = xmm5; - xmm5 = _mm_clmulepi64_si128(xmm5, xmm10, 0x11); - xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x0); - xmm7 = _mm_xor_si128(xmm7, xmm8); - xmm7 = _MM_XOR_PS(xmm7, xmm5); - xmm10 = 
_mm_set_epi64x(k_160/*rk2*/, k_96/*rk1*/); - // ; imm value of pclmulqdq instruction will determine which constant to use - xmm8 = xmm6; - xmm6 = _mm_clmulepi64_si128(xmm6, xmm10, 0x11); - xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x0); - xmm7 = _mm_xor_si128(xmm7, xmm8); - xmm7 = _mm_xor_si128(xmm7, xmm6); - // - // ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop - // ; instead of a cmp instruction, we use the negative flag with the jl instruction - len += (128 - 16); - if (len < 0) goto _final_reduction_for_128; // jl _final_reduction_for_128 - // ; now we have 16 + y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory - // ; we can fold 16 bytes at a time if y >= 16 - // ; continue folding 16B at a time - _16B_reduction_loop: - xmm8 = xmm7; - xmm7 = _mm_clmulepi64_si128(xmm7, xmm10, 0x11); - xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x0); - xmm7 = _mm_xor_si128(xmm7, xmm8); - xmm0 = _mm_loadu_si128((const __m128i*)src_ptr); - xmm0 = _mm_shuffle_epi8(xmm0, ENDIA_SHUF_MASK); - xmm7 = _mm_xor_si128(xmm7, xmm0); - src_ptr += 16; - len -= 16; - // ; instead of a cmp instruction, we utilize the flags with the jge instruction - // ; equivalent of : cmp arg3, 16 - 16 - // ; check if there is any more 16B in the buffer to be able to fold - - if (len >= 0) goto _16B_reduction_loop; // jge _16B_reduction_loop - // ; now we have 16 + z bytes left to reduce, where 0 <= z < 16. - // ; first, we reduce the data in the xmm7 register - _final_reduction_for_128: - // ; check if any more data to fold.If not, compute the CRC of the final 128 bits - len += 16; - if (len == 0) goto _128_done; // je _128_done - // ; here we are getting data that is less than 16 bytes. - // ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes. - // ; after that the registers need to be adjusted. 
-#ifndef DML_DISABLE_OPTIMIZATION_ - _get_last_two_xmms: -#endif // DML_DISABLE_OPTIMIZATION_ - xmm2 = xmm7; - xmm1 = _mm_loadu_si128((const __m128i*)(src_ptr - 16 + len)); - xmm1 = _mm_shuffle_epi8(xmm1, ENDIA_SHUF_MASK); - // ; get rid of the extra data that was loaded before - // ; load the shift constant - // lea rax, [pshufb_shf_table + 16] - // sub rax, arg3 - ptr = (uint8_t*)pshufb_shf_table + 16 - len; - xmm0 = _mm_loadu_si128((const __m128i*)ptr); - // - // ; shift xmm2 to the left by arg3 bytes - xmm2 = _mm_shuffle_epi8(xmm2, xmm0); - xmm0 = _mm_xor_si128(xmm0, mask1); - xmm7 = _mm_shuffle_epi8(xmm7, xmm0); - xmm1 = _mm_blendv_epi8(xmm1, xmm2, xmm0); - // ; fold 16 Bytes - xmm2 = xmm1; - xmm8 = xmm7; - xmm7 = _mm_clmulepi64_si128(xmm7, xmm10, 0x11); - xmm8 = _mm_clmulepi64_si128(xmm8, xmm10, 0x0); - xmm7 = _mm_xor_si128(xmm7, xmm8); - xmm7 = _mm_xor_si128(xmm7, xmm2); - _128_done: - // ; compute crc of a 128 - bit value - xmm10 = _mm_set_epi64x(k_64/*rk6*/, k_96/*rk5*/); - xmm0 = xmm7; - // ; 64b fold - xmm7 = _mm_clmulepi64_si128(xmm7, xmm10, 0x1); - xmm0 = _mm_slli_si128(xmm0, 8); - xmm7 = _mm_xor_si128(xmm7, xmm0); - // ; 32b fold - xmm0 = xmm7; - xmm0 = _mm_and_si128(xmm0, mask2); - xmm7 = _mm_srli_si128(xmm7, 12); - xmm7 = _mm_clmulepi64_si128(xmm7, xmm10, 0x10); - xmm7 = _mm_xor_si128(xmm7, xmm0); - // ; barrett reduction - _barrett: - xmm10 = _mm_set_epi64x(q/*rk8*/, u/*rk7*/); - xmm0 = xmm7; - xmm7 = _mm_clmulepi64_si128(xmm7, xmm10, 0x01); - xmm7 = _mm_slli_si128(xmm7, 4); - xmm7 = _mm_clmulepi64_si128(xmm7, xmm10, 0x11); - - xmm7 = _mm_slli_si128(xmm7, 4); - xmm7 = _mm_xor_si128(xmm7, xmm0); - eax = _mm_extract_epi32(xmm7, 1); -#ifndef DML_DISABLE_OPTIMIZATION_ - _cleanup: -#endif // DML_DISABLE_OPTIMIZATION_ - // ; scale the result back to 16 bits - eax = ((uint32_t)eax) >> (32 - crc_size); - *crc_ptr = eax; - return; // ret - - //align 16 -#ifndef DML_DISABLE_OPTIMIZATION_ - _less_than_256: - // - //; check if there is enough buffer to be able 
to fold 16B at a time - //cmp arg3, 32 - //jl _less_than_32 - if (len < 32) { - goto _less_than_32; - } - xmm11 = ENDIA_SHUF_MASK; - //; if there is, load the constants - xmm10 = _mm_set_epi64x(k_160/*rk2*/, k_96/*rk1*/); - xmm0 = _mm_cvtsi32_si128(arg1_low32); - xmm0 = _mm_slli_si128(xmm0, 12); - xmm7 = _mm_loadu_si128((const __m128i*)src_ptr); - xmm7 = _mm_shuffle_epi8(xmm7, ENDIA_SHUF_MASK); - xmm7 = _mm_xor_si128(xmm7, xmm0); - // - //; update the buffer pointer - src_ptr += 16; - // - //; update the counter.subtract 32 instead of 16 to save one instruction from the loop - len -= 32; - goto _16B_reduction_loop; //jmp _16B_reduction_loop - - //align 16 - _less_than_32: - //; mov initial crc to the return value. this is necessary for zero - length buffers. - eax = arg1_low32; //mov eax, arg1_low32 - //test arg3, arg3 - if (len == 0) goto _cleanup; //je _cleanup - // - xmm11 = ENDIA_SHUF_MASK; - - xmm0 = _mm_cvtsi32_si128(arg1_low32); - xmm0 = _mm_slli_si128(xmm0, 12); - //cmp arg3, 16 - if (len == 16) goto _exact_16_left; //je _exact_16_left - if (len < 16) goto _less_than_16_left; //jl _less_than_16_left - - xmm7 = _mm_loadu_si128((const __m128i*)src_ptr); - xmm7 = _mm_shuffle_epi8(xmm7, ENDIA_SHUF_MASK); - xmm7 = _mm_xor_si128(xmm7, xmm0); - src_ptr += 16; - len -= 16; - xmm10 = _mm_set_epi64x(k_160/*rk2*/, k_96/*rk1*/); - goto _get_last_two_xmms; //jmp _get_last_two_xmms - //align 16 - _less_than_16_left: -#endif // DML_DISABLE_OPTIMIZATION_ - //; use stack space to load data less than 16 bytes, zero - out the 16B in memory first. 
- // - xmm1 = _mm_setzero_si128(); - - _mm_storeu_si128((__m128i*)r11, xmm1); - //cmp arg3, 4 - if (len < 4) goto _only_less_than_4; //jl _only_less_than_4 - //; backup the counter value - r9 = len; - //cmp arg3, 8 - if (len < 8) goto _less_than_8_left; //jl _less_than_8_left - //; load 8 Bytes - *(int64_t*)r11 = *(int64_t*)src_ptr; - r11 += 8; - len -= 8; - src_ptr += 8; - _less_than_8_left: - //cmp arg3, 4 - if (len < 4) goto _less_than_4_left; //jl _less_than_4_left - //; load 4 Bytes - *(int*)r11 = *(int*)src_ptr; - r11 += 4; - len -= 4; - src_ptr += 4; //add arg2, 4 - _less_than_4_left: - // - //cmp arg3, 2 - if (len < 2) goto _less_than_2_left; //jl _less_than_2_left - // - //; load 2 Bytes - *(short*)r11 = *(short*)src_ptr; - r11 += 2; - len -= 2; - src_ptr += 2; - _less_than_2_left: - //cmp arg3, 1 - if (len < 1) goto _zero_left; //jl _zero_left - //; load 1 Byte - *r11 = *src_ptr; - _zero_left: - xmm7 = _mm_loadu_si128((const __m128i*)ttt); - xmm7 = _mm_shuffle_epi8(xmm7, ENDIA_SHUF_MASK); - xmm7 = _mm_xor_si128(xmm7, xmm0); - - ptr = (uint8_t*)pshufb_shf_table + 16 - r9; - xmm0 = _mm_loadu_si128((const __m128i*)ptr); - xmm0 = _mm_xor_si128(xmm0, mask1); - // - xmm7 = _mm_shuffle_epi8(xmm7, xmm0); - goto _128_done; //jmp _128_done - //align 16 -#ifndef DML_DISABLE_OPTIMIZATION_ - _exact_16_left: -#endif // DML_DISABLE_OPTIMIZATION_ - xmm7 = _mm_loadu_si128((const __m128i*)src_ptr); - xmm7 = _mm_shuffle_epi8(xmm7, ENDIA_SHUF_MASK); - xmm7 = _mm_xor_si128(xmm7, xmm0); - - goto _128_done; //jmp _128_done - _only_less_than_4: - //cmp arg3, 3 - if (len < 3) goto _only_less_than_3; //jl _only_less_than_3 - //; load 3 Bytes - r11[0] = src_ptr[0]; - - r11[1] = src_ptr[1]; - - r11[2] = src_ptr[2]; - xmm7 = _mm_loadu_si128((const __m128i*)r11); - xmm7 = _mm_shuffle_epi8(xmm7, ENDIA_SHUF_MASK); - xmm7 = _mm_xor_si128(xmm7, xmm0); - xmm7 = _mm_srli_si128(xmm7, 5); - goto _barrett; //jmp _barrett - _only_less_than_3: - //cmp arg3, 2 - if (len < 2) goto 
_only_less_than_2; //jl _only_less_than_2 - //; load 2 Bytes - r11[0] = src_ptr[0]; - - r11[1] = src_ptr[1]; - xmm7 = _mm_loadu_si128((const __m128i*)r11); - xmm7 = _mm_shuffle_epi8(xmm7, ENDIA_SHUF_MASK); - xmm7 = _mm_xor_si128(xmm7, xmm0); - xmm7 = _mm_srli_si128(xmm7, 6); - goto _barrett; //jmp _barrett - _only_less_than_2: - // - //; load 1 Byte - eax = src_ptr[0]; - r11[0] = eax; - - xmm7 = _mm_loadu_si128((const __m128i*)r11); - xmm7 = _mm_shuffle_epi8(xmm7, ENDIA_SHUF_MASK); - xmm7 = _mm_xor_si128(xmm7, xmm0); - - xmm7 = _mm_srli_si128(xmm7, 7); - - goto _barrett; //jmp _barrett -} - -/** -* @todo -*/ -dmlc_status_t dmlc_own_calculate_crc_32u_noopt(const uint8_t* const memory_region_ptr, - uint32_t bytes_to_hash, - uint32_t* const crc_ptr, - uint32_t polynomial) -{ - // Current crc value - uint32_t current_crc = (*crc_ptr); - - // Through all bytes - for (uint32_t i = 0u; i < bytes_to_hash; ++i) - { - // Calculate crc for current byte - current_crc = dmlc_own_crc_byte_32u(current_crc, memory_region_ptr[i], polynomial); - } - - // Store result - (*crc_ptr) = current_crc; - - return DML_STATUS_OK; -} \ No newline at end of file diff --git a/sources/cores/src/avx512/dmlc_fill_8u.cxx b/sources/cores/src/avx512/dmlc_fill_8u.cxx deleted file mode 100644 index 24de573..0000000 --- a/sources/cores/src/avx512/dmlc_fill_8u.cxx +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. 
- * - */ - -/** - * @brief Contain optimized AVX512 implementation of the follow functions: - * - @ref dmlc_fill_with_pattern_8u() - * - * @date 10/29/2020 - * - */ - -#include "core_memory.h" -#include "own_dmlc_definitions.h" - -DML_CORE_OWN_INLINE(void, opt_fill_with_pattern_8u_big, ( uint64_t pattern, - uint8_t *const memory_region_ptr, - uint32_t bytes_to_process ) ); - -DML_CORE_OWN_INLINE(void, opt_fill_with_pattern_8u_small, ( uint64_t pattern, - uint8_t *const memory_region_ptr, - uint32_t bytes_to_process ) ); - -DML_CORE_OWN_INLINE(dmlc_status_t, opt_fill_with_pattern_8u, ( uint64_t pattern, - uint8_t *const memory_region_ptr, - uint32_t bytes_to_process ) ) -{ - DML_CORE_CHECK_NULL_POINTER(memory_region_ptr); - - if (bytes_to_process < 64u) - { - dmlc_own_opt_fill_with_pattern_8u_small(pattern, memory_region_ptr, bytes_to_process); - } - else - { - dmlc_own_opt_fill_with_pattern_8u_big(pattern, memory_region_ptr, bytes_to_process); - } - - return DML_STATUS_OK; -} - -DML_CORE_OWN_INLINE(void, opt_fill_with_pattern_8u_big, ( uint64_t pattern, - uint8_t *const memory_region_ptr, - uint32_t bytes_to_process ) ) -{ - // Check pointer alignment - const size_t unaligned_bytes = (uintptr_t)memory_region_ptr % 64u; - const size_t unaligned_part_size = (0u == unaligned_bytes) ? 
0u : 64u - unaligned_bytes; - - __m512i zmm_pattern = _mm512_set1_epi64(pattern); - - // Fill unaligned part of destination - if (0u != unaligned_part_size) - { - unsigned long long mask = 0xFFFFFFFFFFFFFFFFu >> unaligned_bytes; - __mmask64 mmask = _load_mask64(&mask); - - _mm512_mask_storeu_epi8(memory_region_ptr, mmask, zmm_pattern); - - pattern = (pattern << (unaligned_bytes * 8u)) | (pattern >> (64u - (unaligned_bytes * 8u))); - zmm_pattern = _mm512_set1_epi64(pattern); - } - - // Fill aligned part of destination - const size_t aligned_part_size = bytes_to_process - unaligned_part_size; - const size_t head_size = aligned_part_size / sizeof(__m512i); - const size_t tail_size = aligned_part_size % sizeof(__m512i); - - uint8_t *const aligned_memory_region_ptr = memory_region_ptr + unaligned_part_size; - __m512i *head_ptr = (__m512i *)aligned_memory_region_ptr; - __m512i *tail_ptr = (__m512i *)head_ptr + head_size; - - // Fill head part - if (0u != head_size) - { - while (head_ptr != tail_ptr) - { - _mm512_store_si512(head_ptr, zmm_pattern); - head_ptr++; - } - } - - // Fill tail part - if (0u != tail_size) - { - unsigned long long mask = ~(0xFFFFFFFFFFFFFFFFu << tail_size); - __mmask64 mmask = _load_mask64(&mask); - _mm512_mask_storeu_epi8(tail_ptr, mmask, zmm_pattern); - } - -} - -DML_CORE_OWN_INLINE(void, opt_fill_with_pattern_8u_small, ( uint64_t pattern, - uint8_t *const memory_region_ptr, - uint32_t bytes_to_process ) ) -{ - // Fill 1-63 bytes with 64bit pattern via two _mm256_mask_storeu_epi8 calls. 
- - const size_t tail_size = bytes_to_process % sizeof(__m512i); - - __m256i ymm1_pattern = _mm256_set1_epi64x(pattern); - - unsigned long long mask_value = ~(0xFFFFFFFFFFFFFFFFu << tail_size); - __mmask32 mask_first = _load_mask32((uint32_t *)&mask_value); - _mm256_mask_storeu_epi8(memory_region_ptr, mask_first, ymm1_pattern); - - __mmask32 mask_second = _load_mask32((uint32_t *)&mask_value + 1u); - _mm256_mask_storeu_epi8(memory_region_ptr + 32u, mask_second, ymm1_pattern); -} diff --git a/sources/cores/src/default/dmlc_compare_8u_px.cxx b/sources/cores/src/default/dmlc_compare_8u_px.cxx deleted file mode 100644 index 9c93445..0000000 --- a/sources/cores/src/default/dmlc_compare_8u_px.cxx +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. 
- * - */ - - /** - * @brief Contain default implementation of the follow functions: - * - @ref dmlc_own_compare_8u() - * - @ref dmlc_own_compare_with_pattern_8u() - * - * @date 07/06/2021 - * - */ - -DML_CORE_OWN_INLINE(dmlc_status_t, compare_8u, (const uint8_t* first_vector_ptr, - const uint8_t* second_vector_ptr, - const uint32_t size, - uint32_t* const mismatch_offset_ptr)) -{ - for (uint32_t i = 0u; i < size; i++) - { - if (first_vector_ptr[i] != second_vector_ptr[i]) - { - *mismatch_offset_ptr = i; - - return DML_COMPARE_STATUS_NE; - } - } - - return DML_COMPARE_STATUS_EQ; -} - -DML_CORE_OWN_INLINE(dmlc_status_t, compare_with_pattern_8u, (const uint8_t* memory_region_ptr, - const pattern_t pattern, - const uint32_t size, - uint32_t* const mismatch_offset_ptr)) -{ - //Constants - const uint32_t pattern_size = sizeof(pattern_t); - const uint32_t pattern_chunk_count = size / pattern_size; - const uint64_t tail_bytes_count = size % pattern_size; - const uint64_t* const pattern_region_ptr = (uint64_t*)memory_region_ptr; - - // Compare by pattern chunks - for (uint32_t i = 0u; i < pattern_chunk_count; i++) - { - if (pattern_region_ptr[i] != pattern) - { - *mismatch_offset_ptr = i * pattern_size; - - return DML_COMPARE_STATUS_NE; - } - } - - memory_region_ptr += size - tail_bytes_count; - pattern_t byte_pattern = pattern; - - // Compare tail - for (uint32_t i = 0; i < tail_bytes_count; i++) - { - if (memory_region_ptr[i] != (uint8_t)byte_pattern) - { - *mismatch_offset_ptr = pattern_chunk_count * pattern_size + i; - - return DML_COMPARE_STATUS_NE; - } - - byte_pattern >>= OWN_BYTE_BIT_LENGTH; - } - - return DML_COMPARE_STATUS_EQ; -} diff --git a/sources/cores/src/default/dmlc_copy_8u_px.cxx b/sources/cores/src/default/dmlc_copy_8u_px.cxx deleted file mode 100644 index 3b84a9e..0000000 --- a/sources/cores/src/default/dmlc_copy_8u_px.cxx +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. 
- * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. - * - */ - - /** - * @brief Contain default implementation of the follow functions: - * - @ref dmlc_own_copy_8u() - * - @ref dmlc_own_move_8u() - * - @ref dmlc_own_dualcast_copy_8u() - * - * @date 5/26/2021 - * - */ - -DML_CORE_OWN_INLINE(void, copy_8u, (const uint8_t *const source_ptr, - uint8_t *const destination_ptr, - uint32_t bytes_to_process)) -{ - // Current position in source vector - const uint8_t *source_current_ptr = (const uint8_t *)source_ptr; - - // Current position in destination vector - uint8_t *destination_current_ptr = (uint8_t *)destination_ptr; - - while (0u < bytes_to_process) - { - // Copy 1 byte - *destination_current_ptr = *source_current_ptr; - - // Shift position in destination vector - destination_current_ptr++; - - // Shift position in source vector - source_current_ptr++; - - // Decrease bytes counter - bytes_to_process -= sizeof(uint8_t); - } -} - - -DML_CORE_OWN_INLINE(void, move_8u, (const uint8_t *const source_ptr, - uint8_t *const destination_ptr, - uint32_t bytes_to_process)) -{ - // Current position in source vector - const uint8_t *source_current_ptr = (const uint8_t *)(source_ptr + bytes_to_process); - - // Current position in destination vector - uint8_t *destination_current_ptr = (uint8_t *)(destination_ptr + bytes_to_process); - - while (0u < bytes_to_process) - { - // Shift position in destination vector - destination_current_ptr--; - - // Shift position in 
source vector - source_current_ptr--; - - // Copy 1 byte - (*destination_current_ptr) = (*source_current_ptr); - - // Decrease bytes counter - bytes_to_process -= sizeof(uint8_t); - } -} - - -DML_CORE_OWN_INLINE(void, dualcast_copy_8u, (const uint8_t *const source_ptr, - uint8_t *const first_destination_ptr, - uint8_t *const second_destination_ptr, - uint32_t bytes_to_process)) -{ - // Current position in source vector 64u - const uint8_t *source_current_ptr = (const uint8_t *)source_ptr; - - // Current position in first destination vector 64u - uint8_t *first_destination_current_ptr = (uint8_t *)first_destination_ptr; - - // Current position in second destination vector 64u - uint8_t *second_destination_current_ptr = (uint8_t *)second_destination_ptr; - - while (0 < bytes_to_process) - { - // Copy 1 byte to first destination vector - (*first_destination_current_ptr) = (*source_current_ptr); - - // Copy 1 byte to second destination vector - (*second_destination_current_ptr) = (*source_current_ptr); - - // Shift position in first destination vector - first_destination_current_ptr++; - - // Shift position in second destination vector - second_destination_current_ptr++; - - // Shift position in source vector - source_current_ptr++; - - // Decrease bytes counter - bytes_to_process -= sizeof(uint8_t); - } -} diff --git a/sources/cores/src/default/dmlc_crc_16u_32u_px.cxx b/sources/cores/src/default/dmlc_crc_16u_32u_px.cxx deleted file mode 100644 index 16db5e7..0000000 --- a/sources/cores/src/default/dmlc_crc_16u_32u_px.cxx +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). 
Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. - * - */ - - /** - * @brief Contain implementation of the follow functions: - * - @ref dmlc_calculate_crc_32u() - * - * @date 7/20/2021 - * - */ - -DML_CORE_OWN_INLINE(dmlc_status_t, calculate_crc_32u, (const uint8_t* const memory_region_ptr, - uint32_t bytes_to_hash, - uint32_t* const crc_ptr, - uint32_t polynomial)) -{ - // Current crc value - uint32_t current_crc = (*crc_ptr); - - // Through all bytes - for (uint32_t i = 0u; i < bytes_to_hash; ++i) - { - // Calculate crc for current byte - current_crc = dmlc_own_crc_byte_32u(current_crc, memory_region_ptr[i], polynomial); - } - - // Store result - (*crc_ptr) = current_crc; - return DML_STATUS_OK; -} diff --git a/sources/cores/src/default/dmlc_fill_8u.cxx b/sources/cores/src/default/dmlc_fill_8u.cxx deleted file mode 100644 index 1d4c9cd..0000000 --- a/sources/cores/src/default/dmlc_fill_8u.cxx +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. 
- * - */ - -/** - * @brief Contain default implementation of the follow functions: - * - @ref dmlc_fill_with_pattern_8u() - * - * @date 10/29/2020 - * - */ - -#include "core_memory.h" -#include "own_dmlc_definitions.h" - -DML_CORE_OWN_INLINE(dmlc_status_t, opt_fill_with_pattern_8u, ( uint64_t pattern, - uint8_t *const memory_region_ptr, - uint32_t bytes_to_process ) ) -{ - DML_CORE_CHECK_NULL_POINTER(memory_region_ptr) - - // Current position in memory region 64u - uint64_t *memory_region_current_64u_ptr = (uint64_t *)memory_region_ptr; - - // Current pattern 64u to fill with - const uint64_t *pattern_current_64u_ptr = (const uint64_t *)(&pattern); - - // Current pattern 8u to fill with - const uint8_t *pattern_current_8u_ptr = (const uint8_t *)(&pattern); - - // Current position in memory region 8u - uint8_t *memory_region_current_8u_ptr; - - // Fill body - while(sizeof(uint64_t) <= bytes_to_process) - { - // Put 8 bytes to current memory region - (*memory_region_current_64u_ptr) = - (*pattern_current_64u_ptr); - - // Shift position in memory region - memory_region_current_64u_ptr++; - - // Decrease bytes counter - bytes_to_process -= sizeof(uint64_t); - } - - // Get position in memory region - memory_region_current_8u_ptr = (uint8_t *)memory_region_current_64u_ptr; - - // Fill tail - while(0 < bytes_to_process) - { - // Put 1 byte to current memory region - (*memory_region_current_8u_ptr) = - (*pattern_current_8u_ptr); - - // Shift position in memory region - memory_region_current_8u_ptr++; - - // Shift position in current pattern - pattern_current_8u_ptr++; - - // Decrease bytes counter - bytes_to_process -= sizeof(uint8_t); - } - - // Success - return DML_STATUS_OK; -} diff --git a/sources/cores/src/dmlc_cache_8u.c b/sources/cores/src/dmlc_cache_8u.c deleted file mode 100644 index ac388e6..0000000 --- a/sources/cores/src/dmlc_cache_8u.c +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. 
- * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. - * - */ - -/** - * @brief Contain implementation of functions for cache manipulation - * @date 2/25/2020 - * - * @details Function list: - * - @ref dmlc_move_cache_to_memory - * - @ref dmlc_copy_cache_to_memory - * - */ - - -#include "core_cpu_features.h" -#include "own_dmlc_definitions.h" - -#define OWN_CACHE_LINE_BYTE_SIZE 64u - -DML_CORE_API(dmlc_status_t, move_cache_to_memory_8u, (const uint8_t *memory_region_ptr, - const uint32_t bytes_to_flush)) -{ - DML_CORE_CHECK_NULL_POINTER(memory_region_ptr) - - // Constants - const uint64_t start_address = (uint64_t) memory_region_ptr; - const uint32_t cache_align = OWN_CACHE_LINE_BYTE_SIZE - (start_address % OWN_CACHE_LINE_BYTE_SIZE); - const uint32_t cache_line_count = (bytes_to_flush + OWN_CACHE_LINE_BYTE_SIZE) / OWN_CACHE_LINE_BYTE_SIZE - 1u; - - // Variables - void *memory_address_ptr = (void*) memory_region_ptr; - - #if !defined (PX) - _mm_clflushopt(memory_address_ptr); - #else - _mm_clflush(memory_address_ptr); - #endif - - // Align pointer - memory_region_ptr += cache_align; - - for (uint32_t i = 0u; i < cache_line_count; i++) - { - #if !defined (PX) - _mm_clflushopt(memory_address_ptr); - #else - _mm_clflush(memory_address_ptr); - #endif - memory_region_ptr += OWN_CACHE_LINE_BYTE_SIZE; - } - - return DML_STATUS_OK; -} - - -DML_CORE_API(dmlc_status_t, copy_cache_to_memory_8u, (const uint8_t *memory_region_ptr, - const uint32_t bytes_to_flush)) -{ 
- DML_CORE_CHECK_NULL_POINTER(memory_region_ptr) - - #if !defined (PX) - // Constants - const uint64_t start_address = (uint64_t) memory_region_ptr; - const uint32_t cache_align = OWN_CACHE_LINE_BYTE_SIZE - (start_address % OWN_CACHE_LINE_BYTE_SIZE); - const uint32_t cache_line_count = (bytes_to_flush + OWN_CACHE_LINE_BYTE_SIZE) / OWN_CACHE_LINE_BYTE_SIZE - 1u; - - // Variable - void *memory_address_ptr = (void*) memory_region_ptr; - - _mm_clwb(memory_address_ptr); - - // Align pointer - memory_region_ptr += cache_align; - - for (uint32_t i = 0u; i < cache_line_count; i++) - { - _mm_clwb(memory_address_ptr); - memory_region_ptr += OWN_CACHE_LINE_BYTE_SIZE; - } - #endif - - return DML_STATUS_OK; -} diff --git a/sources/cores/src/dmlc_compare_8u.c b/sources/cores/src/dmlc_compare_8u.c deleted file mode 100644 index 7c9c23e..0000000 --- a/sources/cores/src/dmlc_compare_8u.c +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. 
- * - */ - -/** - * @brief Contain implementation of the follow functions: - * - @ref dmlc_compare_8u() - * - @ref dmlc_compare_pattern_8u() - * - * @date 2/10/2020 - * - */ - -#include "core_compare.h" -#include "own_dmlc_definitions.h" -#if defined(AVX512) -#include "avx512/dmlc_compare_8u_k0.cxx" -#else -#include "default/dmlc_compare_8u_px.cxx" -#endif - - -DML_CORE_API(dmlc_status_t, compare_8u, (const uint8_t* first_vector_ptr, - const uint8_t* second_vector_ptr, - const uint32_t size, - uint32_t* const mismatch_offset_ptr)) -{ - return dmlc_own_compare_8u(first_vector_ptr, second_vector_ptr, size, mismatch_offset_ptr); -} - -DML_CORE_API(dmlc_status_t, compare_with_pattern_8u, (const uint8_t *memory_region_ptr, - const pattern_t pattern, - const uint32_t size, - uint32_t *const mismatch_offset_ptr)) -{ - return dmlc_own_compare_with_pattern_8u(memory_region_ptr, pattern, size, mismatch_offset_ptr); -} diff --git a/sources/cores/src/dmlc_copy_8u.c b/sources/cores/src/dmlc_copy_8u.c deleted file mode 100644 index 220d603..0000000 --- a/sources/cores/src/dmlc_copy_8u.c +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. 
- * - */ - -/** - * @brief Contain implementation of the follow functions: - * - @ref dmlc_copy_forward_8u() - * - @ref dmlc_copy_backward_8u() - * - @ref dmlc_dualcast_copy_8u() - * - * @date 2/20/2020 - * - */ - - -#include "core_memory.h" -#include "own_dmlc_definitions.h" -#if defined(AVX512) -#include "avx512/dmlc_copy_8u_k0.cxx" -#else -#include "default/dmlc_copy_8u_px.cxx" -#endif - -/** Checks 0:11 bits for equality **/ -#define OWN_BAD_ARGUMENT_DUALCAST_DST_ALIGNMENT(dst_ptr1, dst_ptr2) \ - DML_CORE_BAD_ARGUMENT_RETURN( ((((uint64_t) (dst_ptr1)) & 0xFFFu) != \ - (((uint64_t) (dst_ptr2)) & 0xFFFu)), \ - DML_STATUS_DUALCAST_ALIGN_ERROR ) - - -DML_CORE_API(dmlc_status_t, copy_8u, ( const uint8_t *const source_ptr, - uint8_t *const destination_ptr, - uint32_t bytes_to_process ) ) -{ - // Main action - dmlc_own_copy_8u(source_ptr, destination_ptr, bytes_to_process); - - // Success - return DML_STATUS_OK; -} - - -DML_CORE_API(dmlc_status_t, move_8u, ( const uint8_t *const source_ptr, - uint8_t *const destination_ptr, - uint32_t bytes_to_process ) ) -{ - const uint8_t * const src_begin = source_ptr; - const uint8_t * const src_end = source_ptr + bytes_to_process; - const uint8_t * const dst_begin = destination_ptr; - const uint8_t * const dst_end = destination_ptr + bytes_to_process; - - // If memory regions do not overlap: - if (src_end <= dst_begin || src_begin >= dst_end) - { - return dmlc_copy_8u(source_ptr, destination_ptr, bytes_to_process); - } - - dmlc_own_move_8u(source_ptr, destination_ptr, bytes_to_process); - - // Success - return DML_STATUS_OK; -} - - - DML_CORE_API(dmlc_status_t, dualcast_copy_8u, ( const uint8_t *const source_ptr, - uint8_t *const first_destination_ptr, - uint8_t *const second_destination_ptr, - uint32_t bytes_to_process ) ) -{ - // Main action - dmlc_own_dualcast_copy_8u(source_ptr, first_destination_ptr, second_destination_ptr, bytes_to_process); - - // Success - return DML_STATUS_OK; -} diff --git 
a/sources/cores/src/dmlc_crc_16u_32u.c b/sources/cores/src/dmlc_crc_16u_32u.c deleted file mode 100644 index edf7d2d..0000000 --- a/sources/cores/src/dmlc_crc_16u_32u.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. - * - */ - -/** - * @brief Contain implementation of the follow functions: - * - @ref dmlc_calculate_crc_16u() - * - @ref dmlc_calculate_crc_32u() - * - @ref dmlc_calculate_crc_reflected_32u() - * - * @date 2/5/2020 - * - */ - - -#include "core_hash_functions.h" -#include "own_dmlc_definitions.h" -#include "own_dmlc_crc_16u_32u.cxx" -#include "own_dmlc_byte_op.cxx" - -#if defined(AVX512) -#include "avx512/dmlc_crc_16u_32u_k0.cxx" -#else -#include "default/dmlc_crc_16u_32u_px.cxx" -#endif - - - -DML_CORE_API(dmlc_status_t, calculate_crc_16u, (const uint8_t *const memory_region_ptr, - uint32_t bytes_to_hash, - uint16_t *const crc_ptr, - uint16_t polynomial)) -{ - // Check input arguments - DML_CORE_CHECK_NULL_POINTER(memory_region_ptr) - DML_CORE_CHECK_NULL_POINTER(crc_ptr) - - // Current crc value - uint16_t current_crc = (*crc_ptr); - - // Through all bytes - for(uint32_t i = 0u; i < bytes_to_hash; ++i) - { - // Calculate crc for current byte - current_crc = dmlc_own_crc_byte_16u(current_crc, memory_region_ptr[i], polynomial); - } - - // Store result - (*crc_ptr) = current_crc; - - return DML_STATUS_OK; -} - -#if defined(_MSC_VER) -#define BORDER_OPT 256 
-#else -#define BORDER_OPT 256 -#endif - - -DML_CORE_API(dmlc_status_t, calculate_crc_32u, (const uint8_t *const memory_region_ptr, - uint32_t bytes_to_hash, - uint32_t *const crc_ptr, - uint32_t polynomial)) -{ -#if defined(AVX512) - if (bytes_to_hash < BORDER_OPT) - return dmlc_own_calculate_crc_32u_noopt(memory_region_ptr, bytes_to_hash, crc_ptr, polynomial); -#endif - return dmlc_own_calculate_crc_32u(memory_region_ptr, bytes_to_hash, crc_ptr, polynomial); -} - - -DML_CORE_API(dmlc_status_t, calculate_crc_reflected_32u, (const uint8_t *const memory_region_ptr, - uint32_t bytes_to_hash, - uint32_t *const crc_ptr, - uint32_t polynomial)) -{ - // Check input arguments - DML_CORE_CHECK_NULL_POINTER(memory_region_ptr) - DML_CORE_CHECK_NULL_POINTER(crc_ptr) - - // Current crc value - uint32_t current_crc = (*crc_ptr); - - // Temporary storage for reversed value - uint8_t reversed_value; - - // Through all bytes - for(uint32_t i = 0u; i < bytes_to_hash; ++i) - { - // Get current byte - reversed_value = memory_region_ptr[i]; - - // Reverse bits - reversed_value = dmlc_own_reverse_8u(reversed_value); - - // Calculate crc for current reversed byte - current_crc = dmlc_own_crc_byte_32u(current_crc, reversed_value, polynomial); - } - - // Store result - (*crc_ptr) = current_crc; - - return DML_STATUS_OK; -} diff --git a/sources/cores/src/dmlc_delta_record_8u.c b/sources/cores/src/dmlc_delta_record_8u.c deleted file mode 100644 index 3bd46cc..0000000 --- a/sources/cores/src/dmlc_delta_record_8u.c +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. 
- * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. - * - */ - -/** - * @details Contain implementation for Delta Record feature: - * - @ref dmlc_create_delta_record_8u - * - * @date 2/17/2020 - * - */ - -#include "core_compare.h" -#include "own_dmlc_definitions.h" - - -/** - * @defgroup own_delta_record Delta record own API - * @ingroup core_own - * @{ - * @brief Delta record own definitions and functions - * - */ - -/* ------ DELTA RECORD INTERNAL DEFINITIONS ------ */ - -typedef pattern_t region_t; /**< Redefinition for the @ref pattern_t type */ -typedef uint16_t offset_t; /**< Redefinition to make more abstract offset type */ - -/* Delta note sizes */ -#define DELTA_NOTE_OFFSET_FIELD_SIZE sizeof(offset_t) /**< Size of offset field in delta note */ -#define DELTA_NOTE_REGION_FIELD_SIZE sizeof(region_t) /**< Size of delta field in delta note */ -#define DELTA_NOTE_SIZE (DELTA_NOTE_OFFSET_FIELD_SIZE + DELTA_NOTE_REGION_FIELD_SIZE) /**< Delta note size*/ - -/* Checkers */ - -/**< Input pointer must be aligned to 8 bytes */ -#define OWN_DELTA_CHECK_PTR_ALIGNMENT(ptr) \ - DML_CORE_BAD_ARGUMENT_RETURN((((uint64_t)ptr) % 8u), DML_STATUS_DELTA_ALIGN_ERROR) - -#define MAX_AVAILABLE_INPUT_SIZE 0x7FFF8u /**< Input vector size limit*/ - -#if defined(__GNUC__) - typedef struct __attribute__ ((__packed__)) -#elif(_MSC_VER) - #pragma pack(2) - typedef struct -#else - #error Compiler not supported -#endif -{ - offset_t offset; /**< Offset of mismatched region, which delta written in delta field*/ - region_t reference_data; /**< Contain delta between standard vector and compared*/ -} own_delta_note_t; /**< Presents a single element of Delta Record stream */ - -/** @} */ - -/* ------ DELTA RECORD PUBLIC FUNCTIONS IMPLEMENTATION ------ */ - -DML_CORE_API(dmlc_status_t, create_delta_record_8u, (const uint8_t *reference_vector_ptr, - const uint8_t 
*second_vector_ptr, - const uint32_t compared_bytes, - const uint32_t delta_record_max_size, - uint8_t* delta_record_ptr, - uint32_t *const record_size_ptr)) -{ - DML_CORE_CHECK_NULL_POINTER(reference_vector_ptr) - DML_CORE_CHECK_NULL_POINTER(second_vector_ptr) - DML_CORE_CHECK_NULL_POINTER(delta_record_ptr) - DML_CORE_CHECK_NULL_POINTER(record_size_ptr) - OWN_DELTA_CHECK_PTR_ALIGNMENT(reference_vector_ptr) - OWN_DELTA_CHECK_PTR_ALIGNMENT(second_vector_ptr) - - (*record_size_ptr) = 0u; - - DML_CORE_CHECK_INPUT_SIZE(compared_bytes % DELTA_NOTE_REGION_FIELD_SIZE, DML_STATUS_DELTA_ALIGN_ERROR) - DML_CORE_CHECK_INPUT_SIZE(compared_bytes > MAX_AVAILABLE_INPUT_SIZE, DML_STATUS_DELTA_OFFSET_ERROR) - DML_CORE_CHECK_OUTPUT_SIZE(delta_record_max_size % DELTA_NOTE_SIZE, DML_STATUS_DELTA_INPUT_SIZE_ERROR) - DML_CORE_CHECK_OUTPUT_SIZE(0u == delta_record_max_size, DML_STATUS_DELTA_INPUT_SIZE_ERROR) - - // Delta Record - const uint32_t delta_note_count = delta_record_max_size / DELTA_NOTE_SIZE; - const offset_t regions_count = (offset_t)(compared_bytes / DELTA_NOTE_REGION_FIELD_SIZE); - own_delta_note_t* current_delta_notes_ptr = (own_delta_note_t*) delta_record_ptr; - own_delta_note_t* end_delta_notes_ptr = current_delta_notes_ptr + delta_note_count; - - // Create delta - for (offset_t i = 0u; i < regions_count; i++) - { - const uint64_t base_region = *(uint64_t *) reference_vector_ptr; - const uint64_t vector_region = *(uint64_t *) second_vector_ptr; - - reference_vector_ptr += DELTA_NOTE_REGION_FIELD_SIZE; - second_vector_ptr += DELTA_NOTE_REGION_FIELD_SIZE; - - // Write delta note into delta record in case: - if (base_region != vector_region) - { - if(current_delta_notes_ptr < end_delta_notes_ptr) - { - current_delta_notes_ptr->reference_data = base_region; - current_delta_notes_ptr->offset = i; - current_delta_notes_ptr++; - (*record_size_ptr) += DELTA_NOTE_SIZE; - } - else - { - return DML_STATUS_DELTA_RECORD_SIZE_ERROR; - } - } - } - - return DML_STATUS_OK; -} - - 
-DML_CORE_API(dmlc_status_t, apply_delta_record_8u, (uint8_t * memory_region_ptr, - const uint8_t *delta_record_ptr, - const uint32_t memory_region_size, - const uint32_t delta_record_size)) -{ - DML_CORE_CHECK_NULL_POINTER(memory_region_ptr) - DML_CORE_CHECK_NULL_POINTER(delta_record_ptr) - OWN_DELTA_CHECK_PTR_ALIGNMENT(memory_region_ptr) - DML_CORE_CHECK_INPUT_SIZE(memory_region_size > MAX_AVAILABLE_INPUT_SIZE, DML_STATUS_DELTA_INPUT_SIZE_ERROR) - DML_CORE_CHECK_INPUT_SIZE(memory_region_size % DELTA_NOTE_REGION_FIELD_SIZE, DML_STATUS_DELTA_ALIGN_ERROR) - DML_CORE_CHECK_INPUT_SIZE(delta_record_size % DELTA_NOTE_SIZE, DML_STATUS_DELTA_RECORD_SIZE_ERROR) - DML_CORE_CHECK_OVERLAPPING_FORWARD(delta_record_ptr, memory_region_ptr, memory_region_size) - DML_CORE_CHECK_OVERLAPPING_FORWARD(memory_region_ptr, delta_record_ptr, delta_record_size) - - // Constants - const uint32_t delta_notes_count = delta_record_size / DELTA_NOTE_SIZE; - - // Variables - own_delta_note_t *delta_note_ptr = (own_delta_note_t *) delta_record_ptr; - region_t *regions_ptr = (region_t *) memory_region_ptr; - - for (uint32_t i = 0u; i < delta_notes_count; i++) - { - const offset_t region_offset = delta_note_ptr[i].offset; - const region_t reference_data = delta_note_ptr[i].reference_data; - - if (region_offset < memory_region_size) - { - regions_ptr[region_offset] = reference_data; - } - else - { - return DML_STATUS_MEMORY_OVERFLOW_ERROR; - } - } - - return DML_STATUS_OK; -} diff --git a/sources/cores/src/dmlc_fill_8u.c b/sources/cores/src/dmlc_fill_8u.c deleted file mode 100644 index e213645..0000000 --- a/sources/cores/src/dmlc_fill_8u.c +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). 
Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. - * - */ - -/** - * @brief Contain implementation of the follow functions: - * - @ref dmlc_fill_with_pattern_8u() - * - * @date 2/21/2020 - * - */ - - -#include "core_memory.h" -#include "own_dmlc_definitions.h" - -#if defined(AVX512) - // TODO: I cannot load mask on MSVC17, so I disabled optimizations - #if (_MSC_VER >= 1928) || defined(__GNUC__) - #include "avx512/dmlc_fill_8u.cxx" - #else - #include "default/dmlc_fill_8u.cxx" - #endif -#else - #include "default/dmlc_fill_8u.cxx" -#endif - -DML_CORE_API(dmlc_status_t, fill_with_pattern_8u, ( uint64_t pattern, - uint8_t *const memory_region_ptr, - uint32_t bytes_to_process ) ) -{ - return dmlc_own_opt_fill_with_pattern_8u(pattern, memory_region_ptr, bytes_to_process); -} diff --git a/sources/cores/src/include/own_dmlc_checkers.h b/sources/cores/src/include/own_dmlc_checkers.h deleted file mode 100644 index 063a691..0000000 --- a/sources/cores/src/include/own_dmlc_checkers.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. 
- * - */ - -/** -* @brief -* @date 2/10/2020 -* -* @addtogroup core_own -* @{ -*/ - -#include "core_definitions.h" - -#ifndef DML_OWN_BAD_ARGUMENT_MACROS_HPP_ -#define DML_OWN_BAD_ARGUMENT_MACROS_HPP_ - -#ifdef DML_CORES_BADARG_CHECK - #define DML_CORE_BAD_ARGUMENT_RETURN(expression, error_code) \ - if(expression) \ - { \ - return (error_code); \ - } - -#else - #define DML_CORE_BAD_ARGUMENT_RETURN(expression, error_code ) /**< */ -#endif - - // Bad Argument wrappers - #define DML_CORE_CHECK_NULL_POINTER(pointer) \ - DML_CORE_BAD_ARGUMENT_RETURN( NULL == (pointer), DML_STATUS_NULL_POINTER_ERROR) - - #define DML_CORE_CHECK_OVERLAPPING_FORWARD(dst_ptr, src_ptr, length) \ - DML_CORE_BAD_ARGUMENT_RETURN((( (uint64_t) (src_ptr) ) <= ( (uint64_t) (dst_ptr) )) && \ - ( ( (uint64_t)((src_ptr) + (length)) ) > ( (uint64_t) (dst_ptr)) ), DML_STATUS_OVERLAPPING_BUFFER_ERROR) - - #define DML_CORE_CHECK_OVERLAPPING_BACKWARD(dst_ptr, src_ptr, length) \ - DML_CORE_CHECK_OVERLAPPING_FORWARD(src_ptr, dst_ptr, length) - - #define DML_CORE_CHECK_OVERLAPPING(pointer1, pointer2, length) \ - DML_CORE_CHECK_OVERLAPPING_FORWARD(pointer1, pointer2, length) \ - DML_CORE_CHECK_OVERLAPPING_BACKWARD(pointer1, pointer2, length) - - - #define DML_CORE_CHECK_INPUT_SIZE(condition, status) \ - DML_CORE_BAD_ARGUMENT_RETURN((condition), (status)) - - #define DML_CORE_CHECK_OUTPUT_SIZE(condition, status) \ - DML_CORE_BAD_ARGUMENT_RETURN((condition), (status)) - -#endif //DML_OWN_BAD_ARGUMENT_MACROS_HPP_ - -/** @} */ diff --git a/sources/cores/src/include/own_dmlc_definitions.h b/sources/cores/src/include/own_dmlc_definitions.h deleted file mode 100644 index e27cdc8..0000000 --- a/sources/cores/src/include/own_dmlc_definitions.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). 
Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. - * - */ - -/** - * @file - * @date 2/10/2020 - * - * @defgroup core_own Own API - * @ingroup core_src - * - * @defgroup core_own_definition Internal Definitions - * @ingroup core_own - * @{ - * - * @brief Contains general definitions for internal use in Intel® Data Mover Library (Intel® DML) Cores. - * - */ - -#if defined(_MSC_BUILD) -#include -#elif defined(__GNUC__) -#include -#else -#error "Unsupported compiler" -#endif - -#include "core_definitions.h" -#include "own_dmlc_checkers.h" - -#ifndef OWN_KERNEL_DEFINITIONS_H__ -#define OWN_KERNEL_DEFINITIONS_H__ - -#ifdef __cplusplus -extern "C" { -#endif - -/* Define NULL pointer value */ -#ifndef NULL - #ifdef __cplusplus - #define NULL 0 - #else - #define NULL ((void *)0) - #endif -#endif - -/* ------ Defines ------ */ -#define OWN_BYTE_BIT_LENGTH 8u /**< Byte bit-length*/ -#define OWN_HIGH_BIT_MASK_16U ( 0x8000u ) /**< Mask for checking high bit in uint16 value */ -#define OWN_HIGH_BIT_MASK_32U ( 0x80000000u ) /**< Mask for checking high bit in uint16 value */ - -/* ------ Enumerations ------ */ - -/** - * @brief Kernel boolean type - */ -typedef enum -{ - OWN_BOOL_FALSE = 0u, /**< Bool True */ - OWN_BOOL_TRUE = 1u /**< Bool False */ -} kernel_bool_t; - -#ifdef __cplusplus -} -#endif - -#endif //OWN_KERNEL_DEFINITIONS_H__ - -/** @} */ diff --git a/sources/cores/src/own_dmlc_byte_op.cxx b/sources/cores/src/own_dmlc_byte_op.cxx deleted file mode 100644 index bee00e5..0000000 --- a/sources/cores/src/own_dmlc_byte_op.cxx +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. 
- * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. - * - */ - -/** - * @brief Contain implementation of the follow functions: - * - @ref dmlc_own_reverse_8u() - * - * @date 3/5/2020 - * - */ - - -DML_CORE_OWN_INLINE(uint8_t, reverse_8u, ( uint8_t byte ) ) -{ - // Value to return - uint8_t reversed_value = byte; - - reversed_value = ((reversed_value & 0x55u) << 1u) | ((reversed_value & 0xAAu) >> 1u); - reversed_value = ((reversed_value & 0x33u) << 2u) | ((reversed_value & 0xCCu) >> 2u); - reversed_value = ((reversed_value & 0x0Fu) << 4u) | ((reversed_value & 0xF0u) >> 4u); - - return reversed_value; -} diff --git a/sources/cores/src/own_dmlc_crc_16u_32u.cxx b/sources/cores/src/own_dmlc_crc_16u_32u.cxx deleted file mode 100644 index e096ab1..0000000 --- a/sources/cores/src/own_dmlc_crc_16u_32u.cxx +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. 
- * - */ - -/** - * @brief Contain implementation of the follow functions: - * - @ref dmlc_own_crc_byte_16u() - * - @ref dmlc_own_crc_byte_32u() - * - * @date 2/24/2020 - * - */ - -DML_CORE_OWN_INLINE(uint16_t, crc_byte_16u, ( uint16_t init_crc, - uint8_t next_byte, - uint16_t polynomial ) ) -{ - // Current crc value - uint16_t current_crc = init_crc ^ (next_byte << OWN_CRC16_BYTE_SHIFT); - - // Modulo-2 division bit by bit - for(uint8_t bit = 0u; bit < OWN_BYTE_BIT_LENGTH; ++bit) - { - // Check high bit - current_crc = (current_crc & OWN_HIGH_BIT_MASK_16U) ? - // If high bit is set - shift it to the left and XOR with polynomial - ((current_crc << 1u) ^ polynomial) : - // Else just shift it to the left - (current_crc << 1u); - } - - return current_crc; -} - - -DML_CORE_OWN_INLINE(uint32_t, crc_byte_32u, ( uint32_t init_crc, - uint8_t next_byte, - uint32_t polynomial ) ) -{ - // Current crc value - uint32_t current_crc = init_crc ^ (next_byte << OWN_CRC32_BYTE_SHIFT); - - // Modulo-2 division bit by bit - for(uint8_t bit = 0u; bit < OWN_BYTE_BIT_LENGTH; ++bit) - { - // Check high bit - current_crc = (current_crc & OWN_HIGH_BIT_MASK_32U) ? 
- // If high bit is set - shift it to the left and XOR with polynomial - ((current_crc << 1u) ^ polynomial) : - // Else just shift it to the left - (current_crc << 1u); - } - - return current_crc; -} diff --git a/sources/cpp_api/CMakeLists.txt b/sources/cpp_api/CMakeLists.txt index 7f266d3..b47de96 100644 --- a/sources/cpp_api/CMakeLists.txt +++ b/sources/cpp_api/CMakeLists.txt @@ -14,32 +14,36 @@ # add_library(dmlhl STATIC - $ - $ # TODO: Remove - ) + $ + ) target_include_directories(dmlhl - PUBLIC $ - PUBLIC $ - ) + PUBLIC $ + PUBLIC $ + ) +target_sources(dmlhl + PRIVATE $ + ) +target_compile_features(dmlhl + PUBLIC cxx_std_17 + ) +target_compile_options(dmlhl + PRIVATE ${DML_QUALITY_OPTIONS} + ) -target_compile_features(dmlhl PUBLIC cxx_std_17) - -# TODO: Remove -if (DML_HW) +if(DML_HW) target_link_libraries(dmlhl PRIVATE ${CMAKE_DL_LIBS}) - target_compile_definitions(dmlhl PUBLIC DML_HW) endif() set_target_properties(dmlhl PROPERTIES - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - CXX_EXTENSIONS OFF - VERSION ${PROJECT_VERSION} - SOVERSION ${PROJECT_SOVERSION}) + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CXX_EXTENSIONS OFF + VERSION ${PROJECT_VERSION} + SOVERSION ${PROJECT_SOVERSION}) install(TARGETS dmlhl - EXPORT ${PROJECT_NAME}Targets - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + EXPORT ${PROJECT_NAME}Targets + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) diff --git a/sources/hw-path/include/own_dsa_accel_constants.h b/sources/hw-path/include/own_dsa_accel_constants.h deleted file mode 100644 index a7a138d..0000000 --- a/sources/hw-path/include/own_dsa_accel_constants.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright 2020-2021 Intel Corporation. 
- * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. - * - */ - -/** - * @brief Contains a constant, which are used to connect with hardware - * @date 3/23/2020 - * - */ - -#include - -#ifndef DML_DSA_ACCEL_CONFIG_H__ -#define DML_DSA_ACCEL_CONFIG_H__ - -#ifdef __cplusplus -extern "C" { -#endif - -#define DSA_DEVICE_ID ((uint32_t) (((uint32_t)0xFF << 24u) \ - | ((uint32_t)('a') << 16u) \ - | ((uint32_t)('s') << 8u) \ - | (uint32_t)('d'))) - -#define CHAR_MSK 0xFF202020 -#define MAX_DEVICE_COUNT 100u -#define MAX_WORK_QUEUE_COUNT 100u -#define OWN_PAGE_MASK 0x0FFFllu /**< Defines page mask for portal incrementing */ - -// General Capabilities Register unwrappers -#define GC_BLOCK_ON_FAULT_SUP(gen_cap) ((gen_cap >> 0u) & 0x0000000000000001ull) -#define GC_OVERLAPPING_COPY_SUPPORT(gen_cap) ((gen_cap >> 1u) & 0x0000000000000001ull) -#define GC_M_CACHE_CONTROL_SUPPORT(gen_cap) ((gen_cap >> 2u) & 0x0000000000000001ull) -#define GC_F_CACHE_CONTROL_SUPPORT(gen_cap) ((gen_cap >> 3u) & 0x0000000000000001ull) -#define GC_INTERRUPT_HANDLE_REQUEST(gen_cap) ((gen_cap >> 7u) & 0x0000000000000001ull) -#define GC_DESTINATION_READBACK_SUPPORT(gen_cap) ((gen_cap >> 8u) & 0x0000000000000001ull) -#define GC_DESCRIPTOR_READBACK_SUPPORT(gen_cap) ((gen_cap >> 9u) & 0x0000000000000001ull) -#define GC_MAX_TRANSFER_SIZE(gen_cap) (1u << ((gen_cap >> 16u) & 0x000000000000001Full)) -#define GC_MAX_BATCH_SIZE(gen_cap) (1u << ((gen_cap >> 21u) & 0x000000000000000Full)) -#define 
GC_MESSAGE_SIZE(gen_cap) (256u * ((gen_cap >> 25u) & 0x000000000000003Full)) -#define GC_CONFIGURATION_SUPPORT(gen_cap) ((gen_cap >> 31u) & 0x0000000000000001ull) -#define GC_MAX_DESCRIPTORS(gen_cap) ((gen_cap >> 32u) & 0x00000000000000FFull) - -static const char *DLL_NAME = "/usr/lib64/libaccel-config.so"; -static const char DEVICE_NAME[] = "dsa"; -static const uint32_t DEVICE_NAME_LENGTH = - sizeof(DEVICE_NAME) - 2u; //sizeof will return 4, position of terminating 0 is 3 - -#ifdef __cplusplus -} -#endif - -#endif //DML_DSA_ACCEL_CONFIG_H__ diff --git a/sources/middle_layer/CMakeLists.txt b/sources/middle_layer/CMakeLists.txt index c66a710..cc39eba 100644 --- a/sources/middle_layer/CMakeLists.txt +++ b/sources/middle_layer/CMakeLists.txt @@ -16,34 +16,32 @@ project(dml_middle_layer CXX) add_library(dml_middle_layer OBJECT - # Sources - awaiter.cpp - validation.cpp - core.cpp + src/operation.cpp + src/result.cpp + src/execution_path.cpp + src/validation.cpp - sw_path_legacy/dif.c - ) + ../../include/dml/detail/ml/options.hpp + ../../include/dml/detail/ml/operation.hpp + ../../include/dml/detail/ml/result.hpp + ../../include/dml/detail/ml/execution_path.hpp + ../../include/dml/detail/ml/validation.hpp + ) +target_link_libraries(dml_middle_layer + PRIVATE dml_core + ) target_include_directories(dml_middle_layer - PRIVATE ../../include - PRIVATE ../cores/include - PRIVATE dispatcher) - -target_compile_features(dml_middle_layer PUBLIC cxx_std_17) - -target_compile_definitions(dml_middle_layer PRIVATE $<$: DML_EFFICIENT_WAIT>) - -if (DML_HW) - target_sources(dml_middle_layer PRIVATE - dispatcher/hw_device.cpp - dispatcher/hw_dispatcher.cpp - dispatcher/hw_queue.cpp - dispatcher/numa.cpp - device.cpp - hw_configuration_driver.c + PUBLIC ../../include + ) +target_sources(dml_middle_layer + PUBLIC $ + PUBLIC $ + ) +target_compile_features(dml_middle_layer + PUBLIC cxx_std_17 + ) +target_compile_options(dml_middle_layer + PRIVATE ${DML_QUALITY_OPTIONS} + PRIVATE 
${DML_CPP_PRIVATE_OPTIONS} ) - target_include_directories(dml_middle_layer PRIVATE ../hw-path/include) - target_compile_definitions(dml_middle_layer - PUBLIC DML_HW - PRIVATE $<$: LIB_ACCEL_VERSION_3_2>) -endif () diff --git a/sources/middle_layer/awaiter.cpp b/sources/middle_layer/awaiter.cpp deleted file mode 100644 index 1ac06ca..0000000 --- a/sources/middle_layer/awaiter.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright 2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. 
- * - */ - -#include - -#if defined(linux) -#include -#else -#include -#include -#endif - -namespace dml::ml { - -#ifdef DML_EFFICIENT_WAIT - static inline uint64_t current_time() { - return __rdtsc(); - } - - static inline void monitor_address(volatile void *address) { - asm volatile(".byte 0xf3, 0x48, 0x0f, 0xae, 0xf0" : : "a"(address)); - } - - static inline int wait_until(unsigned long timeout, unsigned int state) { - uint8_t r = 0u; - auto timeout_low = static_cast(timeout); - auto timeout_high = static_cast(timeout >> 32); - - asm volatile(".byte 0xf2, 0x48, 0x0f, 0xae, 0xf1\t\n" - "setc %0\t\n" - : "=r"(r) - : "c"(state), "a"(timeout_low), "d"(timeout_high)); - - return r; - } -#endif - - awaiter::awaiter(volatile void *address, - uint8_t initial_value, - uint32_t period) noexcept - : address_ptr_(reinterpret_cast(address)), - period_(period), - initial_value_(initial_value) { - // Empty constructor - } - - awaiter::~awaiter() noexcept { -#ifdef DML_EFFICIENT_WAIT - while (initial_value_ == *address_ptr_) { - monitor_address(address_ptr_); - - auto start = current_time(); - wait_until(start + period_, idle_state_); - } -#else - while (initial_value_ == *address_ptr_) { - _mm_pause(); - } -#endif - } -} diff --git a/sources/middle_layer/core.cpp b/sources/middle_layer/core.cpp deleted file mode 100644 index af3b801..0000000 --- a/sources/middle_layer/core.cpp +++ /dev/null @@ -1,780 +0,0 @@ -/* - * Copyright 2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. 
- * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. - * - */ - -#include "dml/cpp/middle_layer/core.hpp" - -#include - -#include "core_api.h" -//#include "dif.hpp" -#include "dml/cpp/middle_layer/descriptor_views.hpp" -#include "dml/cpp/middle_layer/options.hpp" -#include "dml/cpp/middle_layer/result_views.hpp" - -// TODO: Only for DIFs -#include - -#include - -#include "sw_path_legacy/dif.h" - -namespace dml::ml::core -{ - static inline void write_status(execution_status from_status, status_t &to_status) noexcept - { - _mm_sfence(); - to_status = static_cast(from_status); - } - - static inline execution_status evaluate(views::nop_descriptor dsc) noexcept; - - static inline execution_status evaluate(views::batch_descriptor dsc) noexcept; - - static inline execution_status evaluate(views::drain_descriptor dsc) noexcept; - - static inline execution_status evaluate(views::mem_move_descriptor dsc) noexcept; - - static inline execution_status evaluate(views::fill_descriptor dsc) noexcept; - - static inline execution_status evaluate(views::compare_descriptor dsc) noexcept; - - static inline execution_status evaluate(views::compare_pattern_descriptor dsc) noexcept; - - static inline execution_status evaluate(views::create_delta_descriptor dsc) noexcept; - - static inline execution_status evaluate(views::apply_delta_descriptor dsc) noexcept; - - static inline execution_status evaluate(views::dualcast_descriptor dsc) noexcept; - - static inline execution_status evaluate(views::crc_descriptor dsc) noexcept; - - static inline execution_status evaluate(views::copy_crc_descriptor dsc) noexcept; - - static inline execution_status evaluate(views::dif_check_descriptor dsc) noexcept; - - static inline execution_status evaluate(views::dif_insert_descriptor dsc) noexcept; - - static inline execution_status evaluate(views::dif_strip_descriptor dsc) noexcept; - - 
static inline execution_status evaluate(views::dif_update_descriptor dsc) noexcept; - - static inline execution_status evaluate(views::cache_flush_descriptor dsc) noexcept; - - execution_status submit(descriptor &dsc) noexcept - { - auto view = views::any_descriptor(dsc); - - switch (static_cast(view.operation())) - { - case operation::nop: - return core::evaluate(views::nop_descriptor(dsc)); - break; - case operation::batch: - return core::evaluate(views::batch_descriptor(dsc)); - break; - case operation::drain: - return core::evaluate(views::drain_descriptor(dsc)); - break; - case operation::memory_move: - return core::evaluate(views::mem_move_descriptor(dsc)); - break; - case operation::fill: - return core::evaluate(views::fill_descriptor(dsc)); - break; - case operation::compare: - return core::evaluate(views::compare_descriptor(dsc)); - break; - case operation::compare_pattern: - return core::evaluate(views::compare_pattern_descriptor(dsc)); - break; - case operation::create_delta: - return core::evaluate(views::create_delta_descriptor(dsc)); - break; - case operation::apply_delta: - return core::evaluate(views::apply_delta_descriptor(dsc)); - break; - case operation::dualcast: - return core::evaluate(views::dualcast_descriptor(dsc)); - break; - case operation::crc: - return core::evaluate(views::crc_descriptor(dsc)); - break; - case operation::copy_crc: - return core::evaluate(views::copy_crc_descriptor(dsc)); - break; - case operation::dif_check: - return core::evaluate(views::dif_check_descriptor(dsc)); - break; - case operation::dif_insert: - return core::evaluate(views::dif_insert_descriptor(dsc)); - break; - case operation::dif_strip: - return core::evaluate(views::dif_strip_descriptor(dsc)); - break; - case operation::dif_update: - return core::evaluate(views::dif_update_descriptor(dsc)); - break; - case operation::cache_flush: - return core::evaluate(views::cache_flush_descriptor(dsc)); - break; - default: - return execution_status::unexpected; - } - } 
- - static inline execution_status evaluate(views::nop_descriptor dsc) noexcept - { - auto final_status = execution_status::success; - if (dsc.completion_record_address()) - { - auto record = reinterpret_cast(dsc.completion_record_address()); - auto result = views::nop_result(*record); - - write_status(final_status, result.status()); - } - - return final_status; - } - - static inline execution_status evaluate(views::drain_descriptor dsc) noexcept - { - auto final_status = execution_status::success; - if (dsc.completion_record_address()) - { - auto record = reinterpret_cast(dsc.completion_record_address()); - auto result = views::drain_result(*record); - - write_status(final_status, result.status()); - } - - return final_status; - } - - static inline execution_status evaluate(views::mem_move_descriptor dsc) noexcept - { - const auto src = reinterpret_cast(dsc.source_address()); - const auto dst = reinterpret_cast(dsc.destination_address()); - const auto transfer_size = dsc.transfer_size(); - - const auto status = dmlc_move_8u(src, dst, transfer_size); - - auto final_status = (status == DML_STATUS_OK) ? execution_status::success : execution_status::unexpected; - if (dsc.completion_record_address()) - { - auto record = reinterpret_cast(dsc.completion_record_address()); - auto result = views::mem_move_result(*record); - - write_status(final_status, result.status()); - } - - return final_status; - } - - static inline execution_status evaluate(views::fill_descriptor dsc) noexcept - { - const auto pattern = dsc.pattern(); - const auto dst = reinterpret_cast(dsc.destination_address()); - const auto transfer_size = dsc.transfer_size(); - - const auto status = dmlc_fill_with_pattern_8u(pattern, dst, transfer_size); - - auto final_status = (status == DML_STATUS_OK) ? 
execution_status::success : execution_status::unexpected; - if (dsc.completion_record_address()) - { - auto record = reinterpret_cast(dsc.completion_record_address()); - auto result = views::fill_result(*record); - - write_status(final_status, result.status()); - } - - return final_status; - } - - static inline execution_status evaluate(views::compare_descriptor dsc) noexcept - { - const auto src1 = reinterpret_cast(dsc.source_1_address()); - const auto src2 = reinterpret_cast(dsc.source_2_address()); - const auto transfer_size = dsc.transfer_size(); - const auto expected_result = dsc.expected_result(); - const auto options = compare_options(dsc.flags()); - - auto mismatch = transfer_size_t(0); - - const auto status = dmlc_compare_8u(src1, src2, transfer_size, &mismatch); - - result_t actual_result; - if (status == DML_COMPARE_STATUS_EQ) - { - actual_result = 0; - } - else if (status == DML_COMPARE_STATUS_NE) - { - actual_result = 1; - } - else - { - if (dsc.completion_record_address()) - { - auto record = reinterpret_cast(dsc.completion_record_address()); - auto result = views::compare_result(*record); - write_status(execution_status::unexpected, result.status()); - } - return execution_status::unexpected; - } - - auto final_status = options.contains(compare_option::check_result) - ? (expected_result == actual_result) ? 
execution_status::success : execution_status::false_predicate_success - : execution_status::success; - - if (dsc.completion_record_address()) - { - auto record = reinterpret_cast(dsc.completion_record_address()); - auto result = views::compare_result(*record); - - result.result() = actual_result; - result.bytes_completed() = mismatch; - - write_status(final_status, result.status()); - } - - return final_status; - } - - static inline execution_status evaluate(views::compare_pattern_descriptor dsc) noexcept - { - const auto pattern = dsc.pattern(); - const auto src = reinterpret_cast(dsc.source_address()); - const auto transfer_size = dsc.transfer_size(); - const auto expected_result = dsc.expected_result(); - const auto options = compare_options(dsc.flags()); - - auto mismatch = transfer_size_t(0); - - const auto status = dmlc_compare_with_pattern_8u(src, pattern, transfer_size, &mismatch); - - result_t actual_result; - if (status == DML_COMPARE_STATUS_EQ) - { - actual_result = 0; - } - else if (status == DML_COMPARE_STATUS_NE) - { - actual_result = 1; - } - else - { - if (dsc.completion_record_address()) - { - auto record = reinterpret_cast(dsc.completion_record_address()); - auto result = views::compare_result(*record); - write_status(execution_status::unexpected, result.status()); - } - return execution_status::unexpected; - } - - auto final_status = options.contains(compare_option::check_result) - ? (expected_result == actual_result) ? 
execution_status::success : execution_status::false_predicate_success - : execution_status::success; - - if (dsc.completion_record_address()) - { - auto record = reinterpret_cast(dsc.completion_record_address()); - auto result = views::compare_result(*record); - - result.result() = actual_result; - result.bytes_completed() = mismatch; - write_status(final_status, result.status()); - } - - return final_status; - } - - static inline execution_status evaluate(views::create_delta_descriptor dsc) noexcept - { - const auto src1 = reinterpret_cast(dsc.source_1_address()); - const auto src2 = reinterpret_cast(dsc.source_2_address()); - const auto delta_record = reinterpret_cast(dsc.delta_record_address()); - const auto delta_max_size = dsc.maximum_delta_record_size(); - const auto transfer_size = dsc.transfer_size(); - const auto expected_result = dsc.expected_result_mask(); - const auto options = create_delta_options(dsc.flags()); - - auto delta_record_size = transfer_size_t(0); - - // Flip src1 and src2 due to core differs from hardware spec - const auto status = dmlc_create_delta_record_8u(src2, src1, transfer_size, delta_max_size, delta_record, &delta_record_size); - - result_t actual_result; - if (status == DML_STATUS_DELTA_RECORD_SIZE_ERROR) - { - actual_result = static_cast(delta_expected_result_option::expect_overflow); - } - else if (status == DML_STATUS_OK) - { - actual_result = delta_record_size ? static_cast(delta_expected_result_option::expect_not_equal) - : static_cast(delta_expected_result_option::expect_equal); - } - else - { - if (dsc.completion_record_address()) - { - auto record = reinterpret_cast(dsc.completion_record_address()); - auto result = views::create_delta_result(*record); - - write_status(execution_status::unexpected, result.status()); - } - return execution_status::unexpected; - } - - auto final_status = options.contains(compare_option::check_result) - ? (expected_result == actual_result) ? 
execution_status::success : execution_status::false_predicate_success - : execution_status::success; - - if (dsc.completion_record_address()) - { - auto record = reinterpret_cast(dsc.completion_record_address()); - auto result = views::create_delta_result(*record); - - result.delta_record_size() = delta_record_size; - result.result() = actual_result >> 1; // Hack, because of result and result mask uses different values - write_status(final_status, result.status()); - } - - return final_status; - } - - static inline execution_status evaluate(views::apply_delta_descriptor dsc) noexcept - { - const auto dst = reinterpret_cast(dsc.destination_address()); - const auto delta_record = reinterpret_cast(dsc.delta_record_address()); - const auto delta_size = dsc.delta_record_size(); - const auto transfer_size = dsc.transfer_size(); - - const auto status = dmlc_apply_delta_record_8u(dst, delta_record, transfer_size, delta_size); - - auto final_status = status == DML_STATUS_OK ? execution_status::success : execution_status::unexpected; - if (dsc.completion_record_address()) - { - auto record = reinterpret_cast(dsc.completion_record_address()); - auto result = views::apply_delta_result(*record); - - write_status(final_status, result.status()); - } - - return final_status; - } - - static inline execution_status evaluate(views::dualcast_descriptor dsc) noexcept - { - const auto src = reinterpret_cast(dsc.source_address()); - const auto dst1 = reinterpret_cast(dsc.destination_1_address()); - const auto dst2 = reinterpret_cast(dsc.destination_2_address()); - const auto transfer_size = dsc.transfer_size(); - - const auto status = dmlc_dualcast_copy_8u(src, dst1, dst2, transfer_size); - - auto final_status = status == DML_STATUS_OK ? 
execution_status::success : execution_status::unexpected; - if (dsc.completion_record_address()) - { - auto record = reinterpret_cast(dsc.completion_record_address()); - auto result = views::dualcast_result(*record); - - write_status(final_status, result.status()); - } - - return final_status; - } - - static inline execution_status evaluate(views::crc_descriptor dsc) noexcept - { - const auto src = reinterpret_cast(dsc.source_address()); - const auto transfer_size = dsc.transfer_size(); - const auto crc_seed = dsc.crc_seed(); - const auto options = crc_additional_options(dsc.operation_specific_flags()); - - constexpr auto polynomial = 0x1EDC6F41u; - auto reverse = [](uint32_t value) - { - value = (value & 0x55555555u) << 1u | (value & 0xAAAAAAAAu) >> 1u; - value = (value & 0x33333333u) << 2u | (value & 0xCCCCCCCCu) >> 2u; - value = (value & 0x0F0F0F0Fu) << 4u | (value & 0xF0F0F0F0u) >> 4u; - value = (value & 0x00FF00FFu) << 8u | (value & 0xFF00FF00u) >> 8u; - value = (value & 0x0000FFFFu) << 16u | (value & 0xFFFF0000u) >> 16u; - - return value; - }; - - const auto bypass_reflection = options.contains(crc_additional_option::bypass_reflection); - const auto bypass_data_reflection = options.contains(crc_additional_option::bypass_data_reflection); - - auto crc_value = crc_seed; - - // Bypass inversion and use reverse bit order for CRC completion_record - if (!bypass_reflection) - { - crc_value = ~(crc_value); - crc_value = reverse(crc_value); - } - - // Bypass Data Reflection in case if DML_FLAG_DATA_REFLECTION set - auto status = (!bypass_data_reflection) ? dmlc_calculate_crc_reflected_32u(src, transfer_size, &crc_value, polynomial) - : dmlc_calculate_crc_32u(src, transfer_size, &crc_value, polynomial); - - // Bypass inversion and use reverse bit order for CRC completion_record - if (!bypass_reflection) - { - crc_value = reverse(crc_value); - crc_value = ~(crc_value); - } - - auto final_status = status == DML_STATUS_OK ? 
execution_status::success : execution_status::unexpected; - if (dsc.completion_record_address()) - { - auto record = reinterpret_cast(dsc.completion_record_address()); - auto result = views::crc_result(*record); - - result.crc_value() = crc_value; - write_status(final_status, result.status()); - } - - return final_status; - } - - static inline execution_status evaluate(views::copy_crc_descriptor dsc) noexcept - { - const auto src = reinterpret_cast(dsc.source_address()); - const auto dst = reinterpret_cast(dsc.destination_address()); - const auto transfer_size = dsc.transfer_size(); - const auto crc_seed = dsc.crc_seed(); - const auto options = crc_additional_options(dsc.operation_specific_flags()); - - { - const auto status = dmlc_move_8u(src, dst, transfer_size); - - if (status != DML_STATUS_OK) - { - auto final_status = execution_status::unexpected; - if (dsc.completion_record_address()) - { - auto record = reinterpret_cast(dsc.completion_record_address()); - auto result = views::any_result(*record); - - write_status(final_status, result.status()); - } - return final_status; - } - } - - constexpr auto polynomial = 0x1EDC6F41u; - auto reverse = [](uint32_t value) - { - value = (value & 0x55555555u) << 1u | (value & 0xAAAAAAAAu) >> 1u; - value = (value & 0x33333333u) << 2u | (value & 0xCCCCCCCCu) >> 2u; - value = (value & 0x0F0F0F0Fu) << 4u | (value & 0xF0F0F0F0u) >> 4u; - value = (value & 0x00FF00FFu) << 8u | (value & 0xFF00FF00u) >> 8u; - value = (value & 0x0000FFFFu) << 16u | (value & 0xFFFF0000u) >> 16u; - - return value; - }; - - const auto bypass_reflection = options.contains(crc_additional_option::bypass_reflection); - const auto bypass_data_reflection = options.contains(crc_additional_option::bypass_data_reflection); - - auto crc_value = crc_seed; - - // Bypass inversion and use reverse bit order for CRC completion_record - if (!bypass_reflection) - { - crc_value = ~(crc_value); - crc_value = reverse(crc_value); - } - - // Bypass Data Reflection in case if 
DML_FLAG_DATA_REFLECTION set - const auto status = (!bypass_data_reflection) ? dmlc_calculate_crc_reflected_32u(src, transfer_size, &crc_value, polynomial) - : dmlc_calculate_crc_32u(src, transfer_size, &crc_value, polynomial); - - // Bypass inversion and use reverse bit order for CRC completion_record - if (!bypass_reflection) - { - crc_value = reverse(crc_value); - crc_value = ~(crc_value); - } - - auto final_status = status == DML_STATUS_OK ? execution_status::success : execution_status::unexpected; - if (dsc.completion_record_address()) - { - auto record = reinterpret_cast(dsc.completion_record_address()); - auto result = views::crc_result(*record); - - result.crc_value() = crc_value; - write_status(final_status, result.status()); - } - - return final_status; - } - - static inline execution_status evaluate(views::dif_check_descriptor dsc) noexcept - { - constexpr uint32_t dif_block_sizes[4] = { 512u, 520u, 4096u, 4104u }; - - const auto src = reinterpret_cast(dsc.source_address()); - const auto transfer_size = dsc.transfer_size(); - const auto options = dif_check_options(dsc.flags()); - const auto dif_options = dif_additional_options(dsc.dif_flags()); - const auto dif_src_options = dif_additional_src_options(dsc.source_dif_flags()); - const auto src_app_tag_mask = dsc.source_app_tag_mask(); - - const auto src_ref_tag = dsc.source_ref_tag(); - const auto src_app_tag = dsc.source_app_tag(); - - dml_job_t job; - memset(&job, 0, sizeof(dml_job_t)); - job.source_first_ptr = src; - job.source_length = transfer_size; - job.operation = DML_OP_DIF_CHECK; - job.dif_config.source_reference_tag_seed = src_ref_tag; - job.dif_config.source_application_tag_seed = src_app_tag; - job.dif_config.source_application_tag_mask = src_app_tag_mask; - job.dif_config.block_size = static_cast(static_cast(dif_options) & 0b11); - - // Job API composes DIF flags into one 64-bit value via shifting, check dmldefs.h - job.dif_config.flags = - (static_cast(static_cast(dif_options)) << 16) | 
(static_cast(dif_src_options)); - - job.flags = static_cast(options); - - auto status = dml_legacy_dif_check(&job); - - // Unsupported operation - auto final_status = (status == DML_STATUS_OK) ? execution_status::success : execution_status::dif_control_error; - if (dsc.completion_record_address()) - { - auto record = reinterpret_cast(dsc.completion_record_address()); - auto result = views::dif_check_result(*record); - - result.dif_status() = job.result; - result.bytes_completed() = job.offset; - // TODO: Should also write values for tags - write_status(final_status, result.status()); - } - - return final_status; - } - - static inline execution_status evaluate(views::dif_insert_descriptor dsc) noexcept - { - const auto src = reinterpret_cast(dsc.source_address()); - const auto dst = reinterpret_cast(dsc.destination_address()); - const auto transfer_size = dsc.transfer_size(); - const auto options = dif_insert_options(dsc.flags()); - const auto dif_options = dif_additional_options(dsc.dif_flags()); - const auto dif_dst_options = dif_additional_src_options(dsc.destination_dif_flags()); - const auto dst_app_tag_mask = dsc.destination_app_tag_mask(); - - auto dst_ref_tag = dsc.destination_ref_tag(); - auto dst_app_tag = dsc.destination_app_tag(); - - dml_job_t job; - memset(&job, 0, sizeof(dml_job_t)); - job.source_first_ptr = src; - job.destination_first_ptr = dst; - job.source_length = transfer_size; - job.operation = DML_OP_DIF_INSERT; - job.dif_config.destination_reference_tag_seed = dst_ref_tag; - job.dif_config.destination_application_tag_seed = dst_app_tag; - job.dif_config.destination_application_tag_mask = dst_app_tag_mask; - job.dif_config.block_size = static_cast(static_cast(dif_options) & 0b11); - - // Job API composes DIF flags into one 64-bit value via shifting, check dmldefs.h - job.dif_config.flags = - (static_cast(static_cast(dif_options)) << 16) | (static_cast(dif_dst_options) << 8); - - job.flags = static_cast(options); - - auto status = 
dml_legacy_dif_insert(&job); - - // Unsupported operation - auto final_status = (status == DML_STATUS_OK) ? execution_status::success : execution_status::dif_control_error; - if (dsc.completion_record_address()) - { - auto record = reinterpret_cast(dsc.completion_record_address()); - auto result = views::dif_insert_result(*record); - - result.bytes_completed() = job.offset; - // TODO: Should also write values for tags - write_status(final_status, result.status()); - } - - return final_status; - } - - static inline execution_status evaluate(views::dif_strip_descriptor dsc) noexcept - { - const auto src = reinterpret_cast(dsc.source_address()); - const auto dst = reinterpret_cast(dsc.destination_address()); - const auto transfer_size = dsc.transfer_size(); - const auto options = dif_strip_options(dsc.flags()); - const auto dif_options = dif_additional_options(dsc.dif_flags()); - const auto dif_src_options = dif_additional_src_options(dsc.source_dif_flags()); - const auto src_app_tag_mask = dsc.source_app_tag_mask(); - - auto src_ref_tag = dsc.source_ref_tag(); - auto src_app_tag = dsc.source_app_tag(); - - dml_job_t job; - memset(&job, 0, sizeof(dml_job_t)); - job.source_first_ptr = src; - job.destination_first_ptr = dst; - job.source_length = transfer_size; - job.operation = DML_OP_DIF_STRIP; - job.dif_config.source_reference_tag_seed = src_ref_tag; - job.dif_config.source_application_tag_seed = src_app_tag; - job.dif_config.source_application_tag_mask = src_app_tag_mask; - job.dif_config.block_size = static_cast(static_cast(dif_options) & 0b11); - - // Job API composes DIF flags into one 64-bit value via shifting, check dmldefs.h - job.dif_config.flags = - (static_cast(static_cast(dif_options)) << 16) | (static_cast(dif_src_options)); - - job.flags = static_cast(options); - - auto status = dml_legacy_dif_strip(&job); - - // Unsupported operation - auto final_status = (status == DML_STATUS_OK) ? 
execution_status::success : execution_status::dif_control_error; - if (dsc.completion_record_address()) - { - auto record = reinterpret_cast(dsc.completion_record_address()); - auto result = views::dif_strip_result(*record); - - result.dif_status() = job.result; - result.bytes_completed() = job.offset; - // TODO: Should also write values for tags - write_status(final_status, result.status()); - } - - return final_status; - } - - static inline execution_status evaluate(views::dif_update_descriptor dsc) noexcept - { - const auto src = reinterpret_cast(dsc.source_address()); - const auto dst = reinterpret_cast(dsc.destination_address()); - const auto transfer_size = dsc.transfer_size(); - const auto options = dif_update_options(dsc.flags()); - const auto dif_options = dif_additional_options(dsc.dif_flags()); - const auto dif_src_options = dif_additional_src_options(dsc.source_dif_flags()); - const auto dif_dst_options = dif_additional_src_options(dsc.destination_dif_flags()); - const auto src_app_tag_mask = dsc.source_app_tag_mask(); - const auto dst_app_tag_mask = dsc.destination_app_tag_mask(); - - auto src_ref_tag = dsc.source_ref_tag(); - auto dst_ref_tag = dsc.destination_ref_tag(); - auto src_app_tag = dsc.source_app_tag(); - auto dst_app_tag = dsc.destination_app_tag(); - - dml_job_t job; - memset(&job, 0, sizeof(dml_job_t)); - job.source_first_ptr = src; - job.destination_first_ptr = dst; - job.source_length = transfer_size; - job.operation = DML_OP_DIF_UPDATE; - job.dif_config.source_reference_tag_seed = src_ref_tag; - job.dif_config.source_application_tag_seed = src_app_tag; - job.dif_config.source_application_tag_mask = src_app_tag_mask; - job.dif_config.destination_reference_tag_seed = dst_ref_tag; - job.dif_config.destination_application_tag_seed = dst_app_tag; - job.dif_config.destination_application_tag_mask = dst_app_tag_mask; - job.dif_config.block_size = static_cast(static_cast(dif_options) & 0b11); - - // Job API composes DIF flags into one 64-bit 
value via shifting, check dmldefs.h - job.dif_config.flags = (static_cast(static_cast(dif_options)) << 16) | - (static_cast(dif_dst_options) << 8) | (static_cast(dif_src_options)); - - job.flags = static_cast(options); - - auto status = dml_legacy_dif_update(&job); - - // Unsupported operation - auto final_status = (status == DML_STATUS_OK) ? execution_status::success : execution_status::dif_control_error; - if (dsc.completion_record_address()) - { - auto record = reinterpret_cast(dsc.completion_record_address()); - auto result = views::dif_update_result(*record); - - result.dif_status() = job.result; - result.bytes_completed() = job.offset; - // TODO: Should also write values for tags - write_status(final_status, result.status()); - } - - return final_status; - } - - static inline execution_status evaluate(views::cache_flush_descriptor dsc) noexcept - { - const auto dst = reinterpret_cast(dsc.destination_address()); - const auto transfer_size = dsc.transfer_size(); - const auto options = cache_flush_options(dsc.operation_specific_flags()); - - auto status = options.contains(cache_flush_option::cache_control) ? dmlc_copy_cache_to_memory_8u(dst, transfer_size) - : dmlc_move_cache_to_memory_8u(dst, transfer_size); - - auto final_status = status == DML_STATUS_OK ? 
execution_status::success : execution_status::unexpected; - if (dsc.completion_record_address()) - { - auto record = reinterpret_cast(dsc.completion_record_address()); - auto result = views::cache_flush_result(*record); - - write_status(final_status, result.status()); - } - - return final_status; - } - - static inline execution_status evaluate(views::batch_descriptor dsc) noexcept - { - const auto operations = reinterpret_cast(dsc.descriptor_list_address()); - const auto descriptors_count = dsc.descriptors_count(); - - auto final_status = execution_status::success; - auto index = transfer_size_t(0); - for (index = 0; index < descriptors_count; ++index) - { - auto &op_dsc = operations[index]; - - auto status = submit(op_dsc); - - if (status != execution_status::success) - { - final_status = status; - break; - } - } - - if (dsc.completion_record_address()) - { - auto record = reinterpret_cast(dsc.completion_record_address()); - auto result = views::batch_result(*record); - - result.descriptors_completed() = index; - write_status(final_status, result.status()); - } - - return final_status; - } - -} // namespace dml::ml::core diff --git a/sources/middle_layer/device.cpp b/sources/middle_layer/device.cpp deleted file mode 100644 index 073e26a..0000000 --- a/sources/middle_layer/device.cpp +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright 2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. 
- * - */ - -#include -#include -#include - -namespace dml::ml -{ -#ifdef DML_HW - submission_status hardware::submit(descriptor &dsc, completion_record &record) noexcept - { - static auto &dispatcher_instance = dispatcher::hw_dispatcher::get_instance(); - static thread_local int32_t numa_id = util::get_numa_id(); - - if (dispatcher_instance.is_hw_support()) - { - const auto n_devices = std::distance(dispatcher_instance.begin(), dispatcher_instance.end()); - - // Initially set to "end" index - static auto last_device_idx = std::atomic(n_devices); - - // Loop FROM the device after the one used for last submit - for (auto device_idx = last_device_idx.load() + 1; device_idx < n_devices; ++device_idx) - { - auto &device = *(dispatcher_instance.begin() + device_idx); - if (device.numa_id() != numa_id) - { - continue; - } - - auto view = views::any_descriptor(dsc); - view.flags() |= - static_cast(flag::completion_record_address_valid) | static_cast(flag::request_completion_record); - - // Use BlockOnFault on hardware, until page fault handling is implemented in software side - if (view.operation() != static_cast(operation::batch) && - view.operation() != static_cast(operation::drain) && - view.operation() != static_cast(operation::nop)) - { - view.flags() |= static_cast(flag::block_on_fault); - } - - view.completion_record_address() = reinterpret_cast(&record); - record.bytes[0] = 0; - - auto status = device.enqueue_descriptor(reinterpret_cast(&dsc)); - - if (status == DML_STATUS_OK) - { - last_device_idx = device_idx; - return submission_status::success; - } - } - - // If the loop before didn't submit descriptor, then loop UNTIL the device that was used for last submit - for (auto device_idx = 0; device_idx <= last_device_idx; ++device_idx) - { - auto &device = *(dispatcher_instance.begin() + device_idx); - if (device.numa_id() != numa_id) - { - continue; - } - - auto view = views::any_descriptor(dsc); - view.flags() |= - 
static_cast(flag::completion_record_address_valid) | static_cast(flag::request_completion_record); - - // Use BlockOnFault on hardware, until page fault handling is implemented in software side - if (view.operation() != static_cast(operation::batch) && - view.operation() != static_cast(operation::drain) && - view.operation() != static_cast(operation::nop)) - { - view.flags() |= static_cast(flag::block_on_fault); - } - - view.completion_record_address() = reinterpret_cast(&record); - record.bytes[0] = 0; - - auto status = device.enqueue_descriptor(reinterpret_cast(&dsc)); - - if (status == DML_STATUS_OK) - { - last_device_idx = device_idx; - return submission_status::success; - } - } - } - - return submission_status::failure; - } -#endif -} // namespace dml::ml diff --git a/sources/middle_layer/dispatcher/hw_device.cpp b/sources/middle_layer/dispatcher/hw_device.cpp deleted file mode 100644 index 06acace..0000000 --- a/sources/middle_layer/dispatcher/hw_device.cpp +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Copyright 2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. 
- * - */ - -#ifdef DML_HW - -#include - -#include "hw_device.hpp" -#include "hardware_configuration_driver.h" -#include "own_dsa_accel_constants.h" - -static inline bool own_search_device_name(const char *src_ptr, - const uint32_t name, - const uint32_t name_size) noexcept { - const uint8_t null_terminator = '\0'; - - for (size_t symbol_idx = 0u; null_terminator != src_ptr[symbol_idx + name_size]; symbol_idx++) { - const auto *candidate_ptr = reinterpret_cast(src_ptr + symbol_idx); - - // Convert the first 3 bytes to lower case and make the 4th 0xff - if (name == (*candidate_ptr | CHAR_MSK)) { - return true; - } - } - - return false; -} - -namespace dml::ml::dispatcher { - -void hw_device::fill_hw_context(dsahw_context_t *const hw_context_ptr) const noexcept { - // Restore device properties - hw_context_ptr->gen_cap.block_on_fault_support = hw_device::block_on_fault_support(); - hw_context_ptr->gen_cap.overlapping_copy_support = hw_device::overlapping_copy_support(); - hw_context_ptr->gen_cap.memory_cache_control_support = hw_device::memory_cache_control_support(); - hw_context_ptr->gen_cap.flush_cache_control_support = hw_device::flush_cache_control_support(); - hw_context_ptr->gen_cap.interrupt_handle_request = hw_device::interrupt_handle_request(); - hw_context_ptr->gen_cap.destination_readback_support = hw_device::destination_readback_support(); - hw_context_ptr->gen_cap.descriptor_readback_support = hw_device::descriptor_readback_support(); - hw_context_ptr->gen_cap.max_transfer_size = hw_device::max_transfer_size(); - hw_context_ptr->gen_cap.max_batch_size = hw_device::max_batch_size(); - hw_context_ptr->gen_cap.message_size = hw_device::message_size(); - hw_context_ptr->gen_cap.configuration_support = hw_device::configuration_support(); - hw_context_ptr->gen_cap.max_descriptors = hw_device::max_descriptors(); -} - -auto hw_device::enqueue_descriptor(const dsahw_descriptor_t *desc_ptr) const noexcept -> dsahw_status_t { - const auto n_queues = 
std::distance(this->begin(), this->end()); - - // Initially set to "end" index - static auto last_wq_idx = std::atomic(n_queues); - - // Loop FROM the queue after the one used for last submit - for (auto idx = last_wq_idx.load() + 1; idx < n_queues; ++idx) - { - auto &queue = *(this->begin() + idx); - auto status = queue.enqueue_descriptor(desc_ptr); - - if (DML_STATUS_OK == status) { - last_wq_idx = idx; - return DML_STATUS_OK; - } - } - - // If the loop before didn't submit descriptor, then loop UNTIL the queue that was used for last submit - for (auto idx = 0; idx <= last_wq_idx; ++idx) - { - auto &queue = *(this->begin() + idx); - auto status = queue.enqueue_descriptor(desc_ptr); - - if (DML_STATUS_OK == status) { - last_wq_idx = idx; - return DML_STATUS_OK; - } - } - - return DML_STATUS_INSTANCE_NOT_FOUND; -} - -auto hw_device::block_on_fault_support() const noexcept -> uint8_t { - return GC_BLOCK_ON_FAULT_SUP(gen_cap_register_); -} - -auto hw_device::overlapping_copy_support() const noexcept -> uint8_t { - return GC_OVERLAPPING_COPY_SUPPORT(gen_cap_register_); -} - -auto hw_device::memory_cache_control_support() const noexcept -> uint8_t { - return GC_M_CACHE_CONTROL_SUPPORT(gen_cap_register_); -} - -auto hw_device::flush_cache_control_support() const noexcept -> uint8_t { - return GC_F_CACHE_CONTROL_SUPPORT(gen_cap_register_); -} - -auto hw_device::interrupt_handle_request() const noexcept -> uint8_t { - return GC_INTERRUPT_HANDLE_REQUEST(gen_cap_register_); -} - -auto hw_device::destination_readback_support() const noexcept -> uint8_t { - return GC_DESTINATION_READBACK_SUPPORT(gen_cap_register_); -} - -auto hw_device::descriptor_readback_support() const noexcept -> uint8_t { - return GC_DESCRIPTOR_READBACK_SUPPORT(gen_cap_register_); -} - -auto hw_device::max_transfer_size() const noexcept -> uint32_t { - return GC_MAX_TRANSFER_SIZE(gen_cap_register_); -} - -auto hw_device::max_batch_size() const noexcept -> uint32_t { - return 
GC_MAX_BATCH_SIZE(gen_cap_register_); -} - -auto hw_device::message_size() const noexcept -> uint16_t { - return GC_MESSAGE_SIZE(gen_cap_register_); -} - -auto hw_device::configuration_support() const noexcept -> uint8_t { - return GC_CONFIGURATION_SUPPORT(gen_cap_register_); -} - -auto hw_device::max_descriptors() const noexcept -> uint8_t { - return GC_MAX_DESCRIPTORS(gen_cap_register_); -} - -auto hw_device::initialize_new_device(descriptor_t *device_descriptor_ptr) noexcept -> dsahw_status_t { -#if defined(linux) - // Device initialization stage - auto *device_ptr = reinterpret_cast(device_descriptor_ptr); - const auto *name_ptr = dsa_device_get_name(device_ptr); - const bool is_dsa_device = own_search_device_name(name_ptr, DSA_DEVICE_ID, DEVICE_NAME_LENGTH); - - if (!is_dsa_device || ACCFG_DEVICE_DISABLED == dsa_device_get_state(device_ptr)) { - return DML_STATUS_INSTANCE_NOT_FOUND; - } - - gen_cap_register_ = dsa_device_get_gen_cap_register(device_ptr); - version_ = dsa_device_get_major_version(device_ptr); - numa_node_id_ = dsa_device_get_numa_node(device_ptr); - - dsa_group_get_first(device_ptr); - - // Working queues initialization stage - auto *wq_ptr = dsa_get_first_work_queue(device_ptr); - auto wq_it = working_queues_.begin(); - - while (nullptr != wq_ptr) { - if (DML_STATUS_OK == wq_it->initialize_new_queue(wq_ptr, version_)) { - wq_it++; - - std::push_heap(working_queues_.begin(), wq_it, - [](const hw_queue &a, const hw_queue &b) -> bool { - return a.priority() < b.priority(); - }); - } - - wq_ptr = dsa_work_queue_get_next(wq_ptr); - } - - // Check number of working queues - queue_count_ = std::distance(working_queues_.begin(), wq_it); - - if (queue_count_ > 1) { - auto begin = working_queues_.begin(); - auto end = begin + queue_count_; - - std::sort_heap(begin, end, [](const hw_queue &a, const hw_queue &b) -> bool { - return a.priority() < b.priority(); - }); - } - - if (queue_count_ == 0) { - return DML_STATUS_WORK_QUEUES_NOT_AVAILABLE; - } - - 
return DML_STATUS_OK; -#else - return DML_STATUS_INSTANCE_NOT_FOUND; -#endif -} - -auto hw_device::size() const noexcept -> size_t { - return queue_count_; -} - -auto hw_device::numa_id() const noexcept -> uint64_t { - return numa_node_id_; -} - -auto hw_device::begin() const noexcept -> queues_container_t::const_iterator { - return working_queues_.cbegin(); -} - -auto hw_device::end() const noexcept -> queues_container_t::const_iterator { - return working_queues_.cbegin() + queue_count_; -} - -} - -#endif diff --git a/sources/middle_layer/dispatcher/hw_device.hpp b/sources/middle_layer/dispatcher/hw_device.hpp deleted file mode 100644 index b364825..0000000 --- a/sources/middle_layer/dispatcher/hw_device.hpp +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright 2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. 
- * - */ - -#ifndef DML_MIDDLE_LAYER_DISPATCHER_HW_DEVICE_HPP_ -#define DML_MIDDLE_LAYER_DISPATCHER_HW_DEVICE_HPP_ - -#include - -#include "dml/dmldefs.h" -#include "hw_queue.hpp" - -#ifdef DML_HW -#include "hardware_definitions.h" -#include "own_dsa_accel_constants.h" - -namespace dml::ml::dispatcher { - -class hw_device final { - - static constexpr uint32_t max_working_queues = MAX_WORK_QUEUE_COUNT; - - using queues_container_t = std::array; - -public: - using descriptor_t = void; - - hw_device() noexcept = default; - - void fill_hw_context(dsahw_context_t *hw_context_ptr) const noexcept; - - [[nodiscard]] auto enqueue_descriptor(const dsahw_descriptor_t *desc_ptr) const noexcept -> dsahw_status_t; - - [[nodiscard]] auto initialize_new_device(descriptor_t *device_descriptor_ptr) noexcept -> dsahw_status_t; - - [[nodiscard]] auto size() const noexcept -> size_t; - - [[nodiscard]] auto numa_id() const noexcept -> uint64_t; - - [[nodiscard]] auto begin() const noexcept -> queues_container_t::const_iterator; - - [[nodiscard]] auto end() const noexcept -> queues_container_t::const_iterator; - -protected: - auto block_on_fault_support() const noexcept -> uint8_t; - - auto overlapping_copy_support() const noexcept -> uint8_t; - - auto memory_cache_control_support() const noexcept -> uint8_t; - - auto flush_cache_control_support() const noexcept -> uint8_t; - - auto interrupt_handle_request() const noexcept -> uint8_t; - - auto destination_readback_support() const noexcept -> uint8_t; - - auto descriptor_readback_support() const noexcept -> uint8_t; - - auto max_transfer_size() const noexcept -> uint32_t; - - auto max_batch_size() const noexcept -> uint32_t; - - auto message_size() const noexcept -> uint16_t; - - auto configuration_support() const noexcept -> uint8_t; - - auto max_descriptors() const noexcept -> uint8_t; - -private: - queues_container_t working_queues_ = {}; /**< Set of available HW working queues */ - uint32_t queue_count_ = 0u; /**< Number of working 
queues that are available */ - uint64_t gen_cap_register_ = 0u; /**< GENCAP register content */ - uint64_t numa_node_id_ = 0u; /**< NUMA node id of the device */ - uint32_t version_ = 0u; /**< Version of discovered device */ -}; - -} - -#endif -#endif //DML_MIDDLE_LAYER_DISPATCHER_HW_DEVICE_HPP_ diff --git a/sources/middle_layer/dispatcher/hw_dispatcher.cpp b/sources/middle_layer/dispatcher/hw_dispatcher.cpp deleted file mode 100644 index b4d7a12..0000000 --- a/sources/middle_layer/dispatcher/hw_dispatcher.cpp +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright 2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. 
- * - */ - -#include "hw_dispatcher.hpp" - -#ifdef LOG_HW_INIT - -#include - -#endif - -#if defined(DML_HW) && defined(linux) - -#include "libaccel_config.h" - -#endif - -// TODO should be removed at all -#define DML_HWSTS_RET(expr, err_code) { if( expr ) { return( err_code ); }} - -namespace dml::ml::dispatcher { - -hw_dispatcher::hw_dispatcher() noexcept { -#ifdef DML_HW - hw_init_status_ = hw_dispatcher::initialize_hw(); - hw_support_ = hw_init_status_ == DML_STATUS_OK; -#else - hw_support_ = false; -#endif -} - -#ifdef DML_HW - -auto hw_dispatcher::initialize_hw() noexcept -> dsahw_status_t { - - accfg_ctx *ctx_ptr = nullptr; - - dsahw_status_t status = dsa_initialize_accelerator_driver(&hw_driver_); - DML_HWSTS_RET(status != DML_STATUS_OK, status); - - int32_t context_creation_status = dsa_driver_new_context(&ctx_ptr); - DML_HWSTS_RET(0u != context_creation_status, DML_STATUS_HARDWARE_CONNECTION_ERROR); - - // Retrieve first device in the system given the passed in context - auto *dev_tmp_ptr = dsa_context_get_first_device(ctx_ptr); - auto device_it = devices_.begin(); - - while (nullptr != dev_tmp_ptr) { - if (DML_STATUS_OK == device_it->initialize_new_device(dev_tmp_ptr)) { - device_it++; - } - - // Retrieve the "next" device in the system based on given device - dev_tmp_ptr = dsa_device_get_next(dev_tmp_ptr); - } - - device_count_ = std::distance(devices_.begin(), device_it); - - if (device_count_ <= 0) { - return DML_STATUS_HARDWARE_CONNECTION_ERROR; - } - -#ifdef LOG_HW_INIT - std::cout << "--------------------------------\n"; - std::cout << "Number of discovered devices: " << device_count_ << "\n"; - std::cout << "--------------------------------\n"; - - for (size_t i = 0; i < device_count_; i++) { - std::cout << "Device #" << i << " : " << devices_[i].size() << " work queues\n"; - } - - std::cout << "--------------------------------\n" << std::endl; -#endif - - hw_context_.set_driver_context_ptr(ctx_ptr); - - return DML_STATUS_OK; -} -#endif - 
-hw_dispatcher::~hw_dispatcher() noexcept { -#ifdef DML_HW - // Variables - auto *context_ptr = hw_context_.get_driver_context_ptr(); - - if (context_ptr != nullptr) { - dsa_context_close(context_ptr); - } - - dsa_finalize_accelerator_driver(&hw_driver_); - - // Zeroing values - hw_context_.set_driver_context_ptr(nullptr); -#endif -} - -auto hw_dispatcher::get_instance() noexcept -> hw_dispatcher & { - static hw_dispatcher instance{}; - - return instance; -} - -auto hw_dispatcher::is_hw_support() const noexcept -> bool { - return hw_support_; -} - -#ifdef DML_HW - -void hw_dispatcher::fill_hw_context(dsahw_context_t *const hw_context_ptr) noexcept { - -#if defined(linux) - // Restore context - hw_context_ptr->dsa_context_ptr = hw_context_.get_driver_context_ptr(); - - // Restore device properties - // We take the first one as all configurations across the platform should be the same for all devices - devices_[0].fill_hw_context(hw_context_ptr); -#endif -} - -auto hw_dispatcher::get_hw_init_status() const noexcept -> dsahw_status_t { - return hw_init_status_; -} - -#ifdef DML_HW - -auto hw_dispatcher::begin() const noexcept -> device_container_t::const_iterator { - return devices_.cbegin(); -} - -auto hw_dispatcher::end() const noexcept -> device_container_t::const_iterator { - return devices_.cbegin() + device_count_; -} - -void hw_dispatcher::hw_context::set_driver_context_ptr(accfg_ctx *driver_context_ptr) noexcept { - driver_context_ptr_ = driver_context_ptr; -} - -[[nodiscard]] auto hw_dispatcher::hw_context::get_driver_context_ptr() noexcept -> accfg_ctx * { - return driver_context_ptr_; -} - -#endif - -#endif -} diff --git a/sources/middle_layer/dispatcher/hw_dispatcher.hpp b/sources/middle_layer/dispatcher/hw_dispatcher.hpp deleted file mode 100644 index b41b886..0000000 --- a/sources/middle_layer/dispatcher/hw_dispatcher.hpp +++ /dev/null @@ -1,95 +0,0 @@ -/* - * - * Copyright 2021 Intel Corporation. 
- * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. - * - */ - -#ifndef DML_MIDDLE_LAYER_DISPATCHER_HW_DISPATCHER_HPP_ -#define DML_MIDDLE_LAYER_DISPATCHER_HW_DISPATCHER_HPP_ - -#include -#include - -#include "hw_device.hpp" -#include "dml/dmldefs.h" - -#ifdef DML_HW -#include "hardware_definitions.h" -#include "hardware_configuration_driver.h" -#include "own_dsa_accel_constants.h" -#endif - -namespace dml::ml::dispatcher { - -class hw_dispatcher final { - -#ifdef DML_HW - - static constexpr uint32_t max_devices = MAX_DEVICE_COUNT; - - using device_container_t = std::array; - - class hw_context final { - public: - void set_driver_context_ptr(accfg_ctx *driver_context_ptr) noexcept; - - [[nodiscard]] auto get_driver_context_ptr() noexcept -> accfg_ctx *; - - private: - accfg_ctx *driver_context_ptr_ = nullptr; /**< DSA driver context */ - }; - -#endif - -public: - - static auto get_instance() noexcept -> hw_dispatcher &; - - [[nodiscard]] auto is_hw_support() const noexcept -> bool; - -#ifdef DML_HW - - [[nodiscard]] auto get_hw_init_status() const noexcept -> dsahw_status_t; - - void fill_hw_context(dsahw_context_t *hw_context_ptr) noexcept; - - [[nodiscard]] auto begin() const noexcept -> device_container_t::const_iterator; - - [[nodiscard]] auto end() const noexcept -> device_container_t::const_iterator; - -#endif - - virtual ~hw_dispatcher() noexcept; - -protected: - hw_dispatcher() noexcept; - -#ifdef DML_HW - auto initialize_hw() noexcept -> 
dsahw_status_t; - -private: - hw_context hw_context_; - hw_driver_t hw_driver_{}; - device_container_t devices_{}; - size_t device_count_ = 0; -#endif - - bool hw_support_; -#ifdef DML_HW - dsahw_status_t hw_init_status_; -#endif -}; - -} -#endif //DML_MIDDLE_LAYER_DISPATCHER_HW_DISPATCHER_HPP_ diff --git a/sources/middle_layer/dispatcher/hw_queue.cpp b/sources/middle_layer/dispatcher/hw_queue.cpp deleted file mode 100644 index 1a69cc7..0000000 --- a/sources/middle_layer/dispatcher/hw_queue.cpp +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Copyright 2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. 
- * - */ - -#ifdef DML_HW - -#include - -#if defined( linux ) - -#include - -#endif - -#include "hw_queue.hpp" -#include "hardware_configuration_driver.h" -#include "own_dsa_accel_constants.h" - -#define DML_HWSTS_RET(expr, err_code) { if( expr ) { return( err_code ); }} -#define DEC_BASE 10u /**< @todo */ -#define DEC_CHAR_BASE ('0') /**< @todo */ -#define DEC_MAX_INT_BUF 16u /**< @todo */ - -namespace dml::ml::dispatcher { - -static inline uint32_t own_int_to_str(char *path_ptr, uint32_t i, uint32_t max_path, uint32_t number) noexcept { - uint8_t ch[DEC_MAX_INT_BUF]; - uint32_t j = 0u; - - // At least once - in case of 0 - do { - // Convert digit to char - ch[j++] = DEC_CHAR_BASE + number % DEC_BASE; - // Next digit - number /= DEC_BASE; - } while (0u < number); - // Return max+1 if "path" buffer overflow - DML_HWSTS_RET((max_path < i + j), max_path + 1u); - do { - // Copy to "path" in correct order - path_ptr[i++] = ch[--j]; - } while (0u < j); - // Success - return the next "free" char index in the "path" - return i; -} - -static inline dsahw_status_t own_specify_path(char *path_ptr, - uint32_t max_path, - uint32_t major, - uint32_t minor) noexcept { - uint32_t i = 0u; - - // Strlen analogue - while (('\0' != path_ptr[i]) && (i < max_path)) { - i++; - } - // i has index of terminating 0 - // Check for buffer overflow - DML_HWSTS_RET((max_path < i), DML_STATUS_VERSION_DETECTION_ERROR); - // Need the next format: "/dev/char/major:minor" - i = own_int_to_str(path_ptr, i, max_path, major); - // Check for buffer overflow - DML_HWSTS_RET((max_path < i + 1u), DML_STATUS_VERSION_DETECTION_ERROR); - path_ptr[i++] = ':'; - i = own_int_to_str(path_ptr, i, max_path, minor); - // Check for buffer overflow - DML_HWSTS_RET((max_path < i), DML_STATUS_VERSION_DETECTION_ERROR); - path_ptr[i] = '\0'; - return DML_STATUS_OK; -} - -hw_queue::hw_queue(hw_queue &&other) noexcept { - version_ = other.version_; - priority_ = other.priority_; - portal_mask_ = other.portal_mask_; - 
portal_ptr_ = other.portal_ptr_; - portal_offset_ = 0; - - other.portal_ptr_ = nullptr; -} - -auto hw_queue::operator=(hw_queue &&other) noexcept -> hw_queue & { - version_ = other.version_; - priority_ = other.priority_; - portal_mask_ = other.portal_mask_; - portal_ptr_ = other.portal_ptr_; - portal_offset_ = 0; - - other.portal_ptr_ = nullptr; - - return *this; -} - -hw_queue::~hw_queue() { -#if defined( linux ) - // Freeing resources - if (portal_ptr_ != nullptr) { - munmap(portal_ptr_, 0x1000u); - - portal_ptr_ = nullptr; - } -#endif -} - -void hw_queue::set_portal_ptr(void *value_ptr) noexcept { - portal_offset_ = reinterpret_cast(value_ptr) & OWN_PAGE_MASK; - portal_mask_ = reinterpret_cast(value_ptr) & (~OWN_PAGE_MASK); - portal_ptr_ = value_ptr; -} - -auto hw_queue::get_portal_ptr() const noexcept -> void * { - uint64_t offset = portal_offset_++; - offset = (offset << 6) & OWN_PAGE_MASK; - return reinterpret_cast(offset | portal_mask_); -} - -auto hw_queue::enqueue_descriptor(const dsahw_descriptor_t *desc_ptr) const noexcept -> dsahw_status_t { -#if defined( linux ) - uint8_t retry = 0u; - - void *current_place_ptr = get_portal_ptr(); - asm volatile("sfence\t\n" - ".byte 0xf2, 0x0f, 0x38, 0xf8, 0x02\t\n" - "setz %0\t\n" - : "=r"(retry) : "a" (current_place_ptr), "d" (desc_ptr)); - - return static_cast(retry); -#else - return DML_STATUS_INSTANCE_NOT_FOUND; -#endif -} - -auto hw_queue::initialize_new_queue(void *wq_descriptor_ptr, uint32_t major_version) noexcept -> dsahw_status_t { -#if defined( linux ) - auto *work_queue_ptr = reinterpret_cast(wq_descriptor_ptr); - char path[64] = "/dev/char/"; - - if (ACCFG_WQ_ENABLED != dsa_work_queue_get_state(work_queue_ptr) || - ACCFG_WQ_SHARED != dsa_work_queue_get_mode(work_queue_ptr)) { - return DML_STATUS_WORK_QUEUES_NOT_AVAILABLE; - } - - auto *group_ptr = dsa_work_queue_get_group(work_queue_ptr); - if (group_ptr == nullptr) { - return DML_STATUS_INTERNAL_ERROR; - } - - version_ = 
dsa_work_queue_get_minor_version(work_queue_ptr); - priority_ = dsa_work_queue_get_priority(work_queue_ptr); - memory_type_ = dsa_group_get_traffic_class_b(group_ptr) ? supported_memory_type::durable - : supported_memory_type::non_durable; - - // Need the next format: "/dev/char/major:minor" -#if defined(LIB_ACCEL_VERSION_3_2) - auto status = dsa_work_queue_get_device_path(work_queue_ptr, path, 64 - 1); -#else - auto status = own_specify_path(path, sizeof(path) - 1u, major_version, version_); -#endif - DML_HWSTS_RET((0 > status), DML_STATUS_INCORRECT_WORK_QUEUE_ID); - - auto fd = open(path, O_RDWR); - DML_HWSTS_RET((0 > fd), DML_STATUS_WORK_QUEUE_CONNECTION_ERROR); - - // Map portal for enqcmd - auto *region_ptr = mmap(nullptr, 0x1000u, PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, 0u); - close(fd); - - DML_HWSTS_RET((MAP_FAILED == region_ptr), DML_STATUS_PORTAL_CREATION_ERROR); - - hw_queue::set_portal_ptr(region_ptr); - - return DML_STATUS_OK; -#else - return DML_STATUS_WORK_QUEUE_CONNECTION_ERROR; -#endif -} - -auto hw_queue::priority() const noexcept -> int32_t { - return priority_; -} - -auto hw_queue::memory_type() const noexcept -> hw_queue::supported_memory_type { - return memory_type_; -} - -} - -#endif diff --git a/sources/middle_layer/dispatcher/hw_queue.hpp b/sources/middle_layer/dispatcher/hw_queue.hpp deleted file mode 100644 index 368b932..0000000 --- a/sources/middle_layer/dispatcher/hw_queue.hpp +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright 2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. 
- * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. - * - */ - -#ifndef DML_MIDDLE_LAYER_DISPATCHER_HW_QUEUE_HPP_ -#define DML_MIDDLE_LAYER_DISPATCHER_HW_QUEUE_HPP_ - -#include - -#include "dml/dmldefs.h" - -#ifdef DML_HW - -#include "hardware_definitions.h" - -namespace dml::ml::dispatcher { - -class hw_queue { -public: - enum class supported_memory_type { - durable, - non_durable - }; - - using descriptor_t = void; - - hw_queue() noexcept = default; - - hw_queue(const hw_queue &) noexcept = delete; - - auto operator=(const hw_queue &other) noexcept -> hw_queue & = delete; - - hw_queue(hw_queue &&other) noexcept; - - auto operator=(hw_queue &&other) noexcept -> hw_queue &; - - auto initialize_new_queue(descriptor_t *wq_descriptor_ptr, uint32_t major_version) noexcept -> dsahw_status_t; - - [[nodiscard]] auto get_portal_ptr() const noexcept -> void *; - - [[nodiscard]] auto enqueue_descriptor(const dsahw_descriptor_t *desc_ptr) const noexcept -> dsahw_status_t; - - [[nodiscard]] auto priority() const noexcept -> int32_t; - - [[nodiscard]] auto memory_type() const noexcept -> supported_memory_type; - - void set_portal_ptr(void *portal_ptr) noexcept; - - virtual ~hw_queue() noexcept; - -private: - uint32_t version_ = 0u; - int32_t priority_ = 0u; - supported_memory_type memory_type_ = supported_memory_type::non_durable; - uint64_t portal_mask_ = 0u; /**< Mask for incrementing portals */ - mutable void *portal_ptr_ = nullptr; - mutable std::atomic portal_offset_ = 0u; /**< Portal for enqcmd (mod page size)*/ -}; - -} -#endif - -#endif //DML_MIDDLE_LAYER_DISPATCHER_HW_QUEUE_HPP_ diff --git a/sources/middle_layer/dispatcher/numa.cpp b/sources/middle_layer/dispatcher/numa.cpp deleted file mode 100644 index dbe13de..0000000 --- a/sources/middle_layer/dispatcher/numa.cpp +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright 2021 Intel Corporation. 
- * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. - * - */ - -#include -#include -#include - -#if defined(linux) - #include -#endif - -#include "numa.hpp" - -namespace dml::ml::util { - -static constexpr auto get_nodes_configuration() -> std::array, 10> { - return { - { - {R"(/sys/bus/node/devices/node0/cpulist)", 0}, - {R"(/sys/bus/node/devices/node1/cpulist)", 1}, - {R"(/sys/bus/node/devices/node2/cpulist)", 2}, - {R"(/sys/bus/node/devices/node3/cpulist)", 3}, - {R"(/sys/bus/node/devices/node4/cpulist)", 4}, - {R"(/sys/bus/node/devices/node5/cpulist)", 5}, - {R"(/sys/bus/node/devices/node6/cpulist)", 6}, - {R"(/sys/bus/node/devices/node7/cpulist)", 7}, - {R"(/sys/bus/node/devices/node8/cpulist)", 8}, - {R"(/sys/bus/node/devices/node9/cpulist)", 9} - } - }; -} - -static inline void get_region(std::ifstream &stream, - uint32_t &begin, - uint32_t &end, - bool read_end_splitter = false) { - char splitter = '0'; - - stream >> begin; - stream >> splitter; - stream >> end; - - if (read_end_splitter) { - stream >> splitter; - } -} - -static inline void update_regions(std::unordered_map ®ions, - uint32_t begin, - uint32_t end, - int32_t value) { - for (uint32_t i = begin; i <= end; i++) { - regions[i] = value; - } -} - -class numa_configuration { -public: - numa_configuration(const numa_configuration &other) = delete; - - auto operator=(const numa_configuration &other) -> numa_configuration & = delete; - - numa_configuration(numa_configuration &&other) = 
delete; - - auto operator=(numa_configuration &&other) -> numa_configuration & = delete; - - static auto get_instance() noexcept -> numa_configuration & { - static numa_configuration inst{}; - - return inst; - } - - auto operator[](uint32_t cpu_id) noexcept -> int32_t { - if (mapping_.find(cpu_id) != mapping_.end()) { - return mapping_[cpu_id]; - } else { - return 0; - } - } - -private: - numa_configuration() noexcept { - constexpr auto node_lists = get_nodes_configuration(); - - for (auto path : node_lists) { - std::ifstream file(path.first); - - if (!file.is_open()) { - continue; - } - - uint32_t begin = 0; - uint32_t end = 0; - - get_region(file, begin, end, true); - update_regions(mapping_, begin, end, path.second); - - get_region(file, begin, end); - update_regions(mapping_, begin, end, path.second); - } - } - - std::unordered_map mapping_; -}; - -auto get_cpu_id() -> uint32_t { - uint32_t cpu_id = -1; - -#if defined(linux) - __rdtscp(&cpu_id); -#endif - - return cpu_id; -} - -int32_t get_numa_id() noexcept { -#if defined(linux) - static auto &numa_config = numa_configuration::get_instance(); - - static thread_local auto cpu_id = get_cpu_id(); - static thread_local auto numa_id = numa_config[cpu_id]; - - return numa_id; -#else - // Not supported in Windows yet - return -1; -#endif -} - -} diff --git a/sources/middle_layer/hw_configuration_driver.c b/sources/middle_layer/hw_configuration_driver.c deleted file mode 100644 index 5bacb32..0000000 --- a/sources/middle_layer/hw_configuration_driver.c +++ /dev/null @@ -1,372 +0,0 @@ -/* - * Copyright 2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. 
- * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. - * - */ - - -#include "hardware_configuration_driver.h" - -#include - -#if defined( linux ) - -#include -#include - -const static char *accelerator_configuration_driver_name = "/usr/lib64/libaccel-config.so"; - -typedef int (*accfg_new_ptr)(struct accfg_ctx **ctx); - -typedef struct accfg_device *(*accfg_device_get_first_ptr)(struct accfg_ctx *ctx); - -typedef const char *(*accfg_device_get_devname_ptr)(struct accfg_device *device); - -typedef struct accfg_device *(*accfg_device_get_next_ptr)(struct accfg_device *device); - -typedef struct accfg_wq *(*accfg_wq_get_first_ptr)(struct accfg_device *device); - -typedef struct accfg_wq *(*accfg_wq_get_next_ptr)(struct accfg_wq *wq); - -typedef enum accfg_wq_state (*accfg_wq_get_state_ptr)(struct accfg_wq *wq); - -typedef unsigned int (*accfg_device_get_cdev_major_ptr)(struct accfg_device *device); - -typedef int (*accfg_wq_get_cdev_minor_ptr)(struct accfg_wq *wq); - -typedef enum accfg_device_state (*accfg_device_get_state_ptr)(struct accfg_device *device); - -typedef struct accfg_ctx *(*accfg_unref_ptr)(struct accfg_ctx *ctx); - -typedef enum accfg_wq_mode (*accfg_wq_get_mode_ptr)(struct accfg_wq *wq); - -typedef unsigned long (*accfg_device_get_gen_cap_ptr)(struct accfg_device *device); - -typedef int (*accfg_group_get_traffic_class_ptr)(struct accfg_group *group); - -typedef struct accfg_group *(*accfg_group_get_first_ptr)(struct accfg_device *device); - -typedef struct accfg_group *(*accfg_group_get_next_ptr)(struct accfg_group *group); - -typedef struct accfg_group *(*accfg_wq_get_group_ptr)(struct accfg_wq *wq); - -typedef int (*accfg_wq_get_group_id_ptr)(struct accfg_wq *wq); - -typedef int (*accfg_group_get_id_ptr)(struct accfg_group *group); - -typedef int (*accfg_wq_get_user_dev_path_ptr)(struct accfg_wq *wq, char *buf, size_t size); - 
-/** - * @brief Table with functions required from accelerator configuration library - */ -static dsa_desc_t functions_table[] = { - {NULL, "accfg_new"}, - {NULL, "accfg_device_get_first"}, - {NULL, "accfg_device_get_devname"}, - {NULL, "accfg_device_get_next"}, - {NULL, "accfg_wq_get_first"}, - {NULL, "accfg_wq_get_next"}, - {NULL, "accfg_wq_get_state"}, - {NULL, "accfg_wq_get_mode"}, - {NULL, "accfg_device_get_cdev_major"}, - {NULL, "accfg_wq_get_cdev_minor"}, - {NULL, "accfg_device_get_state"}, - {NULL, "accfg_unref"}, - {NULL, "accfg_device_get_gen_cap"}, - {NULL, "accfg_device_get_numa_node"}, - {NULL, "accfg_wq_get_priority"}, - {NULL, "accfg_group_get_first"}, - {NULL, "accfg_group_get_next"}, - {NULL, "accfg_group_get_traffic_class_a"}, - {NULL, "accfg_group_get_traffic_class_b"}, - {NULL, "accfg_wq_get_group"}, - {NULL, "accfg_wq_get_group_id"}, - {NULL, "accfg_group_get_id"}, -#if defined(LIB_ACCEL_VERSION_3_2) - {NULL, "accfg_wq_get_user_dev_path"}, -#endif - // Terminate list/init - {NULL, NULL} -}; - -static inline dsahw_status_t own_load_accelerator_configuration_driver(void **driver_instance_pptr); - -static inline bool own_load_configuration_functions(void *driver_instance_ptr); - -#endif - -dsahw_status_t DML_HW_API(initialize_accelerator_driver)(hw_driver_t *driver_ptr) { -#if defined( linux ) - // Variables - driver_ptr->driver_instance_ptr = NULL; - - // Load DLL - dsahw_status_t status = own_load_accelerator_configuration_driver(&driver_ptr->driver_instance_ptr); - - // If DLL is loaded successfully - if (DML_STATUS_OK != status || - !driver_ptr->driver_instance_ptr || - !own_load_configuration_functions(driver_ptr->driver_instance_ptr)) { - - // Free DLL - if (driver_ptr->driver_instance_ptr) { - dlclose(driver_ptr->driver_instance_ptr); - } - - driver_ptr->driver_instance_ptr = NULL; - } - - return status; -#else - return DML_STATUS_DRIVER_NOT_FOUND; -#endif -} - -void DML_HW_API(finalize_accelerator_driver)(hw_driver_t *driver_ptr) { -#if 
defined( linux ) - if (driver_ptr->driver_instance_ptr) { - dlclose(driver_ptr->driver_instance_ptr); - } - - driver_ptr->driver_instance_ptr = NULL; -#endif -} - -int32_t DML_HW_API(driver_new_context)(struct accfg_ctx **ctx) { -#if defined( linux ) - return ((accfg_new_ptr) functions_table[0].function)(ctx); -#else - return DML_STATUS_DRIVER_NOT_FOUND; -#endif -} - -struct accfg_device *DML_HW_API(context_get_first_device)(struct accfg_ctx *ctx) { -#if defined( linux ) - return ((accfg_device_get_first_ptr) functions_table[1].function)(ctx); -#else - return NULL; -#endif -} - -const char *DML_HW_API(device_get_name)(struct accfg_device *device) { -#if defined( linux ) - return ((accfg_device_get_devname_ptr) functions_table[2].function)(device); -#else - return NULL; -#endif -} - -struct accfg_device *DML_HW_API(device_get_next)(struct accfg_device *device) { -#if defined( linux ) - return ((accfg_device_get_next_ptr) functions_table[3].function)(device); -#else - return NULL; -#endif -} - -struct accfg_wq *DML_HW_API(get_first_work_queue)(struct accfg_device *device) { -#if defined( linux ) - return ((accfg_wq_get_first_ptr) functions_table[4].function)(device); -#else - return NULL; -#endif -} - -struct accfg_wq *DML_HW_API(work_queue_get_next)(struct accfg_wq *wq) { -#if defined( linux ) - return ((accfg_wq_get_next_ptr) functions_table[5].function)(wq); -#else - return NULL; -#endif -} - -enum accfg_wq_state DML_HW_API(work_queue_get_state)(struct accfg_wq *wq) { -#if defined( linux ) - return ((accfg_wq_get_state_ptr) functions_table[6].function)(wq); -#else - return -1; -#endif -} - -enum accfg_wq_mode DML_HW_API(work_queue_get_mode)(struct accfg_wq *wq) { -#if defined( linux ) - return ((accfg_wq_get_mode_ptr) functions_table[7].function)(wq); -#else - return 2; -#endif -} - -uint32_t DML_HW_API(device_get_major_version)(struct accfg_device *device) { -#if defined( linux ) - return ((accfg_device_get_cdev_major_ptr) functions_table[8].function)(device); 
-#else - return -1; -#endif -} - -int32_t DML_HW_API(work_queue_get_minor_version)(struct accfg_wq *wq) { -#if defined( linux ) - return ((accfg_wq_get_cdev_minor_ptr) functions_table[9].function)(wq); -#else - return -1; -#endif -} - -enum accfg_device_state DML_HW_API(device_get_state)(struct accfg_device *device) { -#if defined( linux ) - return ((accfg_device_get_state_ptr) functions_table[10].function)(device); -#else - return -1; -#endif -} - -struct accfg_ctx *DML_HW_API(context_close)(struct accfg_ctx *ctx) { -#if defined( linux ) - return ((accfg_unref_ptr) functions_table[11].function)(ctx); -#else - return NULL; -#endif -} - -uint64_t DML_HW_API(device_get_gen_cap_register)(struct accfg_device *device) { -#if defined( linux ) - return ((accfg_device_get_gen_cap_ptr) functions_table[12].function)(device); -#else - return 0; -#endif -} - -uint64_t DML_HW_API(device_get_numa_node)(struct accfg_device *device) { -#if defined( linux ) - return ((accfg_device_get_gen_cap_ptr) functions_table[13].function)(device); -#else - return -1; -#endif -} - -int32_t DML_HW_API(work_queue_get_priority)(struct accfg_wq *wq) { -#if defined( linux ) - return ((accfg_wq_get_cdev_minor_ptr) functions_table[14].function)(wq); -#else - return -1; -#endif -} - -struct accfg_group *DML_HW_API(group_get_first)(struct accfg_device *device) { -#if defined( linux ) - return ((accfg_group_get_first_ptr) functions_table[15].function)(device); -#else - return NULL; -#endif -} - -struct accfg_group *DML_HW_API(group_get_next)(struct accfg_group *group) { -#if defined( linux ) - return ((accfg_group_get_next_ptr) functions_table[16].function)(group); -#else - return NULL; -#endif -} - -int DML_HW_API(group_get_traffic_class_a)(struct accfg_group *group) { -#if defined( linux ) - return ((accfg_group_get_traffic_class_ptr) functions_table[17].function)(group); -#else - return 0; -#endif -} - -int DML_HW_API(group_get_traffic_class_b)(struct accfg_group *group) { -#if defined( linux ) - 
return ((accfg_group_get_traffic_class_ptr) functions_table[18].function)(group); -#else - return 0; -#endif -} - -struct accfg_group *DML_HW_API(work_queue_get_group)(struct accfg_wq *wq) { -#if defined( linux ) - return ((accfg_wq_get_group_ptr) functions_table[19].function)(wq); -#else - return NULL; -#endif -} - -int DML_HW_API(work_queue_get_group_id)(struct accfg_wq *wq) { -#if defined( linux ) - return ((accfg_wq_get_group_id_ptr) functions_table[20].function)(wq); -#else - return -1; -#endif -} - -int DML_HW_API(group_get_id)(struct accfg_group *group) { -#if defined( linux ) - return ((accfg_group_get_id_ptr) functions_table[21].function)(group); -#else - return -1; -#endif -} - -int DML_HW_API(work_queue_get_device_path)(struct accfg_wq *wq, char *buf, size_t size) { -#if defined( linux ) && defined(LIB_ACCEL_VERSION_3_2) - return ((accfg_wq_get_user_dev_path_ptr) functions_table[22].function)(wq, buf, size); -#else - return -1; -#endif -} - -#if defined( linux ) - -/* ------ Internal functions implementation ------ */ - -bool own_load_configuration_functions(void *driver_instance_ptr) { - uint32_t i = 0u; - - // Clear error log - (void)dlerror(); - while (functions_table[i].function_name) { - functions_table[i].function = (library_function) dlsym(driver_instance_ptr, functions_table[i].function_name); - - char *err_message = dlerror(); - - if (err_message || !functions_table[i].function) { - return false; - } - - i++; - } - - return true; -} - -dsahw_status_t own_load_accelerator_configuration_driver(void **driver_instance_pptr) { - - // Try to load the user interface library for IAX/DSA kernel driver - void *driver_instance_ptr = dlopen(accelerator_configuration_driver_name, RTLD_LAZY); - - if (!driver_instance_ptr) { - // This is needed for error handle. We need to call dlerror - // for emptying error message. 
Otherwise we will receive error - // message during loading symbols from another library - dlerror(); - - return DML_STATUS_DRIVER_NOT_FOUND; - } - - *driver_instance_pptr = driver_instance_ptr; - - return DML_STATUS_OK; -} - -#endif diff --git a/sources/middle_layer/src/execution_path.cpp b/sources/middle_layer/src/execution_path.cpp new file mode 100644 index 0000000..84b9049 --- /dev/null +++ b/sources/middle_layer/src/execution_path.cpp @@ -0,0 +1,33 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#include +#include + +#include "ml_utils.hpp" + +namespace dml::detail::ml::execution_path +{ + submission_status software::submit(operation& op, result& res) noexcept + { + return core::software_device().submit(as_core(op), as_core(res)); + } + + submission_status hardware::submit(operation& op, result& res) noexcept + { + return core::hardware_device().submit(as_core(op), as_core(res)); + } +} // namespace dml::detail::ml::execution_path diff --git a/sources/middle_layer/src/ml_utils.hpp b/sources/middle_layer/src/ml_utils.hpp new file mode 100644 index 0000000..efe3e97 --- /dev/null +++ b/sources/middle_layer/src/ml_utils.hpp @@ -0,0 +1,52 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). 
Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#ifndef DML_ML_OWN_UTILS_HPP +#define DML_ML_OWN_UTILS_HPP + +#include +#include +#include +#include + +namespace dml::detail::ml +{ + // Wrapper for reinterpret_cast + [[nodiscard]] static inline auto& as_core(result& res) noexcept + { + return *std::launder(reinterpret_cast(&res)); + } + + // Wrapper for reinterpret_cast + [[nodiscard]] static inline auto& as_core(operation& res) noexcept + { + return *std::launder(reinterpret_cast(&res)); + } + + // Wrapper for reinterpret_cast + [[nodiscard]] static inline auto& as_ml(core::completion_record& res) noexcept + { + return *std::launder(reinterpret_cast(&res)); + } + + // Wrapper for reinterpret_cast + [[nodiscard]] static inline auto& as_ml(core::descriptor& res) noexcept + { + return *std::launder(reinterpret_cast(&res)); + } +} // namespace dml::detail::ml + +#endif // DML_ML_OWN_UTILS_HPP diff --git a/sources/middle_layer/src/operation.cpp b/sources/middle_layer/src/operation.cpp new file mode 100644 index 0000000..8ad94da --- /dev/null +++ b/sources/middle_layer/src/operation.cpp @@ -0,0 +1,361 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. 
+ * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#include +#include +#include +#include +#include + +#include "ml_utils.hpp" + +namespace dml::detail::ml +{ + operation make_nop_operation(const nop_options options) noexcept + { + auto descriptor = core::descriptor{}; + auto view = core::nop_descriptor(descriptor); + + view.operation() = to_underlying(core::operation::nop); + view.flags() = static_cast(options); + + return as_ml(descriptor); + } + + operation make_drain_operation(address_t readback_address_1, + address_t readback_address_2, + const drain_options options, + const drain_specific_options specific_options) noexcept + { + auto descriptor = core::descriptor{}; + auto view = core::drain_descriptor(descriptor); + + view.operation() = to_underlying(core::operation::drain); + view.readback_address_1() = readback_address_1; + view.readback_address_2() = readback_address_2; + view.flags() = static_cast(options); + view.operation_specific_flags() = static_cast(specific_options); + + return as_ml(descriptor); + } + + operation make_mem_move_operation(const byte_t *const src, + byte_t *const dst, + const transfer_size_t size, + const mem_move_options options) noexcept + { + auto descriptor = core::descriptor{}; + auto view = core::mem_move_descriptor(descriptor); + + view.operation() = to_underlying(core::operation::memory_move); + view.source_address() = reinterpret_cast(src); + view.destination_address() = reinterpret_cast(dst); + view.transfer_size() = size; + view.flags() = static_cast(options); + + return as_ml(descriptor); + } + + operation make_fill_operation(const uint64_t pattern, + byte_t *const dst, + const transfer_size_t size, + const fill_options options) noexcept + { + auto descriptor = core::descriptor{}; + auto view = core::fill_descriptor(descriptor); + + view.operation() = to_underlying(core::operation::fill); 
+ view.pattern() = pattern; + view.destination_address() = reinterpret_cast(dst); + view.transfer_size() = size; + view.flags() = static_cast(options); + + return as_ml(descriptor); + } + + operation make_dualcast_operation(const byte_t *const src, + byte_t *const dst1, + byte_t *const dst2, + const transfer_size_t size, + const dualcast_options options, + const dualcast_specific_options specific_options) noexcept + { + auto descriptor = core::descriptor{}; + auto view = core::dualcast_descriptor(descriptor); + + view.operation() = to_underlying(core::operation::dualcast); + view.source_address() = reinterpret_cast(src); + view.destination_1_address() = reinterpret_cast(dst1); + view.destination_2_address() = reinterpret_cast(dst2); + view.transfer_size() = size; + view.flags() = static_cast(options); + view.operation_specific_flags() = static_cast(specific_options); + + return as_ml(descriptor); + } + + operation make_compare_operation(const byte_t *const src1, + const byte_t *const src2, + const transfer_size_t size, + const compare_options options, + const compare_result expected_result) noexcept + { + auto descriptor = core::descriptor{}; + auto view = core::compare_descriptor(descriptor); + + view.operation() = to_underlying(core::operation::compare); + view.source_1_address() = reinterpret_cast(src1); + view.source_2_address() = reinterpret_cast(src2); + view.transfer_size() = size; + view.flags() = static_cast(options); + view.expected_result() = static_cast(expected_result); + + return as_ml(descriptor); + } + + operation make_compare_pattern_operation(const uint64_t pattern, + const byte_t *src, + const transfer_size_t size, + const compare_pattern_options options, + const compare_result expected_result) noexcept + { + auto descriptor = core::descriptor{}; + auto view = core::compare_pattern_descriptor(descriptor); + + view.operation() = to_underlying(core::operation::compare_pattern); + view.pattern() = pattern; + view.source_address() = 
reinterpret_cast(src); + view.transfer_size() = size; + view.flags() = static_cast(options); + view.expected_result() = static_cast(expected_result); + + return as_ml(descriptor); + } + + operation make_crc_operation(const byte_t *const src, + const transfer_size_t size, + const crc_value_t crc_seed, + const crc_options options, + const crc_specific_options specific_options) noexcept + { + auto descriptor = core::descriptor{}; + auto view = core::crc_descriptor(descriptor); + + view.operation() = to_underlying(core::operation::crc); + view.source_address() = reinterpret_cast(src); + view.transfer_size() = size; + view.flags() = static_cast(options); + view.operation_specific_flags() = static_cast(specific_options); + view.crc_seed() = crc_seed; + + return as_ml(descriptor); + } + + operation make_copy_crc_operation(const byte_t *const src, + byte_t *const dst, + const transfer_size_t size, + const crc_value_t crc_seed, + const copy_crc_options options, + const copy_crc_specific_options specific_options) noexcept + { + auto descriptor = core::descriptor{}; + auto view = core::copy_crc_descriptor(descriptor); + + view.operation() = to_underlying(core::operation::copy_crc); + view.source_address() = reinterpret_cast(src); + view.destination_address() = reinterpret_cast(dst); + view.transfer_size() = size; + view.flags() = static_cast(options); + view.operation_specific_flags() = static_cast(specific_options); + view.crc_seed() = crc_seed; + + return as_ml(descriptor); + } + + operation make_create_delta_operation(const byte_t *const src1, + const byte_t *const src2, + const transfer_size_t size, + byte_t *const delta_record, + const transfer_size_t delta_max_size, + const create_delta_options options, + const create_delta_result expected_result) noexcept + { + auto descriptor = core::descriptor{}; + auto view = core::create_delta_descriptor(descriptor); + + view.operation() = to_underlying(core::operation::create_delta); + view.source_1_address() = 
reinterpret_cast(src1); + view.source_2_address() = reinterpret_cast(src2); + view.delta_record_address() = reinterpret_cast(delta_record); + view.transfer_size() = size; + view.maximum_delta_record_size() = delta_max_size; + view.flags() = static_cast(options); + view.expected_result_mask() = static_cast(expected_result); + + return as_ml(descriptor); + } + + operation make_apply_delta_operation(const byte_t *const delta_record, + const transfer_size_t delta_size, + byte_t *const dst, + const transfer_size_t size, + const apply_delta_options options) noexcept + { + auto descriptor = core::descriptor{}; + auto view = core::apply_delta_descriptor(descriptor); + + view.operation() = to_underlying(core::operation::apply_delta); + view.delta_record_address() = reinterpret_cast(delta_record); + view.destination_address() = reinterpret_cast(dst); + view.transfer_size() = size; + view.delta_record_size() = delta_size; + view.flags() = static_cast(options); + + return as_ml(descriptor); + } + + operation make_cache_flush_operation(byte_t *const dst, const transfer_size_t size, const cache_flush_options options) noexcept + { + auto descriptor = core::descriptor{}; + auto view = core::cache_flush_descriptor(descriptor); + + view.operation() = to_underlying(core::operation::cache_flush); + view.destination_address() = reinterpret_cast(dst); + view.transfer_size() = size; + view.flags() = static_cast(options); + + return as_ml(descriptor); + } + + operation make_dif_check_operation(const byte_t *src, + transfer_size_t transfer_size, + dif_parameters src_parameters, + dif_check_options options, + dif_specific_options specific_options, + dif_source_options source_options) noexcept + { + auto descriptor = core::descriptor{}; + auto view = core::dif_check_descriptor(descriptor); + + view.operation() = to_underlying(core::operation::dif_check); + view.source_address() = reinterpret_cast(src); + view.transfer_size() = transfer_size; + view.flags() = static_cast(options); + 
view.dif_flags() = static_cast(specific_options); + view.source_dif_flags() = static_cast(source_options); + view.source_ref_tag() = src_parameters.ref_tag_seed; + view.source_app_tag() = src_parameters.app_tag_seed; + view.source_app_tag_mask() = src_parameters.app_tag_mask; + + return as_ml(descriptor); + } + + operation make_dif_insert_operation(const byte_t *src, + byte_t *dst, + transfer_size_t transfer_size, + dif_parameters dst_parameters, + dif_insert_options options, + dif_specific_options specific_options, + dif_destination_options destination_options) noexcept + { + auto descriptor = core::descriptor{}; + auto view = core::dif_insert_descriptor(descriptor); + + view.operation() = to_underlying(core::operation::dif_insert); + view.source_address() = reinterpret_cast(src); + view.destination_address() = reinterpret_cast(dst); + view.transfer_size() = transfer_size; + view.flags() = static_cast(options); + view.dif_flags() = static_cast(specific_options); + view.destination_dif_flags() = static_cast(destination_options); + view.destination_ref_tag() = dst_parameters.ref_tag_seed; + view.destination_app_tag() = dst_parameters.app_tag_seed; + view.destination_app_tag_mask() = dst_parameters.app_tag_mask; + + return as_ml(descriptor); + } + + operation make_dif_strip_operation(const byte_t *src, + byte_t *dst, + transfer_size_t transfer_size, + dif_parameters src_parameters, + dif_strip_options options, + dif_specific_options specific_options, + dif_source_options source_options) noexcept + { + auto descriptor = core::descriptor{}; + auto view = core::dif_strip_descriptor(descriptor); + + view.operation() = to_underlying(core::operation::dif_strip); + view.source_address() = reinterpret_cast(src); + view.destination_address() = reinterpret_cast(dst); + view.transfer_size() = transfer_size; + view.flags() = static_cast(options); + view.dif_flags() = static_cast(specific_options); + view.source_dif_flags() = static_cast(source_options); + view.source_ref_tag() = 
src_parameters.ref_tag_seed; + view.source_app_tag() = src_parameters.app_tag_seed; + view.source_app_tag_mask() = src_parameters.app_tag_mask; + + return as_ml(descriptor); + } + + operation make_dif_update_operation(const byte_t *src, + byte_t *dst, + transfer_size_t transfer_size, + dif_parameters src_parameters, + dif_parameters dst_parameters, + dif_update_options options, + dif_specific_options specific_options, + dif_source_options source_options, + dif_destination_options destination_options) noexcept + { + auto descriptor = core::descriptor{}; + auto view = core::dif_update_descriptor(descriptor); + + view.operation() = to_underlying(core::operation::dif_update); + view.source_address() = reinterpret_cast(src); + view.destination_address() = reinterpret_cast(dst); + view.transfer_size() = transfer_size; + view.flags() = static_cast(options); + view.dif_flags() = static_cast(specific_options); + + view.source_dif_flags() = static_cast(source_options); + view.source_ref_tag() = src_parameters.ref_tag_seed; + view.source_app_tag() = src_parameters.app_tag_seed; + view.source_app_tag_mask() = src_parameters.app_tag_mask; + + view.destination_dif_flags() = static_cast(destination_options); + view.destination_ref_tag() = dst_parameters.ref_tag_seed; + view.destination_app_tag() = dst_parameters.app_tag_seed; + view.destination_app_tag_mask() = dst_parameters.app_tag_mask; + + return as_ml(descriptor); + } + + operation make_batch_operation(const operation *const src, const transfer_size_t length, const batch_options options) noexcept + { + auto descriptor = core::descriptor{}; + auto view = core::batch_descriptor(descriptor); + + view.operation() = to_underlying(core::operation::batch); + view.descriptor_list_address() = reinterpret_cast(src); + view.descriptors_count() = length; + view.flags() = static_cast(options); + + return as_ml(descriptor); + } +} // namespace dml::detail::ml diff --git a/sources/middle_layer/src/result.cpp 
b/sources/middle_layer/src/result.cpp new file mode 100644 index 0000000..faa2ca4 --- /dev/null +++ b/sources/middle_layer/src/result.cpp @@ -0,0 +1,122 @@ +/* + * Copyright 2021 Intel Corporation. + * + * This software and the related documents are Intel copyrighted materials, + * and your use of them is governed by the express license under which they + * were provided to you ("License"). Unless the License provides otherwise, + * you may not use, modify, copy, publish, distribute, disclose or transmit + * this software or the related documents without Intel's prior written + * permission. + * + * This software and the related documents are provided as is, with no + * express or implied warranties, other than those that are expressly + * stated in the License. + * + */ + +#include +#include +#include + +#include "ml_utils.hpp" + +#if defined(linux) +#include +#else +#include +#include +#endif + +namespace dml::detail::ml +{ + /** + * @brief Class that allows to defer scope exit to the moment when a certain address is changed + */ + class awaiter final + { + public: + /** + * @brief Constructor of the class + * + * @param address pointer to memory that should be asynchronously changed + * @param initial_value value to compare with + * @param period number of clocks between checks + */ + explicit awaiter(volatile void *address, uint8_t initial_value, uint32_t period = 200) noexcept: + address_ptr_(reinterpret_cast(address)), + period_(period), + initial_value_(initial_value) + { + // Empty constructor + } + + /** + * @brief Destructor that performs actual wait + */ + ~awaiter() noexcept + { +#ifdef DML_EFFICIENT_WAIT + while (initial_value_ == *address_ptr_) + { + monitor_address(address_ptr_); + + auto start = current_time(); + wait_until(start + period_, idle_state_); + } +#else + while (initial_value_ == *address_ptr_) + { + _mm_pause(); + } +#endif + } + + private: + volatile uint8_t *address_ptr_; /**(&res), 0); + } + + void bind(operation &op, result &res) 
noexcept + { + auto view = core::any_descriptor(as_core(op)); + + view.flags() |= static_cast(flag::completion_record_address_valid) | static_cast(flag::request_completion_record); + + view.completion_record_address() = reinterpret_cast(&res); + + res.bytes[0] = 0; + } + + detail::execution_status get_status(result &res) noexcept + { + return static_cast(core::any_completion_record(as_core(res)).status()); + } + + detail::result_t get_result(result &res) noexcept + { + return core::any_completion_record(as_core(res)).result(); + } + + detail::transfer_size_t get_bytes_completed(result &res) noexcept + { + return core::any_completion_record(as_core(res)).bytes_completed(); + } + + detail::transfer_size_t get_delta_record_size(result &res) noexcept + { + return core::create_delta_completion_record(as_core(res)).delta_record_size(); + } + + detail::transfer_size_t get_crc_value(result &res) noexcept + { + return core::crc_completion_record(as_core(res)).crc_value(); + } + +} // namespace dml::detail::ml diff --git a/include/dml/cpp/middle_layer/validation.hpp b/sources/middle_layer/src/validation.cpp similarity index 70% rename from include/dml/cpp/middle_layer/validation.hpp rename to sources/middle_layer/src/validation.cpp index aa676b9..1528b08 100644 --- a/include/dml/cpp/middle_layer/validation.hpp +++ b/sources/middle_layer/src/validation.cpp @@ -14,14 +14,15 @@ * */ -#ifndef DML_ML_VALIDATION_HPP -#define DML_ML_VALIDATION_HPP +#include +#include -#include +#include "ml_utils.hpp" -namespace dml::ml +namespace dml::detail::ml { - [[nodiscard]] validation_status validate(descriptor &dsc) noexcept; -} // namespace dml::ml - -#endif //DML_ML_VALIDATION_HPP + validation_status validate(operation& op) noexcept + { + return core::validate(as_core(op)); + } +} // namespace dml::detail::ml diff --git a/sources/middle_layer/validation.cpp b/sources/middle_layer/validation.cpp deleted file mode 100644 index 6d29f3c..0000000 --- a/sources/middle_layer/validation.cpp +++ 
/dev/null @@ -1,315 +0,0 @@ -/* - * Copyright 2021 Intel Corporation. - * - * This software and the related documents are Intel copyrighted materials, - * and your use of them is governed by the express license under which they - * were provided to you ("License"). Unless the License provides otherwise, - * you may not use, modify, copy, publish, distribute, disclose or transmit - * this software or the related documents without Intel's prior written - * permission. - * - * This software and the related documents are provided as is, with no - * express or implied warranties, other than those that are expressly - * stated in the License. - * - */ - -#include -#include -#include - -#include "utils.hpp" - -namespace dml::ml -{ - static constexpr uint32_t dif_block_sizes[4] = { 512u, 520u, 4096u, 4104u }; - - [[nodiscard]] static validation_status validate(views::nop_descriptor nop) noexcept; - - [[nodiscard]] static validation_status validate(views::batch_descriptor batch) noexcept; - - [[nodiscard]] static validation_status validate(views::drain_descriptor drain) noexcept; - - [[nodiscard]] static validation_status validate(views::mem_move_descriptor mem_move) noexcept; - - [[nodiscard]] static validation_status validate(views::fill_descriptor fill) noexcept; - - [[nodiscard]] static validation_status validate(views::compare_descriptor compare) noexcept; - - [[nodiscard]] static validation_status validate(views::compare_pattern_descriptor compare_pattern) noexcept; - - [[nodiscard]] static validation_status validate(views::create_delta_descriptor create_delta) noexcept; - - [[nodiscard]] static validation_status validate(views::apply_delta_descriptor apply_delta) noexcept; - - [[nodiscard]] static validation_status validate(views::dualcast_descriptor dualcast) noexcept; - - [[nodiscard]] static validation_status validate(views::crc_descriptor crc) noexcept; - - [[nodiscard]] static validation_status validate(views::copy_crc_descriptor copy_crc) noexcept; - - 
[[nodiscard]] static validation_status validate(views::dif_check_descriptor dif_check) noexcept; - - [[nodiscard]] static validation_status validate(views::dif_insert_descriptor dif_insert) noexcept; - - [[nodiscard]] static validation_status validate(views::dif_strip_descriptor dif_strip) noexcept; - - [[nodiscard]] static validation_status validate(views::dif_update_descriptor dif_update) noexcept; - - [[nodiscard]] static validation_status validate(views::cache_flush_descriptor cache_flush) noexcept; - - validation_status validate(descriptor &dsc) noexcept - { - auto view = views::any_descriptor(dsc); - - switch (static_cast(view.operation())) - { - case operation::nop: - return validate(views::nop_descriptor(dsc)); - case operation::batch: - return validate(views::batch_descriptor(dsc)); - case operation::drain: - return validate(views::drain_descriptor(dsc)); - case operation::memory_move: - return validate(views::mem_move_descriptor(dsc)); - case operation::fill: - return validate(views::fill_descriptor(dsc)); - case operation::compare: - return validate(views::compare_descriptor(dsc)); - case operation::compare_pattern: - return validate(views::compare_pattern_descriptor(dsc)); - case operation::create_delta: - return validate(views::create_delta_descriptor(dsc)); - case operation::apply_delta: - return validate(views::apply_delta_descriptor(dsc)); - case operation::dualcast: - return validate(views::dualcast_descriptor(dsc)); - case operation::crc: - return validate(views::crc_descriptor(dsc)); - case operation::copy_crc: - return validate(views::copy_crc_descriptor(dsc)); - case operation::dif_check: - return validate(views::dif_check_descriptor(dsc)); - case operation::dif_insert: - return validate(views::dif_insert_descriptor(dsc)); - case operation::dif_strip: - return validate(views::dif_strip_descriptor(dsc)); - case operation::dif_update: - return validate(views::dif_update_descriptor(dsc)); - case operation::cache_flush: - return 
validate(views::cache_flush_descriptor(dsc)); - default: - return validation_status::unsupported_operation; - } - } - - [[nodiscard]] static validation_status validate(views::nop_descriptor nop) noexcept - { - static_cast(nop); - - return validation_status::success; - } - - [[nodiscard]] static validation_status validate(views::drain_descriptor drain) noexcept - { - static_cast(drain); - - return validation_status::success; - } - - [[nodiscard]] static validation_status validate(views::mem_move_descriptor mem_move) noexcept - { - RETURN_STATUS_IF(any_equal_zero(mem_move.source_address(), mem_move.destination_address()), validation_status::address_is_null); - RETURN_STATUS_IF(any_equal_zero(mem_move.transfer_size()), validation_status::size_is_null); - - return validation_status::success; - } - - [[nodiscard]] static validation_status validate(views::fill_descriptor fill) noexcept - { - RETURN_STATUS_IF(any_equal_zero(fill.destination_address()), validation_status::address_is_null); - RETURN_STATUS_IF(any_equal_zero(fill.transfer_size()), validation_status::size_is_null); - - return validation_status::success; - } - - [[nodiscard]] static validation_status validate(views::compare_descriptor compare) noexcept - { - RETURN_STATUS_IF(any_equal_zero(compare.source_1_address(), compare.source_2_address()), validation_status::address_is_null); - RETURN_STATUS_IF(any_equal_zero(compare.transfer_size()), validation_status::size_is_null); - - return validation_status::success; - } - - [[nodiscard]] static validation_status validate(views::compare_pattern_descriptor compare_pattern) noexcept - { - RETURN_STATUS_IF(any_equal_zero(compare_pattern.source_address()), validation_status::address_is_null); - RETURN_STATUS_IF(any_equal_zero(compare_pattern.transfer_size()), validation_status::size_is_null); - - return validation_status::success; - } - - [[nodiscard]] static validation_status validate(views::create_delta_descriptor create_delta) noexcept - { - constexpr auto max_size 
= 0x80000; - - RETURN_STATUS_IF( - any_equal_zero(create_delta.source_1_address(), create_delta.source_2_address(), create_delta.delta_record_address()), - validation_status::address_is_null); - - RETURN_STATUS_IF(any_equal_zero(create_delta.transfer_size(), create_delta.maximum_delta_record_size()), - validation_status::size_is_null); - - RETURN_STATUS_IF( - any_misaligned<8u>(create_delta.source_1_address(), create_delta.source_2_address(), create_delta.delta_record_address()), - validation_status::address_is_misaligned); - - RETURN_STATUS_IF(create_delta.transfer_size() % 8 != 0, validation_status::delta_input_size_is_wrong); - - RETURN_STATUS_IF(create_delta.transfer_size() > max_size, validation_status::delta_input_size_overflow); - - RETURN_STATUS_IF(create_delta.maximum_delta_record_size() % 10 != 0 || create_delta.maximum_delta_record_size() < 80, - validation_status::delta_record_size_is_wrong); - - return validation_status::success; - } - - [[nodiscard]] static validation_status validate(views::apply_delta_descriptor apply_delta) noexcept - { - constexpr auto max_size = 0x80000; - - RETURN_STATUS_IF(any_equal_zero(apply_delta.destination_address(), apply_delta.delta_record_address()), - validation_status::address_is_null); - RETURN_STATUS_IF(any_equal_zero(apply_delta.transfer_size(), apply_delta.delta_record_size()), validation_status::size_is_null); - - RETURN_STATUS_IF(overlaps(apply_delta.delta_record_address(), - apply_delta.delta_record_size(), - apply_delta.destination_address(), - apply_delta.transfer_size()), - validation_status::buffers_overlap); - - RETURN_STATUS_IF(any_misaligned<8u>(apply_delta.destination_address(), apply_delta.delta_record_address()), - validation_status::address_is_misaligned); - - RETURN_STATUS_IF(apply_delta.transfer_size() % 8 != 0, validation_status::delta_input_size_is_wrong); - - RETURN_STATUS_IF(apply_delta.transfer_size() > max_size, validation_status::delta_input_size_overflow); - - 
RETURN_STATUS_IF(apply_delta.delta_record_size() % 10 != 0, validation_status::delta_record_size_is_wrong); - - return validation_status::success; - } - - [[nodiscard]] static validation_status validate(views::dualcast_descriptor dualcast) noexcept - { - RETURN_STATUS_IF(any_equal_zero(dualcast.source_address(), dualcast.destination_1_address(), dualcast.destination_2_address()), - validation_status::address_is_null); - RETURN_STATUS_IF(any_equal_zero(dualcast.transfer_size()), validation_status::size_is_null); - - RETURN_STATUS_IF((dualcast.destination_1_address() & 0xFFFu) != (dualcast.destination_2_address() & 0xFFFu), - validation_status::dualcast_address_is_wrong); - - RETURN_STATUS_IF(overlaps(dualcast.source_address(), dualcast.destination_1_address(), dualcast.transfer_size()), - validation_status::buffers_overlap); - - RETURN_STATUS_IF(overlaps(dualcast.source_address(), dualcast.destination_2_address(), dualcast.transfer_size()), - validation_status::buffers_overlap); - - RETURN_STATUS_IF(overlaps(dualcast.destination_1_address(), dualcast.destination_2_address(), dualcast.transfer_size()), - validation_status::buffers_overlap); - - return validation_status::success; - } - - [[nodiscard]] static validation_status validate(views::crc_descriptor crc) noexcept - { - RETURN_STATUS_IF(any_equal_zero(crc.source_address()), validation_status::address_is_null); - RETURN_STATUS_IF(any_equal_zero(crc.transfer_size()), validation_status::size_is_null); - - return validation_status::success; - } - - [[nodiscard]] static validation_status validate(views::copy_crc_descriptor copy_crc) noexcept - { - RETURN_STATUS_IF(any_equal_zero(copy_crc.source_address(), copy_crc.destination_address()), validation_status::address_is_null); - RETURN_STATUS_IF(any_equal_zero(copy_crc.transfer_size()), validation_status::size_is_null); - - RETURN_STATUS_IF(overlaps(copy_crc.source_address(), copy_crc.destination_address(), copy_crc.transfer_size()), - 
validation_status::buffers_overlap); - - return validation_status::success; - } - - [[nodiscard]] static validation_status validate(views::cache_flush_descriptor cache_flush) noexcept - { - RETURN_STATUS_IF(any_equal_zero(cache_flush.destination_address()), validation_status::address_is_null); - RETURN_STATUS_IF(any_equal_zero(cache_flush.transfer_size()), validation_status::size_is_null); - - return validation_status::success; - } - - [[nodiscard]] static validation_status validate(views::dif_check_descriptor dif_check) noexcept - { - const auto block_size = dif_block_sizes[dif_check.dif_flags() & 0b11]; - - RETURN_STATUS_IF(any_equal_zero(dif_check.source_address()), validation_status::address_is_null); - RETURN_STATUS_IF(any_equal_zero(dif_check.transfer_size()), validation_status::size_is_null); - RETURN_STATUS_IF(dif_check.transfer_size() % (block_size + sizeof(uint64_t)) != 0, validation_status::dif_size_is_wrong); - - return validation_status::success; - } - - [[nodiscard]] static validation_status validate(views::dif_insert_descriptor dif_insert) noexcept - { - const auto block_size = dif_block_sizes[dif_insert.dif_flags() & 0b11]; - - RETURN_STATUS_IF(any_equal_zero(dif_insert.source_address(), dif_insert.destination_address()), validation_status::address_is_null); - RETURN_STATUS_IF(any_equal_zero(dif_insert.transfer_size()), validation_status::size_is_null); - RETURN_STATUS_IF(dif_insert.transfer_size() % block_size != 0, validation_status::dif_size_is_wrong); - - const auto src_size = dif_insert.transfer_size(); - const auto dst_size = (src_size / block_size) * (block_size + static_cast(sizeof(uint64_t))); - RETURN_STATUS_IF(overlaps(dif_insert.source_address(), src_size, dif_insert.destination_address(), dst_size), - validation_status::buffers_overlap); - - return validation_status::success; - } - - [[nodiscard]] static validation_status validate(views::dif_strip_descriptor dif_strip) noexcept - { - const auto block_size = 
dif_block_sizes[dif_strip.dif_flags() & 0b11]; - - RETURN_STATUS_IF(any_equal_zero(dif_strip.source_address(), dif_strip.destination_address()), validation_status::address_is_null); - RETURN_STATUS_IF(any_equal_zero(dif_strip.transfer_size()), validation_status::size_is_null); - RETURN_STATUS_IF(dif_strip.transfer_size() % (block_size + sizeof(uint64_t)) != 0, validation_status::dif_size_is_wrong); - - const auto src_size = dif_strip.transfer_size(); - const auto dst_size = (src_size / (block_size + static_cast(sizeof(uint64_t)))) * block_size; - RETURN_STATUS_IF(overlaps(dif_strip.source_address(), src_size, dif_strip.destination_address(), dst_size), - validation_status::buffers_overlap); - - return validation_status::success; - } - - [[nodiscard]] static validation_status validate(views::dif_update_descriptor dif_update) noexcept - { - const auto block_size = dif_block_sizes[dif_update.dif_flags() & 0b11]; - - RETURN_STATUS_IF(any_equal_zero(dif_update.source_address(), dif_update.destination_address()), validation_status::address_is_null); - RETURN_STATUS_IF(any_equal_zero(dif_update.transfer_size()), validation_status::size_is_null); - RETURN_STATUS_IF(dif_update.transfer_size() % (block_size + sizeof(uint64_t)) != 0, validation_status::dif_size_is_wrong); - RETURN_STATUS_IF(overlaps(dif_update.source_address(), dif_update.destination_address(), dif_update.transfer_size()), - validation_status::buffers_overlap); - - return validation_status::success; - } - - [[nodiscard]] static validation_status validate(views::batch_descriptor batch) noexcept - { - RETURN_STATUS_IF(any_equal_zero(batch.descriptor_list_address()), validation_status::address_is_null); - RETURN_STATUS_IF(batch.descriptors_count() < 4, validation_status::batch_size_is_wrong); - - return validation_status::success; - } - -} // namespace dml::ml