diff --git a/modules/custom_operations/user_ie_extensions/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/CMakeLists.txt
index 26f438b07..c830c0a21 100644
--- a/modules/custom_operations/user_ie_extensions/CMakeLists.txt
+++ b/modules/custom_operations/user_ie_extensions/CMakeLists.txt
@@ -101,3 +101,4 @@ endif()
 target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime)
 
 target_compile_definitions(${TARGET_NAME} PRIVATE ${CUSTOM_OPERATIONS})
+target_include_directories(${TARGET_NAME} PUBLIC ./include/)
diff --git a/modules/custom_operations/user_ie_extensions/include/openvino_extensions/strings.hpp b/modules/custom_operations/user_ie_extensions/include/openvino_extensions/strings.hpp
new file mode 100644
index 000000000..5bfe85e5a
--- /dev/null
+++ b/modules/custom_operations/user_ie_extensions/include/openvino_extensions/strings.hpp
@@ -0,0 +1,92 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include <openvino/core/except.hpp>
+#include <openvino/runtime/tensor.hpp>
+
+namespace openvino_extensions {
+// Pack any container with strings to ov::Tensor with element type u8
+// Requirements for BatchOfStrings: .size() with size and .begin(), .end() as iterators, elements with .begin(), .end() and .size()
+// so basically any STL container with std::string is compatible
+// Tensor destination will be reshaped according to the input data
+//
+// Packed layout, every integer being an int32_t:
+//   [batch_size][0][end offset of string 0]...[end offset of string batch_size - 1][characters of all strings]
+// Throws (OPENVINO_ASSERT) if the batch size or the total number of characters does not fit into int32_t
+template <typename BatchOfStrings>
+void pack_strings(const BatchOfStrings& strings, ov::Tensor& destination) {
+    const size_t batch_size = strings.size();
+
+    // First run over all elements: calculate total memory required to hold all strings
+    const size_t symbols_size = std::accumulate(
+        strings.begin(), strings.end(), size_t(0),
+        [](size_t accum, typename BatchOfStrings::const_reference str)
+        { return accum + str.size(); });
+
+    // The batch size and every offset are stored as int32_t: refuse inputs that would be silently truncated
+    const size_t max_int32 = static_cast<size_t>(std::numeric_limits<int32_t>::max());
+    OPENVINO_ASSERT(batch_size <= max_int32 && symbols_size <= max_int32,
+        "Cannot pack strings: batch size or total length of the strings does not fit into int32");
+
+    const size_t total_size = sizeof(int32_t) * (1 + 1 + batch_size) + symbols_size;
+    destination.set_shape({total_size});
+
+    int32_t* pindices = reinterpret_cast<int32_t*>(destination.data());
+    pindices[0] = static_cast<int32_t>(batch_size);
+    pindices[1] = 0;
+    pindices += 2;
+    char* psymbols = reinterpret_cast<char*>(pindices + batch_size);
+    size_t current_symbols_pos = 0;
+
+    // Second run: copy the characters and record where each string ends
+    for (const auto& str : strings) {
+        psymbols = std::copy(str.begin(), str.end(), psymbols);
+        current_symbols_pos += str.size();
+        *pindices = static_cast<int32_t>(current_symbols_pos);
+        ++pindices;
+    }
+}
+
+// Unpack a tensor produced by pack_strings back to a vector of strings
+// Declared inline because this non-template function is defined in a header that several translation units may include
+// Throws (OPENVINO_ASSERT) if the tensor is too short for its header or the stored offsets point outside of the tensor
+inline std::vector<std::string> unpack_strings(const ov::Tensor& source) {
+    const size_t length = source.get_byte_size();
+    // check the format of the input bitstream representing the string tensor
+    OPENVINO_ASSERT(length >= sizeof(int32_t), "Incorrect packed string tensor format: no batch size in the packed string tensor");
+    const int32_t* pindices = reinterpret_cast<const int32_t*>(source.data());
+    const int32_t batch_size = pindices[0];
+    OPENVINO_ASSERT(batch_size >= 0, "Incorrect packed string tensor format: negative batch size");
+    // 64-bit arithmetic, so a huge batch size cannot wrap around and slip through the check below
+    const uint64_t header_size = sizeof(int32_t) * (uint64_t(1) + 1 + static_cast<uint64_t>(batch_size));
+    OPENVINO_ASSERT(length >= header_size,
+        "Incorrect packed string tensor format: the packed string tensor must contain first string offset and end indices");
+    const int32_t* begin_ids = pindices + 1;
+    const int32_t* end_ids = pindices + 2;
+    const char* symbols = reinterpret_cast<const char*>(pindices + 2 + batch_size);
+    const size_t symbols_size = length - static_cast<size_t>(header_size);
+
+    std::vector<std::string> result;
+    result.reserve(batch_size);
+    for (int32_t idx = 0; idx < batch_size; ++idx) {
+        const int32_t begin = begin_ids[idx];
+        const int32_t end = end_ids[idx];
+        // never read outside of the symbols area, whatever the tensor contains
+        OPENVINO_ASSERT(begin >= 0 && begin <= end && static_cast<size_t>(end) <= symbols_size,
+            "Incorrect packed string tensor format: string offsets are out of range");
+        result.emplace_back(symbols + begin, symbols + end);
+    }
+    return result;
+}
+}  // namespace openvino_extensions
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp
index 9ee3e15ba..3aaf6989e 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp
@@ -222,58 +222,3 @@ std::shared_ptr string_attribute_to_constant (const ov::frontend::NodeCont
     return std::make_shared(element::u8, Shape{value.length()}, (const void*)value.data());
 #endif
 }
-
-
-// Pack any container with string to ov::Tensor with element type u8
-// Requirements for BatchOfStrings: .size() with size and .begin(), .end() as iterators, elements with .begin(), .end() and .length()
-// so basically any STL container with std::string is compatible
-// Tensor destination will be reshaped according the input data
-template
-void pack_strings (const BatchOfStrings& strings, ov::Tensor& destination) {
-    auto batch_size = strings.size();
-
-    // First run over all elements: calculate total memory required to hold all strings
-    auto symbols_size = std::accumulate(
-        strings.begin(), strings.end(), size_t(0),
-        [](size_t accum, typename BatchOfStrings::const_reference s)
-        { return accum + s.length(); });
-
-    auto total_size = 4*(1 + 1 + batch_size) + symbols_size;
-    destination.set_shape({total_size});
-
-    auto data = destination.data();
-    auto pbatch_size = reinterpret_cast(data);
-    auto pindices = pbatch_size + 1;
-    auto psymbols = reinterpret_cast(pindices + 1 + batch_size);
-    size_t current_symbols_pos = 0;
-
-    *pbatch_size = batch_size;
-    *pindices = 0;
-
-    for(auto s: strings) {
-        psymbols = std::copy(s.begin(), s.end(), psymbols);
-        current_symbols_pos += s.length();
-        *++pindices = current_symbols_pos;
-    }
-}
-
-
-std::vector unpack_strings (const ov::Tensor& source) {
-    auto strings = source.data();
-    auto length = source.get_byte_size();
-    // check the format of the input bitstream representing the string tensor
-    OPENVINO_ASSERT(length >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor");
-    auto batch_size = *reinterpret_cast(strings + 0);
-    OPENVINO_ASSERT(length >= 4 + 4 + 4 * batch_size,
-        "Incorrect packed string tensor format: the packed string tensor must contain first string offset and end indices");
-    auto begin_ids = reinterpret_cast(strings + 4);
-    auto end_ids = begin_ids + 1;
-    auto symbols = strings + 4 + 4 + 4 * batch_size;
-
-    std::vector result;
-    result.reserve(batch_size);
-    for(size_t i = 0; i < batch_size; ++i) {
-        result.push_back(std::string(symbols + begin_ids[i], symbols + end_ids[i]));
-    }
-    return result;
-}
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp
index a0d72b5fc..8ffbc9e04 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp
@@ -68,8 +68,3 @@ bool evaluate_normalization_helper (
     std::function normalizer);
 
 std::shared_ptr string_attribute_to_constant (const ov::frontend::NodeContext& node, const std::string& name);
-
-template
-void pack_strings (const BatchOfStrings& strings, ov::Tensor& destination);
-
-std::vector unpack_strings(const ov::Tensor& source);