Skip to content

Commit

Permalink
Merge pull request #2 from Wovchena/export-pack_strings-and-unpack_st…
Browse files Browse the repository at this point in the history
…rings

Export pack_strings() and unpack_strings()
  • Loading branch information
apaniukov authored Nov 15, 2023
2 parents 82639e6 + 1ec4c5f commit fb37580
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 60 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,4 @@ endif()
# Link the extension target against the OpenVINO runtime (PRIVATE: not propagated to consumers).
target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime)

# Pass the contents of CUSTOM_OPERATIONS as compile definitions for this target only.
# NOTE(review): CUSTOM_OPERATIONS is presumably populated earlier in this file - confirm.
target_compile_definitions(${TARGET_NAME} PRIVATE ${CUSTOM_OPERATIONS})
# PUBLIC: ./include/ is added to this target's include path and to the include path of
# every target that links against it.
target_include_directories(${TARGET_NAME} PUBLIC ./include/)
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <numeric>
#include <string>
#include <vector>

#include <openvino/runtime/tensor.hpp>

namespace openvino_extensions {
// Pack any container with string to ov::Tensor with element type u8
// Requirements for BatchOfStrings: .size() with size and .begin(), .end() as iterators, elements with .begin(), .end() and .size()
// so basically any STL container with std::string is compatible
// Tensor destination will be reshaped according the input data
template <typename BatchOfStrings>
void pack_strings(const BatchOfStrings& strings, ov::Tensor& destination) {
auto batch_size = strings.size();

// First run over all elements: calculate total memory required to hold all strings
size_t symbols_size = std::accumulate(
strings.begin(), strings.end(), size_t(0),
[](size_t accum, typename BatchOfStrings::const_reference str)
{ return accum + str.size(); });

size_t total_size = 4 * (1 + 1 + batch_size) + symbols_size;
destination.set_shape({total_size});

int32_t* pindices = reinterpret_cast<int32_t*>(destination.data<uint8_t>());
pindices[0] = batch_size;
pindices[1] = 0;
pindices += 2;
char* psymbols = reinterpret_cast<char*>(pindices + batch_size);
size_t current_symbols_pos = 0;

for (const auto& str: strings) {
psymbols = std::copy(str.begin(), str.end(), psymbols);
current_symbols_pos += str.size();
*pindices = current_symbols_pos;
++pindices;
}
}

// Unpack a u8 tensor produced by pack_strings() back into a vector of strings.
//
// Expected layout of `source` (every integer is an int32_t in host byte order):
//   [batch_size][begin offset of string 0][end offset of string 0] ... [end offset of the last string][symbols]
// where the end offset of string i is at the same time the begin offset of string i + 1 and
// all offsets are relative to the beginning of the symbols area.
//
// Fails with OPENVINO_ASSERT when the tensor is too small for its header, when the batch
// size is negative or when an offset points outside of the symbols area.
//
// Declared inline because the function is defined in a header: without it every translation
// unit including this file would emit its own definition and linking would fail.
inline std::vector<std::string> unpack_strings(const ov::Tensor& source) {
    const size_t length = source.get_byte_size();
    // check the format of the input bitstream representing the string tensor
    OPENVINO_ASSERT(length >= sizeof(int32_t),
        "Incorrect packed string tensor format: no batch size in the packed string tensor");
    const int32_t* pindices = reinterpret_cast<const int32_t*>(source.data<const uint8_t>());
    const int32_t batch_size = pindices[0];
    OPENVINO_ASSERT(batch_size >= 0,
        "Incorrect packed string tensor format: negative batch size in the packed string tensor");
    // batch size + first begin offset + one end offset per string; computed in size_t to avoid int32_t overflow
    const size_t header_size = sizeof(int32_t) * (1 + 1 + static_cast<size_t>(batch_size));
    OPENVINO_ASSERT(length >= header_size,
        "Incorrect packed string tensor format: the packed string tensor must contain first string offset and end indices");
    const int32_t* begin_ids = pindices + 1;
    const int32_t* end_ids = pindices + 2;
    const char* symbols = reinterpret_cast<const char*>(pindices + 2 + batch_size);
    const size_t symbols_size = length - header_size;

    std::vector<std::string> result;
    result.reserve(static_cast<size_t>(batch_size));
    for (int32_t idx = 0; idx < batch_size; ++idx) {
        const int32_t begin = begin_ids[idx];
        const int32_t end = end_ids[idx];
        // Offsets come from the tensor content, never read outside of the symbols area
        OPENVINO_ASSERT(begin >= 0 && begin <= end && static_cast<size_t>(end) <= symbols_size,
            "Incorrect packed string tensor format: string offsets are out of range");
        result.emplace_back(symbols + begin, symbols + end);
    }
    return result;
}
}
55 changes: 0 additions & 55 deletions modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -222,58 +222,3 @@ std::shared_ptr<Node> string_attribute_to_constant (const ov::frontend::NodeCont
return std::make_shared<Constant>(element::u8, Shape{value.length()}, (const void*)value.data());
#endif
}


// Pack any container with string to ov::Tensor with element type u8
// Requirements for BatchOfStrings: .size() with size and .begin(), .end() as iterators, elements with .begin(), .end() and .length()
// so basically any STL container with std::string is compatible
// Tensor destination will be reshaped according to the input data
// Packed layout (every integer is an int32_t): batch size, begin offset of the first string (0),
// one end offset per string, then the symbols of all strings without separators
template <typename BatchOfStrings>
void pack_strings (const BatchOfStrings& strings, ov::Tensor& destination) {
    auto batch_size = strings.size();

    // First run over all elements: calculate total memory required to hold all strings
    auto symbols_size = std::accumulate(
        strings.begin(), strings.end(), size_t(0),
        [](size_t accum, typename BatchOfStrings::const_reference s)
        { return accum + s.length(); });

    // batch size + first begin offset + one end offset per string, followed by the symbols
    auto total_size = 4*(1 + 1 + batch_size) + symbols_size;
    destination.set_shape({total_size});

    auto data = destination.data<uint8_t>();
    auto pbatch_size = reinterpret_cast<int32_t*>(data);
    auto pindices = pbatch_size + 1;
    // Symbols area starts right after the last end offset
    auto psymbols = reinterpret_cast<char*>(pindices + 1 + batch_size);
    size_t current_symbols_pos = 0;

    *pbatch_size = static_cast<int32_t>(batch_size);
    *pindices = 0;

    // Iterate by const reference: taking elements by value copied every string once more
    for (const auto& s : strings) {
        psymbols = std::copy(s.begin(), s.end(), psymbols);
        current_symbols_pos += s.length();
        *++pindices = static_cast<int32_t>(current_symbols_pos);
    }
}


// Unpack a u8 tensor holding packed strings into a vector of strings.
// Expected layout (every integer is an int32_t): batch size, begin offset of the first string,
// one end offset per string, then the symbols of all strings; offsets are relative to the
// beginning of the symbols area and the end of string i is the begin of string i + 1.
// Fails with OPENVINO_ASSERT when the tensor is too small for its header, when the batch size
// is negative or when an offset points outside of the symbols area.
std::vector<std::string> unpack_strings (const ov::Tensor& source) {
    auto strings = source.data<const uint8_t>();
    const size_t length = source.get_byte_size();
    // check the format of the input bitstream representing the string tensor
    OPENVINO_ASSERT(length >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor");
    const int32_t batch_size = *reinterpret_cast<const int32_t*>(strings + 0);
    OPENVINO_ASSERT(batch_size >= 0,
        "Incorrect packed string tensor format: negative batch size in the packed string tensor");
    // batch size + first begin offset + one end offset per string
    const size_t header_size = 4 + 4 + 4 * static_cast<size_t>(batch_size);
    OPENVINO_ASSERT(length >= header_size,
        "Incorrect packed string tensor format: the packed string tensor must contain first string offset and end indices");
    auto begin_ids = reinterpret_cast<const int32_t*>(strings + 4);
    auto end_ids = begin_ids + 1;
    auto symbols = strings + header_size;
    const size_t symbols_size = length - header_size;

    std::vector<std::string> result;
    result.reserve(static_cast<size_t>(batch_size));
    for (int32_t i = 0; i < batch_size; ++i) {
        const int32_t begin = begin_ids[i];
        const int32_t end = end_ids[i];
        // Offsets come from the tensor content, never read outside of the symbols area
        OPENVINO_ASSERT(begin >= 0 && begin <= end && static_cast<size_t>(end) <= symbols_size,
            "Incorrect packed string tensor format: string offsets are out of range");
        result.emplace_back(symbols + begin, symbols + end);
    }
    return result;
}
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,3 @@ bool evaluate_normalization_helper (
std::function<std::string(const std::string&)> normalizer);

std::shared_ptr<ov::Node> string_attribute_to_constant (const ov::frontend::NodeContext& node, const std::string& name);

// Pack any container with string to ov::Tensor with element type u8.
// The destination tensor is reshaped to fit the packed data.
template <typename BatchOfStrings>
void pack_strings (const BatchOfStrings& strings, ov::Tensor& destination);

// Unpack a packed string tensor into a vector of strings, one element per packed string.
// Asserts that the tensor is large enough to hold the batch size and the string offsets.
std::vector<std::string> unpack_strings(const ov::Tensor& source);

0 comments on commit fb37580

Please sign in to comment.