Skip to content

Commit

Permalink
openvino_extensions
Browse files Browse the repository at this point in the history
  • Loading branch information
Wovchena committed Nov 10, 2023
1 parent debcb5d commit b739ffd
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 26 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,4 @@ endif()
# Link the target against the OpenVINO runtime; PRIVATE keeps the dependency out of the target's link interface.
target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime)

# Pass every entry of CUSTOM_OPERATIONS to the compiler as a preprocessor definition for this target only.
target_compile_definitions(${TARGET_NAME} PRIVATE ${CUSTOM_OPERATIONS})
# Expose ./include/ to this target and, because of PUBLIC, to targets that link against it.
# NOTE(review): the hunk header (101,3 -> 101,4) indicates this is the line added by the commit.
target_include_directories(${TARGET_NAME} PUBLIC ./include/)
Original file line number Diff line number Diff line change
Expand Up @@ -6,55 +6,56 @@

#include <openvino/runtime/tensor.hpp>

namespace openvino_extensions {
// Pack any container of strings into an ov::Tensor with element type u8
// Requirements for BatchOfStrings: .size() with size and .begin(), .end() as iterators, elements with .begin(), .end() and .length()
// Requirements for BatchOfStrings: .size() with size and .begin(), .end() as iterators, elements with .begin(), .end() and .size()
// so basically any STL container with std::string is compatible
// Tensor destination will be reshaped according to the input data
// NOTE(review): this listing is a rendered diff, so the pre-commit and post-commit
// forms of several statements appear back to back (for example the two
// `symbols_size` declarations and the two `for` headers); presumably only one
// line of each pair exists in the actual file.
//
// Packed layout written into `destination`, as established by the code below:
//   int32 batch_size
//   int32 0                      (begin offset of the first string)
//   batch_size x int32           (running end offset of each string)
//   symbols_size bytes           (all string contents, concatenated in order)
template <typename BatchOfStrings>
void pack_strings(const BatchOfStrings& strings, ov::Tensor& destination) {
auto batch_size = strings.size();

// First run over all elements: calculate total memory required to hold all strings
auto symbols_size = std::accumulate(
size_t symbols_size = std::accumulate(
strings.begin(), strings.end(), size_t(0),
[](size_t accum, typename BatchOfStrings::const_reference s)
{ return accum + s.length(); });
[](size_t accum, typename BatchOfStrings::const_reference str)
{ return accum + str.size(); });

// Header is (2 + batch_size) integers of 4 bytes each, followed by the raw characters.
auto total_size = 4*(1 + 1 + batch_size) + symbols_size;
size_t total_size = 4 * (1 + 1 + batch_size) + symbols_size;
// Resize the destination to a one-dimensional shape of total_size elements.
destination.set_shape({total_size});

// Older form: three separate pointers (batch size, offsets, symbols) into the tensor buffer.
auto data = destination.data<uint8_t>();
auto pbatch_size = reinterpret_cast<int32_t*>(data);
auto pindices = pbatch_size + 1;
auto psymbols = reinterpret_cast<char*>(pindices + 1 + batch_size);
// Newer form: one int32_t cursor; the two fixed header slots are written first and
// the cursor is then advanced so that it points at the first end-offset slot.
// NOTE(review): batch_size (and the offsets below) are size_t values stored into
// int32_t slots, so inputs beyond INT32_MAX would not fit - confirm callers stay below that.
int32_t* pindices = reinterpret_cast<int32_t*>(destination.data<uint8_t>());
pindices[0] = batch_size;
pindices[1] = 0;
pindices += 2;
// The character area starts right after the batch_size end-offset slots.
char* psymbols = reinterpret_cast<char*>(pindices + batch_size);
size_t current_symbols_pos = 0;

// Older form of the header initialisation (batch size, then the leading zero offset).
*pbatch_size = batch_size;
*pindices = 0;

// Second run over all elements: copy each string's bytes and record its running end offset.
// The older loop header takes each element by value; the newer one binds a const reference.
for(auto s: strings) {
psymbols = std::copy(s.begin(), s.end(), psymbols);
current_symbols_pos += s.length();
*++pindices = current_symbols_pos;
for (const auto& str: strings) {
psymbols = std::copy(str.begin(), str.end(), psymbols);
current_symbols_pos += str.size();
*pindices = current_symbols_pos;
++pindices;
}
}

// Decode a packed string tensor back into a vector of strings.
// The buffer is read as: int32 batch size, int32 begin offset of the first string,
// one int32 end offset per string, then the concatenated string bytes.
// String i is built from symbols[begin_ids[i]] up to symbols[end_ids[i]], where
// end_ids is begin_ids shifted by one slot.
//
// NOTE(review): this listing is a rendered diff, so the pre-commit and post-commit
// forms of several statements appear back to back; presumably only one line of
// each pair exists in the actual file.
// NOTE(review): this is a non-template function defined without `inline`; if the
// file is a header included from several translation units that would cause
// duplicate-symbol link errors - confirm how it is included.
std::vector<std::string> unpack_strings(const ov::Tensor& source) {
auto strings = source.data<const uint8_t>();
auto length = source.get_byte_size();
// NOTE(review): the newer form stores the byte size in int32_t; presumably
// get_byte_size() returns a wider unsigned type, so very large tensors would be
// truncated here - confirm against the ov::Tensor API.
int32_t length = source.get_byte_size();
// check the format of the input bitstream representing the string tensor
OPENVINO_ASSERT(length >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor");
auto batch_size = *reinterpret_cast<const int32_t*>(strings + 0);
const int32_t* pindices = reinterpret_cast<const int32_t*>(source.data<const uint8_t>());
int32_t batch_size = pindices[0];
// Only the header size is validated: the buffer must hold the batch size, the first
// begin offset and batch_size end offsets.
// NOTE(review): a negative batch_size is not rejected here, and the offsets
// themselves are not checked against the size of the symbol area.
OPENVINO_ASSERT(length >= 4 + 4 + 4 * batch_size,
"Incorrect packed string tensor format: the packed string tensor must contain first string offset and end indices");
// Older form: byte-based pointer arithmetic from the start of the buffer.
auto begin_ids = reinterpret_cast<const int32_t*>(strings + 4);
auto end_ids = begin_ids + 1;
auto symbols = strings + 4 + 4 + 4 * batch_size;
// Newer form: the same three positions expressed as int32_t element offsets from pindices.
const int32_t* begin_ids = pindices + 1;
const int32_t* end_ids = pindices + 2;
const char* symbols = reinterpret_cast<const char*>(pindices + 2 + batch_size);

std::vector<std::string> result;
result.reserve(batch_size);
// The older loop uses a size_t index and push_back of a temporary std::string;
// the newer loop uses an int32_t index and constructs each string in place.
for(size_t i = 0; i < batch_size; ++i) {
result.push_back(std::string(symbols + begin_ids[i], symbols + end_ids[i]));
for (int32_t idx = 0; idx < batch_size; ++idx) {
result.emplace_back(symbols + begin_ids[idx], symbols + end_ids[idx]);
}
return result;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,6 @@ target_include_directories(${TARGET_NAME} PRIVATE
# fast_tokenizer
${FAST_TOKENIZER_INCS})

target_include_directories(${TARGET_NAME} PUBLIC ./include/)

# When CMAKE_CL_64 is set, define _CRT_SECURE_NO_WARNINGS and _SCL_SECURE_NO_WARNINGS
# for the sentencepiece-static target only (PRIVATE).
# NOTE(review): presumably this silences MSVC secure-CRT/STL deprecation warnings in
# the third-party sentencepiece sources - confirm.
if(CMAKE_CL_64)
target_compile_definitions(sentencepiece-static PRIVATE _CRT_SECURE_NO_WARNINGS _SCL_SECURE_NO_WARNINGS)
endif()
Expand Down

0 comments on commit b739ffd

Please sign in to comment.