
RunAsync output gibberish when multiple RunAsync infer in parallel. #18541

Closed
chaihahaha opened this issue Nov 21, 2023 · 2 comments
Labels
core runtime, platform:windows

Comments


chaihahaha commented Nov 21, 2023

Describe the issue

Sometimes the following error occurs:
[E:onnxruntime:, sequential_executor.cc:514 onnxruntime::ExecuteKernel] Non-zero status code returned while running BeamSearch node

and the output tensor ids are negative, which cannot be valid token ids.

To reproduce

Use any mT5 ONNX model and the following C++ code:

#include "onnxruntime_cxx_api.h"
#include <iostream>
#include <vector>
#include <string>
#include <filesystem>
#include <thread>
#include <atomic>
#include <functional>
#include <future>
#include <chrono>

#pragma comment(lib, "onnxruntime.lib")

static std::atomic_bool atomic_wait{false};

std::unique_ptr<Ort::Session> session = nullptr;
std::unique_ptr<Ort::Env> env = nullptr;

void print_tensor(Ort::Value& tensor)
{
    Ort::TensorTypeAndShapeInfo ts = tensor.GetTensorTypeAndShapeInfo();

    const int32_t* el = tensor.GetTensorData<int32_t>();
    for (int i = 0; i < ts.GetElementCount(); i++)
    {
        std::cout << el[i] << " ";
    }
    std::cout << std::endl;
    return;
}

void AsyncCallback(void* user_data, OrtValue** outputs, size_t num_outputs, OrtStatusPtr status_ptr) {
    OrtValue** input_tensors = reinterpret_cast<OrtValue**>(user_data);
    Ort::Value output_value(outputs[0]);
    print_tensor(output_value);
    atomic_wait.store(true);
    std::cout << "callback" << std::endl;
}

int wmain(int argc, wchar_t* argv[])
{
    if (argc != 8)
    {
        std::cout << "Incorrect number of commandline args,"
            << "should be `OrtMT5.exe [model_path] [max_length] [min_length] "
            << "[num_beams] [num_return_sequences] [length_penalty] [repetition_penalty]`" << std::endl;
        return -1;
    }

    int32_t max_length_int;
    int32_t min_length_int;
    int32_t num_beams_int;
    int32_t num_return_sequences_int;
    float length_penalty_float;
    float repetition_penalty_float;
    try
    {
        max_length_int = std::stoi(argv[2]);
        min_length_int = std::stoi(argv[3]);
        num_beams_int = std::stoi(argv[4]);
        num_return_sequences_int = std::stoi(argv[5]);
        length_penalty_float = std::stof(argv[6]);
        repetition_penalty_float = std::stof(argv[7]);
    }
    catch (const std::exception& e)
    {
        std::cout << "Caught " << e.what() << std::endl;
        std::cout << "Incorrect commandline args,"
            << "should be `OrtMT5.exe [model_path] [max_length] [min_length] "
            << "[num_beams] [num_return_sequences] [length_penalty] [repetition_penalty]`" << std::endl;
        return -1;
    }
    
    Ort::SessionOptions session_options;
    int max_threads = std::thread::hardware_concurrency();
    session_options.SetGraphOptimizationLevel(
        GraphOptimizationLevel::ORT_ENABLE_ALL);
    auto env_local = std::make_unique<Ort::Env>(OrtLoggingLevel::ORT_LOGGING_LEVEL_ERROR, "OrtMT5");
    env = std::move(env_local);

    const wchar_t* model_path = argv[1];

    if (!std::filesystem::exists(model_path))
    {
        std::cout << "Incorrect model path" << std::endl;
        return -1;
    }
    auto session_local = std::make_unique<Ort::Session>(*env, model_path, session_options);
    session = std::move(session_local);

    Ort::AllocatorWithDefaultOptions allocator;

    const size_t num_input_nodes = session->GetInputCount();
    std::vector<Ort::AllocatedStringPtr> input_names_ptr;
    std::vector<const char*> input_node_names;
    input_node_names.reserve(num_input_nodes);
    std::vector<int64_t> input_node_dims;

    // iterate over all input nodes
    for (size_t i = 0; i < num_input_nodes; i++) {
        auto input_name = session->GetInputNameAllocated(i, allocator);
        input_node_names.push_back(input_name.get());
        input_names_ptr.push_back(std::move(input_name));
    }

    int32_t input_length = -1;
    int32_t input_value = -1;
    std::vector<std::vector<Ort::Value>> input_tensors_pool;
    std::vector<std::vector<Ort::Value>> output_tensors_pool;

    int scnt = 0;

    while (std::cin >> input_length)
    {
        if (input_length <= 0)
        {
            std::cout << "invalid input, not a positive integer" << std::endl;
            return -1;
        }
        std::vector<int32_t> py_input_ids{};
        for (int i = 0; i < input_length; i++)
        {
            if (std::cin >> input_value)
            {
                py_input_ids.push_back(input_value);
            }
            else
            {
                std::cout << "invalid input, not an integer" << std::endl;
                return -1;
            }
        }

        Ort::ConstMemoryInfo memory_info = allocator.GetInfo();

        std::vector<int32_t> max_length_values{ max_length_int };
        std::vector<int64_t> max_length_dims{ 1 };
        Ort::Value max_length = Ort::Value::CreateTensor<int32_t>(memory_info, max_length_values.data(), max_length_values.size(),
            max_length_dims.data(), max_length_dims.size());

        std::vector<int32_t> min_length_values{ min_length_int };
        std::vector<int64_t> min_length_dims{ 1 };
        Ort::Value min_length = Ort::Value::CreateTensor<int32_t>(memory_info, min_length_values.data(), min_length_values.size(),
            min_length_dims.data(), min_length_dims.size());

        std::vector<int32_t> num_beams_values{ num_beams_int };
        std::vector<int64_t> num_beams_dims{ 1 };
        Ort::Value num_beams = Ort::Value::CreateTensor<int32_t>(memory_info, num_beams_values.data(), num_beams_values.size(),
            num_beams_dims.data(), num_beams_dims.size());

        std::vector<int32_t> num_return_sequences_values{ num_return_sequences_int };
        std::vector<int64_t> num_return_sequences_dims{ 1 };
        Ort::Value num_return_sequences = Ort::Value::CreateTensor<int32_t>(memory_info, num_return_sequences_values.data(), num_return_sequences_values.size(),
            num_return_sequences_dims.data(), num_return_sequences_dims.size());

        std::vector<float> length_penalty_values{ (float)length_penalty_float };
        std::vector<int64_t> length_penalty_dims{ 1 };
        Ort::Value length_penalty = Ort::Value::CreateTensor<float>(memory_info, length_penalty_values.data(), length_penalty_values.size(),
            length_penalty_dims.data(), length_penalty_dims.size());

        std::vector<float> repetition_penalty_values{ (float)repetition_penalty_float };
        std::vector<int64_t> repetition_penalty_dims = { 1 };
        Ort::Value repetition_penalty = Ort::Value::CreateTensor<float>(memory_info, repetition_penalty_values.data(), repetition_penalty_values.size(),
            repetition_penalty_dims.data(), repetition_penalty_dims.size());

        std::vector<int32_t> input_ids_values = py_input_ids;
        std::vector<int64_t> input_ids_dims{1, (int64_t)input_ids_values.size()};
        Ort::Value input_ids = Ort::Value::CreateTensor<int32_t>(memory_info, input_ids_values.data(), input_ids_values.size(),
            input_ids_dims.data(), input_ids_dims.size());

        std::vector<Ort::Value> input_tensors;
        input_tensors.push_back(std::move(input_ids));
        input_tensors.push_back(std::move(max_length));
        input_tensors.push_back(std::move(min_length));
        input_tensors.push_back(std::move(num_beams));
        input_tensors.push_back(std::move(num_return_sequences));
        input_tensors.push_back(std::move(length_penalty));
        input_tensors.push_back(std::move(repetition_penalty));
        input_tensors_pool.push_back(std::move(input_tensors));

        Ort::AllocatedStringPtr output_name_ptr = session->GetOutputNameAllocated(0, allocator);
        const char* output_name = output_name_ptr.get();

        std::vector<const char*> output_names{output_name};
        output_names.resize(1);
        std::vector<int64_t> output_ids_dims{1, 1, max_length_int};
        std::vector<Ort::Value> output_tensors;

        output_tensors.push_back(
                Ort::Value::CreateTensor<int32_t>(
                    allocator,
                    output_ids_dims.data(),
                    output_ids_dims.size()
                )
        );
        output_tensors_pool.push_back(std::move(output_tensors));


        std::cout << "calling runasync" <<std::endl;
        session->RunAsync(
            Ort::RunOptions{nullptr},
            input_node_names.data(),
            input_tensors_pool[scnt].data(),
            input_tensors_pool[scnt].size(),
            output_names.data(),
            output_tensors_pool[scnt].data(),
            output_names.size(),
            AsyncCallback,
            input_tensors_pool[scnt].data()
            );
        //std::chrono::duration<double, std::milli> dur{10};
        //while (!atomic_wait.load())
        //    std::this_thread::sleep_for(dur);
        //atomic_wait.store(false);
        scnt += 1;
    }
    std::cout << "called runasync" <<std::endl;
    std::chrono::duration<double, std::milli> dur{10000};
    std::this_thread::sleep_for(dur);
    return 0;
}

Test with the following command:

echo 51 1042 264 462 338 12001 25134 31150 70694 10978 67916 81634 574 86977 56157 66637 309 1 1042 264 462 338 12001 25134 31150 70694 10978 67916 81634 574 86977 56157 66637 309 1 1042 263 462 338 12001 25134 31250 70694 10978 67316 81634 574 86977 56157 66337 309 1 17 1042 263 462 238 12001 25134 30250 70694 10978 63316 81634 574 82977 56157 66337 309 1 5 1042 263 462 309 1 10 1042 263 462 238 12001 25134 30250 70694 10978 1 | main "model.onnx" 128 1 8 1 1.1 1.1

Urgency

No response

Platform

Windows

OS Version

Win10

ONNX Runtime Installation

Released Package

ONNX Runtime Version or Commit ID

1.16.2

ONNX Runtime API

C++

Architecture

X64

Execution Provider

Default CPU

Execution Provider Library Version

No response

@github-actions github-actions bot added the platform:windows issues related to the Windows platform label Nov 21, 2023
@chaihahaha chaihahaha changed the title RunAsync output gibberish when multiple RunAsync infering in parallel. RunAsync output gibberish when multiple RunAsync infer in parallel. Nov 21, 2023
@pranavsharma pranavsharma added the core runtime issues related to core runtime label Nov 21, 2023

chaihahaha commented Nov 21, 2023

It seems that adding a std::this_thread::sleep_for call to sleep for 50 ms after calling RunAsync mitigates the problem. It is still mysterious that input_tensors sometimes appears to be corrupted when read from the callback.


chaihahaha commented Nov 24, 2023

This is a lifetime problem with the Ort::Value objects. Keeping a global object list so the tensors are not released while inference is still running, and ensuring that no two threads access the same Ort::Value, solves it.
