Odla_trt optimized and model test #499

Open · wants to merge 5 commits into master
84 changes: 67 additions & 17 deletions ODLA/platforms/tensorrt/odla_tensorrt.cc
@@ -20,8 +20,11 @@
#include <NvInferRuntime.h>
#include <ODLA/odla.h>
#include <bits/stdint-intn.h>
#include <cuda.h>
#include <cuda_runtime.h>

#include <time.h>
#include <ctime>
#include <cassert>
#include <cmath>
#include <cstddef>
@@ -31,10 +34,12 @@
#include <numeric>
#include <unordered_map>
#include <vector>
#include <mutex>

#include "plugins/initPlugin.h"

using namespace nvinfer1;
using namespace std;

#if !defined(ODLA_VERSION_NUMBER) || (ODLA_VERSION_NUMBER < 50)
#error This library requires minimum ODLA version 0.5
@@ -173,8 +178,7 @@ struct _odla_computation {
network = builder->createNetworkV2(flags);
#endif
}
}
~_odla_computation() {
if (!load_engine_mode) {
builder->destroy();
@@ -189,6 +193,8 @@ struct _odla_context {
odla_computation comp = nullptr;
nvinfer1::ICudaEngine* engine = nullptr;
nvinfer1::IExecutionContext* ctx = nullptr;
void* temp_input_ptr = nullptr;
void* temp_output_ptr = nullptr;
Collaborator: what if there are multiple inputs/outputs?
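One possible direction for this concern, as a minimal sketch only: cache one device buffer per bound value, keyed by value name, instead of a single input/output pointer pair. Everything below is illustrative and not part of this PR; it assumes only the CUDA runtime API.

#include <cuda_runtime.h>
#include <cstddef>
#include <string>
#include <unordered_map>

// Hypothetical per-value device-buffer cache: one entry per input/output
// name, grown on demand, released when the owning context is destroyed.
struct DeviceBufferCache {
  struct Entry {
    void* ptr = nullptr;
    size_t capacity = 0;
  };
  std::unordered_map<std::string, Entry> bufs;

  // Returns a device buffer of at least `bytes` for `name`, reusing the
  // previous allocation whenever it is already large enough.
  void* Get(const std::string& name, size_t bytes) {
    Entry& e = bufs[name];
    if (e.capacity < bytes) {
      if (e.ptr != nullptr) cudaFree(e.ptr);
      cudaMalloc(&e.ptr, bytes);
      e.capacity = bytes;
    }
    return e.ptr;
  }

  ~DeviceBufferCache() {
    for (auto& kv : bufs) {
      if (kv.second.ptr != nullptr) cudaFree(kv.second.ptr);
    }
  }
};

With a member like this on _odla_context, odla_BindToArgument and odla_BindToOutput could call Get(value->name, bytes) instead of sharing temp_input_ptr and temp_output_ptr across all values.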

#if NV_TENSORRT_MAJOR >= 7
nvinfer1::IBuilderConfig* builder_cfg = nullptr;
nvinfer1::IOptimizationProfile* builder_profile = nullptr;
@@ -210,6 +216,7 @@
std::unordered_map<std::string, InputPtrInfo> input_ptrs;

int run_batch_size = 0;
// CUdeviceptr cumemalloc_address;
Collaborator: please remove

_odla_context(odla_computation comp) : comp(comp) {
if (!comp->load_engine_mode) {
#if NV_TENSORRT_MAJOR < 7
@@ -238,7 +245,6 @@
builder_cfg->addOptimizationProfile(builder_profile);
}
builder_cfg->setMaxWorkspaceSize(comp->max_workspace_size);

if (comp->fp16_mode) {
builder_cfg->setFlag(BuilderFlag::kFP16);
builder_cfg->setFlag(BuilderFlag::kSTRICT_TYPES);
@@ -509,13 +515,18 @@ odla_status odla_SetContextItem(odla_context context, odla_item_type type,
switch (type) {
case ODLA_RUN_BATCH_SIZE:
context->run_batch_size = *(reinterpret_cast<int*>(value));
// odla_value_shape real_shape = value->type.shape;
// size_t bytes =
// GetTotalElements(real_shape) * GetElementSize(value->type.element_type);
// CUdeviceptr dev_ptr;
// CHECK(cuMemAlloc(&dev_ptr, bytes));
Collaborator: remove dead code

// context->cumemalloc_address = dev_ptr;
break;

default:
std::cerr << "Unsupported property type: " << type << std::endl;
return ODLA_FAILURE;
}

return ODLA_SUCCESS;
}
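For reference, a hedged caller-side sketch of the property this case consumes. It assumes odla_item_value is a pointer typedef, as the reinterpret_cast above implies, and ctx stands in for a context created elsewhere; neither is confirmed by this diff.

// Set the per-run batch size before binding arguments (illustrative only).
int run_batch = 8;
odla_SetContextItem(ctx, ODLA_RUN_BATCH_SIZE, (odla_item_value)&run_batch);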

@@ -534,7 +545,15 @@ odla_value odla_CreateArgument(odla_value_type type, const odla_value_id id) {
auto input = g_comp->network->addInput(name, GetNVDataType(type.element_type),
GetNVDims(type.shape));
odla_value v = CreateValue(input, type, id);
g_comp->inputs[name] = v;
g_comp->inputs[name] = v; //inputs[input] = v
// odla_value_shape real_shape = v->type.shape;
// std::cerr << "odla_value_shape:" << real_shape << "\n";
// size_t bytes =
// GetTotalElements(real_shape) * GetTotalElements(v->type.element_type);
// CHECK(cudaMalloc(&dev_ptr, bytes));
Collaborator: same here

// void* validated_data_ptr =
// ValidateValuePtr(value->type, const_cast<void*>(data_ptr));
// // CHECK(cudaMemcpy(dev_ptr, ))
g_comp->input_vals.push_back(v);
return v;
}
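For orientation, a hedged sketch of the caller side of odla_CreateArgument. The diff only confirms the element_type, shape, and dims fields; the rank field name (size) and the string-style value id are assumptions.

// Declare a float32 NCHW input named "input" (field names partly assumed).
odla_value_type ty;
ty.element_type = ODLA_FLOAT32;
ty.shape.size = 4;  // rank; field name assumed
ty.shape.dims[0] = 1;
ty.shape.dims[1] = 3;
ty.shape.dims[2] = 224;
ty.shape.dims[3] = 224;
odla_value input = odla_CreateArgument(ty, (const odla_value_id)"input");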
@@ -576,7 +595,7 @@ odla_status odla_SetValueAsOutput(const odla_value val) {
val->tensor->setName(name);
g_comp->network->markOutput(*val->tensor);
return ODLA_SUCCESS;
}
odla_status odla_GetNumOfOutputsFromComputation(
const odla_computation computation, odla_uint32* num_outputs) {
*num_outputs = computation->output_vals.size();
@@ -594,22 +613,43 @@ odla_status odla_GetOutputFromComputationByIdx(
return ODLA_SUCCESS;
}

// This path runs once for every batch that is executed.
odla_status odla_BindToArgument(odla_value value, const odla_void* data_ptr,
odla_context context) {
// CUdeviceptr dev_ptr;
clock_t startTime, endTime;
void* dev_ptr = nullptr;
odla_value_shape real_shape = value->type.shape;
if ((g_comp && g_comp->is_dynamic_batch) || context->run_batch_size) {
real_shape.dims[0] = context->run_batch_size;
}
size_t bytes =
GetTotalElements(real_shape) * GetElementSize(value->type.element_type);
CHECK(cudaMalloc(&dev_ptr, bytes));
// CHECK(cuMemAlloc(&dev_ptr, bytes));
// CHECK(cudaMalloc(&dev_ptr, bytes));
// Check here whether a buffer was already cudaMalloc'd; if so, copy the data to that address.
// CUdeviceptr dev_ptr = context->cumemalloc_addres;
// std::cerr << "context->temp_input_ptr:" << context->temp_input_ptr << "\n";
if (context->temp_input_ptr == nullptr) {
CHECK(cudaMalloc(&(context->temp_input_ptr), bytes));
}
dev_ptr = context->temp_input_ptr;
void* validated_data_ptr =
ValidateValuePtr(value->type, const_cast<void*>(data_ptr));
// void* pagelocked_buffer = context->input_ptrs[value->name].host_ptr;
// startTime = clock();
// CHECK(cuMemcpyHtoD(dev_ptr, validated_data_ptr, bytes));
CHECK(cudaMemcpy(dev_ptr, validated_data_ptr, bytes, cudaMemcpyHostToDevice));

// endTime = clock();
// std::cout << "the run time is:" << (double) (endTime - startTime) /CLOCKS_PER_SEC << "s" << std::endl;
// std::ofstream outf;
// outf.open("odla_cudamemcpy_times.txt", std::ios::app);
// outf << (double) (endTime - startTime) /CLOCKS_PER_SEC << std::endl;
// outf.close();
// void* dev1_ptr;
// dev1_ptr = (void*) dev_ptr;
// CHECK(cudaMemcpy(dev_ptr, validated_data_ptr, bytes, cudaMemcpyHostToDevice));
context->input_ptrs[value->name] = {.host_ptr = data_ptr, .dev_ptr = dev_ptr};

return ODLA_SUCCESS;
}
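One caveat with the cached pointer above, shown as a hedged sketch: temp_input_ptr is sized by whichever bind runs first, so a later bind with a larger batch would overflow it. Tracking the capacity next to the pointer avoids that; temp_input_bytes is an assumed extra member, not in this diff, and the same concern applies to temp_output_ptr in odla_BindToOutput below.

// Hypothetical guard: grow the cached buffer when a bind needs more room.
if (context->temp_input_ptr == nullptr || context->temp_input_bytes < bytes) {
  if (context->temp_input_ptr != nullptr) {
    CHECK(cudaFree(context->temp_input_ptr));  // release the undersized buffer
  }
  CHECK(cudaMalloc(&(context->temp_input_ptr), bytes));
  context->temp_input_bytes = bytes;  // remember the new capacity
}
dev_ptr = context->temp_input_ptr;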

@@ -623,15 +663,19 @@ odla_status odla_BindToArgumentById(const odla_value_id value_id,

odla_status odla_BindToOutput(odla_value value, odla_void* data_ptr,
odla_context context) {
// CUdeviceptr dst;
void* dst = nullptr;
odla_value_shape real_shape = value->type.shape;
if ((g_comp && g_comp->is_dynamic_batch) || context->run_batch_size) {
real_shape.dims[0] = context->run_batch_size;
}
size_t bytes =
GetTotalElements(real_shape) * GetElementSize(value->type.element_type);

CHECK(cudaMalloc(&dst, bytes));
if (context->temp_output_ptr == nullptr) {
CHECK(cudaMalloc(&(context->temp_output_ptr), bytes));
}
dst = context->temp_output_ptr;
// CHECK(cudaMalloc(&dst, bytes));

context->output_ptrs[value->name] = {
.host_ptr = data_ptr, .dev_ptr = dst, .len = bytes, .vt = value->type};
@@ -852,6 +896,9 @@ odla_status odla_GetValueType(const odla_value value,
odla_status odla_ExecuteComputation(odla_computation comp, odla_context context,
odla_compute_mode mode,
odla_device device) {

// clock_t startTime, endTime;

std::vector<void*> buffers;
auto add_to_buffer = [&](const std::string& name, void* ptr) {
int idx = context->engine->getBindingIndex(name.c_str());
@@ -863,9 +910,14 @@ odla_status odla_ExecuteComputation(odla_computation comp, odla_context context,
}
};
for (auto& kv : context->input_ptrs) {
add_to_buffer(kv.first, kv.second.dev_ptr);
// void* kv_second_devptr;
// kv_second_devptr = (void*) kv.second.dev_ptr;
add_to_buffer(kv.first, kv.second.dev_ptr); //kv.first: input, kv.second.dev_ptr: 0x7f7698600000
}
for (auto& kv : context->output_ptrs) {
// void* kv_second_devptr;
// kv_second_devptr = (void*) kv.second.dev_ptr;
// add_to_buffer(kv.first, kv_second_devptr);
add_to_buffer(kv.first, kv.second.dev_ptr);
}
if (comp->is_dynamic_batch) {
@@ -894,12 +946,10 @@ odla_status odla_ExecuteComputation(odla_computation comp, odla_context context,
cudaMemcpyDeviceToHost));
}
}

// copy results and free temp buffers.
for (auto& ptr : buffers) {
CHECK(cudaFree(ptr));
}

// for (auto& ptr : buffers) {
// CHECK(cudaFree(ptr));
// }
context->input_ptrs.clear();
context->output_ptrs.clear();
return ODLA_SUCCESS;
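With the per-run cudaFree loop commented out above, the cached device buffers now outlive each execution and need an owner. A minimal sketch of what ~_odla_context could add; this destructor change is not part of the diff:

~_odla_context() {
  // Hypothetical cleanup: release the buffers cached once by
  // odla_BindToArgument and odla_BindToOutput.
  if (temp_input_ptr != nullptr) {
    cudaFree(temp_input_ptr);
    temp_input_ptr = nullptr;
  }
  if (temp_output_ptr != nullptr) {
    cudaFree(temp_output_ptr);
    temp_output_ptr = nullptr;
  }
  // ... existing engine/context teardown ...
}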
2 changes: 1 addition & 1 deletion models/env.src
@@ -2,7 +2,7 @@
export SRC_DIR=${PWD}/..
export BUILD_DIR=$SRC_DIR/build
export MODELS_SRC=/models

export MODELS_ROOT=/models
Collaborator: why add a new variable?

export HALO_BIN=$BUILD_DIR/bin/halo
export ODLA_INC=$SRC_DIR/ODLA/include
export ODLA_LIB=$BUILD_DIR/lib
Empty file modified models/vision/classification/1000_labels.txt
100644 → 100755
Empty file.
13 changes: 8 additions & 5 deletions models/vision/classification/alexnet/run_alexnet.sh
@@ -12,17 +12,20 @@ fi

if [[ $TEST_WITH_GPU -eq 1 ]]; then
echo "======== Testing with ODLA TensorRT ========"
for i in 1 2 4 8 16 32 64
do
Collaborator: where is "i" used?

python3 $curr_dir/../../invoke_halo.py --model $model_file \
--label-file $curr_dir/../1000_labels.txt --image-dir $image_dir \
--odla tensorrt | tee $1
done
# RUN: FileCheck --input-file %t.1 %s
fi

# Using HALO to compile and run inference with ODLA XNNPACK
echo "======== Testing with ODLA DNNL ========"
python3 $curr_dir/../../invoke_halo.py --model $model_file \
--label-file $curr_dir/../1000_labels.txt --image-dir $image_dir \
--odla dnnl | tee $2
# # Using HALO to compile and run inference with ODLA XNNPACK
# echo "======== Testing with ODLA DNNL ========"
# python3 $curr_dir/../../invoke_halo.py --model $model_file \
# --label-file $curr_dir/../1000_labels.txt --image-dir $image_dir \
# --odla dnnl | tee $2
# RUN: FileCheck --input-file %t.2 %s

# CHECK: dog.jpg ==> "wallaby, brush kangaroo",
29 changes: 29 additions & 0 deletions models/vision/classification/alexnet/run_alexnet_tensorrt.sh
@@ -0,0 +1,29 @@
#!/bin/bash
# RUN: %s
model_name="alexnet"
docker_model_file="/models/vision/classification/$model_name"
Collaborator: use $MODEL_ROOT

model_file="$docker_model_file/$model_name"".onnx"
image_dir="/models/vision/test_images"
curr_dir=`dirname $0`

# # Download the model if it does not exist
# if [ ! -e $model_file ]; then
# $curr_dir/../get_cls_model_from_pytorch.py $model_name $model_file
# fi

# Download sample images if they do not exist
# $curr_dir/../../get_images.sh $image_dir

echo "=======Testing alexnet with TensorRT======="
python3 $curr_dir/../../onnx2tensorrt.py --model $model_file \
--label-file $curr_dir/../1000_labels.txt


# if [[ $TEST_WITH_GPU -eq 1 ]]; then
# echo "======== Testing with ODLA TensorRT ========"
# python3 $curr_dir/../../invoke_halo.py --model $model_file --image-dir $image_dir --odla tensorrt
# fi

# # Using HALO to compile and run inference with ODLA XNNPACK
# echo "======== Testing with ODLA DNNL ========"
# python3 $curr_dir/../../invoke_halo.py --model $model_file --label-file $curr_dir/../1000_labels.txt --image-dir $image_dir --odla dnnl
26 changes: 0 additions & 26 deletions models/vision/classification/caffenet/run_caffenet_onnx.sh

This file was deleted.

Empty file modified models/vision/classification/coco_classes.txt
100644 → 100755
Empty file.
5 changes: 4 additions & 1 deletion models/vision/classification/densenet/run_densenet.sh
@@ -12,9 +12,12 @@ fi
# check if GPU is enabled or not
if [[ $TEST_WITH_GPU -eq 1 ]]; then
echo "======== Testing with ODLA TensorRT ========"
python3 $curr_dir/../../invoke_halo.py --model $model_file \
for i in 1 2 4 8 16 32 64
do
python3 $curr_dir/../../invoke_halo.py --batch_size $i --model $model_file \
--label-file $curr_dir/../1000_labels.txt --image-dir \
$image_dir --odla tensorrt | tee $1
done
# RUN: FileCheck --input-file %t.1 %s
fi

29 changes: 29 additions & 0 deletions models/vision/classification/densenet/run_densenet_tensorrt.sh
@@ -0,0 +1,29 @@
#!/bin/bash
# RUN: %s
model_name="densenet"
docker_model_file="/models/vision/classification/$model_name"
model_file="$docker_model_file/$model_name""121.onnx"
image_dir="/models/vision/test_images"
curr_dir=`dirname $0`

# # Download the model if it does not exist
# if [ ! -e $model_file ]; then
# $curr_dir/../get_cls_model_from_pytorch.py $model_name $model_file
# fi

# Download sample images if they do not exist
# $curr_dir/../../get_images.sh $image_dir
for i in 1 2 4 8 16 32 64
do
echo "=======Testing densenet with TensorRT======="
python3 $curr_dir/../../onnx2tensorrt.py --model $model_file --label-file $curr_dir/../1000_labels.txt --batch_size $i
done

# if [[ $TEST_WITH_GPU -eq 1 ]]; then
# echo "======== Testing with ODLA TensorRT ========"
# python3 $curr_dir/../../invoke_halo.py --model $model_file --image-dir $image_dir --odla tensorrt
# fi

# # Using HALO to compile and run inference with ODLA XNNPACK
# echo "======== Testing with ODLA DNNL ========"
# python3 $curr_dir/../../invoke_halo.py --model $model_file --label-file $curr_dir/../1000_labels.txt --image-dir $image_dir --odla dnnl
35 changes: 35 additions & 0 deletions models/vision/classification/efficientnet/run_efficientnet.sh
@@ -0,0 +1,35 @@
#!/bin/bash
# RUN: %s %t.1 %t.2

model_name="efficientnet"
model_file="$MODELS_ROOT/vision/classification/efficientnet/$model_name-lite4-11.onnx"
image_dir="$MODELS_ROOT/vision/test_images"
if [[ $# != 0 ]]; then
export TEST_TEMP_DIR=`dirname $1`
fi

curr_dir=`dirname $0`

# check if GPU is enabled or not
if [[ $TEST_WITH_GPU -eq 1 ]]; then
echo "======== Testing with ODLA TensorRT ========"
for i in 1
do
python3 $curr_dir/../../invoke_halo.py --batch_size $i --model $model_file \
--label-file $curr_dir/../1000_labels.txt --image-dir $image_dir \
--odla tensorrt --convert-layout-to=nhwc | tee $1
done
# RUN: FileCheck --input-file %t.1 %s
fi

# Using HALO to compile and run inference with ODLA XNNPACK
echo "======== Testing with ODLA XNNPACK (NHWC) ========"
python3 $curr_dir/../../invoke_halo.py --model $model_file \
--label-file $curr_dir/../1000_labels.txt --image-dir $image_dir \
--odla xnnpack --convert-layout-to=nhwc | tee $2
# RUN: FileCheck --input-file %t.2 %s

# CHECK: dog.jpg ==> "Samoyed, Samoyede",
# CHECK-NEXT: food.jpg ==> "ice cream, icecream",
# CHECK-NEXT: plane.jpg ==> "liner, ocean liner",
# CHECK-NEXT: sport.jpg ==> "ski",