Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into Cjian/drop_nuget
Browse files Browse the repository at this point in the history
# Conflicts:
#	tools/ci_build/github/azure-pipelines/templates/win-ci.yml
  • Loading branch information
jchen351 committed Nov 28, 2023
2 parents f08fd74 + 0b7048e commit b221a06
Show file tree
Hide file tree
Showing 33 changed files with 1,178 additions and 210 deletions.
2 changes: 2 additions & 0 deletions cmake/winml.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,8 @@ onnxruntime_add_static_library(winml_lib_api
${winml_lib_api_dir}/impl/TensorKindFrom.h
${winml_lib_api_dir}/impl/TensorMemoryBufferReference.h
${winml_lib_api_dir}/NumericData.cpp
${winml_lib_api_dir}/HardwareCoreEnumerator.cpp
${winml_lib_api_dir}/HardwareCoreEnumerator.h
${winml_lib_api_dir}/ImageFeatureDescriptor.cpp
${winml_lib_api_dir}/ImageFeatureDescriptor.h
${winml_lib_api_dir}/ImageFeatureValue.cpp
Expand Down
5 changes: 3 additions & 2 deletions js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ export const createConv2DMatMulProgramInfo =

LOG_DEBUG('verbose', () => `[conv2d_mm_webgpu] dispatch = ${dispatch}`);

const innerElementSize = isVec4 ? (isChannelsLast && inChannels % 4 !== 0 ? 3 : 4) : elementsPerThread[0];
const innerElementSize = isVec4 ? (isChannelsLast && inChannels % 4 !== 0 ? 3 : 4) : 1;

const tileAOuter = workGroupSize[1] * elementsPerThread[1];
const tileBOuter = workGroupSize[0] * elementsPerThread[0];
Expand All @@ -197,7 +197,8 @@ export const createConv2DMatMulProgramInfo =
const components = isVec4 ? 4 : 1;
const programUniforms: ProgramUniform[] =
[{type: 'int32', data: dimAOuter}, {type: 'int32', data: dimBOuter}, {type: 'int32', data: dimInner}];
const x = inputVariable('x', inputs[0].dataType, inputs[0].dims.length, components);
const x =
inputVariable('x', inputs[0].dataType, inputs[0].dims.length, innerElementSize === 3 ? 1 : innerElementSize);
const w = inputVariable('w', inputs[1].dataType, inputs[1].dims.length, components);
const inputVariables = [x, w];

Expand Down
32 changes: 31 additions & 1 deletion js/web/test/data/ops/conv.jsonc
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@
]
},
{
"name": "conv with bias addition C",
"name": "conv with bias addition C - NHWC",
"operator": "Conv",
"inputShapeDefinitions": "rankOnly",
"opset": { "domain": "", "version": 17 },
Expand Down Expand Up @@ -158,6 +158,36 @@
"type": "float32"
}
]
},
{
"name": "inChannel = 3, outChannel = 4",
"inputs": [
{
"data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 10],
"dims": [1, 3, 3, 3],
"type": "float32"
},
{
"data": [
1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 1, 2, 3, 4, 5, 6, 7, 8
],
"dims": [4, 3, 2, 2],
"type": "float32"
},
{
"data": [5, 6, 7, 8],
"dims": [4],
"type": "float32"
}
],
"outputs": [
{
"data": [360, 334, 271, 323, 909, 963, 1024, 1028, 683, 655, 576, 650, 473, 508, 570, 677],
"dims": [1, 4, 2, 2],
"type": "float32"
}
]
}
]
},
Expand Down
154 changes: 69 additions & 85 deletions onnxruntime/core/providers/cpu/nn/tfidfvectorizer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -141,14 +141,11 @@ struct TfIdfVectorizer::Impl {
Impl(const Impl&) = delete;
Impl& operator=(const Impl&) = delete;

void IncrementCount(size_t ngram_id, size_t row_num,
std::vector<uint32_t>& frequencies) const {
inline size_t OutputIdToIncrement(size_t ngram_id) const {
assert(ngram_id != 0);
--ngram_id;
assert(ngram_id < ngram_indexes_.size());
size_t output_idx = row_num * output_size_ + SafeInt<size_t>(ngram_indexes_[ngram_id]);
assert(output_idx < frequencies.size());
++frequencies[output_idx];
return SafeInt<size_t>(ngram_indexes_[ngram_id]);
}
};

Expand Down Expand Up @@ -252,77 +249,17 @@ TfIdfVectorizer::TfIdfVectorizer(const OpKernelInfo& info) : OpKernel(info), imp

TfIdfVectorizer::~TfIdfVectorizer() = default;

// Applies the configured weighting criteria (TF / IDF / TFIDF) to the raw
// n-gram counts in `frequences` and writes the weighted values to output 0.
// B == 0 signals a 1-D input, producing a single row of output_size_ values;
// otherwise the output shape is {B, output_size_}.
// NOTE(review): `frequences` is assumed to hold the counts for all rows in
// row-major order — the assert below checks its size against the output shape.
void TfIdfVectorizer::OutputResult(OpKernelContext* ctx, size_t B, const std::vector<uint32_t>& frequences) const {
  const Impl& impl = *impl_;
  std::vector<int64_t> output_dims;
  if (B == 0) {
    // 1-D case: output shape is {output_size_}.
    output_dims.push_back(impl.output_size_);
    B = 1;  // For use in the loops below
  } else {
    // 2-D case: output shape is {B, output_size_}.
    output_dims.push_back(B);
    output_dims.push_back(impl.output_size_);
  }

  const auto row_size = impl.output_size_;

  TensorShape output_shape(output_dims);
  assert(frequences.size() == static_cast<size_t>(output_shape.Size()));

  auto Y = ctx->Output(0, output_shape);
  auto output_data = Y->MutableData<float>();
  const auto& w = impl.weights_;
  switch (impl.weighting_criteria_) {
    case kTF: {
      // Plain term frequency: emit the raw counts as floats.
      for (auto f : frequences) {
        *output_data++ = static_cast<float>(f);
      }
    } break;
    case kIDF: {
      if (!w.empty()) {
        // Weighted IDF: emit the per-index weight where the n-gram occurred, else 0.
        const auto* freqs = frequences.data();
        for (size_t batch = 0; batch < B; ++batch) {
          for (size_t i = 0; i < row_size; ++i) {
            *output_data++ = (*freqs++ > 0) ? w[i] : 0;
          }
        }
      } else {
        // Unweighted IDF: binary occurrence indicator.
        for (auto f : frequences) {
          *output_data++ = (f > 0) ? 1.0f : 0;
        }
      }
    } break;
    case kTFIDF: {
      if (!w.empty()) {
        // Weighted TFIDF: count scaled by the per-index weight.
        const auto* freqs = frequences.data();
        for (size_t batch = 0; batch < B; ++batch) {
          for (size_t i = 0; i < row_size; ++i) {
            *output_data++ = *freqs++ * w[i];
          }
        }
      } else {
        // No weights supplied: degenerates to plain TF.
        for (auto f : frequences) {
          *output_data++ = static_cast<float>(f);
        }
      }
    } break;
    case kNone:  // fall-through
    default:
      // kNone is not expected at compute time — presumably rejected earlier; TODO confirm.
      assert(false);
  }
}

void TfIdfVectorizer::ComputeImpl(OpKernelContext* ctx, ptrdiff_t row_num, size_t row_size,
std::vector<uint32_t>& frequencies) const {
auto X = ctx->Input<Tensor>(0);
const auto elem_size = X->DataType()->Size();

const void* const row_begin = AdvanceElementPtr(X->DataRaw(), row_num * row_size, elem_size);
void TfIdfVectorizer::ComputeImpl(const void* x_data_raw, size_t elem_size, ptrdiff_t row_num, size_t row_size,
bool is_input_string, gsl::span<float> output_data,
std::function<void(size_t, gsl::span<float>&)>& fn_weight) const {
const void* const row_begin = AdvanceElementPtr(x_data_raw, row_num * row_size, elem_size);
const void* const row_end = AdvanceElementPtr(row_begin, row_size, elem_size);

const auto& impl = *impl_;
const auto max_gram_length = impl.max_gram_length_;
const auto max_skip_distance = impl.max_skip_count_ + 1; // Convert to distance
auto start_ngram_size = impl.min_gram_length_;
size_t output_idx;

for (auto skip_distance = 1; skip_distance <= max_skip_distance; ++skip_distance) {
auto ngram_start = row_begin;
Expand All @@ -336,7 +273,7 @@ void TfIdfVectorizer::ComputeImpl(OpKernelContext* ctx, ptrdiff_t row_num, size_
}

auto ngram_item = ngram_start;
if (X->IsDataTypeString()) {
if (is_input_string) {
const std::string* str_item = reinterpret_cast<const std::string*>(ngram_item);
const StrMap* str_map = &impl.str_map_;
for (auto ngram_size = 1;
Expand All @@ -349,7 +286,8 @@ void TfIdfVectorizer::ComputeImpl(OpKernelContext* ctx, ptrdiff_t row_num, size_
break;
}
if (ngram_size >= start_ngram_size && hit->second->id_ != 0) {
impl.IncrementCount(hit->second->id_, row_num, frequencies);
output_idx = impl.OutputIdToIncrement(hit->second->id_);
fn_weight(output_idx, output_data);
}
str_map = &hit->second->leafs_;
}
Expand All @@ -360,13 +298,14 @@ void TfIdfVectorizer::ComputeImpl(OpKernelContext* ctx, ptrdiff_t row_num, size_
ngram_size <= max_gram_length &&
ngram_item < ngram_row_end;
++ngram_size, ngram_item = AdvanceElementPtr(ngram_item, skip_distance, elem_size)) {
int64_t val = (X->IsDataType<int32_t>()) ? int64_t{*reinterpret_cast<const int32_t*>(ngram_item)} : *reinterpret_cast<const int64_t*>(ngram_item);
int64_t val = (elem_size == 4) ? int64_t{*reinterpret_cast<const int32_t*>(ngram_item)} : *reinterpret_cast<const int64_t*>(ngram_item);
auto hit = int_map->find(val);
if (hit == int_map->end()) {
break;
}
if (ngram_size >= start_ngram_size && hit->second->id_ != 0) {
impl.IncrementCount(hit->second->id_, row_num, frequencies);
output_idx = impl.OutputIdToIncrement(hit->second->id_);
fn_weight(output_idx, output_data);
}
int_map = &hit->second->leafs_;
}
Expand Down Expand Up @@ -412,31 +351,76 @@ Status TfIdfVectorizer::Compute(OpKernelContext* ctx) const {
}

assert((num_rows * C) == total_items);
// Frequency holder: allocates num_rows * output_size_ counters,
// all initialized to zero.
std::vector<uint32_t> frequencies;
frequencies.resize(num_rows * impl_->output_size_, 0);
const Impl& impl = *impl_;
TensorShapeVector output_dims;
if (B == 0) {
output_dims.push_back(impl.output_size_);
B = 1; // For use in the loops below
} else {
output_dims.push_back(B);
output_dims.push_back(impl.output_size_);
}
TensorShape output_shape(output_dims);

auto Y = ctx->Output(0, output_shape);
auto output_data = Y->MutableData<float>();
const bool is_input_string = X->IsDataTypeString();

if (total_items == 0 ||
(X->IsDataTypeString() && impl_->str_map_.empty()) ||
(is_input_string && impl_->str_map_.empty()) ||
((X->IsDataType<int32_t>() || X->IsDataType<int64_t>()) && impl_->int64_map_.empty())) {
// TfidfVectorizer may receive an empty input when it follows a Tokenizer
// (for example for a string containing only stopwords).
// TfidfVectorizer returns a zero tensor of shape
// {b_dim, output_size} when b_dim is the number of received observations
// and output_size is the maximum value in the ngram_indexes attribute plus 1.
OutputResult(ctx, B, frequencies);
memset(output_data, 0, static_cast<size_t>(output_shape.Size() * sizeof(float)));
return Status::OK();
}

std::function<void(ptrdiff_t)> fn = [this, ctx, C, &frequencies](ptrdiff_t row_num) {
ComputeImpl(ctx, row_num, C, frequencies);
};
auto x_data_raw = ctx->Input<Tensor>(0)->DataRaw();
const auto elem_size = X->DataType()->Size();
int32_t num_batches = std::min<int32_t>(concurrency::ThreadPool::DegreeOfParallelism(ctx->GetOperatorThreadPool()) * 2, num_rows);

concurrency::ThreadPool::TryBatchParallelFor(ctx->GetOperatorThreadPool(), num_rows, std::move(fn), 0);
const auto& w = impl.weights_;
std::function<void(size_t, gsl::span<float>&)> fn_weight;

OutputResult(ctx, B, frequencies);
switch (impl.weighting_criteria_) {
case kTF:
fn_weight = [](size_t i, gsl::span<float>& out) { out[i] += 1.0f; };
break;
case kIDF:
if (!w.empty()) {
fn_weight = [&w](size_t i, gsl::span<float>& out) { out[i] = w[i]; };
} else {
fn_weight = [](size_t i, gsl::span<float>& out) { out[i] = 1.0f; };
}
break;
case kTFIDF:
if (!w.empty()) {
fn_weight = [&w](size_t i, gsl::span<float>& out) { out[i] += w[i]; };
} else {
fn_weight = [](size_t i, gsl::span<float>& out) { out[i] += 1.0f; };
}
break;
case kNone: // fall-through
default:
assert(false);
}

std::function<void(ptrdiff_t)> fn = [this, C, output_data, x_data_raw, elem_size,
is_input_string, num_batches, num_rows, &fn_weight](ptrdiff_t batch_num) {
// Per-worker frequency scratch buffer of output_size_ entries, zero-initialized.
auto work = concurrency::ThreadPool::PartitionWork(batch_num, num_batches, static_cast<size_t>(num_rows));
std::vector<uint32_t> frequencies(this->impl_->output_size_);
for (auto row_num = work.start; row_num < work.end; ++row_num) {
auto out = gsl::span<float>(output_data + row_num * this->impl_->output_size_, this->impl_->output_size_);
std::fill(out.begin(), out.end(), 0.0f);
ComputeImpl(x_data_raw, elem_size, row_num, C, is_input_string, out, fn_weight);
}
};

concurrency::ThreadPool::TrySimpleParallelFor(ctx->GetOperatorThreadPool(), num_batches, std::move(fn));
return Status::OK();
}

Expand Down
7 changes: 2 additions & 5 deletions onnxruntime/core/providers/cpu/nn/tfidfvectorizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,8 @@ class TfIdfVectorizer final : public OpKernel {
Status Compute(OpKernelContext* ctx) const override;

private:
void ComputeImpl(OpKernelContext* ctx, ptrdiff_t row_num, size_t row_size,
std::vector<uint32_t>& frequencies) const;

// Apply weighing criteria and output
void OutputResult(OpKernelContext* ctx, size_t b_dim, const std::vector<uint32_t>& frequences) const;
void ComputeImpl(const void* x_data_raw, size_t elem_size, ptrdiff_t row_num, size_t row_size, bool is_input_string,
gsl::span<float> output_data, std::function<void(size_t, gsl::span<float>&)>& fn_weight) const;

struct Impl;
std::unique_ptr<Impl> impl_;
Expand Down
2 changes: 0 additions & 2 deletions orttraining/orttraining/test/gradient/optimizer_ops_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1542,7 +1542,6 @@ TEST(OptimizerTest, LambOptimizerTestLarge) {
std::vector<float> m(size);
std::vector<float> v(size);

std::random_device random_device;
std::mt19937 random_engine(0);
std::uniform_real_distribution<float> dist(0.1f, 1.0f);
for (int i = 0; i < size; ++i) {
Expand Down Expand Up @@ -1581,7 +1580,6 @@ TEST(OptimizerTest, LambOptimizerTestLarge) {

TEST(OptimizerTest, LambOptimizerMultiTensorRatio) {
constexpr int group_count = 127;
std::random_device random_device;
std::mt19937 random_engine(0);
std::uniform_real_distribution<float> dist(0.1f, 1.0f);
std::uniform_int_distribution<int64_t> dist_int(1, 1228);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,6 @@ void TestMultiTensorReduce(
test.SetDeterminism(use_determinism);

// Set up random number generator.
std::random_device random_device;
std::mt19937 random_engine(0);
std::uniform_real_distribution<float> dist(min, max);
std::uniform_int_distribution<int64_t> dist_int(min_tensor_size, max_tensor_size);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,14 @@ parameters:
type: string
default: '--use_azure'

- name: CudaVersion
displayName: CUDA version
type: string
default: '11.8'
values:
- 11.8
- 12.2

resources:
repositories:
- repository: onnxruntime-inference-examples # The name used to reference this repository in the checkout step
Expand Down Expand Up @@ -146,15 +154,21 @@ stages:
timeoutInMinutes: 120
pool: 'Onnxruntime-Linux-GPU'
variables:
CUDA_VERSION: '11.8'
- name: CUDA_VERSION_MAJOR
${{ if eq(parameters.CudaVersion, '11.8') }}:
value: '11'
${{ if eq(parameters.CudaVersion, '12.2') }}:
value: '12'
- name: CUDA_VERSION
value: ${{ parameters.CudaVersion }}
steps:
- template: templates/set-version-number-variables-step.yml
- template: templates/get-docker-image-steps.yml
parameters:
Dockerfile: tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile
Context: tools/ci_build/github/linux/docker/inference/x64/default/gpu
DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )"
Repository: onnxruntimecuda11centosbuild
Repository: onnxruntimecuda$(CUDA_VERSION_MAJOR)build

- script: $(Build.SourcesDirectory)/tools/ci_build/github/linux/build_cuda_c_api_package.sh
workingDirectory: $(Build.SourcesDirectory)
Expand Down
Loading

0 comments on commit b221a06

Please sign in to comment.