From 078f20b8fbcbb802aa5ffcd2134519721a61b88c Mon Sep 17 00:00:00 2001 From: Kevin Chen <45886021+kevinch-nv@users.noreply.github.com> Date: Mon, 4 Oct 2021 15:05:15 -0700 Subject: [PATCH] TensorRT 8.2 EA ONNX Parser Release (#754) Signed-off-by: Kevin Chen --- CMakeLists.txt | 43 +-- ConditionalHelpers.cpp | 330 +++++++++++++++++++ ConditionalHelpers.hpp | 51 +++ ImporterContext.hpp | 26 +- ModelImporter.cpp | 97 +++++- ModelImporter.hpp | 8 - README.md | 14 +- ShapedWeights.cpp | 1 + Status.hpp | 8 + builtin_op_importers.cpp | 665 +++++++++++++++++++++++--------------- docs/Changelog.md | 17 + docs/contributing.md | 1 - docs/operators.md | 26 +- onnx2trt.hpp | 5 +- onnx2trt_utils.cpp | 315 ++++++++---------- onnx2trt_utils.hpp | 20 +- onnx_tensorrt/__init__.py | 2 +- onnx_utils.hpp | 28 +- 18 files changed, 1138 insertions(+), 519 deletions(-) create mode 100644 ConditionalHelpers.cpp create mode 100644 ConditionalHelpers.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ba7f0308..c3692aee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,8 +4,8 @@ cmake_minimum_required(VERSION 3.13) project(onnx2trt LANGUAGES CXX C) set(ONNX2TRT_ROOT ${PROJECT_SOURCE_DIR}) -# Set C++11 as standard for the whole project -set(CMAKE_CXX_STANDARD 11) +# Set C++14 as standard for the whole project +set(CMAKE_CXX_STANDARD 14) # Enable compiler warnings if (CMAKE_COMPILER_IS_GNUCC) @@ -20,12 +20,16 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(PARSER_LINKER_SCRIPT ${ONNX2TRT_ROOT}/libnvonnxparser.version) +# Find length of source directory used to pad filename in Status.hpp +string(LENGTH "${CMAKE_SOURCE_DIR}/" SOURCE_LENGTH) +add_definitions("-DSOURCE_LENGTH=${SOURCE_LENGTH}") + #-------------------------------------------------- # Version information #-------------------------------------------------- set(ONNX2TRT_MAJOR 8) -set(ONNX2TRT_MINOR 0) -set(ONNX2TRT_PATCH 1) +set(ONNX2TRT_MINOR 2) +set(ONNX2TRT_PATCH 0) set(ONNX2TRT_VERSION "${ONNX2TRT_MAJOR}.${ONNX2TRT_MINOR}.${ONNX2TRT_PATCH}" CACHE STRING "ONNX2TRT version") #-------------------------------------------------- @@ -43,26 +47,13 @@ set(IMPORTER_SOURCES LoopHelpers.cpp RNNHelpers.cpp OnnxAttrs.cpp + ConditionalHelpers.cpp ) -# Do not build ONNXIFI by default. 
-if(BUILD_ONNXIFI) - if (NOT CUDA_TOOLKIT_ROOT_DIR) - set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda) - endif() - message(debug "CUDA_TOOLKIT_ROOT_DIR: ${CUDA_TOOLKIT_ROOT_DIR}") - find_path(CUDA_INCLUDE_DIR cuda_runtime.h - HINTS ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES include - ) +if (BUILD_ONNXIFI) set(ONNXIFI_SOURCES onnx_trt_backend.cpp) endif() -# Build with negative indices support for Gather: -if (DEFINED SUPPORT_NEGATIVE_GATHER) - add_definitions("-DSUPPORT_NEGATIVE_GATHER=1") -endif() - # Build executables if BUILD_LIBRARY_ONLY flag is not set if (NOT DEFINED BUILD_LIBRARY_ONLY) set(EXECUTABLE_SOURCES @@ -89,6 +80,16 @@ if(NOT TARGET onnx_proto) add_subdirectory(third_party/onnx EXCLUDE_FROM_ALL) endif() +# CUDA +if (NOT CUDA_TOOLKIT_ROOT_DIR) + set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda) +endif() +find_path(CUDA_INCLUDE_DIR cuda_runtime.h + HINTS ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES include +) +MESSAGE(STATUS "Found CUDA headers at ${CUDA_INCLUDE_DIR}") + # TensorRT find_path(TENSORRT_INCLUDE_DIR NvInfer.h HINTS ${TENSORRT_ROOT} ${CUDA_TOOLKIT_ROOT_DIR} @@ -112,7 +113,7 @@ endif() # Importer library # -------------------------------- add_library(nvonnxparser SHARED ${IMPORTER_SOURCES}) -target_include_directories(nvonnxparser PUBLIC ${ONNX_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR}) +target_include_directories(nvonnxparser PUBLIC ${ONNX_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR} ${CUDA_INCLUDE_DIR}) target_link_libraries(nvonnxparser PUBLIC onnx_proto ${PROTOBUF_LIBRARY} ${TENSORRT_LIBRARY}) set_target_properties(nvonnxparser PROPERTIES VERSION ${ONNX2TRT_VERSION} @@ -121,7 +122,7 @@ set_target_properties(nvonnxparser PROPERTIES LINK_FLAGS "-Wl,--version-script=${PARSER_LINKER_SCRIPT}" ) add_library(nvonnxparser_static STATIC ${IMPORTER_SOURCES}) -target_include_directories(nvonnxparser_static PUBLIC ${ONNX_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR}) +target_include_directories(nvonnxparser_static PUBLIC ${ONNX_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR} ${CUDA_INCLUDE_DIR}) target_link_libraries(nvonnxparser_static PUBLIC onnx_proto ${PROTOBUF_LIBRARY} ${TENSORRT_LIBRARY}) # -------------------------------- diff --git a/ConditionalHelpers.cpp b/ConditionalHelpers.cpp new file mode 100644 index 00000000..e0f13f8e --- /dev/null +++ b/ConditionalHelpers.cpp @@ -0,0 +1,330 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "ConditionalHelpers.hpp" +#include "ModelImporter.hpp" +#include "onnx2trt_utils.hpp" +#include "toposort.hpp" + +namespace onnx2trt +{ + +using NodeName = std::string; +using LayerName = std::string; +using InputIndex = int32_t; + +// A SubgraphPortsMap maps either the inputs or outputs ports of each node in an ONNX graph. +using SubgraphPortsMap = std::unordered_map>; + +// An InputsMap tracks which IIfConditionalInputLayer we've added to a layer's inputs, +// so that we can reuse them if needed. +using InputsMap = std::unordered_map; + +// Search for a network Layer name in a SubgraphPortsMap using partial (prefix) name matching. +// ONNX nodes are matched to network layers using prefix-matching because an ONNX node may have +// several network layers associcated with it. +SubgraphPortsMap::const_iterator findLayer(const SubgraphPortsMap& inputs, const std::string layerName) +{ + return std::find_if(inputs.begin(), inputs.end(), [&](const auto& item) { + const auto& key = item.first; + return layerName.compare(0, key.size(), key) == 0; + }); +} + +// Add an ConditionalInputLayer between `layer` and its inputs. +// I.e. 
input[inIdx] -> layer ==> input[inIdx] -> ConditionalInputLayer -> layer. +Status addConditionalInputLayer(IImporterContext* ctx, nvinfer1::IIfConditional* conditional, InputsMap& inputsMap, + nvinfer1::ILayer& layer, int32_t inIdx) +{ + auto input = layer.getInput(inIdx); + if (input == nullptr) + { + // Phantom input (an input that is really constant weights). + return Status::success(); + } + + if (layer.getType() == nvinfer1::LayerType::kCONDITIONAL_OUTPUT) + { + return Status::success(); + } + + auto const name = input->getName(); + auto it = inputsMap.find(name); + nvinfer1::IIfConditionalInputLayer* inputLayer = nullptr; + if (it == inputsMap.end()) + { + inputLayer = conditional->addInput(*input); + inputsMap[name] = inputLayer; + const std::string inputLayerName(name); + ctx->registerLayer(inputLayer, inputLayerName + "_InputLayer"); + ctx->registerTensor(TensorOrWeights{inputLayer->getOutput(0)}, inputLayerName + "_InputLayer_output"); + } + else + { + // An InputLayer may in the inputsMap if it has several consumers. + inputLayer = it->second; + } + layer.setInput(inIdx, *(inputLayer->getOutput(0))); + return Status::success(); +}; + +// Take a snapshot of the network before and after parsing the subgraph and return a list +// of newly added network layers. +Status importSubgraph( + IImporterContext* ctx, const ::ONNX_NAMESPACE::GraphProto& subgraph, std::vector& newLayers) +{ + auto net = ctx->network(); + int32_t beforeSubgraph = net->getNbLayers(); + CHECK(onnx2trt::parseGraph(ctx, subgraph)); + + for (int32_t i = beforeSubgraph; i < net->getNbLayers(); i++) + { + newLayers.push_back(net->getLayer(i)); + } + + return Status::success(); +} + +// Add an IConditionalInputLayer to `layer`'s inputs, if they don't already exist. +Status addConditionalInputIfNeeded(IImporterContext* ctx, nvinfer1::IIfConditional* conditional, InputsMap& inputsMap, + nvinfer1::ILayer& layer, SubgraphPortsMap subgraphInputsMap) +{ + // Return all of the layer's inputs that are external to the subgraph that + // that the layer belongs to. + auto getLayerExternalInputs = [&](std::string const& layerName) { + std::set inIndices; + auto iter = findLayer(subgraphInputsMap, layerName); + if (iter != subgraphInputsMap.end()) + { + const auto& indicesSet = iter->second; + inIndices.insert(indicesSet.begin(), indicesSet.end()); + } + + return inIndices; + }; + + const auto inIndices = getLayerExternalInputs(layer.getName()); + for (auto inIdx : inIndices) + { + LOG_VERBOSE("Adding Input layer for " << layer.getName()); + addConditionalInputLayer(ctx, conditional, inputsMap, layer, inIdx); + } + return Status::success(); +} + +// Add IConditionalInputLayers to `layer`'s inputs. +Status addIfInputLayers(IImporterContext* ctx, nvinfer1::IIfConditional* conditional, InputsMap& inputsMap, + const ::ONNX_NAMESPACE::GraphProto& subgraph, const std::vector& newLayers) +{ + // Find all of the tensors entering the subgraph. + // The node-names are from the ONNX context. + using NodeName = std::string; + using InputIndex = int32_t; + std::unordered_map> subgraphInputsMap; + getSubgraphInputs(subgraph, subgraphInputsMap); + + // Add a ConditionalInputLayer in front of each input that is external to the subgraph. + for (const auto& layer : newLayers) + { + addConditionalInputIfNeeded(ctx, conditional, inputsMap, *layer, subgraphInputsMap); + } + + return Status::success(); +} + +// Add an IConditionalOutputLayer to `layer`'s outputs. 
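+// For each pair of corresponding then/else subgraph outputs, an IIfConditionalOutputLayer is added
+// (two inputs, one output per pair) and its output tensor is appended to `graphOutputs`.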
+Status addIfOutputLayers(IImporterContext* ctx, nvinfer1::IIfConditional* conditional, + const ::ONNX_NAMESPACE::GraphProto& thenGraph, const std::vector& thenLayers, + const ::ONNX_NAMESPACE::GraphProto& elseGraph, const std::vector& elseLayers, + std::vector& graphOutputs) +{ + // Reported outputs are outputs that the ONNX model reports as subgraph outputs. This list is + // not sufficient because it may produce names that are not fully compatible with TensorRT's naming. + // We use this list to help find the subgraph (SG) output tensors. + auto getReportedOutputs + = [&ctx](const ::ONNX_NAMESPACE::GraphProto& body, std::vector& reportedOutputs) { + // Assuming that the subgraph was imported already, we can iterate on its output tensors. + const auto nbOutputs = body.output_size(); + for (auto i = 0; i < nbOutputs; i++) + { + reportedOutputs.emplace_back(body.output(i).name()); + } + }; + + using NodeName = std::string; + std::unordered_map> thenOutputs; + std::unordered_map> elseOutputs; + + std::vector thenReportedOutputs; + getReportedOutputs(thenGraph, thenReportedOutputs); + getSubgraphOutputs(thenGraph, thenOutputs, thenReportedOutputs); + std::vector elseReportedOutputs; + getReportedOutputs(thenGraph, elseReportedOutputs); + getSubgraphOutputs(elseGraph, elseOutputs, elseReportedOutputs); + + // Retrieve the output tensors of a subgraph (tensors exiting the subgraph). + auto getSubgraphOutputTensors + = [](IImporterContext* ctx, std::vector& sgOutputs, SubgraphPortsMap& subgraphOutputs, + const ::ONNX_NAMESPACE::GraphProto& subgraph, std::vector subgraphLayers) { + for (const auto& layer : subgraphLayers) + { + const auto layerName = layer->getName(); + auto iter = findLayer(subgraphOutputs, layerName); + if (iter != subgraphOutputs.end()) + { + sgOutputs.push_back(layer->getOutput(0)); + } + } + + if (sgOutputs.empty()) + { + // No new layers, so we can't deduce the outputs and have to use what ONNX tells us. + const int32_t nbOutputs = subgraph.output_size(); + for (int32_t outIdx = 0; outIdx < nbOutputs; outIdx++) + { + const auto thenName = subgraph.output(outIdx).name(); + auto* thenTensor = &convertToTensor(ctx->tensors().at(thenName), ctx); + sgOutputs.push_back(thenTensor); + } + } + }; + + std::vector thenOutputTensors; + getSubgraphOutputTensors(ctx, thenOutputTensors, thenOutputs, thenGraph, thenLayers); + + std::vector elseSGOutputTensors; + getSubgraphOutputTensors(ctx, elseSGOutputTensors, elseOutputs, elseGraph, elseLayers); + + ASSERT(thenOutputTensors.size() == elseSGOutputTensors.size() + && "The then/else branches of an If operator must have the same number of outputs.", + ErrorCode::kINVALID_NODE); + + // Add an ConditionalOutputLayer with one output and two inputs + // (one from the thenGraph and another from the elseGraph). + for (size_t i = 0; i < elseSGOutputTensors.size(); i++) + { + auto* outputLayer = conditional->addOutput(*thenOutputTensors[i], *elseSGOutputTensors[i]); + ctx->registerLayer(outputLayer, std::string(conditional->getName()) + "_OutputLayer"); + graphOutputs.emplace_back(outputLayer->getOutput(0)); + } + return Status::success(); +} + +// Given a subgraph, find all of its external inputs/outputs (tensors entering/exiting the subgraph). 
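+// Tensors entering the subgraph are node inputs that no node inside the subgraph produces; tensors
+// exiting the subgraph are node outputs that no node inside the subgraph consumes. `extractOutputs`
+// selects which of the two sets is collected, and `reportedOutputs` (when provided) keeps only the
+// exiting tensors that the ONNX subgraph reports as outputs (matched by name prefix).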
+Status getSubgraphTensors(const ::ONNX_NAMESPACE::GraphProto& graph, + std::unordered_map>& externalOutputs, bool extractOutputs, + const std::vector* reportedOutputs = nullptr) +{ + std::vector topoOrder; + ASSERT(toposort(graph.node(), &topoOrder) && "Failed to sort the model topologically.", ErrorCode::kINVALID_GRAPH); + using NodeName = std::string; + using TensorName = std::string; + using PortIndex = int32_t; + using Port = std::pair; + std::unordered_set outputTensors; + std::unordered_set inputTensors; + + // To determine which tensors are entering or exiting the given graph, we first collect the sets of all input and + // output tensors. Then we categorize the tensors according to this logic: + // Entering tensors := {inputs} - {outputs} + // Exiting tensors := {outputs} - {inputs} + + // Collect all input and output tensors belonging to nodes in the graph. + for (const auto& nodeIndex : topoOrder) + { + const auto& node = graph.node(nodeIndex); + for (const auto& outputName : node.output()) + { + outputTensors.insert(outputName); + } + for (const auto& inputName : node.input()) + { + inputTensors.insert(inputName); + } + } + + using NodeProto = const ::ONNX_NAMESPACE::NodeProto; + auto getOutputs = [](NodeProto& node) { return node.output(); }; + auto getInputs = [](NodeProto& node) { return node.input(); }; + + // Retrieve the list of tensors either exiting or entering the subgraph. + std::unordered_map> externalPortsMap; + auto filterTensors = [&](std::unordered_set tensors, auto nodeAccessor) { + for (const auto& nodeIndex : topoOrder) + { + const auto& node = graph.node(nodeIndex); + const auto& nodeName = getNodeName(node); + PortIndex i = 0; + + for (const auto& tensorName : nodeAccessor(node)) + { + if (tensorName.empty()) + { + continue; + } + if (tensors.count(tensorName) == 0) + { + auto prefixFound = false; + if (reportedOutputs) + { + // reportedOutputs are the names of the outputs as reported by the ONNX parser and help + // us further filter the output tensors. + // Exiting tensors := {outputs} - {inputs} - {unreported tensors} + // An example: a Split node is internal to a subgraph and has 4 outputs, but only two are + // connected to the rest of the graph. To prevent mistaking the 2 unused outputs as subgraph + // outputs, we look for them in reportedOutputs which leads us to ignore the 2 tensors. + const auto iter = std::find_if( + reportedOutputs->begin(), reportedOutputs->end(), [&](const auto& outputName) { + // Prefix name matching. + return tensorName.compare(0, outputName.size(), outputName) == 0; + }); + prefixFound = iter != reportedOutputs->end(); + } + if (!reportedOutputs || prefixFound) + { + externalPortsMap[tensorName].push_back(std::make_pair(nodeName, i)); + } + } + i++; + } + } + }; + + if (extractOutputs) + { + filterTensors(inputTensors, getOutputs); + } + else + { + filterTensors(outputTensors, getInputs); + } + + // Create the user's view of the external inputs, which uses the node-name as the key for + // looking up input/output port index. 
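+    // Flatten externalPortsMap (tensor-name -> list of (node-name, port)) into externalOutputs (node-name -> set of port indices).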
+ for (auto const& input : externalPortsMap) + { + for (const Port& inPort : input.second) + { + auto const nodeName = inPort.first; + auto const portIndex = inPort.second; + externalOutputs[nodeName].insert(portIndex); + } + } + return Status::success(); +} + +Status getSubgraphOutputs(const ::ONNX_NAMESPACE::GraphProto& graph, + std::unordered_map>& externalOutputs, + const std::vector& reportedOutputs) +{ + return getSubgraphTensors(graph, externalOutputs, true, &reportedOutputs); +} + +Status getSubgraphInputs( + const ::ONNX_NAMESPACE::GraphProto& graph, std::unordered_map>& externalInputs) +{ + return getSubgraphTensors(graph, externalInputs, false); +} + +} // namespace onnx2trt diff --git a/ConditionalHelpers.hpp b/ConditionalHelpers.hpp new file mode 100644 index 00000000..5260e0fb --- /dev/null +++ b/ConditionalHelpers.hpp @@ -0,0 +1,51 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * Helper functions used for importing the ONNX If-operator follow below. + * + */ + +#pragma once + +#include "ImporterContext.hpp" +#include "Status.hpp" +#include +#include +#include +#include +#include + +namespace onnx2trt +{ + +// Given a subgraph, find all of its external inputs (tensors entering the subgraph). +// The result is returned in `subgraphInputs`, which is a map indexed by layer-name and with values indicating a set +// of external input indices. +Status getSubgraphInputs( + const ::ONNX_NAMESPACE::GraphProto& graph, std::unordered_map>& subgraphInputs); + +// Given a subgraph, find all of its external outputs (tensors exiting the subgraph). +// The result is returned in `subgraphInputs`, which is a map indexed by layer-name and with values indicating a set +// of external outputs indices. +Status getSubgraphOutputs(const ::ONNX_NAMESPACE::GraphProto& graph, + std::unordered_map>& subgraphOutputs, + const std::vector& reportedOutputs); + +// Take a snapshot of the network before and after parsing the subgraph and return a list +// of newly added network layers. +Status importSubgraph( + IImporterContext* ctx, const ::ONNX_NAMESPACE::GraphProto& subgraph, std::vector& newLayers); + +using InputsMap = std::unordered_map; + +// Add IIfConditionalInputLayers to the inputs of the subgraph indicated by `subgraph`. +onnx2trt::Status addIfInputLayers(IImporterContext* ctx, nvinfer1::IIfConditional* conditional, InputsMap& inputsMap, + const ::ONNX_NAMESPACE::GraphProto& subgraph, const std::vector& newLayers); + +// Add IIfConditionalOutputLayers to the outputs of the subgraph indicated by `subgraph`. +onnx2trt::Status addIfOutputLayers(IImporterContext* ctx, nvinfer1::IIfConditional* conditional, + const ::ONNX_NAMESPACE::GraphProto& thenGraph, const std::vector& thenLayers, + const ::ONNX_NAMESPACE::GraphProto& elseGraph, const std::vector& elseLayers, + std::vector& graphOutputs); + +} // namespace onnx2trt diff --git a/ImporterContext.hpp b/ImporterContext.hpp index 88273607..af45e1ee 100644 --- a/ImporterContext.hpp +++ b/ImporterContext.hpp @@ -84,8 +84,9 @@ class ImporterContext final : public IImporterContext int64_t mSuffixCounter{0}; // increasing suffix counter used to uniquify layer names. std::unordered_set mUnsupportedShapeTensors; // Container to hold output tensor names of layers that produce shape tensor outputs but do not natively support them. StringMap mLoopTensors; // Container to map subgraph tensors to their original outer graph names. 
- std::string mOnnxFileLocation; // Keep track of the directory of the parsed ONNX file + std::string mOnnxFileLocation; // Keep track of the directory of the parsed ONNX file std::unique_ptr mErrorWrapper; // error recorder to control TRT errors + StringMap mConstantLayers; public: ImporterContext(nvinfer1::INetworkDefinition* network, nvinfer1::ILogger* logger) @@ -178,6 +179,15 @@ class ImporterContext final : public IImporterContext LOG_VERBOSE("Registering layer: " << uniqueName << " for ONNX node: " << basename); layer->setName(uniqueName.c_str()); + if (layer->getType() == nvinfer1::LayerType::kCONSTANT) + { + if (basename != uniqueName) + { + LOG_ERROR("Constant layer: " << uniqueName << " can be a duplicate of: " << basename); + assert(!"Internal error: duplicate constant layers for the same weights"); + } + mConstantLayers.insert({uniqueName, static_cast(layer)}); + } } } @@ -271,6 +281,20 @@ class ImporterContext final : public IImporterContext { return mErrorWrapper ? mErrorWrapper->getErrorRecorder() : nullptr; } + nvinfer1::IConstantLayer* getConstantLayer(const char* name) const final + { + if (name == nullptr) + { + return nullptr; + } + auto const iter = mConstantLayers.find(name); + if (iter == mConstantLayers.end()) + { + return nullptr; + } + return iter->second; + } + private: std::string generateUniqueName(std::set& namesSet, const std::string& basename) { diff --git a/ModelImporter.cpp b/ModelImporter.cpp index efe15396..8080373d 100644 --- a/ModelImporter.cpp +++ b/ModelImporter.cpp @@ -223,27 +223,96 @@ Status parseGraph(IImporterContext* ctx, const ::ONNX_NAMESPACE::GraphProto& gra return Status::success(); } -Status importInput(ImporterContext* ctx, ::ONNX_NAMESPACE::ValueInfoProto const& input, nvinfer1::ITensor** tensor) +Status importInput(ImporterContext* ctx, ::ONNX_NAMESPACE::ValueInfoProto const& input, nvinfer1::ITensor** tensor, + std::vector& namedDims) { auto const& onnxDtype = input.type().tensor_type(); nvinfer1::DataType trtDtype; ASSERT_INPUT(convertDtype(onnxDtype.elem_type(), &trtDtype) && "Failed to convert ONNX date type to TensorRT data type.", ErrorCode::kUNSUPPORTED_NODE, input.name()); nvinfer1::Dims trt_dims; - ASSERT_INPUT(convertOnnxDims(onnxDtype.shape().dim(), trt_dims) && "Failed to convert ONNX dimensions to TensorRT dimensions.", ErrorCode::kUNSUPPORTED_GRAPH, input.name()); + size_t const oldNbNamedDimensions = namedDims.size(); + ASSERT_INPUT(convertOnnxDims(onnxDtype.shape().dim(), trt_dims, namedDims) && "Failed to convert ONNX dimensions to TensorRT dimensions.", ErrorCode::kUNSUPPORTED_GRAPH, input.name()); nvinfer1::ITensor* userInput = ctx->getUserInput(input.name().c_str()); if (userInput) { ASSERT_INPUT(userInput && "User input is missing.", ErrorCode::kINVALID_VALUE, input.name()); - // Note: We intentionally don't check dimensions/dtype here so that users can change the input shape/type if - // they want to. + // Intentionally don't check dimensions/dtype here so that users can change the input shape/type if + // they want to. However, equalities implied by dimension names are nonetheless respected. 
*tensor = userInput; - return Status::success(); + } + else + { + LOG_VERBOSE( + "Adding network input: " << input.name() << " with dtype: " << trtDtype << ", dimensions: " << trt_dims); + ASSERT_INPUT( (*tensor = ctx->network()->addInput(input.name().c_str(), trtDtype, trt_dims)) && "Failed to add input to the network.", + ErrorCode::kUNSUPPORTED_NODE, input.name()); + } + + // Fill in field `tensor` for any dimensions that had names in the ONNX. + for (auto i = oldNbNamedDimensions; i < namedDims.size(); ++i) + { + namedDims[i].tensor = *tensor; + } + return Status::success(); +} + +//! Add equality assertions for dimensions with the same name. +static Status assertDimsWithSameNameAreEqual(ImporterContext* ctx, std::vector& namedDims) +{ + // Cache for IShapeLayer + std::unordered_map shapeMap; + + // Sort records by name of dimension, using stable_sort for reproducibility. + std::stable_sort(namedDims.begin(), namedDims.end(), + [](const NamedDimension& x, const NamedDimension& y) { return x.dimParam < y.dimParam; }); + + // Each loop iteration covers a sequence of named dimensions with the same name. + // For each sequence, add IAssertionLayers that assert that the values are equal. + // TensorRT knows about transitive closure of equality, so just add the assertions + // for adjacent records. + decltype(namedDims.begin()) j; + for (auto i = namedDims.begin(); i < namedDims.end(); i = j) + { + // Walk j forward so that [i,j) is indices of named dimensions with the same name. + j = i; + do + { + ++j; + } while (j != namedDims.end() && j->dimParam == i->dimParam); + + if (j - i < 2) + { + // Single occurrence of name is uninteresting. + continue; + } + + std::ostringstream message; + message << "input dimensions named " << i->tensor->getName() << " must be equal"; + + // prev is the current end of the daisy chain. + nvinfer1::ITensor* prev = nullptr; + for (auto k = i; k < j; ++k) + { + // Create ITensor "next" with dimension length for record k. + auto& shape = shapeMap[k->tensor]; + if (shape == nullptr) + { + shape = ctx->network()->addShape(*k->tensor); + } + auto* slice = ctx->network()->addSlice(*shape->getOutput(0), {1, {k->index}}, {1, {1}}, {1, {1}}); + nvinfer1::ITensor* next = slice->getOutput(0); + + if (prev) + { + // Add a link to the chain. 
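+                // Assert prev == next: an IElementWiseLayer (kEQUAL) feeds an IAssertionLayer so that
+                // TensorRT can reject inputs whose same-named dimensions have different lengths.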
+ auto* equal = ctx->network()->addElementWise(*prev, *next, nvinfer1::ElementWiseOperation::kEQUAL); + auto* assertion = ctx->network()->addAssertion(*equal->getOutput(0), message.str().c_str()); + ASSERT(assertion != nullptr && "addAssertion failed", ErrorCode::kMODEL_DESERIALIZE_FAILED); + } + prev = next; + } } - LOG_VERBOSE( - "Adding network input: " << input.name() << " with dtype: " << trtDtype << ", dimensions: " << trt_dims); - ASSERT_INPUT( (*tensor = ctx->network()->addInput(input.name().c_str(), trtDtype, trt_dims)) && "Failed to add input to the network.", - ErrorCode::kUNSUPPORTED_NODE, input.name()); return Status::success(); } @@ -258,19 +327,20 @@ Status importInputs(ImporterContext* ctx, ::ONNX_NAMESPACE::GraphProto const& gr initializers.emplace(initializer.name()); } + std::vector namedDims; for (const ::ONNX_NAMESPACE::ValueInfoProto& input : graph.input()) { TensorOrWeights tensor; if (!initializers.count(input.name())) { - nvinfer1::ITensor* tensor_ptr; - CHECK(importInput(ctx, input, &tensor_ptr)); + nvinfer1::ITensor* tensor_ptr{nullptr}; + CHECK(importInput(ctx, input, &tensor_ptr, namedDims)); tensor = tensor_ptr; } ctx->registerTensor(std::move(tensor), input.name()); } - return Status::success(); + return assertDimsWithSameNameAreEqual(ctx, namedDims); } Status deserialize_onnx_model(void const* serialized_onnx_model, size_t serialized_onnx_model_size, @@ -419,7 +489,6 @@ bool ModelImporter::supportsModel( } return allSupported; } - // Mark experimental ops as unsupported bool ModelImporter::supportsOperator(const char* op_name) const { @@ -508,8 +577,10 @@ Status ModelImporter::importModel( ASSERT(!_importer_ctx.network()->hasImplicitBatchDimension() && "This version of the ONNX parser only supports TensorRT INetworkDefinitions with an explicit batch dimension. Please ensure the network was created using the EXPLICIT_BATCH NetworkDefinitionCreationFlag.", ErrorCode::kINVALID_VALUE); auto* ctx = &_importer_ctx; _importer_ctx.clearOpsets(); +#if ENABLE_STD_PLUGIN // Initialize plugin registry initLibNvInferPlugins(static_cast(&ctx->logger()), ""); +#endif // ENABLE_STD_PLUGIN for (int i = 0; i < model.opset_import().size(); ++i) { std::string domain = model.opset_import(i).domain(); diff --git a/ModelImporter.hpp b/ModelImporter.hpp index 673602e9..e4896451 100644 --- a/ModelImporter.hpp +++ b/ModelImporter.hpp @@ -43,14 +43,6 @@ class ModelImporter : public nvonnxparser::IParser { delete this; } - // virtual void registerOpImporter(std::string op, - // NodeImporter const &node_importer) override { - // // Note: This allows existing importers to be replaced - // _op_importers[op] = node_importer; - //} - // virtual Status const &setInput(const char *name, - // nvinfer1::ITensor *input) override; - // virtual Status const& setOutput(const char* name, nvinfer1::ITensor** output) override; int getNbErrors() const override { return _errors.size(); diff --git a/README.md b/README.md index c03b78c1..0317069f 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ For press and other inquiries, please contact Hector Marinez at hmarinez@nvidia. ## Supported TensorRT Versions -Development on the Master branch is for the latest version of [TensorRT 8.0.1.6](https://developer.nvidia.com/nvidia-tensorrt-download) with full-dimensions and dynamic shape support. +Development on the Master branch is for the latest version of [TensorRT 8.2.0.6](https://developer.nvidia.com/nvidia-tensorrt-download) with full-dimensions and dynamic shape support. 
For previous versions of TensorRT, refer to their respective branches. @@ -48,12 +48,12 @@ Current supported ONNX operators are found in the [operator support matrix](docs ### Dependencies - [Protobuf >= 3.0.x](https://github.com/google/protobuf/releases) - - [TensorRT 8.0.1.6](https://developer.nvidia.com/tensorrt) - - [TensorRT 8.0.1.6 open source libaries (master branch)](https://github.com/NVIDIA/TensorRT/) + - [TensorRT 8.2.0.6](https://developer.nvidia.com/tensorrt) + - [TensorRT 8.2.0.6 open source libraries (master branch)](https://github.com/NVIDIA/TensorRT/) ### Building -For building within docker, we recommend using and setting up the docker containers as instructed in the main [TensorRT repository](https://github.com/NVIDIA/TensorRT#setting-up-the-build-environment) to build the onnx-tensorrt library. +For building within docker, we recommend using and setting up the docker containers as instructed in the main [TensorRT repository](https://github.com/NVIDIA/TensorRT#setting-up-the-build-environment) to build the onnx-tensorrt library. Once you have cloned the repository, you can build the parser libraries and executables by running: @@ -63,7 +63,9 @@ Once you have cloned the repository, you can build the parser libraries and exec // Ensure that you update your LD_LIBRARY_PATH to pick up the location of the newly built library: export LD_LIBRARY_PATH=$PWD:$LD_LIBRARY_PATH -For building only the libraries, append `-DBUILD_LIBRARY_ONLY=1` to the CMake build command. If your model has Gather or GatherElements operations with negative indices, add `-DSUPPORT_NEGATIVE_GATHER` to the build command. Note that enabling negative-indices gather will have a performance impact on gathers with non-negative indices. +Note that this project has a dependency on CUDA. By default the build will look in `/usr/local/cuda` for the CUDA toolkit installation. If your CUDA path is different, overwrite the default path by providing `-DCUDA_TOOLKIT_ROOT_DIR=` in the CMake command. + +For building only the libraries, append `-DBUILD_LIBRARY_ONLY=1` to the CMake build command. ### Experimental Ops All experimental operators will be considered unsupported by the ONNX-TRT's `supportsModel()` function. @@ -99,7 +101,7 @@ Python bindings for the ONNX-TensorRT parser are packaged in the shipped `.whl` python3 -m pip install /python/tensorrt-8.x.x.x-cp-none-linux_x86_64.whl -TensorRT 8.0.1.6 supports ONNX release 1.6.0. Install it with: +TensorRT 8.2.0.6 supports ONNX release 1.6.0.
Install it with: python3 -m pip install onnx==1.6.0 diff --git a/ShapedWeights.cpp b/ShapedWeights.cpp index 760bac19..d42e4631 100644 --- a/ShapedWeights.cpp +++ b/ShapedWeights.cpp @@ -143,6 +143,7 @@ bool transposeWeights(ShapedWeights const& weights, nvinfer1::Permutation const& new_shape.d[d] = shape.d[perm.order[d]]; result->shape.d[d] = new_shape.d[d]; } + if (shape.nbDims <= 4) { diff --git a/Status.hpp b/Status.hpp index f56c6e93..ba25de71 100644 --- a/Status.hpp +++ b/Status.hpp @@ -9,6 +9,14 @@ #include #include +#ifndef ENABLE_STD_PLUGIN +#define ENABLE_STD_PLUGIN 1 +#endif // ENABLE_STD_PLUGIN + +#ifndef ENABLE_SAFE_PLUGIN +#define ENABLE_SAFE_PLUGIN 0 +#endif // ENABLE_SAFE_PLUGIN + // Used to strip out build path information from debug prints #if defined(SOURCE_LENGTH) #define __FILENAME__ (__FILE__ + SOURCE_LENGTH) diff --git a/builtin_op_importers.cpp b/builtin_op_importers.cpp index 937eaece..624fc5bb 100644 --- a/builtin_op_importers.cpp +++ b/builtin_op_importers.cpp @@ -3,6 +3,7 @@ */ #include "builtin_op_importers.hpp" +#include "ConditionalHelpers.hpp" #include "LoopHelpers.hpp" #include "ModelImporter.hpp" #include "NvInferPlugin.h" @@ -16,12 +17,11 @@ #include #include #include // For std::memcpy, std::memset +#include #include #include // For std::iota #include #include -#include -#include namespace onnx2trt { @@ -470,7 +470,6 @@ DEFINE_BUILTIN_OP_IMPORTER(Clip) if (ctx->getOpsetVersion() >= 11) { - int numInputs = inputs.size(); // Handle "min" node input. if (numInputs == 2) { @@ -612,7 +611,7 @@ DEFINE_BUILTIN_OP_IMPORTER(Conv) ErrorCode::kUNSUPPORTED_NODE); } // Handle Multi-input convolution - return convDeconvMultiInput(ctx, node, inputs, true /*isConv*/); + return convMultiInput(ctx, node, inputs); } nvinfer1::ITensor* tensorPtr = &convertToTensor(inputs.at(0), ctx); @@ -726,46 +725,46 @@ DEFINE_BUILTIN_OP_IMPORTER(Conv) // When input.nbDims = 3, we expand it to 4D DEFINE_BUILTIN_OP_IMPORTER(ConvTranspose) { - if (inputs.at(1).is_tensor()) - { - if (inputs.size() == 3) + // Expand spatial dims from 1D to 2D, return true if reshaped activation + const auto NCWtoNCHW = [&ctx, &node](nvinfer1::ITensor*& tensor, nvinfer1::Dims& tensorShape) { + if (tensor && tensor->getDimensions().nbDims == 3) { - ASSERT(inputs.at(2).is_weights() - && "The bias tensor is required to be an initializer for the Deconvolution operator", - ErrorCode::kUNSUPPORTED_NODE); + const std::vector axes{3}; + tensor = unsqueezeTensor(ctx, node, *tensor, axes); + tensorShape = tensor->getDimensions(); + return true; } - // Handle Multi-input deconvolution - return convDeconvMultiInput(ctx, node, inputs, false /*isConv*/); - } + // for initializer, just change the shape by appending 1 + if (tensorShape.nbDims == 3) + { + tensorShape.nbDims = 4; + tensorShape.d[3] = 1; + } + return false; + }; + + ASSERT((inputs.size() < 3 || inputs.at(2).is_weights()) + && "The bias tensor is required to be an initializer for the Deconvolution operator", + ErrorCode::kUNSUPPORTED_NODE); nvinfer1::ITensor* tensorPtr = &convertToTensor(inputs.at(0), ctx); + nvinfer1::ITensor* kernelTensorPtr = inputs.at(1).is_tensor() ? &convertToTensor(inputs.at(1), ctx) : nullptr; nvinfer1::Dims dims = tensorPtr->getDimensions(); // Deconvolution input must be at least 3D and at most 5D. 
ASSERT(dims.nbDims >= 3 && dims.nbDims <= 5 && "TensorRT only supports 1D, 2D or 3D deconvolutions!", ErrorCode::kUNSUPPORTED_NODE); - // Deconvolution weights must be an initializer - ASSERT( (inputs.at(1).is_weights()) && "ConvTranspose weights must be an initializer", ErrorCode::kUNSUPPORTED_NODE); // Kernel weights have layout [C, M/group, k1, k2, (k3)] - auto kernelWeights = inputs.at(1).weights(); + auto kernelShape = inputs.at(1).shape(); - bool needToExpandDims = (dims.nbDims == 3); - if (needToExpandDims) - { - std::vector axes{3}; - tensorPtr = unsqueezeTensor(ctx, node, *tensorPtr, axes); - ASSERT(tensorPtr && "Failed to unsqueeze tensor.", ErrorCode::kUNSUPPORTED_NODE); - dims = tensorPtr->getDimensions(); - } - if (kernelWeights.shape.nbDims == 3) - { - kernelWeights.shape.nbDims = 4; - kernelWeights.shape.d[3] = 1; - } + bool needReshapeBack = NCWtoNCHW(tensorPtr, dims); + NCWtoNCHW(kernelTensorPtr, kernelShape); const int nbSpatialDims = dims.nbDims - 2; // Check that the number of spatial dimensions and the kernel shape matches up. - ASSERT( (nbSpatialDims == kernelWeights.shape.nbDims - 2) && "The number of spatial dimensions and the kernel shape doesn't match up", ErrorCode::kUNSUPPORTED_NODE); + ASSERT((nbSpatialDims == kernelShape.nbDims - 2) + && "The number of spatial dimensions and the kernel shape doesn't match up", + ErrorCode::kUNSUPPORTED_NODE); // Get all attributes OnnxAttrs attrs(node, ctx); @@ -780,29 +779,31 @@ DEFINE_BUILTIN_OP_IMPORTER(ConvTranspose) bool exclude_padding = false; int ngroup = attrs.get("group", 1); - int noutput = kernelWeights.shape.d[1] * ngroup; // Note: Weights order is CKRS + int noutput = kernelShape.d[1] * ngroup; // Note: Weights order is CKRS // Check for bias_weights nvinfer1::Weights biasWeights; if (inputs.size() == 3) { - ASSERT(inputs.at(2).is_weights() && "The bias tensor is required to be an initializer.", ErrorCode::kUNSUPPORTED_NODE); auto shapedBiasWeights = inputs.at(2).weights(); // ONNX requires shapedBiasWeights to be 1D - ASSERT(shapedBiasWeights.shape.nbDims == 1 && "The bias tensor is required to be 1D.", ErrorCode::kINVALID_NODE); - ASSERT( (shapedBiasWeights.shape.d[0] == noutput) && "The shape of the bias tensor does not align with the shape of the output.", ErrorCode::kINVALID_NODE); + ASSERT( + shapedBiasWeights.shape.nbDims == 1 && "The bias tensor is required to be 1D.", ErrorCode::kINVALID_NODE); + ASSERT((shapedBiasWeights.shape.d[0] == noutput) + && "The shape of the bias tensor does not align with the shape of the output.", + ErrorCode::kINVALID_NODE); biasWeights = shapedBiasWeights; } else { - biasWeights = ShapedWeights::empty(kernelWeights.type); + biasWeights = ShapedWeights::empty(::ONNX_NAMESPACE::TensorProto::FLOAT); } // Kernel shape either comes from the attributes or extracted from the kernel weights shape kernelSize.nbDims = nbSpatialDims; for (int i = 1; i <= nbSpatialDims; ++i) { - kernelSize.d[nbSpatialDims - i] = kernelWeights.shape.d[kernelWeights.shape.nbDims - i]; + kernelSize.d[nbSpatialDims - i] = kernelShape.d[kernelShape.nbDims - i]; } getKernelParams(ctx, node, &kernelSize, &strides, &begPadding, &endPadding, paddingMode, exclude_padding, @@ -810,8 +811,8 @@ DEFINE_BUILTIN_OP_IMPORTER(ConvTranspose) for (int i = 1; i <= nbSpatialDims; ++i) { - ASSERT( (kernelSize.d[nbSpatialDims - i] == kernelWeights.shape.d[kernelWeights.shape.nbDims - i]) - && "Attribute kernel_shape misaligns with the dimensions of the weight tensor.", + ASSERT((kernelSize.d[nbSpatialDims - i] == 
kernelShape.d[kernelShape.nbDims - i]) + && "Attribute kernel_shape misaligns with the dimensions of the weight tensor.", ErrorCode::kUNSUPPORTED_NODE); } @@ -869,7 +870,9 @@ DEFINE_BUILTIN_OP_IMPORTER(ConvTranspose) } } - nvinfer1::Weights emptyBiasWeights = ShapedWeights::empty(kernelWeights.type); + const auto emptyBiasWeights = ShapedWeights::empty(::ONNX_NAMESPACE::TensorProto::FLOAT); + const auto kernelWeights + = kernelTensorPtr ? nvinfer1::Weights{nvinfer1::DataType::kFLOAT, nullptr, 0} : inputs.at(1).weights(); // Create a deconvolution layer and set known attributes - strides,ngroups, and dilations // If there is still output padding, remove the bias weights. Bias will be added below. @@ -878,6 +881,14 @@ DEFINE_BUILTIN_OP_IMPORTER(ConvTranspose) layer->setStrideNd(strides); layer->setNbGroups(ngroup); layer->setDilationNd(dilations); + if (kernelTensorPtr) + { + layer->setInput(1, *kernelTensorPtr); + } + else + { + ctx->network()->setWeightsName(kernelWeights, inputs.at(1).weights().getName()); + } // Check that 3D deconvolution paddings is valid if (nbSpatialDims == 3) @@ -898,28 +909,38 @@ DEFINE_BUILTIN_OP_IMPORTER(ConvTranspose) // Register layer, along with refittable kernel weights and bias weights (if any) ctx->registerLayer(layer, getNodeName(node)); - ctx->network()->setWeightsName(kernelWeights, inputs.at(1).weights().getName()); - if (inputs.size() == 3) - { - ctx->network()->setWeightsName(biasWeights, inputs.at(2).weights().getName()); - } tensorPtr = layer->getOutput(0); dims = tensorPtr->getDimensions(); // There is still output padding. Add a padding layer to handle it. if (hasOutputPadding) { - // TRT only support 2D padding on the outermost dimensions - ASSERT( (outputPadding.nbDims == 2 || (outputPadding.nbDims == 3 && outputPadding.d[0] == 0)) - && "TensorRT only supports 2D padding on the outermost dimensions.", - ErrorCode::kUNSUPPORTED_NODE); - // Convert 3D padding to 2d padding - if (nbSpatialDims == 3) + LOG_VERBOSE("Padding output deconvolution tensor with: " << outputPadding); + + // Add padding layer + nvinfer1::ITensor* start{}; + nvinfer1::ITensor* totalPadding{}; + std::vector combinePadding{}; + for (int32_t i = 0; i < outputPadding.nbDims; ++i) { - outputPadding = {2, {outputPadding.d[1], outputPadding.d[2]}}; + combinePadding.insert(combinePadding.begin(), 0); + combinePadding.push_back(outputPadding.d[i]); } - LOG_VERBOSE("Padding output deconvolution tensor with: " << outputPadding); - tensorPtr = ctx->network()->addPaddingNd(*tensorPtr, makeDims(2, 0), outputPadding)->getOutput(0); + ASSERT( + convertOnnxPadding(ctx, dims.nbDims, combinePadding, start, totalPadding) && "Failed to convert padding!", + ErrorCode::kUNSUPPORTED_NODE); + const auto size + = ctx->network() + ->addElementWise(shapeOf(*tensorPtr).tensor(ctx), *totalPadding, nvinfer1::ElementWiseOperation::kSUM) + ->getOutput(0); + const auto stride = makeDims(dims.nbDims, 1); + const auto& dummy = stride; + auto* sliceLayer = ctx->network()->addSlice(*tensorPtr, dummy, dummy, stride); + ASSERT(sliceLayer && "Could not create padding layer", ErrorCode::kUNSUPPORTED_NODE); + sliceLayer->setInput(1, *start); + sliceLayer->setInput(2, *size); + sliceLayer->setMode(nvinfer1::SliceMode::kFILL); + tensorPtr = sliceLayer->getOutput(0); // This bias is not handled by deconv. Use an elementwise to handle it. 
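        // The deconvolution above was created with empty bias weights when output padding is present;
        // the bias is added back here with an elementwise sum after the fill-mode slice.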
if (biasWeights.count != 0) @@ -935,7 +956,12 @@ DEFINE_BUILTIN_OP_IMPORTER(ConvTranspose) } } - if (needToExpandDims) + if (inputs.size() == 3) + { + ctx->network()->setWeightsName(biasWeights, inputs.at(2).weights().getName()); + } + + if (needReshapeBack) { std::vector axes{3}; tensorPtr = squeezeTensor(ctx, node, *tensorPtr, axes); @@ -1156,17 +1182,18 @@ NodeImportResult QuantDequantLinearHelper( ASSERT(zeroPointSize == scaleSize && "The scale and zero-point must have the same size", nvonnxparser::ErrorCode::kINVALID_NODE); - // Read the optional quantization axis attribute. + // Read the optional quantization axis attribute. Set it to the rank of the input tensor if not provided OnnxAttrs attrs(node, ctx); - const int32_t INVALID_AXIS = dataInput.getDimensions().nbDims; - int32_t axis = attrs.get("axis", INVALID_AXIS); + const int32_t nbDims = dataInput.getDimensions().nbDims; + int32_t axis = attrs.get("axis", nbDims); + CHECK(convertAxis(axis, nbDims)); if (scaleSize != 1) { // Per-Channel Quantization. // We assume this is weight-quantization with dimensions KCRS (K is # output channels). // Activations-quantization does not support per-axis quantization. - if (axis == INVALID_AXIS) + if (axis == nbDims) { axis = 0; } @@ -1178,8 +1205,7 @@ NodeImportResult QuantDequantLinearHelper( else { // Per-Tensor Quantization. - ASSERT((axis == 1 || axis == INVALID_AXIS) && "Found quantization axis not compatible with a single quantization scale", nvonnxparser::ErrorCode::kINVALID_NODE); - // Currently this is ignored by TRT, but it is required by addScaleNd (for computing nbSpatialDims). + // Currently axis is ignored by TRT, but it is required here by addScaleNd (for computing nbSpatialDims). axis = 1; } @@ -1190,8 +1216,7 @@ NodeImportResult QuantDequantLinearHelper( nvinfer1::IDequantizeLayer* dq = ctx->network()->addDequantize(dataInput, *scaleInput); ASSERT(dq && "Failed to create Dequantize layer.", ErrorCode::kUNSUPPORTED_NODE); dq->setAxis(axis); - nodeName += std::string("_quantize_scale_node"); - dq->setName(nodeName.c_str()); + nodeName += std::string("_dequantize_scale_node"); layer = dq; } else @@ -1274,6 +1299,43 @@ DEFINE_BUILTIN_OP_IMPORTER(Dropout) } } +DEFINE_BUILTIN_OP_IMPORTER(Einsum) +{ + OnnxAttrs attrs(node, ctx); + const std::string equation = attrs.get("equation"); + + std::string invalidCharacters; + for (char c : equation) + { + if ((c < 'a' || c > 'z') && c != '-' && c != '>' && c != '.' && c != ',' && c != ' ') + { + invalidCharacters.push_back(c); + invalidCharacters.push_back(','); + } + } + + if (!invalidCharacters.empty()) + { + invalidCharacters.pop_back(); + return MAKE_ERROR("Invalid character(s) in Einsum equation: " + invalidCharacters, ErrorCode::kINVALID_NODE); + } + + ASSERT((!inputs.empty()) && "Inputs vector is empty.", ErrorCode::kINVALID_NODE); + + std::vector inputTensors; + + for (auto input : inputs) + { + auto* tensor_ptr = &convertToTensor(input, ctx); + inputTensors.push_back(tensor_ptr); + } + auto nbInputs = static_cast(inputTensors.size()); + nvinfer1::IEinsumLayer* einsumLayer = ctx->network()->addEinsum(inputTensors.data(), nbInputs, equation.c_str()); + ctx->registerLayer(einsumLayer, getNodeName(node)); + + RETURN_FIRST_OUTPUT(einsumLayer); +} + DEFINE_BUILTIN_OP_IMPORTER(Elu) { OnnxAttrs attrs(node, ctx); @@ -1319,9 +1381,12 @@ DEFINE_BUILTIN_OP_IMPORTER(Expand) const ShapeTensor starts = similar(ctx, newDims, 0); // Do the broadcast rule. const ShapeTensor sizes = broadcast(ctx, newDims, newShape); - // Compute (x > 1 ? 
1 : 0) for x in newDims, assuming positive x, using only TensorRT operations. + + const ShapeTensor delta = sub(ctx, sizes, newDims); const ShapeTensor one = shapeVector(1); - const ShapeTensor strides = min(ctx, one, sub(ctx, newDims, one)); + // stride 1 for dims where sizes same as Slice input, 0 for not the same. + // delta is non-negative for Expand here + const ShapeTensor strides = sub(ctx, one, min(ctx, one, delta)); nvinfer1::ISliceLayer* sliceLayer = addSlice(ctx, newInputTensor, starts, sizes, strides); ctx->registerLayer(sliceLayer, getNodeName(node)); @@ -1401,105 +1466,54 @@ DEFINE_BUILTIN_OP_IMPORTER(Floor) DEFINE_BUILTIN_OP_IMPORTER(Gather) { - nvinfer1::ITensor* data = &convertToTensor(inputs.at(0), ctx); + nvinfer1::ITensor& data = convertToTensor(inputs.at(0), ctx); // TRT does not support BOOL input types for this node - ASSERT( (data->getType() != nvinfer1::DataType::kBOOL) && "This version of TensorRT does not support BOOL input type for the Gather operator.", ErrorCode::kUNSUPPORTED_NODE); - - nvinfer1::ITensor* indices = &convertToTensor(inputs.at(1), ctx); + ASSERT( (data.getType() != nvinfer1::DataType::kBOOL) && "This version of TensorRT does not support BOOL input type for the Gather operator.", ErrorCode::kUNSUPPORTED_NODE); + nvinfer1::ITensor& indices = convertToTensor(inputs.at(1), ctx); OnnxAttrs attrs(node, ctx); - int32_t axis = attrs.get("axis", 0); - int32_t nbDims = inputs.at(0).shape().nbDims; + int axis = attrs.get("axis", 0); + int nbDims = inputs.at(0).shape().nbDims; CHECK(convertAxis(axis, nbDims)); LOG_VERBOSE("Using Gather axis: " << axis); - - // Support for negative indices can be enabled through adding -DSUPPORT_NEGATIVE_GATHER=1 in the CMake build command. - // This will unnecessarily reduce performance of networks that use only non-negative Gather indices. -#if SUPPORT_NEGATIVE_GATHER - indices = convertGatherIndices(ctx, data, indices, axis); -#endif // SUPPORT_NEGATIVE_GATHER - - auto* layer = ctx->network()->addGather(*data, *indices, axis); + auto* layer = ctx->network()->addGather(data, indices, axis); ctx->registerLayer(layer, getNodeName(node)); RETURN_FIRST_OUTPUT(layer); } DEFINE_BUILTIN_OP_IMPORTER(GatherElements) { - - // We can treat GatherElements as a regular Gather operation with transformed input and indices tensors. - // Consider a simple example of a 3D tensor with axis = 1. - // The regular forumla of out[i][j][k] = in[i][idx[i][j][k]][k] can be rewritten as out[i][j][k] = in'[idx'[i,j,k]] - // Where in' is a squeezed down 1D representation of the data and idx' is calculated from the following formula: - // idx' = idx[i,j,k] * pitch[1] + bias. The bias is calculated as i*pitch[0] + k*pitch[2]. - - // clang-format off - /* Example: Data is 3D tensor of shape [2,2,2] with values [[[1,2], [3,4]], [[5,6], [7,8]]] - Indices is a 3D tensor of shape [2,2,1] with values [[[0], [1]], [[0], [1]]] - From the original formula, the output is [[[1], [3]], [[5], [7]]], - - Pitch vector of data is [4,2,1]. 
- - idx` calculation: - idx`[0, 0, 0] = [idx[0,0,0]](0) * [pitch[axis]](2) + [i(0)*pitch[0](4)](0) + [k(0)*pitch[2](1)](0) = 0 - idx`[0, 1, 0] = [idx[0,1,0]](1) * [pitch[axis]](2) + [i(0)*pitch[0](4)](0) + [k(0)*pitch[2](1)](0) = 2 - idx`[1, 0, 0] = [idx[1,0,0]](0) * [pitch[axis]](2) + [i(1)*pitch[0](4)](4) + [k(0)*pitch[2](1)](0) = 4 - idx`[1, 1, 0] = [idx[1,1,0]](1) * [pitch[axis]](2) + [i(1)*pitch[0](4)](4) + [k(0)*pitch[2](1)](0) = 6 - = [[[0], [2]], [[4], [6]]] - - After linearizing data to 1D: [1,2,3,4,5,6,7,8], gathering on axis 0 with the new indices gives the same results. - */ - // clang-format on - - nvinfer1::ITensor* data = &convertToTensor(inputs.at(0), ctx); - nvinfer1::ITensor* index = &convertToTensor(inputs.at(1), ctx); - - const nvinfer1::Dims& idxDims = index->getDimensions(); - const nvinfer1::Dims& daDims = data->getDimensions(); - - ASSERT((data->getType() != nvinfer1::DataType::kBOOL) && "This version of TensorRT does not support BOOL input type for the GatherElements operator.", ErrorCode::kUNSUPPORTED_NODE); - - // Note the above tranformation requires dimensions to be known at parse time, so check for dynamic shapes - ASSERT(!isDynamic(daDims) && !isDynamic(idxDims) - && "This version of TenosrRT does not support GatherElements on dynamic shapes!", + nvinfer1::ITensor& data = convertToTensor(inputs.at(0), ctx); + nvinfer1::ITensor& indices = convertToTensor(inputs.at(1), ctx); + const nvinfer1::Dims& dataDims = data.getDimensions(); + ASSERT((data.getType() != nvinfer1::DataType::kBOOL) + && "This version of TensorRT does not support BOOL input type for the GatherElements operator.", ErrorCode::kUNSUPPORTED_NODE); OnnxAttrs attrs(node, ctx); int32_t axis = attrs.get("axis", 0); - int32_t dataNbDims = daDims.nbDims; - - // Support for negative indices can be enabled through adding -DSUPPORT_NEGATIVE_GATHER=1 in the CMake build command. - // This will unnecessarily reduce performance of networks that use only non-negative Gather indices. -#if SUPPORT_NEGATIVE_GATHER - index = convertGatherIndices(ctx, data, index, axis); -#endif // SUPPORT_NEGATIVE_GATHER - + const int32_t dataNbDims = dataDims.nbDims; CHECK(convertAxis(axis, dataNbDims)); LOG_VERBOSE("Using Gather axis: " << axis); - // Calculate data pitches vector, and create axisPitch vector - int64_t nIndx = volume(idxDims); - std::vector pitches = calculatePitches(daDims); - std::vector axisPitch(nIndx, pitches[axis]); - - // Calculate bias vector - std::vector biasVector = calculateBias(daDims, idxDims, pitches, axis); - - // Perform idx` = idx * pitch[axis] + bias calculation. 
- auto* axisPitchTensor = addConstant(ctx, axisPitch, ::ONNX_NAMESPACE::TensorProto::INT32, idxDims)->getOutput(0); - auto* biasTensor = addConstant(ctx, biasVector, ::ONNX_NAMESPACE::TensorProto::INT32, idxDims)->getOutput(0); + auto* layer = ctx->network()->addGatherV2(data, indices, nvinfer1::GatherMode::kELEMENT); + layer->setGatherAxis(axis); + ctx->registerLayer(layer, getNodeName(node)); + RETURN_FIRST_OUTPUT(layer); +} - auto* mul - = ctx->network()->addElementWise(*index, *axisPitchTensor, nvinfer1::ElementWiseOperation::kPROD)->getOutput(0); - auto* newIndices - = ctx->network()->addElementWise(*mul, *biasTensor, nvinfer1::ElementWiseOperation::kSUM)->getOutput(0); +DEFINE_BUILTIN_OP_IMPORTER(GatherND) +{ + nvinfer1::ITensor& data = convertToTensor(inputs.at(0), ctx); + nvinfer1::ITensor& indices = convertToTensor(inputs.at(1), ctx); + ASSERT((data.getType() != nvinfer1::DataType::kBOOL) + && "This version of TensorRT does not support BOOL input type for the GatherND operator.", + ErrorCode::kUNSUPPORTED_NODE); - nvinfer1::Dims flattenDataDims{1, {static_cast(volume(daDims))}}; - auto* reshape = ctx->network()->addShuffle(*data); - reshape->setReshapeDimensions(flattenDataDims); - reshape->setZeroIsPlaceholder(false); + OnnxAttrs attrs(node, ctx); + auto const nbElementWiseDims = attrs.get("batch_dims", 0); - nvinfer1::ITensor* flattenData = reshape->getOutput(0); - auto* layer = ctx->network()->addGather(*flattenData, *newIndices, 0); + auto* layer = ctx->network()->addGatherV2(data, indices, nvinfer1::GatherMode::kND); + layer->setNbElementWiseDims(nbElementWiseDims); ctx->registerLayer(layer, getNodeName(node)); RETURN_FIRST_OUTPUT(layer); } @@ -2114,6 +2128,9 @@ DEFINE_BUILTIN_OP_IMPORTER(If) const ::ONNX_NAMESPACE::GraphProto& elseGraph = attrs.get("else_branch"); // Number of outputs are the same between the two branches. + ASSERT(thenGraph.output_size() == elseGraph.output_size() + && "then/else subgraphs should have the same number of outputs.", + ErrorCode::kINVALID_NODE); const int32_t nbOutputs = thenGraph.output_size(); std::vector graphOutputs; @@ -2127,43 +2144,39 @@ DEFINE_BUILTIN_OP_IMPORTER(If) { graphOutputs.emplace_back(ctx->tensors().at(body.output(i).name())); } + return {graphOutputs}; } - // For tensor conditionals, we need to parse both branches - else + + // + // The condition is not a build-time constant. Construct an if-conditional construct. + // + + // The `condition` tensor must be a scalar boolean. 
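+    // convertToScalar squeezes the single-element condition tensor down to a scalar before it is wired to setCondition() below.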
+ auto* condTensor = convertToScalar(ctx, &convertToTensor(cond, ctx)); + ASSERT(condTensor && "Failed to convert the input cond to a scalar.", ErrorCode::kINVALID_NODE); + + auto conditional = ctx->network()->addIfConditional(); + conditional->setName(getNodeName(node).c_str()); + conditional->setCondition(*condTensor); + + std::vector thenLayers, elseLayers; + CHECK(importSubgraph(ctx, thenGraph, thenLayers)); + CHECK(importSubgraph(ctx, elseGraph, elseLayers)); + + // Names must be unique + for (auto i = 0; i < nbOutputs; i++) { - CHECK(onnx2trt::parseGraph(ctx, thenGraph)); - CHECK(onnx2trt::parseGraph(ctx, elseGraph)); - for (auto i = 0; i < nbOutputs; i++) - { - const auto thenName = thenGraph.output(i).name(); - const auto elseName = elseGraph.output(i).name(); - ASSERT(thenName != elseName && "TensorRT requires conditional subgraphs to have different output tensor names!", ErrorCode::kUNSUPPORTED_NODE); - auto* thenTensor = &convertToTensor(ctx->tensors().at(thenName), ctx); - auto* elseTensor = &convertToTensor(ctx->tensors().at(elseName), ctx); - auto* condTensor = &convertToTensor(cond, ctx); - // While the number and datatypes of the outputs of each branch are equal, the shapes may be different - // TRT only supports dynamic branch selection if the output shapes are equal and if their shapes are broadcastable - CHECK(isBroadcastValid(ctx, thenTensor->getDimensions(), elseTensor->getDimensions())); - // Broadcast the condition tensor to the size of the output tensor for usage with the ISelect layer - CHECK(broadcastTensors(ctx, condTensor, thenTensor)); - const bool needsCast = thenTensor->getType() == nvinfer1::DataType::kBOOL; - if (needsCast) - { - thenTensor = castHelper(ctx, thenTensor, nvinfer1::DataType::kINT32); - elseTensor = castHelper(ctx, elseTensor, nvinfer1::DataType::kINT32); - } - auto* layer = ctx->network()->addSelect(*condTensor, *thenTensor, *elseTensor); - ctx->registerLayer(layer, getNodeName(node)); - if (needsCast) - { - graphOutputs.emplace_back(castHelper(ctx, layer->getOutput(0), nvinfer1::DataType::kBOOL)); - } - else - { - graphOutputs.emplace_back(layer->getOutput(0)); - } - } + const auto thenName = thenGraph.output(i).name(); + const auto elseName = elseGraph.output(i).name(); + ASSERT(thenName != elseName && "TensorRT requires conditional subgraphs to have different output tensor names!", ErrorCode::kUNSUPPORTED_NODE); } + + using InputsMap = std::unordered_map; + InputsMap inputsMap; + CHECK(addIfInputLayers(ctx, conditional, inputsMap, thenGraph, thenLayers)); + CHECK(addIfInputLayers(ctx, conditional, inputsMap, elseGraph, elseLayers)); + CHECK(addIfOutputLayers(ctx, conditional, thenGraph, thenLayers, elseGraph, elseLayers, graphOutputs)); + return {graphOutputs}; } @@ -2197,6 +2210,15 @@ DEFINE_BUILTIN_OP_IMPORTER(InstanceNormalization) int nbDims = tensorPtr->getDimensions().nbDims; ASSERT(nbDims >= 3 && nbDims <= 5 && "TensorRT only supports InstanceNormalization on 3D, 4D, or 5D tensors!", ErrorCode::kUNSUPPORTED_NODE); + + const bool needToExpandDims = (nbDims == 3); + if (needToExpandDims) + { + // Expand spatial dims from 1D to 2D + const std::vector axes{3}; + tensorPtr = unsqueezeTensor(ctx, node, *tensorPtr, axes); + ASSERT(tensorPtr && "Failed to unsqueeze tensor.", ErrorCode::kUNSUPPORTED_NODE); + } auto scale_weights = inputs.at(1).weights(); auto bias_weights = inputs.at(2).weights(); OnnxAttrs attrs(node, ctx); @@ -2204,7 +2226,6 @@ DEFINE_BUILTIN_OP_IMPORTER(InstanceNormalization) const int32_t relu {0}; // the ONNX instance 
norm op does not use the relu parameter const float alpha {0.f}; // the ONNX instance norm op does not use the alpha parameter - // Populate instanceNormalization plugin properties. const std::string pluginName = "InstanceNormalization_TRT"; const std::string pluginVersion = "1"; @@ -2223,7 +2244,30 @@ DEFINE_BUILTIN_OP_IMPORTER(InstanceNormalization) auto* layer = ctx->network()->addPluginV2(&tensorPtr, 1, *plugin); ctx->registerLayer(layer, getNodeName(node)); - RETURN_FIRST_OUTPUT(layer); + tensorPtr = layer->getOutput(0); + + if (needToExpandDims) + { + // Un-expand spatial dims back to 1D + const std::vector axes{3}; + tensorPtr = squeezeTensor(ctx, node, *tensorPtr, axes); + ASSERT(tensorPtr && "Failed to unsqueeze tensor.", ErrorCode::kUNSUPPORTED_NODE); + } + + return {{tensorPtr}}; +} + +DEFINE_BUILTIN_OP_IMPORTER(IsNaN) +{ + // IEEE arithmetic guarantees that x == x is false if x is a NaN, and true otherwise. + const std::vector newInputs{inputs[0], inputs[0]}; + auto equalResult = elementwiseHelper(ctx, node, newInputs, nvinfer1::ElementWiseOperation::kEQUAL); + if (equalResult.is_error()) + { + return equalResult; + } + auto equalRet = equalResult.value().at(0); + return unaryHelper(ctx, node, equalRet, nvinfer1::UnaryOperation::kNOT); } DEFINE_BUILTIN_OP_IMPORTER(LeakyRelu) @@ -3033,97 +3077,153 @@ DEFINE_BUILTIN_OP_IMPORTER(Or) DEFINE_BUILTIN_OP_IMPORTER(Pad) { nvinfer1::ITensor* tensorPtr = &convertToTensor(inputs.at(0), ctx); - int nbDims = tensorPtr->getDimensions().nbDims; - std::vector axes; - // TensorRT only supports 2D padding on the outermost dimensions of an input tensor that is - // at least 4D. Unsqueeze leading dimensions to convert input tensor to 4D if necessary - bool needToExpandDims = (nbDims < 4); - if (needToExpandDims) - { - int diff = 4 - nbDims; - axes.resize(diff); - std::iota(axes.begin(), axes.end(), 0); - tensorPtr = unsqueezeTensor(ctx, node, *tensorPtr, axes); - ASSERT(tensorPtr && "Failed to unsqueeze tensor.", ErrorCode::kUNSUPPORTED_NODE); - nbDims = tensorPtr->getDimensions().nbDims; - } + const int32_t nbDims = tensorPtr->getDimensions().nbDims; - nvinfer1::Dims2 begPadding{0,0}; - nvinfer1::Dims2 endPadding{0,0}; OnnxAttrs attrs(node, ctx); - auto mode = attrs.get("mode", "constant"); - float value{0.f}; - std::vector onnxPadding; + const auto mode = attrs.get("mode", "constant"); + float value{0.F}; + nvinfer1::ITensor* valuePtr = nullptr; + std::vector onnxPadding; if (ctx->getOpsetVersion() < 11) { - value = attrs.get("value", 0.f); - auto padding = attrs.get>("pads"); - onnxPadding = std::vector(padding.begin(), padding.end()); + value = attrs.get("value", 0.F); + auto padding = attrs.get>("pads"); + onnxPadding = std::vector(padding.begin(), padding.end()); + if (onnxPadding.empty()) + { + LOG_VERBOSE("Found no-op pad in node: " + getNodeName(node)); + RETURN_IDENTITY(inputs.at(0)); + } } - // In opset >= 11, padding indicies and values moved from attributes to inputs else { - ASSERT(inputs.at(1).is_weights() && "The input pads is required to be an initializer.", - ErrorCode::kUNSUPPORTED_NODE); - weightsToVector(inputs.at(1).weights(), &onnxPadding); + // In opset >= 11, padding indicies and values moved from attributes to inputs + if (inputs.at(1).is_weights()) + { + weightsToVector(inputs.at(1).weights(), &onnxPadding); + } if (inputs.size() == 3) { - ASSERT(inputs.at(2).is_weights() && "The input constant_value is required to be an initializer.", - ErrorCode::kUNSUPPORTED_NODE); - auto padWeight = inputs.at(2).weights(); - ASSERT( 
(padWeight.count() == 1) && "The input constant_value is required to be a scalar.", ErrorCode::kINVALID_NODE); - value = static_cast(padWeight.values)[0]; + if (inputs.at(2).is_weights()) + { + const auto padWeight = inputs.at(2).weights(); + ASSERT((padWeight.count() == 1) && "The input constant_value is required to be a scalar.", + ErrorCode::kINVALID_NODE); + value = static_cast(padWeight.values)[0]; + } + else + { + valuePtr = &convertToTensor(inputs.at(2), ctx); + } } } - // Passthrough path for no-op padding - if (std::all_of(onnxPadding.begin(), onnxPadding.end(), [](int i){ return i == 0; })) { - LOG_VERBOSE("Found no-op pad in node: " + getNodeName(node)); - RETURN_IDENTITY(inputs.at(0)); - } - - ASSERT(mode == "constant" && value == 0.f && "This version of TensorRT only supports constant 0 padding!", - ErrorCode::kUNSUPPORTED_NODE); - - // Variables to help with padding on NHWC tensors - nvinfer1::Permutation firstPerm; - nvinfer1::Permutation secondPerm; - for (int32_t i = 0; i < nbDims; i++) + nvinfer1::ITensor* start{}; + nvinfer1::ITensor* size{}; + if (onnxPadding.empty()) { - firstPerm.order[i] = i; - secondPerm.order[i] = i; + // the pads is from activation instead of initializer or attributes + nvinfer1::ITensor* onnxPaddingPtr = &convertToTensor(inputs.at(1), ctx); + ASSERT((onnxPaddingPtr->getDimensions().nbDims == 1) && "The pads input must be 1D.", + ErrorCode::kUNSUPPORTED_NODE); + ASSERT(onnxPaddingPtr->getDimensions().d[0] == nbDims * 2 + && "pads should be a 1D tensor of shape [2 * input_rank]", + ErrorCode::kUNSUPPORTED_NODE); + + auto pre = ctx->network() + ->addSlice( + *onnxPaddingPtr, nvinfer1::Dims{1, {0}}, nvinfer1::Dims{1, {nbDims}}, nvinfer1::Dims{1, {1}}) + ->getOutput(0); + auto post = ctx->network() + ->addSlice(*onnxPaddingPtr, nvinfer1::Dims{1, {nbDims}}, nvinfer1::Dims{1, {nbDims}}, + nvinfer1::Dims{1, {1}}) + ->getOutput(0); + + const std::vector zerosVal(nbDims, 0); + const auto zeros + = addConstant(ctx, zerosVal, ::ONNX_NAMESPACE::TensorProto::INT32, nvinfer1::Dims{1, {nbDims}})->getOutput(0); + start = ctx->network()->addElementWise(*zeros, *pre, nvinfer1::ElementWiseOperation::kSUB)->getOutput(0); + const auto totalPadding + = ctx->network()->addElementWise(*pre, *post, nvinfer1::ElementWiseOperation::kSUM)->getOutput(0); + size + = ctx->network() + ->addElementWise(shapeOf(*tensorPtr).tensor(ctx), *totalPadding, nvinfer1::ElementWiseOperation::kSUM) + ->getOutput(0); } - ASSERT(convertOnnxPadding(onnxPadding, begPadding, endPadding, firstPerm, secondPerm) && "TensorRT only supports 2D padding!", ErrorCode::kUNSUPPORTED_NODE); - // TODO: Remove this once TRT's padding layer supports non-activation types. 
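// Worked example (hypothetical shapes, for illustration only) of the slice-based padding this hunk introduces:
// for an input of shape [1, 3, 5] with ONNX pads = [0, 0, 1, 0, 0, 2] (pre = [0, 0, 1], post = [0, 0, 2]),
// the importer builds
//     start = 0 - pre            = [0, 0, -1]
//     size  = shape + pre + post = [1, 3, 8]
// and selects the out-of-bounds policy from the ONNX mode:
//     "constant" -> nvinfer1::SliceMode::kFILL    (fill value taken from `value` / `constant_value`)
//     "reflect"  -> nvinfer1::SliceMode::kREFLECT
//     "edge"     -> nvinfer1::SliceMode::kCLAMP
// A minimal standalone sketch of the same idea, assuming an existing INetworkDefinition* net and ITensor* in:
//     auto* pad = net->addSlice(*in, nvinfer1::Dims3{0, 0, -1}, nvinfer1::Dims3{1, 3, 8}, nvinfer1::Dims3{1, 1, 1});
//     pad->setMode(nvinfer1::SliceMode::kFILL);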
- const nvinfer1::DataType originalDtype = tensorPtr->getType(); - const bool needsCast = originalDtype != nvinfer1::DataType::kFLOAT; - if (needsCast) + else { - tensorPtr = castHelper(ctx, tensorPtr, nvinfer1::DataType::kFLOAT); + // passthrough path for no-op padding + if (std::all_of(onnxPadding.begin(), onnxPadding.end(), [](int32_t i) { return i == 0; })) + { + LOG_VERBOSE("Found no-op pad in node: " + getNodeName(node)); + RETURN_IDENTITY(inputs.at(0)); + } + + // the pads is from initializer or attributes + nvinfer1::ITensor* totalPadding = nullptr; + ASSERT(convertOnnxPadding(ctx, nbDims, onnxPadding, start, totalPadding) && "Failed to convert padding!", + ErrorCode::kUNSUPPORTED_NODE); + size + = ctx->network() + ->addElementWise(shapeOf(*tensorPtr).tensor(ctx), *totalPadding, nvinfer1::ElementWiseOperation::kSUM) + ->getOutput(0); } - // Transpose tensor if necessary to support generic 2D padding - tensorPtr = transposeTensor(ctx, node, *tensorPtr, firstPerm); - auto* layer = ctx->network()->addPaddingNd(*tensorPtr, begPadding, endPadding); + // add slice node + const auto stride = makeDims(nbDims, 1); + const auto& dummy = stride; + auto* layer = ctx->network()->addSlice(*tensorPtr, dummy, dummy, stride); ASSERT(layer && "Could not create padding layer", ErrorCode::kUNSUPPORTED_NODE); - ctx->registerLayer(layer, getNodeName(node)); - tensorPtr = layer->getOutput(0); - - tensorPtr = transposeTensor(ctx, node, *tensorPtr, secondPerm); + layer->setInput(1, *start); + layer->setInput(2, *size); + if (mode == "constant") + { + layer->setMode(nvinfer1::SliceMode::kFILL); - if (needsCast) + if (valuePtr) + { + layer->setInput(4, *valuePtr); + } + else if (value != 0.F) + { + // constant_value must have the same data type as the input tensor + nvinfer1::ITensor* fillValue = nullptr; + switch (tensorPtr->getType()) + { + case nvinfer1::DataType::kFLOAT: + case nvinfer1::DataType::kHALF: + case nvinfer1::DataType::kINT8: + fillValue = addConstant(ctx, std::vector{value}, ::ONNX_NAMESPACE::TensorProto::FLOAT, + nvinfer1::Dims{ + 0, {0}})->getOutput(0); + break; + default: + fillValue = addConstant(ctx, std::vector{static_cast(value)}, + ::ONNX_NAMESPACE::TensorProto::INT32, + nvinfer1::Dims{ + 0, {0}})->getOutput(0); + break; + } + ASSERT(fillValue && "Could not create layer for constant_value", ErrorCode::kUNSUPPORTED_NODE); + layer->setInput(4, *fillValue); + } + } + else if (mode == "reflect") { - tensorPtr = castHelper(ctx, tensorPtr, originalDtype); + layer->setMode(nvinfer1::SliceMode::kREFLECT); } - - // Squeeze back to original rank if necessary - if (needToExpandDims) + else if (mode == "edge") { - tensorPtr = squeezeTensor(ctx, node, *tensorPtr, axes); - ASSERT(tensorPtr && "Failed the squeeze tensor.", ErrorCode::kUNSUPPORTED_NODE); + layer->setMode(nvinfer1::SliceMode::kCLAMP); } - return {{tensorPtr}}; + else + { + return MAKE_ERROR("Unsupported pad mode", ErrorCode::kUNSUPPORTED_NODE); + } + + ctx->registerLayer(layer, getNodeName(node)); + return {{layer->getOutput(0)}}; } DEFINE_BUILTIN_OP_IMPORTER(ParametricSoftplus) @@ -3382,6 +3482,16 @@ DEFINE_BUILTIN_OP_IMPORTER(Relu) return activationHelper(ctx, node, inputs, nvinfer1::ActivationType::kRELU); } +DEFINE_BUILTIN_OP_IMPORTER(Sign) +{ + return unaryHelper(ctx, node, inputs.at(0), nvinfer1::UnaryOperation::kSIGN); +} + +DEFINE_BUILTIN_OP_IMPORTER(Round) +{ + return unaryHelper(ctx, node, inputs.at(0), nvinfer1::UnaryOperation::kROUND); +} + DEFINE_BUILTIN_OP_IMPORTER(Resize) { nvinfer1::ITensor& input = 
convertToTensor(inputs.at(0), ctx); @@ -3958,6 +4068,36 @@ DEFINE_BUILTIN_OP_IMPORTER(Scan) return {nodeOutputs}; } +DEFINE_BUILTIN_OP_IMPORTER(ScatterND) +{ + auto* layer = addScatterLayer(ctx, inputs, nvinfer1::ScatterMode::kND); + ctx->registerLayer(layer, getNodeName(node)); + RETURN_FIRST_OUTPUT(layer); +} + +DEFINE_BUILTIN_OP_IMPORTER(ScatterElements) +{ + auto* layer = addScatterLayer(ctx, inputs, nvinfer1::ScatterMode::kELEMENT); + OnnxAttrs attrs(node, ctx); + int32_t axis = attrs.get("axis", 0); + int32_t nbDims = inputs.at(0).shape().nbDims; + CHECK(convertAxis(axis, nbDims)); + layer->setAxis(axis); + ctx->registerLayer(layer, getNodeName(node)); + RETURN_FIRST_OUTPUT(layer); +} + +DEFINE_BUILTIN_OP_IMPORTER(Scatter) +{ + // Scatter was deprecated in Opset 11 and replaced by ScatterElements + if (ctx->getOpsetVersion() >= 11) + { + LOG_WARNING("Scatter was deprecated in Opset 11. Node: \"" << getNodeName(node) << "\" will be converted to ScatterElements."); + } + + return importScatterElements(ctx, node, inputs); +} + DEFINE_BUILTIN_OP_IMPORTER(Selu) { OnnxAttrs attrs(node, ctx); @@ -4187,6 +4327,7 @@ DEFINE_BUILTIN_OP_IMPORTER(Split) std::vector splitList; ShapeTensor sizes; ShapeTensor sizeSliceAxis; + ShapeTensor splitSizesTensor; const bool hasSplitList = (ctx->getOpsetVersion() >= 13) ? (inputs.size() == 2) : attrs.count("split"); if (hasSplitList) { @@ -4194,12 +4335,18 @@ DEFINE_BUILTIN_OP_IMPORTER(Split) // In opset >= 13, split lengths are an optional input if (ctx->getOpsetVersion() >= 13) { - ASSERT(inputs.at(1).is_weights() && "Split input 'split', if specified, must be an initializer!", ErrorCode::kUNSUPPORTED_NODE); - auto splitWeights = inputs.at(1).weights(); - int32_t* splitValues = static_cast(splitWeights.values); - for (size_t i = 0; i < splitWeights.count(); i++) + if (inputs.at(1).is_weights()) { - splitList.push_back(splitValues[i]); + const auto splitWeights = inputs.at(1).weights(); + const int32_t* splitValues = static_cast(splitWeights.values); + for (size_t i = 0; i < splitWeights.count(); i++) + { + splitList.push_back(splitValues[i]); + } + } + else + { + splitSizesTensor = {ctx, inputs.at(1)}; } } // Pre-opset 13 split lengths are provided as an attribute @@ -4207,7 +4354,9 @@ DEFINE_BUILTIN_OP_IMPORTER(Split) { splitList = attrs.get>("split"); } - ASSERT( (static_cast(splitList.size()) == numOutputs) && "The shape of the split attribute misaligns with the number of outputs.", ErrorCode::kINVALID_NODE); + ASSERT((splitList.empty() || (static_cast(splitList.size()) == numOutputs)) + && "The shape of the split attribute misaligns with the number of outputs.", + ErrorCode::kINVALID_NODE); } else { @@ -4233,7 +4382,14 @@ DEFINE_BUILTIN_OP_IMPORTER(Split) } if (hasSplitList) { - sizeSliceAxis = shapeVector(splitList[i]); + if (splitList.empty()) + { + sizeSliceAxis = gather(ctx, splitSizesTensor, ShapeTensor(1, {i})); + } + else + { + sizeSliceAxis = shapeVector(splitList[i]); + } sizes = interlace(ctx, inputDims, sizeSliceAxis, subscripts); } @@ -4342,7 +4498,6 @@ DEFINE_BUILTIN_OP_IMPORTER(Tile) // "input : T // Input tensor of any shape." nvinfer1::ITensor& input = convertToTensor(inputs.at(0), ctx); - ASSERT((input.getType() != nvinfer1::DataType::kBOOL) && "This version of TensorRT does not support BOOL input for the Tile operator." 
, ErrorCode::kUNSUPPORTED_NODE);
     const auto inputDims = shapeOf(input);
 
     // "repeats : T1
@@ -5021,6 +5176,7 @@ DEFINE_BUILTIN_OP_IMPORTER(TRT_MaxAverageBlendPool)
     RETURN_FIRST_OUTPUT(layer);
 }
 
+#if ENABLE_STD_PLUGIN
 DEFINE_BUILTIN_OP_IMPORTER(TRT_PluginV2)
 {
     std::vector tensors;
@@ -5048,6 +5204,7 @@ DEFINE_BUILTIN_OP_IMPORTER(TRT_PluginV2)
     ctx->registerLayer(layer, getNodeName(node));
     RETURN_ALL_OUTPUTS(layer);
 }
+#endif // ENABLE_STD_PLUGIN
 
 DEFINE_BUILTIN_OP_IMPORTER(TRT_Gather)
 {
diff --git a/docs/Changelog.md b/docs/Changelog.md
index d586e09b..1fab021d 100644
--- a/docs/Changelog.md
+++ b/docs/Changelog.md
@@ -2,6 +2,23 @@
 
 # ONNX-TensorRT Changelog
 
+## TensorRT 8.2 EA Release - 2021-10-04
+### Added
+- Added support for the following ONNX operators:
+  - Einsum
+  - IsNaN
+  - GatherND
+  - Scatter
+  - ScatterElements
+  - ScatterND
+  - Sign
+  - Round
+
+### Updated
+- Updated `Gather` and `GatherElements` implementations to natively support negative indices
+- Updated `Pad` layer to support ND padding, along with `edge` and `reflect` padding mode support
+- Updated `If` layer with general performance improvements
+
 ## TensorRT 8.0 Release - 2021-07-02
 ### Added
 - Rehauled resize operator, now fully supporting the following modes:
diff --git a/docs/contributing.md b/docs/contributing.md
index 99c15119..ce78f59d 100644
--- a/docs/contributing.md
+++ b/docs/contributing.md
@@ -1,5 +1,4 @@
-
 # Contributing
 
 Contributions are always welcome to improve the onnx-tensorrt parser. For those looking to contribute, please follow the PR process as outlined in the [TensorRT Open Source Software repository](https://github.com/NVIDIA/TensorRT/blob/master/CONTRIBUTING.md).
diff --git a/docs/operators.md b/docs/operators.md
index 416264a0..5972ca37 100644
--- a/docs/operators.md
+++ b/docs/operators.md
@@ -2,7 +2,7 @@
 
 # Supported ONNX Operators
 
-TensorRT 8.0 supports operators up to Opset 13. Latest information of ONNX operators can be found [here](https://github.com/onnx/onnx/blob/master/docs/Operators.md)
+TensorRT 8.2 supports operators up to Opset 13. The latest information on ONNX operators can be found [here](https://github.com/onnx/onnx/blob/master/docs/Operators.md)
 
 TensorRT supports the following ONNX data types: DOUBLE, FLOAT32, FLOAT16, INT8, and BOOL
 
@@ -49,7 +49,7 @@ See below for the support matrix of ONNX operators in ONNX-TensorRT.
 | Div | Y | FP32, FP16, INT32 |
 | Dropout | Y | FP32, FP16 |
 | DynamicQuantizeLinear | N |
-| Einsum | N |
+| Einsum | Y | FP32, FP16 | Ellipsis and diagonal operations are not supported.
 | Elu | Y | FP32, FP16, INT8 |
 | Equal | Y | FP32, FP16, INT32 |
 | Erf | Y | FP32, FP16 |
@@ -58,9 +58,9 @@ See below for the support matrix of ONNX operators in ONNX-TensorRT.
 | EyeLike | Y | FP32, FP16, INT32, BOOL |
 | Flatten | Y | FP32, FP16, INT32, BOOL |
 | Floor | Y | FP32, FP16 |
-| Gather | Y | FP32, FP16, INT32, BOOL | Only positive indices (>=0) are supported
Compile with `-DSUPPORT_NEGATIVE_GATHER=1` to enable support for negative indices -| GatherElements | Y | FP32, FP16, INT32, BOOL | Only positive indices (>=0) are supported
Compile with `-DSUPPORT_NEGATIVE_GATHER=1` to enable support for negative indices -| GatherND | N | +| Gather | Y | FP32, FP16, INT8, INT32 | +| GatherElements | Y | FP32, FP16, INT8, INT32 | +| GatherND | Y | FP32, FP16, INT8, INT32 | | Gemm | Y | FP32, FP16, INT8 | | GlobalAveragePool | Y | FP32, FP16, INT8 | | GlobalLpPool | Y | FP32, FP16, INT8 | @@ -75,7 +75,7 @@ See below for the support matrix of ONNX operators in ONNX-TensorRT. | ImageScaler | Y | FP32, FP16 | | InstanceNormalization | Y | FP32, FP16 | Scales `scale` and biases `B` must be initializers. Input rank must be >=3 & <=5 | | IsInf | N | -| IsNaN | N | +| IsNaN | Y | FP32, FP16, INT32 | | LeakyRelu | Y | FP32, FP16, INT8 | | Less | Y | FP32, FP16, INT32 | | LessOrEqual | Y | FP32, FP16, INT32 | @@ -100,12 +100,12 @@ See below for the support matrix of ONNX operators in ONNX-TensorRT. | Multinomial | N | | Neg | Y | FP32, FP16, INT32 | | NegativeLogLikelihoodLoss | N | -| NonMaxSuppression | Y [EXPERIMENTAL] | FP32, FP16 | Inputs `max_output_boxes_per_class`, `iou_threshold`, and `score_threshold` must be initializers.
Output has fixed shape and is padded to [`max_output_boxes_per_class`, 3]. +| NonMaxSuppression | Y [EXPERIMENTAL] | FP32, FP16 | Inputs `max_output_boxes_per_class`, `iou_threshold`, and `score_threshold` must be initializers. Output has fixed shape and is padded to [`max_output_boxes_per_class`, 3]. | NonZero | N | | Not | Y | BOOL | | OneHot | N | | Or | Y | BOOL | -| Pad | Y | FP32, FP16, INT8, INT32 | Zero-constant padding on two dimensions only | +| Pad | Y | FP32, FP16, INT8, INT32 | | ParametricSoftplus | Y | FP32, FP16, INT8 | | Pow | Y | FP32, FP16 | | PRelu | Y | FP32, FP16, INT8 | @@ -134,12 +134,12 @@ See below for the support matrix of ONNX operators in ONNX-TensorRT. | ReverseSequence | Y | FP32, FP16 | Dynamic input shapes are unsupported | RNN | Y | FP32, FP16 | For bidirectional RNNs, activation functions must be the same for both the forward and reverse pass | RoiAlign | N | -| Round | N | +| Round | Y | FP32, FP16, INT8 | | ScaledTanh | Y | FP32, FP16, INT8 | | Scan | Y | FP32, FP16 | -| Scatter | N | -| ScatterElements | N | -| ScatterND | N | +| Scatter | Y | FP32, FP16, INT8, INT32 | +| ScatterElements | Y | FP32, FP16, INT8, INT32 | +| ScatterND | Y | FP32, FP16, INT8, INT32 | | Selu | Y | FP32, FP16, INT8| | SequenceAt | N | | SequenceConstruct | N | @@ -150,7 +150,7 @@ See below for the support matrix of ONNX operators in ONNX-TensorRT. | Shape | Y | FP32, FP16, INT32, INT8, BOOL | | Shrink | N | | Sigmoid | Y | FP32, FP16, INT8 | -| Sign | N | +| Sign | Y | FP32, FP16, INT8, INT32 | | Sin | Y | FP32, FP16 | | Sinh | Y | FP32, FP16 | | Size | Y | FP32, FP16, INT32, INT8, BOOL | diff --git a/onnx2trt.hpp b/onnx2trt.hpp index 680ef900..4ee38e04 100644 --- a/onnx2trt.hpp +++ b/onnx2trt.hpp @@ -54,11 +54,10 @@ class IImporterContext virtual nvinfer1::ILogger& logger() = 0; virtual bool hasError() const = 0; virtual nvinfer1::IErrorRecorder* getErrorRecorder() const = 0; + virtual nvinfer1::IConstantLayer* getConstantLayer(const char* name) const = 0; protected: - virtual ~IImporterContext() - { - } + virtual ~IImporterContext() {} }; } // namespace onnx2trt diff --git a/onnx2trt_utils.cpp b/onnx2trt_utils.cpp index cc5ca367..cf50bb9f 100644 --- a/onnx2trt_utils.cpp +++ b/onnx2trt_utils.cpp @@ -3,6 +3,7 @@ */ #include "onnx2trt_utils.hpp" +#include "NvInferSafeRuntime.h" #include "OnnxAttrs.hpp" #include @@ -330,7 +331,8 @@ Status convertAxis(int& axis, int nbDims) { axis += nbDims; } - ASSERT((axis >= 0 && axis < nbDims) && "Axis must be in the range [0, nbDims).", ErrorCode::kUNSUPPORTED_NODE); + // Support nbDims as a valid axis for QuantDequantLinearHelper + ASSERT((axis >= 0 && axis <= nbDims) && "Axis must be in the range [0, nbDims].", ErrorCode::kUNSUPPORTED_NODE); return Status::success(); } @@ -393,112 +395,43 @@ int32_t* convertINT64(const int64_t* weightValues, nvinfer1::Dims shape, IImport return int32Weights; } -nvinfer1::ITensor* convertGatherIndices(IImporterContext* ctx, nvinfer1::ITensor* data, nvinfer1::ITensor* indices, int32_t axis) +bool convertOnnxPadding(IImporterContext* ctx, int32_t nbInputDims, const std::vector& onnxPadding, + nvinfer1::ITensor*& startTensor, nvinfer1::ITensor*& totalPaddingTensor) { - const int32_t n = indices->getDimensions().nbDims; - auto axisLength = getAxisLength(ctx, data, axis); - broadcastTensor(ctx, axisLength, n); - - // The formula here implements "indices < 0 ? indices + axisLength : indices" - // via the formula "indices - axisLength * max(-1, min(0, indices))". 
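// A quick numeric check of the formula above (hypothetical values, for illustration only): with axisLength = 5,
//     indices = -2:  min(0, -2) = -2,  max(-1, -2) = -1,  -2 - 5 * (-1) = 3   (the negative index wraps to 3)
//     indices =  2:  min(0,  2) =  0,  max(-1,  0) =  0,   2 - 5 * 0    = 2   (non-negative indices pass through unchanged)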
- // Think of the "max(-1, min(0, indices))" as extracting the sign bit from the indices. - const nvinfer1::Dims d = makeDims(n, 1); - auto zero = addConstantScalar(ctx, 0, ::ONNX_NAMESPACE::TensorProto::INT32, d)->getOutput(0); - auto minusOne = addConstantScalar(ctx, -1, ::ONNX_NAMESPACE::TensorProto::INT32, d)->getOutput(0); - auto min = ctx->network()->addElementWise(*zero, *indices, nvinfer1::ElementWiseOperation::kMIN)->getOutput(0); - auto max = ctx->network()->addElementWise(*minusOne, *min, nvinfer1::ElementWiseOperation::kMAX)->getOutput(0); - auto prod = ctx->network()->addElementWise(*max, *axisLength, nvinfer1::ElementWiseOperation::kPROD)->getOutput(0); - auto sub = ctx->network()->addElementWise(*indices, *prod, nvinfer1::ElementWiseOperation::kSUB)->getOutput(0); - return sub; -} - -bool convertOnnxPadding(std::vector& onnxPadding, nvinfer1::Dims2& begPadding, nvinfer1::Dims2& endPadding, - nvinfer1::Permutation& firstPerm, nvinfer1::Permutation& secondPerm) -{ - // Input tensor may have been unsqueezed to 4D. Insert no-op pads for all unsqueezed dimensions - const size_t minimumSize = 8; - while (onnxPadding.size() < minimumSize) - { - onnxPadding.insert(onnxPadding.begin() + onnxPadding.size() / 2, 0); - onnxPadding.insert(onnxPadding.begin(), 0); - } - - const auto size = onnxPadding.size(); - const auto half = size / 2; - std::set pads; - for (size_t i = 0; i < onnxPadding.size(); i++) - { - if (onnxPadding[i] != 0) - { - pads.emplace(i); - } - } - // For all present paddings, ensure that their corresponding beg/end index is also present - for (const auto& pad : pads) - { - if (pad < half) - { - pads.emplace(pad + half); - } - else - { - pads.emplace(pad - half); - } - } - // For no-op paddings, simply return - if (pads.size() == 0) + std::vector start; + std::vector totalPadding; + if (onnxPadding.size() % 2U != 0) { - return true; - } - // For 1D padding, set the second dimension to either the last or second last dimension - if (pads.size() == 2) - { - if (pads.find(half - 1) == pads.end()) - { - pads.emplace(size - 1); - pads.emplace(half - 1); - } - else - { - pads.emplace(size - 2); - pads.emplace(half - 2); - } + return false; } - // Fail on > 2D padding. - if (pads.size() > 4) + const auto diff = nbInputDims - static_cast(onnxPadding.size() / 2U); + if (diff < 0) { return false; } + start.resize(nbInputDims, 0); + totalPadding.resize(nbInputDims, 0); - // Pads should now be populated with 4 indices. 
Set beg and end padding values - std::vector finalIndices(pads.begin(), pads.end()); - begPadding.d[0] = onnxPadding[finalIndices[0]]; - begPadding.d[1] = onnxPadding[finalIndices[1]]; - endPadding.d[0] = onnxPadding[finalIndices[2]]; - endPadding.d[1] = onnxPadding[finalIndices[3]]; - - // For the first permutation, swap the last two dimensions with the first two indices - std::swap(firstPerm.order[half - 1], firstPerm.order[finalIndices[1]]); - std::swap(firstPerm.order[half - 2], firstPerm.order[finalIndices[0]]); - - // For the second (reverse) permutation - it is a mapping of the original in-order indices from the first - // permutation - secondPerm = firstPerm; - for (size_t i = 0; i < half; i++) + for (int32_t i = diff; i < nbInputDims; i++) { - if (secondPerm.order[i] != static_cast(i)) + const auto idx = i - diff; + const auto pre = onnxPadding[idx]; + const auto post = onnxPadding[onnxPadding.size() / 2U + idx]; + if (pre < 0 || post < 0) { - for (size_t j = 0; j < half; j++) - { - if (firstPerm.order[j] == static_cast(i)) - { - secondPerm.order[i] = static_cast(j); - continue; - } - } + return false; } + + start[i] = -pre; + totalPadding[i] = pre + post; } - return true; + + startTensor + = addConstant(ctx, start, ::ONNX_NAMESPACE::TensorProto::INT32, nvinfer1::Dims{1, {nbInputDims}})->getOutput(0); + totalPaddingTensor + = addConstant(ctx, totalPadding, ::ONNX_NAMESPACE::TensorProto::INT32, nvinfer1::Dims{1, {nbInputDims}}) + ->getOutput(0); + return startTensor && totalPaddingTensor; } bool shiftIsAllZeros(const ShapedWeights& shiftInt8) @@ -527,17 +460,9 @@ onnx2trt::ShapedWeights createZeroShifts(const onnx2trt::ShapedWeights& shiftInt nvinfer1::ITensor* createZeroTensor(IImporterContext* ctx, nvinfer1::ITensor* data) { - nvinfer1::ITensor* zero; - if (data->getType() == nvinfer1::DataType::kFLOAT) - { - zero - = addConstant(ctx, std::vector{0.f}, ::ONNX_NAMESPACE::TensorProto::FLOAT, {0, {1}})->getOutput(0); - } - else - { - zero - = addConstant(ctx, std::vector{0}, ::ONNX_NAMESPACE::TensorProto::INT32, {0, {1}})->getOutput(0); - } + nvinfer1::ITensor* zero + = addConstant(ctx, std::vector{0.f}, ::ONNX_NAMESPACE::TensorProto::FLOAT, {0, {1}})->getOutput(0); + zero = castHelper(ctx, zero, data->getType()); broadcastTensors(ctx, zero, data); zero = ctx->network()->addElementWise(*data, *zero, nvinfer1::ElementWiseOperation::kPROD)->getOutput(0); return zero; @@ -851,6 +776,12 @@ nvinfer1::ITensor& convertToTensor(TensorOrWeights& input, IImporterContext* ctx } // Handle non-tensor indices input by adding a new constant layer to the network. ShapedWeights& weights = input.weights(); + + auto const existingConstantLayer = ctx->getConstantLayer(weights.getName()); + if (existingConstantLayer != nullptr) + { + return *(existingConstantLayer->getOutput(0)); + } // Note the TRT doesn't natively handle boolean weights. First create an INT32 weights copy of the boolean weights, // then cast it back to bool within TRT. 
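// (Sketch of that flow with hypothetical values: a BOOL initializer {true, false, true} is first materialized
// as an INT32 constant {1, 0, 1} and then cast back to nvinfer1::DataType::kBOOL inside the network, since
// TensorRT does not natively handle boolean weights.)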
if (weights.type == ::ONNX_NAMESPACE::TensorProto::BOOL) @@ -939,7 +870,7 @@ bool elementwiseCheck(const std::vector& inputs, const nvinfer1 } NodeImportResult elementwiseHelper(IImporterContext* ctx, ::ONNX_NAMESPACE::NodeProto const& node, - std::vector& inputs, nvinfer1::ElementWiseOperation binary_op) + const std::vector& inputs, nvinfer1::ElementWiseOperation binary_op) { ASSERT((!inputs.empty()) && "Inputs vector is empty.", ErrorCode::kINVALID_NODE); @@ -1243,7 +1174,21 @@ nvinfer1::ITensor* globalPoolingHelper(IImporterContext* ctx, ::ONNX_NAMESPACE:: nvinfer1::IPluginCreator* importPluginCreator( const std::string& pluginName, const std::string& pluginVersion, const std::string& pluginNamespace) { - return getPluginRegistry()->getPluginCreator(pluginName.c_str(), pluginVersion.c_str(), pluginNamespace.c_str()); + nvinfer1::IPluginCreator* creator = nullptr; + +#if ENABLE_STD_PLUGIN + creator = getPluginRegistry()->getPluginCreator(pluginName.c_str(), pluginVersion.c_str(), pluginNamespace.c_str()); +#endif // ENABLE_STD_PLUGIN + +#if ENABLE_SAFE_PLUGIN + if (creator == nullptr && nvinfer1::safe::getSafePluginRegistry() != nullptr) + { + creator = nvinfer1::safe::getSafePluginRegistry()->getPluginCreator( + pluginName.c_str(), pluginVersion.c_str(), pluginNamespace.c_str()); + } +#endif // ENABLE_SAFE_PLUGIN + + return creator; } std::unique_ptr createPlugin(const std::string& name, @@ -1897,19 +1842,25 @@ bool supportsShapeTensor(nvinfer1::LayerType type, nvinfer1::ElementWiseOperatio { // Layers that allow shape tensor output case nvinfer1::LayerType::kCONCATENATION: + case nvinfer1::LayerType::kCONDITION: + case nvinfer1::LayerType::kCONDITIONAL_INPUT: + case nvinfer1::LayerType::kCONDITIONAL_OUTPUT: case nvinfer1::LayerType::kCONSTANT: case nvinfer1::LayerType::kGATHER: case nvinfer1::LayerType::kIDENTITY: case nvinfer1::LayerType::kPADDING: + case nvinfer1::LayerType::kSCATTER: case nvinfer1::LayerType::kSELECT: case nvinfer1::LayerType::kSHAPE: case nvinfer1::LayerType::kSHUFFLE: case nvinfer1::LayerType::kSLICE: return true; // Layers that do not allow shape tensor output case nvinfer1::LayerType::kACTIVATION: + case nvinfer1::LayerType::kASSERTION: case nvinfer1::LayerType::kCONVOLUTION: case nvinfer1::LayerType::kDECONVOLUTION: case nvinfer1::LayerType::kDEQUANTIZE: + case nvinfer1::LayerType::kEINSUM: case nvinfer1::LayerType::kFULLY_CONNECTED: case nvinfer1::LayerType::kITERATOR: case nvinfer1::LayerType::kLOOP_OUTPUT: @@ -2043,19 +1994,21 @@ NodeImportResult unaryHelper( IImporterContext* ctx, const ::ONNX_NAMESPACE::NodeProto& node, TensorOrWeights& input, nvinfer1::UnaryOperation op) { nvinfer1::ITensor* tensorPtr = &convertToTensor(input, ctx); - auto inputType = tensorPtr->getType(); + const auto rank = tensorPtr->getDimensions().nbDims; + const auto inputType = tensorPtr->getType(); + bool validUnaryType = true; switch (op) { case nvinfer1::UnaryOperation::kNOT: { // TRT only supports BOOL types for the NOT operation - validUnaryType = (inputType == nvinfer1::DataType::kBOOL); + validUnaryType = inputType == nvinfer1::DataType::kBOOL; break; } case nvinfer1::UnaryOperation::kABS: { - // ABS can work with INT32 types via temporary cast to FLOAT. + // WAR: Special operators like ABS can work with INT32 types via temporary cast to FLOAT. 
if (inputType == nvinfer1::DataType::kINT32) { tensorPtr = castHelper(ctx, tensorPtr, nvinfer1::DataType::kFLOAT); @@ -2064,7 +2017,7 @@ NodeImportResult unaryHelper( } case nvinfer1::UnaryOperation::kNEG: { - // NEG can work with INT32 types via ElementWise Layer: (0 - x) + // WAR: NEG can work with INT32 types via ElementWise Layer: (0 - x) if (inputType == nvinfer1::DataType::kINT32) { // Calculate the rank of the input, and set all size to one and rely on broadcasting @@ -2081,11 +2034,11 @@ NodeImportResult unaryHelper( validUnaryType = (inputType != nvinfer1::DataType::kBOOL && inputType != nvinfer1::DataType::kINT32); } } + ASSERT(validUnaryType && "This version of TensorRT does not support the given operator with the given input data type.", ErrorCode::kUNSUPPORTED_NODE); - int rank = tensorPtr->getDimensions().nbDims; // Support scalar inputs by unsqueezing to 1D if (rank == 0) { @@ -2121,126 +2074,103 @@ NodeImportResult unaryHelper( return {{tensorPtr}}; } -NodeImportResult convDeconvMultiInput( - IImporterContext* ctx, const ::ONNX_NAMESPACE::NodeProto& node, std::vector& inputs, bool isConv) +NodeImportResult convMultiInput( + IImporterContext* ctx, const ::ONNX_NAMESPACE::NodeProto& node, std::vector& inputs) { - nvinfer1::ITensor* inputTensor = &convertToTensor(inputs.at(0), ctx); - nvinfer1::ITensor* weightsTensor = &convertToTensor(inputs.at(1), ctx); + nvinfer1::ITensor* input_tensor_ptr = &convertToTensor(inputs.at(0), ctx); + nvinfer1::ITensor* kernel_tensor_ptr = &convertToTensor(inputs.at(1), ctx); - nvinfer1::Dims inputDims = inputTensor->getDimensions(); - nvinfer1::Dims weightsDims = weightsTensor->getDimensions(); - const std::string layerType = isConv ? "Convolution " : "Deconvolution"; - LOG_VERBOSE(layerType << " input dimensions: " << inputDims); - LOG_VERBOSE(layerType << " kernel dimensions: " << weightsDims); + nvinfer1::Dims dims = input_tensor_ptr->getDimensions(); + LOG_VERBOSE("Convolution input dimensions: " << dims); - bool needToExpandDims = (inputDims.nbDims == 3); + bool needToExpandDims = (dims.nbDims == 3); if (needToExpandDims) { // Expand spatial dims from 1D to 2D - const std::vector axes{3}; - inputTensor = unsqueezeTensor(ctx, node, *inputTensor, axes); - ASSERT(inputTensor && "Failed to unsqueeze tensor.", ErrorCode::kUNSUPPORTED_NODE); - inputDims = inputTensor->getDimensions(); + const std::vector axes{3}; + input_tensor_ptr = unsqueezeTensor(ctx, node, *input_tensor_ptr, axes); + ASSERT(input_tensor_ptr && "Failed to unsqueeze tensor.", ErrorCode::kUNSUPPORTED_NODE); + dims = input_tensor_ptr->getDimensions(); } - if (weightsDims.nbDims == 3) + if (kernel_tensor_ptr->getDimensions().nbDims == 3) { // Expand spatial dims from 1D to 2D - const std::vector axes{3}; - weightsTensor = unsqueezeTensor(ctx, node, *weightsTensor, axes); - ASSERT(weightsTensor && "Failed to unsqueeze tensor.", ErrorCode::kUNSUPPORTED_NODE); - weightsDims = weightsTensor->getDimensions(); + const std::vector axes{3}; + kernel_tensor_ptr = unsqueezeTensor(ctx, node, *kernel_tensor_ptr, axes); + ASSERT(kernel_tensor_ptr && "Failed to unsqueeze tensor.", ErrorCode::kUNSUPPORTED_NODE); } + nvinfer1::Dims kernel_size = inputs.at(1).shape(); - auto kernelWeights = nvinfer1::Weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; - auto biasWeights = nvinfer1::Weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; + auto kernel_weights = nvinfer1::Weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; + auto bias_weights = nvinfer1::Weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; - const 
int32_t nbSpatialDims = inputDims.nbDims - 2; + nvinfer1::Dims input_dims = input_tensor_ptr->getDimensions(); + const int nbSpatialDims = input_dims.nbDims - 2; // Check that the number of spatial dimensions and the kernel shape matches up. - ASSERT((nbSpatialDims == weightsDims.nbDims - 2) + ASSERT((nbSpatialDims == kernel_tensor_ptr->getDimensions().nbDims - 2) && "The input tensor shape misaligns with the input kernel shape.", ErrorCode::kUNSUPPORTED_NODE); - nvinfer1::Dims filterDims; - filterDims.nbDims = nbSpatialDims; + nvinfer1::Dims filter_dim; + filter_dim.nbDims = nbSpatialDims; nvinfer1::Dims strides = makeDims(nbSpatialDims, 1); - nvinfer1::Dims begPadding = makeDims(nbSpatialDims, 0); - nvinfer1::Dims endPadding = makeDims(nbSpatialDims, 0); + nvinfer1::Dims beg_padding = makeDims(nbSpatialDims, 0); + nvinfer1::Dims end_padding = makeDims(nbSpatialDims, 0); nvinfer1::Dims dilations = makeDims(nbSpatialDims, 1); nvinfer1::PaddingMode paddingMode; bool exclude_padding; getKernelParams( - ctx, node, &filterDims, &strides, &begPadding, &endPadding, paddingMode, exclude_padding, &dilations); + ctx, node, &filter_dim, &strides, &beg_padding, &end_padding, paddingMode, exclude_padding, &dilations); - for (int32_t i = 1; i <= nbSpatialDims; ++i) + for (int i = 1; i <= nbSpatialDims; ++i) { - ASSERT((filterDims.d[nbSpatialDims - i] - == weightsDims.d[weightsDims.nbDims - i]) + ASSERT((filter_dim.d[nbSpatialDims - i] + == kernel_tensor_ptr->getDimensions().d[kernel_tensor_ptr->getDimensions().nbDims - i]) && "The attribute kernel_shape misalgins with the shape of the input kernel.", ErrorCode::kUNSUPPORTED_NODE); } - OnnxAttrs attrs(node, ctx); - int32_t numGroups = attrs.get("group", 1); - int32_t nChannel = inputDims.d[1]; - // Conv weights shape is provided as [M,C/G,H1,H2], while deconv weights shape is provied as [C,M/G,H1,H2] - int32_t M = isConv ? weightsDims.d[0] : weightsDims.d[1] * numGroups; - int32_t C = isConv ? weightsDims.d[1] * numGroups : weightsDims.d[0]; + int nChannel = input_dims.d[1]; + int K = kernel_size.d[0]; + int C = kernel_size.d[1]; if (inputs.size() == 3) { // TRT-9875 - fix how bias tensor is handled - biasWeights = inputs.at(2).weights(); + bias_weights = inputs.at(2).weights(); } - ASSERT((nChannel == -1 || C == nChannel) + OnnxAttrs attrs(node, ctx); + int ngroup = attrs.get("group", 1); + ASSERT((nChannel == -1 || C * ngroup == nChannel) && "The attribute group and the kernel shape misalign with the channel size of the input tensor. 
", ErrorCode::kINVALID_NODE); - nvinfer1::ILayer* layer = nullptr; - if (isConv) - { - nvinfer1::IConvolutionLayer* convLayer - = ctx->network()->addConvolutionNd(*inputTensor, M, filterDims, kernelWeights, biasWeights); - layer = convLayer; - ASSERT(convLayer && "Failed to add the Convolution layer.", ErrorCode::kUNSUPPORTED_NODE); - - convLayer->setStrideNd(strides); - convLayer->setPaddingMode(paddingMode); - convLayer->setPrePadding(begPadding); - convLayer->setPostPadding(endPadding); - convLayer->setDilationNd(dilations); - convLayer->setNbGroups(numGroups); - } - else - { - nvinfer1::IDeconvolutionLayer* deconvLayer - = ctx->network()->addDeconvolutionNd(*inputTensor, M, filterDims, kernelWeights, biasWeights); - layer = deconvLayer; - ASSERT(deconvLayer && "Failed to add the Deconvolution layer.", ErrorCode::kUNSUPPORTED_NODE); - - deconvLayer->setStrideNd(strides); - deconvLayer->setPaddingMode(paddingMode); - deconvLayer->setPrePadding(begPadding); - deconvLayer->setPostPadding(endPadding); - deconvLayer->setDilationNd(dilations); - deconvLayer->setNbGroups(numGroups); - } + nvinfer1::IConvolutionLayer* layer + = ctx->network()->addConvolutionNd(*input_tensor_ptr, K, filter_dim, kernel_weights, bias_weights); + ASSERT(layer && "Failed to add the Convolution layer.", ErrorCode::kUNSUPPORTED_NODE); + layer->setStrideNd(strides); + layer->setPaddingMode(paddingMode); + layer->setPrePadding(beg_padding); + layer->setPostPadding(end_padding); + layer->setDilationNd(dilations); + layer->setNbGroups(ngroup); // Set kernel weights tensor as second convolution input. - layer->setInput(1, *weightsTensor); + layer->setInput(1, *kernel_tensor_ptr); ctx->registerLayer(layer, getNodeName(node)); - nvinfer1::ITensor* outputTensor = layer->getOutput(0); + nvinfer1::ITensor* output_tensor_ptr = layer->getOutput(0); if (needToExpandDims) { // Un-expand spatial dims back to 1D - const std::vector axes{3}; - outputTensor = squeezeTensor(ctx, node, *outputTensor, axes); - ASSERT(outputTensor && "Failed to unsqueeze tensor.", ErrorCode::kUNSUPPORTED_NODE); + const std::vector axes{3}; + output_tensor_ptr = squeezeTensor(ctx, node, *output_tensor_ptr, axes); + ASSERT(output_tensor_ptr && "Failed to unsqueeze tensor.", ErrorCode::kUNSUPPORTED_NODE); } - return {{outputTensor}}; + return {{output_tensor_ptr}}; } nvinfer1::ITensor* unsqueezeTensor(IImporterContext* ctx, const ::ONNX_NAMESPACE::NodeProto& node, @@ -2384,4 +2314,13 @@ nvinfer1::ITensor* addSoftmax(IImporterContext* ctx, const ::ONNX_NAMESPACE::Nod return softMax->getOutput(0); } +nvinfer1::IScatterLayer* addScatterLayer( + IImporterContext* ctx, std::vector& inputs, nvinfer1::ScatterMode mode) +{ + nvinfer1::ITensor& data = convertToTensor(inputs.at(0), ctx); + nvinfer1::ITensor& indices = convertToTensor(inputs.at(1), ctx); + nvinfer1::ITensor& updates = convertToTensor(inputs.at(2), ctx); + return ctx->network()->addScatter(data, indices, updates, mode); +} + } // namespace onnx2trt diff --git a/onnx2trt_utils.hpp b/onnx2trt_utils.hpp index 3156bbe8..db2c0cf6 100644 --- a/onnx2trt_utils.hpp +++ b/onnx2trt_utils.hpp @@ -170,11 +170,9 @@ bool convertDtype(int32_t onnx_dtype, nvinfer1::DataType* trt_dtype); // Helper function to convert INT64 weight values into INT32 int32_t* convertINT64(const int64_t* weightValues, nvinfer1::Dims shape, IImporterContext* ctx); -// Helper function to convert negative gather indices into non-negative indices. 
-nvinfer1::ITensor* convertGatherIndices(IImporterContext* ctx, nvinfer1::ITensor* data, nvinfer1::ITensor* indices, int32_t axis); - -// Helper function to convert ONNX padding into TRT padding. Will update begPadding, endPadding, firstPerm, and secondPerm by reference -bool convertOnnxPadding(std::vector& onnxPadding, nvinfer1::Dims2& begPadding, nvinfer1::Dims2& endPadding, nvinfer1::Permutation& firstPerm, nvinfer1::Permutation& secondPerm); +// Helper function to convert ONNX padding into TRT padding. Will update startTensor and totalPaddingTensor by reference +bool convertOnnxPadding(IImporterContext* ctx, int32_t nbInputDims, const std::vector& onnxPadding, + nvinfer1::ITensor*& startTensor, nvinfer1::ITensor*& totalPaddingTensor); // Helper function to check if all of the values in the shift tensor are zeros bool shiftIsAllZeros(const ShapedWeights& shiftInt8); @@ -189,9 +187,9 @@ nvinfer1::ITensor* createZeroTensor(IImporterContext* ctx, nvinfer1::ITensor* da bool convertOnnxWeights( const ::ONNX_NAMESPACE::TensorProto& onnxTensor, onnx2trt::ShapedWeights* weights, IImporterContext* ctx); -// Helper function to convert multi input convolution/deconvolution -NodeImportResult convDeconvMultiInput( - IImporterContext* ctx, const ::ONNX_NAMESPACE::NodeProto& node, std::vector& inputs, bool isConv); +// Helper function to convert multi input convolution +NodeImportResult convMultiInput( + IImporterContext* ctx, const ::ONNX_NAMESPACE::NodeProto& node, std::vector& inputs); // Helper function to convert a 1D tensor into a scalar nvinfer1::ITensor* convertToScalar(IImporterContext* ctx, nvinfer1::ITensor* inpTensor); @@ -210,7 +208,7 @@ bool elementwiseCheck(const std::vector& inputs, const nvinfer1 // Helper function to import an ONNX elementwise op into TRT NodeImportResult elementwiseHelper(IImporterContext* ctx, ::ONNX_NAMESPACE::NodeProto const& node, - std::vector& inputs, nvinfer1::ElementWiseOperation binary_op); + const std::vector& inputs, nvinfer1::ElementWiseOperation binary_op); // Helper function to flatten a tensor on a given axis nvinfer1::ITensor* flattenTensor(IImporterContext* ctx, ::ONNX_NAMESPACE::NodeProto const& node, nvinfer1::ITensor& tensor, int axis = 0, bool regLayer = false); @@ -374,4 +372,8 @@ ShapeTensor axesToInterlaceSubscripts(const ShapeTensor& axes, int nbDims); //! Helper function to add SoftMax layer. nvinfer1::ITensor* addSoftmax(IImporterContext* ctx, const ::ONNX_NAMESPACE::NodeProto& node, nvinfer1::ITensor& input); +// Helper function to import ONNX scatter nodes into TRT +nvinfer1::IScatterLayer* addScatterLayer( + IImporterContext* ctx, std::vector& inputs, nvinfer1::ScatterMode mode); + } // namespace onnx2trt diff --git a/onnx_tensorrt/__init__.py b/onnx_tensorrt/__init__.py index ccd23709..2a3f701a 100644 --- a/onnx_tensorrt/__init__.py +++ b/onnx_tensorrt/__init__.py @@ -4,4 +4,4 @@ from . import backend -__version__ = "8.0.1" +__version__ = "8.2.0" diff --git a/onnx_utils.hpp b/onnx_utils.hpp index 3e149b69..c2c147bb 100644 --- a/onnx_utils.hpp +++ b/onnx_utils.hpp @@ -15,8 +15,30 @@ namespace { +//! Describes occurrence of a named dimension. +class NamedDimension +{ +public: + //! TensorRT tensor. + nvinfer1::ITensor* tensor; + + //! Index of tensor dimension to be named. + int32_t index; + + //! ONNX "dim param" that is the name of the dimension. + std::string dimParam; + + //! Construct a NamedDimension where the tensor will be filled in later. 
+ NamedDimension(int32_t index_, const std::string& dimParam_) + : tensor(nullptr) + , index(index_) + , dimParam(dimParam_) + { + } +}; + template -bool convertOnnxDims(OnnxDims const& onnxDims, nvinfer1::Dims& trtDims) +bool convertOnnxDims(OnnxDims const& onnxDims, nvinfer1::Dims& trtDims, std::vector& namedDims) { std::vector onnxDimsVec; for (const auto& onnxDim : onnxDims) @@ -28,6 +50,10 @@ bool convertOnnxDims(OnnxDims const& onnxDims, nvinfer1::Dims& trtDims) } else { + if (!onnxDim.dim_param().empty()) + { + namedDims.emplace_back(static_cast(onnxDimsVec.size()), onnxDim.dim_param()); + } const int32_t dim = onnxDim.dim_param() == "" ? (onnxDim.dim_value() >= 0 ? onnxDim.dim_value() : -1) : -1; onnxDimsVec.emplace_back(dim); }
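// Illustration (hypothetical ONNX value_info, for exposition only) of what the added bookkeeping records:
// for dims
//     dim { dim_param: "batch" }   dim { dim_value: 3 }   dim { dim_param: "seq" }
// onnxDimsVec ends up as {-1, 3, -1} and namedDims receives entries for "batch" at index 0 and "seq" at
// index 2, presumably so the importer can later attach each name to the corresponding network tensor
// dimension and treat equally named dynamic dimensions consistently.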