diff --git a/include/onnxruntime/core/framework/int4.h b/include/onnxruntime/core/framework/int4.h
index 228c1e4e872de..aff365dc9738f 100644
--- a/include/onnxruntime/core/framework/int4.h
+++ b/include/onnxruntime/core/framework/int4.h
@@ -84,11 +84,21 @@ struct Int4x2Base {
     return (num_int4_elems + 1) / 2;
   }
 
+  /// <summary>
+  /// Copy a source buffer of 4-bit elements (packed) into a destination buffer of 8-bit elements (unpacked).
+  /// </summary>
+  /// <param name="dst">Destination buffer to store unpacked 8-bit elements</param>
+  /// <param name="src">Source buffer with 4-bit elements</param>
+  /// <returns>True on success</returns>
   static bool Unpack(gsl::span<UnpackedType> dst, gsl::span<const Int4x2Base<Signed>> src) {
     if (CalcNumInt4Pairs(dst.size()) != src.size()) {
       return false;
     }
 
+    if (src.empty()) {
+      return true;
+    }
+
     for (size_t i = 0; i < dst.size(); i++) {
       size_t r = i >> 1;   // i / 2;
       size_t c = i & 0x1;  // i % 2;
@@ -98,11 +108,21 @@ struct Int4x2Base {
     return true;
   }
 
+  /// <summary>
+  /// Copy a source buffer of 8-bit elements (unpacked) into a destination buffer of 4-bit elements (packed).
+  /// </summary>
+  /// <param name="dst">Destination buffer to store packed 4-bit elements</param>
+  /// <param name="src">Source buffer with 8-bit elements</param>
+  /// <returns>True on success</returns>
   static bool Pack(gsl::span<Int4x2Base<Signed>> dst, gsl::span<const UnpackedType> src) {
-    if (src.empty() || (CalcNumInt4Pairs(src.size()) != dst.size())) {
+    if (CalcNumInt4Pairs(src.size()) != dst.size()) {
       return false;
     }
 
+    if (src.empty()) {
+      return true;
+    }
+
     size_t src_i = 0;
     size_t dst_i = 0;
 
@@ -116,6 +136,20 @@ struct Int4x2Base {
 
     return true;
   }
+
+  /// <summary>
+  /// Returns hierarchical indices for a packed int4 element from the given element index.
+  ///
+  /// Usage:
+  ///   Int4x2* data = ...;
+  ///   auto indices = GetTensorElemIndices(3);  // 4th int4 element
+  ///   int8_t elem = data[indices.first].GetElem(indices.second);
+  /// </summary>
+  /// <param name="index">Index of 4-bit element</param>
+  /// <returns>Unpacked element</returns>
+  static inline std::pair<size_t, size_t> GetTensorElemIndices(size_t index) {
+    return {index >> 1, index & 0x1};
+  }
 };
 
 using Int4x2 = Int4x2Base<true>;
diff --git a/onnxruntime/core/framework/print_tensor_statistics_utils.h b/onnxruntime/core/framework/print_tensor_statistics_utils.h
index fd036114f3e76..65360674e88d0 100644
--- a/onnxruntime/core/framework/print_tensor_statistics_utils.h
+++ b/onnxruntime/core/framework/print_tensor_statistics_utils.h
@@ -79,6 +79,33 @@ void PrintCommonStats(const T* data, size_t count) {
   PrintValue(max);
 }
 
+#define DEF_PRINT_COMMON_STATS_INT4(INT4_TYPE) \
+  template <> \
+  inline void PrintCommonStats<INT4_TYPE>(const INT4_TYPE* data, size_t count) { \
+    using UnpackedType = typename INT4_TYPE::UnpackedType; \
+    UnpackedType min = data[0].GetElem(0); \
+    UnpackedType max = min; \
+    for (size_t i = 1; i < count; i++) { \
+      auto indices = INT4_TYPE::GetTensorElemIndices(i); \
+      auto value = data[indices.first].GetElem(indices.second); \
+      if (value > max) { \
+        max = value; \
+      } \
+      if (value < min) { \
+        min = value; \
+      } \
+    } \
+    \
+    std::cout << "Min="; \
+    PrintValue(min); \
+    \
+    std::cout << ",Max="; \
+    PrintValue(max); \
+  }
+
+DEF_PRINT_COMMON_STATS_INT4(Int4x2)
+DEF_PRINT_COMMON_STATS_INT4(UInt4x2)
+
 template <typename T>
 void PrintHalfStats(const T* data, size_t count) {
   float min = data[0].ToFloat();
diff --git a/onnxruntime/core/framework/print_tensor_utils.h b/onnxruntime/core/framework/print_tensor_utils.h
index 6bd4e2d3af3fd..b8c50a266b655 100644
--- a/onnxruntime/core/framework/print_tensor_utils.h
+++ b/onnxruntime/core/framework/print_tensor_utils.h
@@ -75,6 +75,29 @@ void PrintCpuTensorSnippet(const T* tensor, int64_t dim0, int64_t dim1, int64_t
   std::cout << std::endl;
 }
 
+// INT4 - Print snippet of 2D tensor with shape (dim0, dim1)
+#define DEF_PRINT_CPU_TENSOR_SNIPPET_2D_INT4(INT4_TYPE) \
+  template <> \
+  inline void PrintCpuTensorSnippet<INT4_TYPE>(const INT4_TYPE* tensor, int64_t dim0, int64_t dim1, \
+                                               int64_t edge_items) { \
+    for (int64_t i = 0; i < dim0; i++) { \
+      SKIP_NON_EDGE_ITEMS(dim0, i, edge_items); \
+      auto indices = INT4_TYPE::GetTensorElemIndices(static_cast<size_t>(i * dim1)); \
+      PrintValue(tensor[indices.first].GetElem(indices.second)); \
+      for (int64_t j = 1; j < dim1; j++) { \
+        SKIP_NON_EDGE_ITEMS_LAST_DIM(dim1, j, edge_items); \
+        std::cout << ", "; \
+        indices = INT4_TYPE::GetTensorElemIndices(static_cast<size_t>(i * dim1 + j)); \
+        PrintValue(tensor[indices.first].GetElem(indices.second)); \
+      } \
+      std::cout << std::endl; \
+    } \
+    std::cout << std::endl; \
+  }
+
+DEF_PRINT_CPU_TENSOR_SNIPPET_2D_INT4(Int4x2)
+DEF_PRINT_CPU_TENSOR_SNIPPET_2D_INT4(UInt4x2)
+
 // Print snippet of 3D tensor with shape (dim0, dim1, dim2)
 template <typename T>
 void PrintCpuTensorSnippet(const T* tensor, int64_t dim0, int64_t dim1, int64_t dim2, int64_t edge_items) {
@@ -95,6 +118,33 @@ void PrintCpuTensorSnippet(const T* tensor, int64_t dim0, int64_t dim1, int64_t
   std::cout << std::endl;
 }
 
+// INT4 - Print snippet of 3D tensor with shape (dim0, dim1, dim2)
+#define DEF_PRINT_CPU_TENSOR_SNIPPET_3D_INT4(INT4_TYPE) \
+  template <> \
+  inline void PrintCpuTensorSnippet<INT4_TYPE>(const INT4_TYPE* tensor, int64_t dim0, int64_t dim1, int64_t dim2, \
+                                               int64_t edge_items) { \
+    for (int64_t i = 0; i < dim0; i++) { \
+      SKIP_NON_EDGE_ITEMS(dim0, i, edge_items); \
+      for (int64_t j = 0; j < dim1; j++) { \
+        SKIP_NON_EDGE_ITEMS(dim1, j, edge_items); \
+        auto indices = INT4_TYPE::GetTensorElemIndices(static_cast<size_t>(i * dim1 * dim2 + j * dim2)); \
+        PrintValue(tensor[indices.first].GetElem(indices.second)); \
+        for (int64_t k = 1; k < dim2; k++) { \
+          SKIP_NON_EDGE_ITEMS_LAST_DIM(dim2, k, edge_items); \
+          std::cout << ", "; \
+          indices = INT4_TYPE::GetTensorElemIndices(static_cast<size_t>(i * dim1 * dim2 + j * dim2 + k)); \
+          PrintValue(tensor[indices.first].GetElem(indices.second)); \
+        } \
+        std::cout << std::endl; \
+      } \
+      std::cout << std::endl; \
+    } \
+    std::cout << std::endl; \
+  }
+
+DEF_PRINT_CPU_TENSOR_SNIPPET_3D_INT4(Int4x2)
+DEF_PRINT_CPU_TENSOR_SNIPPET_3D_INT4(UInt4x2)
+
 // Print 2D tensor
 template <typename T>
 void PrintCpuTensorFull(const T* tensor, int64_t dim0, int64_t dim1) {
@@ -109,6 +159,26 @@ void PrintCpuTensorFull(const T* tensor, int64_t dim0, int64_t dim1) {
   std::cout << std::endl;
 }
 
+// INT4 - Print 2D tensor
+#define DEF_PRINT_CPU_TENSOR_FULL_2D_INT4(INT4_TYPE) \
+  template <> \
+  inline void PrintCpuTensorFull<INT4_TYPE>(const INT4_TYPE* tensor, int64_t dim0, int64_t dim1) { \
+    for (int64_t i = 0; i < dim0; i++) { \
+      auto indices = INT4_TYPE::GetTensorElemIndices(static_cast<size_t>(i * dim1)); \
+      PrintValue(tensor[indices.first].GetElem(indices.second)); \
+      for (int64_t j = 1; j < dim1; j++) { \
+        std::cout << ", "; \
+        indices = INT4_TYPE::GetTensorElemIndices(static_cast<size_t>(i * dim1 + j)); \
+        PrintValue(tensor[indices.first].GetElem(indices.second)); \
+      } \
+      std::cout << std::endl; \
+    } \
+    std::cout << std::endl; \
+  }
+
+DEF_PRINT_CPU_TENSOR_FULL_2D_INT4(Int4x2)
+DEF_PRINT_CPU_TENSOR_FULL_2D_INT4(UInt4x2)
+
 // Print 3D tensor
 template <typename T>
 void PrintCpuTensorFull(const T* tensor, int64_t dim0, int64_t dim1, int64_t dim2) {
@@ -126,6 +196,29 @@ void PrintCpuTensorFull(const T* tensor, int64_t dim0, int64_t dim1, int64_t dim
   std::cout << std::endl;
 }
 
+// INT4 - Print 3D tensor
+#define DEF_PRINT_CPU_TENSOR_FULL_3D_INT4(INT4_TYPE) \
+  template <> \
+  inline void PrintCpuTensorFull<INT4_TYPE>(const INT4_TYPE* tensor, int64_t dim0, int64_t dim1, int64_t dim2) { \
+    for (int64_t i = 0; i < dim0; i++) { \
+      for (int64_t j = 0; j < dim1; j++) { \
+        auto indices = INT4_TYPE::GetTensorElemIndices(static_cast<size_t>(i * dim1 * dim2 + j * dim2)); \
+        PrintValue(tensor[indices.first].GetElem(indices.second)); \
+        for (int64_t k = 1; k < dim2; k++) { \
+          std::cout << ", "; \
+          indices = INT4_TYPE::GetTensorElemIndices(static_cast<size_t>(i * dim1 * dim2 + j * dim2 + k)); \
+          PrintValue(tensor[indices.first].GetElem(indices.second)); \
+        } \
+        std::cout << std::endl; \
+      } \
+      std::cout << std::endl; \
+    } \
+    std::cout << std::endl; \
+  }
+
+DEF_PRINT_CPU_TENSOR_FULL_3D_INT4(Int4x2)
+DEF_PRINT_CPU_TENSOR_FULL_3D_INT4(UInt4x2)
+
 template <typename T>
 void PrintCpuTensor(const Tensor& tensor, int threshold = kDefaultSnippetThreshold, int edge_items = kDefaultSnippetEdgeItems) {
   const auto& shape = tensor.Shape();
diff --git a/onnxruntime/test/debug_node_inputs_outputs/debug_node_inputs_outputs_utils_test.cc b/onnxruntime/test/debug_node_inputs_outputs/debug_node_inputs_outputs_utils_test.cc
index 88bb3d70db312..17e26a57f5f3e 100644
--- a/onnxruntime/test/debug_node_inputs_outputs/debug_node_inputs_outputs_utils_test.cc
+++ b/onnxruntime/test/debug_node_inputs_outputs/debug_node_inputs_outputs_utils_test.cc
@@ -31,6 +31,34 @@ void VerifyTensorProtoFileData(const PathString& tensor_proto_path, gsl::span<c
 
   ASSERT_EQ(gsl::span<const T>(actual_data), expected_data);
 }
+
+template <bool Signed>
+void VerifyTensorProtoFileDataInt4(const PathString& tensor_proto_path,
+                                   gsl::span<const Int4x2Base<Signed>> expected_data,
+                                   gsl::span<const int64_t> shape) {
+  size_t num_elems = 1;
+  for (auto dim_val : shape) {
+    num_elems *= static_cast<size_t>(dim_val);
+  }
+
+  std::ifstream tensor_proto_stream{tensor_proto_path};
+
+  ONNX_NAMESPACE::TensorProto tensor_proto{};
+  ASSERT_TRUE(tensor_proto.ParseFromIstream(&tensor_proto_stream));
+
+  std::vector<Int4x2Base<Signed>> actual_data{};
+  actual_data.resize(expected_data.size());
+  ASSERT_STATUS_OK(utils::UnpackTensor(tensor_proto, Path{}, actual_data.data(), num_elems));
+
+  ASSERT_EQ(actual_data.size(), expected_data.size());
+
+  for (size_t i = 0; i < num_elems; i++) {
+    auto indices = Int4x2Base<Signed>::GetTensorElemIndices(i);
+    auto actual_val = actual_data[indices.first].GetElem(indices.second);
+    auto expected_val = expected_data[indices.first].GetElem(indices.second);
+    ASSERT_EQ(actual_val, expected_val);
+  }
+}
 }  // namespace
 
 namespace env_vars = utils::debug_node_inputs_outputs_env_vars;
@@ -72,5 +100,53 @@ TEST(DebugNodeInputsOutputs, BasicFileOutput) {
   tester.Run();
 }
 
+// Test dumping input and output INT4 tensors to file.
+TEST(DebugNodeInputsOutputs, FileOutput_Int4) {
+  TemporaryDirectory temp_dir{ORT_TSTR("debug_node_inputs_outputs_utils_test")};
+  ScopedEnvironmentVariables scoped_env_vars{
+      EnvVarMap{
+          {env_vars::kDumpInputData, "1"},
+          {env_vars::kDumpOutputData, "1"},
+          {env_vars::kNameFilter, nullopt},
+          {env_vars::kOpTypeFilter, nullopt},
+          {env_vars::kDumpDataDestination, "files"},
+          {env_vars::kAppendRankToFileName, nullopt},
+          {env_vars::kOutputDir, ToUTF8String(temp_dir.Path())},
+          {env_vars::kDumpingDataToFilesForAllNodesIsOk, "1"},
+      }};
+
+  constexpr int8_t unused_val = 0;
+  const std::vector<int64_t> input_shape({5, 3});
+  const std::vector<Int4x2> input_vals = {Int4x2(1, 2), Int4x2(3, 4), Int4x2(5, 6), Int4x2(7, 8),
+                                          Int4x2(9, 10), Int4x2(11, 12), Int4x2(13, 14), Int4x2(15, unused_val)};
+
+  const std::vector<int64_t> perm = {1, 0};
+  const std::vector<int64_t> expected_shape({3, 5});
+  const std::vector<Int4x2> expected_vals = {Int4x2(1, 4), Int4x2(7, 10), Int4x2(13, 2), Int4x2(5, 8),
+                                             Int4x2(11, 14), Int4x2(3, 6), Int4x2(9, 12), Int4x2(15, unused_val)};
+
+  OpTester tester{"Transpose", 21, kOnnxDomain};
+  tester.AddAttribute("perm", perm);
+  tester.AddInput<Int4x2>("x", input_shape, input_vals);
+  tester.AddOutput<Int4x2>("y", expected_shape, expected_vals);
+
+  auto verify_file_data =
+      [&temp_dir, &input_vals, &expected_vals, &input_shape, &expected_shape](
+          const std::vector<OrtValue>& fetches,
+          const std::string& /*provider_type*/) {
+        ASSERT_EQ(fetches.size(), 1u);
+        // check it contains a tensor
+        fetches[0].Get<Tensor>();
+        VerifyTensorProtoFileDataInt4(temp_dir.Path() + ORT_TSTR("/x.tensorproto"), gsl::make_span(input_vals),
+                                      gsl::make_span(input_shape));
+        VerifyTensorProtoFileDataInt4(temp_dir.Path() + ORT_TSTR("/y.tensorproto"),
+                                      gsl::make_span(expected_vals), gsl::make_span(expected_shape));
+      };
+
+  tester.SetCustomOutputVerifier(verify_file_data);
+
+  tester.Run();
+}
+
 }  // namespace test
 }  // namespace onnxruntime
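
Reviewer note, not part of the patch: the packed-int4 traversal that the new print/dump specializations rely on reduces to GetTensorElemIndices plus GetElem. The standalone sketch below shows that pattern in isolation; the include path, the free-standing main(), and building against the onnxruntime framework target are assumptions made only to keep the example self-contained.

// Illustrative sketch only; assumes "core/framework/int4.h" is reachable on the
// include path and that Int4x2 lives in namespace onnxruntime (as in this repo).
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

#include "core/framework/int4.h"

int main() {
  using onnxruntime::Int4x2;

  // Five signed 4-bit elements pack into CalcNumInt4Pairs(5) == 3 bytes;
  // the final high nibble is unused padding, mirroring the test above.
  const std::vector<Int4x2> packed = {Int4x2(-8, -1), Int4x2(0, 3), Int4x2(7, 0)};
  const size_t num_elems = 5;

  // Walk the packed buffer element by element, the same way the new
  // PrintCommonStats / PrintCpuTensor* INT4 specializations do.
  for (size_t i = 0; i < num_elems; ++i) {
    std::pair<size_t, size_t> idx = Int4x2::GetTensorElemIndices(i);  // {pair index, nibble index}
    std::cout << static_cast<int>(packed[idx.first].GetElem(idx.second)) << " ";
  }
  std::cout << std::endl;  // expected output: -8 -1 0 3 7
  return 0;
}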