diff --git a/bazel/tachyon_deps.bzl b/bazel/tachyon_deps.bzl index e3fca4747..dcd0e290b 100644 --- a/bazel/tachyon_deps.bzl +++ b/bazel/tachyon_deps.bzl @@ -15,6 +15,7 @@ load("//third_party/nasm:workspace.bzl", nasm = "repo") load("//third_party/node_addon_api:install_node_addon_api.bzl", "install_node_addon_api") load("//third_party/omp:omp_configure.bzl", "omp_configure") load("//third_party/pdqsort:workspace.bzl", pdqsort = "repo") +load("//third_party/powersort:workspace.bzl", powersort = "repo") load("//third_party/py:python_configure.bzl", "python_configure") load("//third_party/rapidsnark:workspace.bzl", rapidsnark = "repo") @@ -36,6 +37,7 @@ def tachyon_deps(): json() nasm() pdqsort() + powersort() rapidsnark() install_node_addon_api(name = "node_addon_api") diff --git a/tachyon/base/BUILD.bazel b/tachyon/base/BUILD.bazel index b50d47be8..b844f6d36 100644 --- a/tachyon/base/BUILD.bazel +++ b/tachyon/base/BUILD.bazel @@ -1,5 +1,5 @@ load("//bazel:tachyon.bzl", "if_has_openmp_on_macos", "if_posix") -load("//bazel:tachyon_cc.bzl", "tachyon_cc_library", "tachyon_cc_unittest") +load("//bazel:tachyon_cc.bzl", "tachyon_cc_benchmark", "tachyon_cc_library", "tachyon_cc_unittest") package(default_visibility = ["//visibility:public"]) @@ -146,6 +146,15 @@ tachyon_cc_library( deps = [":logging"], ) +tachyon_cc_library( + name = "sort", + hdrs = ["sort.h"], + deps = [ + "@pdqsort", + "@powersort", + ], +) + tachyon_cc_library( name = "static_storage", hdrs = ["static_storage.h"], @@ -162,6 +171,16 @@ tachyon_cc_library( hdrs = ["type_list.h"], ) +tachyon_cc_benchmark( + name = "sort_benchmark", + srcs = ["sort_benchmark.cc"], + deps = [ + "//tachyon/base:random", + "//tachyon/base:sort", + "//tachyon/base/containers:container_util", + ], +) + tachyon_cc_unittest( name = "base_unittests", srcs = [ diff --git a/tachyon/base/containers/container_util.h b/tachyon/base/containers/container_util.h index 0d4daebd5..a8558818b 100644 --- a/tachyon/base/containers/container_util.h +++ b/tachyon/base/containers/container_util.h @@ -59,7 +59,7 @@ template CreateVectorParallel(size_t size, Generator&& generator) { std::vector ret(size); - OPENMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { ret[i] = generator(); } + OMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { ret[i] = generator(); } return ret; } @@ -93,7 +93,7 @@ template CreateVectorParallel(size_t size, Generator&& generator) { std::vector ret(size); - OPENMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { ret[i] = generator(i); } + OMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { ret[i] = generator(i); } return ret; } diff --git a/tachyon/base/openmp_util.h b/tachyon/base/openmp_util.h index f1ef02214..19b111a55 100644 --- a/tachyon/base/openmp_util.h +++ b/tachyon/base/openmp_util.h @@ -14,21 +14,21 @@ #if defined(TACHYON_HAS_OPENMP) #define CONSTEXPR_IF_NOT_OPENMP -#define OMP_FOR _Pragma("omp for") -#define OMP_FOR_NOWAIT _Pragma("omp for nowait") +#define OMP_FOR(expr) _Pragma("omp for") for (expr) +#define OMP_FOR_NOWAIT(expr) _Pragma("omp for nowait") for (expr) +#define OMP_NESTED_FOR(expr) _Pragma("omp for collapse(2)") for (expr) #define OMP_PARALLEL _Pragma("omp parallel") -#define OPENMP_PARALLEL_FOR(expr) _Pragma("omp parallel for") for (expr) -#define OPENMP_PARALLEL_NESTED_FOR(expr) \ +#define OMP_PARALLEL_FOR(expr) _Pragma("omp parallel for") for (expr) +#define OMP_PARALLEL_NESTED_FOR(expr) \ _Pragma("omp parallel for collapse(2)") for (expr) -#define OPENMP_FOR(expr) _Pragma("omp for") for (expr) #else #define CONSTEXPR_IF_NOT_OPENMP 
constexpr -#define OMP_FOR -#define OMP_FOR_NOWAIT +#define OMP_FOR(expr) for (expr) +#define OMP_FOR_NOWAIT(expr) for (expr) +#define OMP_NESTED_FOR(expr) for (expr) #define OMP_PARALLEL -#define OPENMP_PARALLEL_FOR(expr) for (expr) -#define OPENMP_PARALLEL_NESTED_FOR(expr) for (expr) -#define OPENMP_FOR(expr) for (expr) +#define OMP_PARALLEL_FOR(expr) for (expr) +#define OMP_PARALLEL_NESTED_FOR(expr) for (expr) #endif // defined(TACHYON_HAS_OPENMP) namespace tachyon::base { diff --git a/tachyon/base/parallelize.h b/tachyon/base/parallelize.h index a72e0efe9..2183bc868 100644 --- a/tachyon/base/parallelize.h +++ b/tachyon/base/parallelize.h @@ -11,6 +11,92 @@ #include "tachyon/base/openmp_util.h" namespace tachyon::base { +namespace internal { + +template , + typename RunType = typename FunctorTraits::RunType, + typename ArgList = internal::ExtractArgs, + typename SpanTy = internal::GetType<0, ArgList>, + size_t ArgNum = internal::GetSize> +void InvokeParallelizeCallback(Container& container, size_t i, + size_t num_chunks, size_t chunk_size, + Callable callback) { + size_t len = + i == num_chunks - 1 ? std::size(container) - i * chunk_size : chunk_size; + SpanTy chunk(std::data(container) + i * chunk_size, len); + if constexpr (ArgNum == 1) { + callback(chunk); + } else if constexpr (ArgNum == 2) { + callback(chunk, i); + } else { + static_assert(ArgNum == 3); + callback(chunk, i, chunk_size); + } +} + +template , + typename RunType = typename FunctorTraits::RunType, + typename ArgList = internal::ExtractArgs, + size_t ArgNum = internal::GetSize> +void InvokeParallelizeCallback(size_t size, size_t i, size_t num_chunks, + size_t chunk_size, Callable callback) { + size_t len = i == num_chunks - 1 ? size - i * chunk_size : chunk_size; + if constexpr (ArgNum == 1) { + callback(len); + } else if constexpr (ArgNum == 2) { + callback(len, i); + } else { + static_assert(ArgNum == 3); + callback(len, i, chunk_size); + } +} + +template , + typename RunType = typename FunctorTraits::RunType, + typename ReturnType = typename FunctorTraits::ReturnType, + typename ArgList = internal::ExtractArgs, + typename SpanTy = internal::GetType<0, ArgList>, + size_t ArgNum = internal::GetSize> +void InvokeParallelizeCallback(Container& container, size_t i, + size_t num_chunks, size_t chunk_size, + Callable callback, + std::vector& values) { + size_t len = + i == num_chunks - 1 ? std::size(container) - i * chunk_size : chunk_size; + SpanTy chunk(std::data(container) + i * chunk_size, len); + if constexpr (ArgNum == 1) { + values[i] = callback(chunk); + } else if constexpr (ArgNum == 2) { + values[i] = callback(chunk, i); + } else { + static_assert(ArgNum == 3); + values[i] = callback(chunk, i, chunk_size); + } +} + +template , + typename RunType = typename FunctorTraits::RunType, + typename ReturnType = typename FunctorTraits::ReturnType, + typename ArgList = internal::ExtractArgs, + size_t ArgNum = internal::GetSize> +void InvokeParallelizeCallback(size_t size, size_t i, size_t num_chunks, + size_t chunk_size, Callable callback, + std::vector& values) { + size_t len = i == num_chunks - 1 ? 
size - i * chunk_size : chunk_size; + if constexpr (ArgNum == 1) { + values[i] = callback(len); + } else if constexpr (ArgNum == 2) { + values[i] = callback(len, i); + } else { + static_assert(ArgNum == 3); + values[i] = callback(len, i, chunk_size); + } +} +} // namespace internal template using ParallelizeCallback1 = std::function)>; @@ -21,51 +107,35 @@ using ParallelizeCallback3 = std::function, size_t, size_t)>; // Splits the |container| by |chunk_size| and executes |callback| in parallel. // See parallelize_unittest.cc for more details. -template , - typename RunType = typename FunctorTraits::RunType, - typename ArgList = internal::ExtractArgs, - typename SpanTy = internal::GetType<0, ArgList>, - typename T = typename SpanTy::value_type, - size_t ArgNum = internal::GetSize> +template void ParallelizeByChunkSize(Container& container, size_t chunk_size, Callable callback) { if (chunk_size == 0) return; size_t num_chunks = (std::size(container) + chunk_size - 1) / chunk_size; - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { - size_t len = i == num_chunks - 1 ? std::size(container) - i * chunk_size - : chunk_size; - SpanTy chunk(std::data(container) + i * chunk_size, len); - if constexpr (ArgNum == 1) { - callback(chunk); - } else if constexpr (ArgNum == 2) { - callback(chunk, i); - } else { - static_assert(ArgNum == 3); - callback(chunk, i, chunk_size); - } + if (num_chunks == 1) { + internal::InvokeParallelizeCallback(container, 0, num_chunks, chunk_size, + callback); + return; + } + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + internal::InvokeParallelizeCallback(container, i, num_chunks, chunk_size, + callback); } } // Splits the |size| by |chunk_size| and executes |callback| in parallel. -template , - typename RunType = typename FunctorTraits::RunType, - typename ArgList = internal::ExtractArgs, - size_t ArgNum = internal::GetSize> +template void ParallelizeByChunkSize(size_t size, size_t chunk_size, Callable callback) { if (chunk_size == 0) return; size_t num_chunks = (size + chunk_size - 1) / chunk_size; - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { - size_t len = i == num_chunks - 1 ? size - i * chunk_size : chunk_size; - if constexpr (ArgNum == 1) { - callback(len); - } else if constexpr (ArgNum == 2) { - callback(len, i); - } else { - static_assert(ArgNum == 3); - callback(len, i, chunk_size); - } + if (num_chunks == 1) { + internal::InvokeParallelizeCallback(size, 0, num_chunks, chunk_size, + callback); + return; + } + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + internal::InvokeParallelizeCallback(size, i, num_chunks, chunk_size, + callback); } } @@ -95,29 +165,21 @@ void Parallelize(size_t size, Callable callback, template , typename RunType = typename FunctorTraits::RunType, - typename ReturnType = typename FunctorTraits::ReturnType, - typename ArgList = internal::ExtractArgs, - typename SpanTy = internal::GetType<0, ArgList>, - typename T = typename SpanTy::value_type, - size_t ArgNum = internal::GetSize> + typename ReturnType = typename FunctorTraits::ReturnType> std::vector ParallelizeMapByChunkSize(Container& container, size_t chunk_size, Callable callback) { if (chunk_size == 0) return {}; size_t num_chunks = (std::size(container) + chunk_size - 1) / chunk_size; std::vector values(num_chunks); - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { - size_t len = i == num_chunks - 1 ? 
std::size(container) - i * chunk_size - : chunk_size; - SpanTy chunk(std::data(container) + i * chunk_size, len); - if constexpr (ArgNum == 1) { - values[i] = callback(chunk); - } else if constexpr (ArgNum == 2) { - values[i] = callback(chunk, i); - } else { - static_assert(ArgNum == 3); - values[i] = callback(chunk, i, chunk_size); - } + if (num_chunks == 1) { + internal::InvokeParallelizeCallback(container, 0, num_chunks, chunk_size, + callback, values); + return values; + } + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + internal::InvokeParallelizeCallback(container, i, num_chunks, chunk_size, + callback, values); } return values; } @@ -128,25 +190,21 @@ std::vector ParallelizeMapByChunkSize(Container& container, template , typename RunType = typename FunctorTraits::RunType, - typename ReturnType = typename FunctorTraits::ReturnType, - typename ArgList = internal::ExtractArgs, - size_t ArgNum = internal::GetSize> + typename ReturnType = typename FunctorTraits::ReturnType> std::vector ParallelizeMapByChunkSize(size_t size, size_t chunk_size, Callable callback) { if (chunk_size == 0) return {}; size_t num_chunks = (size + chunk_size - 1) / chunk_size; std::vector values(num_chunks); - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { - size_t len = i == num_chunks - 1 ? size - i * chunk_size : chunk_size; - if constexpr (ArgNum == 1) { - values[i] = callback(len); - } else if constexpr (ArgNum == 2) { - values[i] = callback(len, i); - } else { - static_assert(ArgNum == 3); - values[i] = callback(len, i, chunk_size); - } + if (num_chunks == 1) { + internal::InvokeParallelizeCallback(size, 0, num_chunks, chunk_size, + callback, values); + return values; + } + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + internal::InvokeParallelizeCallback(size, i, num_chunks, chunk_size, + callback, values); } return values; } diff --git a/tachyon/base/sort.h b/tachyon/base/sort.h new file mode 100644 index 000000000..9f2b42d23 --- /dev/null +++ b/tachyon/base/sort.h @@ -0,0 +1,28 @@ +#ifndef TACHYON_BASE_SORT_H_ +#define TACHYON_BASE_SORT_H_ + +#include "third_party/pdqsort/include/pdqsort.h" +#include "third_party/powersort/include/sorts/powersort.h" + +namespace tachyon::base { + +template +void UnstableSort(Iter begin, Iter end) { + return pdqsort(begin, end); +} + +template +void UnstableSort(Iter begin, Iter end, Compare compare) { + return pdqsort(begin, end, compare); +} + +// TODO(chokobole): Add StableSort() with compare version. 
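For context on the TODO above: the missing overload cannot simply forward to powersort, since the vendored powersort takes no comparator (hence the reversed-operator workarounds further down in this diff). A minimal sketch of one possible stopgap, not part of this change, would fall back to std::stable_sort:

```cpp
// Hypothetical sketch only, not part of this diff: a comparator-taking
// StableSort() that falls back to std::stable_sort, since the vendored
// powersort does not accept a custom comparison.
#include <algorithm>

namespace tachyon::base {

template <typename Iter, typename Compare>
void StableSort(Iter begin, Iter end, Compare compare) {
  std::stable_sort(begin, end, compare);
}

}  // namespace tachyon::base
```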
+template +void StableSort(Iter begin, Iter end) { + algorithms::powersort sort; + sort.sort(begin, end); +} + +} // namespace tachyon::base + +#endif // TACHYON_BASE_SORT_H_ diff --git a/tachyon/base/sort_benchmark.cc b/tachyon/base/sort_benchmark.cc new file mode 100644 index 000000000..cd0fb041a --- /dev/null +++ b/tachyon/base/sort_benchmark.cc @@ -0,0 +1,259 @@ +#include "benchmark/benchmark.h" + +#include "tachyon/base/containers/container_util.h" +#include "tachyon/base/random.h" +#include "tachyon/base/sort.h" + +namespace tachyon::math { + +enum class SortMethod { + kPdq, + kPowersort, + kStdStableSort, + kStdSort, +}; + +std::vector GetData(size_t size) { + static std::map>* s_data_map = nullptr; + if (s_data_map == nullptr) { + s_data_map = new std::map>(); + } + std::vector& data = (*s_data_map)[size]; + if (data.empty()) { + data = base::CreateVector(size, [](size_t i) { + return base::Uniform(base::Range::All()); + }); + } + return data; +} + +std::vector GetPartiallySortedData(size_t size) { + static std::map>* s_data_map = nullptr; + if (s_data_map == nullptr) { + s_data_map = new std::map>(); + } + std::vector& data = (*s_data_map)[size]; + if (data.empty()) { + data = base::CreateVector(size, [](size_t i) { return uint64_t{i}; }); + size_t shuffle_count = size / 8; + for (size_t i = 0; i < shuffle_count; ++i) { + size_t idx = base::Uniform(base::Range::Until(shuffle_count)); + size_t idx2 = base::Uniform(base::Range::Until(shuffle_count)); + std::swap(data[idx], data[idx2]); + } + } + return data; +} + +template +void BM_SortRandomData(benchmark::State& state) { + std::vector data = GetData(state.range(0)); + std::vector data2 = data; + for (auto _ : state) { + if constexpr (kSortMethod == SortMethod::kPdq) { + base::UnstableSort(data2.begin(), data2.end()); + } else if constexpr (kSortMethod == SortMethod::kPowersort) { + base::StableSort(data2.begin(), data2.end()); + } else if constexpr (kSortMethod == SortMethod::kStdStableSort) { + std::stable_sort(data2.begin(), data2.end()); + } else if constexpr (kSortMethod == SortMethod::kStdSort) { + std::sort(data2.begin(), data2.end()); + } + data2 = data; + } + benchmark::DoNotOptimize(data); + benchmark::DoNotOptimize(data2); +} + +template +void BM_SortPartiallySortedData(benchmark::State& state) { + std::vector data = GetPartiallySortedData(state.range(0)); + std::vector data2 = data; + for (auto _ : state) { + if constexpr (kSortMethod == SortMethod::kPdq) { + base::UnstableSort(data2.begin(), data2.end()); + } else if constexpr (kSortMethod == SortMethod::kPowersort) { + base::StableSort(data2.begin(), data2.end()); + } else if constexpr (kSortMethod == SortMethod::kStdStableSort) { + std::stable_sort(data2.begin(), data2.end()); + } else if constexpr (kSortMethod == SortMethod::kStdSort) { + std::sort(data2.begin(), data2.end()); + } + data2 = data; + } + benchmark::DoNotOptimize(data); + benchmark::DoNotOptimize(data2); +} + +BENCHMARK_TEMPLATE(BM_SortRandomData, SortMethod::kPdq) + ->RangeMultiplier(2) + ->Range(1 << 5, 1 << 20); +BENCHMARK_TEMPLATE(BM_SortRandomData, SortMethod::kStdSort) + ->RangeMultiplier(2) + ->Range(1 << 5, 1 << 20); +BENCHMARK_TEMPLATE(BM_SortRandomData, SortMethod::kPowersort) + ->RangeMultiplier(2) + ->Range(1 << 5, 1 << 20); +BENCHMARK_TEMPLATE(BM_SortRandomData, SortMethod::kStdStableSort) + ->RangeMultiplier(2) + ->Range(1 << 5, 1 << 20); + +BENCHMARK_TEMPLATE(BM_SortPartiallySortedData, SortMethod::kPdq) + ->RangeMultiplier(2) + ->Range(1 << 5, 1 << 20); 
+BENCHMARK_TEMPLATE(BM_SortPartiallySortedData, SortMethod::kStdSort) + ->RangeMultiplier(2) + ->Range(1 << 5, 1 << 20); +BENCHMARK_TEMPLATE(BM_SortPartiallySortedData, SortMethod::kPowersort) + ->RangeMultiplier(2) + ->Range(1 << 5, 1 << 20); +BENCHMARK_TEMPLATE(BM_SortPartiallySortedData, SortMethod::kStdStableSort) + ->RangeMultiplier(2) + ->Range(1 << 5, 1 << 20); + +} // namespace tachyon::math + +// clang-format off +// Executing tests from //tachyon/base:sort_benchmark +// ----------------------------------------------------------------------------- +// 2024-08-05T09:26:02+00:00 +// Running /home/chokobole/.cache/bazel/_bazel_chokobole/234690e3562329d13f7f07caac03dae4/execroot/kroma_network_tachyon/bazel-out/k8-opt/bin/tachyon/base/sort_benchmark.runfiles/kroma_network_tachyon/tachyon/base/sort_benchmark +// Run on (32 X 5499.96 MHz CPU s) +// CPU Caches: +// L1 Data 48 KiB (x16) +// L1 Instruction 32 KiB (x16) +// L2 Unified 2048 KiB (x16) +// L3 Unified 36864 KiB (x1) +// Load Average: 2.58, 2.64, 2.46 +// --------------------------------------------------------------------------------------------------------- +// Benchmark Time CPU Iterations +// --------------------------------------------------------------------------------------------------------- +// BM_SortRandomData/32 64.5 ns 64.5 ns 10927050 +// BM_SortRandomData/64 173 ns 173 ns 4082994 +// BM_SortRandomData/128 458 ns 458 ns 1526526 +// BM_SortRandomData/256 917 ns 917 ns 767867 +// BM_SortRandomData/512 1994 ns 1994 ns 349757 +// BM_SortRandomData/1024 4694 ns 4694 ns 148456 +// BM_SortRandomData/2048 9822 ns 9822 ns 71312 +// BM_SortRandomData/4096 29664 ns 29660 ns 23724 +// BM_SortRandomData/8192 87432 ns 87430 ns 8014 +// BM_SortRandomData/16384 206323 ns 206317 ns 3389 +// BM_SortRandomData/32768 449141 ns 449055 ns 1560 +// BM_SortRandomData/65536 951382 ns 951308 ns 736 +// BM_SortRandomData/131072 1971393 ns 1971368 ns 355 +// BM_SortRandomData/262144 4258559 ns 4258174 ns 166 +// BM_SortRandomData/524288 8783038 ns 8781649 ns 71 +// BM_SortRandomData/1048576 18092094 ns 18091328 ns 39 +// BM_SortRandomData/32 65.7 ns 65.7 ns 10562672 +// BM_SortRandomData/64 135 ns 135 ns 5318656 +// BM_SortRandomData/128 369 ns 369 ns 1917457 +// BM_SortRandomData/256 780 ns 780 ns 906629 +// BM_SortRandomData/512 1720 ns 1720 ns 402973 +// BM_SortRandomData/1024 3937 ns 3937 ns 178163 +// BM_SortRandomData/2048 22613 ns 22612 ns 30836 +// BM_SortRandomData/4096 84293 ns 84293 ns 8256 +// BM_SortRandomData/8192 212055 ns 212048 ns 3296 +// BM_SortRandomData/16384 463883 ns 463879 ns 1505 +// BM_SortRandomData/32768 1011653 ns 1011613 ns 692 +// BM_SortRandomData/65536 2219845 ns 2219834 ns 315 +// BM_SortRandomData/131072 4715038 ns 4714815 ns 149 +// BM_SortRandomData/262144 10003182 ns 10002959 ns 70 +// BM_SortRandomData/524288 20982786 ns 20982067 ns 33 +// BM_SortRandomData/1048576 43885082 ns 43884019 ns 16 +// BM_SortRandomData/32 84.1 ns 84.1 ns 8342074 +// BM_SortRandomData/64 190 ns 190 ns 3655272 +// BM_SortRandomData/128 446 ns 446 ns 1580095 +// BM_SortRandomData/256 984 ns 984 ns 711206 +// BM_SortRandomData/512 2340 ns 2340 ns 302783 +// BM_SortRandomData/1024 5165 ns 5164 ns 130697 +// BM_SortRandomData/2048 33551 ns 33551 ns 21216 +// BM_SortRandomData/4096 100123 ns 100120 ns 6908 +// BM_SortRandomData/8192 246248 ns 246236 ns 2893 +// BM_SortRandomData/16384 544985 ns 544968 ns 1285 +// BM_SortRandomData/32768 1185154 ns 1185143 ns 590 +// BM_SortRandomData/65536 2544378 ns 2544265 ns 275 +// 
BM_SortRandomData/131072 5465731 ns 5465444 ns 128 +// BM_SortRandomData/262144 11856806 ns 11856582 ns 59 +// BM_SortRandomData/524288 25558533 ns 25557610 ns 27 +// BM_SortRandomData/1048576 53946952 ns 53946318 ns 12 +// BM_SortRandomData/32 106 ns 106 ns 6572790 +// BM_SortRandomData/64 233 ns 233 ns 2997692 +// BM_SortRandomData/128 497 ns 497 ns 1416719 +// BM_SortRandomData/256 1133 ns 1133 ns 614965 +// BM_SortRandomData/512 2474 ns 2473 ns 281798 +// BM_SortRandomData/1024 5541 ns 5541 ns 126485 +// BM_SortRandomData/2048 39425 ns 39424 ns 17762 +// BM_SortRandomData/4096 99459 ns 99456 ns 7006 +// BM_SortRandomData/8192 238091 ns 238082 ns 2953 +// BM_SortRandomData/16384 539281 ns 539263 ns 1300 +// BM_SortRandomData/32768 1168490 ns 1168483 ns 597 +// BM_SortRandomData/65536 2514056 ns 2513922 ns 279 +// BM_SortRandomData/131072 5325962 ns 5325863 ns 131 +// BM_SortRandomData/262144 11590090 ns 11589633 ns 61 +// BM_SortRandomData/524288 24997070 ns 24993431 ns 29 +// BM_SortRandomData/1048576 52364991 ns 52364081 ns 13 +// BM_SortPartiallySortedData/32 26.1 ns 26.1 ns 27818789 +// BM_SortPartiallySortedData/64 67.9 ns 67.9 ns 10076970 +// BM_SortPartiallySortedData/128 159 ns 159 ns 4460125 +// BM_SortPartiallySortedData/256 326 ns 326 ns 2155077 +// BM_SortPartiallySortedData/512 612 ns 612 ns 1158772 +// BM_SortPartiallySortedData/1024 1205 ns 1205 ns 587540 +// BM_SortPartiallySortedData/2048 2252 ns 2252 ns 309958 +// BM_SortPartiallySortedData/4096 4450 ns 4450 ns 157626 +// BM_SortPartiallySortedData/8192 9237 ns 9237 ns 75456 +// BM_SortPartiallySortedData/16384 19177 ns 19176 ns 36507 +// BM_SortPartiallySortedData/32768 48304 ns 48302 ns 14462 +// BM_SortPartiallySortedData/65536 124313 ns 124310 ns 5590 +// BM_SortPartiallySortedData/131072 282868 ns 282862 ns 2471 +// BM_SortPartiallySortedData/262144 748262 ns 748224 ns 930 +// BM_SortPartiallySortedData/524288 1587168 ns 1587159 ns 441 +// BM_SortPartiallySortedData/1048576 3434923 ns 3434856 ns 204 +// BM_SortPartiallySortedData/32 37.8 ns 37.8 ns 17865125 +// BM_SortPartiallySortedData/64 81.7 ns 81.7 ns 8572684 +// BM_SortPartiallySortedData/128 201 ns 201 ns 3502366 +// BM_SortPartiallySortedData/256 459 ns 459 ns 1513415 +// BM_SortPartiallySortedData/512 1124 ns 1124 ns 622606 +// BM_SortPartiallySortedData/1024 2566 ns 2566 ns 274741 +// BM_SortPartiallySortedData/2048 5570 ns 5570 ns 125864 +// BM_SortPartiallySortedData/4096 12436 ns 12435 ns 56418 +// BM_SortPartiallySortedData/8192 26709 ns 26708 ns 26237 +// BM_SortPartiallySortedData/16384 69598 ns 69597 ns 10023 +// BM_SortPartiallySortedData/32768 187689 ns 187657 ns 3598 +// BM_SortPartiallySortedData/65536 422855 ns 422759 ns 1658 +// BM_SortPartiallySortedData/131072 926691 ns 926666 ns 755 +// BM_SortPartiallySortedData/262144 2141686 ns 2141672 ns 327 +// BM_SortPartiallySortedData/524288 4528703 ns 4528676 ns 153 +// BM_SortPartiallySortedData/1048576 9690864 ns 9690796 ns 72 +// BM_SortPartiallySortedData/32 47.6 ns 47.6 ns 14566847 +// BM_SortPartiallySortedData/64 69.7 ns 69.7 ns 10073432 +// BM_SortPartiallySortedData/128 119 ns 119 ns 5879687 +// BM_SortPartiallySortedData/256 234 ns 234 ns 2997032 +// BM_SortPartiallySortedData/512 472 ns 472 ns 1489784 +// BM_SortPartiallySortedData/1024 959 ns 958 ns 729569 +// BM_SortPartiallySortedData/2048 1963 ns 1963 ns 358399 +// BM_SortPartiallySortedData/4096 4909 ns 4908 ns 142478 +// BM_SortPartiallySortedData/8192 10384 ns 10381 ns 67528 +// BM_SortPartiallySortedData/16384 43864 ns 43863 ns 
15994 +// BM_SortPartiallySortedData/32768 118543 ns 118541 ns 5922 +// BM_SortPartiallySortedData/65536 282220 ns 282215 ns 2485 +// BM_SortPartiallySortedData/131072 639465 ns 639359 ns 1093 +// BM_SortPartiallySortedData/262144 1601955 ns 1601811 ns 435 +// BM_SortPartiallySortedData/524288 3426178 ns 3425631 ns 206 +// BM_SortPartiallySortedData/1048576 7438610 ns 7438426 ns 91 +// BM_SortPartiallySortedData/32 70.3 ns 70.3 ns 10067148 +// BM_SortPartiallySortedData/64 131 ns 131 ns 5360170 +// BM_SortPartiallySortedData/128 266 ns 266 ns 2622105 +// BM_SortPartiallySortedData/256 631 ns 631 ns 1074449 +// BM_SortPartiallySortedData/512 1243 ns 1243 ns 566539 +// BM_SortPartiallySortedData/1024 2805 ns 2805 ns 249929 +// BM_SortPartiallySortedData/2048 5767 ns 5767 ns 121282 +// BM_SortPartiallySortedData/4096 12814 ns 12813 ns 55051 +// BM_SortPartiallySortedData/8192 34371 ns 34370 ns 20305 +// BM_SortPartiallySortedData/16384 95992 ns 95990 ns 7274 +// BM_SortPartiallySortedData/32768 201706 ns 201701 ns 3472 +// BM_SortPartiallySortedData/65536 453366 ns 453346 ns 1543 +// BM_SortPartiallySortedData/131072 977696 ns 977652 ns 715 +// BM_SortPartiallySortedData/262144 2291897 ns 2291858 ns 305 +// BM_SortPartiallySortedData/524288 5089614 ns 5089385 ns 137 +// BM_SortPartiallySortedData/1048576 11416225 ns 11415938 ns 60 +// clang-format on diff --git a/tachyon/crypto/commitments/fri/two_adic_fri_config.h b/tachyon/crypto/commitments/fri/two_adic_fri_config.h index 22b029fb8..59a5b17f2 100644 --- a/tachyon/crypto/commitments/fri/two_adic_fri_config.h +++ b/tachyon/crypto/commitments/fri/two_adic_fri_config.h @@ -54,7 +54,7 @@ std::vector FoldMatrix(const ExtF& beta, ExtF::GetBitRevIndexSuccessivePowers(rows, w_inv, half_beta); std::vector ret(rows); - OPENMP_PARALLEL_FOR(size_t r = 0; r < rows; ++r) { + OMP_PARALLEL_FOR(size_t r = 0; r < rows; ++r) { const ExtF& lo = mat(r, 0); const ExtF& hi = mat(r, 1); ret[r] = (one_half + powers[r]) * lo + (one_half - powers[r]) * hi; diff --git a/tachyon/crypto/commitments/fri/two_adic_fri_prover.h b/tachyon/crypto/commitments/fri/two_adic_fri_prover.h index 5267f202c..158cf2082 100644 --- a/tachyon/crypto/commitments/fri/two_adic_fri_prover.h +++ b/tachyon/crypto/commitments/fri/two_adic_fri_prover.h @@ -70,7 +70,7 @@ CommitPhaseResult CommitPhase( // |folded| will never be the size of |inputs[0]|. for (size_t i = 1; i < inputs.size(); ++i) { if (inputs[i].size() == folded.size()) { - OPENMP_PARALLEL_FOR(size_t j = 0; j < inputs[i].size(); ++j) { + OMP_PARALLEL_FOR(size_t j = 0; j < inputs[i].size(); ++j) { folded[j] += inputs[i][j]; } } @@ -81,7 +81,7 @@ CommitPhaseResult CommitPhase( VLOG(2) << "FRI(final_eval): " << final_eval.ToHexString(true); #if DCHECK_IS_ON() - OPENMP_PARALLEL_FOR(size_t i = 0; i < folded.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < folded.size(); ++i) { DCHECK_EQ(folded[i], final_eval); } #endif diff --git a/tachyon/crypto/commitments/merkle_tree/binary_merkle_tree/binary_merkle_tree.h b/tachyon/crypto/commitments/merkle_tree/binary_merkle_tree/binary_merkle_tree.h index 2e5699e42..63afb0173 100644 --- a/tachyon/crypto/commitments/merkle_tree/binary_merkle_tree/binary_merkle_tree.h +++ b/tachyon/crypto/commitments/merkle_tree/binary_merkle_tree/binary_merkle_tree.h @@ -68,8 +68,8 @@ class BinaryMerkleTree final // Finally, the remaining tree should be constructed from leaves 1 and 2. 
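Most of the mechanical churn in this diff is the rename from OPENMP_PARALLEL_FOR to OMP_PARALLEL_FOR (plus the new parenthesized OMP_FOR / OMP_NESTED_FOR forms) defined in openmp_util.h above. A condensed sketch of how the macro behaves, assuming only the TACHYON_HAS_OPENMP define from that header:

```cpp
// Condensed from openmp_util.h above: the same loop body compiles with or
// without OpenMP; only the pragma in front of the for-statement changes.
#include <cstddef>
#include <vector>

#if defined(TACHYON_HAS_OPENMP)
#define OMP_PARALLEL_FOR(expr) _Pragma("omp parallel for") for (expr)
#else
#define OMP_PARALLEL_FOR(expr) for (expr)
#endif

void DoubleAll(std::vector<int>& values) {
  // Parallel when built with OpenMP and TACHYON_HAS_OPENMP, serial otherwise.
  OMP_PARALLEL_FOR(size_t i = 0; i < values.size(); ++i) { values[i] *= 2; }
}
```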
size_t leaves_size = std::size(leaves); if (leaves_size > leaves_size_for_parallelization_) { - OPENMP_PARALLEL_FOR(size_t i = 0; i < leaves_size; - i += leaves_size_for_parallelization_) { + OMP_PARALLEL_FOR(size_t i = 0; i < leaves_size; + i += leaves_size_for_parallelization_) { size_t from = leaves_size - 1 + i; size_t to = from + leaves_size_for_parallelization_; BuildTreeFromLeaves(base::Range(from, to)); @@ -135,7 +135,7 @@ class BinaryMerkleTree final } base::CheckedNumeric n = leaves_size; storage_->Allocate(((n << 1) - 1).ValueOrDie()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < leaves_size; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < leaves_size; ++i) { storage_->SetHash(leaves_size + i - 1, hasher_->ComputeLeafHash(leaves[i])); } diff --git a/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/BUILD.bazel b/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/BUILD.bazel index 07cb3a0a5..0d17b5532 100644 --- a/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/BUILD.bazel +++ b/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/BUILD.bazel @@ -14,6 +14,7 @@ tachyon_cc_library( deps = [ "//tachyon/base:logging", "//tachyon/base:parallelize", + "//tachyon/base:sort", "//tachyon/base/containers:container_util", "//tachyon/math/finite_fields:extension_field_traits_forward", "//tachyon/math/finite_fields:finite_field_traits", @@ -31,6 +32,7 @@ tachyon_cc_library( deps = [ ":field_merkle_tree", "//tachyon/base:bits", + "//tachyon/base:sort", "//tachyon/crypto/commitments:mixed_matrix_commitment_scheme", ], ) diff --git a/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/field_merkle_tree.h b/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/field_merkle_tree.h index 70a177fcc..dbc57afca 100644 --- a/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/field_merkle_tree.h +++ b/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/field_merkle_tree.h @@ -19,6 +19,7 @@ #include "tachyon/base/containers/container_util.h" #include "tachyon/base/logging.h" #include "tachyon/base/parallelize.h" +#include "tachyon/base/sort.h" #include "tachyon/math/finite_fields/extension_field_traits_forward.h" #include "tachyon/math/finite_fields/finite_field_traits.h" #include "tachyon/math/finite_fields/packed_field_traits_forward.h" @@ -50,15 +51,11 @@ class FieldMerkleTree { std::vector>&& leaves) { CHECK(!leaves.empty()); - std::vector*> sorted_leaves = base::Map( - leaves, [](const math::RowMajorMatrix& matrix) { return &matrix; }); - // TODO(chokobole): Use https://github.com/timsort/cpp-TimSort or - // https://github.com/sebawild/powersort for better performance. 
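The replacement that follows swaps the std::stable_sort-with-lambda call for base::StableSort over RowMajorMatrixView, a pointer wrapper whose comparison operators are deliberately reversed because powersort takes no comparator. The general shape of that workaround, as a sketch with a hypothetical Item type standing in for RowMajorMatrixView / IndexedDimensions:

```cpp
// Sketch of the reversed-comparison wrapper pattern used below; `Item` is a
// hypothetical stand-in for RowMajorMatrixView / IndexedDimensions.
#include <vector>

#include "tachyon/base/sort.h"

struct Item {
  int height;
  // Reversed on purpose so a plain StableSort() yields descending heights.
  bool operator<(const Item& other) const { return height > other.height; }
  bool operator<=(const Item& other) const { return height >= other.height; }
  bool operator>(const Item& other) const { return height < other.height; }
};

void SortByDescendingHeight(std::vector<Item>& items) {
  tachyon::base::StableSort(items.begin(), items.end());
}
```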
- std::stable_sort( - sorted_leaves.begin(), sorted_leaves.end(), - [](const math::RowMajorMatrix* a, const math::RowMajorMatrix* b) { - return a->rows() > b->rows(); + std::vector sorted_leaves = + base::Map(leaves, [](const math::RowMajorMatrix& matrix) { + return RowMajorMatrixView(&matrix); }); + base::StableSort(sorted_leaves.begin(), sorted_leaves.end()); #if DCHECK_IS_ON() { @@ -81,9 +78,9 @@ class FieldMerkleTree { break; } } - absl::Span*> tallest_matrices = + absl::Span tallest_matrices = absl::MakeSpan(sorted_leaves.data(), first_layer_size); - absl::Span*> remaining_leaves = + absl::Span remaining_leaves = absl::MakeSpan(sorted_leaves.data() + first_layer_size, sorted_leaves.size() - first_layer_size); @@ -103,7 +100,7 @@ class FieldMerkleTree { break; } } - absl::Span*> matrices_to_inject; + absl::Span matrices_to_inject; if (next_layer_size > 0) { matrices_to_inject = remaining_leaves.subspan(0, next_layer_size); remaining_leaves.remove_prefix(next_layer_size); @@ -124,6 +121,32 @@ class FieldMerkleTree { const Digest& GetRoot() const { return digest_layers_.back()[0]; } private: + class RowMajorMatrixView { + public: + RowMajorMatrixView() = default; + explicit RowMajorMatrixView(const math::RowMajorMatrix* ptr) + : ptr_(ptr) {} + + // TODO(chokobole): This comparison is intentionally reversed to sort in + // descending order, as powersort doesn't accept custom callbacks. + bool operator<(const RowMajorMatrixView& other) const { + return ptr_->rows() > other.ptr_->rows(); + } + bool operator<=(const RowMajorMatrixView& other) const { + return ptr_->rows() >= other.ptr_->rows(); + } + bool operator>(const RowMajorMatrixView& other) const { + return ptr_->rows() < other.ptr_->rows(); + } + + const math::RowMajorMatrix* operator->() const { return ptr_; } + + const math::RowMajorMatrix& operator*() const { return *ptr_; } + + private: + const math::RowMajorMatrix* ptr_ = nullptr; + }; + FieldMerkleTree(std::vector>&& leaves, std::vector>&& digest_layers) : leaves_(std::move(leaves)), digest_layers_(std::move(digest_layers)) {} @@ -131,7 +154,7 @@ class FieldMerkleTree { template static std::vector CreateFirstDigestLayer( const Hasher& hasher, const PackedHasher& packed_hasher, - absl::Span*> tallest_matrices) { + absl::Span tallest_matrices) { size_t max_rows = static_cast(tallest_matrices[0]->rows()); size_t max_rows_padded = absl::bit_ceil(max_rows); @@ -143,8 +166,8 @@ class FieldMerkleTree { absl::Span chunk, size_t chunk_offset, size_t chunk_size) { size_t start = chunk_offset * chunk_size; if (chunk.size() == chunk_size) { - std::vector packed_prime_fields = base::FlatMap( - tallest_matrices, [start](const math::RowMajorMatrix* m) { + std::vector packed_prime_fields = + base::FlatMap(tallest_matrices, [start](RowMajorMatrixView m) { return math::PackRowVertically(*m, start); }); PackedDigest packed_digest = @@ -170,7 +193,7 @@ class FieldMerkleTree { const Hasher& hasher, const PackedHasher& packed_hasher, const Compressor& compressor, const PackedCompressor& packed_compressor, const std::vector& prev_layer, - absl::Span*> matrices_to_inject) { + absl::Span matrices_to_inject) { if (matrices_to_inject.empty()) return Compress(compressor, packed_compressor, prev_layer); @@ -202,7 +225,7 @@ class FieldMerkleTree { }; inputs[0] = packed_compressor.Compress(inputs); std::vector packed_prime_fields = base::FlatMap( - matrices_to_inject, [start](const math::RowMajorMatrix* m) { + matrices_to_inject, [start](RowMajorMatrixView m) { return math::PackRowVertically(*m, start); }); 
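The chunked hashing callbacks in this file (taking a chunk span, a chunk offset, and the chunk size) are the three-argument form that the parallelize.h refactor above now routes through internal::InvokeParallelizeCallback, with a direct call when there is only a single chunk. A minimal usage sketch of base::ParallelizeByChunkSize in that form, with illustrative values only:

```cpp
// Minimal usage sketch of the refactored base::ParallelizeByChunkSize
// (three-argument callback form); values and chunk size are illustrative.
#include <cstddef>
#include <vector>

#include "absl/types/span.h"
#include "tachyon/base/parallelize.h"

void AddChunkOffsets(std::vector<int>& values) {
  tachyon::base::ParallelizeByChunkSize(
      values, /*chunk_size=*/64,
      [](absl::Span<int> chunk, size_t chunk_offset, size_t chunk_size) {
        // Each chunk runs on its own OpenMP iteration; with a single chunk
        // the new fast path invokes this callback directly, skipping OpenMP.
        size_t start = chunk_offset * chunk_size;
        for (size_t i = 0; i < chunk.size(); ++i) {
          chunk[i] += static_cast<int>(start + i);
        }
      });
}
```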
inputs[1] = packed_hasher.Hash(packed_prime_fields); @@ -290,8 +313,8 @@ class FieldMerkleTree { } static std::vector GetRowAsPrimeFieldVector( - absl::Span*> matrices, size_t row) { - return base::FlatMap(matrices, [row](const math::RowMajorMatrix* m) { + absl::Span matrices, size_t row) { + return base::FlatMap(matrices, [row](RowMajorMatrixView m) { if constexpr (math::FiniteFieldTraits::kIsExtensionField) { static_assert( math::ExtensionFieldTraits::kDegreeOverBasePrimeField == diff --git a/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/field_merkle_tree_mmcs.h b/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/field_merkle_tree_mmcs.h index 16855e1fe..a0a69c5d7 100644 --- a/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/field_merkle_tree_mmcs.h +++ b/tachyon/crypto/commitments/merkle_tree/field_merkle_tree/field_merkle_tree_mmcs.h @@ -68,6 +68,18 @@ class FieldMerkleTreeMMCS final size_t index; math::Dimensions dimensions; + // TODO(chokobole): This comparison is intentionally reversed to sort in + // descending order, as powersort doesn't accept custom callbacks. + bool operator<(const IndexedDimensions& other) const { + return dimensions.height > other.dimensions.height; + } + bool operator<=(const IndexedDimensions& other) const { + return dimensions.height >= other.dimensions.height; + } + bool operator>(const IndexedDimensions& other) const { + return dimensions.height < other.dimensions.height; + } + std::string ToString() const { return absl::Substitute("($0, $1)", index, dimensions.ToString()); } @@ -131,13 +143,8 @@ class FieldMerkleTreeMMCS final return IndexedDimensions{index, dimensions}; }); - // TODO(chokobole): Use https://github.com/timsort/cpp-TimSort or - // https://github.com/sebawild/powersort for better performance. 
- std::stable_sort( - sorted_dimensions_list.begin(), sorted_dimensions_list.end(), - [](const IndexedDimensions& a, const IndexedDimensions& b) { - return a.dimensions.height > b.dimensions.height; - }); + base::StableSort(sorted_dimensions_list.begin(), + sorted_dimensions_list.end()); absl::Span remaining_dimensions_list = absl::MakeConstSpan(sorted_dimensions_list); diff --git a/tachyon/crypto/commitments/mixed_matrix_commitment_scheme.h b/tachyon/crypto/commitments/mixed_matrix_commitment_scheme.h index d890d4d99..381087d04 100644 --- a/tachyon/crypto/commitments/mixed_matrix_commitment_scheme.h +++ b/tachyon/crypto/commitments/mixed_matrix_commitment_scheme.h @@ -28,7 +28,7 @@ class MixedMatrixCommitmentScheme { [[nodiscard]] bool Commit(const std::vector& vector, Commitment* commitment, ProverData* prover_data) { math::RowMajorMatrix matrix(vector.size(), 1); - OPENMP_PARALLEL_FOR(size_t i = 0; i < vector.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < vector.size(); ++i) { matrix(i, 0) = vector[i]; } return Commit(std::move(matrix), commitment, prover_data); diff --git a/tachyon/crypto/commitments/polynomial_openings.h b/tachyon/crypto/commitments/polynomial_openings.h index 1b47fc38f..aa64eca79 100644 --- a/tachyon/crypto/commitments/polynomial_openings.h +++ b/tachyon/crypto/commitments/polynomial_openings.h @@ -155,7 +155,7 @@ struct GroupedPolynomialOpenings { const Field& r, const std::vector& low_degree_extensions) const { // numerators: [P₀(X) - R₀(X), P₁(X) - R₁(X), P₂(X) - R₂(X)] std::vector numerators(low_degree_extensions.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < low_degree_extensions.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < low_degree_extensions.size(); ++i) { numerators[i] = *poly_openings_vec[i].poly_oracle - low_degree_extensions[i]; } diff --git a/tachyon/crypto/sumcheck/multilinear/sumcheck_prover.h b/tachyon/crypto/sumcheck/multilinear/sumcheck_prover.h index cfeaec3bf..598a4d231 100644 --- a/tachyon/crypto/sumcheck/multilinear/sumcheck_prover.h +++ b/tachyon/crypto/sumcheck/multilinear/sumcheck_prover.h @@ -99,7 +99,7 @@ class SumcheckProver { std::vector> finished_evaluations( num_chunks, std::vector(max_evaluations_ + 1, F::Zero())); - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { size_t begin = i * chunk_size; size_t len = (i == num_chunks - 1) ? size - begin : chunk_size; std::vector intermediate_evaluations(max_evaluations_ + 1, F::Zero()); diff --git a/tachyon/crypto/sumcheck/multilinear/sumcheck_verifier.h b/tachyon/crypto/sumcheck/multilinear/sumcheck_verifier.h index f42a9e506..cfb69b08b 100644 --- a/tachyon/crypto/sumcheck/multilinear/sumcheck_verifier.h +++ b/tachyon/crypto/sumcheck/multilinear/sumcheck_verifier.h @@ -190,7 +190,7 @@ F InterpolateUniPoly(const std::vector& poly, const F& evaluation_point) { std::vector products(num_chunks, F::One()); std::vector denom_ups(num_chunks, F::One()); std::vector> list_of_evals(num_chunks); - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { size_t begin = i * chunk_size; size_t len = (i == num_chunks - 1) ? 
poly_size - begin : chunk_size; list_of_evals[i].reserve(len); diff --git a/tachyon/math/base/batch_inverse_benchmark.cc b/tachyon/math/base/batch_inverse_benchmark.cc index 33e4e0726..5f656b097 100644 --- a/tachyon/math/base/batch_inverse_benchmark.cc +++ b/tachyon/math/base/batch_inverse_benchmark.cc @@ -36,7 +36,7 @@ void BM_InverseParallelFor(benchmark::State& state) { std::vector fields = base::CreateVectorParallel( state.range(0), [](size_t i) { return F::FromBigInt(BigInt(i + 1)); }); for (auto _ : state) { - OPENMP_PARALLEL_FOR(size_t i = 0; i < fields.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < fields.size(); ++i) { CHECK(fields[i].InverseInPlace()); } } diff --git a/tachyon/math/base/groups.h b/tachyon/math/base/groups.h index a6c61ff9d..d6f251b56 100644 --- a/tachyon/math/base/groups.h +++ b/tachyon/math/base/groups.h @@ -91,7 +91,7 @@ class MultiplicativeGroup : public MultiplicativeSemigroup { size_t chunk_size = base::GetNumElementsPerThread(groups); size_t num_chunks = (size + chunk_size - 1) / chunk_size; std::atomic check_valid(true); - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { size_t len = i == num_chunks - 1 ? size - i * chunk_size : chunk_size; absl::Span groups_chunk(std::data(groups) + i * chunk_size, len); diff --git a/tachyon/math/base/semigroups.h b/tachyon/math/base/semigroups.h index f3a68c9d6..4983de25c 100644 --- a/tachyon/math/base/semigroups.h +++ b/tachyon/math/base/semigroups.h @@ -556,7 +556,7 @@ class AdditiveSemigroup { LOG(ERROR) << "scalars and bases are empty"; return false; } - OPENMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { (*outputs)[i] = bases[i].ScalarMul(scalars[i]); } return true; @@ -572,7 +572,7 @@ class AdditiveSemigroup { LOG(ERROR) << "scalars are empty"; return false; } - OPENMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { (*outputs)[i] = base.ScalarMul(scalars[i]); } return true; @@ -589,7 +589,7 @@ class AdditiveSemigroup { LOG(ERROR) << "bases are empty"; return false; } - OPENMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { (*outputs)[i] = bases[i].ScalarMul(scalar); } return true; diff --git a/tachyon/math/elliptic_curves/msm/algorithms/pippenger/pippenger.h b/tachyon/math/elliptic_curves/msm/algorithms/pippenger/pippenger.h index 4c23d64ce..8cc8962f5 100644 --- a/tachyon/math/elliptic_curves/msm/algorithms/pippenger/pippenger.h +++ b/tachyon/math/elliptic_curves/msm/algorithms/pippenger/pippenger.h @@ -144,7 +144,7 @@ class Pippenger : public PippengerBase { FillDigits(scalars[i], ctx_.window_bits, &scalar_digits[i]); } if (parallel_windows_) { - OPENMP_PARALLEL_FOR(size_t i = 0; i < ctx_.window_count; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < ctx_.window_count; ++i) { AccumulateSingleWindowNAFSum(bases_first, scalar_digits, i, &(*window_sums)[i], i == ctx_.window_count - 1); @@ -203,7 +203,7 @@ class Pippenger : public PippengerBase { absl::Span> scalars, std::vector* window_sums) { if (parallel_windows_) { - OPENMP_PARALLEL_FOR(size_t i = 0; i < ctx_.window_count; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < ctx_.window_count; ++i) { AccumulateSingleWindowSum(bases_first, scalars, ctx_.window_bits * i, &(*window_sums)[i]); } diff --git a/tachyon/math/elliptic_curves/msm/algorithms/pippenger/pippenger_adapter.h b/tachyon/math/elliptic_curves/msm/algorithms/pippenger/pippenger_adapter.h index 
c94d146a3..f79c9147e 100644 --- a/tachyon/math/elliptic_curves/msm/algorithms/pippenger/pippenger_adapter.h +++ b/tachyon/math/elliptic_curves/msm/algorithms/pippenger/pippenger_adapter.h @@ -82,7 +82,7 @@ class PippengerAdapter { size_t num_chunks = (scalars_size + chunk_size - 1) / chunk_size; std::vector results; results.resize(num_chunks); - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { size_t start = i * chunk_size; size_t len = i == num_chunks - 1 ? scalars_size - start : chunk_size; Pippenger pippenger; diff --git a/tachyon/math/elliptic_curves/msm/fixed_base_msm.h b/tachyon/math/elliptic_curves/msm/fixed_base_msm.h index 2c09ec096..0fbdab43b 100644 --- a/tachyon/math/elliptic_curves/msm/fixed_base_msm.h +++ b/tachyon/math/elliptic_curves/msm/fixed_base_msm.h @@ -127,7 +127,7 @@ class FixedBaseMSM { LOG(ERROR) << "the size of scalar and output iterators don't match "; return false; } - OPENMP_PARALLEL_FOR(difference_type i = 0; i < size; ++i) { + OMP_PARALLEL_FOR(difference_type i = 0; i < size; ++i) { *(outputs_first + i) = ScalarMul(*(scalars_first + i)); } return true; @@ -201,7 +201,7 @@ class FixedBaseMSM { base_multiples_ = std::vector>( window_count, std::vector(window_size)); - OPENMP_PARALLEL_FOR(size_t i = 0; i < window_count; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < window_count; ++i) { size_t cur_window_size = i == window_count - 1 ? last_window_size : window_size; diff --git a/tachyon/math/elliptic_curves/short_weierstrass/jacobian_point.h b/tachyon/math/elliptic_curves/short_weierstrass/jacobian_point.h index 5e1099fff..1915a1d02 100644 --- a/tachyon/math/elliptic_curves/short_weierstrass/jacobian_point.h +++ b/tachyon/math/elliptic_curves/short_weierstrass/jacobian_point.h @@ -120,7 +120,7 @@ class JacobianPoint< ScalarField::kParallelBatchInverseDivisorThreshold)) { size_t chunk_size = base::GetNumElementsPerThread(jacobian_points); size_t num_chunks = (size + chunk_size - 1) / chunk_size; - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { size_t len = i == num_chunks - 1 ? size - i * chunk_size : chunk_size; absl::Span> affine_points_chunk( std::data(*affine_points) + i * chunk_size, len); diff --git a/tachyon/math/elliptic_curves/short_weierstrass/point_xyzz.h b/tachyon/math/elliptic_curves/short_weierstrass/point_xyzz.h index 0b9de58f4..590516d3d 100644 --- a/tachyon/math/elliptic_curves/short_weierstrass/point_xyzz.h +++ b/tachyon/math/elliptic_curves/short_weierstrass/point_xyzz.h @@ -123,7 +123,7 @@ class PointXYZZ<_Curve, ScalarField::kParallelBatchInverseDivisorThreshold)) { size_t chunk_size = base::GetNumElementsPerThread(point_xyzzs); size_t num_chunks = (size + chunk_size - 1) / chunk_size; - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { size_t len = i == num_chunks - 1 ? 
size - i * chunk_size : chunk_size; absl::Span> affine_points_chunk( std::data(*affine_points) + i * chunk_size, len); diff --git a/tachyon/math/elliptic_curves/short_weierstrass/projective_point.h b/tachyon/math/elliptic_curves/short_weierstrass/projective_point.h index dba9e37ea..6b1ef006d 100644 --- a/tachyon/math/elliptic_curves/short_weierstrass/projective_point.h +++ b/tachyon/math/elliptic_curves/short_weierstrass/projective_point.h @@ -119,7 +119,7 @@ class ProjectivePoint< ScalarField::kParallelBatchInverseDivisorThreshold)) { size_t chunk_size = base::GetNumElementsPerThread(projective_points); size_t num_chunks = (size + chunk_size - 1) / chunk_size; - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { size_t len = i == num_chunks - 1 ? size - i * chunk_size : chunk_size; absl::Span> affine_points_chunk( std::data(*affine_points) + i * chunk_size, len); diff --git a/tachyon/math/matrix/matrix_operations.h b/tachyon/math/matrix/matrix_operations.h index d2a4d74af..19fe5a6aa 100644 --- a/tachyon/math/matrix/matrix_operations.h +++ b/tachyon/math/matrix/matrix_operations.h @@ -43,7 +43,7 @@ math::Vector MulMatVec(const Eigen::MatrixBase& matrix, static_assert(std::is_same_v); math::Vector ret = math::Vector::Constant(vector.size(), F::Zero()); - OPENMP_PARALLEL_FOR(Eigen::Index i = 0; i < matrix.rows(); ++i) { + OMP_PARALLEL_FOR(Eigen::Index i = 0; i < matrix.rows(); ++i) { for (Eigen::Index j = 0; j < matrix.cols(); ++j) { ret[i] += matrix(i, j) * vector[j]; } @@ -75,13 +75,13 @@ math::Vector MulMatVec( math::Vector ret = math::Vector::Constant(vector.size(), F::Zero()); if (vector.rows() == 1) { - OPENMP_PARALLEL_FOR(Eigen::Index i = 0; i < matrix.rows(); ++i) { + OMP_PARALLEL_FOR(Eigen::Index i = 0; i < matrix.rows(); ++i) { for (Eigen::Index j = 0; j < matrix.cols(); ++j) { ret[i] += matrix(i, j) * vector(0, j); } } } else if (vector.cols() == 1) { - OPENMP_PARALLEL_FOR(Eigen::Index i = 0; i < matrix.rows(); ++i) { + OMP_PARALLEL_FOR(Eigen::Index i = 0; i < matrix.rows(); ++i) { for (Eigen::Index j = 0; j < matrix.cols(); ++j) { ret[i] += matrix(i, j) * vector(j, 0); } @@ -127,7 +127,7 @@ math::Matrix MulMatMat(const Eigen::MatrixBase& matrix, math::Matrix ret = math::Matrix::Constant(matrix.rows(), matrix2.cols(), F::Zero()); - OPENMP_PARALLEL_FOR(Eigen::Index i = 0; i < matrix.rows(); ++i) { + OMP_PARALLEL_FOR(Eigen::Index i = 0; i < matrix.rows(); ++i) { for (Eigen::Index j = 0; j < matrix.cols(); ++j) { for (Eigen::Index k = 0; k < matrix2.cols(); ++k) { ret(i, k) += matrix(i, j) * matrix2(j, k); diff --git a/tachyon/math/matrix/matrix_utils.h b/tachyon/math/matrix/matrix_utils.h index a2aa7330f..28843f6e7 100644 --- a/tachyon/math/matrix/matrix_utils.h +++ b/tachyon/math/matrix/matrix_utils.h @@ -154,7 +154,7 @@ void ExpandInPlaceWithZeroPad(Eigen::MatrixBase& mat, Derived padded = Derived::Zero(new_rows, cols); - OPENMP_PARALLEL_FOR(Eigen::Index row = 0; row < original_rows; ++row) { + OMP_PARALLEL_FOR(Eigen::Index row = 0; row < original_rows; ++row) { Eigen::Index padded_row_index = row << added_bits; // TODO(ashjeong): Check if moved properly padded.row(padded_row_index) = std::move(mat.row(row)); @@ -173,7 +173,7 @@ void ReverseMatrixIndexBits(Eigen::MatrixBase& mat) { } uint32_t log_n = base::bits::CheckedLog2(rows); - OPENMP_PARALLEL_FOR(size_t row = 1; row < rows; ++row) { + OMP_PARALLEL_FOR(size_t row = 1; row < rows; ++row) { size_t ridx = base::bits::ReverseBitsLen(row, log_n); if (row < ridx) { 
mat.row(row).swap(mat.row(ridx)); diff --git a/tachyon/math/polynomials/multivariate/linear_combination_term.h b/tachyon/math/polynomials/multivariate/linear_combination_term.h index ac49788b1..e92dc7ac7 100644 --- a/tachyon/math/polynomials/multivariate/linear_combination_term.h +++ b/tachyon/math/polynomials/multivariate/linear_combination_term.h @@ -51,7 +51,7 @@ struct LinearCombinationTerm { size_t num_chunks = (size + chunk_size - 1) / chunk_size; std::vector sums(num_chunks, F::Zero()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { size_t start = i * chunk_size; size_t len = (i == num_chunks - 1) ? size - start : chunk_size; for (size_t j = start; j < start + len; ++j) { diff --git a/tachyon/math/polynomials/multivariate/multilinear_extension_ops.h b/tachyon/math/polynomials/multivariate/multilinear_extension_ops.h index 94cb6231c..40390e097 100644 --- a/tachyon/math/polynomials/multivariate/multilinear_extension_ops.h +++ b/tachyon/math/polynomials/multivariate/multilinear_extension_ops.h @@ -32,7 +32,7 @@ class MultilinearExtensionOp> { } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); std::vector o_evaluations(r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { o_evaluations[i] = l_evaluations[i] + r_evaluations[i]; } return MultilinearExtension(D(std::move(o_evaluations))); @@ -51,7 +51,7 @@ class MultilinearExtensionOp> { return self; } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { l_evaluations[i] += r_evaluations[i]; } return self; @@ -71,7 +71,7 @@ class MultilinearExtensionOp> { } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); std::vector o_evaluations(r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { o_evaluations[i] = l_evaluations[i] - r_evaluations[i]; } return MultilinearExtension(D(std::move(o_evaluations))); @@ -90,7 +90,7 @@ class MultilinearExtensionOp> { return self; } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { l_evaluations[i] -= r_evaluations[i]; } return self; @@ -102,7 +102,7 @@ class MultilinearExtensionOp> { return self; } std::vector o_evaluations(i_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < o_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < o_evaluations.size(); ++i) { o_evaluations[i] = -i_evaluations[i]; } return MultilinearExtension(D(std::move(o_evaluations))); @@ -114,7 +114,7 @@ class MultilinearExtensionOp> { return self; } // clang-format off - OPENMP_PARALLEL_FOR(F& evaluation : evaluations) { + OMP_PARALLEL_FOR(F& evaluation : evaluations) { // clang-format on evaluation.NegateInPlace(); } @@ -131,7 +131,7 @@ class MultilinearExtensionOp> { } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); std::vector o_evaluations(r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < l_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < l_evaluations.size(); ++i) { o_evaluations[i] = l_evaluations[i] * r_evaluations[i]; } return MultilinearExtension(D(std::move(o_evaluations))); @@ -151,7 +151,7 @@ class MultilinearExtensionOp> { 
return self; } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < l_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < l_evaluations.size(); ++i) { l_evaluations[i] *= r_evaluations[i]; } return self; @@ -178,7 +178,7 @@ class MultilinearExtensionOp> { } std::vector o_evaluations(r_evaluations.size()); std::atomic check_valid(true); - OPENMP_PARALLEL_FOR(size_t i = 0; i < l_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < l_evaluations.size(); ++i) { std::optional div = l_evaluations[i] / r_evaluations[i]; if (UNLIKELY(!div)) { check_valid.store(false, std::memory_order_relaxed); @@ -214,7 +214,7 @@ class MultilinearExtensionOp> { return std::nullopt; } std::atomic check_valid(true); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { if (UNLIKELY(!(l_evaluations[i] /= r_evaluations[i]))) check_valid.store(false, std::memory_order_relaxed); } diff --git a/tachyon/math/polynomials/multivariate/multivariate_polynomial_ops.h b/tachyon/math/polynomials/multivariate/multivariate_polynomial_ops.h index 05229c11e..c8218691a 100644 --- a/tachyon/math/polynomials/multivariate/multivariate_polynomial_ops.h +++ b/tachyon/math/polynomials/multivariate/multivariate_polynomial_ops.h @@ -79,7 +79,7 @@ class MultivariatePolynomialOp> { } const Terms& i_terms = self.coefficients_.terms_; Terms o_terms(i_terms.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < o_terms.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < o_terms.size(); ++i) { o_terms[i] = -i_terms[i]; } return MultivariatePolynomial( @@ -93,7 +93,7 @@ class MultivariatePolynomialOp> { } Terms& terms = self.coefficients_.terms_; // clang-format off - OPENMP_PARALLEL_FOR(Term& term : terms) { term.coefficient.NegateInPlace(); } + OMP_PARALLEL_FOR(Term& term : terms) { term.coefficient.NegateInPlace(); } // clang-format on return self; } diff --git a/tachyon/math/polynomials/univariate/BUILD.bazel b/tachyon/math/polynomials/univariate/BUILD.bazel index 8c7d40b9e..cbb2e13ca 100644 --- a/tachyon/math/polynomials/univariate/BUILD.bazel +++ b/tachyon/math/polynomials/univariate/BUILD.bazel @@ -154,6 +154,7 @@ tachyon_cc_library( "//tachyon/base:logging", "//tachyon/base:optional", "//tachyon/base:parallelize", + "//tachyon/base:sort", "//tachyon/base/buffer:copyable", "//tachyon/base/containers:adapters", "//tachyon/base/containers:container_util", @@ -166,7 +167,6 @@ tachyon_cc_library( "@com_google_absl//absl/hash", "@com_google_absl//absl/numeric:bits", "@com_google_absl//absl/types:span", - "@pdqsort", ], ) diff --git a/tachyon/math/polynomials/univariate/evaluations_utils.h b/tachyon/math/polynomials/univariate/evaluations_utils.h index c34fc3965..7587ddf71 100644 --- a/tachyon/math/polynomials/univariate/evaluations_utils.h +++ b/tachyon/math/polynomials/univariate/evaluations_utils.h @@ -30,7 +30,7 @@ template void SwapBitRevElementsInPlace(Container& container, size_t size, size_t log_len) { if (size <= 1) return; - OPENMP_PARALLEL_FOR(size_t idx = 1; idx < size; ++idx) { + OMP_PARALLEL_FOR(size_t idx = 1; idx < size; ++idx) { size_t ridx = base::bits::ReverseBitsLen(idx, log_len); if (idx < ridx) { std::swap(container.at(idx), container.at(ridx)); diff --git a/tachyon/math/polynomials/univariate/lagrange_interpolation_unittest.cc b/tachyon/math/polynomials/univariate/lagrange_interpolation_unittest.cc index 979f9959d..85d968fec 100644 --- 
a/tachyon/math/polynomials/univariate/lagrange_interpolation_unittest.cc +++ b/tachyon/math/polynomials/univariate/lagrange_interpolation_unittest.cc @@ -24,7 +24,7 @@ TEST(LagrangeInterpolationTest, LagrangeInterpolate) { UnivariateDensePolynomial poly; EXPECT_TRUE(LagrangeInterpolate(points, evals, &poly)); - OPENMP_PARALLEL_FOR(size_t i = 0; i < points.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < points.size(); ++i) { EXPECT_EQ(poly.Evaluate(points[i]), evals[i]); } } diff --git a/tachyon/math/polynomials/univariate/mixed_radix_evaluation_domain.h b/tachyon/math/polynomials/univariate/mixed_radix_evaluation_domain.h index 82664495d..af4521cb9 100644 --- a/tachyon/math/polynomials/univariate/mixed_radix_evaluation_domain.h +++ b/tachyon/math/polynomials/univariate/mixed_radix_evaluation_domain.h @@ -102,7 +102,7 @@ class MixedRadixEvaluationDomain BestFFT(poly, this->group_gen_inv_); if (this->offset_.IsOne()) { // clang-format off - OPENMP_PARALLEL_FOR(F& coeff : poly.coefficients_.coefficients_) { + OMP_PARALLEL_FOR(F& coeff : poly.coefficients_.coefficients_) { // clang-format on coeff *= this->size_inv_; } diff --git a/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h b/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h index d350a0eba..702d4a6d3 100644 --- a/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h +++ b/tachyon/math/polynomials/univariate/radix2_evaluation_domain.h @@ -137,7 +137,7 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, // an explanation) std::vector weights = F::GetSuccessivePowers(this->size_, shift, this->size_inv_); - OPENMP_PARALLEL_FOR(size_t row = 0; row < weights.size(); ++row) { + OMP_PARALLEL_FOR(size_t row = 0; row < weights.size(); ++row) { // Reverse bits because |mat| is encoded in bit-reversed order mat.row(base::bits::ReverseBitsLen(row, this->log_size_of_group_)) *= weights[row]; @@ -227,7 +227,7 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, IFFTHelperInPlace(poly); if (this->offset_.IsOne()) { // clang-format off - OPENMP_PARALLEL_FOR(F& val : poly.coefficients_.coefficients_) { + OMP_PARALLEL_FOR(F& val : poly.coefficients_.coefficients_) { // clang-format on val *= this->size_inv_; } @@ -262,8 +262,8 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, // Each butterfly cluster uses 2 * |gap| positions. size_t chunk_size = 2 * gap; - OPENMP_PARALLEL_NESTED_FOR(size_t i = 0; i < poly_or_evals.NumElements(); - i += chunk_size) { + OMP_PARALLEL_NESTED_FOR(size_t i = 0; i < poly_or_evals.NumElements(); + i += chunk_size) { for (size_t j = 0; j < gap; ++j) { fn(poly_or_evals.at(i + j), poly_or_evals.at(i + j + gap), roots[j]); } @@ -322,8 +322,8 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, } } - roots_vec_[this->log_size_of_group_ - 1] = largest; - inv_roots_vec_[0] = largest_inv; + roots_vec_[this->log_size_of_group_ - 1] = std::move(largest); + inv_roots_vec_[0] = std::move(largest_inv); // Prepare space in each vector for the others. size_t size = this->size_ / 2; @@ -334,7 +334,7 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, } // Assign every element based on the biggest vector. 
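Just above, assigning largest and largest_inv into roots_vec_ / inv_roots_vec_ now uses std::move, so the precomputed twiddle vectors are transferred rather than copied. A generic illustration of the difference, with illustrative types only:

```cpp
// Generic illustration: moving a std::vector transfers its buffer instead of
// copying every element; the moved-from vector is left empty but valid.
#include <utility>
#include <vector>

void MoveVsCopy() {
  std::vector<int> src(1 << 20, 7);
  std::vector<int> copied = src;            // O(n) element copy.
  std::vector<int> moved = std::move(src);  // O(1) buffer transfer.
  (void)copied;
  (void)moved;
}
```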
- OPENMP_PARALLEL_FOR(size_t i = 1; i < this->log_size_of_group_; ++i) { + OMP_PARALLEL_FOR(size_t i = 1; i < this->log_size_of_group_; ++i) { for (size_t j = 0; j < this->size_ / std::pow(2, i + 1); ++j) { size_t k = std::pow(2, i) * j; roots_vec_[this->log_size_of_group_ - i - 1][j] = roots_vec_.back()[k]; @@ -374,8 +374,8 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, size_t chunk_rows = 1 << mid_; // max block size: 2^|mid_| - // TODO(ashjeong): benchmark between |OPENMP_PARALLEL_FOR| here vs - // |OPENMP_PARALLEL_NESTED_FOR| in |RunDitLayers| + // TODO(ashjeong): benchmark between |OMP_PARALLEL_FOR| here vs + // |OMP_PARALLEL_NESTED_FOR| in |RunDitLayers| for (size_t block_start = 0; block_start < this->size_; block_start += chunk_rows) { size_t cur_chunk_rows = std::min(chunk_rows, this->size_ - block_start); @@ -400,8 +400,8 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, size_t chunk_rows = 1 << (this->log_size_of_group_ - mid_); // max block size: 2^(|this->log_size_of_group_| - |mid_|) - // TODO(ashjeong): benchmark between |OPENMP_PARALLEL_FOR| here vs - // |OPENMP_PARALLEL_NESTED_FOR| in |RunDitLayers| + // TODO(ashjeong): benchmark between |OMP_PARALLEL_FOR| here vs + // |OMP_PARALLEL_NESTED_FOR| in |RunDitLayers| for (size_t block_start = 0; block_start < this->size_; block_start += chunk_rows) { size_t thread = block_start / chunk_rows; @@ -433,8 +433,8 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, size_t sub_rows = static_cast(submat.rows()); DCHECK_GE(sub_rows, block_size); - OPENMP_PARALLEL_NESTED_FOR(size_t block_start = 0; block_start < sub_rows; - block_start += block_size) { + OMP_PARALLEL_NESTED_FOR(size_t block_start = 0; block_start < sub_rows; + block_start += block_size) { for (size_t i = 0; i < half_block_size; ++i) { size_t lo = block_start + i; size_t hi = lo + half_block_size; @@ -464,7 +464,7 @@ class Radix2EvaluationDomain : public UnivariateEvaluationDomain, std::vector shorts_2 = PackRowHorizontally(row_2_block, suffix_2); - OPENMP_PARALLEL_FOR(size_t i = 0; i < shorts_1.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < shorts_1.size(); ++i) { UnivariateEvaluationDomain::template ButterflyFnOutIn< PackedPrimeField>(*shorts_1[i], *shorts_2[i], packed_twiddle); } diff --git a/tachyon/math/polynomials/univariate/two_adic_subgroup.h b/tachyon/math/polynomials/univariate/two_adic_subgroup.h index 427b41fb9..cb504b002 100644 --- a/tachyon/math/polynomials/univariate/two_adic_subgroup.h +++ b/tachyon/math/polynomials/univariate/two_adic_subgroup.h @@ -50,7 +50,7 @@ class TwoAdicSubgroup { Eigen::Index cols = mat.cols(); std::vector weights = F::GetSuccessivePowers(rows, shift); - OPENMP_PARALLEL_NESTED_FOR(Eigen::Index row = 0; row < rows; ++row) { + OMP_PARALLEL_NESTED_FOR(Eigen::Index row = 0; row < rows; ++row) { for (Eigen::Index col = 0; col < cols; ++col) { mat(row, col) *= weights[row]; } diff --git a/tachyon/math/polynomials/univariate/univariate_dense_coefficients.h b/tachyon/math/polynomials/univariate/univariate_dense_coefficients.h index 812e4732c..06a5c5d0a 100644 --- a/tachyon/math/polynomials/univariate/univariate_dense_coefficients.h +++ b/tachyon/math/polynomials/univariate/univariate_dense_coefficients.h @@ -179,7 +179,7 @@ class UnivariateDenseCoefficients { Fold(const Field& r) const { size_t size = coefficients_.size(); std::vector coefficients((size + 1) >> 1); - OPENMP_PARALLEL_FOR(size_t i = 0; i < size; i += 2) { + OMP_PARALLEL_FOR(size_t i = 0; i < size; i += 2) { 
coefficients[i >> 1] = coefficients_[i + 1] * r; coefficients[i >> 1] += coefficients_[i]; } diff --git a/tachyon/math/polynomials/univariate/univariate_evaluation_domain.h b/tachyon/math/polynomials/univariate/univariate_evaluation_domain.h index b3fb88a46..c694dcd02 100644 --- a/tachyon/math/polynomials/univariate/univariate_evaluation_domain.h +++ b/tachyon/math/polynomials/univariate/univariate_evaluation_domain.h @@ -454,7 +454,7 @@ class UnivariateEvaluationDomain : public EvaluationDomain { // Invariant: |pow| = |c|*|g|ⁱ at the i-th iteration of the loop size_t size = poly_or_evals.NumElements(); size_t num_elems_per_thread = std::max(size / thread_nums, size_t{1024}); - OPENMP_PARALLEL_FOR(size_t i = 0; i < size; i += num_elems_per_thread) { + OMP_PARALLEL_FOR(size_t i = 0; i < size; i += num_elems_per_thread) { F pow = c * g.Pow(i); for (size_t j = 0; j < num_elems_per_thread; ++j) { if (i + j >= size) break; diff --git a/tachyon/math/polynomials/univariate/univariate_evaluations_ops.h b/tachyon/math/polynomials/univariate/univariate_evaluations_ops.h index 237471ea2..ea1dbcfde 100644 --- a/tachyon/math/polynomials/univariate/univariate_evaluations_ops.h +++ b/tachyon/math/polynomials/univariate/univariate_evaluations_ops.h @@ -36,7 +36,7 @@ class UnivariateEvaluationsOp { } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); std::vector o_evaluations(r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { o_evaluations[i] = l_evaluations[i] + r_evaluations[i]; } return Poly(std::move(o_evaluations)); @@ -54,7 +54,7 @@ class UnivariateEvaluationsOp { return self; } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { l_evaluations[i] += r_evaluations[i]; } return self; @@ -73,7 +73,7 @@ class UnivariateEvaluationsOp { } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); std::vector o_evaluations(r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { o_evaluations[i] = l_evaluations[i] - r_evaluations[i]; } return Poly(std::move(o_evaluations)); @@ -91,7 +91,7 @@ class UnivariateEvaluationsOp { return self; } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { l_evaluations[i] -= r_evaluations[i]; } return self; @@ -103,7 +103,7 @@ class UnivariateEvaluationsOp { return self; } std::vector o_evaluations(i_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < i_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < i_evaluations.size(); ++i) { o_evaluations[i] = -i_evaluations[i]; } return Poly(std::move(o_evaluations)); @@ -115,7 +115,7 @@ class UnivariateEvaluationsOp { return self; } // clang-format off - OPENMP_PARALLEL_FOR(F& evaluation : evaluations) { + OMP_PARALLEL_FOR(F& evaluation : evaluations) { // clang-format on evaluation.NegateInPlace(); } @@ -131,7 +131,7 @@ class UnivariateEvaluationsOp { } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); std::vector o_evaluations(r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { o_evaluations[i] = l_evaluations[i] * 
r_evaluations[i]; } return Poly(std::move(o_evaluations)); @@ -150,7 +150,7 @@ class UnivariateEvaluationsOp { return self; } CHECK_EQ(l_evaluations.size(), r_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { l_evaluations[i] *= r_evaluations[i]; } return self; @@ -167,7 +167,7 @@ class UnivariateEvaluationsOp { return self; } std::vector o_evaluations(l_evaluations.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < l_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < l_evaluations.size(); ++i) { o_evaluations[i] = l_evaluations[i] * scalar; } return Poly(std::move(o_evaluations)); @@ -179,7 +179,7 @@ class UnivariateEvaluationsOp { // 0 * s or f(x) * 1 return self; } - OPENMP_PARALLEL_FOR(size_t i = 0; i < l_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < l_evaluations.size(); ++i) { l_evaluations[i] *= scalar; } return self; @@ -205,7 +205,7 @@ class UnivariateEvaluationsOp { } std::vector o_evaluations(r_evaluations.size()); std::atomic check_valid(true); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { const std::optional div = l_evaluations[i] / r_evaluations[i]; if (UNLIKELY(!div)) { check_valid.store(false, std::memory_order_relaxed); @@ -239,7 +239,7 @@ class UnivariateEvaluationsOp { return std::nullopt; } std::atomic check_valid(true); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_evaluations.size(); ++i) { if (UNLIKELY(!(l_evaluations[i] /= r_evaluations[i]))) check_valid.store(false, std::memory_order_relaxed); } diff --git a/tachyon/math/polynomials/univariate/univariate_polynomial_ops.h b/tachyon/math/polynomials/univariate/univariate_polynomial_ops.h index f2a7534e2..478529438 100644 --- a/tachyon/math/polynomials/univariate/univariate_polynomial_ops.h +++ b/tachyon/math/polynomials/univariate/univariate_polynomial_ops.h @@ -12,10 +12,9 @@ #include #include -#include "third_party/pdqsort/include/pdqsort.h" - #include "tachyon/base/openmp_util.h" #include "tachyon/base/optional.h" +#include "tachyon/base/sort.h" #include "tachyon/math/base/arithmetics_results.h" #include "tachyon/math/polynomials/univariate/univariate_polynomial.h" @@ -43,7 +42,7 @@ class UnivariatePolynomialOp> { std::vector& o_coefficients = ret.coefficients_.coefficients_; o_coefficients.resize( std::max(l_coefficients.size(), r_coefficients.size())); - OPENMP_PARALLEL_FOR(size_t i = 0; i < o_coefficients.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < o_coefficients.size(); ++i) { o_coefficients[i] = self.coefficients_[i] + other.coefficients_[i]; } @@ -63,7 +62,7 @@ class UnivariatePolynomialOp> { const std::vector& r_coefficients = other.coefficients_.coefficients_; l_coefficients.resize( std::max(l_coefficients.size(), r_coefficients.size())); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_coefficients.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_coefficients.size(); ++i) { l_coefficients[i] += r_coefficients[i]; } @@ -84,7 +83,7 @@ class UnivariatePolynomialOp> { UnivariatePolynomial ret; std::vector& o_coefficients = ret.coefficients_.coefficients_; o_coefficients.resize(std::max(degree, other_degree) + 1); - OPENMP_PARALLEL_FOR(size_t i = 0; i < o_coefficients.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < o_coefficients.size(); ++i) { o_coefficients[i] = self.coefficients_[i] + other.coefficients()[i]; } @@ 
-109,7 +108,7 @@ class UnivariatePolynomialOp> { std::vector& l_coefficients = self.coefficients_.coefficients_; const std::vector& r_terms = other.coefficients().terms_; - OPENMP_PARALLEL_FOR(const Term& r_term : r_terms) { + OMP_PARALLEL_FOR(const Term& r_term : r_terms) { if (r_term.degree <= degree) { l_coefficients[r_term.degree] += r_term.coefficient; } else { @@ -138,7 +137,7 @@ class UnivariatePolynomialOp> { std::vector& o_coefficients = ret.coefficients_.coefficients_; o_coefficients.resize( std::max(l_coefficients.size(), r_coefficients.size())); - OPENMP_PARALLEL_FOR(size_t i = 0; i < o_coefficients.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < o_coefficients.size(); ++i) { o_coefficients[i] = self.coefficients_[i] - other.coefficients_[i]; } @@ -158,7 +157,7 @@ class UnivariatePolynomialOp> { const std::vector& r_coefficients = other.coefficients_.coefficients_; l_coefficients.resize( std::max(l_coefficients.size(), r_coefficients.size())); - OPENMP_PARALLEL_FOR(size_t i = 0; i < r_coefficients.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < r_coefficients.size(); ++i) { l_coefficients[i] -= r_coefficients[i]; } @@ -179,7 +178,7 @@ class UnivariatePolynomialOp> { UnivariatePolynomial ret; std::vector& o_coefficients = ret.coefficients_.coefficients_; o_coefficients.resize(std::max(degree, other_degree) + 1); - OPENMP_PARALLEL_FOR(size_t i = 0; i < o_coefficients.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < o_coefficients.size(); ++i) { o_coefficients[i] = self.coefficients_[i] - other.coefficients()[i]; } @@ -204,7 +203,7 @@ class UnivariatePolynomialOp> { std::vector& l_coefficients = self.coefficients_.coefficients_; const std::vector& r_terms = other.coefficients().terms_; - OPENMP_PARALLEL_FOR(const Term& r_term : r_terms) { + OMP_PARALLEL_FOR(const Term& r_term : r_terms) { if (r_term.degree <= degree) { l_coefficients[r_term.degree] -= r_term.coefficient; } else { @@ -225,7 +224,7 @@ class UnivariatePolynomialOp> { } const std::vector& i_coefficients = self.coefficients_.coefficients_; std::vector o_coefficients(i_coefficients.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < o_coefficients.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < o_coefficients.size(); ++i) { o_coefficients[i] = -i_coefficients[i]; } return UnivariatePolynomial(D(std::move(o_coefficients))); @@ -237,7 +236,7 @@ class UnivariatePolynomialOp> { } std::vector& coefficients = self.coefficients_.coefficients_; // clang-format off - OPENMP_PARALLEL_FOR(F& coefficient : coefficients) { + OMP_PARALLEL_FOR(F& coefficient : coefficients) { // clang-format on coefficient.NegateInPlace(); } @@ -254,7 +253,7 @@ class UnivariatePolynomialOp> { } const std::vector& l_coefficients = self.coefficients_.coefficients_; std::vector o_coefficients(l_coefficients.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < l_coefficients.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < l_coefficients.size(); ++i) { o_coefficients[i] = l_coefficients[i] * scalar; } return UnivariatePolynomial(D(std::move(o_coefficients))); @@ -267,7 +266,7 @@ class UnivariatePolynomialOp> { } std::vector& coefficients = self.coefficients_.coefficients_; // clang-format off - OPENMP_PARALLEL_FOR(F& coefficient : coefficients) { + OMP_PARALLEL_FOR(F& coefficient : coefficients) { // clang-format on coefficient *= scalar; } @@ -446,7 +445,7 @@ class UnivariatePolynomialOp> { l_coefficients = std::vector(other.Degree() + 1); const std::vector& r_terms = other.coefficients().terms_; - OPENMP_PARALLEL_FOR(const Term& r_term : 
r_terms) { + OMP_PARALLEL_FOR(const Term& r_term : r_terms) { if constexpr (NEGATION) { l_coefficients[r_term.degree] = -r_term.coefficient; } else { @@ -762,9 +761,7 @@ class UnivariatePolynomialOp> { } size_t size = self.Degree() + 1; std::vector coefficients(size); - OPENMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { - coefficients[i] = self[i]; - } + OMP_PARALLEL_FOR(size_t i = 0; i < size; ++i) { coefficients[i] = self[i]; } return UnivariatePolynomial(D(std::move(coefficients))); } @@ -852,7 +849,7 @@ class UnivariatePolynomialOp> { } } } - pdqsort(c_terms.begin(), c_terms.end()); + base::UnstableSort(c_terms.begin(), c_terms.end()); c.coefficients_ = S(std::move(c_terms)); } }; diff --git a/tachyon/zk/air/plonky3/challenger/challenger.h b/tachyon/zk/air/plonky3/challenger/challenger.h index 0223a149f..586c96309 100644 --- a/tachyon/zk/air/plonky3/challenger/challenger.h +++ b/tachyon/zk/air/plonky3/challenger/challenger.h @@ -97,7 +97,7 @@ class Challenger { uint32_t chunk_size = range.GetSize() / thread_nums; std::vector ret(thread_nums, std::numeric_limits::max()); - OPENMP_PARALLEL_FOR(uint32_t i = 0; i < thread_nums; ++i) { + OMP_PARALLEL_FOR(uint32_t i = 0; i < thread_nums; ++i) { uint32_t start = range.from + i * chunk_size; uint32_t end = start + std::min(range.to - start, chunk_size); for (uint32_t j = start; j < end; ++j) { diff --git a/tachyon/zk/base/nested_for_loop_openmp_benchmark.cc b/tachyon/zk/base/nested_for_loop_openmp_benchmark.cc index 4da05eed6..052228e08 100644 --- a/tachyon/zk/base/nested_for_loop_openmp_benchmark.cc +++ b/tachyon/zk/base/nested_for_loop_openmp_benchmark.cc @@ -14,7 +14,7 @@ void BM_NestedForLoopParallelCols(benchmark::State& state) { return base::CreateVector(rows, []() { return F::Random(); }); }); for (auto _ : state) { - OPENMP_PARALLEL_FOR(size_t i = 0; i < cols; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < cols; ++i) { for (size_t j = 0; j < rows; ++j) { table[i][j].DoubleInPlace(); } @@ -33,7 +33,7 @@ void BM_NestedForLoopParallelRows(benchmark::State& state) { }); for (auto _ : state) { for (size_t i = 0; i < cols; ++i) { - OPENMP_PARALLEL_FOR(size_t j = 0; j < rows; ++j) { + OMP_PARALLEL_FOR(size_t j = 0; j < rows; ++j) { table[i][j].DoubleInPlace(); } } @@ -50,7 +50,7 @@ void BM_NestedForLoopParallelCollapse(benchmark::State& state) { return base::CreateVector(rows, []() { return F::Random(); }); }); for (auto _ : state) { - OPENMP_PARALLEL_NESTED_FOR(size_t i = 0; i < cols; ++i) { + OMP_PARALLEL_NESTED_FOR(size_t i = 0; i < cols; ++i) { for (size_t j = 0; j < rows; ++j) { table[i][j].DoubleInPlace(); } diff --git a/tachyon/zk/base/parallelize_benchmark.cc b/tachyon/zk/base/parallelize_benchmark.cc index 5abcb7831..d71a6aac2 100644 --- a/tachyon/zk/base/parallelize_benchmark.cc +++ b/tachyon/zk/base/parallelize_benchmark.cc @@ -26,7 +26,7 @@ void BM_ForLoop(benchmark::State& state) { std::vector vec = base::CreateVectorParallel(n, []() { return F::Random(); }); for (auto _ : state) { - OPENMP_PARALLEL_FOR(size_t i = 0; i < n; ++i) { vec[i].DoubleInPlace(); } + OMP_PARALLEL_FOR(size_t i = 0; i < n; ++i) { vec[i].DoubleInPlace(); } } benchmark::DoNotOptimize(vec); } diff --git a/tachyon/zk/lookup/halo2/BUILD.bazel b/tachyon/zk/lookup/halo2/BUILD.bazel index 660929a40..1fbbc6f27 100644 --- a/tachyon/zk/lookup/halo2/BUILD.bazel +++ b/tachyon/zk/lookup/halo2/BUILD.bazel @@ -22,6 +22,7 @@ tachyon_cc_library( name = "permute_expression_pair", hdrs = ["permute_expression_pair.h"], deps = [ + "//tachyon/base:sort", 
"//tachyon/zk/base/entities:prover_base", "//tachyon/zk/lookup:pair", "@com_google_absl//absl/container:btree", diff --git a/tachyon/zk/lookup/halo2/permute_expression_pair.h b/tachyon/zk/lookup/halo2/permute_expression_pair.h index 21708a4c4..15de2fe94 100644 --- a/tachyon/zk/lookup/halo2/permute_expression_pair.h +++ b/tachyon/zk/lookup/halo2/permute_expression_pair.h @@ -12,8 +12,8 @@ #include #include "absl/container/btree_map.h" -#include "third_party/pdqsort/include/pdqsort.h" +#include "tachyon/base/sort.h" #include "tachyon/zk/base/entities/prover_base.h" #include "tachyon/zk/lookup/pair.h" @@ -35,8 +35,8 @@ template std::vector permuted_input_expressions = in.input().evaluations(); // sort input lookup expression values - pdqsort(permuted_input_expressions.begin(), - permuted_input_expressions.begin() + usable_rows); + base::UnstableSort(permuted_input_expressions.begin(), + permuted_input_expressions.begin() + usable_rows); // a map of each unique element in the table expression and its count absl::btree_map leftover_table_map; diff --git a/tachyon/zk/lookup/log_derivative_halo2/BUILD.bazel b/tachyon/zk/lookup/log_derivative_halo2/BUILD.bazel index 245132416..2e0302c20 100644 --- a/tachyon/zk/lookup/log_derivative_halo2/BUILD.bazel +++ b/tachyon/zk/lookup/log_derivative_halo2/BUILD.bazel @@ -22,6 +22,7 @@ tachyon_cc_library( deps = [ "//tachyon/base:parallelize", "//tachyon/base:ref", + "//tachyon/base:sort", "//tachyon/base/containers:container_util", "//tachyon/crypto/commitments:polynomial_openings", "//tachyon/zk/base/entities:prover_base", diff --git a/tachyon/zk/lookup/log_derivative_halo2/evaluator.h b/tachyon/zk/lookup/log_derivative_halo2/evaluator.h index 0ba455ccc..6a4c706b5 100644 --- a/tachyon/zk/lookup/log_derivative_halo2/evaluator.h +++ b/tachyon/zk/lookup/log_derivative_halo2/evaluator.h @@ -110,19 +110,28 @@ class Evaluator { // = Σᵢ(τ(X) * Π_{j != i} φⱼ(X)) - m(X) * Π(φᵢ(X)) // // (1 - (l_last(X) + l_blind(X))) * (LHS - RHS) = 0 + std::vector inputs_value; size_t start = chunk_offset * chunk_size; for (size_t idx = 0; idx < chunk.size(); ++idx) { size_t cur_idx = start + idx; // φᵢ(X) = fᵢ(X) + β - std::vector inputs_value = base::Map( - inputs_eval_data, - [&inputs_evaluator, &cur_idx]( - size_t i, - plonk::EvaluationInput& input_eval_data) { - return inputs_evaluator[i].Evaluate(input_eval_data, cur_idx, - /*scale=*/1, F::Zero()); - }); + if (idx == 0) { + inputs_value = base::Map( + inputs_eval_data, + [&inputs_evaluator, &cur_idx]( + size_t i, plonk::EvaluationInput& + input_eval_data) { + return inputs_evaluator[i].Evaluate(input_eval_data, cur_idx, + /*scale=*/1, F::Zero()); + }); + } else { + for (size_t i = 0; i < inputs_value.size(); ++i) { + inputs_value[i] = + inputs_evaluator[i].Evaluate(inputs_eval_data[i], cur_idx, + /*scale=*/1, F::Zero()); + } + } // Π(φᵢ(X)) F inputs_prod = std::accumulate( diff --git a/tachyon/zk/lookup/log_derivative_halo2/prover.h b/tachyon/zk/lookup/log_derivative_halo2/prover.h index efb3094dc..8742e99d9 100644 --- a/tachyon/zk/lookup/log_derivative_halo2/prover.h +++ b/tachyon/zk/lookup/log_derivative_halo2/prover.h @@ -32,6 +32,12 @@ struct TableEvalWithIndex { bool operator<(const TableEvalWithIndex& other) const { return eval < other.eval; } + bool operator<=(const TableEvalWithIndex& other) const { + return eval <= other.eval; + } + bool operator>(const TableEvalWithIndex& other) const { + return eval > other.eval; + } }; template @@ -41,7 +47,7 @@ struct ComputeMPolysTempStorage { explicit 
ComputeMPolysTempStorage(size_t usable_rows) : sorted_table_with_indices(usable_rows), m_values_atomic(usable_rows) { - OPENMP_PARALLEL_FOR(RowIndex i = 0; i < usable_rows; ++i) { + OMP_PARALLEL_FOR(RowIndex i = 0; i < usable_rows; ++i) { m_values_atomic[i] = 0; } } diff --git a/tachyon/zk/lookup/log_derivative_halo2/prover_impl.h b/tachyon/zk/lookup/log_derivative_halo2/prover_impl.h index fa29a8cec..5451087da 100644 --- a/tachyon/zk/lookup/log_derivative_halo2/prover_impl.h +++ b/tachyon/zk/lookup/log_derivative_halo2/prover_impl.h @@ -14,6 +14,7 @@ #include "tachyon/base/containers/container_util.h" #include "tachyon/base/parallelize.h" #include "tachyon/base/ref.h" +#include "tachyon/base/sort.h" #include "tachyon/zk/lookup/log_derivative_halo2/prover.h" #include "tachyon/zk/plonk/expressions/compress_expression.h" @@ -97,33 +98,33 @@ BlindedPolynomial Prover::ComputeMPoly( const Evals& compressed_table, ComputeMPolysTempStorage& storage) { RowIndex usable_rows = prover->GetUsableRows(); - OPENMP_PARALLEL_FOR(RowIndex i = 0; i < usable_rows; ++i) { + OMP_PARALLEL_FOR(RowIndex i = 0; i < usable_rows; ++i) { storage.sorted_table_with_indices[i] = {i, compressed_table[i].ToBigInt()}; } - // TODO(chokobole): Use https://github.com/timsort/cpp-TimSort or - // https://github.com/sebawild/powersort for better performance. - std::stable_sort(storage.sorted_table_with_indices.begin(), + base::StableSort(storage.sorted_table_with_indices.begin(), storage.sorted_table_with_indices.end()); - OPENMP_PARALLEL_NESTED_FOR(size_t i = 0; i < compressed_inputs.size(); ++i) { - for (RowIndex j = 0; j < usable_rows; ++j) { - BigInt input = compressed_inputs[i][j].ToBigInt(); - auto it = base::BinarySearchByKey( - storage.sorted_table_with_indices.begin(), - storage.sorted_table_with_indices.end(), input, LessThan{}); - if (it != storage.sorted_table_with_indices.end()) { - storage.m_values_atomic[it->index].fetch_add(1, - std::memory_order_relaxed); + std::vector m_values(prover->pcs().N()); + OMP_PARALLEL { + OMP_NESTED_FOR(size_t i = 0; i < compressed_inputs.size(); ++i) { + for (RowIndex j = 0; j < usable_rows; ++j) { + BigInt input = compressed_inputs[i][j].ToBigInt(); + auto it = base::BinarySearchByKey( + storage.sorted_table_with_indices.begin(), + storage.sorted_table_with_indices.end(), input, LessThan{}); + if (it != storage.sorted_table_with_indices.end()) { + storage.m_values_atomic[it->index].fetch_add( + 1, std::memory_order_relaxed); + } } } - } - // Convert atomic |m_values| to |Evals|. - std::vector m_values(prover->pcs().N()); - OPENMP_PARALLEL_FOR(RowIndex i = 0; i < usable_rows; ++i) { - m_values[i] = - F(storage.m_values_atomic[i].exchange(0, std::memory_order_relaxed)); + // Convert atomic |m_values| to |Evals|. 
+ OMP_FOR(RowIndex i = 0; i < usable_rows; ++i) { + m_values[i] = + F(storage.m_values_atomic[i].exchange(0, std::memory_order_relaxed)); + } } BlindedPolynomial m_poly(Evals(std::move(m_values)), @@ -206,11 +207,11 @@ BlindedPolynomial Prover::CreateGrandSumPoly( ComputeLogDerivatives(compressed_inputs[i], beta, input_log_derivatives); if (i == 0) { - OPENMP_PARALLEL_FOR(size_t j = 0; j < usable_rows; ++j) { + OMP_PARALLEL_FOR(size_t j = 0; j < usable_rows; ++j) { storage.inputs_log_derivatives[j] = input_log_derivatives[j]; } } else { - OPENMP_PARALLEL_FOR(size_t j = 0; j < usable_rows; ++j) { + OMP_PARALLEL_FOR(size_t j = 0; j < usable_rows; ++j) { storage.inputs_log_derivatives[j] += input_log_derivatives[j]; } } @@ -227,7 +228,7 @@ BlindedPolynomial Prover::CreateGrandSumPoly( // |storage.inputs_log_derivatives| since the current values of // |storage.inputs_log_derivatives| are not needed anymore. std::vector& log_derivatives_diff = storage.inputs_log_derivatives; - OPENMP_PARALLEL_FOR(size_t i = 0; i < usable_rows; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < usable_rows; ++i) { log_derivatives_diff[i] -= m_values[i] * storage.table_log_derivatives[i]; if (i != usable_rows - 1) { grand_sum[i + 1] = log_derivatives_diff[i]; diff --git a/tachyon/zk/plonk/expressions/compress_expression.h b/tachyon/zk/plonk/expressions/compress_expression.h index b5de054ef..2d1b80217 100644 --- a/tachyon/zk/plonk/expressions/compress_expression.h +++ b/tachyon/zk/plonk/expressions/compress_expression.h @@ -27,13 +27,13 @@ Evals CompressExpressions( for (size_t expr_idx = 0; expr_idx < expressions.size(); ++expr_idx) { if (UNLIKELY(expr_idx == 0)) { - OPENMP_PARALLEL_FOR(size_t i = 0; i < compressed_values.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < compressed_values.size(); ++i) { ProvingEvaluator evaluator = evaluator_tpl; evaluator.set_idx(i); compressed_values[i] = evaluator.Evaluate(expressions[expr_idx].get()); } } else { - OPENMP_PARALLEL_FOR(size_t i = 0; i < compressed_values.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < compressed_values.size(); ++i) { ProvingEvaluator evaluator = evaluator_tpl; evaluator.set_idx(i); compressed_values[i] *= theta; diff --git a/tachyon/zk/plonk/keys/proving_key.h b/tachyon/zk/plonk/keys/proving_key.h index bcfbd975d..c85fab4aa 100644 --- a/tachyon/zk/plonk/keys/proving_key.h +++ b/tachyon/zk/plonk/keys/proving_key.h @@ -164,7 +164,7 @@ class ProvingKey : public Key { // | 5 | 0 | // | 6 | 0 | // | 7 | 0 | - OPENMP_PARALLEL_FOR(size_t i = 0; i < usable_rows; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < usable_rows; ++i) { // NOTE(chokobole): It's safe to access since we created |domain->size()| // |evals|, which is greater than |usable_rows|. 
evals.at(i) = F::One(); diff --git a/tachyon/zk/plonk/layout/floor_planner/v1/BUILD.bazel b/tachyon/zk/plonk/layout/floor_planner/v1/BUILD.bazel index ed935b9cf..f6ab49f06 100644 --- a/tachyon/zk/plonk/layout/floor_planner/v1/BUILD.bazel +++ b/tachyon/zk/plonk/layout/floor_planner/v1/BUILD.bazel @@ -67,12 +67,12 @@ tachyon_cc_library( deps = [ "//tachyon:export", "//tachyon/base:logging", + "//tachyon/base:sort", "//tachyon/base/containers:container_util", "//tachyon/zk/plonk/base:column_key", "//tachyon/zk/plonk/base:column_type", "//tachyon/zk/plonk/layout:region_shape", "//tachyon/zk/plonk/layout/floor_planner:allocations", "@com_google_absl//absl/container:flat_hash_map", - "@pdqsort", ], ) diff --git a/tachyon/zk/plonk/layout/floor_planner/v1/v1_strategy.h b/tachyon/zk/plonk/layout/floor_planner/v1/v1_strategy.h index 9e2391cbf..7412f6d84 100644 --- a/tachyon/zk/plonk/layout/floor_planner/v1/v1_strategy.h +++ b/tachyon/zk/plonk/layout/floor_planner/v1/v1_strategy.h @@ -15,10 +15,10 @@ #include #include "absl/container/flat_hash_map.h" -#include "third_party/pdqsort/include/pdqsort.h" #include "tachyon/base/containers/container_util.h" #include "tachyon/base/logging.h" +#include "tachyon/base/sort.h" #include "tachyon/export.h" #include "tachyon/zk/plonk/base/column_key.h" #include "tachyon/zk/plonk/base/column_type.h" @@ -72,10 +72,10 @@ SlotInResult SlotIn(std::vector>& region_shapes) { // - The sort order relies on Column's Ord implementation! std::vector region_columns(region.columns().begin(), region.columns().end()); - pdqsort(region_columns.begin(), region_columns.end(), - [](const RegionColumn& lhs, const RegionColumn& rhs) { - return lhs < rhs; - }); + base::UnstableSort(region_columns.begin(), region_columns.end(), + [](const RegionColumn& lhs, const RegionColumn& rhs) { + return lhs < rhs; + }); std::optional region_start = FirstFitRegion(&column_allocations, region_columns, region.row_count(), @@ -102,41 +102,43 @@ SlotInBiggestAdviceFirstResult SlotInBiggestAdviceFirst( // NOTE(TomTaehoonKim): Sorted result might be different from the original // See // https://github.com/kroma-network/halo2/blob/7d0a369/halo2_proofs/src/layout/floor_planner/v1/strategy.rs#L202-L215 - pdqsort(sorted_regions.begin(), sorted_regions.end(), - [](const RegionShape& lhs, const RegionShape& rhs) { - // Count the number of advice columns - size_t lhs_advice_cols = 0; - for (const RegionColumn& column : lhs.columns()) { - if (column.type() == RegionColumn::Type::kColumn) { - const AnyColumnKey& c = column.column(); - if (c.type() == ColumnType::kAdvice) { - ++lhs_advice_cols; - } - } - } - size_t rhs_advice_cols = 0; - for (const RegionColumn& column : rhs.columns()) { - if (column.type() == RegionColumn::Type::kColumn) { - const AnyColumnKey& c = column.column(); - if (c.type() == ColumnType::kAdvice) { - ++rhs_advice_cols; - } - } - } - // Sort by advice area (since this has the most contention). 
- return lhs_advice_cols * lhs.row_count() < - rhs_advice_cols * rhs.row_count(); - }); + base::UnstableSort(sorted_regions.begin(), sorted_regions.end(), + [](const RegionShape& lhs, const RegionShape& rhs) { + // Count the number of advice columns + size_t lhs_advice_cols = 0; + for (const RegionColumn& column : lhs.columns()) { + if (column.type() == RegionColumn::Type::kColumn) { + const AnyColumnKey& c = column.column(); + if (c.type() == ColumnType::kAdvice) { + ++lhs_advice_cols; + } + } + } + size_t rhs_advice_cols = 0; + for (const RegionColumn& column : rhs.columns()) { + if (column.type() == RegionColumn::Type::kColumn) { + const AnyColumnKey& c = column.column(); + if (c.type() == ColumnType::kAdvice) { + ++rhs_advice_cols; + } + } + } + // Sort by advice area (since this has the most + // contention). + return lhs_advice_cols * lhs.row_count() < + rhs_advice_cols * rhs.row_count(); + }); std::reverse(sorted_regions.begin(), sorted_regions.end()); // Lay out the sorted regions. SlotInResult result = SlotIn(sorted_regions); // Un-sort the regions so they match the original indexing. - pdqsort(result.regions.begin(), result.regions.end(), - [](const RegionInfo& lhs, const RegionInfo& rhs) { - return lhs.region.region_index() < rhs.region.region_index(); - }); + base::UnstableSort(result.regions.begin(), result.regions.end(), + [](const RegionInfo& lhs, const RegionInfo& rhs) { + return lhs.region.region_index() < + rhs.region.region_index(); + }); std::vector region_starts = base::Map( result.regions, [](const RegionInfo& region) { return region.region_start; }); diff --git a/tachyon/zk/plonk/permutation/grand_product_argument.h b/tachyon/zk/plonk/permutation/grand_product_argument.h index 8dd4727fa..435758e68 100644 --- a/tachyon/zk/plonk/permutation/grand_product_argument.h +++ b/tachyon/zk/plonk/permutation/grand_product_argument.h @@ -80,7 +80,7 @@ class GrandProductArgument { size_t chunk_size = base::GetNumElementsPerThread(grand_product); size_t num_chunks = (size + chunk_size - 1) / chunk_size; - OPENMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < num_chunks; ++i) { RowIndex start = i * chunk_size; RowIndex end = i == num_chunks - 1 ? size : start + chunk_size; for (size_t j = 0; j < num_cols; ++j) { diff --git a/tachyon/zk/plonk/permutation/permutation_assembly.h b/tachyon/zk/plonk/permutation/permutation_assembly.h index 5ee8da3df..6fa67d042 100644 --- a/tachyon/zk/plonk/permutation/permutation_assembly.h +++ b/tachyon/zk/plonk/permutation/permutation_assembly.h @@ -131,7 +131,7 @@ class TACHYON_EXPORT PermutationAssembly { domain->template Zero()); // Assign |unpermuted_table| to |permutations|. - OPENMP_PARALLEL_NESTED_FOR(size_t i = 0; i < permutations.size(); ++i) { + OMP_PARALLEL_NESTED_FOR(size_t i = 0; i < permutations.size(); ++i) { for (size_t j = 0; j < rows_; ++j) { // NOTE(chokobole): It's safe to access since we created |kDegree| // |Zeros()|. diff --git a/tachyon/zk/plonk/permutation/unpermuted_table.h b/tachyon/zk/plonk/permutation/unpermuted_table.h index 3dc779c71..f786805ce 100644 --- a/tachyon/zk/plonk/permutation/unpermuted_table.h +++ b/tachyon/zk/plonk/permutation/unpermuted_table.h @@ -70,7 +70,7 @@ class UnpermutedTable { // Assign [δⁱω⁰, δⁱω¹, δⁱω², ..., δⁱωⁿ⁻¹] to each col. 
for (size_t i = 1; i < cols; ++i) { std::vector col(rows); - OPENMP_PARALLEL_FOR(RowIndex j = 0; j < rows; ++j) { + OMP_PARALLEL_FOR(RowIndex j = 0; j < rows; ++j) { col[j] = unpermuted_table[i - 1][j] * delta; } unpermuted_table.push_back(Evals(std::move(col))); diff --git a/tachyon/zk/plonk/vanishing/vanishing_prover_impl.h b/tachyon/zk/plonk/vanishing/vanishing_prover_impl.h index b2c34faf3..acd37e563 100644 --- a/tachyon/zk/plonk/vanishing/vanishing_prover_impl.h +++ b/tachyon/zk/plonk/vanishing/vanishing_prover_impl.h @@ -171,7 +171,7 @@ void VanishingProver::BatchEvaluate( [](absl::Span h_piece) { return h_piece; }); std::vector coeffs(n); for (size_t i = h_pieces.size() - 1; i != SIZE_MAX; --i) { - OPENMP_PARALLEL_FOR(size_t j = 0; j < n; ++j) { + OMP_PARALLEL_FOR(size_t j = 0; j < n; ++j) { coeffs[j] *= x_n; coeffs[j] += h_pieces[i][j]; } diff --git a/tachyon/zk/plonk/vanishing/vanishing_utils.h b/tachyon/zk/plonk/vanishing/vanishing_utils.h index 2d6ee9c96..626dbd7e6 100644 --- a/tachyon/zk/plonk/vanishing/vanishing_utils.h +++ b/tachyon/zk/plonk/vanishing/vanishing_utils.h @@ -103,7 +103,7 @@ ExtendedEvals& DivideByVanishingPolyInPlace( // Multiply the inverse to obtain the quotient polynomial in the coset // evaluation domain. std::vector& evaluations = evals.evaluations(); - OPENMP_PARALLEL_FOR(size_t i = 0; i < evaluations.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < evaluations.size(); ++i) { evaluations[i] *= t_evaluations[i % t_evaluations.size()]; } @@ -126,7 +126,7 @@ void DistributePowersZeta(Poly& poly, bool into_coset) { into_coset ? zeta_inv : zeta}; std::vector& coeffs = poly.coefficients().coefficients(); - OPENMP_PARALLEL_FOR(size_t i = 0; i < coeffs.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < coeffs.size(); ++i) { size_t j = i % 3; if (j == 0) continue; coeffs[i] *= coset_powers[j - 1]; @@ -186,7 +186,7 @@ std::vector BuildExtendedColumnWithColumns( size_t rows = columns[0].size(); std::vector flattened_transposed_columns(cols * rows); - OPENMP_PARALLEL_NESTED_FOR(size_t i = 0; i < columns.size(); ++i) { + OMP_PARALLEL_NESTED_FOR(size_t i = 0; i < columns.size(); ++i) { for (size_t j = 0; j < rows; ++j) { flattened_transposed_columns[j * cols + i] = columns[i][j]; } diff --git a/tachyon/zk/r1cs/constraint_system/BUILD.bazel b/tachyon/zk/r1cs/constraint_system/BUILD.bazel index ada522475..a3f711dba 100644 --- a/tachyon/zk/r1cs/constraint_system/BUILD.bazel +++ b/tachyon/zk/r1cs/constraint_system/BUILD.bazel @@ -34,10 +34,10 @@ tachyon_cc_library( hdrs = ["linear_combination.h"], deps = [ ":term", + "//tachyon/base:sort", "//tachyon/base/containers:container_util", "//tachyon/base/ranges:algorithm", "@com_google_googletest//:gtest_prod", - "@pdqsort", ], ) diff --git a/tachyon/zk/r1cs/constraint_system/linear_combination.h b/tachyon/zk/r1cs/constraint_system/linear_combination.h index 8c0a8fe3f..06ec4ac6e 100644 --- a/tachyon/zk/r1cs/constraint_system/linear_combination.h +++ b/tachyon/zk/r1cs/constraint_system/linear_combination.h @@ -16,10 +16,10 @@ #include "absl/strings/str_join.h" #include "gtest/gtest_prod.h" -#include "third_party/pdqsort/include/pdqsort.h" #include "tachyon/base/containers/container_util.h" #include "tachyon/base/ranges/algorithm.h" +#include "tachyon/base/sort.h" #include "tachyon/zk/r1cs/constraint_system/term.h" namespace tachyon::zk::r1cs { @@ -64,10 +64,10 @@ class LinearCombination { std::vector>&& TakeTerms() && { return std::move(terms_); } void Deduplicate() { - pdqsort(terms_.begin(), terms_.end(), - [](const Term& 
a, const Term& b) { - return a.variable < b.variable; - }); + base::UnstableSort(terms_.begin(), terms_.end(), + [](const Term& a, const Term& b) { + return a.variable < b.variable; + }); bool is_first = true; auto cur_var_first_it = terms_.begin(); auto it = terms_.begin(); diff --git a/tachyon/zk/r1cs/constraint_system/quadratic_arithmetic_program.h b/tachyon/zk/r1cs/constraint_system/quadratic_arithmetic_program.h index 2b37d7ba2..20edeefde 100644 --- a/tachyon/zk/r1cs/constraint_system/quadratic_arithmetic_program.h +++ b/tachyon/zk/r1cs/constraint_system/quadratic_arithmetic_program.h @@ -139,18 +139,15 @@ class QuadraticArithmeticProgram { // where x is |full_assignments|. // clang-format on OMP_PARALLEL { - OMP_FOR_NOWAIT - for (size_t i = 0; i < matrices.num_constraints; ++i) { + OMP_FOR_NOWAIT(size_t i = 0; i < matrices.num_constraints; ++i) { a[i] = EvaluateConstraint(matrices.a[i], full_assignments); } - OMP_FOR_NOWAIT - for (size_t i = 0; i < matrices.num_constraints; ++i) { + OMP_FOR_NOWAIT(size_t i = 0; i < matrices.num_constraints; ++i) { b[i] = EvaluateConstraint(matrices.b[i], full_assignments); } - OMP_FOR - for (size_t i = 0; i < matrices.num_constraints; ++i) { + OMP_FOR(size_t i = 0; i < matrices.num_constraints; ++i) { c[i] = EvaluateConstraint(matrices.c[i], full_assignments); } } @@ -181,7 +178,7 @@ class QuadraticArithmeticProgram { .Inverse()); // |h_evals[i]| = (|a[i]| * |b[i]| - |c[i]|)) / (g * ωⁿ⁺ˡ⁺¹ - 1) - OPENMP_PARALLEL_FOR(size_t i = 0; i < domain->size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < domain->size(); ++i) { F& h_evals_i = a_evals.at(i); h_evals_i *= b_evals[i]; h_evals_i -= c_evals[i]; diff --git a/tachyon/zk/r1cs/groth16/proving_key.h b/tachyon/zk/r1cs/groth16/proving_key.h index 3eb06b6a2..212d24b7c 100644 --- a/tachyon/zk/r1cs/groth16/proving_key.h +++ b/tachyon/zk/r1cs/groth16/proving_key.h @@ -115,7 +115,7 @@ class ProvingKey : public Key { std::vector& a = qap_instance_map_result.a; std::vector& b = qap_instance_map_result.b; std::vector& c = qap_instance_map_result.c; - OPENMP_PARALLEL_FOR(size_t i = 0; i < l.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < l.size(); ++i) { l[i] = ComputeABC( a[num_instance_variables + i], b[num_instance_variables + i], c[num_instance_variables + i], toxic_waste, delta_inverse); diff --git a/tachyon/zk/r1cs/groth16/verifying_key.h b/tachyon/zk/r1cs/groth16/verifying_key.h index 3f1ab1c73..216e49721 100644 --- a/tachyon/zk/r1cs/groth16/verifying_key.h +++ b/tachyon/zk/r1cs/groth16/verifying_key.h @@ -91,7 +91,7 @@ class VerifyingKey : public Key { const std::vector& a = qap_instance_map_result.a; const std::vector& b = qap_instance_map_result.b; const std::vector& c = qap_instance_map_result.c; - OPENMP_PARALLEL_FOR(size_t i = 0; i < l.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < l.size(); ++i) { l[i] = ComputeABC(a[i], b[i], c[i], toxic_waste, gamma_inverse); } diff --git a/third_party/pdqsort/workspace.bzl b/third_party/pdqsort/workspace.bzl index b5271935b..cc7b013fb 100644 --- a/third_party/pdqsort/workspace.bzl +++ b/third_party/pdqsort/workspace.bzl @@ -1,4 +1,4 @@ -"""loads the hwloc library, used by Tachyon.""" +"""loads the pdqsort library, used by Tachyon.""" load("//third_party:repo.bzl", "tachyon_http_archive", "tf_mirror_urls") diff --git a/third_party/powersort/BUILD.bazel b/third_party/powersort/BUILD.bazel new file mode 100644 index 000000000..e69de29bb diff --git a/third_party/powersort/fix_multiple_definitions.patch b/third_party/powersort/fix_multiple_definitions.patch new 
file mode 100644 index 000000000..2725d3572 --- /dev/null +++ b/third_party/powersort/fix_multiple_definitions.patch @@ -0,0 +1,176 @@ +diff --git a/src/sorts/merging.h b/src/sorts/merging.h +index a2a3769..9afc08f 100644 +--- a/src/sorts/merging.h ++++ b/src/sorts/merging.h +@@ -7,14 +7,6 @@ + + namespace algorithms { + +-#ifdef COUNT_MERGECOST +- const bool COUNT_MERGE_COSTS = true; +-#else +- const bool COUNT_MERGE_COSTS = false; +-#endif +- long long volatile totalMergeCosts = 0; +- long long volatile totalBufferCosts = 0; +- + /** + * A sentinel value used by some merging method; + * this value must be strictly larger than any value in the input. +@@ -39,7 +31,7 @@ namespace algorithms { + COPY_BOTH_WITH_SENTINELS + }; + +- std::string to_string(merging_methods mergingMethod) { ++ inline std::string to_string(merging_methods mergingMethod) { + switch (mergingMethod) { + case UNSTABLE_BITONIC_MERGE: + return "UNSTABLE_BITONIC_MERGE"; +@@ -70,10 +62,8 @@ namespace algorithms { + */ + template + void merge_runs_bitonic(Iter l, Iter m, Iter r, Iter2 B) { +- if (COUNT_MERGE_COSTS) totalMergeCosts += (r-l); + std::copy_backward(l,m,B+(m-l)); + std::reverse_copy(m,r,B+(m-l)); +- if (COUNT_MERGE_COSTS) totalBufferCosts += (r-l); + auto i = B, j = B+(r-l-1); + for (auto k = l; k < r; ++k) + *k = *j < *i ? *j-- : *i++; +@@ -90,10 +80,8 @@ namespace algorithms { + template + void merge_runs_bitonic_manual_copy(Iter l, Iter m, Iter r, Iter2 B) { + Iter i1, j1; Iter2 b; +- if (COUNT_MERGE_COSTS) totalMergeCosts += (r-l); + for (i1 = m-1, b = B+(m-1-l); i1 >= l;) *b-- = *i1--; + for (j1 = r, b = B+(m-l); j1 > m;) *b++ = *--j1; +- if (COUNT_MERGE_COSTS) totalBufferCosts += (r-l); + auto i = B, j = B+(r-l-1); + for (auto k = l; k < r; ++k) + *k = *j < *i ? *j-- : *i++; +@@ -111,10 +99,8 @@ namespace algorithms { + */ + template + void merge_runs_bitonic_branchless(Iter l, Iter m, Iter r, Iter2 B) { +- if (COUNT_MERGE_COSTS) totalMergeCosts += (r-l); + std::copy_backward(l,m,B+(m-l)); + std::reverse_copy(m,r,B+(m-l)); +- if (COUNT_MERGE_COSTS) totalBufferCosts += (r-l); + Iter2 i = B, j = B+(r-l-1); + for (auto k = l; k < r; ++k) { + bool const cmp = *j < *i; +@@ -133,10 +119,8 @@ namespace algorithms { + template + void merge_runs_copy_half(Iter l, Iter m, Iter r, Iter2 B) { + auto n1 = m-l, n2 = r-m; +- if (COUNT_MERGE_COSTS) totalMergeCosts += (n1+n2); + if (n1 <= n2) { + std::copy(l,m,B); +- if (COUNT_MERGE_COSTS) totalBufferCosts += (m-l); + auto c1 = B, e1 = B + n1; + auto c2 = m, e2 = r, o = l; + while (c1 < e1 && c2 < e2) +@@ -144,7 +128,6 @@ namespace algorithms { + while (c1 < e1) *o++ = *c1++; + } else { + std::copy(m,r,B); +- if (COUNT_MERGE_COSTS) totalBufferCosts += (r-m); + auto c1 = m-1, s1 = l, o = r-1; + auto c2 = B+n2-1, s2 = B; + while (c1 >= s1 && c2 >= s2) +@@ -161,9 +144,7 @@ namespace algorithms { + template + void merge_runs_basic(Iter l, Iter m, Iter r, Iter2 B) { + auto n1 = m-l, n2 = r-m; +- if (COUNT_MERGE_COSTS) totalMergeCosts += (n1+n2); + std::copy(l,r,B); +- if (COUNT_MERGE_COSTS) totalBufferCosts += (n1+n2); + auto c1 = B, e1 = B + n1, c2 = e1, e2 = e1 + n2; + auto o = l; + while (c1 < e1 && c2 < e2) +@@ -182,12 +163,10 @@ namespace algorithms { + typedef typename std::iterator_traits::value_type T; + static_assert(std::numeric_limits::is_specialized, "Needs numeric type (for sentinels)"); + auto n1 = m-l, n2 = r-m; +- if (COUNT_MERGE_COSTS) totalMergeCosts += (n1+n2); + std::copy(l, m, B); + *(B + (m - l)) = plus_inf_sentinel(); + std::copy(m, r, B + (m - l + 1)); + 
*(B + (r - l) + 1) = plus_inf_sentinel(); +- if (COUNT_MERGE_COSTS) totalBufferCosts += (n1+n2+2); + auto c1 = B, c2 = B + (m - l + 1), o = l; + while (o < r) *o++ = *c1 <= *c2 ? *c1++ : *c2++; + } +diff --git a/src/sorts/powersort.h b/src/sorts/powersort.h +index 93d2ace..6a0b36b 100644 +--- a/src/sorts/powersort.h ++++ b/src/sorts/powersort.h +@@ -24,7 +24,7 @@ namespace algorithms { + BITWISE_LOOP, + MOST_SIGNIFICANT_SET_BIT, + }; +- std::string to_string(node_power_implementations implementation) { ++ inline std::string to_string(node_power_implementations implementation) { + switch (implementation) { + case TRIVIAL: return "TRIVIAL"; + case DIVISION_LOOP: return "DIVISION_LOOP"; +@@ -36,7 +36,7 @@ namespace algorithms { + }; + + +- power_t node_power_trivial(size_t begin, size_t end, ++ inline power_t node_power_trivial(size_t begin, size_t end, + size_t beginA, size_t beginB, size_t endB) { + size_t n = end - begin; + size_t n1 = beginB - beginA, n2 = endB - beginB; +@@ -51,7 +51,7 @@ namespace algorithms { + return k; + } + +- power_t node_power_div(size_t begin, size_t end, ++ inline power_t node_power_div(size_t begin, size_t end, + size_t beginA, size_t beginB, size_t endB) { + size_t twoN = 2*(end - begin); // 2*n + size_t n1 = beginB - beginA, n2 = endB - beginB; // lengths of runs +@@ -66,7 +66,7 @@ namespace algorithms { + return k; + } + +- power_t node_power_bitwise(size_t begin, size_t end, ++ inline power_t node_power_bitwise(size_t begin, size_t end, + size_t beginA, size_t beginB, size_t endB) { + size_t n = end - begin; + assert (n < (size_t{1} << 63)); +@@ -87,7 +87,7 @@ namespace algorithms { + return nCommonBits + 1; + } + +- power_t node_power_clz(size_t begin, size_t end, ++ inline power_t node_power_clz(size_t begin, size_t end, + size_t beginA, size_t beginB, size_t endB) { + size_t n = end - begin; + assert(n <= (size_t{1} << 31)); +@@ -99,7 +99,7 @@ namespace algorithms { + } + + // not precise enough for large powers ... +- power_t node_power_clz_unconstrained(ptrdiff_t begin, ptrdiff_t end, ++ inline power_t node_power_clz_unconstrained(ptrdiff_t begin, ptrdiff_t end, + ptrdiff_t beginA, ptrdiff_t beginB, ptrdiff_t endB) { + assert(begin <= beginA && beginA <= beginB && beginB <= endB && endB <= end); + auto n = static_cast(end - begin); +@@ -128,12 +128,12 @@ namespace algorithms { + } + } + +- unsigned floor_log2(unsigned int n) { ++ inline unsigned floor_log2(unsigned int n) { + if (n <= 0) return 0; + return 31 - __builtin_clz( n ); + } + +- unsigned floor_log2(unsigned long n) { ++ inline unsigned floor_log2(unsigned long n) { + if (n <= 0) return 0; + return 63 - __builtin_clzl( n ); + } diff --git a/third_party/powersort/fix_sign_compare_warning.patch b/third_party/powersort/fix_sign_compare_warning.patch new file mode 100644 index 000000000..25c7ef454 --- /dev/null +++ b/third_party/powersort/fix_sign_compare_warning.patch @@ -0,0 +1,31 @@ +diff --git a/src/sorts/powersort.h b/src/sorts/powersort.h +index 54ab704..93d2ace 100644 +--- a/src/sorts/powersort.h ++++ b/src/sorts/powersort.h +@@ -69,7 +69,7 @@ namespace algorithms { + power_t node_power_bitwise(size_t begin, size_t end, + size_t beginA, size_t beginB, size_t endB) { + size_t n = end - begin; +- assert (n < (1L << 63)); ++ assert (n < (size_t{1} << 63)); + size_t l = beginA - begin + beginB - begin; + size_t r = beginB - begin + endB - begin; + // a and b are given by l/(2*n) and r/(2*n), both are in [0,1). 
+@@ -90,7 +90,7 @@ namespace algorithms { + power_t node_power_clz(size_t begin, size_t end, + size_t beginA, size_t beginB, size_t endB) { + size_t n = end - begin; +- assert(n <= (1L << 31)); ++ assert(n <= (size_t{1} << 31)); + unsigned long l2 = beginA + beginB - 2*begin; // 2*l + unsigned long r2 = beginB + endB - 2*begin; // 2*r + auto a = static_cast((l2 << 30) / n); +@@ -103,7 +103,7 @@ namespace algorithms { + ptrdiff_t beginA, ptrdiff_t beginB, ptrdiff_t endB) { + assert(begin <= beginA && beginA <= beginB && beginB <= endB && endB <= end); + auto n = static_cast(end - begin); +- assert(n < (1L << 63)); ++ assert(n < (size_t{1} << 63)); + auto l2 = static_cast((beginA - begin) + (beginB - begin)); // 2*l + auto r2 = static_cast((beginB - begin) + (endB - begin)); // 2*r + static_assert(sizeof(size_t) == 8, "assume 64bit size_t"); // can compute with 64 bits diff --git a/third_party/powersort/fix_static_assertion.patch b/third_party/powersort/fix_static_assertion.patch new file mode 100644 index 000000000..b9e862908 --- /dev/null +++ b/third_party/powersort/fix_static_assertion.patch @@ -0,0 +1,49 @@ +diff --git a/src/sorts/merging.h b/src/sorts/merging.h +index 9afc08f..835b6d2 100644 +--- a/src/sorts/merging.h ++++ b/src/sorts/merging.h +@@ -254,27 +254,23 @@ namespace algorithms { + template + void merge_runs(Iter l, Iter m, Iter r, Iter2 B) { +- switch(mergingMethod) { +- case UNSTABLE_BITONIC_MERGE: +- return merge_runs_bitonic(l, m, r, B); +- case UNSTABLE_BITONIC_MERGE_MANUAL_COPY: +- return merge_runs_bitonic_manual_copy(l, m, r, B); +- case UNSTABLE_BITONIC_MERGE_BRANCHLESS: +- return merge_runs_bitonic_branchless(l, m, r, B); +- case COPY_SMALLER: +- return merge_runs_copy_half(l, m, r, B); +- case COPY_BOTH: +- return merge_runs_basic(l, m, r, B); +- case COPY_BOTH_WITH_SENTINELS: +- return merge_runs_basic_sentinels(l, m, r, B); +- default: +- assert(false); +- __builtin_unreachable(); +- } +- } +- +- +- ++ if constexpr (mergingMethod == UNSTABLE_BITONIC_MERGE) { ++ return merge_runs_bitonic(l, m, r, B); ++ } else if constexpr (mergingMethod == UNSTABLE_BITONIC_MERGE_MANUAL_COPY) { ++ return merge_runs_bitonic_manual_copy(l, m, r, B); ++ } else if constexpr (mergingMethod == UNSTABLE_BITONIC_MERGE_BRANCHLESS) { ++ return merge_runs_bitonic_branchless(l, m, r, B); ++ } else if constexpr (mergingMethod == COPY_SMALLER) { ++ return merge_runs_copy_half(l, m, r, B); ++ } else if constexpr (mergingMethod == COPY_BOTH) { ++ return merge_runs_basic(l, m, r, B); ++ } else if constexpr (mergingMethod == COPY_BOTH_WITH_SENTINELS) { ++ return merge_runs_basic_sentinels(l, m, r, B); ++ } else { ++ assert(false); ++ __builtin_unreachable(); ++ } ++ } + } + + #endif //MERGESORTS_MERGING_H diff --git a/third_party/powersort/powersort.BUILD b/third_party/powersort/powersort.BUILD new file mode 100644 index 000000000..8ff6ee1a7 --- /dev/null +++ b/third_party/powersort/powersort.BUILD @@ -0,0 +1,15 @@ +load("@rules_cc//cc:defs.bzl", "cc_library") + +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = "powersort", + hdrs = [ + "src/algorithms.h", + "src/sorts/insertionsort.h", + "src/sorts/merging.h", + "src/sorts/powersort.h", + ], + include_prefix = "third_party/powersort/include", + strip_include_prefix = "src", +) diff --git a/third_party/powersort/remove_binary_function.patch b/third_party/powersort/remove_binary_function.patch new file mode 100644 index 000000000..724c7a4d1 --- /dev/null +++ b/third_party/powersort/remove_binary_function.patch @@ -0,0 
+1,13 @@ +diff --git a/src/algorithms.h b/src/algorithms.h +index 4b94d9b..a2a2576 100644 +--- a/src/algorithms.h ++++ b/src/algorithms.h +@@ -16,7 +16,7 @@ namespace algorithms { + + /** superclass for sorting methods */ + template +- class sorter : std::binary_function { ++ class sorter { + protected: + using elem_t = typename std::iterator_traits::value_type ; + using diff_t = typename std::iterator_traits::difference_type ; diff --git a/third_party/powersort/workspace.bzl b/third_party/powersort/workspace.bzl new file mode 100644 index 000000000..0322cfce7 --- /dev/null +++ b/third_party/powersort/workspace.bzl @@ -0,0 +1,20 @@ +"""loads the powersort library, used by Tachyon.""" + +load("//third_party:repo.bzl", "tachyon_http_archive", "tf_mirror_urls") + +def repo(): + tachyon_http_archive( + name = "powersort", + urls = tf_mirror_urls("https://github.com/sebawild/powersort/archive/48e31e909280ca43bb2c33dd3df9922b0a0f3f84.tar.gz"), + sha256 = "89122b7e7e2a0f0b41cc5411f9adde581769ff2f7d141335ce7e5011b932da06", + strip_prefix = "powersort-48e31e909280ca43bb2c33dd3df9922b0a0f3f84", + build_file = "//third_party/powersort:powersort.BUILD", + patch_file = [ + "@kroma_network_tachyon//third_party/powersort:fix_sign_compare_warning.patch", + "@kroma_network_tachyon//third_party/powersort:fix_multiple_definitions.patch", + "@kroma_network_tachyon//third_party/powersort:fix_static_assertion.patch", + # In c++ 17, std::binary_function is removed. + # See https://en.cppreference.com/w/cpp/utility/functional/binary_function. + "@kroma_network_tachyon//third_party/powersort:remove_binary_function.patch", + ], + ) diff --git a/vendors/circom/benchmark/rapidsnark_runner.h b/vendors/circom/benchmark/rapidsnark_runner.h index 0d9f8bf45..809ab48bb 100644 --- a/vendors/circom/benchmark/rapidsnark_runner.h +++ b/vendors/circom/benchmark/rapidsnark_runner.h @@ -82,7 +82,7 @@ class RapidsnarkRunner : public Runner { base::TimeTicks now = base::TimeTicks::Now(); std::vector full_assignments(full_assignments_in.size()); - OPENMP_PARALLEL_FOR(size_t i = 0; i < full_assignments_in.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < full_assignments_in.size(); ++i) { using BigInt = typename F::BigIntTy; BigInt bigint = full_assignments_in[i].ToBigInt(); memcpy(full_assignments[i].v, bigint.limbs, BigInt::kByteNums); diff --git a/vendors/circom/circomlib/circuit/quadratic_arithmetic_program.h b/vendors/circom/circomlib/circuit/quadratic_arithmetic_program.h index a5ee0db72..a6be43316 100644 --- a/vendors/circom/circomlib/circuit/quadratic_arithmetic_program.h +++ b/vendors/circom/circomlib/circuit/quadratic_arithmetic_program.h @@ -38,7 +38,7 @@ class QuadraticArithmeticProgram { omp_lock_t locks[kNumLocks]; for (size_t i = 0; i < kNumLocks; i++) omp_init_lock(&locks[i]); #endif - OPENMP_PARALLEL_FOR(size_t i = 0; i < coefficients.size(); i++) { + OMP_PARALLEL_FOR(size_t i = 0; i < coefficients.size(); i++) { const Coefficient& c = coefficients[i]; std::vector& ab = (c.matrix == 0) ? 
a : b; @@ -58,7 +58,7 @@ class QuadraticArithmeticProgram { for (size_t i = 0; i < kNumLocks; i++) omp_destroy_lock(&locks[i]); #endif - OPENMP_PARALLEL_FOR(size_t i = 0; i < domain->size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < domain->size(); ++i) { c[i] = a[i] * b[i]; } @@ -81,7 +81,7 @@ class QuadraticArithmeticProgram { c_evals = domain->FFT(std::move(c_poly)); // |h_evals[i]| = |a[i]| * |b[i]| - |c[i]| - OPENMP_PARALLEL_FOR(size_t i = 0; i < domain->size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < domain->size(); ++i) { F& h_evals_i = a_evals.at(i); h_evals_i *= b_evals[i]; h_evals_i -= c_evals[i]; diff --git a/vendors/circom/circomlib/wtns/wtns.h b/vendors/circom/circomlib/wtns/wtns.h index b62bc2bff..64a6b0be3 100644 --- a/vendors/circom/circomlib/wtns/wtns.h +++ b/vendors/circom/circomlib/wtns/wtns.h @@ -127,7 +127,7 @@ struct WtnsDataSection { if (!buffer.ReadPtr(&ptr, header.num_witness)) return false; witnesses = {ptr, header.num_witness}; - OPENMP_PARALLEL_FOR(uint32_t i = 0; i < header.num_witness; ++i) { + OMP_PARALLEL_FOR(uint32_t i = 0; i < header.num_witness; ++i) { witnesses[i] = F(witnesses[i].value()); } return true; diff --git a/vendors/circom/circomlib/zkey/zkey.h b/vendors/circom/circomlib/zkey/zkey.h index 09d542e64..fb9b10c69 100644 --- a/vendors/circom/circomlib/zkey/zkey.h +++ b/vendors/circom/circomlib/zkey/zkey.h @@ -215,7 +215,7 @@ struct CoefficientsSection { if (!buffer.ReadPtr(&ptr, num_coefficients)) return false; coefficients = {ptr, num_coefficients}; - OPENMP_PARALLEL_FOR(size_t i = 0; i < coefficients.size(); ++i) { + OMP_PARALLEL_FOR(size_t i = 0; i < coefficients.size(); ++i) { coefficients[i].value = F::FromMontgomery(coefficients[i].value.ToBigInt()); }
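For reference, here is a minimal sketch of the kind of wrapper the new `//tachyon/base:sort` target could expose, so that call sites such as `permute_expression_pair.h`, `v1_strategy.h`, `linear_combination.h`, and `prover_impl.h` can write `base::UnstableSort(...)` / `base::StableSort(...)` instead of reaching into the vendored libraries directly. The real `tachyon/base/sort.h` is not part of the hunks above, so treat this as an assumption: the unstable branch forwards to the free `pdqsort()` function from `third_party/pdqsort/include/pdqsort.h`, while the stable branch uses `std::stable_sort` as a stand-in because the vendored powersort class interface does not appear in this diff.

// sort.h (hypothetical sketch, not the file added by this change)
#include <algorithm>
#include <functional>

#include "third_party/pdqsort/include/pdqsort.h"

namespace tachyon::base {

// Unstable sort: forwards to pdqsort. Equal elements may be reordered, which
// is acceptable for the call sites above (e.g. sorting |c_terms| or
// |region_columns| by a strict key).
template <typename Iter, typename Compare = std::less<>>
void UnstableSort(Iter begin, Iter end, Compare comp = Compare()) {
  pdqsort(begin, end, comp);
}

// Stable sort: keeps the relative order of equal elements, which the
// log-derivative prover relies on for |sorted_table_with_indices|. The real
// implementation presumably dispatches to the vendored powersort; this sketch
// falls back to std::stable_sort so it stays self-contained.
template <typename Iter, typename Compare = std::less<>>
void StableSort(Iter begin, Iter end, Compare comp = Compare()) {
  std::stable_sort(begin, end, comp);
}

}  // namespace tachyon::base

With such a wrapper the mechanical part of this change is just swapping the call site, e.g. `pdqsort(c_terms.begin(), c_terms.end())` becomes `base::UnstableSort(c_terms.begin(), c_terms.end())`.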