diff --git a/include/helib/EncryptedArray.h b/include/helib/EncryptedArray.h
index 9359feb19..d34b09ab4 100644
--- a/include/helib/EncryptedArray.h
+++ b/include/helib/EncryptedArray.h
@@ -2651,7 +2651,7 @@ inline void totalSums(Ctxt& ctxt)
 
 //! @brief Map all non-zero slots to 1, leaving zero slots as zero.
 //! Assumes that r=1, and that all the slots contain elements from GF(p^d).
-void mapTo01(const EncryptedArray& ea, Ctxt& ctxt);
+void mapTo01(const EncryptedArray& ea, Ctxt& ctxt, bool multithread = true);
 // Implemented in eqtesting.cpp. We compute
 //             x^{p^d-1} = x^{(1+p+...+p^{d-1})*(p-1)}
 // by setting y=x^{p-1} and then outputting y * y^p * ... * y^{p^{d-1}},
diff --git a/misc/psi/io/io.h b/misc/psi/io/io.h
index ee0059bf7..abfed3382 100644
--- a/misc/psi/io/io.h
+++ b/misc/psi/io/io.h
@@ -76,11 +76,14 @@ helib::Database<TXT> readDbFromFile(const std::string& databaseFilePath,
       }
     }
   } else { // Ctxt query
-    for (long i = 0; i < nrow; ++i) {
-      for (long j = 0; j < ncol; ++j) {
-        reader.value().readDatum(data(i, j), i, j);
-      }
+    NTL_EXEC_RANGE(nrow * ncol, first, last)
+    Reader<TXT> threadReader(reader.value());
+    for (long i = first; i < last; ++i) {
+      long row = i / ncol;
+      long col = i % ncol;
+      threadReader.readDatum(data(row, col), row, col);
     }
+    NTL_EXEC_RANGE_END
   }
 
   return helib::Database<TXT>(data, contextp);
@@ -128,11 +131,14 @@ helib::Matrix<TXT> readQueryFromFile(const std::string& queryFilePath,
     }
   } else { // Ctxt query
     // Read in ctxts
-    for (long i = 0; i < nrow; ++i) {
-      for (long j = 0; j < ncol; ++j) {
-        reader.value().readDatum(query(i, j), i, j);
-      }
+    NTL_EXEC_RANGE(nrow * ncol, first, last)
+    Reader<TXT> threadReader(reader.value());
+    for (long i = first; i < last; ++i) {
+      long row = i / ncol;
+      long col = i % ncol;
+      threadReader.readDatum(query(row, col), row, col);
     }
+    NTL_EXEC_RANGE_END
     if (ncol == 1) { // Transpose to make row vector
       query.transpose();
     }
diff --git a/src/eqtesting.cpp b/src/eqtesting.cpp
index b310190d5..d11bb1786 100644
--- a/src/eqtesting.cpp
+++ b/src/eqtesting.cpp
@@ -9,6 +9,21 @@
  * See the License for the specific language governing permissions and
  * limitations under the License. See accompanying LICENSE file.
  */
+
+/* Copyright (C) 2022 Intel Corporation
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Modifying HElib to optimize the 01 map.
+ * Contributions include
+ * Modified:
+ *   mapTo01
+ *     added parallelism to existing logic for norm calculation
+ *     added alternative logic for norm calculation which uses O(log(d))
+ *     automorphisms on a single core
+ *     added an additional optional argument `multithread` which determines
+ *     which version to run
+ *
+ */
 /**
  * @file eqtesting.cpp
  * @brief Useful functions for equality testing...
@@ -17,6 +32,7 @@
 #include <helib/timing.h>
 #include <helib/EncryptedArray.h>
 #include <helib/Ptxt.h>
+#include <NTL/BasicThreadPool.h>
 
 #include <cstdio>
 
@@ -29,10 +45,7 @@ namespace helib {
 // and then outputting y * y^p * ... * y^{p^{d-1}}, with exponentiation to
 // powers of p done via Frobenius.
 
-// FIXME: the computation of the "norm" y * y^p * ... * y^{p^{d-1}}
-// can be done using O(log d) automorphisms, rather than O(d).
-
-void mapTo01(const EncryptedArray& ea, Ctxt& ctxt)
+void mapTo01(const EncryptedArray& ea, Ctxt& ctxt, bool multithread)
 {
   long p = ctxt.getPtxtSpace();
   if (p != ea.getPAlgebra().getP()) // ptxt space is p^r for r>1
@@ -40,13 +53,39 @@ void mapTo01(const EncryptedArray& ea, Ctxt& ctxt)
 
   if (p > 2)
     ctxt.power(p - 1); // set y = x^{p-1}
-
   long d = ea.getDegree();
-  if (d > 1) { // compute the product of the d automorphisms
-    std::vector<Ctxt> v(d, ctxt);
-    for (long i = 1; i < d; i++)
-      v[i].frobeniusAutomorph(i);
-    totalProduct(ctxt, v);
+  // TODO: investigate this trade-off more thoroughly
+  // The parallel path runs d - 1 automorphisms over t threads (about
+  // (d - 1)/t each), whereas the single-thread path runs O(log d) of them
+  if ((NTL::AvailableThreads() > 1) && multithread) {
+    // Compute O(d) Frobenius automorphisms in parallel
+    if (d > 1) {
+      // compute the d - 1 automorphisms in parallel
+      std::vector<Ctxt> v(d, ctxt);
+      NTL_EXEC_RANGE(d - 1, first, last)
+      for (long i = first; i < last; i++)
+        v[i + 1].frobeniusAutomorph(i + 1);
+      NTL_EXEC_RANGE_END
+      // and compute the product of the d automorphisms
+      totalProduct(ctxt, v);
+    }
+  } else {
+    // Compute the "norm" y * y^p * ... * y^{p^{d-1}}
+    // using O(log d) automorphisms, rather than O(d).
+    long e = 1;
+    long b = NTL::NumBits(d);
+    Ctxt orig = ctxt;
+    for (long i = b - 2; i >= 0; i--) {
+      Ctxt tmp = ctxt;
+      tmp.frobeniusAutomorph(e);
+      ctxt *= tmp;
+      e *= 2;
+      if (NTL::bit(d, i)) {
+        ctxt.frobeniusAutomorph(1);
+        ctxt *= orig;
+        e++;
+      }
+    }
   }
 }