From 5153c63d2f3c311bd8272ba9848b296f88b409b4 Mon Sep 17 00:00:00 2001 From: Marcel Keller Date: Thu, 16 Feb 2023 12:34:22 +1100 Subject: [PATCH] More accessible machine learning functionality. --- BMR/Register.h | 2 + CHANGELOG.md | 12 + CONFIG | 2 +- Compiler/GC/types.py | 35 +- Compiler/allocator.py | 39 +- Compiler/compilerLib.py | 180 ++++- Compiler/decision_tree.py | 233 ++++-- Compiler/floatingpoint.py | 4 +- Compiler/instructions.py | 26 +- Compiler/instructions_base.py | 39 +- Compiler/library.py | 355 ++------ Compiler/ml.py | 760 +++++++++++++++--- Compiler/mpc_math.py | 2 +- Compiler/program.py | 74 +- Compiler/sorting.py | 18 + Compiler/sqrt_oram.py | 3 +- Compiler/types.py | 572 +++++++++---- ExternalIO/README.md | 11 +- GC/FakeSecret.cpp | 6 + GC/FakeSecret.h | 2 + GC/Program.hpp | 2 +- GC/RuntimeBranching.h | 3 + GC/Secret.h | 2 + GC/SemiSecret.h | 2 + GC/ShareSecret.h | 5 +- GC/ThreadMaster.hpp | 3 +- GC/instructions.h | 2 +- License.txt | 2 +- Machines/TripleMachine.cpp | 4 + Makefile | 42 +- Math/bigint.h | 12 + Math/gf2n.cpp | 27 +- Math/gf2n.h | 4 +- Math/gf2nlong.cpp | 28 +- Math/gf2nlong.h | 15 - Math/gfp.h | 1 + Networking/Exchanger.h | 3 + Networking/ServerSocket.cpp | 25 +- Networking/sockets.cpp | 2 +- Networking/sockets.h | 21 +- Networking/ssl_sockets.h | 2 +- OT/BitMatrix.h | 3 - OT/OTVole.hpp | 6 +- Processor/Data_Files.hpp | 5 +- Processor/ExternalClients.cpp | 4 + Processor/ExternalClients.h | 3 + Processor/Instruction.h | 1 + Processor/Instruction.hpp | 45 +- Processor/Machine.h | 5 + Processor/Machine.hpp | 62 +- Processor/OnlineOptions.cpp | 7 +- Processor/PrepBase.cpp | 11 +- Processor/Processor.h | 25 +- Processor/Processor.hpp | 96 ++- Processor/Program.cpp | 12 + Processor/Program.h | 5 + Processor/instructions.h | 2 +- Programs/Source/bankers_bonus.mpc | 3 +- Programs/Source/breast_logistic.mpc | 54 ++ Programs/Source/breast_tree.mpc | 33 + Programs/Source/diabetes.mpc | 32 + Programs/Source/easy_adult.mpc | 38 + Programs/Source/keras_cifar_lenet.mpc | 41 +- Programs/Source/keras_mnist_dense.mpc | 34 +- Programs/Source/keras_mnist_lenet.mpc | 34 +- Programs/Source/keras_mnist_lenet_predict.mpc | 2 +- Programs/Source/test_sbitfix.mpc | 3 +- Programs/Source/torch_alex_test.mpc | 92 +++ Programs/Source/torch_cifar_alex.mpc | 70 ++ Programs/Source/torch_cifar_lenet.mpc | 57 ++ .../Source/torch_cifar_lenet_pretrain.mpc | 81 ++ Programs/Source/torch_mnist_dense.mpc | 57 ++ .../Source/torch_mnist_dense_pretrain.mpc | 72 ++ Programs/Source/torch_mnist_dense_test.mpc | 40 + Programs/Source/torch_mnist_lenet.mpc | 49 ++ Programs/Source/torch_mnist_lenet_predict.mpc | 74 ++ Protocols/FakeShare.h | 1 + Protocols/Hemi.hpp | 4 +- Protocols/SemiInput.hpp | 2 +- Protocols/ShareInterface.h | 2 + README.md | 101 ++- Scripts/build.sh | 7 +- Scripts/compile-emulate.py | 17 + Scripts/compile-run.py | 23 + Scripts/memory-usage.py | 15 +- Scripts/setup-clients.sh | 3 + Scripts/setup-ssl.sh | 2 +- Scripts/test_tutorial.sh | 10 +- Scripts/tldr.sh | 3 +- Scripts/torch_cifar_alex_import.py | 61 ++ Scripts/torch_mnist_dense_import.py | 46 ++ Scripts/torch_mnist_lenet_import.py | 51 ++ Tools/FlexBuffer.cpp | 5 +- Tools/Hash.cpp | 5 + Tools/Hash.h | 1 + Tools/Lock.h | 17 + Tools/ezOptionParser.h | 20 +- Yao/YaoEvalWire.cpp | 9 + Yao/YaoEvalWire.h | 2 + Yao/YaoEvaluator.cpp | 8 +- Yao/YaoEvaluator.h | 2 +- Yao/YaoGarbleWire.cpp | 9 + Yao/YaoGarbleWire.h | 2 + Yao/YaoGarbler.cpp | 5 +- Yao/YaoGarbler.h | 2 +- Yao/YaoPlayer.cpp | 2 +- deps/libOTe | 2 +- doc/Compiler.rst | 8 + 
doc/Doxyfile | 2 +- doc/add-protocol.rst | 12 +- doc/compilation.rst | 8 +- doc/gen-readme.sh | 7 +- doc/index.rst | 14 +- doc/io.rst | 4 +- doc/low-level.rst | 6 + doc/machine-learning.rst | 472 ++++++++++- doc/ml-quickstart.rst | 92 +++ doc/networking.rst | 2 + doc/troubleshooting.rst | 20 + 119 files changed, 3857 insertions(+), 969 deletions(-) create mode 100644 Programs/Source/breast_logistic.mpc create mode 100644 Programs/Source/breast_tree.mpc create mode 100644 Programs/Source/diabetes.mpc create mode 100644 Programs/Source/easy_adult.mpc create mode 100644 Programs/Source/torch_alex_test.mpc create mode 100644 Programs/Source/torch_cifar_alex.mpc create mode 100644 Programs/Source/torch_cifar_lenet.mpc create mode 100644 Programs/Source/torch_cifar_lenet_pretrain.mpc create mode 100644 Programs/Source/torch_mnist_dense.mpc create mode 100644 Programs/Source/torch_mnist_dense_pretrain.mpc create mode 100644 Programs/Source/torch_mnist_dense_test.mpc create mode 100644 Programs/Source/torch_mnist_lenet.mpc create mode 100644 Programs/Source/torch_mnist_lenet_predict.mpc create mode 100755 Scripts/compile-emulate.py create mode 100755 Scripts/compile-run.py create mode 100755 Scripts/torch_cifar_alex_import.py create mode 100755 Scripts/torch_mnist_dense_import.py create mode 100755 Scripts/torch_mnist_lenet_import.py create mode 100644 doc/ml-quickstart.rst diff --git a/BMR/Register.h b/BMR/Register.h index 4def65901..2085eb25a 100644 --- a/BMR/Register.h +++ b/BMR/Register.h @@ -296,6 +296,8 @@ class ProgramRegister : public Phase, public Register static void andm(GC::Processor&, const BaseInstruction&) { throw runtime_error("andm not implemented"); } + static void run_tapes(const vector&) { throw not_implemented(); } + // most BMR phases don't need actual input template static T get_input(GC::Processor& processor, const InputArgs& args) diff --git a/CHANGELOG.md b/CHANGELOG.md index f201d4640..9a3a276d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,17 @@ The changelog explains changes pulled through from the private development repository. Bug fixes and small enhancements are committed between releases and not documented here. +## 0.3.5 (Feb 16, 2023) + +- Easier-to-use machine learning interface +- Integrated compilation-execution facility +- Import/export sequential models and parameters from/to PyTorch +- Binary-format input files +- Less aggressive round optimization for faster compilation by default +- Multithreading with client interface +- Functionality to protect order of specific memory accesses +- Oblivious transfer works again on older (pre-2011) x86 CPUs +- clang is used by default + ## 0.3.4 (Nov 9, 2022) - Decision tree learning diff --git a/CONFIG b/CONFIG index 0d41c9ef7..6d5f0f170 100644 --- a/CONFIG +++ b/CONFIG @@ -47,7 +47,7 @@ endif USE_KOS = 0 # allow to set compiler in CONFIG.mine -CXX = g++ +CXX = clang++ # use CONFIG.mine to overwrite DIR settings -include CONFIG.mine diff --git a/Compiler/GC/types.py b/Compiler/GC/types.py index f70ee6417..1c024c7e3 100644 --- a/Compiler/GC/types.py +++ b/Compiler/GC/types.py @@ -711,16 +711,23 @@ def n_elements(): def mem_size(): return n @classmethod - def get_input_from(cls, player): + def get_input_from(cls, player, size=1, f=0): """ Secret input from :py:obj:`player`. The input is decomposed into bits. 
:param: player (int) """ + v = [0] * n sbits._check_input_player(player) - res = cls.from_vec(sbit() for i in range(n)) - inst.inputbvec(n + 3, 0, player, *res.v) - return res + instructions_base.check_vector_size(size) + for i in range(size): + vv = [sbit() for i in range(n)] + inst.inputbvec(n + 3, f, player, *vv) + for j in range(n): + tmp = vv[j] << i + v[j] = tmp ^ v[j] + sbits._check_input_player(player) + return cls.from_vec(v) get_raw_input_from = get_input_from @classmethod def from_vec(cls, vector): @@ -728,6 +735,7 @@ def from_vec(cls, vector): res.v = _complement_two_extend(list(vector), n)[:n] return res def __init__(self, other=None, size=None): + instructions_base.check_vector_size(size) if other is not None: if util.is_constant(other): t = sbits.get_type(size or 1) @@ -1148,6 +1156,9 @@ class sbitint(_bitint, _number, sbits, _sbitintbase): mul: 15 lt: 0 + This class is retained for compatibility, but development now + focuses on :py:class:`sbitintvec`. + """ n_bits = None bin_type = None @@ -1347,9 +1358,12 @@ def output(self): cbits(0), cbits(0)) class sbitfix(_fix): - """ Secret signed integer in one binary register. + """ Secret signed fixed-point number in one binary register. Use :py:obj:`set_precision()` to change the precision. + This class is retained for compatibility, but development now + focuses on :py:class:`sbitfixvec`. + Example:: print_ln('add: %s', (sbitfix(0.5) + sbitfix(0.3)).reveal()) @@ -1453,15 +1467,8 @@ def get_input_from(cls, player, size=1): :param: player (int) """ - v = [0] * sbitfix.k - sbits._check_input_player(player) - for i in range(size): - vv = [sbit() for i in range(sbitfix.k)] - inst.inputbvec(len(v) + 3, sbitfix.f, player, *vv) - for j in range(sbitfix.k): - tmp = vv[j] << i - v[j] = tmp ^ v[j] - return cls._new(cls.int_type.from_vec(v)) + return cls._new(cls.int_type.get_input_from(player, size=size, + f=sbitfix.f)) def __init__(self, value=None, *args, **kwargs): if isinstance(value, (list, tuple)): self.v = self.int_type.from_vec(sbitvec([x.v for x in value])) diff --git a/Compiler/allocator.py b/Compiler/allocator.py index b68160434..980a189a4 100644 --- a/Compiler/allocator.py +++ b/Compiler/allocator.py @@ -315,7 +315,6 @@ def dependency_graph(self, merge_classes): last_def = defaultdict_by_id(lambda: -1) last_mem_write = [] last_mem_read = [] - warned_about_mem = [] last_mem_write_of = defaultdict(list) last_mem_read_of = defaultdict(list) last_print_str = None @@ -364,20 +363,22 @@ def mem_access(n, instr, last_access_this_kind, last_access_other_kind): addr_i = addr + i handle_mem_access(addr_i, reg_type, last_access_this_kind, last_access_other_kind) - if block.warn_about_mem and not warned_about_mem and \ - (instr.get_size() > 100): + if block.warn_about_mem and \ + not block.parent.warned_about_mem and \ + (instr.get_size() > 100) and not instr._protect: print('WARNING: Order of memory instructions ' \ 'not preserved due to long vector, errors possible') - warned_about_mem.append(True) + block.parent.warned_about_mem = True else: handle_mem_access(addr, reg_type, last_access_this_kind, last_access_other_kind) - if block.warn_about_mem and not warned_about_mem and \ - not isinstance(instr, DirectMemoryInstruction): + if block.warn_about_mem and \ + not block.parent.warned_about_mem and \ + not isinstance(instr, DirectMemoryInstruction) and \ + not instr._protect: print('WARNING: Order of memory instructions ' \ 'not preserved, errors possible') - # hack - warned_about_mem.append(True) + block.parent.warned_about_mem = True 
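The `_protect` attribute consulted above is set per instruction from a program-level switch (see the new `MemoryInstruction` base class in Compiler/instructions_base.py further down in this patch). The MaxPool backward pass in Compiler/ml.py uses that switch to keep concurrent read-modify-write accesses in order; a minimal sketch of the pattern, with names as in that code::

    get_program().protect_memory(True)    # accesses below keep their relative order
    self.nabla_X[bi][hh][ww][k] += res    # read-modify-write on shared memory
    get_program().protect_memory(False)   # re-enable reordering optimization
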
def strict_mem_access(n, last_this_kind, last_other_kind): if last_other_kind and last_this_kind and \ @@ -473,14 +474,14 @@ def keep_text_order(inst, n): depths[n] = depth if isinstance(instr, ReadMemoryInstruction): - if options.preserve_mem_order: + if options.preserve_mem_order or instr._protect: strict_mem_access(n, last_mem_read, last_mem_write) - else: + elif not options.preserve_mem_order: mem_access(n, instr, last_mem_read_of, last_mem_write_of) elif isinstance(instr, WriteMemoryInstruction): - if options.preserve_mem_order: + if options.preserve_mem_order or instr._protect: strict_mem_access(n, last_mem_write, last_mem_read) - else: + elif not options.preserve_mem_order: mem_access(n, instr, last_mem_write_of, last_mem_read_of) elif isinstance(instr, matmulsm): if options.preserve_mem_order: @@ -495,7 +496,7 @@ def keep_text_order(inst, n): add_edge(last_print_str, n) last_print_str = n elif isinstance(instr, PublicFileIOInstruction): - keep_order(instr, n, instr.__class__) + keep_order(instr, n, PublicFileIOInstruction) elif isinstance(instr, prep_class): keep_order(instr, n, instr.args[0]) elif isinstance(instr, StackInstruction): @@ -586,7 +587,7 @@ class RegintOptimizer: def __init__(self): self.cache = util.dict_by_id() - def run(self, instructions): + def run(self, instructions, program): for i, inst in enumerate(instructions): if isinstance(inst, ldint_class): self.cache[inst.args[0]] = inst.args[1] @@ -601,6 +602,7 @@ def run(self, instructions): elif isinstance(inst, IndirectMemoryInstruction): if inst.args[1] in self.cache: instructions[i] = inst.get_direct(self.cache[inst.args[1]]) + instructions[i]._protect = inst._protect elif type(inst) == convint_class: if inst.args[1] in self.cache: res = self.cache[inst.args[1]] @@ -614,4 +616,13 @@ def run(self, instructions): if op == 0: instructions[i] = ldsi(inst.args[0], 0, add_to_prog=False) + elif isinstance(inst, (crash, cond_print_str, cond_print_plain)): + if inst.args[0] in self.cache: + cond = self.cache[inst.args[0]] + if not cond: + instructions[i] = None + pre = len(instructions) instructions[:] = list(filter(lambda x: x is not None, instructions)) + post = len(instructions) + if pre != post and program.options.verbose: + print('regint optimizer removed %d instructions' % (pre - post)) diff --git a/Compiler/compilerLib.py b/Compiler/compilerLib.py index 462a5d108..bb80dc344 100644 --- a/Compiler/compilerLib.py +++ b/Compiler/compilerLib.py @@ -3,6 +3,7 @@ import re import sys import tempfile +import subprocess from optparse import OptionParser from Compiler.exceptions import CompilerError @@ -12,11 +13,12 @@ class Compiler: - def __init__(self, custom_args=None, usage=None): + def __init__(self, custom_args=None, usage=None, execute=False): if usage: self.usage = usage else: self.usage = "usage: %prog [options] filename [args]" + self.execute = execute self.custom_args = custom_args self.build_option_parser() self.VARS = {} @@ -72,7 +74,8 @@ def build_option_parser(self): "--optimize-hard", action="store_true", dest="optimize_hard", - help="currently not in use", + help="lower number of rounds at higher compilation cost " + "(disables -C and increases the budget to 100000)", ) parser.add_option( "-u", @@ -157,8 +160,8 @@ def build_option_parser(self): "-b", "--budget", dest="budget", - default=defaults.budget, - help="set budget for optimized loop unrolling " "(default: 100000)", + help="set budget for optimized loop unrolling (default: %d)" % \ + defaults.budget, ) parser.add_option( "-X", @@ -195,7 +198,8 @@ def 
build_option_parser(self): "--CISC", action="store_true", dest="cisc", - help="faster CISC compilation mode", + help="faster CISC compilation mode " + "(used by default unless -O is given)", ) parser.add_option( "-K", @@ -217,15 +221,62 @@ def build_option_parser(self): dest="verbose", help="more verbose output", ) + if self.execute: + parser.add_option( + "-E", + "--execute", + dest="execute", + help="protocol to execute with", + ) + parser.add_option( + "-H", + "--hostfile", + dest="hostfile", + help="hosts to execute with", + ) self.parser = parser def parse_args(self): self.options, self.args = self.parser.parse_args(self.custom_args) - if self.options.optimize_hard: - print("Note that -O/--optimize-hard currently has no effect") + if self.execute: + if not self.options.execute: + raise CompilerError("must give name of protocol with '-E'") + protocol = self.options.execute + if protocol.find("ring") >= 0 or protocol.find("2k") >= 0 or \ + protocol.find("brain") >= 0 or protocol == "emulate": + if not (self.options.ring or self.options.binary): + self.options.ring = "64" + if self.options.field: + raise CompilerError( + "field option not compatible with %s" % protocol) + else: + if protocol.find("bin") >= 0 or protocol.find("ccd") >= 0 or \ + protocol.find("bmr") >= 0 or \ + protocol in ("replicated", "tinier", "tiny", "yao"): + if not self.options.binary: + self.options.binary = "32" + if self.options.ring or self.options.field: + raise CompilerError( + "ring/field options not compatible with %s" % + protocol) + if self.options.ring: + raise CompilerError( + "ring option not compatible with %s" % protocol) + if protocol == "emulate": + self.options.keep_cisc = '' def build_program(self, name=None): self.prog = Program(self.args, self.options, name=name) + if self.execute: + if self.options.execute in \ + ("emulate", "ring", "rep-field", "semi2k"): + self.prog.use_trunc_pr = True + if self.options.execute in ("ring",): + self.prog.use_split(3) + if self.options.execute in ("semi2k",): + self.prog.use_split(2) + if self.options.execute in ("rep4-ring",): + self.prog.use_split(4) def build_vars(self): from . 
import comparison, floatingpoint, instructions, library, types @@ -283,11 +334,15 @@ def build_vars(self): ]: del self.VARS[i] - def prep_compile(self, name=None): + def prep_compile(self, name=None, build=True): self.parse_args() if len(self.args) < 1 and name is None: self.parser.print_help() exit(1) + if build: + self.build(name=name) + + def build(self, name=None): self.build_program(name=name) self.build_vars() @@ -307,7 +362,7 @@ def compile_file(self): if if_stack and not re.match(if_stack[-1][0], line): if_stack.pop() m = re.match( - r"(\s*)for +([a-zA-Z_]+) +in " r"+range\(([0-9a-zA-Z_]+)\):", + r"(\s*)for +([a-zA-Z_]+) +in " r"+range\(([0-9a-zA-Z_.]+)\):", line, ) if m: @@ -403,3 +458,110 @@ def finalize_compile(self): print("Memory size:", dict(self.prog.allocated_mem)) return self.prog + + @staticmethod + def executable_from_protocol(protocol): + match = { + "ring": "replicated-ring", + "rep-field": "replicated-field", + "replicated": "replicated-bin" + } + if protocol in match: + protocol = match[protocol] + if protocol.find("bmr") == -1: + protocol = re.sub("^mal-", "malicious-", protocol) + if protocol == "emulate": + return protocol + ".x" + else: + return protocol + "-party.x" + + def local_execution(self, args=[]): + executable = self.executable_from_protocol(self.options.execute) + if not os.path.exists(executable): + print("Creating binary for virtual machine...") + try: + subprocess.run(["make", executable], check=True) + except: + raise CompilerError( + "Cannot produce %s. " % executable + \ + "Note that compilation requires a few GB of RAM.") + vm = 'Scripts/%s.sh' % self.options.execute + os.execl(vm, vm, self.prog.name, *args) + + def remote_execution(self, args=[]): + vm = self.executable_from_protocol(self.options.execute) + hosts = list(x.strip() + for x in filter(None, open(self.options.hostfile))) + # test availability before compilation + from fabric import Connection + import subprocess + print("Creating static binary for virtual machine...") + subprocess.run(["make", "static/%s" % vm], check=True) + + # transfer files + import glob + hostnames = [] + destinations = [] + for host in hosts: + split = host.split('/', maxsplit=1) + hostnames.append(split[0]) + if len(split) > 1: + destinations.append(split[1]) + else: + destinations.append('.') + connections = [Connection(hostname) for hostname in hostnames] + print("Setting up players...") + + def run(i): + dest = destinations[i] + connection = connections[i] + connection.run( + "mkdir -p %s/{Player-Data,Programs/{Bytecode,Schedules}} " % \ + dest) + # executable + connection.put("static/%s" % vm, dest) + # program + dest += "/" + connection.put("Programs/Schedules/%s.sch" % self.prog.name, + dest + "Programs/Schedules") + for filename in glob.glob( + "Programs/Bytecode/%s-*.bc" % self.prog.name): + connection.put(filename, dest + "Programs/Bytecode") + # inputs + for filename in glob.glob("Player-Data/Input*-P%d-*" % i): + connection.put(filename, dest + "Player-Data") + # key and certificates + for suffix in ('key', 'pem'): + connection.put("Player-Data/P%d.%s" % (i, suffix), + dest + "Player-Data") + for filename in glob.glob("Player-Data/*.0"): + connection.put(filename, dest + "Player-Data") + + import threading + import random + threads = [] + for i in range(len(hosts)): + threads.append(threading.Thread(target=run, args=(i,))) + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + # execution + threads = [] + # random port numbers to avoid conflict + port = 10000 + 
random.randrange(40000) + if '@' in hostnames[0]: + party0 = hostnames[0].split('@')[1] + else: + party0 = hostnames[0] + for i in range(len(connections)): + run = lambda i: connections[i].run( + "cd %s; ./%s -p %d %s -h %s -pn %d %s" % \ + (destinations[i], vm, i, self.prog.name, party0, port, + ' '.join(args))) + threads.append(threading.Thread(target=run, args=(i,))) + for thread in threads: + thread.start() + for thread in threads: + thread.join() diff --git a/Compiler/decision_tree.py b/Compiler/decision_tree.py index 89e3fe5c7..7e25f1591 100644 --- a/Compiler/decision_tree.py +++ b/Compiler/decision_tree.py @@ -8,7 +8,6 @@ debug = False debug_split = False -debug_layers = False max_leaves = None def get_type(x): @@ -70,26 +69,35 @@ def Sort(keys, *to_sort, n_bits=None, time=False): bs = Matrix.create_from( sum([k.get_vector().bit_decompose(nb) for k, nb in reversed(list(zip(keys, n_bits)))], [])) - res = Matrix.create_from(to_sort) + get_vec = lambda x: x[:] if isinstance(x, Array) else x + res = Matrix.create_from(get_vec(x).v if isinstance(get_vec(x), sfix) else x + for x in to_sort) res = res.transpose() if time: start_timer(11) - print_ln('sort') radix_sort_from_matrix(bs, res) if time: stop_timer(11) stop_timer(1) - return res.transpose() + res = res.transpose() + return [sfix._new(get_vec(x), k=get_vec(y).k, f=get_vec(y).f) + if isinstance(get_vec(y), sfix) + else x for (x, y) in zip(res, to_sort)] -def VectMax(key, *data): +def VectMax(key, *data, debug=False): def reducer(x, y): b = x[0] > y[0] + if debug: + print_ln('max b=%s', b.reveal()) return [b.if_else(xx, yy) for xx, yy in zip(x, y)] if debug: key = list(key) data = [list(x) for x in data] print_ln('vect max key=%s data=%s', util.reveal(key), util.reveal(data)) - return util.tree_reduce(reducer, zip(key, *data))[1:] + res = util.tree_reduce(reducer, zip(key, *data))[1:] + if debug: + print_ln('vect max res=%s', util.reveal(res)) + return res def GroupSum(g, x): assert len(g) == len(x) @@ -161,19 +169,19 @@ def ModifiedGini(g, y, debug=False): wqs = w[0] ** 2 + w[1] ** 2 res = sfix(uqs) / us + sfix(wqs) / ws if debug: + print_ln('g=%s y=%s s=%s', + util.reveal(g), util.reveal(y), + util.reveal(s)) print_ln('u0=%s', util.reveal(u[0])) print_ln('u0=%s', util.reveal(u[1])) print_ln('us=%s', util.reveal(us)) print_ln('w0=%s', util.reveal(w[0])) print_ln('w1=%s', util.reveal(w[1])) print_ln('ws=%s', util.reveal(ws)) - print_ln('p=%s', util.reveal(p)) - print_ln('q=%s', util.reveal(q)) - print_ln('g=%s y=%s s=%s', - util.reveal(g), util.reveal(y), - util.reveal(s)) + print_ln('uqs=%s', util.reveal(uqs)) + print_ln('wqs=%s', util.reveal(wqs)) if debug: - print_ln('gini %s %s', str(res), util.reveal(res)) + print_ln('gini %s %s', type(res), util.reveal(res)) return res MIN_VALUE = -10000 @@ -181,11 +189,16 @@ def ModifiedGini(g, y, debug=False): def FormatLayer(h, g, *a): return CropLayer(h, *FormatLayer_without_crop(g, *a)) -def FormatLayer_without_crop(g, *a): +def FormatLayer_without_crop(g, *a, debug=False): for x in a: assert len(x) == len(g) v = [g.if_else(aa, 0) for aa in a] + if debug: + print_ln('format in %s', util.reveal(a)) + print_ln('format mux %s', util.reveal(v)) v = Sort([g.bit_not()], *v, n_bits=[1]) + if debug: + print_ln('format sort %s', util.reveal(v)) return v def CropLayer(k, *v): @@ -243,8 +256,9 @@ def ApplyTests(self, x, AID, Threshold): def _(j): e[j][:] = AID[:] == j xx = sum(x[j] * e[j] for j in range(m)) - if debug: + if self.debug > 1: print_ln('apply e=%s xx=%s', util.reveal(e), util.reveal(xx)) 
+ print_ln('threshold %s', util.reveal(Threshold)) return 2 * xx < Threshold def AttributeWiseTestSelection(self, g, x, y, time=False, debug=False): @@ -252,10 +266,10 @@ def AttributeWiseTestSelection(self, g, x, y, time=False, debug=False): assert len(g) == len(y) if time: start_timer(2) - s = ModifiedGini(g, y, debug=debug) + s = ModifiedGini(g, y, debug=debug or self.debug > 2) if time: stop_timer(2) - if debug: + if debug or self.debug > 1: print_ln('gini %s', s.reveal()) xx = x t = get_type(x).Array(len(x)) @@ -296,35 +310,46 @@ def GlobalTestSelection(self, x, y, g): @for_range_multithread(self.n_threads, 1, m) def _(j): single = not self.n_threads or self.n_threads == 1 - print_ln('run %s', j) + time = self.time and single + if debug: + print_ln('run %s', j) @if_e(self.attr_lengths[j]) def _(): u[j][:], v[j][:] = Sort((PrefixSum(g), x[j]), x[j], y, - n_bits=[util.log2(n), 1], time=single) + n_bits=[util.log2(n), 1], time=time) @else_ def _(): u[j][:], v[j][:] = Sort((PrefixSum(g), x[j]), x[j], y, n_bits=[util.log2(n), None], - time=single) + time=time) if self.debug_threading: print_ln('global sort %s %s %s', j, util.reveal(u[j]), util.reveal(v[j])) t[j][:], s[j][:] = self.AttributeWiseTestSelection( - g, u[j], v[j], time=single, debug=self.debug_selection) + g, u[j], v[j], time=time, debug=self.debug_selection) if self.debug_threading: print_ln('global attribute %s %s %s', j, util.reveal(t[j]), util.reveal(s[j])) n = len(g) - a, tt = [sint.Array(n) for i in range(2)] + a = sint.Array(n) if self.debug_threading: print_ln('global s=%s', util.reveal(s)) if self.debug_gini: print_ln('Gini indices ' + ' '.join(str(i) + ':%s' for i in range(m)), *(ss[0].reveal() for ss in s)) - start_timer(4) - a[:], tt[:] = VectMax((s[j][:] for j in range(m)), range(m), - (t[j][:] for j in range(m))) - stop_timer(4) + if self.time: + start_timer(4) + if self.debug > 1: + print_ln('s=%s', s.reveal_nested()) + print_ln('t=%s', t.reveal_nested()) + a[:], tt = VectMax((s[j][:] for j in range(m)), range(m), + (t[j][:] for j in range(m)), debug=self.debug > 1) + tt = Array.create_from(tt) + if self.time: + stop_timer(4) + if self.debug > 1: + print_ln('a=%s', util.reveal(a)) + print_ln('tt=%s', util.reveal(tt)) return a[:], tt[:] def TrainInternalNodes(self, k, x, y, g, NID): @@ -333,13 +358,18 @@ def TrainInternalNodes(self, k, x, y, g, NID): assert len(xx) == len(g) AID, Threshold = self.GlobalTestSelection(x, y, g) s = GroupSame(g[:], y[:]) - if debug or debug_split: + if self.debug > 1 or debug_split: print_ln('AID=%s', util.reveal(AID)) print_ln('Threshold=%s', util.reveal(Threshold)) print_ln('GroupSame=%s', util.reveal(s)) AID, Threshold = s.if_else(0, AID), s.if_else(MIN_VALUE, Threshold) + if self.debug > 1 or debug_split: + print_ln('AID=%s', util.reveal(AID)) + print_ln('Threshold=%s', util.reveal(Threshold)) b = self.ApplyTests(x, AID, Threshold) - return FormatLayer_without_crop(g[:], NID, AID, Threshold), b + layer = FormatLayer_without_crop(g[:], NID, AID, Threshold, + debug=self.debug > 1) + return *layer, b @method_block def train_layer(self, k): @@ -347,19 +377,21 @@ def train_layer(self, k): y = self.y g = self.g NID = self.NID - layer_matrix = self.layer_matrix - self.layer_matrix[k], b = \ + if self.debug > 1: + print_ln('g=%s', g.reveal()) + print_ln('y=%s', y.reveal()) + print_ln('x=%s', x.reveal_nested()) + self.nids[k], self.aids[k], self.thresholds[k], b = \ self.TrainInternalNodes(k, x, y, g, NID) - if debug: - print_ln('internal %s %s', - util.reveal(layer_matrix[k]), 
util.reveal(b)) - if debug_layers: + if self.debug > 1: print_ln('layer %s:', k) - for name, data in zip(('NID', 'AID', 'Thr'), layer_matrix[k]): + for name, data in zip(('NID', 'AID', 'Thr'), + (self.nids[k], self.aids[k], + self.thresholds[k])): print_ln(' %s: %s', name, data.reveal()) NID[:] = 2 ** k * b + NID b_not = b.bit_not() - if debug: + if self.debug > 1: print_ln('b_not=%s', b_not.reveal()) g[:] = GroupFirstOne(g, b_not) + GroupFirstOne(g, b) y[:], g[:], NID[:], *xx = Sort([b], y, g, NID, *x, n_bits=[1]) @@ -388,33 +420,38 @@ def __init__(self, x, y, h, binary=False, attr_lengths=None, self.NID.assign_all(1) self.y = Array.create_from(y) self.x = Matrix.create_from(x) - self.layer_matrix = sint.Tensor([h, 3, n]) + self.nids, self.aids = [sint.Matrix(h, n) for i in range(2)] + self.thresholds = self.x.value_type.Matrix(h, n) self.n_threads = n_threads self.debug_selection = False self.debug_threading = False - self.debug_gini = True + self.debug_gini = False + self.debug = False + self.time = False def train(self): """ Train and return decision tree. """ - h = len(self.layer_matrix) + h = len(self.nids) @for_range(h) def _(k): self.train_layer(k) return self.get_tree(h) - def train_with_testing(self, *test_set): + def train_with_testing(self, *test_set, output=False): """ Train decision tree and test against test data. :param y: binary labels (list or sint vector) :param x: sample data (by attribute, list or :py:obj:`~Compiler.types.Matrix`) + :param output: output tree after every level :returns: tree """ - for k in range(len(self.layer_matrix)): + for k in range(len(self.nids)): self.train_layer(k) tree = self.get_tree(k + 1) - output_decision_tree(tree) + if output: + output_decision_tree(tree) test_decision_tree('train', tree, self.y, self.x, n_threads=self.n_threads) if test_set: @@ -425,7 +462,8 @@ def train_with_testing(self, *test_set): def get_tree(self, h): Layer = [None] * (h + 1) for k in range(h): - Layer[k] = CropLayer(k, *self.layer_matrix[k]) + Layer[k] = CropLayer(k, self.nids[k], self.aids[k], + self.thresholds[k]) Layer[h] = TrainLeafNodes(h, self.g[:], self.y[:], self.NID) return Layer @@ -479,8 +517,9 @@ def run_decision_tree(layers, data): bits = layers[h][0].equal(index, h) return pick(bits, layers[h][1]) -def test_decision_tree(name, layers, y, x, n_threads=None): - start_timer(100) +def test_decision_tree(name, layers, y, x, n_threads=None, time=False): + if time: + start_timer(100) n = len(y) x = x.transpose().reveal() y = y.reveal() @@ -488,7 +527,8 @@ def test_decision_tree(name, layers, y, x, n_threads=None): truth = regint.Array(n) correct = regint.Array(2) parts = regint.Array(2) - layers = [Matrix.create_from(util.reveal(layer)) for layer in layers] + layers = [[Array.create_from(util.reveal(x)) for x in layer] + for layer in layers] @for_range_multithread(n_threads, 1, n) def _(i): guess[i] = run_decision_tree([[part[:] for part in layer] @@ -501,4 +541,105 @@ def _(i): correct[truth[i]] += c print_ln('%s for height %s: %s/%s (%s/%s, %s/%s)', name, len(layers) - 1, sum(correct), n, correct[0], parts[0], correct[1], parts[1]) - stop_timer(100) + if time: + stop_timer(100) + +class TreeClassifier: + """ Tree classification with convenient interface. Uses + :py:class:`TreeTrainer` internally. 
+ + :param max_depth: the depth of the decision tree + + """ + def __init__(self, max_depth): + self.max_depth = max_depth + + @staticmethod + def get_attr_lengths(attr_types): + if attr_types == None: + return None + else: + return [1 if x == 'b' else 0 for x in attr_types] + + def fit(self, X, y, attr_types=None): + """ Train tree. + + :param X: sample data with row-wise samples (sint/sfix matrix) + :param y: binary labels (sint list/array) + + """ + self.tree = TreeTrainer( + X.transpose(), y, self.max_depth, + attr_lengths=self.get_attr_lengths(attr_types)).train() + + def fit_with_testing(self, X_train, y_train, X_test, y_test, + attr_types=None, output_trees=False, debug=False): + """ Train tree with accuracy output after every level. + + :param X_train: training data with row-wise samples (sint/sfix matrix) + :param y_train: training binary labels (sint list/array) + :param X_test: testing data with row-wise samples (sint/sfix matrix) + :param y_test: testing binary labels (sint list/array) + :param attr_types: attributes types (list of 'b'/'c' for + binary/continuous; default is all continuous) + :param output_trees: output tree after every level + :param debug: output debugging information + + """ + trainer = TreeTrainer(X_train.transpose(), y_train, self.max_depth, + attr_lengths=self.get_attr_lengths(attr_types)) + trainer.debug = debug + trainer.debug_gini = debug + trainer.debug_threading = debug > 1 + self.tree = trainer.train_with_testing(y_test, X_test.transpose(), + output=output_trees) + + def predict(self, X): + """ Use tree for prediction. + + :param X: sample data with row-wise samples (sint/sfix matrix) + :returns: sint array + + """ + res = sint.Array(len(X)) + @for_range(len(X)) + def _(i): + res[i] = run_decision_tree(self.tree, X[i]) + return res + + def output(self): + """ Output decision tree. """ + output_decision_tree(self.tree) + +def preprocess_pandas(data): + """ Preprocess pandas data frame to suit + :py:class:`TreeClassifier` by expanding non-continuous attributes + to several binary attributes as a unary encoding. + + :returns: a tuple of the processed data and a type list for the + :py:obj:`attr_types` argument. 
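
    A minimal end-to-end sketch combining this helper with
    :py:class:`TreeClassifier` (hypothetical names; assumes a pandas
    data frame :py:obj:`df` holding the attributes and a NumPy vector
    :py:obj:`labels` holding binary labels, both known to player 0,
    input via :py:func:`input_tensor_via` as in the example programs
    added by this patch)::

        data, attr_types = preprocess_pandas(df)
        X = sfix.input_tensor_via(0, data)    # secret-share samples
        y = sint.input_tensor_via(0, labels)  # secret-share labels
        tree = TreeClassifier(max_depth=5)
        tree.fit(X, y, attr_types=attr_types)
        tree.output()
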
+ + """ + import pandas + import numpy + res = [] + types = [] + for i, t in enumerate(data.dtypes): + if pandas.api.types.is_int64_dtype(t): + res.append(data.iloc[:,i].to_numpy()) + types.append('c') + elif pandas.api.types.is_object_dtype(t): + values = data.iloc[:,i].unique() + print('converting the following to unary:', values) + if len(values) == 2: + res.append(data.iloc[:,i].to_numpy() == values[1]) + types.append('b') + else: + for value in values: + res.append(data.iloc[:,i].to_numpy() == value) + types.append('b') + else: + raise CompilerError('unknown pandas type: ' + t) + res = numpy.array(res) + res = numpy.swapaxes(res, 0, 1) + return res, types diff --git a/Compiler/floatingpoint.py b/Compiler/floatingpoint.py index 7786f73c8..f44d95cbe 100644 --- a/Compiler/floatingpoint.py +++ b/Compiler/floatingpoint.py @@ -319,7 +319,7 @@ def Pow2(a, l, kappa): def Pow2_from_bits(bits): m = len(bits) t = list(bits) - pow2k = [types.cint() for i in range(m)] + pow2k = [None for i in range(m)] for i in range(m): pow2k[i] = two_power(2**i) t[i] = t[i]*pow2k[i] + 1 - t[i] @@ -641,7 +641,7 @@ def BitDecFull(a, n_bits=None, maybe_mixed=False): n_bits = n_bits or bit_length assert n_bits <= bit_length logp = int(round(math.log(p, 2))) - if abs(p - 2 ** logp) / p < 2 ** -get_program().security: + if get_program().rabbit_gap(): # inspired by Rabbit (https://eprint.iacr.org/2021/119) # no need for exact randomness generation # if modulo a power of two is close enough diff --git a/Compiler/instructions.py b/Compiler/instructions.py index c51318322..e50bc9fd8 100644 --- a/Compiler/instructions.py +++ b/Compiler/instructions.py @@ -295,6 +295,7 @@ class movint(base.Instruction): @base.vectorize class pushint(base.StackInstruction): """ Pushes clear integer register to the thread-local stack. + Considered obsolete. :param: source (regint) """ @@ -304,6 +305,7 @@ class pushint(base.StackInstruction): @base.vectorize class popint(base.StackInstruction): """ Pops from the thread-local stack to clear integer register. + Considered obsolete. :param: destination (regint) """ @@ -385,7 +387,7 @@ class use(base.Instruction): :param: number (int, -1 for unknown) """ code = base.opcodes['USE'] - arg_format = ['int','int','int'] + arg_format = ['int','int','long'] @classmethod def get_usage(cls, args): @@ -404,7 +406,7 @@ class use_inp(base.Instruction): :param: number (int, -1 for unknown) """ code = base.opcodes['USE_INP'] - arg_format = ['int','int','int'] + arg_format = ['int','int','long'] @classmethod def get_usage(cls, args): @@ -423,7 +425,7 @@ class use_edabit(base.Instruction): :param: number (int, -1 for unknown) """ code = base.opcodes['USE_EDABIT'] - arg_format = ['int','int','int'] + arg_format = ['int','int','long'] @classmethod def get_usage(cls, args): @@ -439,7 +441,7 @@ class use_matmul(base.Instruction): :param: number (int, -1 for unknown) """ code = base.opcodes['USE_MATMUL'] - arg_format = ['int','int','int','int'] + arg_format = ['int','int','int','long'] @classmethod def get_usage(cls, args): @@ -488,7 +490,7 @@ class use_prep(base.Instruction): :param: number of items to use (int, -1 for unknown) """ code = base.opcodes['USE_PREP'] - arg_format = ['str','int'] + arg_format = ['str','long'] @classmethod def get_usage(cls, args): @@ -1873,6 +1875,20 @@ class floatoutput(base.PublicFileIOInstruction): code = base.opcodes['FLOATOUTPUT'] arg_format = ['p','c','c','c','c'] +@base.vectorize +class fixinput(base.PublicFileIOInstruction): + """ Binary fixed-point input. 
+ + :param: player (int) + :param: destination (cint) + :param: exponent (int) + :param: input type (0: 64-bit integer, 1: float, 2: double) + + """ + __slots__ = [] + code = base.opcodes['FIXINPUT'] + arg_format = ['p','cw','int','int'] + @base.vectorize class rand(base.Instruction): """ Store insecure random value of specified length in clear integer diff --git a/Compiler/instructions_base.py b/Compiler/instructions_base.py index f811e47c8..b72079c76 100644 --- a/Compiler/instructions_base.py +++ b/Compiler/instructions_base.py @@ -209,6 +209,7 @@ CONDPRINTPLAIN = 0xE1, INTOUTPUT = 0xE6, FLOATOUTPUT = 0xE7, + FIXINPUT = 0xE8, GBITDEC = 0x18A, GBITCOM = 0x18B, # Secure socket @@ -226,8 +227,13 @@ def int_to_bytes(x): global_vector_size_stack = [] global_instruction_type_stack = ['modp'] +def check_vector_size(size): + if isinstance(size, program.curr_tape.Register): + raise CompilerError('vector size must be known at compile time') + def set_global_vector_size(size): stack = global_vector_size_stack + check_vector_size(size) if size == 1 and not stack: return stack.append(size) @@ -420,6 +426,7 @@ class MergeCISC(Mergeable): def __init__(self, *args, **kwargs): self.args = args self.kwargs = kwargs + self.security = program.security self.calls = [(args, kwargs)] self.params = [] self.used = [] @@ -443,7 +450,7 @@ def is_vec(self): def merge_id(self): return self.function, tuple(self.params), \ - tuple(sorted(self.kwargs.items())) + tuple(sorted(self.kwargs.items())), self.security def merge(self, other): self.calls += other.calls @@ -468,7 +475,10 @@ def new_instructions(self, size, regs): except: args.append(arg) program.options.cisc = False + old_security = program.security + program.security = self.security self.function(*args, **self.kwargs) + program.security = old_security program.options.cisc = True reset_global_vector_size() program.curr_tape = old_tape @@ -579,7 +589,7 @@ def wrapper(*args, **kwargs): same_sizes &= arg.size == args[0].size except: pass - if program.options.cisc and same_sizes: + if program.use_cisc() and same_sizes: return MergeCISC(*args, **kwargs) else: return function(*args, **kwargs) @@ -592,9 +602,9 @@ def instruction(res, *args, **kwargs): instruction = cisc(instruction) def wrapper(*args, **kwargs): - if not program.options.cisc: - return function(*args, **kwargs) from Compiler import types + if not (program.options.cisc and isinstance(args[0], types._register)): + return function(*args, **kwargs) if isinstance(args[0], types._clear): res_type = type(args[1]) else: @@ -671,7 +681,8 @@ def check(cls, arg): raise ArgumentError(arg, 'Invalid register argument') if arg.program != program.curr_tape: raise ArgumentError(arg, 'Register from other tape, trace: %s' % \ - util.format_trace(arg.caller)) + util.format_trace(arg.caller) + + '\nMaybe use MemValue') if arg.reg_type != cls.reg_type: raise ArgumentError(arg, "Wrong register type '%s', expected '%s'" % \ (arg.reg_type, cls.reg_type)) @@ -729,10 +740,10 @@ class LongArgFormat(IntArgFormat): @classmethod def encode(cls, arg): - return list(struct.pack('>Q', arg)) + return list(struct.pack('>q', arg)) def __init__(self, f): - self.i = struct.unpack('>Q', f.read(8))[0] + self.i = struct.unpack('>q', f.read(8))[0] class ImmediateModpAF(IntArgFormat): @classmethod @@ -1075,21 +1086,27 @@ class ClearImmediate(ImmediateBase): ### Memory access instructions ### -class DirectMemoryInstruction(Instruction): +class MemoryInstruction(Instruction): + __slots__ = ['_protect'] + def __init__(self, *args, **kwargs): + 
super(MemoryInstruction, self).__init__(*args, **kwargs) + self._protect = program._protect_memory + +class DirectMemoryInstruction(MemoryInstruction): __slots__ = [] def __init__(self, *args, **kwargs): super(DirectMemoryInstruction, self).__init__(*args, **kwargs) -class IndirectMemoryInstruction(Instruction): +class IndirectMemoryInstruction(MemoryInstruction): __slots__ = [] def get_direct(self, address): return self.direct(self.args[0], address, add_to_prog=False) -class ReadMemoryInstruction(Instruction): +class ReadMemoryInstruction(MemoryInstruction): __slots__ = [] -class WriteMemoryInstruction(Instruction): +class WriteMemoryInstruction(MemoryInstruction): __slots__ = [] class DirectMemoryWriteInstruction(DirectMemoryInstruction, \ diff --git a/Compiler/library.py b/Compiler/library.py index 80c43ca8f..7c0ac10c0 100644 --- a/Compiler/library.py +++ b/Compiler/library.py @@ -117,7 +117,12 @@ def print_ln(s='', *args): print_ln('a is %s.', a.reveal()) """ - print_str(s + '\n', *args) + print_str(str(s) + '\n', *args) + +def print_both(s, end='\n'): + """ Print line during compilation and execution. """ + print(s, end=end) + print_str(s + end) def print_ln_if(cond, ss, *args): """ Print line if :py:obj:`cond` is true. The further arguments @@ -486,6 +491,8 @@ def cond_swap(x,y): return b.cond_swap(y, x) def sort(a): + print("WARNING: you're using bubble sort") + res = a for i in range(len(a)): @@ -524,272 +531,23 @@ def odd_even_merge_sort(a): raise CompilerError('Length of list must be power of two') def chunky_odd_even_merge_sort(a): - tmp = a[0].Array(len(a)) - for i,j in enumerate(a): - tmp[i] = j - l = 1 - while l < len(a): - l *= 2 - k = 1 - while k < l: - k *= 2 - def round(): - for i in range(len(a)): - a[i] = tmp[i] - for i in range(len(a) // l): - for j in range(l // k): - base = i * l + j - step = l // k - if k == 2: - a[base], a[base+step] = cond_swap(a[base], a[base+step]) - else: - b = a[base:base+k*step:step] - for m in range(base + step, base + (k - 1) * step, 2 * step): - a[m], a[m+step] = cond_swap(a[m], a[m+step]) - for i in range(len(a)): - tmp[i] = a[i] - chunk = MPCThread(round, 'sort-%d-%d' % (l,k), single_thread=True) - chunk.start() - chunk.join() - #round() - for i in range(len(a)): - a[i] = tmp[i] + raise CompilerError( + 'This function has been removed, use loopy_odd_even_merge_sort instead') def chunkier_odd_even_merge_sort(a, n=None, max_chunk_size=512, n_threads=7, use_chunk_wraps=False): - if n is None: - n = len(a) - a_base = instructions.program.malloc(n, 's') - for i,j in enumerate(a): - store_in_mem(j, a_base + i) - else: - a_base = a - tmp_base = instructions.program.malloc(n, 's') - chunks = {} - threads = [] - - def run_threads(): - for thread in threads: - thread.start() - for thread in threads: - thread.join() - del threads[:] - - def run_chunk(size, base): - if size not in chunks: - def swap_list(list_base): - for i in range(size // 2): - base = list_base + 2 * i - x, y = cond_swap(sint.load_mem(base), - sint.load_mem(base + 1)) - store_in_mem(x, base) - store_in_mem(y, base + 1) - chunks[size] = FunctionTape(swap_list, 'sort-%d' % size) - return chunks[size](base) - - def run_round(size): - # minimize number of chunk sizes - n_chunks = int(math.ceil(1.0 * size / max_chunk_size)) - lower_size = size // n_chunks // 2 * 2 - n_lower_size = n_chunks - (size - n_chunks * lower_size) // 2 - # print len(to_swap) == lower_size * n_lower_size + \ - # (lower_size + 2) * (n_chunks - n_lower_size), \ - # len(to_swap), n_chunks, lower_size, n_lower_size 
- base = 0 - round_threads = [] - for i in range(n_lower_size): - round_threads.append(run_chunk(lower_size, tmp_base + base)) - base += lower_size - for i in range(n_chunks - n_lower_size): - round_threads.append(run_chunk(lower_size + 2, tmp_base + base)) - base += lower_size + 2 - run_threads_in_rounds(round_threads) - - postproc_chunks = [] - wrap_chunks = {} - post_threads = [] - pre_threads = [] - - def load_and_store(x, y, to_right): - if to_right: - store_in_mem(sint.load_mem(x), y) - else: - store_in_mem(sint.load_mem(y), x) - - def run_setup(k, a_addr, step, tmp_addr): - if k == 2: - def mem_op(preproc, a_addr, step, tmp_addr): - load_and_store(a_addr, tmp_addr, preproc) - load_and_store(a_addr + step, tmp_addr + 1, preproc) - res = 2 - else: - def mem_op(preproc, a_addr, step, tmp_addr): - instructions.program.curr_tape.merge_opens = False -# for i,m in enumerate(range(a_addr + step, a_addr + (k - 1) * step, step)): - for i in range(k - 2): - m = a_addr + step + i * step - load_and_store(m, tmp_addr + i, preproc) - res = k - 2 - if not use_chunk_wraps or k <= 4: - mem_op(True, a_addr, step, tmp_addr) - postproc_chunks.append((mem_op, (a_addr, step, tmp_addr))) - else: - if k not in wrap_chunks: - pre_chunk = FunctionTape(mem_op, 'pre-%d' % k, - compile_args=[True]) - post_chunk = FunctionTape(mem_op, 'post-%d' % k, - compile_args=[False]) - wrap_chunks[k] = (pre_chunk, post_chunk) - pre_chunk, post_chunk = wrap_chunks[k] - pre_threads.append(pre_chunk(a_addr, step, tmp_addr)) - post_threads.append(post_chunk(a_addr, step, tmp_addr)) - return res - - def run_threads_in_rounds(all_threads): - for thread in all_threads: - if len(threads) == n_threads: - run_threads() - threads.append(thread) - run_threads() - del all_threads[:] - - def run_postproc(): - run_threads_in_rounds(post_threads) - for chunk,args in postproc_chunks: - chunk(False, *args) - postproc_chunks[:] = [] - - l = 1 - while l < n: - l *= 2 - k = 1 - while k < l: - k *= 2 - size = 0 - instructions.program.curr_tape.merge_opens = False - for i in range(n // l): - for j in range(l // k): - base = i * l + j - step = l // k - size += run_setup(k, a_base + base, step, tmp_base + size) - run_threads_in_rounds(pre_threads) - run_round(size) - run_postproc() - - if isinstance(a, list): - for i in range(n): - a[i] = sint.load_mem(a_base + i) - instructions.program.free(a_base, 's') - instructions.program.free(tmp_base, 's') + raise CompilerError( + 'This function has been removed, use loopy_odd_even_merge_sort instead') def loopy_chunkier_odd_even_merge_sort(a, n=None, max_chunk_size=512, n_threads=7): - if n is None: - n = len(a) - a_base = instructions.program.malloc(n, 's') - for i,j in enumerate(a): - store_in_mem(j, a_base + i) - else: - a_base = a - tmp_base = instructions.program.malloc(n, 's') - tmp_i = instructions.program.malloc(1, 'ci') - chunks = {} - threads = [] - - def run_threads(): - for thread in threads: - thread.start() - for thread in threads: - thread.join() - del threads[:] - - def run_threads_in_rounds(all_threads): - for thread in all_threads: - if len(threads) == n_threads: - run_threads() - threads.append(thread) - run_threads() - del all_threads[:] - - def run_chunk(size, base): - if size not in chunks: - def swap_list(list_base): - for i in range(size // 2): - base = list_base + 2 * i - x, y = cond_swap(sint.load_mem(base), - sint.load_mem(base + 1)) - store_in_mem(x, base) - store_in_mem(y, base + 1) - chunks[size] = FunctionTape(swap_list, 'sort-%d' % size) - return chunks[size](base) - - def 
run_round(size): - # minimize number of chunk sizes - n_chunks = int(math.ceil(1.0 * size / max_chunk_size)) - lower_size = size // n_chunks // 2 * 2 - n_lower_size = n_chunks - (size - n_chunks * lower_size) // 2 - # print len(to_swap) == lower_size * n_lower_size + \ - # (lower_size + 2) * (n_chunks - n_lower_size), \ - # len(to_swap), n_chunks, lower_size, n_lower_size - base = 0 - round_threads = [] - for i in range(n_lower_size): - round_threads.append(run_chunk(lower_size, tmp_base + base)) - base += lower_size - for i in range(n_chunks - n_lower_size): - round_threads.append(run_chunk(lower_size + 2, tmp_base + base)) - base += lower_size + 2 - run_threads_in_rounds(round_threads) - - l = 1 - while l < n: - l *= 2 - k = 1 - while k < l: - k *= 2 - def load_and_store(x, y): - if to_tmp: - store_in_mem(sint.load_mem(x), y) - else: - store_in_mem(sint.load_mem(y), x) - def outer(i): - def inner(j): - base = j + a_base + i * l - step = l // k - if k == 2: - tmp_addr = regint.load_mem(tmp_i) - load_and_store(base, tmp_addr) - load_and_store(base + step, tmp_addr + 1) - store_in_mem(tmp_addr + 2, tmp_i) - else: - def inner2(m): - m += base - tmp_addr = regint.load_mem(tmp_i) - load_and_store(m, tmp_addr) - store_in_mem(tmp_addr + 1, tmp_i) - range_loop(inner2, step, (k - 1) * step, step) - range_loop(inner, l // k) - instructions.program.curr_tape.merge_opens = False - to_tmp = True - store_in_mem(tmp_base, tmp_i) - range_loop(outer, n // l) - if k == 2: - run_round(n) - else: - run_round(n // k * (k - 2)) - instructions.program.curr_tape.merge_opens = False - to_tmp = False - store_in_mem(tmp_base, tmp_i) - range_loop(outer, n // l) - - if isinstance(a, list): - for i in range(n): - a[i] = sint.load_mem(a_base + i) - instructions.program.free(a_base, 's') - instructions.program.free(tmp_base, 's') - instructions.program.free(tmp_i, 'ci') + raise CompilerError( + 'This function has been removed, use loopy_odd_even_merge_sort instead') def loopy_odd_even_merge_sort(a, sorted_length=1, n_parallel=32, n_threads=None): + a_in = a + if isinstance(a_in, list): + a = Array.create_from(a) steps = {} l = sorted_length while l < len(a): @@ -833,8 +591,14 @@ def f(i): swap(m2, step) steps[key] = step steps[key](l) + if isinstance(a_in, list): + a_in[:] = list(a) def mergesort(A): + if not get_program().options.insecure: + raise CompilerError('mergesort reveals the order of elements, ' + 'use --insecure to activate it') + B = Array(len(A), sint) def merge(i_left, i_right, i_end): @@ -901,16 +665,16 @@ def for_range(start, stop=None, step=None): :param start/stop/step: regint/cint/int - Example: - - .. code:: + The following should output 10:: + n = 10 a = sint.Array(n) x = sint(0) @for_range(n) def _(i): a[i] = i x.update(x + 1) + print_ln('%s', x.reveal()) Note that you cannot overwrite data structures such as :py:class:`~Compiler.types.Array` in a loop. Use @@ -924,11 +688,13 @@ def decorator(loop_body): def for_range_parallel(n_parallel, n_loops): """ Decorator to execute a loop :py:obj:`n_loops` up to - :py:obj:`n_parallel` loop bodies in parallel. + :py:obj:`n_parallel` loop bodies with optimized communication in a + single thread. + In most cases, it is easier to use :py:func:`for_range_opt`. Using any other control flow instruction inside the loop breaks the optimization. 
- :param n_parallel: compile-time (int) + :param n_parallel: optimization parameter (int) :param n_loops: regint/cint/int or list of int Example: @@ -1084,7 +850,7 @@ def exit_elimination(block): del blocks[-n_to_merge + 1:] del get_tape().req_node.children[-1] merged.children = [] - RegintOptimizer().run(merged.instructions) + RegintOptimizer().run(merged.instructions, get_program()) get_tape().active_basicblock = merged else: req_node = get_tape().req_node.children[-1].nodes[0] @@ -1151,6 +917,15 @@ def _(i): @for_range_opt_multithread(2, [5, 3]) def f(i, j): ... + + Note that you cannot use registers across threads. Use + :py:class:`MemValue` instead:: + + a = MemValue(sint(0)) + @for_range_opt_multithread(8, 80) + def _(i): + b = a + 1 + """ return for_range_multithread(n_threads, None, n_loops) @@ -1179,6 +954,7 @@ def f(base, size): return map_reduce(n_threads, None, n_items, initializer=lambda: [], reducer=None, looping=False) else: + max_size = max(1, max_size) def wrapper(function): @multithread(n_threads, n_items) def new_function(base, size): @@ -1419,57 +1195,50 @@ def f(i): return f return decorator -def while_loop(loop_body, condition, arg, g=None): +def while_loop(loop_body, condition, arg=None, g=None): if not callable(condition): raise CompilerError('Condition must be callable') - # store arg in stack - pre_condition = condition(arg) - if not isinstance(pre_condition, (bool,int)) or pre_condition: + if arg is None: + pre_condition = condition() + else: + pre_condition = condition(arg) arg = regint(arg) - def loop_fn(): - result = loop_body(arg) + cond = condition + condition = lambda: cond(arg) + tmp = loop_body + def loop_body(): + result = tmp(arg) if isinstance(result, MemValue): result = result.read() result.link(arg) - cont = condition(result) - return cont + if not isinstance(pre_condition, (bool,int)) or pre_condition: + def loop_fn(): + loop_body() + return condition() if_statement(pre_condition, lambda: do_while(loop_fn, g=g)) def while_do(condition, *args): - """ While-do loop. The decorator requires an initialization, and - the loop body function must return a suitable input for - :py:obj:`condition`. + """ While-do loop. :param condition: function returning public integer (regint/cint/int) - :param args: arguments given to :py:obj:`condition` and loop body The following executes an ten-fold loop: .. code:: - @while_do(lambda x: x < 10, regint(0)) - def f(i): + i = regint(0) + @while_do(lambda: i < 10) + def f(): ... - return i + 1 + i.update(i + 1) + ... 
+ """ def decorator(loop_body): while_loop(loop_body, condition, *args) return loop_body return decorator -def do_loop(condition, loop_fn): - # store initial condition to stack - pushint(condition if isinstance(condition,regint) else regint(condition)) - def wrapped_loop(): - # save condition to stack - new_cond = regint.pop() - # run the loop - condition = loop_fn(new_cond) - pushint(condition) - return condition - do_while(wrapped_loop) - regint.pop() - def _run_and_link(function, g=None): if g is None: g = function.__globals__ diff --git a/Compiler/ml.py b/Compiler/ml.py index c667e1d64..f5c9a9eba 100644 --- a/Compiler/ml.py +++ b/Compiler/ml.py @@ -216,9 +216,13 @@ def __getitem__(self, *args): self.alloc() return super(Tensor, self).__getitem__(*args) - def assign_vector(self, *args): + def assign_all(self, *args): self.alloc() - return super(Tensor, self).assign_vector(*args) + return super(Tensor, self).assign_all(*args) + + def assign_vector(self, *args, **kwargs): + self.alloc() + return super(Tensor, self).assign_vector(*args, **kwargs) def assign_vector_by_indices(self, *args): self.alloc() @@ -261,14 +265,15 @@ def forward(self, batch=None, training=None): self._forward(batch) def __str__(self): - return type(self).__name__ + str(self._Y.sizes) + return type(self).__name__ + str(self._Y.shape) def __repr__(self): - return '%s(%s)' % (type(self).__name__, self.Y.sizes) + return '%s(%s)' % (type(self).__name__, self.Y.shape) class NoVariableLayer(Layer): input_from = lambda *args, **kwargs: None output_weights = lambda *args: None + reveal_parameters_to_binary = lambda *args, **kwargs: None nablas = lambda self: () reset = lambda self: None @@ -300,7 +305,8 @@ def __init__(self, N, debug=False, approx=False): self.compute_loss = True self.d_out = 1 - def divisor(self, divisor, size): + @staticmethod + def divisor(divisor, size=1): return cfix(1.0 / divisor, size=size) def _forward(self, batch): @@ -325,7 +331,8 @@ def _(base, size): self.divisor(N, 1)) def eval(self, size, base=0, top=False): - assert not top + if top: + return self.X.get_vector(base, size) > 0 if self.approx: return approx_sigmoid(self.X.get_vector(base, size), self.approx) else: @@ -383,6 +390,36 @@ def _(i): i, truth, guess, b, nabla) return n_correct +class LinearOutput(NoVariableLayer): + n_outputs = -1 + + def __init__(self, N): + self.X = sfix.Array(N) + self.Y = sfix.Array(N) + self.nabla_X = sfix.Array(N) + self.l = MemValue(sfix(0)) + + def _forward(self, batch): + N = len(batch) + guess = self.X.get_vector(0, N) + truth = self.Y.get(batch.get_vector(0, N)) + diff = guess - truth + self.nabla_X.assign_vector(diff) + #print_ln('%s %s %s', diff.reveal(), truth.reveal(), guess.reveal()) + self.l.write(sum((diff) ** 2) * Output.divisor(N)) + + def backward(self, batch): + pass + + def reveal_correctness(*args): + return 0 + + def average_loss(self, N): + return self.l.reveal() + + def eval(self, size, base=0, top=False): + return self.X.get_vector(base, size) + class MultiOutputBase(NoVariableLayer): def __init__(self, N, d_out, approx=False, debug=False): self.X = sfix.Matrix(N, d_out) @@ -621,6 +658,25 @@ def output_weights(self): self.W.print_reveal_nested() print_ln('%s', self.b.reveal_nested()) + def reveal_parameters_to_binary(self, reshape=None): + if reshape: + trans = self.W.transpose() + O = trans.sizes[0] + tmp = MultiArray([O] + reshape, + value_type=self.W.value_type, + address=trans.address) + X, Y, C = reshape + @for_range(O) + def _(i): + @for_range(C) + def _(j): + part = 
tmp.get_vector_by_indices(i, None, None, j) + part.reveal().binary_output() + else: + self.W.transpose().reveal_to_binary_output() + if self.input_bias: + self.b.reveal_to_binary_output() + def backward_params(self, f_schur_Y, batch): N = len(batch) tmp = Matrix(self.d_in, self.d_out, unreduced_sfix) @@ -726,14 +782,14 @@ def __init__(self, N, d_in, d_out, d=1, activation='id', debug=False): self.d = d self.activation = activation - self.X = MultiArray([N, d, d_in], sfix) - self.Y = MultiArray([N, d, d_out], sfix) + self.X = Tensor([N, d, d_in], sfix) + self.Y = Tensor([N, d, d_out], sfix) self.W = Tensor([d_in, d_out], sfix) self.b = sfix.Array(d_out) back_N = min(N, self.back_batch_size) - self.nabla_Y = MultiArray([back_N, d, d_out], sfix) - self.nabla_X = MultiArray([back_N, d, d_in], sfix) + self.nabla_Y = Tensor([back_N, d, d_out], sfix) + self.nabla_X = Tensor([back_N, d, d_in], sfix) self.nabla_W = sfix.Matrix(d_in, d_out) self.nabla_b = sfix.Array(d_out) @@ -757,7 +813,7 @@ def reset(self): d_out = self.d_out r = math.sqrt(6.0 / (d_in + d_out)) print('Initializing dense weights in [%f,%f]' % (-r, r)) - self.W.randomize(-r, r) + self.W.randomize(-r, r, n_threads=self.n_threads) self.b.assign_all(0) def input_from(self, player, raw=False): @@ -841,6 +897,7 @@ def backward(self, compute_nabla_X=True, batch=None): f_schur_Y = nabla_Y if compute_nabla_X: + nabla_X.alloc() @multithread(self.n_threads, N) def _(base, size): B = sfix.Matrix(N, d_out, address=f_schur_Y.address) @@ -875,8 +932,8 @@ def __init__(self, N, d_in, d_out): self.b = sfix.Array(d_out) self.nabla_b = self.b.same_shape() - self.X = MultiArray([N, 1, d_in], sfix) - self.Y = MultiArray([N, 1, d_out], sfix) + self.X = Tensor([N, 1, d_in], sfix) + self.Y = Tensor([N, 1, d_out], sfix) self.nabla_Y = self.Y.same_shape() def reset(self): @@ -920,10 +977,10 @@ def __init__(self, N, d1, d2=1, alpha=0.5): self.N = N self.d1 = d1 self.d2 = d2 - self.X = MultiArray([N, d1, d2], sfix) - self.Y = MultiArray([N, d1, d2], sfix) - self.nabla_Y = MultiArray([N, d1, d2], sfix) - self.nabla_X = MultiArray([N, d1, d2], sfix) + self.X = Tensor([N, d1, d2], sfix) + self.Y = Tensor([N, d1, d2], sfix) + self.nabla_Y = Tensor([N, d1, d2], sfix) + self.nabla_X = Tensor([N, d1, d2], sfix) self.alpha = alpha self.B = MultiArray([N, d1, d2], sint) @@ -1070,8 +1127,15 @@ def __init__(self, shape, strides=(1, 2, 2, 1), ksize=(1, 2, 2, 1), self.X = Tensor(shape, sfix) if padding == 'SAME': output_shape = [int(math.ceil(shape[i] / strides[i])) for i in range(4)] + padding = [0, 0] else: - output_shape = [(shape[i] - ksize[i]) // strides[i] + 1 for i in range(4)] + if padding == 'VALID': + padding = 0 + if isinstance(padding, int): + padding = [padding, padding] + output_shape = [shape[0]] + [ + (shape[i + 1] + 2 * padding[i] - ksize[i + 1]) // \ + strides [i + 1] + 1 for i in range(2)] + [shape[3]] self.Y = Tensor(output_shape, sfix) self.strides = strides self.ksize = ksize @@ -1108,48 +1172,53 @@ def backward(self, compute_nabla_X=True, batch=None): if compute_nabla_X: self.nabla_X.alloc() self.nabla_X.assign_all(0) + break_point() def process(pool, bi, k, i, j): for (x, h_in, w_in, h, w), c \ in zip(pool, self.comparisons[bi][k][i][j]): hh = h * h_in ww = w * w_in res = h_in * w_in * c * self.nabla_Y[bi][i][j][k] + get_program().protect_memory(True) self.nabla_X[bi][hh][ww][k] += res + get_program().protect_memory(False) self.traverse(batch, process) def traverse(self, batch, process): need_padding = [self.strides[i] * (self.Y.sizes[i] - 1) + 
self.ksize[i] > self.X.sizes[i] for i in range(4)] - overlap = reduce(operator.or_, - (x < y for x, y in zip(self.strides, self.ksize))) @for_range_opt_multithread(self.n_threads, [len(batch), self.X.sizes[3]]) def _(l, k): bi = batch[l] + XX = self.X[bi] @for_range_opt(self.Y.sizes[1]) def _(i): - h_base = self.strides[1] * i + h_base = self.strides[1] * i - self.padding[1] + hs = [h_base + jj for jj in range(self.ksize[1])] + if need_padding[1]: + h_ins = [(h < self.X.sizes[1]) * (h >= 0) for h in hs] + else: + h_ins = [True] * self.ksize[1] @for_range_opt(self.Y.sizes[2]) def _(j): - if overlap: - break_point() - w_base = self.strides[2] * j + w_base = self.strides[2] * j - self.padding[1] pool = [] + ws = [w_base + jj for jj in range(self.ksize[2])] + if need_padding[2]: + w_ins = [(w < self.X.sizes[2]) * (w >= 0) for w in ws] + else: + w_ins = [True] * self.ksize[2] for ii in range(self.ksize[1]): - h = h_base + ii - if need_padding[1]: - h_in = h < self.X.sizes[1] - else: - h_in = True + h = hs[ii] + h_in = h_ins[ii] + XXX = XX[h_in * h] for jj in range(self.ksize[2]): - w = w_base + jj - if need_padding[2]: - w_in = w < self.X.sizes[2] - else: - w_in = True + w = ws[jj] + w_in = w_ins[jj] if not is_zero(h_in * w_in): - pool.append([h_in * w_in * self.X[bi][h_in * h] - [w_in * w][k], h_in, w_in, h, w]) + pool.append([h_in * w_in * XXX[w_in * w][k], + h_in, w_in, h, w]) process(pool, bi, k, i, j) @@ -1160,7 +1229,7 @@ class Argmax(NoVariableLayer): """ def __init__(self, shape): assert len(shape) == 2 - self.X = MultiArray(shape, sfix) + self.X = Tensor(shape, sfix) self.Y = Array(shape[0], sint) def _forward(self, batch=[0]): @@ -1270,7 +1339,7 @@ def __init__(self, shape, approx=True, args=None): self.var, self.mu, self.weights, self.bias = arrays arrays = (sfix.Array(shape[2]) for i in range(4)) self.mu_hat, self.var_hat, self.nabla_weights, self.nabla_bias = arrays - self.epsilon = 2 ** (-sfix.f + 1) + self.epsilon = 2 ** (-sfix.f * 2 // 3 + 1) self.momentum = 0.1 if args != None: approx = 'precisebn' not in args @@ -1449,8 +1518,8 @@ def __init__(self, input_shape, output_shape, inputs=None): for x in back_shapes: x[0] = min(x[0], self.back_batch_size) - self.nabla_X = MultiArray(back_shapes[0], self.input_squant) - self.nabla_Y = MultiArray(back_shapes[1], self.output_squant) + self.nabla_X = Tensor(back_shapes[0], self.input_squant) + self.nabla_Y = Tensor(back_shapes[1], self.output_squant) self.inputs = inputs def temp_shape(self): @@ -1544,6 +1613,18 @@ def output_weights(self): self.weights.print_reveal_nested() print_ln('%s', self.bias.reveal_nested()) + def reveal_parameters_to_binary(self): + assert not self.tf_weight_format + n_filters = self.weights.shape[0] + n_channels = self.weights.shape[3] + @for_range(n_filters) + def _(i): + @for_range(n_channels) + def _(j): + part = self.weights.get_vector_by_indices(i, None, None, j) + part.reveal().binary_output() + self.bias.reveal_to_binary_output() + def dot_product(self, iv, wv, out_y, out_x, out_c): bias = self.bias[out_c] acc = self.output_squant.unreduced_dot_product(iv, wv) @@ -1704,11 +1785,10 @@ class FixConv2d(Conv2d, FixBase): def reset(self): assert not self.tf_weight_format - kernel_size = self.weight_shape[1] * self.weight_shape[2] - r = math.sqrt(6.0 / (kernel_size * sum(self.weight_shape[::3]))) + n_in = reduce(operator.mul, self.weight_shape[1:]) + r = math.sqrt(6.0 / (n_in + self.weight_shape[0])) print('Initializing convolution weights in [%f,%f]' % (-r, r)) - self.weights.assign_vector( - 
sfix.get_random(-r, r, size=self.weights.total_size()))
+        self.weights.randomize(-r, r, n_threads=self.n_threads)
         self.bias.assign_all(0)
 
     def backward(self, compute_nabla_X=True, batch=None):
@@ -1944,6 +2024,51 @@ def _(out_y, out_x, c):
                 acc = self.const_div(acc, n)
                 self.Y[0][out_y][out_x][c] = self.output_squant._new(acc)
 
+def easyConv2d(input_shape, batch_size, out_channels, kernel_size, stride=1,
+               padding=0):
+    """ More convenient interface to :py:class:`FixConv2d`.
+
+    :param input_shape: input shape (tuple/list of four int)
+    :param batch_size: batch size (int)
+    :param out_channels: output channels (int)
+    :param kernel_size: kernel size (int or tuple/list of two int)
+    :param stride: stride (int or tuple/list of two int)
+    :param padding: :py:obj:`'SAME'`, :py:obj:`'VALID'`, int, or tuple/list of two int
+
+    """
+    if isinstance(kernel_size, int):
+        kernel_size = (kernel_size, kernel_size)
+    if isinstance(stride, int):
+        stride = (stride, stride)
+    weight_shape = [out_channels] + list(kernel_size) + [input_shape[-1]]
+    output_shape = [batch_size] + list(
+        apply_padding(input_shape[1:3], kernel_size, stride, padding)) + \
+        [out_channels]
+    padding = padding.upper() if isinstance(padding, str) \
+        else padding
+    return FixConv2d(input_shape, weight_shape, (out_channels,), output_shape,
+                     stride, padding)
+
+def easyMaxPool(input_shape, kernel_size, stride=None, padding=0):
+    """ More convenient interface to :py:class:`MaxPool`.
+
+    :param input_shape: input shape (tuple/list of four int)
+    :param kernel_size: kernel size (int or tuple/list of two int)
+    :param stride: stride (int or tuple/list of two int)
+    :param padding: :py:obj:`'SAME'`, :py:obj:`'VALID'`, int,
+      or tuple/list of two int
+
+    """
+    if isinstance(kernel_size, int):
+        kernel_size = (kernel_size, kernel_size)
+    if isinstance(stride, int):
+        stride = (stride, stride)
+    if stride == None:
+        stride = kernel_size
+    padding = padding.upper() if isinstance(padding, str) \
+        else padding
+    return MaxPool(input_shape, [1] + list(stride) + [1],
+                   [1] + list(kernel_size) + [1], padding)
+
 class QuantAveragePool2d(QuantBase, AveragePool2d):
     def input_params_from(self, player):
         print('WARNING: assuming that input and output quantization parameters are the same')
@@ -1997,9 +2122,15 @@ class Optimizer:
     """ Base class for graphs of layers.
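+
+    A minimal sketch of direct use, assuming ``layers`` is a list of
+    layers ending in an output layer (the subclasses below provide the
+    concrete update rules)::
+
+        optimizer = SGD(layers, n_epochs=1)
+        optimizer.reset()
+        optimizer.run(batch_size=128)
+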
""" n_threads = Layer.n_threads always_shuffle = True + shuffle = True time_layers = False revealing_correctness = False early_division = False + output_diff = False + output_grad = False + output_stats = False + print_accuracy = True + time_training = True @staticmethod def from_args(program, layers): @@ -2007,14 +2138,19 @@ def from_args(program, layers): res = Adam(layers, 1, approx='adamapprox' in program.args) elif 'amsgrad' in program.args: res = Adam(layers, approx=True, amsgrad=True) + elif 'amsgradprec' in program.args: + res = Adam(layers, approx=False, amsgrad=True) elif 'quotient' in program.args: res = Adam(layers, approx=True, amsgrad=True, normalize=True) else: res = SGD(layers, 1) res.early_division = 'early_div' in program.args + res.output_diff = 'output_diff' in program.args + res.output_grad = 'output_grad' in program.args + res.output_stats = 'output_stats' in program.args return res - def __init__(self, report_loss=None): + def __init__(self, layers=[], report_loss=None): if get_program().options.binary: raise CompilerError( 'machine learning code not compatible with binary circuits') @@ -2028,6 +2164,7 @@ def __init__(self, report_loss=None): self.i_epoch = MemValue(0) self.stopped_on_loss = MemValue(0) self.stopped_on_low_loss = MemValue(0) + self.layers = layers @property def layers(self): @@ -2054,6 +2191,10 @@ def set_layers_with_inputs(self, layers): layer.last_used = list(filter(lambda x: x not in used, layer.inputs)) used.update(layer.inputs) + def set_learning_rate(self, lr): + print('Setting learning rate to', lr) + self.gamma = MemValue(cfix(lr)) + def reset(self): """ Initialize weights. """ for layer in self.layers: @@ -2151,6 +2292,7 @@ def backward(self, batch): layer.backward(compute_nabla_X=False, batch=self.batch_for(layer, batch)) else: + layer.nabla_X.alloc() layer.backward(batch=self.batch_for(layer, batch)) if len(layer.inputs) == 1: layer.inputs[0].nabla_Y.address = \ @@ -2161,6 +2303,92 @@ def backward(self, batch): if self.time_layers: stop_timer(200 + i) + @classmethod + def stat(cls, name, tensor): + zero, neg, small = (cint.Array(cls.n_threads) for i in range(3)) + s, mx, mn = (cfix.Array(cls.n_threads) for i in range(3)) + for x in zero, neg, small, s, mx, mn: + x.assign_all(0) + total = tensor.total_size() + @multithread(cls.n_threads, total) + def _(base, size): + tn = get_thread_number() - 1 + tmp = Array.create_from( + tensor.get_vector(base, size).reveal()) + @for_range_opt(size, budget=1000) + def _(i): + zero[tn] += tmp[i] == 0 + neg[tn] += tmp[i] < 0 + small[tn] += abs(tmp[i]) < 2 ** (-tmp[i].f / 2) + s[tn] += tmp[i] + mx[tn] = util.max(mx[tn], tmp[i]) + mn[tn] = util.min(mn[tn], tmp[i]) + tmp.delete() + print_str( + ' %s 0:%s/%s, <0:%s/%s, >0:%s/%s, ~0:%s/%s sum:%s max:%s min:%s ', + name, sum(zero), total, sum(neg), total, + total - sum(zero) - sum(neg), total, + sum(small) - sum(zero), total, sum(s), util.max(mx), util.min(mn)) + if len(tensor.shape) == 4: + corners = sum(([tensor[0][i][j][0] for j in (0, -1)] + for i in (0, -1)), []) + elif len(tensor.shape) == 1: + x = tensor.to_array() + corners = [x[i] for i in (0, len(x) // 2 - 1, -1)] + else: + x = tensor[0].to_array() + corners = [x[i] for i in (0, len(x) // 2 - 1, -1)] + print_ln('corners:%s shape:%s', util.reveal(corners), tensor.shape) + + def update(self, i_epoch, i_batch, batch): + if self.output_grad: + @if_(i_batch % 100 == 0) + def _(): + for layer in self.layers[:-1]: + cfix(10000).binary_output() + break_point() + 
layer.nabla_Y.get_vector(size=2000).reveal().binary_output() + break_point() + for theta, nabla in zip(layer.thetas(), layer.nablas()): + cfix(5000).binary_output() + break_point() + nabla.get_vector().reveal().binary_output() + break_point() + if self.output_stats: + old_params = [] + @if_((i_batch % self.output_stats == 0).bit_or(i_epoch == 0)) + def _(): + for i, layer in enumerate(self.layers[:-1]): + print_ln(layer) + if layer == self.layers[0]: + x = Array.create_from(layer.X.get_slice_vector(batch)) + self.stat(' 0 X', x) + else: + self.stat(' %d X' % i, layer.X) + self.stat(' %d Y' % i, layer.Y) + self.stat(' %d nabla_Y' % i, layer.nabla_Y) + for nabla in layer.nablas(): + self.stat(' %d grad' % i, nabla) + for theta in layer.thetas(): + self.stat(' %d param' % i, theta) + if theta.total_size() < 1000: + old_params.append(theta.get_vector()) + if self.time_layers: + start_timer(1000) + self._update(i_epoch, MemValue(i_batch), batch) + if self.time_layers: + stop_timer(1000) + if self.output_stats: + @if_(i_batch % self.output_stats == 0) + def _(): + for i, layer in enumerate(self.layers[:-1]): + for theta in layer.thetas(): + if theta.total_size() < 1000: + print_ln(layer) + self.stat(' %d diff' % i, Array.create_from( + theta.get_vector() - old_params[0])) + del old_params[0] + @_no_mem_warnings def run(self, batch_size=None, stop_on_loss=0): """ Run training. @@ -2197,7 +2425,7 @@ def _(_): indices.assign_vector( regint.get_random(int(math.log2(len(X))), size=missing), base=len(X)) - if self.always_shuffle or n_per_epoch > 1: + if self.shuffle and (self.always_shuffle or n_per_epoch > 1): indices.shuffle() loss_sum = MemValue(sfix(0)) self.n_correct.write(0) @@ -2212,11 +2440,7 @@ def _(j): label * n) self.forward(batch=batch, training=True) self.backward(batch=batch) - if self.time_layers: - start_timer(1000) - self.update(i, batch=batch) - if self.time_layers: - stop_timer(1000) + self.update(i, j, batch=batch) loss_sum.iadd(self.layers[-1].l) if self.print_loss_reduction: before = self.layers[-1].average_loss(N) @@ -2241,12 +2465,19 @@ def _(j): return res if self.print_losses: print_ln() + self.missing_newline = False if self.report_loss and self.layers[-1].compute_loss and self.layers[-1].approx != 5: print_ln('loss in epoch %s: %s', i, (loss_sum.reveal() * cfix(1 / n_per_epoch))) else: - print_ln('done with epoch %s', i) - time() + print_str('done with epoch %s', i) + if self.time_training or self.print_losses: + print_ln() + else: + print_str('\r') + self.missing_newline = True + if self.time_training: + time() i.iadd(1) res = True if self.tol > 0: @@ -2255,7 +2486,15 @@ def _(j): self.stopped_on_low_loss.write(1 - res) return res - def reveal_correctness(self, data, truth, batch_size): + def reveal_correctness(self, data, truth, batch_size=128, running=False): + """ Test correctness by revealing results. 
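+
+        For example, the following returns the number of correctly
+        classified samples and the average loss, assuming a trained
+        instance ``optimizer`` and matching test tensors::
+
+            n_correct, avg_loss = optimizer.reveal_correctness(test_X, test_Y)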
+ + :param data: test sample data + :param truth: test labels + :param batch_size: batch size + :param running: output after every batch + + """ N = data.sizes[0] n_correct = MemValue(0) loss = MemValue(sfix(0)) @@ -2266,13 +2505,20 @@ def f(start, batch_size, batch): n_correct.iadd( self.layers[-1].reveal_correctness(batch_size, part_truth)) loss.iadd(self.layers[-1].l * batch_size) - self.run_in_batches(f, data, batch_size) + if running: + total = start + batch_size + print_str('\rpart acc: %s (%s/%s) ', + cfix(n_correct, k=63, f=31) / total, n_correct, total) + self.run_in_batches(f, data, batch_size, truth) + if running: + print_ln() loss = loss.reveal() if cfix.f < 31: loss = cfix._new(loss.v << (31 - cfix.f), k=63, f=31) return n_correct, loss / N def run_in_batches(self, f, data, batch_size, truth=None): + batch_size = min(batch_size, data.sizes[0]) training_data = self.layers[0].X.address training_truth = self.layers[-1].Y.address self.layers[0].X.address = data.address @@ -2287,30 +2533,35 @@ def _(i): batch_size = N % batch_size if batch_size: start = N - batch_size - f(start, batch_size, batch) + f(start, batch_size, regint.Array(batch_size)) self.layers[0].X.address = training_data self.layers[-1].Y.address = training_truth @_no_mem_warnings def run_by_args(self, program, n_runs, batch_size, test_X, test_Y, - acc_batch_size=None): + acc_batch_size=None, reset=True): if acc_batch_size is None: acc_batch_size = batch_size depreciation = None + if program is None: + class A: + pass + program = A() + program.args = [] for arg in program.args: m = re.match('rate(.*)', arg) if m: - self.gamma = MemValue(cfix(float(m.group(1)))) + self.set_learning_rate(float(m.group(1))) m = re.match('dep(.*)', arg) if m: depreciation = float(m.group(1)) if 'nomom' in program.args: self.momentum = 0 - self.print_losses = 'print_losses' in program.args + self.print_losses |= 'print_losses' in program.args self.print_random_update = 'print_random_update' in program.args Layer.print_random_update = self.print_random_update self.time_layers = 'time_layers' in program.args - self.revealing_correctness = not 'no_acc' in program.args + self.revealing_correctness &= not 'no_acc' in program.args self.layers[-1].compute_loss = not 'no_loss' in program.args if 'full_cisc' in program.args: program.options.keep_cisc = 'FPDiv,exp2_fx,log2_fx' @@ -2319,7 +2570,7 @@ def run_by_args(self, program, n_runs, batch_size, test_X, test_Y, if model_input: for layer in self.layers: layer.input_from(0) - else: + elif reset: self.reset() if 'one_iter' in program.args: print_float_prec(16) @@ -2351,34 +2602,42 @@ def _(i): @for_range(n_runs) def _(i): if not acc_first: - start_timer(1) + if self.time_training: + start_timer(1) self.run(batch_size, stop_on_loss=0 if 'no_loss' in program.args else 100) - stop_timer(1) + if self.time_training: + stop_timer(1) if 'no_acc' in program.args: return N = self.layers[0].X.sizes[0] n_trained = (N + batch_size - 1) // batch_size * batch_size - if not acc_first: + if not acc_first and self.print_accuracy and \ + self.revealing_correctness: print_ln('train_acc: %s (%s/%s)', cfix(self.n_correct, k=63, f=31) / n_trained, self.n_correct, n_trained) if test_X and test_Y: print('use test set') n_test = len(test_Y) - n_correct, loss = self.reveal_correctness(test_X, test_Y, - acc_batch_size) + n_correct, loss = self.reveal_correctness( + test_X, test_Y, acc_batch_size, + running='part_acc' in program.args) print_ln('test loss: %s', loss) - print_ln('acc: %s (%s/%s)', - cfix(n_correct, k=63, f=31) 
/ n_test,
-                     n_correct, n_test)
+            if self.print_accuracy:
+                print_ln('acc: %s (%s/%s)',
+                         cfix(n_correct, k=63, f=31) / n_test,
+                         n_correct, n_test)
             if acc_first:
-                start_timer(1)
+                if self.time_training:
+                    start_timer(1)
                 self.run(batch_size)
-                stop_timer(1)
+                if self.time_training:
+                    stop_timer(1)
             else:
-                @if_(util.or_op(self.stopped_on_loss, n_correct <
-                    int(n_test // self.layers[-1].n_outputs * 1.2)))
+                @if_(util.or_op(self.stopped_on_loss, (n_correct <
+                    int(n_test // self.layers[-1].n_outputs * 1.2))
+                    if test_X and test_Y else 0))
                 def _():
                     self.gamma.imul(.5)
                     if 'crash' in program.args:
@@ -2392,9 +2651,36 @@ def _():
                 self.gamma.imul(depreciation)
                 print_ln('reducing learning rate to %s', self.gamma)
             return 1 - self.stopped_on_low_loss
+        if self.missing_newline:
+            print_ln('')
         if 'model_output' in program.args:
             self.output_weights()
 
+    def fit(self, X, Y, epochs=1, batch_size=128, validation_data=(None, None),
+            program=None, reset=True, print_accuracy=False, print_loss=False):
+        """ Train model.
+
+        :param X: training sample data (sfix tensor)
+        :param Y: training labels (sint/sfix tensor)
+        :param epochs: number of epochs (int)
+        :param batch_size: batch size (int)
+        :param validation_data: tuple of test sample data and labels for
+          accuracy testing (optional; reveals labels)
+        :param program: :py:class:`~Compiler.program.Program` instance to use
+          command-line parameters (optional)
+        :param reset: whether to initialize model
+        :param print_accuracy: print accuracy on training data (reveals labels)
+        :param print_loss: reveal and print training loss after every batch
+
+        """
+        self.layers[0].X = X
+        self.layers[-1].Y = Y
+        self.revealing_correctness = print_accuracy
+        self.print_losses = print_loss
+        self.time_training = False
+        self.run_by_args(program, epochs, batch_size, *validation_data,
+                         reset=reset)
+
     def output_weights(self):
         print_float_precision(max(6, sfix.f // 3))
         for layer in self.layers:
@@ -2405,6 +2691,19 @@ def summary(self):
         print(sizes)
         print('Trainable params:', sum(sizes))
 
+    @property
+    def trainable_variables(self):
+        return list(self.thetas)
+
+    def reveal_model_to_binary(self):
+        input_shape = self.layers[0].X.shape
+        for layer in self.layers:
+            if len(input_shape) == 4 and isinstance(layer, DenseBase):
+                layer.reveal_parameters_to_binary(reshape=input_shape[1:])
+            else:
+                layer.reveal_parameters_to_binary()
+            input_shape = layer.Y.shape
+
 class Adam(Optimizer):
     """ Adam/AMSgrad optimizer.
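+
+    For instance, the following uses AMSgrad with the inverse square
+    root approximation (a sketch; ``layers`` as above)::
+
+        optimizer = Adam(layers, approx=True, amsgrad=True)
+        optimizer.run(batch_size=128)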
@@ -2414,7 +2713,8 @@ class Adam(Optimizer): """ def __init__(self, layers, n_epochs=1, approx=False, amsgrad=False, normalize=False): - self.gamma = MemValue(cfix(.001)) + super(Adam, self).__init__() + self.set_learning_rate(.001) self.beta1 = 0.9 self.beta2 = 0.999 self.beta1_power = MemValue(cfix(1)) @@ -2425,15 +2725,15 @@ def __init__(self, layers, n_epochs=1, approx=False, amsgrad=False, self.amsgrad = amsgrad self.normalize = normalize if amsgrad: - print_str('Using AMSgrad ') + print_both('Using AMSgrad ', end='') else: - print_str('Using Adam ') + print_both('Using Adam ', end='') if approx: - print_ln('with inverse square root approximation') + print_both('with inverse square root approximation') else: - print_ln('with more precise inverse square root') + print_both('with more precise inverse square root') if normalize: - print_ln('Normalize gradient') + print_both('Normalize gradient') self.layers = layers self.ms = [] @@ -2448,9 +2748,7 @@ def __init__(self, layers, n_epochs=1, approx=False, amsgrad=False, if amsgrad: self.vhats.append(nabla.same_shape()) - super(Adam, self).__init__() - - def update(self, i_epoch, batch): + def _update(self, i_epoch, i_batch, batch): self.beta1_power *= self.beta1 self.beta2_power *= self.beta2 m_factor = MemValue(1 / (1 - self.beta1_power)) @@ -2478,20 +2776,30 @@ def _(base, size): v_part = self.beta2 * v_part + (1 - self.beta2) * g_part ** 2 m.assign_vector(m_part, base) v.assign_vector(v_part, base) + mhat = m_part * m_factor.expand_to_vector(size) + vhat = v_part * v_factor.expand_to_vector(size) if self.amsgrad: - vhat = self.vhats [i_layer].get_vector(base, size) - vhat = util.max(vhat, v_part) + v_max = self.vhats [i_layer].get_vector(base, size) + vhat = util.max(vhat, v_max) self.vhats[i_layer].assign_vector(vhat, base) - diff = self.gamma.expand_to_vector(size) * m_part - else: - mhat = m_part * m_factor.expand_to_vector(size) - vhat = v_part * v_factor.expand_to_vector(size) - diff = self.gamma.expand_to_vector(size) * mhat + diff = self.gamma.expand_to_vector(size) * mhat if self.approx: diff *= mpc_math.InvertSqrt(vhat + self.epsilon ** 2) else: diff /= mpc_math.sqrt(vhat) + self.epsilon theta.assign_vector(theta.get_vector(base, size) - diff, base) + if self.output_diff: + @if_(i_batch % 100 == 0) + def _(): + diff.reveal().binary_output() + if self.output_stats and m.total_size() < 1000: + @if_(i_batch % self.output_stats == 0) + def _(): + self.stat('g', g) + self.stat('m', m) + self.stat('v', v) + self.stat('vhat', self.vhats[i_layer]) + self.stat('theta', theta) class SGD(Optimizer): """ Stochastic gradient descent. 
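+
+    A sketch of the typical flow, assuming tensors ``X`` and ``Y``
+    holding training data and labels of matching size::
+
+        sgd = SGD(layers)
+        sgd.fit(X, Y, epochs=10, batch_size=128)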
@@ -2500,7 +2808,8 @@ class SGD(Optimizer): :param n_epochs: number of epochs for training :param report_loss: disclose and print loss """ - def __init__(self, layers, n_epochs, debug=False, report_loss=None): + def __init__(self, layers, n_epochs=1, debug=False, report_loss=None): + super(SGD, self).__init__(report_loss=report_loss) self.momentum = 0.9 self.layers = layers self.n_epochs = n_epochs @@ -2510,9 +2819,9 @@ def __init__(self, layers, n_epochs, debug=False, report_loss=None): self.nablas.extend(layer.nablas()) for theta in layer.thetas(): self.delta_thetas.append(theta.same_shape()) - self.gamma = MemValue(cfix(0.01)) + self.set_learning_rate(0.01) self.debug = debug - super(SGD, self).__init__(report_loss) + print_both('Using SGD') @_no_mem_warnings def reset(self, X_by_label=None): @@ -2532,7 +2841,7 @@ def _(i): y.assign_all(0) super(SGD, self).reset() - def update(self, i_epoch, batch): + def _update(self, i_epoch, i_batch, batch): for nabla, theta, delta_theta in zip(self.nablas, self.thetas, self.delta_thetas): @multithread(self.n_threads, nabla.total_size()) @@ -2604,14 +2913,16 @@ def _(i): def apply_padding(input_shape, kernel_size, strides, padding): if isinstance(padding, int): - input_shape = [x + 2 * padding for x in input_shape] + padding = [padding, padding] + if isinstance(padding, (tuple, list)): + input_shape = [x + sum(padding) for x in input_shape] padding = 'valid' - if padding == 'valid': + if padding.lower() == 'valid': res = (input_shape[0] - kernel_size[0] + 1) // strides[0], \ (input_shape[1] - kernel_size[1] + 1) // strides[1], assert min(res) > 0, (input_shape, kernel_size, strides, padding) return res - elif padding == 'same': + elif padding.lower() == 'same': return (input_shape[0]) // strides[0], \ (input_shape[1]) // strides[1], else: @@ -2664,6 +2975,9 @@ def compile_by_args(self, program): self.optimizer = 'adam', [], {} elif 'amsgrad' in program.args: self.optimizer = 'adam', [], {'amsgrad': True} + elif 'amsgradprec' in program.args: + self.optimizer = 'adam', [], {'amsgrad': True, + 'approx': False} else: self.optimizer = 'sgd', [], {} @@ -2679,7 +2993,7 @@ def summary(self): def build(self, input_shape, batch_size=128): data_input_shape = input_shape if self.opt != None and \ - input_shape == self.opt.layers[0].X.sizes and \ + input_shape == self.opt.layers[0]._X.sizes and \ batch_size <= self.batch_size and \ type(self.opt).__name__.lower() == self.optimizer[0]: return @@ -2714,36 +3028,18 @@ def build(self, input_shape, batch_size=128): filters = layer[1]['filters'] strides = layer[1]['strides'] padding = layer[1]['padding'] - if isinstance(kernel_size, int): - kernel_size = (kernel_size, kernel_size) - if isinstance(strides, int): - strides = (strides, strides) - weight_shape = [filters] + list(kernel_size) + \ - [input_shape[-1]] - output_shape = [batch_size] + list( - apply_padding(input_shape[1:3], kernel_size, - strides, padding)) + [filters] - padding = padding.upper() if isinstance(padding, str) \ - else padding - layers.append(FixConv2d(input_shape, weight_shape, - (filters,), output_shape, - strides, padding)) + layers.append(easyConv2d( + input_shape, batch_size, filters, kernel_size, + strides, padding)) + output_shape = layers[-1].Y.sizes input_shape = output_shape print('conv output shape', output_shape) elif name == 'maxpool': pool_size = layer[1]['pool_size'] strides = layer[1]['strides'] padding = layer[1]['padding'] - if isinstance(pool_size, int): - pool_size = (pool_size, pool_size) - if isinstance(strides, int): - 
strides = (strides, strides) - if strides == None: - strides = pool_size - layers.append(MaxPool(input_shape, - [1] + list(strides) + [1], - [1] + list(pool_size) + [1], - padding)) + layers.append(easyMaxPool(input_shape, pool_size, + strides, padding)) input_shape = layers[-1].Y.sizes elif name == 'dropout': layers.append(Dropout(batch_size, reduce( @@ -2775,7 +3071,7 @@ def build(self, input_shape, batch_size=128): opt.momentum = momentum elif opt == 'adam': opt = Adam(layers, amsgrad=opts.pop('amsgrad', None), - approx=True) + approx=opts.pop('approx', True)) beta1 = opts.pop('beta_1', None) beta2 = opts.pop('beta_2', None) epsilon = opts.pop('epsilon', None) @@ -2795,7 +3091,7 @@ def build(self, input_shape, batch_size=128): raise Exception(opt + ' not supported') lr = opts.pop('learning_rate', None) if lr != None: - opt.gamma = MemValue(cfix(lr)) + opt.set_learning_rate(lr) if opts: raise Exception(opts + ' not supported') self.batch_size = batch_size @@ -2804,7 +3100,7 @@ def build(self, input_shape, batch_size=128): def fit(self, x, y, batch_size, epochs=1, validation_data=None): assert len(x) == len(y) self.build(x.sizes, batch_size) - if x.total_size() != self.opt.layers[0].X.total_size(): + if x.total_size() != self.opt.layers[0]._X.total_size(): raise Exception('sample data size mismatch') if y.total_size() != self.opt.layers[-1].Y.total_size(): print (y, self.opt.layers[-1].Y) @@ -2814,7 +3110,7 @@ def fit(self, x, y, batch_size, epochs=1, validation_data=None): else: if len(validation_data[0]) != len(validation_data[1]): raise Exception('test set size mismatch') - self.opt.layers[0].X.address = x.address + self.opt.layers[0]._X.address = x.address self.opt.layers[-1].Y.address = y.address self.opt.run_by_args(get_program(), epochs, batch_size, validation_data[0], validation_data[1], @@ -2828,6 +3124,195 @@ def predict(self, x, batch_size=None): batch_size = min(batch_size, self.batch_size) return self.opt.eval(x, batch_size=batch_size) +def layers_from_torch(sequence, data_input_shape, batch_size, input_via=None): + """ Convert a PyTorch Sequential object to MP-SPDZ layers. 
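+
+    A minimal sketch, assuming PyTorch is available at compile time
+    and MNIST-shaped training data::
+
+        import torch.nn as nn
+        net = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 128),
+                            nn.ReLU(), nn.Linear(128, 10))
+        layers = layers_from_torch(net, [60000, 28, 28, 1], 128)
+        optimizer = SGD(layers)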
+ + :param sequence: PyTorch Sequential object + :param data_input_shape: input shape (list of four int) + :param batch_size: batch size (int) + :param input_via: player to input model data via (default: don't) + + """ + layers = [] + + def mul(x): + return reduce(operator.mul, x) + + def process(item): + nonlocal input_shape + name = type(item).__name__ + if name == 'Sequential': + for x in item: + process(x) + elif name == 'Linear': + assert mul(input_shape[1:]) == item.in_features + assert item.bias is not None + layers.append(Dense(input_shape[0], item.in_features, + item.out_features)) + if input_via is not None: + shapes = [x.shape for x in (layers[-1].W, layers[-1].b)] + import numpy + swapped = item.weight.detach().numpy() + if len(input_shape) == 4: + print (swapped.shape) + swapped = numpy.reshape( + swapped, + [item.out_features, input_shape[3]] + input_shape[1:3]) + print (swapped.shape) + swapped = numpy.moveaxis(swapped, 1, -1) + print (swapped.shape) + swapped = numpy.reshape( + swapped, [item.out_features, item.in_features]) + print (swapped.shape) + swapped = numpy.swapaxes(swapped, 0, 1) + layers[-1].W = sfix.input_tensor_via( + input_via, swapped) + layers[-1].b = sfix.input_tensor_via( + input_via, item.bias.detach()) + assert layers[-1].W.shape == shapes[0] + assert layers[-1].b.shape == shapes[1] + input_shape = [batch_size, item.out_features] + elif name == 'Conv2d': + layers.append(easyConv2d(input_shape, batch_size, item.out_channels, + item.kernel_size, item.stride, + item.padding)) + input_shape = layers[-1].Y.shape + if input_via is not None: + shapes = [x.shape for x in + (layers[-1].weights, layers[-1].bias)] + import numpy + swapped = numpy.moveaxis( + numpy.array(item.weight.detach()), 1, -1) + layers[-1].weights = sfix.input_tensor_via(input_via, swapped) + layers[-1].bias = sfix.input_tensor_via( + input_via, item.bias.detach()) + assert layers[-1].weights.shape == shapes[0] + assert layers[-1].bias.shape == shapes[1] + elif name == 'MaxPool2d': + layers.append(easyMaxPool(input_shape, item.kernel_size, + item.stride, item.padding)) + input_shape = layers[-1].Y.shape + elif name == 'ReLU': + layers.append(Relu(input_shape)) + elif name == 'Flatten': + pass + elif name == 'BatchNorm2d': + layers.append(BatchNorm(layers[-1].Y.sizes)) + elif name == 'Dropout': + layers.append(Dropout(input_shape[0], mul(layers[-1].Y.sizes[1:]), + alpha=item.p)) + input_shape = layers[-1].Y.sizes + else: + raise CompilerError('unknown PyTorch module: ' + name) + + input_shape = data_input_shape + [1] * (4 - len(data_input_shape)) + process(sequence) + if layers[-1].d_out == 1: + layers.append(Output(data_input_shape[0])) + else: + layers.append(MultiOutput(data_input_shape[0], layers[-1].d_out)) + return layers + +class OneLayerSGD: + def __init__(self, n_epochs=1, batch_size=1, program=None): + self.n_epochs = n_epochs + self.batch_size = batch_size + self.program = program + + def fit(self, X_train, y_train): + """ Train classifier. + + :param X_train: training data (sfix matrix) + :param y_train: training binary labels (sint/sfix array) + + """ + self.init(X_train) + self.opt.fit(X_train, y_train, self.n_epochs, self.batch_size, + program=self.program, print_accuracy=False, + print_loss=False) + + def fit_with_testing(self, X_train, y_train, X_test, y_test): + """ Train classifier with accuracy output after every epoch. + This reveals all labels to simplify the accuracy computation. 
+
+        :param X_train: training data (sfix matrix)
+        :param y_train: training labels (sint/sfix array)
+        :param X_test: testing data (sfix matrix)
+        :param y_test: testing labels (sint/sfix array)
+
+        """
+        self.init(X_train)
+        self.opt.print_accuracy = self.print_accuracy
+        self.opt.fit(X_train, y_train, self.n_epochs, self.batch_size,
+                     validation_data=(X_test, y_test), program=self.program,
+                     print_accuracy=self.print_accuracy, print_loss=True)
+
+    def predict(self, X):
+        """ Use model for prediction.
+
+        :param X: sample data with row-wise samples (sfix matrix)
+        :returns: sfix array
+
+        """
+        return self.opt.eval(X)
+
+class SGDLogistic(OneLayerSGD):
+    """ Logistic regression using SGD.
+
+    :param n_epochs: number of epochs
+    :param batch_size: batch size
+    :param program: program object to use command-line options from (default is
+      not to use any)
+
+    """
+    print_accuracy = True
+
+    def init(self, X):
+        dense = Dense(*X.sizes, 1)
+        if self.program:
+            sigmoid = Output.from_args(X.sizes[0], self.program)
+            self.opt = Optimizer.from_args(self.program, [dense, sigmoid])
+        else:
+            sigmoid = Output(X.sizes[0])
+            self.opt = SGD([dense, sigmoid], 1)
+
+    def predict(self, X):
+        """ Use model to predict labels.
+
+        :param X: sample data with row-wise samples (sfix matrix)
+        :returns: sint array
+
+        """
+        return self.opt.eval(X, top=True)
+
+    def predict_proba(self, X):
+        """ Use model for probability estimates.
+
+        :param X: sample data with row-wise samples (sfix matrix)
+        :returns: sfix array
+
+        """
+        return super(SGDLogistic, self).predict(X)
+
+class SGDLinear(OneLayerSGD):
+    """ Linear regression using SGD.
+
+    :param n_epochs: number of epochs
+    :param batch_size: batch size
+    :param program: program object to use command-line options from (default is
+      not to use any)
+
+    """
+    print_accuracy = False
+
+    def init(self, X):
+        dense = Dense(*X.sizes, 1)
+        output = LinearOutput(X.sizes[0])
+        if self.program:
+            self.opt = Optimizer.from_args(self.program, [dense, output])
+        else:
+            self.opt = SGD([dense, output], 1)
+
 def solve_linear(A, b, n_iterations, progress=False, n_threads=None,
                  stop=False, already_symmetric=False, precond=False):
     """ Iterative linear solution approximation for :math:`Ax=b`.
@@ -2867,6 +3352,8 @@ def _(i):
                 vr.reveal(), v_norm.reveal())
         if stop:
             return (alpha > 0).reveal()
+    if not already_symmetric:
+        AtA.delete()
     return x
 
 def solve_linear_diag_precond(A, b, x, r, n_iterations, progress=False,
@@ -2926,3 +3413,26 @@ def _(i):
     def _(i):
         res.iadd((x[i] - mean.read()) ** 2)
     return res.read()
+
+def cholesky(A, reveal_diagonal=False):
+    """ Cholesky decomposition.
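+
+    :param A: symmetric positive-definite matrix (Matrix of sfix)
+    :param reveal_diagonal: reveal and print the diagonal elements
+      for debugging
+    :returns: lower triangular matrix :math:`L` with :math:`A = LL^T`
+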
""" + assert len(A.shape) == 2 + assert A.shape[0] == A.shape[1] + L = A.same_shape() + L.assign_all(0) + @for_range(A.shape[0]) + def _(i): + @for_range(i + 1) + def _(j): + sum = sfix.dot_product(L[i], L[j]) + + @if_e(i == j) + def _(): + L[i][j] = mpc_math.sqrt(A[i][i] - sum) + if reveal_diagonal: + print_ln('L[%s][%s] = %s = sqrt(%s - %s)', i, j, + L[i][j].reveal(), A[i][j].reveal(), sum.reveal()) + @else_ + def _(): + L[i][j] = (1.0 / L[j][j] * (A[i][j] - sum)) + return L diff --git a/Compiler/mpc_math.py b/Compiler/mpc_math.py index 8f09bd776..39e31fc6f 100644 --- a/Compiler/mpc_math.py +++ b/Compiler/mpc_math.py @@ -916,7 +916,7 @@ def SqrtComp(z, old=False): k = len(z) if isinstance(z[0], types.sint): return types.sfix._new(sum(z[i] * types.cfix( - 2 ** (-(i - f + 1) / 2)).v for i in range(k))) + 2 ** (-(i - f + 1) / 2), k=k, f=f).v for i in range(k))) k_prime = k // 2 f_prime = f // 2 c1 = types.sfix(2 ** ((f + 1) / 2 + 1)) diff --git a/Compiler/program.py b/Compiler/program.py index dfe08f87f..dfe6a5daf 100644 --- a/Compiler/program.py +++ b/Compiler/program.py @@ -10,6 +10,7 @@ import os import re import sys +import hashlib from collections import defaultdict, deque from functools import reduce @@ -49,12 +50,12 @@ class defaults: garbled = False prime = None galois = 40 - budget = 100000 + budget = 1000 mixed = False edabit = False invperm = False split = None - cisc = False + cisc = True comparison = None merge_opens = True preserve_mem_order = False @@ -126,7 +127,13 @@ def __init__(self, args, options=defaults, name=None): self.n_threads = 1 self.public_input_file = None self.types = {} - self.budget = int(self.options.budget) + if self.options.budget: + self.budget = int(self.options.budget) + else: + if self.options.optimize_hard: + self.budget = 100000 + else: + self.budget = defaults.budget self.to_merge = [ Compiler.instructions.asm_open_class, Compiler.instructions.gasm_open_class, @@ -175,6 +182,11 @@ def __init__(self, args, options=defaults, name=None): self.relevant_opts = set() self.n_running_threads = None self.input_files = {} + self.base_addresses = {} + self._protect_memory = False + if not self.options.cisc: + self.options.cisc = not self.options.optimize_hard + Program.prog = self from . 
import comparison, instructions, instructions_base, types @@ -196,7 +208,7 @@ def init_names(self, args): # ignore path to file - source must be in Programs/Source if "Programs" in os.listdir(os.getcwd()): # compile prog in ./Programs/Source directory - self.programs_dir = os.getcwd() + "/Programs" + self.programs_dir = "Programs" else: # assume source is in main SPDZ directory self.programs_dir = sys.path[0] + "/Programs" @@ -367,8 +379,12 @@ def write_bytes(self): sch_file.write("lgp:%s" % req) sch_file.write("\n") sch_file.write("opts: %s\n" % " ".join(self.relevant_opts)) + sch_file.close() + h = hashlib.sha256() for tape in self.tapes: tape.write_bytes() + h.update(tape.hash) + print('Hash:', h.hexdigest()) def finalize_tape(self, tape): if not tape.purged: @@ -435,7 +451,9 @@ def malloc(self, size, mem_type, reg_type=None, creator_tape=None): tn = get_thread_number() runtime_error_if(tn > self.n_running_threads, "malloc") - return addr + single_size * (tn - 1) + res = addr + single_size * (tn - 1) + self.base_addresses[str(res)] = addr + return res else: return addr @@ -443,6 +461,8 @@ def free(self, addr, mem_type): """Free memory""" if self.curr_block.alloc_pool is not self.curr_tape.basicblocks[0].alloc_pool: raise CompilerError("Cannot free memory within function block") + if not util.is_constant(addr): + addr = self.base_addresses[str(addr)] size = self.allocated_mem_blocks.pop((addr, mem_type)) self.free_mem_blocks[mem_type].push(addr, size) @@ -490,15 +510,26 @@ def public_input(self, x): ) self.public_input_file.write("%s\n" % str(x)) + def get_binary_input_file(self, player): + key = player, 'bin' + if key not in self.input_files: + filename = 'Player-Data/Input-Binary-P%d-0' % player + print('Writing binary data to', filename) + self.input_files[key] = open(filename, 'wb') + return self.input_files[key] + def set_bit_length(self, bit_length): """Change the integer bit length for non-linear functions.""" self.bit_length = bit_length print("Changed bit length for comparisons etc. to", bit_length) def set_security(self, security): + changed = self._security != security self._security = security self.non_linear.set_security(security) - print("Changed statistical security for comparison etc. to", security) + if changed: + print("Changed statistical security for comparison etc. to", + security) @property def security(self): @@ -626,6 +657,19 @@ def disable_memory_warnings(self): self.warn_about_mem.append(False) self.curr_block.warn_about_mem = False + def protect_memory(self, status): + """ Enable or disable memory protection. 
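+
+        :param status: whether to protect (bool)
+
+        This prevents the compiler from reordering the enclosed
+        memory accesses, for example::
+
+            get_program().protect_memory(True)
+            a[i] += x
+            get_program().protect_memory(False)
+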
""" + self._protect_memory = status + + def use_cisc(self): + return self.options.cisc and (not self.prime or self.rabbit_gap()) + + def rabbit_gap(self): + assert self.prime + p = self.prime + logp = int(round(math.log(p, 2))) + return abs(p - 2 ** logp) / p < 2 ** -self.security + @staticmethod def read_tapes(schedule): m = re.search(r"([^/]*)\.mpc", schedule) @@ -644,7 +688,7 @@ def read_tapes(schedule): sys.exit(1) for tapename in lines[2].split(" "): - yield tapename.strip() + yield tapename.strip().split(":")[0] class Tape: @@ -672,6 +716,7 @@ def __init__(self, name, program): self.singular = True self.free_threads = set() self.loop_breaks = [] + self.warned_about_mem = False class BasicBlock(object): def __init__(self, parent, name, scope, exit_condition=None): @@ -984,7 +1029,7 @@ def alloc_loop(block): if self.program.verbose: print("Tape requires", self.req_num) for req, num in sorted(self.req_num.items()): - if num == float("inf") or num >= 2**32: + if num == float("inf") or num >= 2**64: num = -1 if req[1] in data_types: self.basicblocks[-1].instructions.append( @@ -1092,10 +1137,14 @@ def write_bytes(self, filename=None): filename = self.program.programs_dir + "/Bytecode/" + filename print("Writing to", filename) f = open(filename, "wb") + h = hashlib.sha256() for i in self._get_instructions(): if i is not None: - f.write(i.get_bytes()) + b = i.get_bytes() + f.write(b) + h.update(b) f.close() + self.hash = h.digest() def new_reg(self, reg_type, size=None): return self.Register(reg_type, self, size=size) @@ -1274,8 +1323,11 @@ class _no_truth(object): def __bool__(self): raise CompilerError( - "Cannot derive truth value from register, " - "consider using 'compile.py -l'" + "Cannot derive truth value from register. " + "This is a catch-all error appearing if you try to use a " + "run-time value where the compiler expects a compile-time " + "value, most likely a Python integer. " + "In some cases, you can fix this by using 'compile.py -l'." ) class Register(_no_truth): diff --git a/Compiler/sorting.py b/Compiler/sorting.py index fc619b732..c8cb87e89 100644 --- a/Compiler/sorting.py +++ b/Compiler/sorting.py @@ -10,6 +10,16 @@ def dest_comp(B): return sum(Tt) - 1 def reveal_sort(k, D, reverse=False): + """ Sort in place according to "perfect" key. The name hints at the fact + that a random order of the keys is revealed. + + :param k: vector or Array of sint containing exactly :math:`0,\dots,n-1` + in any order + :param D: Array or MultiArray to sort + :param reverse: wether :py:obj:`key` is a permutation in forward or + backward order + + """ assert len(k) == len(D) library.break_point() shuffle = types.sint.get_secure_shuffle(len(k)) @@ -28,6 +38,14 @@ def reveal_sort(k, D, reverse=False): instructions.delshuffle(shuffle) def radix_sort(k, D, n_bits=None, signed=True): + """ Sort in place according to key. 
+ + :param k: keys (vector or Array of sint or sfix) + :param D: Array or MultiArray to sort + :param n_bits: number of bits in keys (int) + :param signed: whether keys are signed (bool) + + """ assert len(k) == len(D) bs = types.Matrix.create_from(k.get_vector().bit_decompose(n_bits)) if signed and len(bs) > 1: diff --git a/Compiler/sqrt_oram.py b/Compiler/sqrt_oram.py index 741baaf74..ae1aa81ca 100644 --- a/Compiler/sqrt_oram.py +++ b/Compiler/sqrt_oram.py @@ -764,9 +764,8 @@ def condition_i(i): # We only need once, so we pick the first one we find @lib.for_range_opt(self.n) def _(i): - nonlocal done self.physical_demux[i] &= done.bit_not() - done |= self.physical_demux[i] + done.update(done | self.physical_demux[i]) # Retrieve the value from the physical memory obliviously @lib.map_sum_opt(get_n_threads(self.n), self.n, [self.value_type]) diff --git a/Compiler/types.py b/Compiler/types.py index ab100ce3b..34e574807 100644 --- a/Compiler/types.py +++ b/Compiler/types.py @@ -166,7 +166,7 @@ def vectorized_function(cls, *args, **kwargs): size = None if 'size' in kwargs: size = kwargs.pop('size') - if size: + if size is not None: set_global_vector_size(size) try: res = function(cls, *args, **kwargs) @@ -187,7 +187,7 @@ def vectorized_init(*args, **kwargs): if 'size' in kwargs and kwargs['size'] is not None \ and kwargs['size'] != size: raise CompilerError('Mismatch in vector size') - if 'size' in kwargs and kwargs['size']: + if 'size' in kwargs and kwargs['size'] is not None: size = kwargs['size'] if size is not None: set_global_vector_size(size) @@ -344,6 +344,10 @@ def popcnt_bits(bits): def zero_if_not(self, condition): return condition * self + def iadd(self, other): + """ Addition assignment. This uses :py:func:`update` internally. """ + self.update(self + other) + class _int(Tape._no_truth): """ Integer functionality. """ @@ -537,6 +541,8 @@ def Tensor(cls, shape): """ if len(shape) == 1: return Array(shape[0], cls) + elif len(shape) == 2: + return Matrix(*shape, cls) else: return MultiArray(shape, cls) @@ -577,7 +583,8 @@ def input_tensor_from_client(cls, client_id, shape): return res @classmethod - def input_tensor_via(cls, player, content): + def input_tensor_via(cls, player, content=None, shape=None, binary=True, + one_hot=False): """ Input tensor-like data via a player. This overwrites the input file for the relevant player. The following returns an @@ -586,37 +593,74 @@ def input_tensor_via(cls, player, content): M = [[1, 2], [3, 4]] sint.input_tensor_via(0, M) - Make sure to copy ``Player-Data/Input-P-0`` if running + Make sure to copy ``Player-Data/Input-P-0`` or + ``Player-Data/Input-Binary-P-0`` if running on another host. 
+ :param player: player to input via (int) + :param content: nested Python list or numpy array (binary mode only) or + left out if not available + :param shape: shape if content not given + :param binary: binary mode (bool) + :param one_hot: one-hot encoding (bool) + """ if program.curr_tape != program.tapes[0]: raise CompilerError('only available in main thread') - shape = [] - tmp = content - while True: - try: - shape.append(len(tmp)) - tmp = tmp[0] - except: - break - if not program.input_files.get(player, None): - program.input_files[player] = open( - 'Player-Data/Input-P%d-0' % player, 'w') - f = program.input_files[player] - def traverse(content, level): - assert len(content) == shape[level] - if level == len(shape) - 1: - for x in content: - f.write(' ') - f.write(str(x)) + if content is not None: + requested_shape = shape + if binary: + import numpy + content = numpy.array(content) + if issubclass(cls, _fix): + min_k = \ + math.ceil(math.log(abs(content).max(), 2)) + cls.f + 1 + if cls.k < min_k: + raise CompilerError( + "data outside fixed-point range, " + "use 'sfix.set_precision(%d, %d)'" % (cls.f, min_k)) + if binary == 2: + t = numpy.double + else: + t = numpy.single + else: + t = numpy.int64 + if one_hot: + content = numpy.eye(content.max() + 1)[content] + content = content.astype(t) + f = program.get_binary_input_file(player) + f.write(content.tobytes()) + f.flush() + shape = content.shape else: - for x in content: - traverse(x, level + 1) - traverse(content, 0) - f.write('\n') + shape = [] + tmp = content + while True: + try: + shape.append(len(tmp)) + tmp = tmp[0] + except: + break + if not program.input_files.get(player, None): + program.input_files[player] = open( + 'Player-Data/Input-P%d-0' % player, 'w') + f = program.input_files[player] + def traverse(content, level): + assert len(content) == shape[level] + if level == len(shape) - 1: + for x in content: + f.write(' ') + f.write(str(x)) + else: + for x in content: + traverse(x, level + 1) + traverse(content, 0) + f.write('\n') + if requested_shape is not None and \ + list(shape) != list(requested_shape): + raise CompilerError('content contradicts shape') res = cls.Tensor(shape) - res.input_from(player) + res.input_from(player, binary=binary) return res class _vec(Tape._no_truth): @@ -1357,14 +1401,14 @@ def store_in_mem(self, address): @vectorized_classmethod def pop(cls): - """ Pop from stack. """ + """ Pop from stack. Made obsolete by :py:func:`update`. """ res = cls() popint(res) return res @vectorized_classmethod def push(cls, value): - """ Push to stack. + """ Push to stack. Made obsolete by :py:func:`update`. :param value: any convertible type """ pushint(cls.conv(value)) @@ -1728,6 +1772,38 @@ def __init__(self, player, value): self.player = player self._v = value + @classmethod + def read_int(cls, player): + """ Read integer from + ``Player-Data/Input-Binary-P-`` only on + party :py:obj:`player`. + + :param player: player (int) + :return: personal cint + + """ + tmp = cint() + fixinput(player, tmp, 0, 0) + return cls(player, tmp) + + @classmethod + def read_fix(cls, player, f, k, precision): + """ Read fixed-point value from + ``Player-Data/Input-Binary-P-`` only on + party :py:obj:`player`. 
+ + :param player: player (int) + :param f: fixed-point precision (int) + :param k: fixed-point length (int) + :param precision: input precision (1: single, 2: double) + :return: personal cfix + + """ + assert precision in (1, 2) + tmp = cint() + fixinput(player, tmp, f, precision) + return cls(player, cfix._new(tmp, f=f, k=k)) + def binary_output(self): """ Write binary output to ``Player-Data/Binary-Output-P-`` if @@ -2278,14 +2354,17 @@ def get_random(cls): return res @vectorized_classmethod - def get_input_from(cls, player): + def get_input_from(cls, player, binary=False): """ Secret input. :param player: public (regint/cint/int) :param size: vector size (int, default 1) """ - res = cls() - inputmixed('int', res, player) + if binary: + return cls(personal.read_int(player)) + else: + res = cls() + inputmixed('int', res, player) return res @vectorized_classmethod @@ -2478,7 +2557,8 @@ def __init__(self, val=None, size=None): inputpersonal(size, val.player, self, self.clear_type.conv(val._v)) elif isinstance(val, _fix): super(sint, self).__init__('s', size=val.v.size) - self.load_other(val.v.round(val.k, val.f)) + self.load_other(val.v.round(val.k, val.f, + nearest=val.round_nearest)) elif isinstance(val, sbitvec): super(sint, self).__init__('s', val=val, size=val[0].n) else: @@ -2533,8 +2613,9 @@ def __ge__(self, other, bit_length=None, security=None): @type_comp @vectorize def __eq__(self, other, bit_length=None, security=None): - return floatingpoint.EQZ(self - other, bit_length or program.bit_length, - security or program.security) + return sintbit.conv( + floatingpoint.EQZ(self - other, bit_length or program.bit_length, + security or program.security)) @read_mem_value @type_comp @@ -2696,7 +2777,8 @@ def Norm(self, k, f, kappa=None, simplex_flag=False): @vectorize def int_div(self, other, bit_length=None, security=None): - """ Secret integer division. + """ Secret integer division. Note that the domain bit length + needs to be about four times the bit length. :param other: sint :param bit_length: bit length of input (default: global bit length) @@ -2710,7 +2792,8 @@ def int_div(self, other, bit_length=None, security=None): @vectorize def int_mod(self, other, bit_length=None): - """ Secret integer modulo. + """ Secret integer modulo. Note that the domain bit length + needs to be about four times the bit length. :param other: sint :param bit_length: bit length of input (default: global bit length) @@ -3626,9 +3709,8 @@ class cfix(_number, _structure): scalars = (int, float, regint, cint) @classmethod def set_precision(cls, f, k = None): - """ Set the precision of the integer representation. Note that some - operations are undefined when the precision of :py:class:`sfix` and - :py:class:`cfix` differs. The initial defaults are chosen to + """ Set the precision of the integer representation. + The initial defaults are chosen to allow the best optimization of probabilistic truncation in computation modulo 2^64 (2*k < 64). 
Generally, 2*k must be at most the integer length for rings and at most m-s-1 for @@ -3686,6 +3768,10 @@ def cfix_to_cint(fix_val): def malloc(size, creator_tape=None): return program.malloc(size, cint, creator_tape=creator_tape) + @classmethod + def free(cls, addr): + return cint.free(addr) + @staticmethod def n_elements(): return 1 @@ -3749,6 +3835,9 @@ def __getitem__(self, index): return [self._new(x, k=self.k, f=self.f) for x in self.v[index]] return self._new(self.v[index], k=self.k, f=self.f) + def get_vector(self): + return self + @vectorize def load_int(self, v): self.v = cint(v) * (2 ** self.f) @@ -3777,14 +3866,25 @@ def size(self): def sizeof(self): return self.size * 4 + @read_mem_value + def parse_type(self, other): + res = parse_type(other, f=self.f, k=self.k) + # check attributes if available + try: + assert res.k == self.k + assert res.f == self.f + except AttributeError: + pass + return res + @vectorize def add(self, other): """ Clear fixed-point addition. :param other: cfix/cint/regint/int """ - other = parse_type(other) + other = self.parse_type(other) if isinstance(other, cfix): - return cfix._new(self.v + other.v) + return cfix._new(self.v + other.v, k=self.k, f=self.f) else: return NotImplemented @@ -3796,13 +3896,13 @@ def mul(self, other): return sfix._new(self.v * other, k=self.k, f=self.f) if isinstance(other, (int, regint, cint)): return cfix._new(self.v * cint(other), k=self.k, f=self.f) - other = parse_type(other) + other = self.parse_type(other) if isinstance(other, cfix): assert self.f == other.f sgn = cint(1 - 2 * ((self < 0) ^ (other < 0))) absolute = self.v * other.v * sgn val = sgn * (absolute >> self.f) - return cfix._new(val) + return cfix._new(val, k=self.k, f=self.f) elif isinstance(other, sfix): return NotImplemented else: @@ -3819,11 +3919,11 @@ def __sub__(self, other): """ Clear fixed-point subtraction. :param other: cfix/cint/regint/int """ - other = parse_type(other) + other = self.parse_type(other) if isinstance(other, cfix): - return cfix._new(self.v - other.v) + return cfix._new(self.v - other.v, k=self.k, f=self.f) elif isinstance(other, sfix): - return sfix._new(self.v - other.v) + return sfix._new(self.v - other.v, k=self.k, f=self.f) else: raise NotImplementedError @@ -3831,7 +3931,7 @@ def __sub__(self, other): def __neg__(self): """ Clear fixed-point negation. """ # cfix type always has .v - return cfix._new(-self.v) + return cfix._new(-self.v, f=self.f, k=self.k) def __rsub__(self, other): return -self + other @@ -3844,7 +3944,7 @@ def __eq__(self, other): :param other: cfix/cint/regint/int :return: 0/1 :rtype: regint """ - other = parse_type(other) + other = self.parse_type(other) if isinstance(other, cfix): return self.v == other.v elif isinstance(other, sfix): @@ -3855,7 +3955,7 @@ def __eq__(self, other): @vectorize def __lt__(self, other): """ Clear fixed-point comparison. """ - other = parse_type(other) + other = self.parse_type(other) if isinstance(other, cfix): assert self.k == other.k return self.v.less_than(other.v, self.k) @@ -3869,7 +3969,7 @@ def __lt__(self, other): @vectorize def __le__(self, other): """ Clear fixed-point comparison. """ - other = parse_type(other) + other = self.parse_type(other) if isinstance(other, cfix): return 1 - (self > other) elif isinstance(other, sfix): @@ -3880,7 +3980,7 @@ def __le__(self, other): @vectorize def __gt__(self, other): """ Clear fixed-point comparison. 
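+
+        :param other: cfix/cint/regint/int
+        :return: 0/1
+        :rtype: regint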
""" - other = parse_type(other) + other = self.parse_type(other) if isinstance(other, cfix): return other.__lt__(self) elif isinstance(other, sfix): @@ -3891,7 +3991,7 @@ def __gt__(self, other): @vectorize def __ge__(self, other): """ Clear fixed-point comparison. """ - other = parse_type(other) + other = self.parse_type(other) if isinstance(other, cfix): return 1 - (self < other) elif isinstance(other, sfix): @@ -3902,7 +4002,7 @@ def __ge__(self, other): @vectorize def __ne__(self, other): """ Clear fixed-point comparison. """ - other = parse_type(other) + other = self.parse_type(other) if isinstance(other, cfix): return self.v != other.v elif isinstance(other, sfix): @@ -3919,7 +4019,7 @@ def __truediv__(self, other): """ Clear fixed-point division. :param other: cfix/cint/regint/int """ - other = parse_type(other, self.k, self.f) + other = self.parse_type(other) if isinstance(other, cfix): return cfix._new(library.cint_cint_division( self.v, other.v, self.k, self.f), k=self.k, f=self.f) @@ -3938,7 +4038,7 @@ def __rtruediv__(self, other): """ Fixed-point division. :param other: sfix/sint/cfix/cint/regint/int """ - other = parse_type(other, self.k, self.f) + other = self.parse_type(other) return other / self @vectorize @@ -4230,7 +4330,7 @@ def set_precision_from_args(cls, program, adapt_ring=False): elif k is not None: raise CompilerError('need to set fractional precision') if 'nearest' in program.args: - print('Nearest rounding instead of proabilistic ' + print('Nearest rounding instead of probabilistic ' 'for fixed-point computation') cls.round_nearest = True if adapt_ring and program.options.ring \ @@ -4242,7 +4342,7 @@ def set_precision_from_args(cls, program, adapt_ring=False): program.set_ring_size(need) @classmethod - def coerce(cls, other): + def coerce(cls, other, equal_precision=None): if isinstance(other, (_fix, cls.clear_type)): return other else: @@ -4303,6 +4403,10 @@ def __init__(self, _v=None, k=None, f=None, size=None): self.v = type(self)(_v.read()).v elif isinstance(_v, (list, tuple)): self.v = self.int_type(list(self.conv(x).v for x in _v)) + elif isinstance(_v, personal): + assert _v._v.f == f + assert _v._v.k == k + self.v = self.int_type(personal(_v.player, _v._v.v)) else: raise CompilerError('cannot convert %s to sfix' % _v) if not isinstance(self.v, self.int_type): @@ -4347,7 +4451,7 @@ def mul(self, other): k = len(bin(abs(v))) - 1 other = self.multipliable(v, k, f, self.size) try: - other = self.coerce(other) + other = self.coerce(other, equal_precision=False) except: return NotImplemented if isinstance(other, (_fix, self.clear_type)): @@ -4463,16 +4567,19 @@ class sfix(_fix): default_type = sint @vectorized_classmethod - def get_input_from(cls, player): + def get_input_from(cls, player, binary=False): """ Secret fixed-point input. 
:param player: public (regint/cint/int) :param size: vector size (int, default 1) """ cls.int_type.require_bit_length(cls.k) - v = cls.int_type() - inputmixed('fix', v, cls.f, player) - return cls._new(v) + if binary: + return cls(personal.read_fix(player, cls.f, cls.k, int(binary))) + else: + v = cls.int_type() + inputmixed('fix', v, cls.f, player) + return cls._new(v) @vectorized_classmethod def get_raw_input_from(cls, player): @@ -4487,21 +4594,31 @@ def get_random(cls, lower, upper, symmetric=True): :param upper: float :param size: vector size (int, default 1) """ + f = cls.f + k = cls.k log_range = int(math.log(upper - lower, 2)) n_bits = log_range + cls.f + gen_range = (2 ** (n_bits) - 1) / 2 ** cls.f + diff = upper - lower + factor = diff / gen_range + real = lambda x: cfix.int_rep(x, f, k) * 2 ** -f + real_range = real(real(factor) * gen_range) average = lower + 0.5 * (upper - lower) - real_range = (2 ** (n_bits) - 1) / 2 ** cls.f lower = average - 0.5 * real_range - real_lower = round(lower * 2 ** cls.f) / 2 ** cls.f - r = cls._new(cls.int_type.get_random_int(n_bits)) + lower + upper = average + 0.5 * real_range + r = cls._new(cls.int_type.get_random_int(n_bits)) * factor + lower if symmetric: lowest = math.floor(lower * 2 ** cls.f) / 2 ** cls.f - print('randomness range [%f,%f], fringes half the probability' % \ - (lowest, lowest + 2 ** log_range)) + highest = math.ceil(upper * 2 ** cls.f) / 2 ** cls.f + if program.verbose: + print('randomness range [%f,%f], ' + 'fringes half the probability' % \ + (lowest, highest)) return cls.int_type.get_random_bit().if_else(r, -r + 2 * average) else: - print('randomness range [%f,%f], %d bits' % \ - (real_lower, real_lower + real_range, n_bits)) + if program.verbose: + print('randomness range [%f,%f], %d bits' % \ + (real(lower), real(lower) + real_range, n_bits)) return r @classmethod @@ -4531,8 +4648,17 @@ def dot_product(cls, x, y, res_params=None): def expand_to_vector(self, size): return self._new(self.v.expand_to_vector(size), k=self.k, f=self.f) - def coerce(self, other): - return parse_type(other, k=self.k, f=self.f) + @read_mem_value + def coerce(self, other, equal_precision=True): + res = parse_type(other, k=self.k, f=self.f) + if equal_precision: + # check parameters if available + try: + assert res.k == self.k + assert res.f == self.f + except AttributeError: + pass + return res def hard_conv_me(self, cls): assert cls == sint @@ -4953,29 +5079,20 @@ def __init__(self, v, p=None, z=None, s=None, size=None): if isinstance(v, int): if not ((v >= 2**(self.vlen-1) and v < 2**(self.vlen)) or v == 0): raise CompilerError('Floating point number malformed: significand') - self.v = sint(v) - else: - self.v = v if isinstance(p, int): if not (p >= -2**(self.plen - 1) and p < 2**(self.plen - 1)): raise CompilerError('Floating point number malformed: exponent %d not unsigned %d-bit integer' % (p, self.plen)) - self.p = sint(p) - else: - self.p = p if isinstance(z, int): if not (z == 0 or z == 1): raise CompilerError('Floating point number malformed: zero bit') - self.z = sint() - ldsi(self.z, z) - else: - self.z = z if isinstance(s, int): if not (s == 0 or s == 1): raise CompilerError('Floating point number malformed: sign') - self.s = sint() - ldsi(self.s, s) - else: - self.s = s + # copying necessary for update to work properly + self.v = sint(v) + self.p = sint(p) + self.z = sint(z) + self.s = sint(s) def __getitem__(self, index): return sfloat(*(x[index] for x in self)) @@ -5240,6 +5357,19 @@ def reveal(self): :return: cfloat """ return 
cfloat(self.v.reveal(), self.p.reveal(), self.z.reveal(), self.s.reveal())
 
+    def update(self, other):
+        """
+        Update register. Useful in loops like
+        :py:func:`~Compiler.library.for_range`.
+
+        :param other: any convertible type
+
+        """
+        self.v.update(other.v)
+        self.p.update(other.p)
+        self.z.update(other.z)
+        self.s.update(other.s)
+
 class cfloat(Tape._no_truth):
     """ Helper class for printing revealed sfloats. """
     __slots__ = ['v', 'p', 'z', 's', 'nan']
@@ -5297,6 +5427,12 @@ def reveal_to_clients(self, clients):
         """
         self.value_type.reveal_to_clients(clients, [self.get_vector()])
 
+    @staticmethod
+    def _cmp_fail(*args):
+        raise CompilerError('equality of data structures is not implemented')
+
+    __eq__ = __ne__ = __le__ = __lt__ = __gt__ = __ge__ = _cmp_fail
+
 class Array(_vectorizable):
     """ Array accessible by public index. That is, ``a[i]`` works for an
@@ -5361,14 +5497,28 @@ def __init__(self, length, value_type, address=None, debug=None, alloc=True):
             self.alloc()
 
     def alloc(self):
-        if self.address is None:
-            self.address = self.value_type.malloc(self.length,
-                                                  self.creator_tape)
+        if self._address is None:
+            try:
+                self.address = self.value_type.malloc(self.length,
+                                                      self.creator_tape)
+            except AttributeError:
+                raise CompilerError('cannot create Array of %s' % \
+                                    self.value_type)
 
     def delete(self):
         self.value_type.free(self.address)
         self.address = None
 
+    @property
+    def address(self):
+        if self._address is None:
+            raise CompilerError('trying to access unallocated memory')
+        return self._address
+
+    @address.setter
+    def address(self, address):
+        self._address = address
+
     def get_address(self, index, size=None):
         if isinstance(index, (_secret, _single)):
             raise CompilerError('need cleartext index')
@@ -5498,6 +5648,10 @@ def __len__(self):
     def total_size(self):
         return self.length * self.value_type.n_elements()
 
+    @property
+    def shape(self):
+        return [self.length]
+
     def __iter__(self):
         for i in range(self.length):
             yield self[i]
@@ -5539,16 +5693,25 @@ def assign_all(self, value, use_threads=True, conv=True):
         """ Assign the same value to all entries.
 
         :param value: convertible to basic type """
-        if conv:
-            value = self.value_type.conv(value)
-            if value.size != 1:
-                raise CompilerError('cannot assign vector to all elements')
-        mem_value = MemValue(value)
+        from Compiler.GC.types import bits
+        use_vector = util.is_constant(value) and \
+            not issubclass(self.value_type, (bits, squant))
+        if not use_vector:
+            if conv:
+                value = self.value_type.conv(value)
+                if value.size != 1:
+                    raise CompilerError('cannot assign vector to all elements')
+            mem_value = MemValue(value)
         self.address = MemValue.if_necessary(self.address)
         n_threads = 8 if use_threads and len(self) > 2**20 else None
-        @library.for_range_multithread(n_threads, 1024, len(self))
-        def f(i):
-            self[i] = mem_value
+        @library.multithread(n_threads, len(self))
+        def _(base, size):
+            if use_vector:
+                self.assign_vector(self.value_type(value, size=size), base)
+            else:
+                @library.for_range_opt(size)
+                def _(i):
+                    self[base + i] = mem_value
         return self
 
     def get_vector(self, base=0, size=None):
@@ -5615,7 +5778,15 @@ def expand_to_vector(self, index, size):
     def get_mem_value(self, index):
         return MemValue(self[index], self.get_address(index))
 
-    def input_from(self, player, budget=None, raw=False):
+    def concat(self, other):
+        """ Concatenate two arrays. """
""" + assert self.value_type == other.value_type + res = Array(len(self) + len(other), self.value_type) + res.assign_vector(self[:]) + res.assign_vector(other[:], len(self)) + return res + + def input_from(self, player, budget=None, raw=False, **kwargs): """ Fill with inputs from player if supported by type. :param player: public (regint/cint/int) """ @@ -5624,12 +5795,15 @@ def input_from(self, player, budget=None, raw=False): else: input_from = self.value_type.get_input_from try: - self.assign(input_from(player, size=len(self))) + @library.multithread(None, len(self), + max_size=budget or program.budget) + def _(base, size): + self.assign(input_from(player, size=size, **kwargs), base) except (TypeError, CompilerError): print (budget) @library.for_range_opt(self.length, budget=budget) def _(i): - self[i] = input_from(player) + self[i] = input_from(player, **kwargs) def read_from_file(self, start): """ Read content from ``Persistence/Transactions-P.data``. @@ -5713,15 +5887,29 @@ def shuffle(self): self.assign_vector(self.get(regint.inc(len(self)).shuffle())) def secure_shuffle(self): - """ Secure shuffle in place according to the security model. """ + """ Secure shuffle in place according to the security model. + See :py:func:`MultiArray.secure_shuffle` for references. """ self.assign_vector(self.get_vector().secure_shuffle()) def secure_permute(self, *args, **kwargs): - """ Secure permutate in place according to the security model. """ + """ Secure permutate in place according to the security model. + See :py:func:`MultiArray.secure_shuffle` for references. + + :param permutation: output of :py:func:`sint.get_secure_shuffle()` + :param reverse: whether to apply inverse (default: False) + + """ self.assign_vector(self.get_vector().secure_permute(*args, **kwargs)) def randomize(self, *args): - """ Randomize according to data type. """ + """ Randomize array according to data type. + If it is :py:class:`sfix`, the following will sample an + individual uniformly random entry of the array + :py:obj:`M` roughly in the range :math:`[a,b]`:: + + M.randomize(a, b) + + """ self.assign_vector(self.value_type.get_random(*args, size=len(self))) def reveal(self): @@ -5780,10 +5968,12 @@ def reveal_to(self, player): def sort(self, n_threads=None, batcher=False, n_bits=None): """ - Sort in place using radix sort with complexity :math:`O(n \log - n)` for :py:class:`sint` and :py:class:`sfix`, and Batcher's - odd-even mergesort with :math:`O(n (\log n)^2)` for - :py:class:`sfloat`. + Sort in place using `radix sort + `_ with complexity + :math:`O(n \log n)` for :py:class:`sint` and :py:class:`sfix`, + and `Batcher's odd-even mergesort + `_ with :math:`O(n (\log + n)^2)` for :py:class:`sfloat`. :param n_threads: number of threads to use (single thread by default), need to use Batcher's algorithm for several threads @@ -5878,10 +6068,16 @@ def __len__(self): """ Size of top dimension. """ return self.sizes[0] + @property + def shape(self): + return list(self.sizes) + def __iter__(self): return (self[i] for i in range(len(self))) def to_array(self): + assert self.value_type.n_elements() == 1 and \ + self.value_type.mem_size() == 1 return Array(self.total_size(), self.value_type, address=self.address) def maybe_get(self, condition, index): @@ -5895,9 +6091,12 @@ def assign_all(self, value): """ Assign the same value to all entries. 
:param value: convertible to relevant basic type """ - @library.for_range(self.sizes[0]) - def f(i): - self[i].assign_all(value) + try: + self.to_array().assign_all(value) + except AssertionError: + @library.for_range(self.sizes[0]) + def f(i): + self[i].assign_all(value) return self def total_size(self): @@ -6005,7 +6204,7 @@ def get_addresses(self, *indices): def get_vector_by_indices(self, *indices): """ Vector with potential asterisks. The potential retrieves - all entry where the first dimension index is 0, and the third + all entries where the first dimension index is 0, and the third dimension index is 1:: a.get_vector_by_indices(0, None, 1) @@ -6046,22 +6245,18 @@ def concat(self, other): res.assign_part_vector(other[:], self.sizes[0]) return res - def input_from(self, player, budget=None, raw=False): + def input_from(self, player, budget=None, raw=False, **kwargs): """ Fill with inputs from player if supported by type. :param player: public (regint/cint/int) """ if util.is_constant(self.total_size()) and \ self.value_type.n_elements() == 1 and \ self.value_type.mem_size() == 1: - if raw or program.always_raw(): - input_from = self.value_type.get_raw_input_from - else: - input_from = self.value_type.get_input_from - self.assign_vector(input_from(player, size=self.total_size())) + self.to_array().input_from(player, budget=budget, raw=raw, **kwargs) else: @library.for_range_opt(self.sizes[0], budget=budget) def _(i): - self[i].input_from(player, budget=budget, raw=raw) + self[i].input_from(player, budget=budget, raw=raw, **kwargs) def write_to_file(self, position=None): """ Write shares of integer representation to @@ -6174,7 +6369,10 @@ def dot(self, other, res_params=None, n_threads=None): :param self: two-dimensional :param other: Matrix or Array of matching size and type - :param n_threads: number of threads (default: all in same thread) """ + :param n_threads: number of threads (default: all in same thread) + :rtype: Matrix or Array of appropriate size and type + + """ assert len(self.sizes) == 2 if isinstance(other, Array): assert len(other) == self.sizes[1] @@ -6241,6 +6439,8 @@ def _(k): def direct_mul(self, other, reduce=True, indices=None): """ Matrix multiplication in the virtual machine. + Unlike :py:func:`dot`, this only works for sint and sfix, and it + returns a vector instead of a data structure. 
:param self: :py:class:`Matrix` / 2-dimensional :py:class:`MultiArray` :param other: :py:class:`Matrix` / 2-dimensional :py:class:`MultiArray` @@ -6326,6 +6526,10 @@ def trans_mul_to(self, other, res, n_threads=None): :param res: matrix of matching dimension to store result :param n_threads: number of threads (default: single thread) """ + assert other.sizes[0] == self.sizes[0] + assert res.sizes[0] == self.sizes[1] + assert res.sizes[1] == other.sizes[1] + assert len(res.sizes) == 2 @library.for_range_multithread(n_threads, 1, self.sizes[1]) def _(i): indices = [regint(i), regint.inc(self.sizes[0])] @@ -6342,6 +6546,10 @@ def mul_trans_to(self, other, res, n_threads=None): :param res: matrix of matching dimension to store result :param n_threads: number of threads (default: single thread) """ + assert other.sizes[1] == self.sizes[1] + assert res.sizes[0] == self.sizes[0] + assert res.sizes[1] == other.sizes[0] + assert len(res.sizes) == 2 @library.for_range_multithread(n_threads, 1, self.sizes[0]) def _(i): indices = [regint(i), regint.inc(self.sizes[1])] @@ -6354,62 +6562,32 @@ def direct_mul_to_matrix(self, other): res.assign_vector(self.direct_mul(other)) return res - def budget_mul(self, other, n_rows, row, n_columns, column, reduce=True, - res=None): - assert len(self.sizes) == 2 - assert len(other.sizes) == 2 - if res is None: - if reduce: - res_matrix = Matrix(n_rows, n_columns, self.value_type) - else: - res_matrix = Matrix(n_rows, n_columns, \ - self.value_type.unreduced_type) - else: - res_matrix = res - @library.for_range_opt(n_rows) - def _(i): - @library.for_range_opt(n_columns) - def _(j): - col = column(other, j) - r = row(self, i) - if reduce: - res_matrix[i][j] = self.value_type.dot_product(r, col) - else: - entry = self.value_type.unreduced_dot_product(r, col) - res_matrix[i][j] = entry - return res_matrix - def plain_mul(self, other, res=None): - """ Alternative matrix multiplication. - - :param self: two-dimensional - :param other: two-dimensional container of matching type and size """ - assert other.sizes[0] == self.sizes[1] - return self.budget_mul(other, self.sizes[0], lambda x, i: x[i], \ - other.sizes[1], \ - lambda x, j: [x[k][j] for k in range(len(x))], - res=res) + raise CompilerError('Deprecated functionality. Use dot()') def mul_trans(self, other): """ Matrix multiplication with transpose of :py:obj:`other`. 
:param self: two-dimensional - :param other: two-dimensional container of matching type and size """ - assert other.sizes[1] == self.sizes[1] - return self.budget_mul(other, self.sizes[0], lambda x, i: x[i], \ - other.sizes[0], lambda x, j: x[j]) + :param other: two-dimensional container of matching type and size + :return: Matrix of matching type and size + + """ + res = Matrix(self.sizes[0], other.sizes[0], self.value_type) + self.mul_trans_to(other, res) + return res - def trans_mul(self, other, reduce=True, res=None): + def trans_mul(self, other): """ Matrix multiplication with transpose of :py:obj:`self` :param self: two-dimensional - :param other: two-dimensional container of matching type and size """ - assert other.sizes[0] == self.sizes[0] - return self.budget_mul(other, self.sizes[1], \ - lambda x, j: [x[k][j] for k in range(len(x))], \ - other.sizes[1], \ - lambda x, j: [x[k][j] for k in range(len(x))], - reduce=reduce, res=res) + :param other: two-dimensional container of matching type and size + :return: Matrix of matching type and size + + """ + res = Matrix(self.sizes[1], other.sizes[1], self.value_type) + self.trans_mul_to(other, res) + return res def parallel_mul(self, other): assert self.sizes[1] == other.sizes[0] @@ -6467,16 +6645,27 @@ def diag(self): return self.array.get(regint.inc(n, 0, n + 1)) def secure_shuffle(self): - """ Securely shuffle rows (first index). """ + """ Securely shuffle rows (first index). This uses the algorithm in + Section 4.3 of `Keller and Scholl + `_ or Section 3.2 of + `Asharov et al. `_ if applicable. + """ self.assign_vector(self.get_vector().secure_shuffle(self.part_size())) def secure_permute(self, permutation, reverse=False): - """ Securely permute rows (first index). """ + """ Securely permute rows (first index). See + :py:func:`secure_shuffle` for references. + + :param permutation: output of :py:func:`sint.get_secure_shuffle()` + :param reverse: whether to apply inverse (default: False) + + """ self.assign_vector(self.get_vector().secure_permute( permutation, self.part_size(), reverse)) def sort(self, key_indices=None, n_bits=None): """ Sort sub-arrays (different first index) in place. + This uses `radix sort `_. :param key_indices: indices to sorting keys, for example ``(1, 2)`` to sort three-dimensional array ``a`` by keys @@ -6496,15 +6685,20 @@ def sort(self, key_indices=None, n_bits=None): keys = self.get_vector_by_indices(*key_indices) sorting.radix_sort(keys, self, n_bits=n_bits) - def randomize(self, *args): - """ Randomize according to data type. """ - if self.total_size() < program.budget: + def randomize(self, *args, n_threads=None): + """ Randomize according to data type. + If it is :py:class:`sfix`, the following will sample an + individual uniformly random entry of the multi-array + :py:obj:`M` roughly in the range :math:`[a,b]`:: + + M.randomize(a, b) + + """ + @library.multithread(n_threads, self.total_size(), + max_size=program.budget) + def _(base, size): self.assign_vector( - self.value_type.get_random(*args, size=self.total_size())) - else: - @library.for_range(self.sizes[0]) - def _(i): - self[i].randomize(*args) + self.value_type.get_random(*args, size=size), base=base) def reveal(self): """ Reveal to :py:obj:`MultiArray` of same shape. 
""" @@ -6588,7 +6782,7 @@ def __init__(self, sizes, value_type, debug=None, address=None, alloc=True): else: self.array = Array(reduce(operator.mul, sizes), \ value_type, address=address, alloc=alloc) - SubMultiArray.__init__(self, sizes, value_type, self.array.address, 0, \ + SubMultiArray.__init__(self, sizes, value_type, self.array._address, 0, debug=debug) if len(sizes) < 2: raise CompilerError('Use Array') @@ -6626,6 +6820,12 @@ def create_from(rows): t = type(rows[0][0]) else: t = type(rows[0]) + if t != sfix: + for row in rows: + if isinstance(row, sfix) or \ + (isinstance(row, Array) and row.value_type == sfix): + raise CompilerError( + 'accidental shortening by creating matrix') res = Matrix(len(rows), len(rows[0]), t) for i in range(len(rows)): res[i].assign(rows[i]) @@ -6661,6 +6861,20 @@ def set_column(self, index, vector): self.sizes[1]) self.value_type.conv(vector).store_in_mem(addresses) + def concat_columns(self, other): + """ Concatenate two matrices by columns. """ + assert self.sizes[0] == other.sizes[0] + assert self.value_type == other.value_type + res = Matrix(self.sizes[0], self.sizes[1] + other.sizes[1], + self.value_type) + @library.for_range(self.sizes[1]) + def _(i): + res.set_column(i, self.get_column(i)) + @library.for_range(other.sizes[1]) + def _(i): + res.set_column(self.sizes[1] + i, other.get_column(i)) + return res + class VectorArray(object): def __init__(self, length, value_type, vector_size, address=None): self.array = Array(length * vector_size, value_type, address) @@ -6800,7 +7014,11 @@ def write(self, value): self.check() if isinstance(value, MemValue): value = value.read() - value = self.value_type.conv(value) + try: + value = self.value_type.conv(value) + except: + raise CompilerError('Cannot store %s as MemValue of %s' % \ + (type(value), self.value_type)) if value.size != self.size: raise CompilerError('size mismatch') self.register = value diff --git a/ExternalIO/README.md b/ExternalIO/README.md index 89328440e..b841b0bbc 100644 --- a/ExternalIO/README.md +++ b/ExternalIO/README.md @@ -1,9 +1,12 @@ -The ExternalIO directory contains an example of managing I/O between external client processes and SPDZ parties running SPDZ engines. These instructions assume that SPDZ has been built as per the [project readme](../README.md). +The ExternalIO directory contains an example of managing I/O between +external client processes and parties running MP-SPDZ engines. These +instructions assume that MP-SPDZ has been built as per the [project +readme](../README.md). ## Working Examples -[bankers-bonus-client.cpp](./bankers-bonus-client.cpp) and -[bankers-bonus-client.py](./bankers-bonus-client.py) act as a +[bankers-bonus-client.cpp](../ExternalIO/bankers-bonus-client.cpp) and +[bankers-bonus-client.py](../ExternalIO/bankers-bonus-client.py) act as a client to [bankers_bonus.mpc](../Programs/Source/bankers_bonus.mpc) and demonstrates sending input and receiving output as described by [Damgård et al.](https://eprint.iacr.org/2015/1006) The computation @@ -56,5 +59,5 @@ Only the `sint` methods used in the example are documented here, equivalent meth The example uses the `Client` class implemented in `ExternalIO/Client.hpp` to handle the communication, see -https://mp-spdz.readthedocs.io/en/latest/io.html#reference for +[this reference](https://mp-spdz.readthedocs.io/en/latest/io.html#reference) for documentation. 
diff --git a/GC/FakeSecret.cpp b/GC/FakeSecret.cpp index 940fc5569..f69130009 100644 --- a/GC/FakeSecret.cpp +++ b/GC/FakeSecret.cpp @@ -9,6 +9,7 @@ #include "GC/Processor.hpp" #include "GC/ShareSecret.hpp" +#include "GC/ThreadMaster.hpp" #include "Processor/Input.hpp" namespace GC @@ -121,4 +122,9 @@ void FakeSecret::finalize_input(Input& inputter, int from, int n_bits) *this = inputter.finalize(from, n_bits); } +void FakeSecret::run_tapes(const vector& args) +{ + Thread::s().master.machine.run_tapes(args); +} + } /* namespace GC */ diff --git a/GC/FakeSecret.h b/GC/FakeSecret.h index cd43ae1d5..668b5a967 100644 --- a/GC/FakeSecret.h +++ b/GC/FakeSecret.h @@ -119,6 +119,8 @@ class FakeSecret : public ShareInterface, public BitVec static void andm(GC::Processor&, const BaseInstruction&) { throw runtime_error("andm not implemented"); } + static void run_tapes(const vector& args); + static FakeSecret input(GC::Processor& processor, const InputArgs& args); static FakeSecret input(int from, word input, int n_bits); diff --git a/GC/Program.hpp b/GC/Program.hpp index f1547f592..768a09c58 100644 --- a/GC/Program.hpp +++ b/GC/Program.hpp @@ -126,7 +126,7 @@ BreakType Program::execute(Processor& Proc, U& dynamic_memory, cout << "complexity at " << time << ": " << Proc.complexity << endl; #endif } - while (Proc.complexity < (1 << 19)); + while (Proc.complexity < (size_t) OnlineOptions::singleton.batch_size); Proc.time = time; #ifdef DEBUG_ROUNDS cout << "breaking at time " << Proc.time << endl; diff --git a/GC/RuntimeBranching.h b/GC/RuntimeBranching.h index 6ba0faf06..a7cc8cdee 100644 --- a/GC/RuntimeBranching.h +++ b/GC/RuntimeBranching.h @@ -20,6 +20,9 @@ class RuntimeBranching void untaint() { +#ifdef DEBUG_YAO + cout << "untaint from " << tainted << endl; +#endif bool was_tainted = tainted; tainted = false; if (was_tainted) diff --git a/GC/Secret.h b/GC/Secret.h index 9fee3f2ff..b4f9ac8e9 100644 --- a/GC/Secret.h +++ b/GC/Secret.h @@ -133,6 +133,8 @@ class Secret static void andm(Processor& processor, const BaseInstruction& instruction) { T::andm(processor, instruction); } + static void run_tapes(const vector& args) { T::run_tapes(args); } + Secret(); Secret(const Integer& x) { *this = x; } diff --git a/GC/SemiSecret.h b/GC/SemiSecret.h index dc9e0a341..4110b9a49 100644 --- a/GC/SemiSecret.h +++ b/GC/SemiSecret.h @@ -34,6 +34,8 @@ class SemiSecretBase : public V, public ShareSecret typedef T part_type; typedef T small_type; + static const bool is_real = true; + static const int default_length = sizeof(BitVec) * 8; static string type_string() { return "binary secret"; } diff --git a/GC/ShareSecret.h b/GC/ShareSecret.h index d8c0c18c5..bdf6c9032 100644 --- a/GC/ShareSecret.h +++ b/GC/ShareSecret.h @@ -84,6 +84,9 @@ class ShareSecret static BitVec get_mask(int n) { return n >= 64 ? 
-1 : ((1L << n) - 1); } + static void run_tapes(const vector& args) + { Thread::s().master.machine.run_tapes(args); } + void check_length(int n, const Integer& x); void invert(int n, const U& x); @@ -160,7 +163,7 @@ class RepSecretBase : public FixedVec, public ShareSecret void bitdec(Memory& S, const vector& regs) const; void xor_(int n, const This& x, const This& y) - { *this = x ^ y; (void)n; } + { *this = (x ^ y).mask(n); } This operator&(const Clear& other) { return super::operator&(BitVec(other)); } diff --git a/GC/ThreadMaster.hpp b/GC/ThreadMaster.hpp index 03eea7813..ff0763833 100644 --- a/GC/ThreadMaster.hpp +++ b/GC/ThreadMaster.hpp @@ -97,7 +97,8 @@ void ThreadMaster::run() delete thread; } - exe_stats.print(); + if (not exe_stats.empty()) + exe_stats.print(); stats.print(); machine.print_timers(); diff --git a/GC/instructions.h b/GC/instructions.h index 62a71603f..272011947 100644 --- a/GC/instructions.h +++ b/GC/instructions.h @@ -138,7 +138,7 @@ X(PRINTINT, PROC.out << I0) \ X(STARTGRIND, CALLGRIND_START_INSTRUMENTATION) \ X(STOPGRIND, CALLGRIND_STOP_INSTRUMENTATION) \ - X(RUN_TAPE, MACH->run_tapes(EXTRA)) \ + X(RUN_TAPE, T::run_tapes(EXTRA)) \ X(JOIN_TAPE, MACH->join_tape(R0)) \ X(USE, ) \ X(USE_INP, ) \ diff --git a/License.txt b/License.txt index ab7ae3bb9..9c8f81b1c 100644 --- a/License.txt +++ b/License.txt @@ -1,4 +1,4 @@ -The Software is copyright (c) 2022, Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230. +The Software is copyright (c) 2023, Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230. CSIRO grants you a licence to the Software on the terms of the BSD 3-Clause Licence. diff --git a/Machines/TripleMachine.cpp b/Machines/TripleMachine.cpp index a6b58db53..45c62e5fa 100644 --- a/Machines/TripleMachine.cpp +++ b/Machines/TripleMachine.cpp @@ -212,6 +212,10 @@ void TripleMachine::run() generators[i] = new_generator>(setup, i, mac_keyz); else if (z2k == 66 and z2s == 48) generators[i] = new_generator>(setup, i, mac_keyz); +#ifdef RING_SIZE + else if (z2k == RING_SIZE and z2s == SPDZ2K_DEFAULT_SECURITY) + generators[i] = new_generator>(setup, i, mac_keyz); +#endif else throw runtime_error("not compiled for k=" + to_string(z2k) + " and s=" + to_string(z2s)); } diff --git a/Makefile b/Makefile index 467e6d8f1..c2cf93113 100644 --- a/Makefile +++ b/Makefile @@ -52,11 +52,11 @@ endif endif # used for dependency generation -OBJS = $(BMR) $(FHEOBJS) $(TINYOTOFFLINE) $(YAO) $(COMPLETE) $(patsubst %.cpp,%.o,$(wildcard Machines/*.cpp Utils/*.cpp)) +OBJS = $(patsubst %.cpp,%.o,$(wildcard */*.cpp)) $(STATIC_OTE) DEPS := $(wildcard */*.d */*/*.d) # never delete -.SECONDARY: $(OBJS) $(patsubst %.cpp,%.o,$(wildcard */*.cpp)) +.SECONDARY: $(OBJS) all: arithmetic binary gen_input online offline externalIO bmr ecdsa @@ -75,6 +75,10 @@ arithmetic: semi-he gear -include $(DEPS) include $(wildcard *.d static/*.d) +$(OBJS): CONFIG CONFIG.mine +CONFIG.mine: + touch CONFIG.mine + %.o: %.cpp $(CXX) -o $@ $< $(CFLAGS) -MMD -MP -c @@ -110,17 +114,17 @@ spdz2k: spdz2k-party.x ot-offline.x Check-Offline-Z2k.x galois-degree.x Fake-Off mascot: mascot-party.x spdz2k mama-party.x ifeq ($(OS), Darwin) -tldr: mac-setup +setup: mac-setup else -tldr: mpir linux-machine-setup +setup: boost mpir linux-machine-setup endif -tldr: libote +tldr: setup $(MAKE) mascot-party.x mkdir Player-Data 2> /dev/null; true ifeq ($(ARM), 1) -Tools/intrinsics.h: deps/simde/simde +$(patsubst %.cpp,%.o,$(wildcard */*.cpp)): deps/simde/simde endif 
shamir: shamir-party.x malicious-shamir-party.x atlas-party.x galois-degree.x
 
@@ -317,7 +321,17 @@ boost: deps/libOTe/libOTe
 	cd deps/libOTe; \
 	python3 build.py --setup --boost --install=$(CURDIR)/local
 
-OTE_OPTS = -DENABLE_SOFTSPOKEN_OT=ON -DCMAKE_CXX_COMPILER=$(CXX) -DCMAKE_INSTALL_LIBDIR=lib
+OTE_OPTS += -DENABLE_SOFTSPOKEN_OT=ON -DCMAKE_CXX_COMPILER=$(CXX) -DCMAKE_INSTALL_LIBDIR=lib
+
+ifeq ($(ARM), 1)
+OTE_OPTS += -DENABLE_AVX=OFF -DENABLE_SSE=OFF
+else
+ifeq ($(AVX_OT), 0)
+OTE_OPTS += -DENABLE_AVX=OFF
+else
+OTE_OPTS += -DENABLE_AVX=ON -DENABLE_SSE=ON
+endif
+endif
 
 ifeq ($(USE_SHARED_OTE), 1)
 OTE = $(SHARED_OTE)
@@ -331,17 +345,15 @@ libote:
 local/lib/libcryptoTools.a: $(STATIC_OTE)
 
 local/lib/libcryptoTools.so: $(SHARED_OTE)
+
+ifeq ($(USE_KOS), 0)
 OT/OTExtensionWithMatrix.o: $(OTE)
+endif
 
-ifeq ($(ARM), 1)
 local/lib/liblibOTe.a: deps/libOTe/libOTe
 	cd deps/libOTe; \
-	PATH="$(CURDIR)/local/bin:$(PATH)" python3 build.py --install=$(CURDIR)/local -- -DBUILD_SHARED_LIBS=0 -DENABLE_AVX=OFF -DENABLE_SSE=OFF $(OTE_OPTS)
-else
-local/lib/liblibOTe.a: deps/libOTe/libOTe
-	cd deps/libOTe; \
-	PATH="$(CURDIR)/local/bin:$(PATH)" python3 build.py --install=$(CURDIR)/local -- -DBUILD_SHARED_LIBS=0 $(OTE_OPTS)
-endif
+	PATH="$(CURDIR)/local/bin:$(PATH)" python3 build.py --install=$(CURDIR)/local -- -DBUILD_SHARED_LIBS=0 $(OTE_OPTS) && \
+	touch ../../local/lib/liblibOTe.a
 
 $(SHARED_OTE): deps/libOTe/libOTe
 	cd deps/libOTe; \
@@ -373,4 +385,4 @@ deps/simde/simde:
 	git submodule update --init deps/simde || git clone https://github.com/simd-everywhere/simde deps/simde
 
 clean:
-	-rm -f */*.o *.o */*.d *.d *.x core.* *.a gmon.out */*/*.o static/*.x *.so
+	-rm -f */*.o *.o */*.d *.d *.x core.* *.a gmon.out */*/*.o static/*.x *.so local/lib/liblibOTe.*
diff --git a/Math/bigint.h b/Math/bigint.h
index f99e3dfd7..2a929399d 100644
--- a/Math/bigint.h
+++ b/Math/bigint.h
@@ -37,6 +37,13 @@ namespace GC
 class Clear;
 }
 
+/**
+ * Type for arbitrarily large integers.
+ * This is a sub-class of ``mpz_class`` from MPIR. As such, it implements
+ * all integer operations and input/output via C++ streams. In addition,
+ * the ``get_ui()`` member function allows retrieving the least significant
+ * 64 bits.
+ */
 class bigint : public mpz_class
 {
 public:
@@ -51,15 +58,20 @@ class bigint : public mpz_class
   template
   static void output_float(U& o, const mpf_class& x, T nan);
 
+  /// Initialize to zero.
   bigint() : mpz_class() {}
   template
   bigint(const T& x) : mpz_class(x) {}
+  /// Convert to canonical representation as non-negative number.
   template
   bigint(const gfp_& x);
+  /// Convert to canonical representation as non-negative number.
   template
   bigint(const gfpvar_& x);
+  /// Convert to canonical representation as non-negative number.
   template
   bigint(const Z2& x);
+  /// Convert to canonical representation as non-negative number.
template bigint(const SignedZ2& x); template diff --git a/Math/gf2n.cpp b/Math/gf2n.cpp index d39a8593e..ba638d974 100644 --- a/Math/gf2n.cpp +++ b/Math/gf2n.cpp @@ -454,20 +454,16 @@ void gf2n_::randomize(PRNG& G, int n) a&=mask; } -template<> -void gf2n_::output(ostream& s,bool human) const -{ - if (human) - s << hex << showbase << word(a) << dec; - else - s.write((char*) &a, sizeof(octet)); -} - template void gf2n_::output(ostream& s,bool human) const { if (human) - { s << hex << showbase << a << dec; } + { + if (n > 64) + s << hex << a << dec; + else + s << hex << to_word(a) << dec; + } else { s.write((char*) &a, (sizeof(U))); } } @@ -484,7 +480,16 @@ void gf2n_::input(istream& s,bool human) } if (human) - { s >> hex >> a >> dec; } + { + if (n > 64) + s >> hex >> a >> dec; + else + { + word tmp; + s >> hex >> tmp >> dec; + *this = U(tmp); + } + } else { s.read((char*) &a, sizeof(U)); } diff --git a/Math/gf2n.h b/Math/gf2n.h index 235c08f5b..c44f9c0e8 100644 --- a/Math/gf2n.h +++ b/Math/gf2n.h @@ -191,9 +191,7 @@ class gf2n_ : public ValueInterface } friend istream& operator>>(istream& s,gf2n_& x) { - word tmp; - s >> hex >> tmp >> dec; - x = tmp; + x.input(s, true); return s; } diff --git a/Math/gf2nlong.cpp b/Math/gf2nlong.cpp index c2555681b..7d5f24794 100644 --- a/Math/gf2nlong.cpp +++ b/Math/gf2nlong.cpp @@ -27,26 +27,24 @@ ostream& operator<<(ostream& s, const int128& a) { word* tmp = (word*)&a.a; s << hex; - - if (tmp[1] != 0) - { - s << noshowbase; - s.width(16); - s.fill('0'); - s << tmp[1]; - s.width(16); - } - else - s << showbase; - + s << noshowbase; + s.width(16); + s.fill('0'); + s << tmp[1]; + s.width(16); s << tmp[0] << dec; return s; } istream& operator>>(istream& s, int128& a) { - gf2n_long tmp; - s >> tmp; - a = tmp.get(); + bigint tmp; + s >> hex >> tmp; + a = 0; + auto size = tmp.get_mpz_t()->_mp_size; + assert(size >= 0); + assert(size <= 2); + mpn_copyi((mp_limb_t*) &a.a, tmp.get_mpz_t()->_mp_d, size); + s >> dec; return s; } diff --git a/Math/gf2nlong.h b/Math/gf2nlong.h index 85a668a74..a15dbfc62 100644 --- a/Math/gf2nlong.h +++ b/Math/gf2nlong.h @@ -154,21 +154,6 @@ class gf2n_long : public gf2n_ gf2n_long(int g) : gf2n_long(int128(unsigned(g))) {} template gf2n_long(IntBase g) : super(g.get()) {} - - friend ostream& operator<<(ostream& s,const gf2n_long& x) - { s << hex << x.get() << dec; - return s; - } - friend istream& operator>>(istream& s,gf2n_long& x) - { bigint tmp; - s >> hex >> tmp >> dec; - x = 0; - auto size = tmp.get_mpz_t()->_mp_size; - assert(size >= 0); - assert(size <= 2); - mpn_copyi((mp_limb_t*)x.get_ptr(), tmp.get_mpz_t()->_mp_d, size); - return s; - } }; #if defined(__aarch64__) && defined(__clang__) diff --git a/Math/gfp.h b/Math/gfp.h index 9a50dc035..de00934a0 100644 --- a/Math/gfp.h +++ b/Math/gfp.h @@ -49,6 +49,7 @@ template void generate_prime_setup(string, int, int); * ``L`` is the number of 64-bit limbs, that is, * the prime has to have bit length in `[64*L-63, 64*L]`. * See ``gfpvar_`` for a more flexible alternative. + * Convert to ``bigint`` to access the canonical integer representation. 
*/
 template
 class gfp_ : public ValueInterface
diff --git a/Networking/Exchanger.h b/Networking/Exchanger.h
index 29d88797d..33c1daa0d 100644
--- a/Networking/Exchanger.h
+++ b/Networking/Exchanger.h
@@ -67,6 +67,9 @@ class Exchanger
 #endif
             n_send++;
             size_t to_send = len - sent;
+#ifdef __APPLE__
+            to_send = min(to_send, 1ul << 16);
+#endif
             size_t newly_sent = send_non_blocking(send_socket, data + sent,
                     to_send);
 #ifdef TIME_ROUNDS
diff --git a/Networking/ServerSocket.cpp b/Networking/ServerSocket.cpp
index d69fd7b8d..bf56d2565 100644
--- a/Networking/ServerSocket.cpp
+++ b/Networking/ServerSocket.cpp
@@ -139,9 +139,25 @@ void ServerSocket::accept_clients()
 #ifdef DEBUG_NETWORKING
         fprintf(stderr, "Accepting...\n");
 #endif
-        int consocket = accept(main_socket, (struct sockaddr *)&dest, (socklen_t*) &socksize);
+        int consocket;
+        for (int i = 0; i < 25; i++)
+        {
+            consocket = accept(main_socket, (struct sockaddr*) &dest,
+                    (socklen_t*) &socksize);
+            if (consocket < 0)
+                usleep(1 << i);
+            else
+                break;
+        }
         if (consocket<0) { error("set_up_socket:accept"); }
 
+#ifdef __APPLE__
+        int flags = fcntl(consocket, F_GETFL, 0);
+        int fl = fcntl(consocket, F_SETFL, O_NONBLOCK | flags);
+        if (fl < 0)
+            error("set non-blocking on server");
+#endif
+
         octetStream client_id;
         char buf[1];
         if (recv(consocket, buf, 1, MSG_PEEK | MSG_DONTWAIT) > 0)
@@ -160,13 +176,6 @@ void ServerSocket::accept_clients()
             auto job = (new ServerJob(*this, consocket, dest));
             pthread_create(&job->thread, 0, ServerJob::run, job);
         }
-
-#ifdef __APPLE__
-        int flags = fcntl(consocket, F_GETFL, 0);
-        int fl = fcntl(consocket, F_SETFL, O_NONBLOCK | flags);
-        if (fl < 0)
-            error("set non-blocking");
-#endif
     }
 }
diff --git a/Networking/sockets.cpp b/Networking/sockets.cpp
index fd064cd2e..8034809e6 100644
--- a/Networking/sockets.cpp
+++ b/Networking/sockets.cpp
@@ -125,7 +125,7 @@ void set_up_client_socket(int& mysocket,const char* hostname,int Portnum)
   int flags = fcntl(mysocket, F_GETFL, 0);
   fl = fcntl(mysocket, F_SETFL, O_NONBLOCK | flags);
   if (fl < 0)
-    error("set non-blocking");
+    error("set non-blocking on client");
 #endif
 }
diff --git a/Networking/sockets.h b/Networking/sockets.h
index 37485f48a..7a4c2bd0a 100644
--- a/Networking/sockets.h
+++ b/Networking/sockets.h
@@ -52,9 +52,18 @@ inline size_t send_non_blocking(int socket, octet* msg, size_t len)
 inline void send(int socket,octet *msg,size_t len)
 {
   size_t i = 0;
+  long wait = 1;
   while (i < len)
     {
-      i += send_non_blocking(socket, msg + i, len - i);
+      size_t j = send_non_blocking(socket, msg + i, len - i);
+      i += j;
+      if (j > 0)
+        wait = 1;
+      else
+        {
+          usleep(wait);
+          wait *= 2;
+        }
     }
 }
 
@@ -107,7 +116,7 @@ inline void receive(T& socket, size_t& a, size_t len)
   a = decode_length(blen, len);
 }
 
-inline size_t check_non_blocking_result(int res)
+inline ssize_t check_non_blocking_result(ssize_t res)
 {
   if (res < 0)
     {
@@ -118,15 +127,15 @@ inline size_t check_non_blocking_result(int res)
   return res;
 }
 
-inline size_t receive_non_blocking(int socket,octet *msg,int len)
+inline ssize_t receive_non_blocking(int socket, octet *msg, size_t len)
 {
-  int res = recv(socket, msg, len, MSG_DONTWAIT);
+  ssize_t res = recv(socket, msg, len, MSG_DONTWAIT);
   return check_non_blocking_result(res);
 }
 
-inline size_t receive_all_or_nothing(int socket,octet *msg,int len)
+inline ssize_t receive_all_or_nothing(int socket, octet *msg, ssize_t len)
 {
-  int res = recv(socket, msg, len, MSG_DONTWAIT | MSG_PEEK);
+  ssize_t res = recv(socket, msg, len, MSG_DONTWAIT | MSG_PEEK);
   check_non_blocking_result(res);
   if
(res == len) { diff --git a/Networking/ssl_sockets.h b/Networking/ssl_sockets.h index 816139953..a9ce63130 100644 --- a/Networking/ssl_sockets.h +++ b/Networking/ssl_sockets.h @@ -109,7 +109,7 @@ inline void receive(ssl_socket* socket, octet* data, size_t length) received += socket->read_some(boost::asio::buffer(data + received, length - received)); } -inline size_t receive_non_blocking(ssl_socket* socket, octet* data, int length) +inline size_t receive_non_blocking(ssl_socket* socket, octet* data, size_t length) { return socket->read_some(boost::asio::buffer(data, length)); } diff --git a/OT/BitMatrix.h b/OT/BitMatrix.h index a797b9798..b996d83b6 100644 --- a/OT/BitMatrix.h +++ b/OT/BitMatrix.h @@ -99,9 +99,6 @@ class aligned_allocator : public std::allocator<_Tp> _Tp* allocate(size_t __n, const void* = 0) { - if (__n > this->max_size()) - std::__throw_bad_alloc(); - _Tp* res = 0; int err = posix_memalign((void**)&res, ALIGN, __n * sizeof(_Tp)); if (err != 0 or res == 0) diff --git a/OT/OTVole.hpp b/OT/OTVole.hpp index 1dbcdbe00..13f58f7fd 100644 --- a/OT/OTVole.hpp +++ b/OT/OTVole.hpp @@ -205,7 +205,7 @@ void OTVoleBase::consistency_check(vector& os) { #endif int total_bytes = t0[0].size() * T::size(); int num_blocks = (total_bytes) / 16 + ((total_bytes % 16) != 0); - __m128i coefficients[num_blocks]; + __m128i* coefficients = new __m128i[num_blocks]; this->set_coeffs(coefficients, coef_prng_sender, num_blocks); for (int alpha = 0; alpha < S; ++alpha) @@ -225,6 +225,7 @@ void OTVoleBase::consistency_check(vector& os) { this->hash_row(os[0], t11, coefficients); } } + delete[] coefficients; #ifdef OTVOLE_TIMER gettimeofday(&totalendv, NULL); double elapsed = timeval_diff(&totalstartv, &totalendv); @@ -240,7 +241,7 @@ void OTVoleBase::consistency_check(vector& os) { #endif int total_bytes = t[0].size() * T::size(); int num_blocks = (total_bytes) / 16 + ((total_bytes % 16) != 0); - __m128i coefficients[num_blocks]; + __m128i* coefficients = new __m128i[num_blocks]; this->set_coeffs(coefficients, coef_prng_receiver, num_blocks); octet h00[VOLE_HASH_SIZE] = {0}; @@ -288,6 +289,7 @@ void OTVoleBase::consistency_check(vector& os) { } } } + delete[] coefficients; #ifdef OTVOLE_TIMER gettimeofday(&totalendv, NULL); double elapsed = timeval_diff(&totalstartv, &totalendv); diff --git a/Processor/Data_Files.hpp b/Processor/Data_Files.hpp index 46c84903c..6c189cac0 100644 --- a/Processor/Data_Files.hpp +++ b/Processor/Data_Files.hpp @@ -325,7 +325,10 @@ void Sub_Data_Files::buffer_edabits_with_queues(bool strict, int n_bits, } auto& buffer = *edabit_buffers[n_bits]; if (buffer.peek() == EOF) - buffer.seekg(file_signature().get_length()); + { + buffer.seekg(0); + check_file_signature(buffer, ""); + } edabitvec eb; eb.input(n_bits, buffer); this->edabits[{strict, n_bits}].push_back(eb); diff --git a/Processor/ExternalClients.cpp b/Processor/ExternalClients.cpp index 48bb8bd17..2c8036cda 100644 --- a/Processor/ExternalClients.cpp +++ b/Processor/ExternalClients.cpp @@ -29,6 +29,7 @@ ExternalClients::~ExternalClients() void ExternalClients::start_listening(int portnum_base) { + ScopeLock _(lock); client_connection_servers[portnum_base] = new AnonymousServerSocket(portnum_base + get_party_num()); client_connection_servers[portnum_base]->init(); cerr << "Start listening on thread " << this_thread::get_id() << endl; @@ -38,6 +39,7 @@ void ExternalClients::start_listening(int portnum_base) int ExternalClients::get_client_connection(int portnum_base) { + ScopeLock _(lock); map::iterator it = 
client_connection_servers.find(portnum_base); if (it == client_connection_servers.end()) { @@ -61,6 +63,7 @@ int ExternalClients::get_client_connection(int portnum_base) void ExternalClients::close_connection(int client_id) { + ScopeLock _(lock); auto it = external_client_sockets.find(client_id); if (it == external_client_sockets.end()) throw runtime_error("client id not active: " + to_string(client_id)); @@ -77,6 +80,7 @@ int ExternalClients::get_party_num() client_socket* ExternalClients::get_socket(int id) { + ScopeLock _(lock); if (external_client_sockets.find(id) == external_client_sockets.end()) throw runtime_error("external connection not found for id " + to_string(id)); return external_client_sockets[id]; diff --git a/Processor/ExternalClients.h b/Processor/ExternalClients.h index 5ea1b3fdc..bada59b40 100644 --- a/Processor/ExternalClients.h +++ b/Processor/ExternalClients.h @@ -4,6 +4,7 @@ #include "Networking/sockets.h" #include "Networking/ssl_sockets.h" #include "Tools/Exceptions.h" +#include "Tools/Lock.h" #include "ExternalIO/Client.h" #include #include @@ -32,6 +33,8 @@ class ExternalClients ssl_service io_service; client_ctx* ctx; + Lock lock; + public: ExternalClients(int party_num); diff --git a/Processor/Instruction.h b/Processor/Instruction.h index 011dcb581..a70e095cb 100644 --- a/Processor/Instruction.h +++ b/Processor/Instruction.h @@ -209,6 +209,7 @@ enum CONDPRINTPLAIN = 0xE1, INTOUTPUT = 0xE6, FLOATOUTPUT = 0xE7, + FIXINPUT = 0xE8, // GF(2^n) versions diff --git a/Processor/Instruction.hpp b/Processor/Instruction.hpp index da4dd01ea..969ae06be 100644 --- a/Processor/Instruction.hpp +++ b/Processor/Instruction.hpp @@ -200,9 +200,6 @@ void BaseInstruction::parse_operands(istream& s, int pos, int file_pos) case GSHLCI: case GSHRCI: case GSHRSI: - case USE: - case USE_INP: - case USE_EDABIT: case DIGESTC: case INPUTMASK: case GINPUTMASK: @@ -211,6 +208,12 @@ void BaseInstruction::parse_operands(istream& s, int pos, int file_pos) get_ints(r, s, 2); n = get_int(s); break; + case USE: + case USE_INP: + case USE_EDABIT: + get_ints(r, s, 2); + n = get_long(s); + break; case STARTPRIVATEOUTPUT: case GSTARTPRIVATEOUTPUT: case STOPPRIVATEOUTPUT: @@ -218,7 +221,7 @@ void BaseInstruction::parse_operands(istream& s, int pos, int file_pos) throw runtime_error("two-stage private output not supported any more"); case USE_MATMUL: get_ints(r, s, 3); - n = get_int(s); + n = get_long(s); break; // instructions with 1 register + 1 integer operand case LDI: @@ -407,7 +410,7 @@ void BaseInstruction::parse_operands(istream& s, int pos, int file_pos) case USE_PREP: case GUSE_PREP: s.read((char*)r, sizeof(r)); - n = get_int(s); + n = get_long(s); break; case REQBL: n = get_int(s); @@ -425,6 +428,7 @@ void BaseInstruction::parse_operands(istream& s, int pos, int file_pos) case XORM: case ANDM: case XORCB: + case FIXINPUT: n = get_int(s); get_ints(r, s, 3); break; @@ -507,7 +511,7 @@ bool Instruction::get_offline_data_usage(DataPositions& usage) if (r[1] >= N_DTYPE) throw invalid_program(); usage.files[r[0]][r[1]] = n; - return int(n) >= 0; + return long(n) >= 0; case USE_INP: if (r[0] >= N_DATA_FIELD_TYPE) throw invalid_program(); @@ -517,19 +521,19 @@ bool Instruction::get_offline_data_usage(DataPositions& usage) throw Processor_Error("Player number too high"); usage.inputs[r[1]][r[0]] = n; } - return int(n) >= 0; + return long(n) >= 0; case USE_EDABIT: usage.edabits[{r[0], r[1]}] = n; - return int(n) >= 0; + return long(n) >= 0; case USE_MATMUL: usage.matmuls[{{r[0], r[1], r[2]}}] = n; - 
return int(n) >= 0; + return long(n) >= 0; case USE_PREP: usage.extended[DATA_INT][r] = n; - return int(n) >= 0; + return long(n) >= 0; case GUSE_PREP: usage.extended[gf2n::field_type()][r] = n; - return int(n) >= 0; + return long(n) >= 0; default: return true; } @@ -623,6 +627,7 @@ int BaseInstruction::get_reg_type() const case FLOATOUTPUT: case READSOCKETC: case PRIVATEOUTPUT: + case FIXINPUT: return CINT; default: if (is_gf2n_instruction()) @@ -812,7 +817,12 @@ unsigned BaseInstruction::get_max_reg(int reg_type) const for (size_t i = offset; i < start.size(); i += skip) { if (size_offset != 0) - size = DIV_CEIL(start[i + size_offset], 64); + { + if (opcode & 0x200) + size = DIV_CEIL(start[i + size_offset], 64); + else + size = start[i + size_offset]; + } m = max(m, (unsigned)start[i] + size); } return m; @@ -1206,6 +1216,7 @@ inline void Instruction::execute(Processor& Proc) const break; case ACCEPTCLIENTCONNECTION: { + TimeScope _(Proc.client_timer); // get client connection at port number n + my_num()) int client_handle = Proc.external_clients.get_client_connection( Proc.read_Ci(r[1])); @@ -1261,11 +1272,11 @@ inline void Instruction::execute(Processor& Proc) const Proc.public_input, Proc.public_input_filename, 0).items[0]; break; case RAWOUTPUT: - Proc.read_Cp(r[0]).output(Proc.public_output, false); + Proc.read_Cp(r[0]).output(Proc.get_public_output(), false); break; case INTOUTPUT: if (n == -1 or n == Proc.P.my_num()) - Integer(Proc.read_Ci(r[0])).output(Proc.binary_output, false); + Integer(Proc.read_Ci(r[0])).output(Proc.get_binary_output(), false); break; case FLOATOUTPUT: if (n == -1 or n == Proc.P.my_num()) @@ -1273,9 +1284,13 @@ inline void Instruction::execute(Processor& Proc) const double tmp = bigint::get_float(Proc.read_Cp(start[0] + i), Proc.read_Cp(start[1] + i), Proc.read_Cp(start[2] + i), Proc.read_Cp(start[3] + i)).get_d(); - Proc.binary_output.write((char*) &tmp, sizeof(double)); + Proc.get_binary_output().write((char*) &tmp, sizeof(double)); + Proc.get_binary_output().flush(); } break; + case FIXINPUT: + Proc.fixinput(*this); + return; case PREP: Procp.DataF.get(Proc.Procp.get_S(), r, start, size); return; diff --git a/Processor/Machine.h b/Processor/Machine.h index d3c1346b2..7317d3199 100644 --- a/Processor/Machine.h +++ b/Processor/Machine.h @@ -13,6 +13,7 @@ #include "Processor/Online-Thread.h" #include "Processor/ThreadJob.h" +#include "Processor/ExternalClients.h" #include "GC/Machine.h" @@ -73,6 +74,8 @@ class Machine : public BaseMachine ExecutionStats stats; + ExternalClients external_clients; + static void init_binary_domains(int security_parameter, int lg2); Machine(Names& playerNames, bool use_encryption = true, @@ -111,6 +114,8 @@ class Machine : public BaseMachine typename sint::mac_key_type get_sint_mac_key() { return alphapi; } Player& get_player() { return *P; } + + void check_program(); }; #endif /* MACHINE_H_ */ diff --git a/Processor/Machine.hpp b/Processor/Machine.hpp index 4ff526084..e9e3eb209 100644 --- a/Processor/Machine.hpp +++ b/Processor/Machine.hpp @@ -57,10 +57,18 @@ Machine::Machine(Names& playerNames, bool use_encryption, : my_number(playerNames.my_num()), N(playerNames), direct(opts.direct), opening_sum(opts.opening_sum), receive_threads(opts.receive_threads), max_broadcast(opts.max_broadcast), - use_encryption(use_encryption), live_prep(opts.live_prep), opts(opts) + use_encryption(use_encryption), live_prep(opts.live_prep), opts(opts), + external_clients(my_number) { OnlineOptions::singleton = opts; + if (N.num_players() == 1 and 
sint::is_real) + { + cerr << "Need more than one player to run a protocol." << endl; + cerr << "Use 'emulate.x' for just running the virtual machine" << endl; + exit(1); + } + if (opening_sum < 2) this->opening_sum = N.num_players(); if (max_broadcast < 2) @@ -129,6 +137,7 @@ void Machine::prepare(const string& progname_str) int old_n_threads = nthreads; progs.clear(); load_schedule(progname_str); + check_program(); // keep preprocessing nthreads = max(old_n_threads, nthreads); @@ -467,17 +476,21 @@ void Machine::run(const string& progname) print_timers(); - size_t rounds = 0; - for (auto& x : comm_stats) - rounds += x.second.rounds; - cerr << "Data sent = " << comm_stats.sent / 1e6 << " MB in ~" << rounds - << " rounds (party " << my_number; - if (threads.size() > 1) - cerr << "; rounds counted double due to multi-threading"; - cerr << ")" << endl; - - auto& P = *this->P; - this->print_global_comm(P, comm_stats); + if (sint::is_real) + { + size_t rounds = 0; + for (auto& x : comm_stats) + rounds += x.second.rounds; + cerr << "Data sent = " << comm_stats.sent / 1e6 << " MB in ~" << rounds + << " rounds (party " << my_number; + if (threads.size() > 1) + cerr << "; rounds counted double due to multi-threading"; + cerr << "; use '-v' for more details"; + cerr << ")" << endl; + + auto& P = *this->P; + this->print_global_comm(P, comm_stats); + } #ifdef VERBOSE_OPTIONS if (opening_sum < N.num_players() && !direct) @@ -582,12 +595,35 @@ void Machine::suggest_optimizations() if (relevant_opts.find("split") != string::npos and sint::has_split) optimizations.append( "\tprogram.use_split(" + to_string(N.num_players()) + ")\n"); - if (relevant_opts.find("edabit") != string::npos and not sint::has_split) + if (relevant_opts.find("edabit") != string::npos and not sint::has_split and sint::is_real) optimizations.append("\tprogram.use_edabit(True)\n"); if (not optimizations.empty()) cerr << "This program might benefit from some protocol options." << endl << "Consider adding the following at the beginning of '" << progname << ".mpc':" << endl << optimizations; +#ifndef __clang__ + cerr << "This virtual machine was compiled with GCC. Recompile with " + "'CXX = clang++' in 'CONFIG.mine' for optimal performance." << endl; +#endif +} + +template +void Machine::check_program() +{ + Hash hasher; + for (auto& prog : progs) + hasher.update(prog.get_hash()); + assert(P); + Bundle bundle(*P); + hasher.final(bundle.mine); + try + { + bundle.compare(*P); + } + catch (mismatch_among_parties&) + { + throw runtime_error("program differs between parties"); + } } #endif diff --git a/Processor/OnlineOptions.cpp b/Processor/OnlineOptions.cpp index 34d7ce129..b4bf6594e 100644 --- a/Processor/OnlineOptions.cpp +++ b/Processor/OnlineOptions.cpp @@ -62,8 +62,9 @@ OnlineOptions::OnlineOptions(ez::ezOptionParser& opt, int argc, 0, // Required? 1, // Number of args expected. 0, // Delimiter if expecting multiple args. - "Prefix for input file path (default: Player-Data/Private-Input). " - "Input will be read from {prefix}-P{id}-{thread_id}.", // Help description. + "Prefix for input file path (default: Player-Data/Input). " + "Text input will be read from {prefix}-P{id}-{thread_id} and " + "binary input from {prefix}-Binary-P{id}-{thread_id}", // Help description. "-IF", // Flag token. "--input-file" // Flag token. ); @@ -95,7 +96,7 @@ OnlineOptions::OnlineOptions(ez::ezOptionParser& opt, int argc, 0, // Required? 0, // Number of args expected. 0, // Delimiter if expecting multiple args. - "Verbose output", // Help description. 
+ "Verbose output, in particular more data on communication", // Help description. "-v", // Flag token. "--verbose" // Flag token. ); diff --git a/Processor/PrepBase.cpp b/Processor/PrepBase.cpp index 4ca77daa1..a2f79027e 100644 --- a/Processor/PrepBase.cpp +++ b/Processor/PrepBase.cpp @@ -48,9 +48,14 @@ void PrepBase::print_left(const char* name, size_t n, const string& type_string, << endl; if (n > used / 10) + { cerr << "Significant amount of unused " << name << " of " << type_string << ". For more accurate benchmarks, " - << "consider reducing the batch size with -b." << endl; + << "consider reducing the batch size with --batch-size." << endl; + cerr + << "Note that some protocols have larger minimum batch sizes." + << endl; + } } void PrepBase::print_left_edabits(size_t n, size_t n_batch, bool strict, @@ -67,6 +72,6 @@ void PrepBase::print_left_edabits(size_t n, size_t n_batch, bool strict, if (n > used / 10) cerr << "Significant amount of unused edaBits of size " << n_bits << ". For more accurate benchmarks, " - << "consider reducing the batch size with -b " - << "or increasing the bucket size with -B." << endl; + << "consider reducing the batch size with --batch-size " + << "or increasing the bucket size with --bucket-size." << endl; } diff --git a/Processor/Processor.h b/Processor/Processor.h index 37227c41b..b35eb47f6 100644 --- a/Processor/Processor.h +++ b/Processor/Processor.h @@ -118,6 +118,9 @@ class ArithmeticProcessor : public ProcessorBase protected: CheckVector Ci; + ofstream public_output; + ofstream binary_output; + public: int thread_num; @@ -126,11 +129,11 @@ class ArithmeticProcessor : public ProcessorBase string private_input_filename; string public_input_filename; + string binary_input_filename; ifstream private_input; ifstream public_input; - ofstream public_output; - ofstream binary_output; + ifstream binary_input; int sent, rounds; @@ -173,6 +176,15 @@ class ArithmeticProcessor : public ProcessorBase throw not_implemented(); } + virtual ofstream& get_public_output() + { + throw not_implemented(); + } + virtual ofstream& get_binary_output() + { + throw not_implemented(); + } + void shuffle(const Instruction& instruction); void bitdecint(const Instruction& instruction); }; @@ -203,9 +215,11 @@ class Processor : public ArithmeticProcessor unsigned int PC; TempVars temp; - ExternalClients external_clients; + ExternalClients& external_clients; Binary_File_IO binary_file_io; + Timer client_timer; + void reset(const Program& program,int arg); // Reset the state of the processor string get_filename(const char* basename, bool use_number); @@ -268,10 +282,15 @@ class Processor : public ArithmeticProcessor cint get_inverse2(unsigned m); + void fixinput(const Instruction& instruction); + // synchronize in asymmetric protocols long sync_Ci(size_t i) const; long sync(long x) const; + ofstream& get_public_output(); + ofstream& get_binary_output(); + private: template friend class SPDZ; diff --git a/Processor/Processor.hpp b/Processor/Processor.hpp index 78aba8c81..c7c6bf359 100644 --- a/Processor/Processor.hpp +++ b/Processor/Processor.hpp @@ -54,6 +54,27 @@ SubProcessor::~SubProcessor() #endif } +template +inline ofstream& Processor::get_public_output() +{ + if (not public_output.is_open()) + public_output.open(get_filename(PREP_DIR "Public-Output-", true).c_str(), + ios_base::out); + + return public_output; +} + +template +inline ofstream& Processor::get_binary_output() +{ + if (not binary_output.is_open()) + binary_output.open( + get_parameterized_filename(P.my_num(), 
thread_num, + PREP_DIR "Binary-Output"), ios_base::out); + + return binary_output; +} + template Processor::Processor(int thread_num,Player& P, typename sgf2n::MAC_Check& MC2,typename sint::MAC_Check& MCp, @@ -64,7 +85,7 @@ Processor::Processor(int thread_num,Player& P, share_thread(DataF.DataFb, P, machine.get_bit_mac_key()), Procb(machine.bit_memories), Proc2(*this,MC2,DataF.DataF2,P),Procp(*this,MCp,DataF.DataFp,P), - external_clients(P.my_num()), + external_clients(machine.external_clients), binary_file_io(Binary_File_IO()) { reset(program,0); @@ -73,13 +94,19 @@ Processor::Processor(int thread_num,Player& P, public_input.open(public_input_filename); private_input_filename = (get_filename(PREP_DIR "Private-Input-",true)); private_input.open(private_input_filename.c_str()); - public_output.open(get_filename(PREP_DIR "Public-Output-",true).c_str(), ios_base::out); - binary_output.open( - get_parameterized_filename(P.my_num(), thread_num, - PREP_DIR "Binary-Output"), ios_base::out); open_input_file(P.my_num(), thread_num, machine.opts.cmd_private_input_file); + string input_prefix = machine.opts.cmd_private_input_file; + if (input_prefix == OnlineOptions().cmd_private_input_file + or input_prefix == ".") + input_prefix = PREP_DIR "Input-Binary"; + else + input_prefix += "-Binary"; + binary_input_filename = get_parameterized_filename(P.my_num(), thread_num, + input_prefix); + binary_input.open(binary_input_filename); + secure_prng.ReSeed(); shared_prng.SeedGlobally(P, false); @@ -96,6 +123,8 @@ Processor::~Processor() if (sent) cerr << "Opened " << sent << " elements in " << rounds << " rounds" << endl; #endif + if (OnlineOptions::singleton.verbose and client_timer.elapsed()) + cerr << "Client communication time = " << client_timer.elapsed() << endl; } template @@ -286,6 +315,7 @@ void Processor::write_socket(const RegType reg_type, #endif try { + TimeScope _(client_timer); socket_stream.Send(external_clients.get_socket(socket_id)); } catch (bad_value& e) { @@ -302,7 +332,9 @@ void Processor::read_socket_ints(int client_id, { int m = registers.size(); socket_stream.reset_write_head(); + client_timer.start(); socket_stream.Receive(external_clients.get_socket(client_id)); + client_timer.stop(); for (int j = 0; j < size; j++) for (int i = 0; i < m; i++) { @@ -319,7 +351,9 @@ void Processor::read_socket_vector(int client_id, { int m = registers.size(); socket_stream.reset_write_head(); + client_timer.start(); socket_stream.Receive(external_clients.get_socket(client_id)); + client_timer.stop(); for (int j = 0; j < size; j++) for (int i = 0; i < m; i++) get_Cp_ref(registers[i] + j) = @@ -333,7 +367,9 @@ void Processor::read_socket_private(int client_id, { int m = registers.size(); socket_stream.reset_write_head(); + client_timer.start(); socket_stream.Receive(external_clients.get_socket(client_id)); + client_timer.stop(); for (int j = 0; j < size; j++) for (int i = 0; i < m; i++) @@ -773,6 +809,56 @@ typename sint::clear Processor::get_inverse2(unsigned m) return inverses2m[m]; } +template +void Processor::fixinput(const Instruction& instruction) +{ + int n = instruction.get_n(); + if (n == P.my_num() or n == -1) + { + typename sint::clear tmp; + bool use_double = false; + switch (instruction.get_r(2)) + { + case 0: + case 1: + break; + case 2: + use_double = true; + break; + default: + throw runtime_error("unknown format for fixed-point input"); + } + + for (int i = 0; i < instruction.get_size(); i++) + { + if (binary_input.peek() == EOF) + throw IO_Error("not enough inputs in " + 
binary_input_filename);
+            double buf;
+            if (instruction.get_r(2) == 0)
+            {
+                int64_t x;
+                binary_input.read((char*) &x, sizeof(x));
+                tmp = x;
+            }
+            else
+            {
+                if (use_double)
+                    binary_input.read((char*) &buf, sizeof(double));
+                else
+                {
+                    float x;
+                    binary_input.read((char*) &x, sizeof(float));
+                    buf = x;
+                }
+                tmp = bigint::tmp = round(buf * exp2(instruction.get_r(1)));
+            }
+            if (binary_input.fail())
+                throw IO_Error("failure reading from " + binary_input_filename);
+            write_Cp(instruction.get_r(0) + i, tmp);
+        }
+    }
+}
+
 template
 long Processor::sync_Ci(size_t i) const
 {
diff --git a/Processor/Program.cpp b/Processor/Program.cpp
index dac73400b..f9cb5c579 100644
--- a/Processor/Program.cpp
+++ b/Processor/Program.cpp
@@ -33,6 +33,18 @@ void Program::parse(string filename)
   if (pinp.fail())
     throw file_error(filename);
   parse(pinp);
+
+  // compute hash
+  pinp.clear();
+  pinp.seekg(0);
+  Hash hasher;
+  while (pinp.peek(), !pinp.eof())
+    {
+      char buf[1024];
+      size_t n = pinp.readsome(buf, 1024);
+      hasher.update(buf, n);
+    }
+  hash = hasher.final().str();
 }
 
 void Program::parse(istream& s)
diff --git a/Processor/Program.h b/Processor/Program.h
index 96a70e5eb..2c8470f8c 100644
--- a/Processor/Program.h
+++ b/Processor/Program.h
@@ -26,6 +26,8 @@ class Program
   // True if program contains variable-sized loop
   bool unknown_usage;
 
+  string hash;
+
   void compute_constants();
 
 public:
@@ -53,6 +55,9 @@ class Program
   size_t direct_mem(RegType reg_type) const
     { return max_mem[reg_type]; }
 
+  const string& get_hash() const
+    { return hash; }
+
   friend ostream& operator<<(ostream& s,const Program& P);
 
   // Execute this program, updating the processor and memory
diff --git a/Processor/instructions.h b/Processor/instructions.h
index f22fde8e6..5912d8676 100644
--- a/Processor/instructions.h
+++ b/Processor/instructions.h
@@ -281,7 +281,7 @@ X(GCONVGF2N, auto dest = &Proc.get_Ci()[r[0]]; auto source = &C2[r[1]], \
         *dest++ = source->get_word(); source++) \
     X(GRAWOUTPUT, auto source = &C2[r[0]], \
-            (*source++).output(Proc.public_output, false)) \
+            (*source++).output(Proc.get_public_output(), false)) \
 
 #define REMAINING_INSTRUCTIONS \
     X(CONVMODP, throw not_implemented(),) \
diff --git a/Programs/Source/bankers_bonus.mpc b/Programs/Source/bankers_bonus.mpc
index e3dfc9f92..674efcdad 100644
--- a/Programs/Source/bankers_bonus.mpc
+++ b/Programs/Source/bankers_bonus.mpc
@@ -20,6 +20,7 @@ from Compiler.util import if_else
 PORTNUM = 14000
 MAX_NUM_CLIENTS = 8
 n_rounds = 0
+n_threads = 2
 
 if len(program.args) > 1:
     n_rounds = int(program.args[1])
@@ -110,7 +111,7 @@ def main():
 
     # Clients' secret input.
client_values = t.Array(MAX_NUM_CLIENTS) - @for_range(number_clients) + @for_range_multithread(n_threads, 1, number_clients) def _(client_id): client_values[client_id] = client_input(t, client_id) diff --git a/Programs/Source/breast_logistic.mpc b/Programs/Source/breast_logistic.mpc new file mode 100644 index 000000000..28ee6be61 --- /dev/null +++ b/Programs/Source/breast_logistic.mpc @@ -0,0 +1,54 @@ +from sklearn.datasets import load_breast_cancer +from sklearn.model_selection import train_test_split + +X, y = load_breast_cancer(return_X_y=True) + +# normalize column-wise +X /= X.max(axis=0) +X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + +if 'horizontal' in program.args: + # split by sample + a = sfix.input_tensor_via(0, X_train[len(X_train) // 2:]) + b = sfix.input_tensor_via(1, X_train[:len(X_train) // 2]) + X_train = a.concat(b) + + a = sint.input_tensor_via(0, y_train[len(y_train) // 2:]) + b = sint.input_tensor_via(1, y_train[:len(y_train) // 2]) + y_train = a.concat(b) +elif 'vertical' in program.args: + print (X_train.shape, X_train.shape[1]) + a = sfix.input_tensor_via(0, X_train[:,:X_train.shape[1] // 2]) + b = sfix.input_tensor_via(1, X_train[:,X_train.shape[1] // 2:]) + X_train = a.concat_columns(b) + y_train = sint.input_tensor_via(0, y_train) +elif 'party0' in program.args: + a = sfix.input_tensor_via(0, X_train[:,:X_train.shape[1] // 2]) + b = sfix.input_tensor_via(1, shape=X_train[:,X_train.shape[1] // 2:].shape) + X_train = a.concat_columns(b) + y_train = sint.input_tensor_via(0, y_train) +elif 'party1' in program.args: + a = sfix.input_tensor_via(0, shape=X_train[:,:X_train.shape[1] // 2].shape) + b = sfix.input_tensor_via(1, X_train[:,X_train.shape[1] // 2:]) + X_train = a.concat_columns(b) + y_train = sint.input_tensor_via(0, shape=y_train.shape) +else: + X_train = sfix.input_tensor_via(0, X_train) + y_train = sint.input_tensor_via(0, y_train) + +if 'party1' in program.args: + X_test = sfix.input_tensor_via(0, shape=X_test.shape) + y_test = sint.input_tensor_via(0, shape=y_test.shape) +else: + X_test = sfix.input_tensor_via(0, X_test) + y_test = sint.input_tensor_via(0, y_test) + +from Compiler import ml + +log = ml.SGDLogistic(20, 2, program) + +log.fit(X_train, y_train) +print_ln('%s', (log.predict(X_test) - y_test.get_vector()).reveal()) + +log.fit_with_testing(X_train, y_train, X_test, y_test) +print_ln('%s', (log.predict_proba(X_test) - y_test.get_vector()).reveal()) diff --git a/Programs/Source/breast_tree.mpc b/Programs/Source/breast_tree.mpc new file mode 100644 index 000000000..547964528 --- /dev/null +++ b/Programs/Source/breast_tree.mpc @@ -0,0 +1,33 @@ +from sklearn.datasets import load_breast_cancer +from sklearn.model_selection import train_test_split + +X, y = load_breast_cancer(return_X_y=True) +X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + +X_train = sfix.input_tensor_via(0, X_train) +X_test = sfix.input_tensor_via(0, X_test) + +y_train = sint.input_tensor_via(0, y_train) +y_test = sint.input_tensor_via(0, y_test) + +# use "nearest" option for deterministic result +# otherwise the Gini coefficients vary slightly from run to run +# resulting in different trees + +sfix.set_precision_from_args(program) + +from Compiler.decision_tree import TreeClassifier + +tree = TreeClassifier(max_depth=5) + +# plain training +tree.fit(X_train, y_train) + +# output difference between truth and prediction +print_ln('%s', (tree.predict(X_test) - y_test.get_vector()).reveal()) + +# output tree +tree.output() + +# 
training with level-wise accuracy output +tree.fit_with_testing(X_train, y_train, X_test, y_test) diff --git a/Programs/Source/diabetes.mpc b/Programs/Source/diabetes.mpc new file mode 100644 index 000000000..4fdccf9c6 --- /dev/null +++ b/Programs/Source/diabetes.mpc @@ -0,0 +1,32 @@ +from sklearn import datasets, linear_model +from sklearn.model_selection import train_test_split + +X, y = datasets.load_diabetes(return_X_y=True) + +# normalize +y /= y.max() + +X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + +X_train = sfix.input_tensor_via(0, X_train) +y_train = sfix.input_tensor_via(0, y_train) + +X_test = sfix.input_tensor_via(0, X_test) +y_test = sfix.input_tensor_via(0, y_test) + +from Compiler import ml + +try: + batch_size = int(program.args[1]) +except: + batch_size = 1 + +linear = ml.SGDLinear(100, batch_size, program) + +linear.fit(X_train, y_train) +print_ln('model %s', linear.opt.layers[0].W[:].reveal()) +print_ln('diff %s', (linear.predict(X_test) - y_test).reveal()) + +linear.fit_with_testing(X_train, y_train, X_test, y_test) +print_ln('model %s', linear.opt.layers[0].W[:].reveal()) +print_ln('diff %s', (linear.predict(X_test) - y_test).reveal()) diff --git a/Programs/Source/easy_adult.mpc b/Programs/Source/easy_adult.mpc new file mode 100644 index 000000000..5f1ccc389 --- /dev/null +++ b/Programs/Source/easy_adult.mpc @@ -0,0 +1,38 @@ +import pandas +from sklearn.model_selection import train_test_split +from Compiler import decision_tree + +data = pandas.read_csv( + 'https://datahub.io/machine-learning/adult/r/adult.csv') +#'/tmp/adult.csv') + +data, attr_types = decision_tree.preprocess_pandas(data) + +# label is last column +X = data[:,:-1] +y = data[:,-1] + +X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + +X_train = sint.input_tensor_via(0, X_train) +X_test = sint.input_tensor_via(0, X_test) + +y_train = sint.input_tensor_via(0, y_train) +y_test = sint.input_tensor_via(0, y_test) + +# needed for correct Gini coefficient +sfix.round_nearest = True +sfix.set_precision(15, 31) + +# input values all fit 32 bits +program.set_bit_length(32) + +tree = decision_tree.TreeClassifier(max_depth=10) + +# training with level-wise accuracy output +tree.fit_with_testing(X_train, y_train, X_test, y_test, attr_types=attr_types) + +# plain training +tree.fit(X_train, y_train, attr_types=attr_types) + +print_ln('%s', (tree.predict(X_test) - y_test.get_vector()).reveal()) diff --git a/Programs/Source/keras_cifar_lenet.mpc b/Programs/Source/keras_cifar_lenet.mpc index 882d2e187..cfd137883 100644 --- a/Programs/Source/keras_cifar_lenet.mpc +++ b/Programs/Source/keras_cifar_lenet.mpc @@ -3,22 +3,39 @@ program.options_from_args() -training_samples = MultiArray([50000, 32, 32, 3], sfix) -training_labels = MultiArray([50000, 10], sint) - -test_samples = MultiArray([10000, 32, 32, 3], sfix) -test_labels = MultiArray([10000, 10], sint) - -training_labels.input_from(0) -training_samples.input_from(0) - -test_labels.input_from(0) -test_samples.input_from(0) - from Compiler import ml tf = ml ml.set_n_threads(36) +try: + ml.set_n_threads(int(program.args[1])) +except: + pass + +if 'torch' in program.args: + import torchvision, numpy + data = [] + for train in True, False: + ds = torchvision.datasets.CIFAR10(root='/tmp', train=train, download=True) + # normalize to [-1,1] before input + samples = sfix.input_tensor_via(0, ds.data / 255 * 2 - 1, binary=True) + labels = sint.input_tensor_via(0, ds.targets, binary=True, one_hot=True) + data += [(labels, 
samples)] + + (training_labels, training_samples), (test_labels, test_samples) = data +else: + training_samples = MultiArray([50000, 32, 32, 3], sfix) + training_labels = MultiArray([50000, 10], sint) + + test_samples = MultiArray([10000, 32, 32, 3], sfix) + test_labels = MultiArray([10000, 10], sint) + + training_labels.input_from(0) + training_samples.input_from(0) + + test_labels.input_from(0) + test_samples.input_from(0) + layers = [ tf.keras.layers.Conv2D(20, 5, 1, 'valid', activation='relu'), tf.keras.layers.MaxPooling2D(2), diff --git a/Programs/Source/keras_mnist_dense.mpc b/Programs/Source/keras_mnist_dense.mpc index 76b1e23f5..4b281882d 100644 --- a/Programs/Source/keras_mnist_dense.mpc +++ b/Programs/Source/keras_mnist_dense.mpc @@ -3,17 +3,29 @@ program.options_from_args() -training_samples = sfix.Tensor([60000, 28, 28]) -training_labels = sint.Tensor([60000, 10]) - -test_samples = sfix.Tensor([10000, 28, 28]) -test_labels = sint.Tensor([10000, 10]) - -training_labels.input_from(0) -training_samples.input_from(0) - -test_labels.input_from(0) -test_samples.input_from(0) +if 'torch' in program.args: + import torchvision + data = [] + for train in True, False: + ds = torchvision.datasets.MNIST(root='/tmp', train=train, download=True) + # normalize to [0,1] before input + samples = sfix.input_tensor_via(0, ds.data / 255., binary=True) + labels = sint.input_tensor_via(0, ds.targets, binary=True, one_hot=True) + data += [(labels, samples)] + + (training_labels, training_samples), (test_labels, test_samples) = data +else: + training_samples = sfix.Tensor([60000, 28, 28]) + training_labels = sint.Tensor([60000, 10]) + + test_samples = sfix.Tensor([10000, 28, 28]) + test_labels = sint.Tensor([10000, 10]) + + training_labels.input_from(0) + training_samples.input_from(0) + + test_labels.input_from(0) + test_samples.input_from(0) from Compiler import ml tf = ml diff --git a/Programs/Source/keras_mnist_lenet.mpc b/Programs/Source/keras_mnist_lenet.mpc index 90adf68e3..78acdd6a6 100644 --- a/Programs/Source/keras_mnist_lenet.mpc +++ b/Programs/Source/keras_mnist_lenet.mpc @@ -3,17 +3,29 @@ program.options_from_args() -training_samples = MultiArray([60000, 28, 28], sfix) -training_labels = MultiArray([60000, 10], sint) - -test_samples = MultiArray([10000, 28, 28], sfix) -test_labels = MultiArray([10000, 10], sint) - -training_labels.input_from(0) -training_samples.input_from(0) - -test_labels.input_from(0) -test_samples.input_from(0) +if 'torch' in program.args: + import torchvision + data = [] + for train in True, False: + ds = torchvision.datasets.MNIST(root='/tmp', train=train, download=True) + # normalize to [0,1] before input + samples = sfix.input_tensor_via(0, ds.data / 255., binary=True) + labels = sint.input_tensor_via(0, ds.targets, binary=True, one_hot=True) + data += [(labels, samples)] + + (training_labels, training_samples), (test_labels, test_samples) = data +else: + training_samples = sfix.Tensor([60000, 28, 28]) + training_labels = sint.Tensor([60000, 10]) + + test_samples = sfix.Tensor([10000, 28, 28]) + test_labels = sint.Tensor([10000, 10]) + + training_labels.input_from(0) + training_samples.input_from(0) + + test_labels.input_from(0) + test_samples.input_from(0) from Compiler import ml tf = ml diff --git a/Programs/Source/keras_mnist_lenet_predict.mpc b/Programs/Source/keras_mnist_lenet_predict.mpc index 8b55de560..100dd564a 100644 --- a/Programs/Source/keras_mnist_lenet_predict.mpc +++ b/Programs/Source/keras_mnist_lenet_predict.mpc @@ -38,7 +38,7 @@ for var in 
model.trainable_variables: var.assign_all(0) # start = var.read_from_file(start) -guesses = model.predict(test_samples, batch_size=1) +guesses = model.predict(test_samples) print_ln('guess %s', guesses.reveal_nested()[:3]) print_ln('truth %s', test_labels.reveal_nested()[:3]) diff --git a/Programs/Source/test_sbitfix.mpc b/Programs/Source/test_sbitfix.mpc index 513cfe010..6940799b2 100644 --- a/Programs/Source/test_sbitfix.mpc +++ b/Programs/Source/test_sbitfix.mpc @@ -5,8 +5,7 @@ sbitfix.set_precision(16, 32) def test(a, b, value_type=None): try: b = int(round((b * (1 << a.f)))) - if b < 0: - b += 2 ** sbitfix.k + b += 2 ** sbitfix.k if b < 0 else 0 a = a.v.reveal() except AttributeError: pass diff --git a/Programs/Source/torch_alex_test.mpc b/Programs/Source/torch_alex_test.mpc new file mode 100644 index 000000000..4bbccaca0 --- /dev/null +++ b/Programs/Source/torch_alex_test.mpc @@ -0,0 +1,92 @@ +# this trains an AlexNet-like network on CIFAR-10 in cleartext +# before testing it in secure computation + +program.options_from_args() + +from Compiler import ml + +try: + ml.set_n_threads(int(program.args[1])) +except: + pass + +import torchvision +import torch +import numpy + +get_data = lambda train, transform=None: torchvision.datasets.CIFAR10( + root='/tmp', train=train, download=True, transform=transform) + +ds = get_data(False) + +# get 100 random samples +indices = numpy.random.randint(len(ds.data), size=(100,)) +# normalize to [-1,1] before input +test_samples = sfix.input_tensor_via( + 0, numpy.take(ds.data / 255 * 2 - 1, indices, 0)) +test_labels = sint.input_tensor_via( + 0, numpy.take(ds.targets, indices, 0), one_hot=True) + +import torch.nn as nn + +net = nn.Sequential( + nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=2), + nn.ReLU(), + nn.MaxPool2d(kernel_size=2), + nn.Conv2d(64, 96, kernel_size=3, padding=2), + nn.ReLU(), + nn.MaxPool2d(kernel_size=2), + nn.Conv2d(96, 96, kernel_size=3, padding=1), + nn.ReLU(), + nn.Conv2d(96, 64, kernel_size=3, padding=1), + nn.ReLU(), + nn.Conv2d(64, 64, kernel_size=3, padding=1), + nn.ReLU(), + nn.MaxPool2d(kernel_size=3, stride=2), + nn.Flatten(), + nn.Linear(1024, 128), + nn.ReLU(), + nn.Linear(128, 256), + nn.ReLU(), + nn.Linear(256, 10), +) + +# train for a bit +transform = torchvision.transforms.Compose( + [torchvision.transforms.ToTensor(), lambda x: 2 * x - 1]) +ds = get_data(train=True, transform=transform) +optimizer = torch.optim.Adam(net.parameters(), amsgrad=True) +criterion = nn.CrossEntropyLoss() + +for i, data in enumerate(torch.utils.data.DataLoader(ds, batch_size=128)): + inputs, labels = data + optimizer.zero_grad() + outputs = net(inputs) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + +with torch.no_grad(): + ds = get_data(False, transform) + total = correct_classified = 0 + for data in torch.utils.data.DataLoader(ds, batch_size=128): + inputs, labels = data + outputs = net(inputs) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct_classified += (predicted == labels).sum().item() + test_acc = (100 * correct_classified / total) + print('Cleartext test accuracy of the network: %.2f %%' % test_acc) + +from Compiler import ml + +layers = ml.layers_from_torch(net, test_samples.shape, 128, input_via=0) + +optimizer = ml.SGD(layers) + +# output to be used in Scripts/torch_cifar_alex_import.py +optimizer.reveal_model_to_binary() + +n_correct, loss = optimizer.reveal_correctness(test_samples, test_labels, 128) +print_ln('Secure accuracy: %s (%s/%s)', cfix(n_correct) / 
len(test_samples), + n_correct, len(test_samples)) diff --git a/Programs/Source/torch_cifar_alex.mpc b/Programs/Source/torch_cifar_alex.mpc new file mode 100644 index 000000000..54f4b0d69 --- /dev/null +++ b/Programs/Source/torch_cifar_alex.mpc @@ -0,0 +1,70 @@ +# this trains an AlexNet-like network on CIFAR-10 + +program.options_from_args() + +from Compiler import ml + +try: + ml.set_n_threads(int(program.args[2])) +except: + pass + +import torchvision, numpy +data = [] +for train in True, False: + ds = torchvision.datasets.CIFAR10(root='/tmp', train=train, download=True) + # normalize to [-1,1] before input + samples = sfix.input_tensor_via(0, ds.data / 255 * 2 - 1, binary=True) + labels = sint.input_tensor_via(0, ds.targets, binary=True, one_hot=True) + data += [(labels, samples)] + +(training_labels, training_samples), (test_labels, test_samples) = data + +import torch +import torch.nn as nn + +net = nn.Sequential( + nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=2), + nn.ReLU(), + nn.BatchNorm2d(64), + nn.MaxPool2d(kernel_size=2), + nn.Conv2d(64, 96, kernel_size=3, padding=2), + nn.ReLU(), + nn.BatchNorm2d(96), + nn.MaxPool2d(kernel_size=2), + nn.Conv2d(96, 96, kernel_size=3, padding=1), + nn.ReLU(), + nn.BatchNorm2d(96), + nn.Conv2d(96, 64, kernel_size=3, padding=1), + nn.ReLU(), + nn.BatchNorm2d(64), + nn.Conv2d(64, 64, kernel_size=3, padding=1), + nn.ReLU(), + nn.BatchNorm2d(64), + nn.MaxPool2d(kernel_size=3, stride=2), + nn.Flatten(), + nn.Linear(1024, 128), + nn.ReLU(), + nn.Linear(128, 256), + nn.ReLU(), + nn.Linear(256, 10), +) + +# test network +ds = torchvision.datasets.CIFAR10( + root='/tmp', transform=torchvision.transforms.ToTensor()) +inputs = next(iter(torch.utils.data.DataLoader(ds)))[0] +print(inputs.shape) +outputs = net(inputs) + +layers = ml.layers_from_torch(net, training_samples.shape, 128) + +optimizer = ml.SGD(layers) +optimizer.fit( + training_samples, + training_labels, + epochs=int(program.args[1]), + batch_size=128, + validation_data=(test_samples, test_labels), + program=program +) diff --git a/Programs/Source/torch_cifar_lenet.mpc b/Programs/Source/torch_cifar_lenet.mpc new file mode 100644 index 000000000..3d9ea7cda --- /dev/null +++ b/Programs/Source/torch_cifar_lenet.mpc @@ -0,0 +1,57 @@ +# this trains LeNet on CIFAR-10 + +program.options_from_args() + +from Compiler import ml + +try: + ml.set_n_threads(int(program.args[2])) +except: + pass + +import torchvision, numpy +data = [] +for train in True, False: + ds = torchvision.datasets.CIFAR10(root='/tmp', train=train, download=True) + # normalize to [-1,1] before input + samples = sfix.input_tensor_via(0, ds.data / 255 * 2 - 1, binary=True) + labels = sint.input_tensor_via(0, ds.targets, binary=True, one_hot=True) + data += [(labels, samples)] + +(training_labels, training_samples), (test_labels, test_samples) = data + +import torch +import torch.nn as nn + +net = nn.Sequential( + nn.Conv2d(3, 20, 5), + nn.ReLU(), + nn.MaxPool2d(2), + nn.Conv2d(20, 50, 5), + nn.ReLU(), + nn.MaxPool2d(2), + nn.Flatten(), + nn.ReLU(), + nn.Linear(1250, 500), + nn.ReLU(), + nn.Linear(500, 10) ) + +# test network +ds = torchvision.datasets.CIFAR10( + root='/tmp', transform=torchvision.transforms.ToTensor()) +inputs = next(iter(torch.utils.data.DataLoader(ds)))[0] +print(inputs.shape) +outputs = net(inputs) + +layers = ml.layers_from_torch(net, training_samples.shape, 128) + +optimizer = ml.SGD(layers) +optimizer.fit( + training_samples, + training_labels, + epochs=int(program.args[1]), + batch_size=128, + validation_data=(test_samples, 
test_labels), + program=program +) diff --git a/Programs/Source/torch_cifar_lenet_pretrain.mpc b/Programs/Source/torch_cifar_lenet_pretrain.mpc new file mode 100644 index 000000000..216a2d68f --- /dev/null +++ b/Programs/Source/torch_cifar_lenet_pretrain.mpc @@ -0,0 +1,81 @@ +# this trains LeNet on CIFAR-10 on a model pretrained in cleartext + +program.options_from_args() + +from Compiler import ml + +try: + ml.set_n_threads(int(program.args[2])) +except: + pass + +get_data = lambda train, transform=None: torchvision.datasets.CIFAR10( + root='/tmp', train=train, download=True, transform=transform) + +import torchvision, numpy +data = [] +for train in True, False: + ds = get_data(train) + # normalize to [-1,1] before input + samples = sfix.input_tensor_via(0, ds.data / 255 * 2 - 1, binary=True) + labels = sint.input_tensor_via(0, ds.targets, binary=True, one_hot=True) + data += [(labels, samples)] + +(training_labels, training_samples), (test_labels, test_samples) = data + +import torch +import torch.nn as nn + +net = nn.Sequential( + nn.Conv2d(3, 20, 5), + nn.ReLU(), + nn.MaxPool2d(2), + nn.Conv2d(20, 50, 5), + nn.ReLU(), + nn.MaxPool2d(2), + nn.Flatten(), + nn.ReLU(), + nn.Linear(1250, 500), + nn.ReLU(), + nn.Linear(500, 10) +) + +# train for a bit +transform = torchvision.transforms.Compose( + [torchvision.transforms.ToTensor(), lambda x: 2 * x - 1]) +ds = get_data(train=True, transform=transform) +optimizer = torch.optim.Adam(net.parameters(), amsgrad=True) +criterion = nn.CrossEntropyLoss() + +for i, data in enumerate(torch.utils.data.DataLoader(ds, batch_size=128)): + inputs, labels = data + optimizer.zero_grad() + outputs = net(inputs) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + +with torch.no_grad(): + ds = get_data(False, transform) + total = correct_classified = 0 + for data in torch.utils.data.DataLoader(ds, batch_size=128): + inputs, labels = data + outputs = net(inputs) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct_classified += (predicted == labels).sum().item() + test_acc = (100 * correct_classified / total) + print('Cleartext test accuracy of the network: %.2f %%' % test_acc) + +layers = ml.layers_from_torch(net, training_samples.shape, 128, input_via=0) + +optimizer = ml.SGD(layers) +optimizer.fit( + training_samples, + training_labels, + epochs=int(program.args[1]), + batch_size=128, + validation_data=(test_samples, test_labels), + program=program, + reset=False +) diff --git a/Programs/Source/torch_mnist_dense.mpc b/Programs/Source/torch_mnist_dense.mpc new file mode 100644 index 000000000..7b49a6a10 --- /dev/null +++ b/Programs/Source/torch_mnist_dense.mpc @@ -0,0 +1,57 @@ +# this trains a dense neural network on MNIST + +program.options_from_args() + +import torchvision + +data = [] +for train in True, False: + ds = torchvision.datasets.MNIST(root='/tmp', train=train, download=True) + # normalize to [0,1] before input + samples = sfix.input_tensor_via(0, ds.data / 255) + labels = sint.input_tensor_via(0, ds.targets, one_hot=True) + data += [(labels, samples)] + +(training_labels, training_samples), (test_labels, test_samples) = data + +import torch +import torch.nn as nn + +net = nn.Sequential( + nn.Flatten(), + nn.Linear(28 * 28, 128), + nn.ReLU(), + nn.Linear(128, 128), + nn.ReLU(), + nn.Linear(128, 10) +) + +# test network +ds = torchvision.datasets.MNIST( + root='/tmp', transform=torchvision.transforms.ToTensor()) +inputs = next(iter(torch.utils.data.DataLoader(ds)))[0] +print(inputs.shape) +outputs = 
net(inputs) + +from Compiler import ml + +ml.set_n_threads(int(program.args[2])) + +layers = ml.layers_from_torch(net, training_samples.shape, 128) + +optimizer = ml.SGD(layers) +optimizer.fit( + training_samples, + training_labels, + epochs=int(program.args[1]), + batch_size=128, + validation_data=(test_samples, test_labels), + program=program +) + +# store secret model for use in torch_mnist_dense_test +for var in optimizer.trainable_variables: + var.write_to_file() + +# output to be used in Scripts/torch_mnist_lenet_import.py +optimizer.reveal_model_to_binary() diff --git a/Programs/Source/torch_mnist_dense_pretrain.mpc b/Programs/Source/torch_mnist_dense_pretrain.mpc new file mode 100644 index 000000000..4745d02d3 --- /dev/null +++ b/Programs/Source/torch_mnist_dense_pretrain.mpc @@ -0,0 +1,72 @@ +# this trains a dense neural network on MNIST + +program.options_from_args() + +import torchvision + +data = [] +for train in True, False: + ds = torchvision.datasets.MNIST(root='/tmp', train=train, download=True) + # normalize to [0,1] before input + samples = sfix.input_tensor_via(0, ds.data / 255., binary=True) + labels = sint.input_tensor_via(0, ds.targets, binary=True, one_hot=True) + data += [(labels, samples)] + +(training_labels, training_samples), (test_labels, test_samples) = data + +import torch +import torch.nn as nn + +net = nn.Sequential( + nn.Flatten(), + nn.Linear(28 * 28, 128), + nn.ReLU(), + nn.Linear(128, 128), + nn.ReLU(), + nn.Linear(128, 10) +) + +# train for a bit +transform = torchvision.transforms.Compose( + [torchvision.transforms.ToTensor()]) +ds = torchvision.datasets.MNIST(root='/tmp', transform=transform, train=True) +optimizer = torch.optim.Adam(net.parameters(), amsgrad=True) +criterion = nn.CrossEntropyLoss() + +for i, data in enumerate(torch.utils.data.DataLoader(ds, batch_size=128)): + inputs, labels = data + optimizer.zero_grad() + outputs = net(inputs) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + +with torch.no_grad(): + ds = torchvision.datasets.MNIST(root='/tmp', transform=transform, + train=False) + total = correct_classified = 0 + for data in torch.utils.data.DataLoader(ds, batch_size=128): + inputs, labels = data + outputs = net(inputs) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct_classified += (predicted == labels).sum().item() + test_acc = (100 * correct_classified / total) + print('Test accuracy of the network: %.2f %%' % test_acc) + +from Compiler import ml + +ml.set_n_threads(int(program.args[2])) + +layers = ml.layers_from_torch(net, training_samples.shape, 128, input_via=0) + +optimizer = ml.SGD(layers) +optimizer.fit( + training_samples, + training_labels, + epochs=int(program.args[1]), + batch_size=128, + validation_data=(test_samples, test_labels), + program=program, + reset=False +) diff --git a/Programs/Source/torch_mnist_dense_test.mpc b/Programs/Source/torch_mnist_dense_test.mpc new file mode 100644 index 000000000..ceb6d72e1 --- /dev/null +++ b/Programs/Source/torch_mnist_dense_test.mpc @@ -0,0 +1,40 @@ +# this tests a previously stored dense neural network on MNIST + +program.options_from_args() + +import torchvision + +data = [] +for train in True, False: + ds = torchvision.datasets.MNIST(root='/tmp', train=train, download=True) + # normalize to [0,1] before input + samples = sfix.input_tensor_via(0, ds.data / 255) + labels = sint.input_tensor_via(0, ds.targets, one_hot=True) + data += [(labels, samples)] + +(training_labels, training_samples), (test_labels, 
test_samples) = data + +import torch +import torch.nn as nn + +net = nn.Sequential( + nn.Flatten(), + nn.Linear(28 * 28, 128), + nn.ReLU(), + nn.Linear(128, 128), + nn.ReLU(), + nn.Linear(128, 10) +) + +from Compiler import ml + +layers = ml.layers_from_torch(net, training_samples.shape, 128) + +optimizer = ml.Optimizer(layers) + +start = 0 +for var in optimizer.trainable_variables: + start = var.read_from_file(start) + +n_correct, loss = optimizer.reveal_correctness(test_samples, test_labels, 128) +print_ln('Accuracy: %s/%s', n_correct, len(test_samples)) diff --git a/Programs/Source/torch_mnist_lenet.mpc b/Programs/Source/torch_mnist_lenet.mpc new file mode 100644 index 000000000..75ccf24d6 --- /dev/null +++ b/Programs/Source/torch_mnist_lenet.mpc @@ -0,0 +1,49 @@ +# this trains LeNet on MNIST + +program.options_from_args() + +import torchvision + +data = [] +for train in True, False: + ds = torchvision.datasets.MNIST(root='/tmp', train=train, download=True) + # normalize to [0,1] before input + samples = sfix.input_tensor_via(0, ds.data / 255., binary=True) + labels = sint.input_tensor_via(0, ds.targets, binary=True, one_hot=True) + data += [(labels, samples)] + +import torch +import torch.nn as nn + +net = nn.Sequential( + nn.Conv2d(1, 20, 5), + nn.ReLU(), + nn.MaxPool2d(2), + nn.Conv2d(20, 50, 5), + nn.ReLU(), + nn.MaxPool2d(2), + nn.Flatten(), + nn.ReLU(), + nn.Linear(800, 500), + nn.ReLU(), + nn.Linear(500, 10) +) + +# test network +ds = torchvision.datasets.MNIST( + root='/tmp', transform=torchvision.transforms.ToTensor()) +inputs = next(iter(torch.utils.data.DataLoader(ds)))[0] +print(inputs.shape) +outputs = net(inputs) + +from Compiler import ml + +ml.set_n_threads(int(program.args[2])) + +layers = ml.layers_from_torch(net, data[0][1].shape, 128) +layers[0].X = data[0][1] +layers[-1].Y = data[0][0] + +optimizer = ml.SGD(layers) +optimizer.run_by_args(program, int(program.args[1]), 128, + data[1][1], data[1][0]) diff --git a/Programs/Source/torch_mnist_lenet_predict.mpc b/Programs/Source/torch_mnist_lenet_predict.mpc new file mode 100644 index 000000000..8e8b54cb1 --- /dev/null +++ b/Programs/Source/torch_mnist_lenet_predict.mpc @@ -0,0 +1,74 @@ +# this trains a LeNet on MNIST in cleartext and tests it securely + +program.options_from_args() + +import torchvision + +data = [] +for train in True, False: + ds = torchvision.datasets.MNIST(root='/tmp', train=train, download=True) + # normalize to [0,1] before input + samples = sfix.input_tensor_via(0, ds.data / 255., binary=True) + labels = sint.input_tensor_via(0, ds.targets, binary=True, one_hot=True) + data += [(labels, samples)] + +(training_labels, training_samples), (test_labels, test_samples) = data + +import torch +import torch.nn as nn + +net = nn.Sequential( + nn.Conv2d(1, 20, 5), + nn.ReLU(), + nn.MaxPool2d(2), + nn.Conv2d(20, 50, 5), + nn.ReLU(), + nn.MaxPool2d(2), + nn.Flatten(), + nn.ReLU(), + nn.Linear(800, 500), + nn.ReLU(), + nn.Linear(500, 10) +) + +# train for a bit +transform = torchvision.transforms.Compose( + [torchvision.transforms.ToTensor()]) +ds = torchvision.datasets.MNIST(root='/tmp', transform=transform, train=True) +optimizer = torch.optim.Adam(net.parameters(), amsgrad=True) +criterion = nn.CrossEntropyLoss() + +for i, data in enumerate(torch.utils.data.DataLoader(ds, batch_size=128)): + inputs, labels = data + optimizer.zero_grad() + outputs = net(inputs) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + +with torch.no_grad(): + ds = 
torchvision.datasets.MNIST(root='/tmp', transform=transform, + train=False) + total = correct_classified = 0 + for data in torch.utils.data.DataLoader(ds, batch_size=128): + inputs, labels = data + outputs = net(inputs) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct_classified += (predicted == labels).sum().item() + test_acc = (100 * correct_classified / total) + print('Cleartext accuracy of the network: %.2f %%' % test_acc) + +from Compiler import ml + +layers = ml.layers_from_torch(net, training_samples.shape, 128, input_via=0) + +ml.set_n_threads(8) + +optimizer = ml.Optimizer(layers) + +# output to be used in Scripts/torch_mnist_lenet_import.py +optimizer.reveal_model_to_binary() + +n_correct, loss = optimizer.reveal_correctness(test_samples, test_labels, 128, running=True) +print_ln('Secure accuracy: %s/%s', n_correct, len(test_samples)) diff --git a/Protocols/FakeShare.h b/Protocols/FakeShare.h index e5bb9e9e5..a73142b75 100644 --- a/Protocols/FakeShare.h +++ b/Protocols/FakeShare.h @@ -34,6 +34,7 @@ class FakeShare : public T, public ShareInterface static const bool has_trunc_pr = true; static const bool dishonest_majority = false; static const bool malicious = false; + static const bool is_real = false; static string type_short() { diff --git a/Protocols/Hemi.hpp b/Protocols/Hemi.hpp index 1549e2cf4..9ba85290f 100644 --- a/Protocols/Hemi.hpp +++ b/Protocols/Hemi.hpp @@ -25,10 +25,10 @@ typename T::MatrixPrep& Hemi<T>::get_matrix_prep(const array<int, 3>& dims, SubProcessor<T>& processor) { if (matrix_preps.find(dims) == matrix_preps.end()) - matrix_preps.insert({dims, + matrix_preps.insert(pair<array<int, 3>, typename T::MatrixPrep*>(dims, new typename T::MatrixPrep(dims[0], dims[1], dims[2], dynamic_cast<typename T::LivePrep&>(processor.DataF), - matrix_usage)}); + matrix_usage))); return *matrix_preps.at(dims); } diff --git a/Protocols/SemiInput.hpp b/Protocols/SemiInput.hpp index 7ab4a855a..5cdfae792 100644 --- a/Protocols/SemiInput.hpp +++ b/Protocols/SemiInput.hpp @@ -68,7 +68,7 @@ template void SemiInput::finalize_other(int player, T& target, octetStream&, int) { - target = this->recv_prngs[player].template get(); + target = this->recv_prngs.at(player).template get(); } template diff --git a/Protocols/ShareInterface.h b/Protocols/ShareInterface.h index 4e6b975cb..c168a464e 100644 --- a/Protocols/ShareInterface.h +++ b/Protocols/ShareInterface.h @@ -46,6 +46,8 @@ class ShareInterface const static bool symmetric = true; + static const bool is_real = true; + static const int default_length = 1; static string type_short() { throw runtime_error("shorthand undefined"); } diff --git a/README.md b/README.md index c14f41ce9..5c96b7e15 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,9 @@ solutions](https://mp-spdz.readthedocs.io/en/latest/troubleshooting.html). ##### Filing Issues Please file complete code examples because it's usually not possible -to reproduce problems from incomplete code. +to reproduce problems from incomplete code, and please include which +protocol you have used (if applicable) because there are considerable +differences between the various protocols. #### Frequently Asked Questions @@ -40,10 +42,9 @@ the top folder: ``` Scripts/tldr.sh -./compile.py tutorial echo 1 2 3 4 > Player-Data/Input-P0-0 echo 1 2 3 4 > Player-Data/Input-P1-0 -Scripts/mascot.sh tutorial +Scripts/compile-run.py -E mascot tutorial ``` This runs [the tutorial](Programs/Source/tutorial.mpc) with two parties and malicious security. 
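(Editorial aside, not part of the patch: the same one-command facility works with the other protocol scripts in `Scripts/` — the name passed to `-E` corresponds to the script name without `.sh`, as explained under "Running Computation" below. For instance, assuming the semi-honest OT-based protocol `semi` shipped with MP-SPDZ, the following sketch would run the tutorial without malicious security:

```
Scripts/compile-run.py -E semi tutorial
```
)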
On Linux, this requires a working toolchain and [all requirements](#requirements). On Ubuntu, the following might suffice: ``` -sudo apt-get install automake build-essential cmake git libboost-dev libboost-thread-dev libntl-dev libsodium-dev libssl-dev libtool m4 python3 texinfo yasm +sudo apt-get install automake build-essential clang cmake git libboost-dev libboost-thread-dev libntl-dev libsodium-dev libssl-dev libtool m4 python3 texinfo yasm ``` On MacOS, this requires [brew](https://brew.sh) to be installed, which will be used for all dependencies. @@ -62,17 +63,16 @@ It will execute [the tutorial](Programs/Source/tutorial.mpc) with two parties and malicious security. -Note that this only works with a git clone but not with a binary -release. - ``` -make -j 8 tldr -./compile.py tutorial +make setup echo 1 2 3 4 > Player-Data/Input-P0-0 echo 1 2 3 4 > Player-Data/Input-P1-0 -Scripts/mascot.sh tutorial +Scripts/compile-run.py -E mascot tutorial ``` +On sufficiently powerful hardware (several cores and several GB of RAM), you +can speed up the last step by running `make -j8 mascot-party.x` beforehand. + #### TL;DR (Docker) Build a docker image for `mascot-party.x`: @@ -271,7 +271,7 @@ compute the preprocessing time for a particular computation. #### Requirements - GCC 5 or later (tested with up to 11) or LLVM/clang 6 or later - (tested with up to 14). We recommend clang because it performs + (tested with up to 14). The default is to use clang because it performs better. Note that GCC 5/6 and clang 9 don't support libOTe, so you need to deactivate its use for these compilers (see the next section). @@ -284,16 +284,16 @@ compute the preprocessing time for a particular computation. install it locally. libOTe also requires boost of version at least 1.75, which is not available by default on relatively recent systems such as Ubuntu - 20.04. You can install it locally by running `make boost`. + 22.04. You can install it locally by running `make boost`. - MPIR library, compiled with C++ support (use flag `--enable-cxx` when running configure). You can use `make -j8 mpir` to install it locally. - libsodium library, tested against 1.0.18 - - OpenSSL, tested against 1.1.1 - - Boost.Asio with SSL support (`libboost-dev` on Ubuntu), tested against 1.71 - - Boost.Thread for BMR (`libboost-thread-dev` on Ubuntu), tested against 1.71 + - OpenSSL, tested against 3.0.2 + - Boost.Asio with SSL support (`libboost-dev` on Ubuntu), tested against 1.81 + - Boost.Thread for BMR (`libboost-thread-dev` on Ubuntu), tested against 1.81 - x86 or ARM 64-bit CPU (the latter tested with AWS Graviton and Apple Silicon) - Python 3.5 or later - - NTL library for homomorphic encryption (optional; tested with NTL 10.5) + - NTL library for homomorphic encryption (optional; tested with NTL 11.5.1) - If using macOS, Sierra or later - Windows/VirtualBox: see [this issue](https://github.com/data61/MP-SPDZ/issues/557) for a discussion @@ -328,14 +328,67 @@ compute the preprocessing time for a particular computation. parts only. Remember to run `make clean` first after changing `CONFIG` or `CONFIG.mine`. -# Running computation +# Running Computation See `Programs/Source/` for some example MPC programs, in particular `tutorial.mpc`. Furthermore, [Read the Docs](https://mp-spdz.readthedocs.io/en/latest/) hosts a more -detailed reference of the high-level functionality extracted from the -Python code in the `Compiler` directory as well as a summary of -relevant compiler options. +detailed reference of all aspects of MP-SPDZ. 
+ +There are three ways of running computation: + +1. Separate compilation and execution. This is the default in the + rest of the documentation. It allows running the same program several + times while compiling only once, for example: + + ``` + ./compile.py <program> <argument> + Scripts/mascot.sh <program>-<argument> [...] + Scripts/mascot.sh <program>-<argument> [...] + ``` + +2. One-command local execution. This compiles the program and the + virtual machine if necessary before executing it locally with the + given protocol. The names of the protocols correspond to the script + names below (without the `.sh`). Furthermore, some + protocol-specific optimization options as well as required options + are used automatically. + + ``` + Scripts/compile-run.py -E mascot <program> -- [...] + ``` + +3. One-command remote execution. This compiles the program and the + virtual machine if necessary before uploading them together with + all necessary input and certificate files via SSH. + + ``` + Scripts/compile-run.py -H HOSTS -E mascot <program> -- [...] + ``` + + `HOSTS` has to be a text file in the following format: + + ``` + [<user>@]<host0>[/<path>] + [<user>@]<host1>[/<path>] + ... + ``` + + If <path> does not start with `/` (only one `/` after the + hostname), the path will be relative to the home directory of the + user. Otherwise (`//` after the hostname), it will be relative to the + root directory. + +Even with the integrated execution, it is important to keep in mind +that there are two different phases, namely the compilation and the run-time +phase. Any secret data is only available in the second phase, when the +Python compilation has concluded. Therefore, types like `sint` and +`sfix` are mere placeholders for data to be used later, and they don't +contain any shares. See also [the +documentation](https://mp-spdz.readthedocs.io/en/latest/compilation.html#compilation-vs-run-time) +for what this means when using Python data structures and Python +language features. + ### Compiling high-level programs @@ -347,8 +400,10 @@ to be compiled accordingly. ```./compile.py [-F <integer bit length>] [-P <prime>] <program>``` The integer bit length defaults to 64, and the prime defaults to none -given. If a prime is given, it has to be at least two bits longer -than the integer length. +given. If a prime is given, it has to be at least two bits longer than +the integer length. Note that `-P` is optional, and it involves +algorithms that are more expensive while allowing for a wider range of +integer lengths. Note that in this context integers do not wrap around according to the integer bit length but the length is used for non-linear @@ -763,7 +818,7 @@ for computation modulo a power of two. It involves sharing both a secret value and information-theoretic tag similar to SPDZ but not with additive secret sharing, hence the name. Rep4 refers to the four-party protocol by [Dalskov et -al.](https://eprint.iacr.org/2020/1330). 
+al.](https://eprint.iacr.org/2020/1330) `malicious-rep-bin-party.x` is based on cut-and-choose triple generation by [Furukawa et al.](https://eprint.iacr.org/2016/944) but using Beaver multiplication instead of their post-sacrifice diff --git a/Scripts/build.sh b/Scripts/build.sh index 1c3f72866..0aaf49525 100755 --- a/Scripts/build.sh +++ b/Scripts/build.sh @@ -4,11 +4,8 @@ function build { echo ARCH = $1 >> CONFIG.mine echo GDEBUG = >> CONFIG.mine - root=`pwd` - cd deps/libOTe - rm -R out - python3 build.py --install=$root/local -- -DENABLE_SOFTSPOKEN_OT=ON -DBUILD_SHARED_LIBS=0 -DCMAKE_INSTALL_LIBDIR=lib $3 - cd $root + echo OTE_OPTS= -DENABLE_SOFTSPOKEN_OT=ON -DBUILD_SHARED_LIBS=0 -DCMAKE_INSTALL_LIBDIR=lib $3 >> CONFIG.mine + rm -R deps/libOTe/out make clean rm -R static mkdir static diff --git a/Scripts/compile-emulate.py b/Scripts/compile-emulate.py new file mode 100755 index 000000000..4f346c325 --- /dev/null +++ b/Scripts/compile-emulate.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +import os, sys + +sys.path.append('.') + +from Compiler.compilerLib import Compiler + +compiler = Compiler() +compiler.prep_compile(build=False) +compiler.execute = True +compiler.options.execute = 'emulate' +compiler.options.ring = compiler.options.ring or '64' +compiler.options.keep_cisc = compiler.options.keep_cisc or '' +compiler.build() +prog = compiler.compile_file() +compiler.local_execution() diff --git a/Scripts/compile-run.py b/Scripts/compile-run.py new file mode 100755 index 000000000..d7f2711b3 --- /dev/null +++ b/Scripts/compile-run.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 + +import os, sys + +sys.path.append('.') + +from Compiler.compilerLib import Compiler + +try: + split = sys.argv.index('--') +except ValueError: + split = len(sys.argv) + +compiler_args = sys.argv[1:split] +runtime_args = sys.argv[split + 1:] +compiler = Compiler(execute=True, custom_args=compiler_args) +compiler.prep_compile() +prog = compiler.compile_file() + +if prog.options.hostfile: + compiler.remote_execution(runtime_args) +else: + compiler.local_execution(runtime_args) diff --git a/Scripts/memory-usage.py b/Scripts/memory-usage.py index eaec677fd..1977fc2c6 100755 --- a/Scripts/memory-usage.py +++ b/Scripts/memory-usage.py @@ -13,8 +13,9 @@ res = collections.defaultdict(lambda: 0) regs = collections.defaultdict(lambda: 0) +thread_regs = collections.defaultdict(lambda: 0) -for tapename in Program.read_tapes(sys.argv[1]): +def process(tapename, res, regs): for inst in Tape.read_instructions(tapename): t = inst.type if issubclass(t, DirectMemoryInstruction): @@ -24,7 +25,17 @@ if isinstance(arg, RegisterArgFormat): regs[type(arg)] = max(regs[type(arg)], arg.i + inst.size) +tapes = Program.read_tapes(sys.argv[1]) + +process(next(tapes), res, regs) + +for tapename in tapes: + process(tapename, res, thread_regs) + reverse_formats = dict((v, k) for k, v in ArgFormats.items()) +regout = lambda regs: dict((reverse_formats[t], n) for t, n in regs.items()) + print ('Memory:', dict(res)) -print ('Registers:', dict((reverse_formats[t], n) for t, n in regs.items())) +print ('Registers in main thread:', regout(regs)) +print ('Registers in other threads:', regout(thread_regs)) diff --git a/Scripts/setup-clients.sh b/Scripts/setup-clients.sh index 74010c266..e07a3f978 100755 --- a/Scripts/setup-clients.sh +++ b/Scripts/setup-clients.sh @@ -1,5 +1,8 @@ #!/usr/bin/env bash +# brew-installed OpenSSL on MacOS +PATH="/opt/homebrew/opt/openssl@3/bin:$PATH" + n=$1 test -e Player-Data || mkdir Player-Data diff --git 
a/Scripts/setup-ssl.sh b/Scripts/setup-ssl.sh index 01113f166..b479a31f3 100755 --- a/Scripts/setup-ssl.sh +++ b/Scripts/setup-ssl.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash # brew-installed OpenSSL on MacOS -PATH=/usr/local/opt/openssl/bin:$PATH +PATH="/opt/homebrew/opt/openssl@3/bin:$PATH" n=${1:-4} ssl_dir=${2:-"Player-Data"} diff --git a/Scripts/test_tutorial.sh b/Scripts/test_tutorial.sh index 60157c934..094a6393d 100755 --- a/Scripts/test_tutorial.sh +++ b/Scripts/test_tutorial.sh @@ -83,15 +83,17 @@ fi ./compile.py tutorial -for i in cowgear chaigear; do - test_vm $i $run_opts -S 3 -c 2 -J -done +if test $no_top_gear; then + for i in cowgear chaigear; do + test_vm $i $run_opts -S 3 -c 2 -J + done +fi if test $skip_binary; then exit fi -./compile.py -B 16 $compile_opts tutorial +./compile.py -GB 16 $compile_opts tutorial for i in replicated mal-rep-bin ps-rep-bin semi-bin ccd mal-ccd; do test_vm $i $run_opts diff --git a/Scripts/tldr.sh b/Scripts/tldr.sh index bd5b396a7..54cde516e 100755 --- a/Scripts/tldr.sh +++ b/Scripts/tldr.sh @@ -11,9 +11,10 @@ elif test `uname` = Darwin; then echo Aborting exit 1 else - /usr/bin/env ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" + /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" fi fi + make mac-setup make tldr else echo OS unknown diff --git a/Scripts/torch_cifar_alex_import.py b/Scripts/torch_cifar_alex_import.py new file mode 100755 index 000000000..1adb34a5a --- /dev/null +++ b/Scripts/torch_cifar_alex_import.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 + +# test model output by torch_alex_test.mpc + +import torchvision +import torch +import torch.nn as nn +import numpy + +net = nn.Sequential( + nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=2), + nn.ReLU(), + nn.MaxPool2d(kernel_size=2), + nn.Conv2d(64, 96, kernel_size=3, padding=2), + nn.ReLU(), + nn.MaxPool2d(kernel_size=2), + nn.Conv2d(96, 96, kernel_size=3, padding=1), + nn.ReLU(), + nn.Conv2d(96, 64, kernel_size=3, padding=1), + nn.ReLU(), + nn.Conv2d(64, 64, kernel_size=3, padding=1), + nn.ReLU(), + nn.MaxPool2d(kernel_size=3, stride=2), + nn.Flatten(), + nn.Linear(1024, 128), + nn.ReLU(), + nn.Linear(128, 256), + nn.ReLU(), + nn.Linear(256, 10), +) + +f = open('Player-Data/Binary-Output-P0-0') + +state = net.state_dict() + +for name in state: + shape = state[name].shape + size = numpy.prod(shape) + var = numpy.fromfile(f, 'double', count=size) + var = var.reshape(shape) + state[name] = torch.Tensor(var) + +net.load_state_dict(state) + +get_data = lambda train, transform=None: torchvision.datasets.CIFAR10( + root='/tmp', train=train, download=True, transform=transform) + +transform = torchvision.transforms.Compose( + [torchvision.transforms.ToTensor(), lambda x: 2 * x - 1]) + +with torch.no_grad(): + ds = get_data(False, transform) + total = correct_classified = 0 + for data in torch.utils.data.DataLoader(ds, batch_size=128): + inputs, labels = data + outputs = net(inputs) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct_classified += (predicted == labels).sum().item() + test_acc = (100 * correct_classified / total) + print('Test accuracy of the network: %.2f %%' % test_acc) diff --git a/Scripts/torch_mnist_dense_import.py b/Scripts/torch_mnist_dense_import.py new file mode 100755 index 000000000..9286cc72e --- /dev/null +++ b/Scripts/torch_mnist_dense_import.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 + +# test model output by torch_mnist_dense.mpc + +import 
torchvision +import torch +import torch.nn as nn +import numpy + +net = nn.Sequential( + nn.Flatten(), + nn.Linear(28 * 28, 128), + nn.ReLU(), + nn.Linear(128, 128), + nn.ReLU(), + nn.Linear(128, 10) +) + +f = open('Player-Data/Binary-Output-P0-0') + +state = net.state_dict() + +for name in state: + shape = state[name].shape + size = numpy.prod(shape) + var = numpy.fromfile(f, 'double', count=size) + var = var.reshape(shape) + state[name] = torch.Tensor(var) + +net.load_state_dict(state) + +transform = torchvision.transforms.Compose( + [torchvision.transforms.ToTensor()]) + +with torch.no_grad(): + ds = torchvision.datasets.MNIST(root='/tmp', transform=transform, + train=False) + total = correct_classified = 0 + for data in torch.utils.data.DataLoader(ds, batch_size=128): + inputs, labels = data + outputs = net(inputs) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct_classified += (predicted == labels).sum().item() + test_acc = (100 * correct_classified / total) + print('Test accuracy of the network: %.2f %%' % test_acc) diff --git a/Scripts/torch_mnist_lenet_import.py b/Scripts/torch_mnist_lenet_import.py new file mode 100755 index 000000000..9df05285d --- /dev/null +++ b/Scripts/torch_mnist_lenet_import.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 + +# test model output by torch_mnist_lenet_predict.mpc + +import torchvision +import torch +import torch.nn as nn +import numpy + +net = nn.Sequential( + nn.Conv2d(1, 20, 5), + nn.ReLU(), + nn.MaxPool2d(2), + nn.Conv2d(20, 50, 5), + nn.ReLU(), + nn.MaxPool2d(2), + nn.Flatten(), + nn.ReLU(), + nn.Linear(800, 500), + nn.ReLU(), + nn.Linear(500, 10) +) + +f = open('Player-Data/Binary-Output-P0-0') + +state = net.state_dict() + +for name in state: + shape = state[name].shape + size = numpy.prod(shape) + var = numpy.fromfile(f, 'double', count=size) + var = var.reshape(shape) + state[name] = torch.Tensor(var) + +net.load_state_dict(state) + +transform = torchvision.transforms.Compose( + [torchvision.transforms.ToTensor()]) + +with torch.no_grad(): + ds = torchvision.datasets.MNIST(root='/tmp', transform=transform, + train=False) + total = correct_classified = 0 + for data in torch.utils.data.DataLoader(ds, batch_size=128): + inputs, labels = data + outputs = net(inputs) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct_classified += (predicted == labels).sum().item() + test_acc = (100 * correct_classified / total) + print('Test accuracy of the network: %.2f %%' % test_acc) diff --git a/Tools/FlexBuffer.cpp b/Tools/FlexBuffer.cpp index 1dabd8c26..6c663cb63 100644 --- a/Tools/FlexBuffer.cpp +++ b/Tools/FlexBuffer.cpp @@ -20,6 +20,8 @@ ReceivedMsgStore::~ReceivedMsgStore() << push_timer.elapsed() << " seconds and retrieved them in " << pop_timer.elapsed() << " seconds " << endl; #endif + for (auto& file : files) + remove(file.c_str()); } void ReceivedMsgStore::push(ReceivedMsg& msg) @@ -41,7 +43,8 @@ void ReceivedMsgStore::push(ReceivedMsg& msg) sprintf(filename, "%s/%d.XXXXXX", BUFFER_DIR, getpid()); FILE* file = fdopen(mkstemp(filename), "w"); if (!file) - throw runtime_error("can't open file"); + throw runtime_error("can't open file, check space on " + BUFFER_DIR); size_t len = msg.size(); size_t ptr = msg.ptr - msg.buf; if (fwrite(&len, sizeof(len), 1, file) != 1) diff --git a/Tools/Hash.cpp b/Tools/Hash.cpp index 680bec969..bee5adfb9 100644 --- a/Tools/Hash.cpp +++ b/Tools/Hash.cpp @@ -35,6 +35,11 @@ void Hash::update(const octetStream& os) update(os.get_data(), os.get_length()); } +void 
Hash::update(const string& str) +{ + update(str.data(), str.size()); +} + void Hash::final(octetStream& os) { os.resize_precise(hash_length); diff --git a/Tools/Hash.h b/Tools/Hash.h index 6d1938ca2..706ddf326 100644 --- a/Tools/Hash.h +++ b/Tools/Hash.h @@ -41,6 +41,7 @@ class Hash v[i].pack(tmp, bit_lengths[i]); update(tmp); } + void update(const string& str); void final(unsigned char hashout[hash_length]) { diff --git a/Tools/Lock.h b/Tools/Lock.h index 299aa62b5..459e1f01b 100644 --- a/Tools/Lock.h +++ b/Tools/Lock.h @@ -19,4 +19,21 @@ class Lock void unlock(); }; +class ScopeLock +{ + Lock& lock; + +public: + ScopeLock(Lock& lock) : + lock(lock) + { + lock.lock(); + } + + ~ScopeLock() + { + lock.unlock(); + } +}; + #endif /* TOOLS_LOCK_H_ */ diff --git a/Tools/ezOptionParser.h b/Tools/ezOptionParser.h index 500ffee72..dac9db6b7 100644 --- a/Tools/ezOptionParser.h +++ b/Tools/ezOptionParser.h @@ -2094,7 +2094,7 @@ void ezOptionParser::prettyPrint(std::string & out) { out += "First Args:\n"; for(i=0; i < (long int)firstArgs.size(); ++i) { - sprintf(tmp, "%d: %s\n", i+1, firstArgs[i]->c_str()); + snprintf(tmp, 256, "%d: %s\n", i+1, firstArgs[i]->c_str()); out += tmp; } @@ -2115,46 +2115,46 @@ void ezOptionParser::prettyPrint(std::string & out) { out += "\n"; // The flag names: for(j=0; j < (long int)g->flags.size()-1; ++j) { - sprintf(tmp, "%s, ", g->flags[j]->c_str()); + snprintf(tmp, 256, "%s, ", g->flags[j]->c_str()); out += tmp; } - sprintf(tmp, "%s:\n", g->flags.back()->c_str()); + snprintf(tmp, 256, "%s:\n", g->flags.back()->c_str()); out += tmp; if (g->isSet) { if (g->expectArgs) { if (g->args.empty()) { - sprintf(tmp, "%s (default)\n", g->defaults.c_str()); + snprintf(tmp, 256, "%s (default)\n", g->defaults.c_str()); out += tmp; } else { for(k=0; k < (long int)g->args.size(); ++k) { for(j=0; j < (long int)g->args[k]->size()-1; ++j) { - sprintf(tmp, "%s%c", g->args[k]->at(j)->c_str(), g->delim); + snprintf(tmp, 256, "%s%c", g->args[k]->at(j)->c_str(), g->delim); out += tmp; } - sprintf(tmp, "%s\n", g->args[k]->back()->c_str()); + snprintf(tmp, 256, "%s\n", g->args[k]->back()->c_str()); out += tmp; } } } else { // Set but no args expected. 
- sprintf(tmp, "Set\n"); + snprintf(tmp, 256, "Set\n"); out += tmp; } } else { - sprintf(tmp, "Not set\n"); + snprintf(tmp, 256, "Not set\n"); out += tmp; } } out += "\nLast Args:\n"; for(i=0; i < (long int)lastArgs.size(); ++i) { - sprintf(tmp, "%d: %s\n", i+1, lastArgs[i]->c_str()); + snprintf(tmp, 256, "%d: %s\n", i+1, lastArgs[i]->c_str()); out += tmp; } out += "\nUnknown Args:\n"; for(i=0; i < (long int)unknownArgs.size(); ++i) { - sprintf(tmp, "%d: %s\n", i+1, unknownArgs[i]->c_str()); + snprintf(tmp, 256, "%d: %s\n", i+1, unknownArgs[i]->c_str()); out += tmp; } }; diff --git a/Yao/YaoEvalWire.cpp b/Yao/YaoEvalWire.cpp index c8a4bad34..456a68922 100644 --- a/Yao/YaoEvalWire.cpp +++ b/Yao/YaoEvalWire.cpp @@ -16,6 +16,7 @@ #include "GC/Secret.hpp" #include "GC/Thread.hpp" #include "GC/ShareSecret.hpp" +#include "GC/ThreadMaster.hpp" #include "YaoCommon.hpp" void YaoEvalWire::random() @@ -256,6 +257,14 @@ void YaoEvalWire::convcbit2s(GC::Processor& processor, } } +void YaoEvalWire::run_tapes(const vector& args) +{ + auto& party = YaoEvaluator::s(); + party.master.machine.run_tapes(args); + if (party.continuous()) + party.untaint(); +} + template void YaoEvalWire::and_( GC::Processor >& processor, const vector& args); diff --git a/Yao/YaoEvalWire.h b/Yao/YaoEvalWire.h index 7257e5ad3..0f082657e 100644 --- a/Yao/YaoEvalWire.h +++ b/Yao/YaoEvalWire.h @@ -65,6 +65,8 @@ class YaoEvalWire : public YaoWire static void convcbit2s(GC::Processor& processor, const BaseInstruction& instruction); + static void run_tapes(const vector& args); + void set(const Key& key); void set(Key key, bool external); diff --git a/Yao/YaoEvaluator.cpp b/Yao/YaoEvaluator.cpp index 7b1b60154..652f5798e 100644 --- a/Yao/YaoEvaluator.cpp +++ b/Yao/YaoEvaluator.cpp @@ -77,13 +77,17 @@ void YaoEvaluator::run_from_store(GC::Program& program) bool YaoEvaluator::receive(Player& P) { +#ifdef DEBUG_YAO + printf("waiting to receive at %d in thread %d\n", processor.PC, thread_num); +#endif if (P.receive_long(0) == YaoCommon::DONE) return false; P.receive_player(0, gates); P.receive_player(0, output_masks); #ifdef DEBUG_YAO - cout << "received " << gates.size() << " gates and " << output_masks.size() - << " output masks at " << processor.PC << endl; + cout << "received " << gates.size() << " bytes for gates and " + << output_masks.size() << " output masks at " << processor.PC + << " in thread " << thread_num << endl; #endif return true; } diff --git a/Yao/YaoEvaluator.h b/Yao/YaoEvaluator.h index 749ba2878..416118eaf 100644 --- a/Yao/YaoEvaluator.h +++ b/Yao/YaoEvaluator.h @@ -43,7 +43,7 @@ class YaoEvaluator: public GC::Thread>, YaoEvaluator(int thread_num, YaoEvalMaster& master); - bool continuous() { return master.continuous and master.machine.nthreads == 1; } + bool continuous() { return master.continuous; } void pre_run(); void run(GC::Program& program); diff --git a/Yao/YaoGarbleWire.cpp b/Yao/YaoGarbleWire.cpp index fb1a534ee..e9f7e2d90 100644 --- a/Yao/YaoGarbleWire.cpp +++ b/Yao/YaoGarbleWire.cpp @@ -14,6 +14,7 @@ #include "GC/Secret.hpp" #include "GC/Thread.hpp" #include "GC/ShareSecret.hpp" +#include "GC/ThreadMaster.hpp" #include "YaoCommon.hpp" void YaoGarbleWire::random() @@ -245,3 +246,11 @@ void YaoGarbleWire::convcbit2s(GC::Processor& processor, processor.C[instruction.get_r(1) + i].get_bit(j)); } } + +void YaoGarbleWire::run_tapes(const vector& args) +{ + auto& garbler = YaoGarbler::s(); + if (garbler.continuous()) + garbler.untaint(); + garbler.master.machine.run_tapes(args); +} diff --git a/Yao/YaoGarbleWire.h 
b/Yao/YaoGarbleWire.h index 65feb8da2..20d56ef8f 100644 --- a/Yao/YaoGarbleWire.h +++ b/Yao/YaoGarbleWire.h @@ -66,6 +66,8 @@ class YaoGarbleWire : public YaoWire static void convcbit2s(GC::Processor& processor, const BaseInstruction& instruction); + static void run_tapes(const vector& args); + void randomize(PRNG& prng); void set(Key key, bool mask); diff --git a/Yao/YaoGarbler.cpp b/Yao/YaoGarbler.cpp index 647369a15..b9112c4b9 100644 --- a/Yao/YaoGarbler.cpp +++ b/Yao/YaoGarbler.cpp @@ -94,8 +94,9 @@ void YaoGarbler::post_run() void YaoGarbler::send(Player& P) { #ifdef DEBUG_YAO - cerr << "sending " << gates.size() << " gates and " << - output_masks.size() << " output masks at " << processor.PC << endl; + cerr << "sending " << gates.size() << " bytes for gates and " + << output_masks.size() << " output masks at " << processor.PC + << " in thread " << thread_num << endl; #endif P.send_long(1, YaoCommon::MORE); size_t size = gates.size(); diff --git a/Yao/YaoGarbler.h b/Yao/YaoGarbler.h index 0608336c8..8597182aa 100644 --- a/Yao/YaoGarbler.h +++ b/Yao/YaoGarbler.h @@ -56,7 +56,7 @@ class YaoGarbler: public GC::Thread>, YaoGarbler(int thread_num, YaoGarbleMaster& master); ~YaoGarbler(); - bool continuous() { return master.continuous and master.machine.nthreads == 1; } + bool continuous() { return master.continuous; } void run(GC::Program& program); void run(Player& P, bool continuous); diff --git a/Yao/YaoPlayer.cpp b/Yao/YaoPlayer.cpp index b1e0e0736..f943a9545 100644 --- a/Yao/YaoPlayer.cpp +++ b/Yao/YaoPlayer.cpp @@ -19,7 +19,7 @@ YaoPlayer::YaoPlayer(int argc, const char** argv) 0, // Required? 0, // Number of args expected. 0, // Delimiter if expecting multiple args. - "Evaluate only after garbling (default only with multi-threading).", // Help description. + "Evaluate only after garbling (very limited functionality).", // Help description. "-O", // Flag token. "--oneshot" // Flag token. ); diff --git a/deps/libOTe b/deps/libOTe index db02f8b8d..5d9f9c400 160000 --- a/deps/libOTe +++ b/deps/libOTe @@ -1 +1 @@ -Subproject commit db02f8b8d1e4805fb3bd86f1e06442d8acdc010c +Subproject commit 5d9f9c400c6acda734cbd20b5b8ea02392c0f75e diff --git a/doc/Compiler.rst b/doc/Compiler.rst index 34343c51e..593ddbc25 100644 --- a/doc/Compiler.rst +++ b/doc/Compiler.rst @@ -75,6 +75,7 @@ Compiler.ml module :no-undoc-members: :exclude-members: Tensor :show-inheritance: + :inherited-members: .. autofunction:: approx_sigmoid Compiler.decision_tree module @@ -129,3 +130,10 @@ Compiler.sqrt_oram module :no-undoc-members: :exclude-members: LinearPositionMap, PositionMap, RecursivePositionMap, refresh, shuffle_the_shuffle + + +Compiler.sorting module +----------------------- +.. automodule:: Compiler.sorting + :members: + :no-undoc-members: diff --git a/doc/Doxyfile b/doc/Doxyfile index f82046ebc..d816a9727 100644 --- a/doc/Doxyfile +++ b/doc/Doxyfile @@ -829,7 +829,7 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. 
-INPUT = ../Networking ../Tools/octetStream.h ../Processor/Data_Files.h ../Protocols/Replicated.h ../Protocols/ReplicatedPrep.h ../Protocols/MAC_Check_Base.h ../Processor/Input.h ../ExternalIO/Client.h ../Protocols/ProtocolSet.h ../Protocols/ProtocolSetup.h ../Math/gfp.h ../Math/gfpvar.h ../Math/Z2k.h ../FHE/Ciphertext.h ../FHE/FHE_Keys.h ../FHE/FHE_Params.h ../FHE/Plaintext.h ../Tools/random.h +INPUT = ../Networking ../Tools/octetStream.h ../Processor/Data_Files.h ../Protocols/Replicated.h ../Protocols/ReplicatedPrep.h ../Protocols/MAC_Check_Base.h ../Processor/Input.h ../ExternalIO/Client.h ../Protocols/ProtocolSet.h ../Protocols/ProtocolSetup.h ../Math/gfp.h ../Math/gfpvar.h ../Math/Z2k.h ../FHE/Ciphertext.h ../FHE/FHE_Keys.h ../FHE/FHE_Params.h ../FHE/Plaintext.h ../Tools/random.h ../Math/bigint.h # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses diff --git a/doc/add-protocol.rst b/doc/add-protocol.rst index 7b7199c03..1f741f36a 100644 --- a/doc/add-protocol.rst +++ b/doc/add-protocol.rst @@ -49,11 +49,21 @@ found in ``Protocols/Replicated.h``. Constant sharing and public output allows to execute the following program:: - print_ln('%s', sint(123).reveal()) + print_ln('result: %s', sint(123).reveal()) This allows to check the correct execution of further functionality. + Put the above code in ``Programs/Source/test.mpc`` and run the + following if your protocol works for two parties (otherwise add + more parties and change the ``-N`` argument accordingly):: + + make no-party.x + ./compile.py test + ./no-party.x 0 test -N 2 & ./no-party.x 1 test -N 2 + + This should output ``result: 123``. + 2. Fill in the operator functions in :c:type:`NoShare` and check them:: diff --git a/doc/compilation.rst b/doc/compilation.rst index 01753edde..993f75da6 100644 --- a/doc/compilation.rst +++ b/doc/compilation.rst @@ -1,4 +1,4 @@ -Compilation process +Compilation Process ------------------- The easiest way of using MP-SPDZ is using ``compile.py`` as @@ -38,7 +38,11 @@ The following options influence the computation domain: Specify a concrete prime modulus for computation. This can be used together with :option:`-F`, in which case *integer length* has to be at most the prime length minus two. The security implications of - overflows in the secrets do not go beyond incorrect results. + overflows in the secrets do not go beyond incorrect results. You + can use prime order domains without specifying this option. + Using this option involves algorithms for non-linear computation + which are generally more expensive but allow for integer lengths + that are close to the bit length of the prime. .. cmdoption:: -R --ring= diff --git a/doc/gen-readme.sh b/doc/gen-readme.sh index e7b825975..9e40fbc41 100755 --- a/doc/gen-readme.sh +++ b/doc/gen-readme.sh @@ -1,4 +1,7 @@ #!/bin/sh -echo '# Getting started' > readme.md -sed -e '1 d' ../README.md >> readme.md +echo '# Getting Started' > readme.md +sed -e '1 d' -e 's#(Programs/Source#(../Programs/Source#g' -e 's#(./Dockerfile#(../Dockerfile#' ../README.md >> readme.md + +echo '# Client Interface' > client-interface.md +cat ../ExternalIO/README.md >> client-interface.md diff --git a/doc/index.rst b/doc/index.rst index 648546c89..f072135bc 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -1,12 +1,18 @@ Welcome to MP-SPDZ's documentation!
=================================== +MP-SPDZ is a framework for multi-party computation, a +privacy-enhancing technology focused on input privacy. Please see +`this gentle introduction `_ for +more information on multi-party computation. + If you're new to MP-SPDZ, consider the following: 1. `Quickstart tutorial `_ -2. `Implemented protocols `_ -3. :ref:`troubleshooting` -4. :ref:`io` lists all the ways of getting data in and out. +2. :ref:`Machine learning quickstart <ml-quickstart>` +3. `Implemented protocols `_ +4. :ref:`troubleshooting` +5. :ref:`io` lists all the ways of getting data in and out. .. toctree:: :maxdepth: 4 @@ -17,9 +23,11 @@ If you're new to MP-SPDZ, consider the following: Compiler instructions low-level + ml-quickstart machine-learning networking io + client-interface non-linear preprocessing add-protocol diff --git a/doc/io.rst b/doc/io.rst index 50128d945..bd6b4db88 100644 --- a/doc/io.rst +++ b/doc/io.rst @@ -78,8 +78,8 @@ Clients (Non-computing Parties) :py:func:`Compiler.types.sint.receive_from_client` and :py:func:`Compiler.types.sint.reveal_to_clients` allow -communicating securely with the clients. See `this example -`_ +communicating securely with the clients. See `the relevant section +`_ covering both client code and server-side high-level code. :py:func:`Compiler.types.sint.input_tensor_from_client` and :py:func:`Compiler.types.MultiArray.reveal_to_clients`. The same diff --git a/doc/low-level.rst b/doc/low-level.rst index e8b7a4cc6..89302d97e 100644 --- a/doc/low-level.rst +++ b/doc/low-level.rst @@ -381,3 +381,9 @@ Domain Reference .. doxygenclass:: SignedZ2 :members: + +The following is not used as a domain, but it helps using the above types, +in particular ``gfp_`` and ``gfpvar_``. + +.. doxygenclass:: bigint + :members: diff --git a/doc/machine-learning.rst b/doc/machine-learning.rst index 54764e37f..e7ed6f70c 100644 --- a/doc/machine-learning.rst +++ b/doc/machine-learning.rst @@ -1,15 +1,156 @@ Machine Learning ---------------- -MP-SPDZ supports a limited subset of the Keras interface for machine -learning. This includes the SGD and Adam optimizers and the following -layer types: dense, 2D convolution, 2D max-pooling, and dropout. +The purpose of this document is to demonstrate the machine learning +functionality of MP-SPDZ, software implementing multi-party +computation, one of the most important privacy-enhancing +techniques. Please see `this gentle introduction +`_ for more information on +multi-party computation and the `installation instructions +`_ +on how to install the software. + +MP-SPDZ supports a number of machine learning algorithms such as +logistic and linear regression, decision trees, and some common deep +learning functionality. The latter includes the SGD and Adam +optimizers and the following layer types: dense, 2D convolution, 2D +max-pooling, and dropout. The machine learning code only works with arithmetic machines, that is, you cannot compile it with ``-B``. -In the following we will walk through the example code in -``keras_mnist_dense.mpc``, which trains a dense neural network for +This document explains how to input data, how to train a model, and +how to use an existing model for prediction. + + +Data Input +~~~~~~~~~~ + +It's easiest to input data if it's available during compilation, +either centrally or per party. Another way is to only define the data +size in the high-level code and put the data independently into the +right files used by the virtual machine.
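+ +As a minimal sketch of the second approach, you can declare only the +shape in the high-level code and have the values read at run time (the +shape here is illustrative; see Independent Data Input below for the +expected file format):: + + data = sfix.Tensor([1000, 10]) + data.input_from(0)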
+ + +Integrated Data Input +===================== + +If the data is available during compilation, for example as a PyTorch +or numpy tensor, you can use +:py:func:`Compiler.types.sfix.input_tensor_via` and +:py:func:`Compiler.types.sint.input_tensor_via`. Consider the +following code from ``breast_logistic.mpc`` (requiring +`scikit-learn `_):: + + from sklearn.datasets import load_breast_cancer + from sklearn.model_selection import train_test_split + + X, y = load_breast_cancer(return_X_y=True) + + # normalize column-wise + X /= X.max(axis=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + X_train = sfix.input_tensor_via(0, X_train) + y_train = sint.input_tensor_via(0, y_train) + +This downloads the Wisconsin Breast Cancer dataset, normalizes the +sample data, splits it into a training and a test set, and then +converts it to the relevant MP-SPDZ data structures. Under the +hood, the data is stored in ``Player-Data/Input-Binary-P0-0``, which +is where binary-encoded inputs for player 0 are read from. You +therefore have to copy said file if you execute the program in another place +than where you compiled it. + +MP-SPDZ also allows splitting the data input between parties, for +example horizontally:: + + a = sfix.input_tensor_via(0, X_train[len(X_train) // 2:]) + b = sfix.input_tensor_via(1, X_train[:len(X_train) // 2]) + X_train = a.concat(b) + + a = sint.input_tensor_via(0, y_train[len(y_train) // 2:]) + b = sint.input_tensor_via(1, y_train[:len(y_train) // 2]) + y_train = a.concat(b) + +The concatenation creates a unified secret tensor that can be used for +training over the whole dataset. Similarly, you can split a dataset +vertically:: + + a = sfix.input_tensor_via(0, X_train[:,:X_train.shape[1] // 2]) + b = sfix.input_tensor_via(1, X_train[:,X_train.shape[1] // 2:]) + X_train = a.concat_columns(b) + +The three approaches in this section can be run as follows:: + + Scripts/compile-run.py -E ring breast_logistic + Scripts/compile-run.py -E ring breast_logistic horizontal + Scripts/compile-run.py -E ring breast_logistic vertical + +In the latter two variants, the labels are all input via party 0. + +Finally, MP-SPDZ also facilitates inputting data that is only +available party by party. Party 0 can run:: + + a = sfix.input_tensor_via(0, X_train[:,:X_train.shape[1] // 2]) + b = sfix.input_tensor_via(1, shape=X_train[:,X_train.shape[1] // 2:].shape) + X_train = a.concat_columns(b) + y_train = sint.input_tensor_via(0, y_train) + +while party 1 runs:: + + a = sfix.input_tensor_via(0, shape=X_train[:,:X_train.shape[1] // 2].shape) + b = sfix.input_tensor_via(1, X_train[:,X_train.shape[1] // 2:]) + X_train = a.concat_columns(b) + y_train = sint.input_tensor_via(0, shape=y_train.shape) + +Note that the respective party only accesses the shape of data +they don't input. + +You can run this case by running the following on one hand: + +.. code-block:: console + + ./compile.py breast_logistic party0 + ./semi-party.x 0 breast_logistic-party0 + +and on the other (but on the same host): + +.. code-block:: console + + ./compile.py breast_logistic party1 + ./semi-party.x 1 breast_logistic-party1 + +The compilation will output a hash at the end, which has to agree +between the parties. Otherwise the virtual machine will abort with an +error message. To run the two parties on different hosts, use the +:ref:`networking options <networking>`. + + +Data preprocessing +"""""""""""""""""" + +Sometimes it's necessary to preprocess data.
We're using the following +code from ``torch_mnist_dense.mpc`` to demonstrate this:: + + ds = torchvision.datasets.MNIST(root='/tmp', train=train, download=True) + # normalize to [0,1] before input + samples = sfix.input_tensor_via(0, ds.data / 255) + labels = sint.input_tensor_via(0, ds.targets, one_hot=True) + +This downloads either the training or the test set of MNIST +(depending on :py:obj:`train`) and then processes it to make it +usable. The sample data is normalized from an 8-bit integer to the +interval :math:`[0,1]` by dividing by 255. This is done within PyTorch +for efficiency. Then, the labels are encoded as one-hot vectors +because this is necessary for multi-label training in MP-SPDZ. + + +Independent Data Input +====================== + +The example code in +``keras_mnist_dense.mpc`` trains a dense neural network for MNIST. It starts by defining tensors to hold data:: training_samples = sfix.Tensor([60000, 28, 28]) @@ -28,8 +169,122 @@ is used by ``convert.sh`` in `the preparation code test_labels.input_from(0) test_samples.input_from(0) -This is followed by Keras-like code setting up the model and training -it:: +The virtual machine then expects the data as whitespace-separated text +in ``Player-Data/Input-P0-0``. If you use ``binary=True`` with +:py:func:`input_from`, the input is expected in +``Player-Data/Input-Binary-P0-0``, value by value as single-precision +float or 64-bit integer in the machine byte order (most likely +little-endian these days). + + +Training +~~~~~~~~ + +There are a number of interfaces for different algorithms. + + +Logistic regression with SGD +============================ + +This is available via :py:class:`~Compiler.ml.SGDLogistic`. We will +use ``breast_logistic.mpc`` as an example. + +After inputting the data as above, you can call the following:: + + log = ml.SGDLogistic(20, 2, program) + log.fit(X_train, y_train) + +This trains a logistic regression model in secret for 20 epochs with +mini-batches of size 2. Adding the :py:obj:`program` object as a +parameter makes the instance use further command-line arguments. Most notably, you can +add ``approx`` to use a three-piece approximate sigmoid function: + +.. code-block:: console + + Scripts/compile-emulate.py breast_logistic approx + +Omitting it invokes the default sigmoid function. + +To check accuracy during training, you can call the following instead +of :py:func:`~Compiler.ml.SGDLogistic.fit`:: + + log.fit_with_testing(X_train, y_train, X_test, y_test) + +This outputs losses and accuracy for both the training and test set +after every epoch. + +You can use :py:func:`~Compiler.ml.SGDLogistic.predict` to predict +labels and :py:func:`~Compiler.ml.SGDLogistic.predict_proba` to +predict probabilities. The following outputs the correctness (0 for +correct, :math:`\pm 1` for incorrect) and a measure of how far off +the probability estimate is:: + + print_ln('%s', (log.predict(X_test) - y_test.get_vector()).reveal()) + print_ln('%s', (log.predict_proba(X_test) - y_test.get_vector()).reveal()) + + +Linear regression with SGD +========================== + +This is available via :py:class:`~Compiler.ml.SGDLinear`. It +implements an interface similar to logistic regression. The main +difference is that there is only +:py:func:`~Compiler.ml.SGDLinear.predict` for prediction as there is +no notion of labels in this case. See ``diabetes.mpc`` for an example +of linear regression.
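+ +As a rough sketch, assuming the constructor takes the same arguments +as :py:class:`~Compiler.ml.SGDLogistic` above (``diabetes.mpc`` is the +definitive example), usage could look as follows:: + + # epochs and mini-batch size as for SGDLogistic + lin = ml.SGDLinear(100, 2, program) + lin.fit(X_train, y_train) + print_ln('%s', (lin.predict(X_test) - y_test.get_vector()).reveal())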
+ + +PyTorch interface +================= + +MP-SPDZ supports importing sequential models from PyTorch as shown in +this code snippet in ``torch_mnist_dense.mpc``:: + + import torch.nn as nn + + net = nn.Sequential( + nn.Flatten(), + nn.Linear(28 * 28, 128), + nn.ReLU(), + nn.Linear(128, 128), + nn.ReLU(), + nn.Linear(128, 10) + ) + + from Compiler import ml + + ml.set_n_threads(int(program.args[2])) + + layers = ml.layers_from_torch(net, training_samples.shape, 128) + + optimizer = ml.SGD(layers) + optimizer.fit( + training_samples, + training_labels, + epochs=int(program.args[1]), + batch_size=128, + validation_data=(test_samples, test_labels), + program=program + ) + +This trains a network with three dense layers on MNIST using SGD, +softmax, and cross-entropy loss. The number of epochs and the number of threads are +taken from the command line. For example, the following trains the +network for 10 epochs using 4 threads:: + + Scripts/compile-emulate.py torch_mnist_dense 10 4 + +See ``Programs/Source/torch_*.mpc`` for further examples of the +PyTorch functionality, :py:func:`~Compiler.ml.Optimizer.fit` for +further training options, and :py:class:`~Compiler.ml.Adam` for an +alternative optimizer. + + +Keras interface +=============== + +The following Keras-like code sets up a model with three dense layers +and then trains it:: from Compiler import ml tf = ml @@ -55,28 +310,209 @@ it:: validation_data=(test_samples, test_labels) ) -Lastly, the model is stored on disk in secret-shared form:: - for var in model.trainable_variables: - var.write_to_file() +Decision trees +============== +MP-SPDZ can train decision trees for binary labels by using the +algorithm by `Hamada et al.`_ The following example in +``breast_tree.mpc`` trains a tree of height five before outputting the +difference between the prediction on a test set and the ground truth:: -Prediction -~~~~~~~~~~ + from Compiler.decision_tree import TreeClassifier + tree = TreeClassifier(max_depth=5) + tree.fit(X_train, y_train) + print_ln('%s', (tree.predict(X_test) - y_test.get_vector()).reveal()) + +You can run the example as follows: + +.. code-block:: console + + Scripts/compile-emulate.py breast_tree + +It is also possible to output the accuracy after every level:: + + tree.fit_with_testing(X_train, y_train, X_test, y_test) + +You can output the trained tree as follows:: + + tree.output() + +The format of the output follows the description of `Hamada et al.`_ + +MP-SPDZ by default uses probabilistic rounding for fixed-point +division, which is used to compute Gini coefficients in decision tree +training. This has the effect that the tree isn't deterministic. You +can switch to deterministic rounding as follows:: + + sfix.round_nearest = True + +The ``breast_tree.mpc`` example uses the following code to allow switching on +the command line:: + + sfix.set_precision_from_args(program) + +Nearest rounding can then be activated as follows: + +.. code-block:: console + + Scripts/compile-emulate.py breast_tree nearest + +.. _`Hamada et al.`: https://arxiv.org/abs/2112.12906 + + +Data preparation +"""""""""""""""" + +MP-SPDZ currently supports continuous and binary attributes but not +discrete non-binary attributes.
However, such attributes can be +converted as follows using the `pandas `_ +library:: + + import pandas + from sklearn.model_selection import train_test_split + from Compiler import decision_tree + + data = pandas.read_csv( + 'https://datahub.io/machine-learning/adult/r/adult.csv') -The example code in ``keras_mnist_dense_predict.mpc`` uses the model -stored above for prediction. Much of the setup is the same, but + data, attr_types = decision_tree.preprocess_pandas(data) + + # label is last column + X = data[:,:-1] + y = data[:,-1] + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + +This downloads the adult dataset and converts discrete attributes to +binary using one-hot encoding. See ``easy_adult.mpc`` for the full +example. :py:obj:`attr_types` has to be used to indicate the +attribute types during training:: + + tree.fit(X_train, y_train, attr_types=attr_types) + + +Loading pre-trained models +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +It is possible to import pre-trained models from PyTorch as shown in +``torch_mnist_lenet_predict.mpc``:: + + net = nn.Sequential( + nn.Conv2d(1, 20, 5), + nn.ReLU(), + nn.MaxPool2d(2), + nn.Conv2d(20, 50, 5), + nn.ReLU(), + nn.MaxPool2d(2), + nn.Flatten(), + nn.ReLU(), + nn.Linear(800, 500), + nn.ReLU(), + nn.Linear(500, 10) + ) + + # train for a bit + transform = torchvision.transforms.Compose( + [torchvision.transforms.ToTensor()]) + ds = torchvision.datasets.MNIST(root='/tmp', transform=transform, train=True) + optimizer = torch.optim.Adam(net.parameters(), amsgrad=True) + criterion = nn.CrossEntropyLoss() + + for i, data in enumerate(torch.utils.data.DataLoader(ds, batch_size=128)): + inputs, labels = data + optimizer.zero_grad() + outputs = net(inputs) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + +This trains LeNet on MNIST for one epoch. The model can then be input +and used in MP-SPDZ:: + + from Compiler import ml + layers = ml.layers_from_torch(net, training_samples.shape, 128, input_via=0) + optimizer = ml.Optimizer(layers) + n_correct, loss = optimizer.reveal_correctness(test_samples, test_labels, 128, running=True) + print_ln('Secure accuracy: %s/%s', n_correct, len(test_samples)) + +This outputs the accuracy of the network. + + +Storing and loading models +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Both the Keras interface and the native +:py:class:`~Compiler.ml.Optimizer` class support an interface to +iterate through all model parameters. The following code from +``torch_mnist_dense.mpc`` uses it to store the model on disk in +secret-shared form:: + + for var in optimizer.trainable_variables: + var.write_to_file() + +The example code in ``torch_mnist_dense_test.mpc`` then uses the +model stored above for prediction. Much of the setup is the same, but instead of training it reads the model from disk:: - model.build(test_samples.sizes) + optimizer = ml.Optimizer(layers) start = 0 - for var in model.trainable_variables: + for var in optimizer.trainable_variables: start = var.read_from_file(start) -Then it runs the prediction:: + +Then it runs the accuracy test:: - guesses = model.predict(test_samples) + n_correct, loss = optimizer.reveal_correctness(test_samples, test_labels, 128) + print_ln('Accuracy: %s/%s', n_correct, len(test_samples)) Using ``var.input_from(player)`` instead, the model would be input privately by a party.
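+ +For example, a minimal sketch of private model input, reusing the +parameter iteration above with party 0 as the model owner:: + + for var in optimizer.trainable_variables: + var.input_from(0) + +Party 0 would then supply the parameters via its input file as +described under Independent Data Input.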
+ + +Exporting models +~~~~~~~~~~~~~~~~ + +Models can be exported as follows:: + + optimizer.reveal_model_to_binary() + +if :py:obj:`optimizer` is an instance of +:py:class:`Compiler.ml.Optimizer`. The model parameters are then +stored in ``Player-Data/Binary-Output-P<playerno>-0``. They can be +imported for use in PyTorch:: + + f = open('Player-Data/Binary-Output-P0-0') + + state = net.state_dict() + + for name in state: + shape = state[name].shape + size = numpy.prod(shape) + var = numpy.fromfile(f, 'double', count=size) + var = var.reshape(shape) + state[name] = torch.Tensor(var) + + net.load_state_dict(state) + +if :py:obj:`net` is a PyTorch module with the correct meta-parameters. +This demonstrates that the parameters are stored with double precision +in the canonical order. + +There are a number of scripts in ``Scripts``, namely +``torch_cifar_alex_import.py``, ``torch_mnist_dense_import.py``, and +``torch_mnist_lenet_import.py``, which import the models output by +``torch_alex_test.mpc``, ``torch_mnist_dense.mpc``, and +``torch_mnist_lenet_predict.mpc``, respectively. For example, you can run: + +.. code-block:: console + + $ Scripts/compile-emulate.py torch_mnist_lenet_predict + ... + Secure accuracy: 9822/10000 + ... + $ Scripts/torch_mnist_lenet_import.py + Test accuracy of the network: 98.22 % + +The accuracy values might vary as the model is freshly trained, but +they should match. diff --git a/doc/ml-quickstart.rst b/doc/ml-quickstart.rst new file mode 100644 index 000000000..f6114378c --- /dev/null +++ b/doc/ml-quickstart.rst @@ -0,0 +1,92 @@ +.. _ml-quickstart: + +Machine Learning Quickstart +--------------------------- + +This document is a short introduction to running privacy-preserving +logistic regression in MP-SPDZ. It assumes that you have the framework +already installed as explained in the `installation instructions +`_. +For more information on how to run machine learning algorithms in MP-SPDZ, +see the `full machine learning section +`_. + +The easiest way to use MP-SPDZ is to put Python code in an ``.mpc`` file in +``Programs/Source``, for example ``Programs/Source/foo.mpc``. Put the +following code there, which trains and evaluates a model on a tiny inline dataset:: + + X = sfix.input_tensor_via(0, [[1, 2, 3], # 2 samples + [11, 12, 13]]) + y = sint.input_tensor_via(0, [0, 1]) # 2 labels + + from Compiler import ml + log = ml.SGDLogistic(100) + log.fit(X, y) + + print_ln('%s', log.predict(X).reveal()) + +The first two lines make the data available to the secure +computation. The next lines create a logistic regression instance and +train it (for one hundred epochs). Finally, the last line uses the +instance for prediction and outputs the results. + +After adding all the above code to ``Programs/Source/foo.mpc``, you +can run it either insecurely: + +.. code-block:: console + + Scripts/compile-emulate.py foo + +or securely with three parties on the same machine: + +.. code-block:: console + + Scripts/compile-run.py -E ring foo + +The first call should give the following output: + ..
code-block:: console + + $ Scripts/compile-emulate.py foo + Default bit length: 63 + Default security parameter: 40 + Compiling file Programs/Source/foo.mpc + Writing binary data to Player-Data/Input-Binary-P0-0 + Setting learning rate to 0.01 + Using SGD + Initializing dense weights in [-1.224745,1.224745] + Writing to Programs/Bytecode/foo-multithread-1.bc + 2 runs per epoch + Writing to Programs/Bytecode/foo-multithread-3.bc + Writing to Programs/Bytecode/foo-multithread-4.bc + Writing to Programs/Bytecode/foo-multithread-5.bc + Initializing dense weights in [-1.224745,1.224745] + Writing to Programs/Bytecode/foo-multithread-7.bc + Writing to Programs/Bytecode/foo-multithread-8.bc + Writing to Programs/Bytecode/foo-multithread-9.bc + Writing to Programs/Schedules/foo.sch + Writing to Programs/Bytecode/foo-0.bc + Hash: 33f8d22d99960897f41fb2da31e7f5a0501d2e1071789e52d73b4043e5343831 + Program requires at most: + 8 integer inputs from player 0 + 61054 integer bits + 190109 integer triples + 200 matrix multiplications (1x3 * 3x1) + 200 matrix multiplications (3x1 * 1x1) + 1 matrix multiplications (2x3 * 3x1) + 28406 virtual machine rounds + Using security parameter 40 + Trying to run 64-bit computation + Using SGD + done with epoch 99 + [0, 1] + The following benchmarks are including preprocessing (offline phase). + Time = 0.0250086 seconds + +See `the documentation +`_ +for further +options such as different protocols or running remotely and `the +machine learning section +`_ for +other machine learning methods. diff --git a/doc/networking.rst b/doc/networking.rst index c7e031f10..1ec9e1581 100644 --- a/doc/networking.rst +++ b/doc/networking.rst @@ -1,3 +1,5 @@ +.. _networking: + Networking ---------- diff --git a/doc/troubleshooting.rst b/doc/troubleshooting.rst index 6a32bd37e..f2295bc9e 100644 --- a/doc/troubleshooting.rst +++ b/doc/troubleshooting.rst @@ -25,6 +25,16 @@ lists only exists at compile time. Consider using :py:class:`~Compiler.types.Array`. + +Local variable referenced before assignment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This error can occur if you try to reassign a variable in a run-time +loop like :py:func:`~Compiler.library.for_range`. Use +:py:func:`~Compiler.program.Tape.Register.update` instead of assignment. See +:py:func:`~Compiler.library.for_range` for an example. +You can also use :py:func:`~Compiler.types.sint.iadd` instead of ``+=``. + + ``compile.py`` takes too long or runs out of memory ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -35,6 +45,16 @@ resulting in potentially too much virtual machine code. Consider using version. + +Incorrect results when using :py:class:`~Compiler.types.sfix` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is most likely caused by an overflow of the precision +parameters because the default choice only accommodates numbers up +to around 16,000. See :py:class:`~Compiler.types.sfix` for an +introduction and :py:func:`~Compiler.types.sfix.set_precision` for how +to change the precision. + + Order of memory instructions not preserved ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~