diff --git a/CMakeLists.txt b/CMakeLists.txt index f519fe98a6..387679292f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,6 +43,7 @@ if(${TC_BUILD_PYTHON}) include(UseCython) add_definitions(-DTC_HAS_PYTHON) + add_definitions(-DTC_BUILD_VISUALIZATION_CLIENT) else() message("Skipping python libraries.") endif() @@ -183,7 +184,7 @@ message(WARNING "OpenMP Libraries were not found") #**************************************************************************/ if (CLANG) set(CPP11_FLAGS "-std=c++11 -stdlib=libc++ -Wno-deprecated-register -Wno-enum-compare -Wno-conversion-null -Wno-constant-logical-operand -Wno-parentheses-equality -ftemplate-depth=900" CACHE STRING "C++11 enabling flags") -set(WERROR_FLAGS "-Werror -Wno-error=tautological-undefined-compare -Wno-error=reorder -Wno-error=exceptions -Wno-error=switch -Wno-error=sometimes-uninitialized -Wno-error=unused-lambda-capture -Wno-error=missing-braces -Wno-error=absolute-value -Wno-error=potentially-evaluated-expression -Wno-error=null-arithmetic -Wno-error=format -Wno-error=pessimizing-move -Wno-error=comment -Wno-error=main -Wno-error=constant-conversion -Wno-error=deprecated-declarations -Wno-error=return-type -Wno-error=inconsistent-missing-override -Wno-error=overloaded-virtual -Wno-error=unused-private-field -Wno-error=unused-variable -Wno-error=unused-local-typedef") +set(WERROR_FLAGS "-Werror -Wno-error=tautological-undefined-compare -Wno-error=reorder -Wno-error=exceptions -Wno-error=switch -Wno-error=sometimes-uninitialized -Wno-error=unused-lambda-capture -Wno-error=missing-braces -Wno-error=absolute-value -Wno-error=potentially-evaluated-expression -Wno-error=null-arithmetic -Wno-error=format -Wno-error=pessimizing-move -Wno-error=comment -Wno-error=main -Wno-error=constant-conversion -Wno-error=deprecated-declarations -Wno-error=return-type -Wno-error=inconsistent-missing-override -Wno-error=overloaded-virtual -Wno-error=unused-private-field -Wno-error=unused-variable -Wno-error=unused-local-typedef -Wno-error=unguarded-availability-new") else() set(CPP11_FLAGS "-std=c++11 -Wno-enum-compare -Wno-conversion-null -ftemplate-depth=900" CACHE STRING "C++11 enabling flags") set(WERROR_FLAGS "") @@ -411,21 +412,6 @@ include(copy_file) include(CMakeParseArguments) include(eval) -add_custom_target(external_dependencies) - -include(ExternalProject) -file(GLOB packages "${DEPS_CMAKE}/ExternalProject*.cmake") -foreach(package ${packages}) - message(STATUS "We found local package: ${package}") - get_filename_component(packagename "${package}" NAME_WE) - #package is of the form ExternalProjectXXX" - include(${package}) - STRING(SUBSTRING "${packagename}" 15 -1 depname) - message(STATUS "We found local package definition: ${depname}") - string(TOLOWER ${depname} depname) - set(package_${depname} requires_${depname} CACHE STRING "Package map") - add_dependencies(external_dependencies ex_${depname}) -endforeach() # This is an internal function and should not be used # Usage: @@ -738,7 +724,6 @@ function (make_boost_test NAME) endfunction() -# Core ML is only present on macOS 10.13 or higher if(APPLE) EXEC_PROGRAM(xcrun ARGS --show-sdk-version OUTPUT_VARIABLE mac_version RETURN_VALUE _xcrun_ret) @@ -878,6 +863,21 @@ endmacro() include(SharedLibraryFromStatic) include(MergeStaticLibraries) +add_custom_target(external_dependencies) + +include(ExternalProject) +file(GLOB packages "${DEPS_CMAKE}/ExternalProject*.cmake") +foreach(package ${packages}) + message(STATUS "We found local package: ${package}") + get_filename_component(packagename 
"${package}" NAME_WE) + #package is of the form ExternalProjectXXX" + include(${package}) + STRING(SUBSTRING "${packagename}" 15 -1 depname) + message(STATUS "We found local package definition: ${depname}") + string(TOLOWER ${depname} depname) + set(package_${depname} requires_${depname} CACHE STRING "Package map") + add_dependencies(external_dependencies ex_${depname}) +endforeach() include_directories(src) include_directories(SYSTEM src/external) diff --git a/build_capi.sh b/build_capi.sh index 1f36af4d23..7d1d6be02d 100755 --- a/build_capi.sh +++ b/build_capi.sh @@ -141,7 +141,6 @@ function build_capi { echo "Stripping local and debug symbols." strip -S -x ${install_dir}/lib*.* || echo "Non-fatal error stripping symbols." fi - } function build_capi_framework { @@ -149,7 +148,7 @@ function build_capi_framework { echo "Building C-API as macOS/iOS Framework" echo - run_configure --with-capi-framework ${ios_flag} --no-python --no-visualization -D TC_CAPI_FRAMEWORK_PATH=\"${framework_path}\" || exit 1 + run_configure --with-capi-framework ${ios_flag} --no-python --no-visualization -D TC_CAPI_FRAMEWORK_PATH=\"${framework_path}\" --release-opt-for-size || exit 1 mkdir -p ${target_dir} cd ${build_dir}/src/capi || exit 1 make -j ${jobs} || exit 1 diff --git a/build_python_wheel.sh b/build_python_wheel.sh index 82f528d931..c78d84ec00 100755 --- a/build_python_wheel.sh +++ b/build_python_wheel.sh @@ -110,14 +110,5 @@ rm -rf ${target_dir}/python mkdir -p ${target_dir}/python bash scripts/make_wheel.sh --skip_test --skip_cpp_test --build_number="$build_number" --num_procs=${jobs} --${build_mode} --target-dir="${install_dir}" -pushd ${build_mode}/src - -if [[ $apple -eq 1 ]]; then - find . -type f -name '*.dylib' -o -name '*.so' | xargs strip -x - -else - find . -type f -name '*.so' | xargs strip -s -fi - -find . -type f -name '*.dylib' -o -name '*.so' | xargs tar cvzf ${install_dir}/shared_objects.tar.gz diff --git a/gitlab_scripts/use_ccache.sh b/gitlab_scripts/use_ccache.sh index 7890079117..6cb4238738 100755 --- a/gitlab_scripts/use_ccache.sh +++ b/gitlab_scripts/use_ccache.sh @@ -1,4 +1,2 @@ -export CCACHE_COMPILERCHECK=content -export CCACHE_DIR=/var/ccache ccache -M 100.0G ccache -s diff --git a/scenario-tests/additional_requirements.txt b/scenario-tests/additional_requirements.txt index 31cb4c9c25..06f86460ce 100644 --- a/scenario-tests/additional_requirements.txt +++ b/scenario-tests/additional_requirements.txt @@ -1,6 +1,5 @@ beautifulsoup4 certifi==2015.04.28 -gensim==0.12.2 nltk==3.2 pyscreenshot==0.4 python-swiftclient diff --git a/scripts/make_wheel.sh b/scripts/make_wheel.sh index 705306a74a..7f05d36e10 100755 --- a/scripts/make_wheel.sh +++ b/scripts/make_wheel.sh @@ -272,11 +272,13 @@ package_wheel() { temp=`echo $WHEEL_PATH | perl -ne 'print m/(^.*-).*$/'` temp=${temp/-cpdarwin-/-cp35m-} - platform_tag="macosx_10_13_intel.macosx_10_13_x86_64" - mac_version=`sw_vers -productVersion` - if [[ $mac_version =~ ^10\.12(.\d+)? 
]]; then - platform_tag="macosx_10_12_intel.macosx_10_12_x86_64" - fi + platform_tag="macosx_10_12_intel.macosx_10_12_x86_64.macosx_10_13_intel.macosx_10_13_x86_64.macosx_10_14_intel.macosx_10_14_x86_64" + # sdk_version=`xcrun --show-sdk-version` + # if [[ $sdk_version =~ ^10\.13 ]]; then + # platform_tag="macosx_10_13_intel.macosx_10_12_x86_64" + # elif [[ $sdk_version =~ ^10\.12 ]]; then + # platform_tag="macosx_10_12_intel.macosx_10_12_x86_64" + # fi NEW_WHEEL_PATH=${temp}${platform_tag}".whl" mv ${WHEEL_PATH} ${NEW_WHEEL_PATH} diff --git a/scripts/requirements.txt b/scripts/requirements.txt index 474d36daf8..b15b9a6395 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1,4 +1,4 @@ -coremltools==0.8 +coremltools==2.0b1 scipy==0.19.1 numpy==1.11.1 cython==0.24 @@ -16,4 +16,4 @@ scikit-learn==0.17.1 six==1.10.0 statsmodels==0.8.0 wheel==0.29.0 -mxnet==0.11 +mxnet==1.1.0 diff --git a/src/fileio/fileio_constants.hpp b/src/fileio/fileio_constants.hpp index 6c6691a779..683eacd982 100644 --- a/src/fileio/fileio_constants.hpp +++ b/src/fileio/fileio_constants.hpp @@ -85,6 +85,12 @@ extern size_t FILEIO_WRITER_BUFFER_SIZE; */ extern std::string S3_ENDPOINT; +/** + * \ingroup fileio + * The number of GPUs. + */ +extern int64_t NUM_GPUS; + /** * \ingroup fileio * Gets the alternative ssl certificate file and directory. diff --git a/src/fileio/fs_utils.cpp b/src/fileio/fs_utils.cpp index cccdf429ab..720fb22b1c 100644 --- a/src/fileio/fs_utils.cpp +++ b/src/fileio/fs_utils.cpp @@ -155,14 +155,10 @@ EXPORT file_status get_file_status(const std::string& path) { return file_status::MISSING; } } else if (boost::starts_with(path, "s3://")) { -#ifdef TC_BUILD_CAPI_IOS - log_and_throw("s3:// URLs not supported."); -#else std::pair ret = is_directory(path); if (ret.first == false) return file_status::MISSING; else if (ret.second == false) return file_status::REGULAR_FILE; else if (ret.second == true) return file_status::DIRECTORY; -#endif } else if (is_web_protocol(get_protocol(path))) { return file_status::REGULAR_FILE; // some other web protocol? @@ -208,9 +204,6 @@ get_directory_listing(const std::string& path) { // failure for some reason. return with nothing } } else if (boost::starts_with(path, "s3://")) { -#ifdef TC_BUILD_CAPI_IOS - log_and_throw("s3:// URLs not supported."); -#else list_objects_response response = list_directory(path); for (auto dir: response.directories) { ret.push_back({dir, file_status::DIRECTORY}); @@ -253,9 +246,6 @@ EXPORT bool create_directory(const std::string& path) { #ifdef TC_ENABLE_REMOTEFS } else if(boost::starts_with(path, "hdfs://")) { // hdfs -#ifdef TC_BUILD_CAPI_IOS - log_and_throw("hdfs:// URLs not supported."); -#else std::string host, port, hdfspath; std::tie(host, port, hdfspath) = parse_hdfs_url(path); try { @@ -342,9 +332,6 @@ bool delete_path_impl(const std::string& path, return false; } } else if (boost::starts_with(path, "s3://")) { -#ifdef TC_BUILD_CAPI_IOS - log_and_throw("s3:// URLs not supported."); -#else return delete_object(path).empty(); #endif } else { @@ -373,9 +360,6 @@ EXPORT bool delete_path_recursive(const std::string& path) { #ifdef TC_ENABLE_REMOTEFS } else if(boost::starts_with(path, "hdfs://")) { // hdfs -#ifdef TC_BUILD_CAPI_IOS - log_and_throw("hdfs:// URLs not supported."); -#else std::string host, port, hdfspath; std::tie(host, port, hdfspath) = parse_hdfs_url(path); try { @@ -387,11 +371,7 @@ EXPORT bool delete_path_recursive(const std::string& path) { // failure for some reason. 
return with nothing return false; } -#endif } else if(boost::starts_with(path, "s3://")) { -#ifdef TC_BUILD_CAPI_IOS - log_and_throw("s3:// URLs not supported."); -#else return delete_prefix(path).empty(); #endif } else { @@ -683,9 +663,6 @@ bool change_file_mode(const std::string path, short mode) { } if(boost::starts_with(path, "hdfs://")) { -#ifdef TC_BUILD_CAPI_IOS - log_and_throw("hdfs:// URLs not supported."); -#else #ifdef HAS_HADOOP // hdfs std::string host, port, hdfspath; @@ -701,7 +678,6 @@ bool change_file_mode(const std::string path, short mode) { } #else return false; -#endif #endif } else if (boost::starts_with(path, fileio::get_cache_prefix())) { // this is a cache file. There is no filesystem. diff --git a/src/fileio/union_fstream.cpp b/src/fileio/union_fstream.cpp index 6ac0b84e78..1c06ae73ce 100644 --- a/src/fileio/union_fstream.cpp +++ b/src/fileio/union_fstream.cpp @@ -52,9 +52,6 @@ union_fstream::union_fstream(std::string url, #ifdef TC_ENABLE_REMOTEFS } else if(boost::starts_with(url, "hdfs://")) { // HDFS file type -#ifdef TC_BUILD_CAPI_IOS - log_and_throw("hdfs:// URLs not supported."); -#else type = HDFS; std::string host, port, path; std::tie(host, port, path) = fileio::parse_hdfs_url(url); @@ -78,9 +75,6 @@ union_fstream::union_fstream(std::string url, } else if (boost::starts_with(url, "s3://")) { // the S3 file type currently works by download/uploading a local file // i.e. the s3_stream simply remaps a local file stream -#ifdef TC_BUILD_CAPI_IOS - log_and_throw_io_failure("Not implemented: compiled without support for s3:// URLs."); -#else type = STD; if (is_output_stream) { output_stream = std::make_shared(url, true); diff --git a/src/serialization/oarchive.hpp b/src/serialization/oarchive.hpp index 01168fe658..0cbc4c09d4 100644 --- a/src/serialization/oarchive.hpp +++ b/src/serialization/oarchive.hpp @@ -128,7 +128,7 @@ namespace turi { inline void direct_assign(const T& t) { if (out == NULL) { expand_buf(sizeof(T)); - (*reinterpret_cast(buf + off)) = t; + std::memcpy(buf + off, &t, sizeof(T)); off += sizeof(T); } else { diff --git a/src/serialization/vector.hpp b/src/serialization/vector.hpp index 150798b6e0..8b68126e1b 100644 --- a/src/serialization/vector.hpp +++ b/src/serialization/vector.hpp @@ -56,7 +56,7 @@ namespace turi { struct vector_serialize_impl { static void exec(OutArcType& oarc, const std::vector& vec) { oarc << size_t(vec.size()); - serialize(oarc, &(vec[0]),sizeof(ValueType)*vec.size()); + serialize(oarc, vec.data(),sizeof(ValueType)*vec.size()); } }; @@ -80,7 +80,7 @@ namespace turi { size_t len; iarc >> len; vec.clear(); vec.resize(len); - deserialize(iarc, &(vec[0]), sizeof(ValueType)*vec.size()); + deserialize(iarc, vec.data(), sizeof(ValueType)*vec.size()); } }; diff --git a/src/toolkits/image_deep_feature_extractor/image_deep_feature_extractor_toolkit.cpp b/src/toolkits/image_deep_feature_extractor/image_deep_feature_extractor_toolkit.cpp index 1eb03062cb..db0c74f88b 100644 --- a/src/toolkits/image_deep_feature_extractor/image_deep_feature_extractor_toolkit.cpp +++ b/src/toolkits/image_deep_feature_extractor/image_deep_feature_extractor_toolkit.cpp @@ -25,8 +25,8 @@ void image_deep_feature_extractor_toolkit::init_options(const std::mapextract_features(data[column_name]); + bool verbose, size_t batch_size) const { + return m_feature_extractor->extract_features(data[column_name], verbose, batch_size); } } // image_deep_feature_extractor diff --git a/src/toolkits/image_deep_feature_extractor/image_deep_feature_extractor_toolkit.hpp 
b/src/toolkits/image_deep_feature_extractor/image_deep_feature_extractor_toolkit.hpp index 2ae69cca52..73a5201b20 100644 --- a/src/toolkits/image_deep_feature_extractor/image_deep_feature_extractor_toolkit.hpp +++ b/src/toolkits/image_deep_feature_extractor/image_deep_feature_extractor_toolkit.hpp @@ -25,7 +25,7 @@ class EXPORT image_deep_feature_extractor_toolkit : public ml_model_base { void init_options(const std::map& options); - gl_sarray extract_features(gl_sframe data, const std::string& column_name, bool verbose) const; + gl_sarray extract_features(gl_sframe data, const std::string& column_name, bool verbose, size_t batch_size) const; inline size_t get_version() const { return -1; } @@ -41,7 +41,7 @@ class EXPORT image_deep_feature_extractor_toolkit : public ml_model_base { REGISTER_CLASS_MEMBER_FUNCTION(image_deep_feature_extractor_toolkit::init_options, "options"); - REGISTER_CLASS_MEMBER_FUNCTION(image_deep_feature_extractor_toolkit::extract_features, "data", "column_name", "verbose"); + REGISTER_CLASS_MEMBER_FUNCTION(image_deep_feature_extractor_toolkit::extract_features, "data", "column_name", "verbose", "batch_size"); END_CLASS_MEMBER_REGISTRATION diff --git a/src/toolkits/image_deep_feature_extractor/image_feature_extractor.hpp b/src/toolkits/image_deep_feature_extractor/image_feature_extractor.hpp index 1fb1a2c2bd..4447f8f5eb 100644 --- a/src/toolkits/image_deep_feature_extractor/image_feature_extractor.hpp +++ b/src/toolkits/image_deep_feature_extractor/image_feature_extractor.hpp @@ -30,7 +30,7 @@ class image_feature_extractor { // free to perform this computation in a more optimized fashion. The input // SArray may also contain flex_string values, in which case each string is // interpreted as a URL from which the image can be loaded. - virtual gl_sarray extract_features(gl_sarray images) const = 0; + virtual gl_sarray extract_features(gl_sarray images, bool verbose, size_t batch_size) const = 0; }; } // image_deep_feature_extractor diff --git a/src/toolkits/image_deep_feature_extractor/mlmodel_image_feature_extractor.hpp b/src/toolkits/image_deep_feature_extractor/mlmodel_image_feature_extractor.hpp index 891433a37b..d2a7e4922b 100644 --- a/src/toolkits/image_deep_feature_extractor/mlmodel_image_feature_extractor.hpp +++ b/src/toolkits/image_deep_feature_extractor/mlmodel_image_feature_extractor.hpp @@ -27,7 +27,7 @@ class mlmodel_image_feature_extractor: public image_feature_extractor { // image_feature_extractor interface const CoreML::Specification::Model& coreml_spec() const override; - gl_sarray extract_features(gl_sarray images) const override; + gl_sarray extract_features(gl_sarray images, bool verbose, size_t batch_size) const override; private: // Use PIMPL pattern to hide Objective C from this C++ header. 
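Reviewer note: the header changes above thread `verbose` and `batch_size` through the whole extractor interface, and the REGISTER_CLASS_MEMBER_FUNCTION call exposes them to Python under the argument names "data", "column_name", "verbose", "batch_size". A rough sketch of how the registered extension might be driven directly from Python, assuming the usual extension-registry calling convention; the model name, download path, image folder, and column name here are illustrative only and not part of this change:

    import turicreate as tc
    from turicreate import extensions

    # Illustrative input; any local folder of images works.
    sf = tc.image_analysis.load_images('./images')

    # Instantiate the registered C++ toolkit class and configure it.
    extractor = extensions.image_deep_feature_extractor()
    extractor.init_options({'model_name': 'resnet-50',
                            'download_path': '/tmp/turi_models'})  # assumed cache dir

    # The two new trailing arguments correspond to "verbose" and "batch_size".
    features = extractor.extract_features(sf, 'image', True, 64)
    print(features[0])  # one deep-feature vector per input image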
diff --git a/src/toolkits/image_deep_feature_extractor/mlmodel_image_feature_extractor.mm b/src/toolkits/image_deep_feature_extractor/mlmodel_image_feature_extractor.mm index 9a24fc4bab..f81c72f2e0 100644 --- a/src/toolkits/image_deep_feature_extractor/mlmodel_image_feature_extractor.mm +++ b/src/toolkits/image_deep_feature_extractor/mlmodel_image_feature_extractor.mm @@ -6,13 +6,17 @@ #include #include +#include #include #include #include +#include +#include #include #include #import +#include #include @@ -99,6 +103,7 @@ bool has_feature_layer_output_name(const CoreML::Specification::NeuralNetworkLay const std::map model_name_to_info = {{"resnet-50", {224, 224, 2048, "flatten0", "data", "Resnet50.mlmodel"}}, + {"VisionFeaturePrint_Screen", {299, 299, 2048, "output", "image_input", ""}}, {"squeezenet_v1.1", {227, 227, 1000, "pool10", "image", "https://docs-assets.developer.apple.com/coreml/models/SqueezeNet.mlmodel"}}}; @@ -118,34 +123,102 @@ static void checkNSError(NSError *error) { return model_info_entry->second; } +void build_vision_feature_print_screen_spec(const std::string& model_path) { + const neural_network_model_details& model_info = get_model_info("VisionFeaturePrint_Screen"); + + CoreML::Specification::Model spec = CoreML::Specification::Model(); + spec.set_specificationversion(CoreML::MLMODEL_SPECIFICATION_VERSION); + + auto* description = spec.mutable_description(); + + auto* input = description->add_input(); + input->set_name("image_input"); + auto* input_type = input->mutable_type()->mutable_imagetype(); + + input_type->set_width(model_info.input_width); + input_type->set_height(model_info.input_height); + + input_type->set_colorspace(CoreML::Specification::ImageFeatureType::BGR); + + auto* output = description->add_output(); + output->set_name("output"); + auto* output_type = output->mutable_type()->mutable_multiarraytype(); + output_type->set_datatype(CoreML::Specification::ArrayFeatureType::DOUBLE); + output_type->add_shape(model_info.feature_layer_size); + + auto vision_feature_print = spec.mutable_visionfeatureprint(); + auto scene = vision_feature_print->mutable_scene(); + scene->set_version(CoreML::Specification::CoreMLModels::VisionFeaturePrint_Scene_SceneVersion_SCENE_VERSION_1); + + // Save the model + CoreML::Result r = CoreML::Model(spec).save(model_path); + if(!r.good()) { + log_and_throw("Could not save model: " + r.message()); + } + +} static MLModel *create_model(const std::string& download_path, const std::string& model_name) { - const std::string modified_model_path = download_path + "/" + model_name + "_modified.mlmodel"; - if(! boost::filesystem::exists(modified_model_path)) { - std::string base_model_path; - const neural_network_model_details& model_info = get_model_info(model_name); - - if(turi::fileio::get_protocol(model_info.base_model_url) != "") { - base_model_path = download_path + "/" + model_name + ".mlmodel"; - logstream(LOG_PROGRESS) << "Downloading base mlmodel" << std::endl; - turi::download_url(model_info.base_model_url, base_model_path); + + const std::string compiled_modified_model_path = download_path + "/" + model_name + "_modified.mlmodelc"; + + // Create the compiled modified model, if we don't already have it + if(! 
boost::filesystem::exists(compiled_modified_model_path)) { + + // Create the modified model + const std::string modified_model_path = download_path + "/" + model_name + "_modified.mlmodel"; + if(model_name == "VisionFeaturePrint_Screen") { + build_vision_feature_print_screen_spec(modified_model_path); } else { - base_model_path = download_path + "/" + model_info.base_model_url; + std::string base_model_path; + const neural_network_model_details& model_info = get_model_info(model_name); + + if(turi::fileio::get_protocol(model_info.base_model_url) != "") { + base_model_path = download_path + "/" + model_name + ".mlmodel"; + logstream(LOG_PROGRESS) << "Downloading base mlmodel" << std::endl; + turi::download_url(model_info.base_model_url, base_model_path); + } else { + base_model_path = download_path + "/" + model_info.base_model_url; + } + + model_info.modify_neural_network(base_model_path, modified_model_path); } - model_info.modify_neural_network(base_model_path, modified_model_path); + @autoreleasepool { + NSError* error = nil; + + // Swallow output for the very verbose coremlcompiler + int stdoutBack = dup(STDOUT_FILENO); + int devnull = open("/dev/null", O_WRONLY); + dup2(devnull, STDOUT_FILENO); + + // Compile the modified model + NSString* temp = [NSString stringWithUTF8String:modified_model_path.c_str()]; + NSURL* specPath = [NSURL fileURLWithPath:temp]; + NSURL* modelPath = [MLModel compileModelAtURL:specPath error:&error]; + checkNSError(error); + + // Close all the file descriptors and revert back to normal + dup2(stdoutBack, STDOUT_FILENO); + close(devnull); + close(stdoutBack); + + // Copy the compiled modified model + temp = [NSString stringWithUTF8String:compiled_modified_model_path.c_str()]; + NSURL* compiledModelPath = [NSURL fileURLWithPath:temp]; + [[NSFileManager defaultManager] copyItemAtURL:modelPath toURL:compiledModelPath error:&error]; + checkNSError(error); + } } - // Load the model. + // Load the compiled modified model MLModel* result = nil; @autoreleasepool { NSError* error = nil; - NSString* temp = [NSString stringWithUTF8String:modified_model_path.c_str()]; - NSURL* specPath = [NSURL fileURLWithPath:temp]; - NSURL* modelPath = [MLModel compileModelAtURL:specPath error:&error]; - checkNSError(error); - result = [MLModel modelWithContentsOfURL:modelPath error:&error]; + NSString* temp = [NSString stringWithUTF8String:compiled_modified_model_path.c_str()]; + NSURL* compiledModelPath = [NSURL fileURLWithPath:temp]; + result = [MLModel modelWithContentsOfURL:compiledModelPath error:&error]; checkNSError(error); result = [result retain]; // Safe to retain now that no exceptions possible } @@ -162,8 +235,7 @@ static void handleCVReturn(CVReturn status) { } } - -static CVPixelBufferRef flex_image_to_CVPixelBuffer(const flex_image image) { +CVPixelBufferRef create_pixel_buffer_from_flex_image(const flex_image image) { // The code in this function is largely adapted from convertValueToImage here: // https://github.com/apple/coremltools/blob/master/coremlpython/CoreMLPythonUtils.mm @@ -243,7 +315,7 @@ static CVPixelBufferRef flex_image_to_CVPixelBuffer(const flex_image image) { } m_impl->name = model_name; - m_impl->model = create_model(download_path, model_name); // retained + m_impl->model = create_model(download_path, model_name); // Read the spec from the file written to produce the MLModel. // TODO: Just save this value before writing it to disk. 
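Reviewer note: the hunk that follows replaces the one-image-at-a-time prediction loop with batched, parallel evaluation. The batch bookkeeping is ceiling division plus a progress value floored to the nearest 0.25%; a small Python sketch of that arithmetic (the sizes are made up, only the formulas come from the code below):

    # Made-up sizes; only the formulas mirror the batching hunk that follows.
    data_size = 1000
    kBatchSize = 64

    # Ceiling division: the last batch may be short.
    batch_count = (data_size + kBatchSize - 1) // kBatchSize   # -> 16

    def batch_range(batch_index):
        batch_offset = batch_index * kBatchSize
        batch_end = min(data_size, batch_offset + kBatchSize)
        return batch_offset, batch_end

    print(batch_range(15))   # (960, 1000): the short final batch

    # Progress is floored to the nearest 0.25%: multiply by 400, integer-divide
    # by the batch count, then divide by 4.0.
    batches_completed = 7
    print((400 * batches_completed // batch_count) / 4.0)      # 43.75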
@@ -262,54 +334,175 @@ static CVPixelBufferRef flex_image_to_CVPixelBuffer(const flex_image image) { } gl_sarray -mlmodel_image_feature_extractor::extract_features(gl_sarray data) const { - const neural_network_model_details& model_info = get_model_info(m_impl->name); +mlmodel_image_feature_extractor::extract_features(gl_sarray data, bool verbose, size_t kBatchSize) const { ASSERT_EQ((int)data.dtype(), (int)flex_type_enum::IMAGE); + ASSERT_TRUE(kBatchSize >= 1); + + const neural_network_model_details& model_info = get_model_info(m_impl->name); + + BOOL use_only_cpu = (turi::fileio::NUM_GPUS == 0); std::vector result(data.size()); - @autoreleasepool { - NSError* error = nil; - for(size_t i = 0; i < data.size(); i++) { + + mutex mut; + + timer tt; + tt.start(); + table_printer table( + { {"Images Processed", 0}, {"Elapsed Time", 0}, {"Percent Complete", 0} }, 0); + if (verbose) { + logprogress_stream << "Analyzing and extracting image features." << std::endl; + table.print_header(); + } + + // Lambda converting one flex_image from `data` into a MLFeatureProvider to + // feed into the CoreML model. Must be called inside an autorelease pool. + auto convert_image_to_feature_provider = [&](size_t i) { flexible_type decoded_image = image_util::resize_image(data[i], model_info.input_width, model_info.input_height, 3, true); const flex_image& image = decoded_image.get(); - CVPixelBufferRef buffer = flex_image_to_CVPixelBuffer(image); - + CVPixelBufferRef buffer = create_pixel_buffer_from_flex_image(image); MLFeatureValue* image_feature = [MLFeatureValue featureValueWithPixelBuffer:buffer]; - NSString* input_name = [NSString stringWithUTF8String: model_info.input_name.c_str()]; - MLDictionaryFeatureProvider *input = [[MLDictionaryFeatureProvider alloc] initWithDictionary:@{input_name: image_feature} error:&error]; - checkNSError(error); - id model_prediction = [m_impl->model predictionFromFeatures:input error:&error]; - checkNSError(error); + CFRelease(buffer); + NSString* input_name = [NSString stringWithUTF8String: model_info.input_name.c_str()]; + NSError *error = nil; + MLDictionaryFeatureProvider *input = [[[MLDictionaryFeatureProvider alloc] initWithDictionary:@{input_name: image_feature} error:&error] autorelease]; + checkNSError(error); // Can throw, must autorelease before here. + return input; + }; + + // Lambda converting one MLFeatureProvider output from the CoreML model into + // a flex_vec value, written to `result[i]`. Must be called inside an + // autorelease pool. 
+ auto set_output_vector = + [&](size_t i, id model_prediction) { MLFeatureValue* deep_features = [model_prediction featureValueForName: [NSString stringWithUTF8String: model_info.feature_layer_output_name.c_str()]]; MLMultiArray* deep_features_values = [deep_features multiArrayValue]; // Santiy check prediction shape NSArray * shape = [deep_features_values shape]; - ASSERT_EQ(shape.count, (unsigned long)5); - ASSERT_EQ(shape[0].intValue, 1); - ASSERT_EQ(shape[1].intValue, 1); - ASSERT_EQ(shape[2].intValue, model_info.feature_layer_size); - ASSERT_EQ(shape[3].intValue, 1); - ASSERT_EQ(shape[4].intValue, 1); + size_t feature_dim = -1; + if(m_impl->name != "VisionFeaturePrint_Screen") { + ASSERT_EQ(shape.count, (unsigned long)5); + ASSERT_EQ(shape[0].intValue, 1); + ASSERT_EQ(shape[1].intValue, 1); + ASSERT_EQ(shape[2].intValue, model_info.feature_layer_size); + ASSERT_EQ(shape[3].intValue, 1); + ASSERT_EQ(shape[4].intValue, 1); + feature_dim = 2; + } else { + ASSERT_EQ(shape.count, (unsigned long)1); + ASSERT_EQ(shape[0].intValue, model_info.feature_layer_size); + feature_dim = 0; + } // Copy deep features to a flexible type vector - size_t deep_feature_length = shape[2].intValue; - size_t stride = deep_features_values.strides[2].intValue; + size_t deep_feature_length = shape[feature_dim].intValue; + size_t stride = deep_features_values.strides[feature_dim].intValue; flex_vec dest(deep_feature_length); double *srcPtr = (double *) deep_features_values.dataPointer; for(size_t j = 0; j < deep_feature_length; j++) { size_t offset = j * stride; dest[j] = srcPtr[offset]; } - result[i] = dest; + result[i] = std::move(dest); + }; + + // Lambda performing feature extraction on one batch of images, writing the + // output into `results`. Must be called inside an autorelease pool. + auto perform_batch = [&](size_t batch_index) { + const size_t batch_offset = batch_index * kBatchSize; + const size_t batch_end = std::min(data.size(), batch_offset + kBatchSize); + const size_t batch_size = batch_end - batch_offset; + + // Create the batch input for the CoreML model. + NSMutableArray> *inputs = + [NSMutableArray arrayWithCapacity:batch_size]; + for (size_t i = 0; i < batch_size; ++i) { + [inputs addObject: convert_image_to_feature_provider(batch_offset + i)]; + } + NSMutableArray> *outputs = + [NSMutableArray arrayWithCapacity:batch_size]; + + // The CoreML batch API only exists if the base SDK is new enough. +#ifdef HAS_CORE_ML_BATCH_INFERENCE + // Even when compiled with a new enough SDK, guard against older deployment + // targets at runtime. + if (@available(macOS 10.14, *)) { + // Invoke CoreML using the batch inference API for better performance. + MLArrayBatchProvider *image_batch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray: inputs]; + MLPredictionOptions* options = [[MLPredictionOptions alloc] init]; + [options setUsesCPUOnly:use_only_cpu]; + NSError *error = nil; + id features_batch = [m_impl->model predictionsFromBatch:image_batch options:options error:&error]; + [options release]; + [image_batch release]; + checkNSError(error); + + for (NSInteger i = 0; i < features_batch.featureProviderCount; ++i) { + [outputs addObject:[features_batch featureProviderAtIndex:i]]; + } + } else { +#else + { +#endif + // Once it's our turn to use CoreML, don't let any other threads in until + // we're done and ready to move on to the CPU-bound phase of processing. + std::lock_guard lock(mut); + for (size_t i = 0; i < batch_size; ++i) { + // Invoke the CoreML model. 
+ NSError *error = nil; + MLPredictionOptions* options = [[MLPredictionOptions alloc] init]; + [options setUsesCPUOnly:use_only_cpu]; + id features = [m_impl->model predictionFromFeatures:inputs[i] options:options error:&error]; + [options release]; + checkNSError(error); + + // Just collect the outputs for now. Delay any copying until after we + // release the mutex. + [outputs addObject:features]; + } + } + // Convert/copy the output of the CoreML model. + for (size_t i = 0; i < batch_size; ++i) { + set_output_vector(batch_offset + i, outputs[i]); + } + }; + + // Submit batches in parallel, one for each CPU core, so that: + // - CoreML is busy all the time, assuming each core can prepare a batch + // faster than CoreML can evaluate the other n - 1 batches. + // - Every core is busy, except when there is a backlog of batches. + // - The number of batches in flight is bounded (by the number of cores). + // - The worker threads do not contend or synchronize with one another, except + // within CoreML and when joining at the very end. + std::atomic batches_completed(0); + const size_t batch_count = (data.size() + kBatchSize - 1) / kBatchSize; + parallel_for(0, batch_count, [&](size_t batch_index) { + @autoreleasepool { + + if (verbose) { + std::ostringstream d; + // For pretty printing, floor percent done + // resolution to the nearest .25% interval. Do this by multiplying by + // 400, then do integer division by the total size, then float divide + // by 4.0 + d << (double(size_t(400 * batches_completed) / batch_count) / 4.0) << '%'; + table.print_progress_row(batches_completed, batches_completed * kBatchSize, + progress_time(tt), d.str()); + } + + perform_batch(batch_index); + batches_completed++; + } // end autoreleasepool + }); + + if (verbose) { + table.print_footer(); } - - } // end autoreleasepool - return gl_sarray(result, flex_type_enum::VECTOR); } -} // image_deep_feature_extractor +} // namespace image_deep_feature_extractor } // namespace turi diff --git a/src/unity/extensions/additional_sframe_utilities.cpp b/src/unity/extensions/additional_sframe_utilities.cpp index 4b11934e9b..d7419e3f84 100644 --- a/src/unity/extensions/additional_sframe_utilities.cpp +++ b/src/unity/extensions/additional_sframe_utilities.cpp @@ -5,9 +5,11 @@ */ #include #include +#include #include #include #include +#include #include #include #include @@ -17,60 +19,57 @@ using namespace turi; template -void copy_image_to_memory(const image_type& img, T *outptr, +void copy_image_to_memory(const image_type& input, T *outptr, const std::vector& outstrides, + const std::vector& outshape, bool channel_last) { ASSERT_EQ(outstrides.size(), 3); - size_t index_h, index_w, index_c; + ASSERT_EQ(outshape.size(), 3); + size_t stride_h, stride_w, stride_c; + size_t height, width, channels; if (channel_last) { // Format: HWC - index_h = 0; - index_w = 1; - index_c = 2; + stride_h = outstrides[0]; + stride_w = outstrides[1]; + stride_c = outstrides[2]; + height = outshape[0]; + width = outshape[1]; + channels = outshape[2]; } else { // Format: CHW - index_c = 0; - index_h = 1; - index_w = 2; + stride_c = outstrides[0]; + stride_h = outstrides[1]; + stride_w = outstrides[2]; + channels = outshape[0]; + height = outshape[1]; + width = outshape[2]; } - // Decode if needed - if (!img.is_decoded()) { - char* buf = NULL; - size_t length = 0; - if (img.m_format == Format::JPG) { - decode_jpeg((const char*)img.get_image_data(), img.m_image_data_size, &buf, length); - } else if (img.m_format == Format::PNG) { - decode_png((const 
char*)img.get_image_data(), img.m_image_data_size, &buf, length); - } else { - ASSERT_MSG(false, "Unsupported image format"); - } - size_t cnt = 0; - for (size_t i = 0; i < img.m_height; ++i) { - for (size_t j = 0; j < img.m_width; ++j) { - for (size_t k = 0; k < img.m_channels; ++k) { - outptr[i * outstrides[index_h] + j * outstrides[index_w] + k * outstrides[index_c]] = static_cast(buf[cnt++]); - } - } - } - delete[] buf; - } else { - size_t cnt = 0; - const unsigned char* raw_data = img.get_image_data(); - for (size_t i = 0; i < img.m_height; ++i) { - for (size_t j = 0; j < img.m_width; ++j) { - for (size_t k = 0; k < img.m_channels; ++k) { - outptr[i * outstrides[index_h] + j * outstrides[index_w] + k * outstrides[index_c]] = static_cast(raw_data[cnt++]); - } + // Resize. + flexible_type resized = image_util::resize_image(input, width, height, + channels, /* decode */ true); + const image_type& img = resized.get(); + + // Copy. + size_t cnt = 0; + const unsigned char* raw_data = img.get_image_data(); + for (size_t i = 0; i < img.m_height; ++i) { + for (size_t j = 0; j < img.m_width; ++j) { + for (size_t k = 0; k < img.m_channels; ++k) { + outptr[i * stride_h + j * stride_w + k * stride_c] = + static_cast(raw_data[cnt++]); } } } + + // Further optimization is possible (but not trivial) by combining the resize + // operation and the copy operation, removing an intermediate buffer. } void copy_to_memory(const sframe_rows::row& data, float* outptr, const std::vector& outstrides, - const std::vector& field_length_p) { + const std::vector& outshape) { ASSERT_GE(data.size(), 1); for (size_t i = 0; i < data.size(); ++i) { @@ -82,7 +81,7 @@ void copy_to_memory(const sframe_rows::row& data, if (type == flex_type_enum::IMAGE) { ASSERT_MSG(data.size() == 1, "Image data only support one input field"); const image_type& img = data[0].get(); - copy_image_to_memory(img, outptr, outstrides, false); + copy_image_to_memory(img, outptr, outstrides, outshape, false); return; } else if (data.size() == 1 && (type == flex_type_enum::FLOAT || type == flex_type_enum::INTEGER)) { // Case 2: Single value type (should really get rid of this special case) @@ -91,21 +90,20 @@ void copy_to_memory(const sframe_rows::row& data, return; } else if (data.size() == 1 && type == flex_type_enum::LIST) { // Case 3: 2D arrays: list of vectors or list of lists of values - // field_length defines shape of the 2d array - ASSERT_EQ(field_length_p.size(), 2); + ASSERT_EQ(outshape.size(), 2); const flex_list& dim0_lst = data[0].to(); - ASSERT_EQ(dim0_lst.size(), field_length_p[0]); + ASSERT_EQ(dim0_lst.size(), outshape[0]); for (size_t i = 0; i < dim0_lst.size(); ++i) { auto dim1_type = dim0_lst[i].get_type(); if (dim1_type == flex_type_enum::VECTOR) { const flex_vec& dim1_vec = dim0_lst[i].to(); - ASSERT_EQ(dim1_vec.size(), field_length_p[1]); + ASSERT_EQ(dim1_vec.size(), outshape[1]); for (size_t j = 0; j < dim1_vec.size(); ++j) { outptr[outstrides[0] * i + outstrides[1] * j] = (float)(dim1_vec[j]); } } else if (dim1_type == flex_type_enum::LIST) { const flex_list& dim1_lst = dim0_lst[i].to(); - ASSERT_EQ(dim1_lst.size(), field_length_p[1]); + ASSERT_EQ(dim1_lst.size(), outshape[1]); for (size_t j = 0; j < dim1_lst.size(); ++j) { auto value_type = dim1_lst[j].get_type(); if (value_type == flex_type_enum::INTEGER || @@ -122,12 +120,12 @@ void copy_to_memory(const sframe_rows::row& data, } else { // Case 4: Array type or mixed types ASSERT_EQ(outstrides.size(), 1); + ASSERT_EQ(outshape.size(), 1); size_t pos = 0; for (size_t i = 0; 
i < data.size(); ++i) { auto type = data[i].get_type(); if (type == flex_type_enum::VECTOR) { const flex_vec& v = data[i].to(); - ASSERT_EQ(v.size(), field_length_p[i]); for (size_t j = 0; j < v.size(); ++j) { outptr[outstrides[0] * pos] = (float)(v[j]); ++pos; @@ -140,13 +138,14 @@ void copy_to_memory(const sframe_rows::row& data, ASSERT_MSG(false, "Unsupported type"); } } + ASSERT_EQ(pos, outshape[0]); } return; } void sframe_load_to_numpy(turi::gl_sframe input, size_t outptr_addr, - std::vector outstrides, - std::vector field_length, - size_t begin, size_t end) { + std::vector outstrides, + std::vector outshape, + size_t begin, size_t end) { if (!input.is_materialized()) { input.materialize(); } @@ -154,27 +153,41 @@ void sframe_load_to_numpy(turi::gl_sframe input, size_t outptr_addr, ASSERT_MSG(input.num_columns() > 0, "SFrame has no column"); float* outptr = reinterpret_cast(outptr_addr); + ASSERT_EQ(outstrides.size(), outshape.size()); ASSERT_GE(outstrides.size(), 1); for (size_t& stride: outstrides) { stride /= sizeof(float); } + // we consume the first index. copy_to_memory takes the rest - std::vector descendent_strides(outstrides.begin() + 1, outstrides.end()); - for (const auto& row : input.range_iterator(begin, end)) { - copy_to_memory(row, outptr, descendent_strides, field_length); - outptr += outstrides[0]; - } + size_t row_stride = outstrides[0]; + outstrides.erase(outstrides.begin()); + outshape.erase(outshape.begin()); + + const size_t num_rows = end - begin; + in_parallel([&](size_t worker_idx, size_t num_workers) { + // Compute the input range and output address for this thread. + size_t worker_begin = begin + num_rows * worker_idx / num_workers; + size_t worker_end = begin + num_rows * (worker_idx + 1) / num_workers; + float* worker_out = outptr + row_stride * (worker_begin - begin); + + for (const auto& row : input.range_iterator(worker_begin, worker_end)) { + copy_to_memory(row, worker_out, outstrides, outshape); + worker_out += row_stride; + } + }); } // Loads image into row-major array with shape HWC (height, width, channel) void image_load_to_numpy(const image_type& img, size_t outptr_addr, const std::vector& outstrides) { unsigned char *outptr = reinterpret_cast(outptr_addr); - copy_image_to_memory(img, outptr, outstrides, true); + copy_image_to_memory(img, outptr, outstrides, + {img.m_height, img.m_width, img.m_channels}, true); } BEGIN_FUNCTION_REGISTRATION -REGISTER_FUNCTION(sframe_load_to_numpy, "input", "outptr_addr", "outstrides", "field_length", "begin", "end"); +REGISTER_FUNCTION(sframe_load_to_numpy, "input", "outptr_addr", "outstrides", "outshape", "begin", "end"); REGISTER_FUNCTION(image_load_to_numpy, "img", "outptr_addr", "outstrides"); END_FUNCTION_REGISTRATION diff --git a/src/unity/lib/image_util.cpp b/src/unity/lib/image_util.cpp index 88df99fbd6..f9a8773f26 100644 --- a/src/unity/lib/image_util.cpp +++ b/src/unity/lib/image_util.cpp @@ -220,13 +220,11 @@ std::vector get_directory_files(std::string url, bool recursive) { path_status_vec_t path_status_vec = fileio::get_directory_listing(url); std::vector ret; for (const auto& path_status : path_status_vec) { - if (path_status.first[0] != '.') { - if (recursive && path_status.second == fileio::file_status::DIRECTORY) { - auto tmp = get_directory_files(path_status.first, recursive); - ret.insert(ret.end(), tmp.begin(), tmp.end()); - } else if (path_status.second == fileio::file_status::REGULAR_FILE){ - ret.push_back(path_status.first); - } + if (recursive && path_status.second == 
fileio::file_status::DIRECTORY) { + auto tmp = get_directory_files(path_status.first, recursive); + ret.insert(ret.end(), tmp.begin(), tmp.end()); + } else if (path_status.second == fileio::file_status::REGULAR_FILE){ + ret.push_back(path_status.first); } } return ret; @@ -340,25 +338,19 @@ flexible_type resize_image(const flexible_type& input, size_t resized_width, std::string error = "Cannot resize non-image type"; log_and_throw(error); } - const flex_image& src_image = image.get(); - // is this resize a no opt? - if (src_image.m_width == resized_width && src_image.m_height == resized_height && src_image.m_channels == resized_channels && src_image.is_decoded() == decode) { - return image; + flex_image image = input.get(); + auto has_desired_size = [&] { + return image.m_width == resized_width && image.m_height == resized_height && image.m_channels == resized_channels; + }; + + // Is this resize a no-op? + if (has_desired_size() && image.is_decoded() == decode) { + return input; } - char* resized_data; - if (src_image.is_decoded()) { - // skip decoding - image_util_detail::resize_image_impl((const char*)src_image.get_image_data(), - src_image.m_width, src_image.m_height, src_image.m_channels, resized_width, - resized_height, resized_channels, &resized_data); - } else { - // make a copy and decode - flexible_type tmp = image; - flex_image& decoded_image = tmp.mutable_get(); - image_util_detail::decode_image_impl(decoded_image); - image_util_detail::resize_image_impl((const char*)decoded_image.get_image_data(), - decoded_image.m_width, decoded_image.m_height, decoded_image.m_channels, resized_width, - resized_height, resized_channels, &resized_data); + + // Decode if necessary. + if (!image.is_decoded()) { + image_util_detail::decode_image_impl(image); } // Resize if necessary. @@ -379,9 +371,10 @@ flexible_type resize_image(const flexible_type& input, size_t resized_width, // Encode if necessary. 
if (!decode) { - image_util_detail::encode_image_impl(dst_img); + image_util_detail::encode_image_impl(image); } - return dst_img; + + return image; }; diff --git a/src/unity/lib/unity_sarray.cpp b/src/unity/lib/unity_sarray.cpp index f33404b140..30b3c67cc0 100644 --- a/src/unity/lib/unity_sarray.cpp +++ b/src/unity/lib/unity_sarray.cpp @@ -2902,7 +2902,6 @@ std::shared_ptr unity_sarray::plot(const std::string& path_to_client using namespace turi; using namespace turi::visualization; - logprogress_stream << "Materializing SArray" << std::endl; this->materialize(); if (this->size() == 0) { diff --git a/src/unity/lib/version_number.hpp b/src/unity/lib/version_number.hpp index a45b8d99b6..9ec8e4f2be 100644 --- a/src/unity/lib/version_number.hpp +++ b/src/unity/lib/version_number.hpp @@ -3,4 +3,4 @@ * Use of this source code is governed by a BSD-3-clause license that can * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause */ -#define __UNITY_VERSION__ "4.3a1"//#{{VERSION_STRING}} +#define __UNITY_VERSION__ "5.0b1"//#{{VERSION_STRING}} diff --git a/src/unity/python/setup.py b/src/unity/python/setup.py index 497f4be888..c2dd7911a6 100644 --- a/src/unity/python/setup.py +++ b/src/unity/python/setup.py @@ -13,7 +13,7 @@ from setuptools.command.install import install PACKAGE_NAME="turicreate" -VERSION='4.3.2'#{{VERSION_STRING}} +VERSION='5.0b1'#{{VERSION_STRING}} # Prevent distutils from thinking we are a pure python package class BinaryDistribution(Distribution): @@ -178,8 +178,8 @@ def run(self): "decorator >= 4.0.9", "prettytable == 0.7.2", "requests >= 2.9.1", - "mxnet >= 0.11, < 1.2.0", - "coremltools == 0.8", + "mxnet >= 1.1.0, < 1.2.0", + "coremltools == 2.0b1", "pillow >= 3.3.0", "pandas >= 0.19.0", "numpy" diff --git a/src/unity/python/turicreate/CMakeLists.txt b/src/unity/python/turicreate/CMakeLists.txt index da460708db..172ac803c7 100644 --- a/src/unity/python/turicreate/CMakeLists.txt +++ b/src/unity/python/turicreate/CMakeLists.txt @@ -43,6 +43,7 @@ ADD_CUSTOM_COMMAND( ) add_subdirectory(cython) +add_subdirectory(toolkits) set_property(DIRECTORY PROPERTY INSTALLATION_EXTENSIONS "${INSTALLATION_EXTENSIONS}") set_property(DIRECTORY PROPERTY INSTALLATION_BINARY_TARGETS "${INSTALLATION_BINARY_TARGETS}") @@ -59,11 +60,6 @@ if (APPLE) "${CMAKE_BINARY_DIR}/src/visualization/${CMAKE_BUILD_TYPE}/Turi Create Visualization.app" ) add_dependencies(visualization_client_app visualization_client) - add_custom_target( - DEPENDS unity_toolkits - COMMENT "cp tcmps dylib" - COMMAND cp -a "${CMAKE_BINARY_DIR}/src/unity/toolkits/tcmps/${CMAKE_BUILD_TYPE}/libtcmps.dylib" ${CMAKE_CURRENT_BINARY_DIR} - ) elseif(LINUX) add_custom_target( visualization_client_app ALL diff --git a/src/unity/python/turicreate/__init__.py b/src/unity/python/turicreate/__init__.py index a47e452472..c51dcdc4a0 100644 --- a/src/unity/python/turicreate/__init__.py +++ b/src/unity/python/turicreate/__init__.py @@ -79,6 +79,8 @@ import turicreate.toolkits.style_transfer as style_transfer import turicreate.toolkits.activity_classifier as activity_classifier +from turicreate.toolkits.image_analysis.image_analysis import load_images + from turicreate.toolkits import evaluation # internal util diff --git a/src/unity/python/turicreate/cython/cy_server.pyx b/src/unity/python/turicreate/cython/cy_server.pyx index f3feb9dc04..570c5a0c7f 100644 --- a/src/unity/python/turicreate/cython/cy_server.pyx +++ b/src/unity/python/turicreate/cython/cy_server.pyx @@ -63,6 +63,11 @@ class GraphLabServer(object): """ Return 
the logger object. """ raise NotImplementedError + def log_progress_enabled(self): + """ Return True if progress is enabled else False. """ + raise NotImplementedError + + cdef void print_status(const string& status_string) nogil: with gil: print_callback(cpp_to_str(status_string).rstrip()) @@ -83,6 +88,7 @@ class EmbeddedServer(GraphLabServer): root_path = os.path.abspath(os.path.join(root_path, os.pardir)) # sframe/ self.root_path = root_path self.started = False + self._log_progress_enabled = False if not self.unity_log: self.unity_log = default_local_conf.get_unity_log() @@ -122,11 +128,17 @@ class EmbeddedServer(GraphLabServer): def get_logger(self): return self.logger + def log_progress_enabled(self): + """ Return True if progress is enabled else False. """ + raise NotImplementedError + def set_log_progress(self, enable): if enable: set_log_progress_callback(print_status) + self._log_progress_enabled = True else: set_log_progress(False) + self._log_progress_enabled = False class QuietProgress(object): """ @@ -136,7 +148,10 @@ class QuietProgress(object): def __init__(self, verbose): self.verbose = verbose def __enter__(self): + server = _connect.main.get_server() + self.log_progress_enabled = server.log_progress_enabled if not self.verbose: - _connect.main.get_server().set_log_progress(False) + server.set_log_progress(False) + def __exit__(self, type, value, traceback): - _connect.main.get_server().set_log_progress(False) + _connect.main.get_server().set_log_progress(self.log_progress_enabled) diff --git a/src/unity/python/turicreate/mx/_mx_sframe_iter.py b/src/unity/python/turicreate/mx/_mx_sframe_iter.py index 8879ead946..bdb8a1ce3e 100644 --- a/src/unity/python/turicreate/mx/_mx_sframe_iter.py +++ b/src/unity/python/turicreate/mx/_mx_sframe_iter.py @@ -23,15 +23,15 @@ from turicreate import extensions as sf_extension -def _copy_from_sframe(sf, buf, start, end, field_length, bias=0): +def _copy_from_sframe(sf, buf, start, end, shape, bias=0): assert isinstance(sf, SFrame) - sf_extension.sframe_load_to_numpy(sf, buf.ctypes.data + buf.strides[0] * bias, buf.strides, field_length, start, end) + sf_extension.sframe_load_to_numpy(sf, buf.ctypes.data + buf.strides[0] * bias, buf.strides, shape, start, end) -def _copy_from_sarray(sa, buf, start, end, field_length, bias=0): +def _copy_from_sarray(sa, buf, start, end, shape, bias=0): assert isinstance(sa, SArray) sf = SFrame({'__tmp__': sa}) - _copy_from_sframe(sf, buf, start, end, [field_length], bias) + _copy_from_sframe(sf, buf, start, end, shape, bias) def _init_data(data, allow_empty, default_name): @@ -115,12 +115,10 @@ def __init__(self, sframe, data_field, label_field=None, batch_size=1, data_name self.label_sframe = sframe[label_field] # allocate ndarray - inferred_shape = self.infer_shape() - data_shape = list(inferred_shape["final_shape"]) + data_shape = list(self.infer_shape()) data_shape.insert(0, batch_size) self.data_shape = tuple(data_shape) self.label_shape = (batch_size, ) - self.field_length = inferred_shape["field_length"] self.data_ndarray = np.zeros(self.data_shape, dtype=np.float32) self.label_ndarray = np.zeros(self.label_shape, dtype=np.float32) self.data_mx_ndarray = None @@ -176,7 +174,7 @@ def _infer_column_shape(self, sarray): return (first_image.channels, first_image.height, first_image.width) def infer_shape(self): - ret = {"field_length": [], "final_shape": None} + ret = None features = self.data_sframe.column_names() assert len(features) > 0 if len(features) > 1: @@ -187,22 +185,16 @@ def infer_shape(self): 
if len(colshape) != 1: raise ValueError('Only one column is allowed if input is image typed') shape += colshape[0] - ret["field_length"].append(colshape[0]) - ret["final_shape"] = (shape,) + ret = (shape,) else: - col_shape = self._infer_column_shape(self.data_sframe[features[0]]) - ret["final_shape"] = col_shape - length = 1 - for x in col_shape: - length = length * x - ret["field_length"].append(length) + ret = self._infer_column_shape(self.data_sframe[features[0]]) return ret def _copy(self, start, end, bias=0): - _copy_from_sframe(self.data_sframe, self.data_ndarray, start, end, self.field_length, bias) + _copy_from_sframe(self.data_sframe, self.data_ndarray, start, end, self.data_shape, bias) self.data_mx_ndarray = None if self.label_field is not None: - _copy_from_sarray(self.label_sframe, self.label_ndarray, start, end, 1, bias) + _copy_from_sarray(self.label_sframe, self.label_ndarray, start, end, (self.batch_size, 1), bias) self.label_mx_ndarray = None def iter_next(self): @@ -256,6 +248,8 @@ class SFrameImageIter(SFrameIter): label field in SFrame batch_size : int, optional batch size + image_shape : tuple, optional + if specified, each image will be resized to this (channel, height, width) mean_r : float, optional normalize the image by subtracting the mean value of r channel, or the first channel for mean_g : float, optional @@ -284,10 +278,11 @@ class SFrameImageIter(SFrameIter): Notes ----- - - Image column must contain images of the same size. + - Image column must contain images of the same size if image_shape is not provided. """ def __init__(self, sframe, data_field, label_field=None, batch_size=1, + image_shape=None, data_name='data', label_name='softmax_label', mean_r=0.0, mean_g=0.0, @@ -296,6 +291,10 @@ def __init__(self, sframe, data_field, label_field=None, batch_size=1, scale=1.0, random_flip=False, **kwargs): + if image_shape is not None and len(image_shape) != 3: + raise ValueError('image_shape must be a (channels, height, width) tuple') + self.image_shape = image_shape + super(SFrameImageIter, self).__init__(sframe, data_field, label_field, batch_size, data_name, label_name) @@ -340,6 +339,9 @@ def _infer_column_shape(self, sarray): if not dtype is Image: raise TypeError('Data column must be image type') + if self.image_shape is not None: + return self.image_shape + first_image = sarray.head(1)[0] if first_image is None: raise ValueError('Column cannot contain missing value') diff --git a/src/unity/python/turicreate/test/test_image_classifier.py b/src/unity/python/turicreate/test/test_image_classifier.py index 1728f74ba2..11ee346114 100644 --- a/src/unity/python/turicreate/test/test_image_classifier.py +++ b/src/unity/python/turicreate/test/test_image_classifier.py @@ -18,10 +18,12 @@ import coremltools import platform -def _get_data(num_examples = 100): +def _get_data(num_examples = 100, label_type = int): from PIL import Image as _PIL_Image import numpy as np + assert(label_type in [str, int]) + rs = np.random.RandomState(1234) _format = {'JPG': 0, 'PNG': 1, 'RAW': 2, 'UNDEFINED': 3} @@ -47,12 +49,20 @@ def from_pil_image(pil_img): return img images = [] - random_labels = [rs.randint(0,5) for i in range(num_examples)] + if label_type == int: + random_labels = [rs.randint(0,5) for _ in range(num_examples)] + else: + random_labels = [rs.choice(['a', 'b', 'c', 'd', 'e']) for _ in range(num_examples)] for i in range(num_examples): img_shape = tuple(rs.randint(100, 1000, size=2)) + (3,) img = rs.randint(255, size=img_shape) - label = random_labels[i] + # Give a 
slight color hint about the label + if label_type == int: + label = int(random_labels[i]) + else: + label = ord(random_labels[i]) - ord('a') + img = (img + [label * 3, 0, -label * 3]).clip(0, 255) pil_img = _PIL_Image.fromarray(img, mode='RGB') images.append(from_pil_image(pil_img)) @@ -63,14 +73,15 @@ def from_pil_image(pil_img): class ImageClassifierTest(unittest.TestCase): @classmethod - def setUpClass(self, model='resnet-50', input_image_shape=(3, 224, 224), tol=0.02, num_examples = 100): + def setUpClass(self, model = 'resnet-50', input_image_shape = (3, 224, 224), tol=0.02, + num_examples = 100, label_type = int): self.feature = 'awesome_image' self.target = 'awesome_label' self.input_image_shape = input_image_shape self.pre_trained_model = model self.tolerance = tol - self.sf = _get_data(num_examples) + self.sf = _get_data(num_examples = num_examples, label_type = label_type) self.model = tc.image_classifier.create(self.sf, target=self.target, model=self.pre_trained_model, seed=42) @@ -161,8 +172,8 @@ def test_export_coreml_with_predict(self): coreml_model = coremltools.models.MLModel(filename) img = self.sf[0:1][self.feature][0] img_fixed = tc.image_analysis.resize(img, *reversed(self.input_image_shape)) - import PIL - pil_img = PIL.Image.fromarray(img_fixed.pixel_data) + from PIL import Image + pil_img = Image.fromarray(img_fixed.pixel_data) if _mac_ver() >= (10, 13): classes = self.model.classifier.classes @@ -238,6 +249,16 @@ def setUpClass(self): input_image_shape=(3, 227, 227), tol=0.005, num_examples = 200) +# TODO: if on skip OS, test negative case +@unittest.skipIf(_mac_ver() < (10,14), 'VisionFeaturePrint_Screen only supported on macOS 10.14+') +class VisionFeaturePrintScreenTest(ImageClassifierTest): + @classmethod + def setUpClass(self): + super(VisionFeaturePrintScreenTest, self).setUpClass(model='VisionFeaturePrint_Screen', + input_image_shape=(3, 299, 299), + tol=0.005, num_examples = 100, + label_type = str) + @unittest.skipIf(tc.util._num_available_cuda_gpus() == 0, 'Requires CUDA GPU') @pytest.mark.gpu diff --git a/src/unity/python/turicreate/test/test_image_similarity.py b/src/unity/python/turicreate/test/test_image_similarity.py index 1d23b3c409..869635a90c 100644 --- a/src/unity/python/turicreate/test/test_image_similarity.py +++ b/src/unity/python/turicreate/test/test_image_similarity.py @@ -17,7 +17,7 @@ from turicreate.toolkits._main import ToolkitError as _ToolkitError -def _get_data(): +def _get_data(image_length): from PIL import Image as _PIL_Image random = np.random.RandomState(100) @@ -45,7 +45,7 @@ def from_pil_image(pil_img): return img num_examples = 100 - dims = (224, 224) + dims = (image_length, image_length) total_dims = dims[0] * dims[1] images = [] for i in range(num_examples): @@ -64,13 +64,13 @@ def rand_image(): class ImageSimilarityTest(unittest.TestCase): @classmethod - def setUpClass(self, model = 'resnet-50'): + def setUpClass(self, input_image_shape = (3,224,224), model = 'resnet-50'): """ The setup class method for the basic test case with all default values. 
""" self.feature = 'awesome_image' self.label = None - self.input_image_shape = (3, 224, 224) + self.input_image_shape = input_image_shape self.pre_trained_model = model ## Create the model @@ -80,7 +80,7 @@ def setUpClass(self, model = 'resnet-50'): } # Model - self.sf = _get_data() + self.sf = _get_data(self.input_image_shape[2]) self.model = tc.image_similarity.create(self.sf, feature=self.feature, label=None, model=self.pre_trained_model) self.nn_model = self.model.feature_extractor @@ -197,7 +197,7 @@ def test_export_coreml(self): # Compare distances coreml_distances = np.array(sorted(coreml_ret['distance'])) tc_distances = tc_ret['distance'].to_numpy() - self.assertListAlmostEquals(tc_distances, coreml_distances, 0.02) + self.assertListAlmostEquals(tc_distances, coreml_distances, 0.025) def test_save_and_load(self): with test_util.TempDirectory() as filename: @@ -218,6 +218,11 @@ def test_save_and_load(self): self.test_export_coreml() print("Export coreml passed") +class ImageSimilaritySqueezeNetTest(ImageSimilarityTest): + @classmethod + def setUpClass(self): + super(ImageSimilaritySqueezeNetTest, self).setUpClass(model='squeezenet_v1.1', + input_image_shape=(3, 227, 227)) @unittest.skipIf(tc.util._num_available_cuda_gpus() == 0, 'Requires CUDA GPU') @pytest.mark.gpu diff --git a/src/unity/python/turicreate/test/test_object_detector.py b/src/unity/python/turicreate/test/test_object_detector.py index fa4812b28b..1c1d926f82 100644 --- a/src/unity/python/turicreate/test/test_object_detector.py +++ b/src/unity/python/turicreate/test/test_object_detector.py @@ -290,6 +290,7 @@ def test_export_coreml(self): @unittest.skipIf(_mac_ver() < (10, 14), "Non-maximum suppression is only supported on MacOS 10.14+.") def test_export_coreml_with_non_maximum_suppression(self): + from PIL import Image filename = tempfile.mkstemp('bingo.mlmodel')[1] self.model.export_coreml(filename, include_non_maximum_suppression=True) @@ -317,8 +318,6 @@ def test_export_coreml_with_non_maximum_suppression(self): model2 = tc.object_detector.create(sf, max_iterations=1) model2.export_coreml(filename2, include_non_maximum_suppression=True) - - @unittest.skipIf(sys.platform != 'darwin' or _mac_ver() >= (10, 14), "GPU selection should fail on macOS 10.13 or below") def test_no_gpu_support_on_unsupported_macos(self): diff --git a/src/unity/python/turicreate/test/test_style_transfer.py b/src/unity/python/turicreate/test/test_style_transfer.py index 8ea7e287d9..ec96f7196b 100644 --- a/src/unity/python/turicreate/test/test_style_transfer.py +++ b/src/unity/python/turicreate/test/test_style_transfer.py @@ -71,12 +71,13 @@ def setUpClass(self): self.pre_trained_model = 'resnet-16' ## Create the model # Model - self.style_sf = _get_data(feature=self.feature, num_examples=_NUM_STYLES) - self.content_sf = _get_data(feature=self.feature) + self.style_sf = _get_data(feature=self.style_feature, num_examples=_NUM_STYLES) + self.content_sf = _get_data(feature=self.content_feature) self.num_styles = _NUM_STYLES self.model = tc.style_transfer.create(self.style_sf, self.content_sf, - feature=self.feature, + style_feature=self.style_feature, + content_feature=self.content_feature, max_iterations=0, model=self.pre_trained_model) @@ -144,7 +145,13 @@ def test_stylize_success(self): # Check the structure of the output _raise_error_if_not_sframe(stylized_out) - self.assertEqual(len(stylized_out), len(sf)) + if style is None: + num_styles = self.num_styles + elif isinstance(style, list): + num_styles = len(style) + else: + num_styles = 1 + 
self.assertEqual(len(stylized_out), len(sf)*num_styles) # Check if input and output image have the same shape input_size = (sf[self.content_feature][0].width, sf[self.content_feature][0].height) @@ -199,8 +206,8 @@ def test_export_coreml(self): if _mac_ver() >= (10, 13): index_data = np.zeros(self.num_styles) index_data[0] = 1 - coreml_output = coreml_model.predict({self.feature: pil_img, 'index':index_data}, usesCPUOnly = True) - img = coreml_output[coreml_output.keys()[0]] + coreml_output = coreml_model.predict({self.content_feature: pil_img, 'index':index_data}, usesCPUOnly = True) + img = next(iter(coreml_output.values())) img = np.asarray(img) img = img[..., 0:3] @@ -215,14 +222,6 @@ def test_export_coreml(self): model2 = tc.style_transfer.create(self.style_sf, self.content_sf, max_iterations=1) model2.export_coreml(filename2) - @unittest.skipIf(sys.platform != 'darwin', 'Only supported on Mac') - def test_no_gpu_mac_support(self): - num_gpus = tc.config.get_num_gpus() - tc.config.set_num_gpus(1) - with self.assertRaises(_ToolkitError): - tc.style_transfer.create(self.style_sf, self.content_sf, max_iterations=1) - tc.config.set_num_gpus(num_gpus) - def test_repr(self): model = self.model self.assertEqual(type(str(model)), str) @@ -239,14 +238,14 @@ def test_save_and_load(self): print("Get styles passed") -@unittest.skipIf(tc.util._num_available_gpus() == 0, 'Requires GPU') +@unittest.skipIf(tc.util._num_available_cuda_gpus() == 0, 'Requires CUDA GPU') @pytest.mark.gpu class StyleTransferGPUTest(unittest.TestCase): @classmethod def setUpClass(self): - self.feature = 'image' - self.style_sf = _get_data(feature=self.feature) - self.content_sf = _get_data(feature=self.feature) + self.style_feature = self.content_feature = 'image' + self.style_sf = _get_data(feature=self.style_feature) + self.content_sf = _get_data(feature=self.content_feature) def test_gpu_save_load_export(self): diff --git a/src/unity/python/turicreate/toolkits/_image_feature_extractor.py b/src/unity/python/turicreate/toolkits/_image_feature_extractor.py index dbd220e941..cbc45df61c 100644 --- a/src/unity/python/turicreate/toolkits/_image_feature_extractor.py +++ b/src/unity/python/turicreate/toolkits/_image_feature_extractor.py @@ -28,12 +28,13 @@ def _create_feature_extractor(model_name): os.makedirs(download_path) if(model_name == 'resnet-50'): - # TODO: save converted model on developer.apple.com - from turicreate.toolkits import _pre_trained_models - mxnetResNet = _pre_trained_models.ResNetImageClassifier() - feature_extractor = MXFeatureExtractor(mxnetResNet) - mlModel = feature_extractor.get_coreml_model() - mlModel.save(download_path + "/Resnet50.mlmodel") + mlmodel_resnet_save_path = download_path + "/Resnet50.mlmodel" + if not os.path.exists(mlmodel_resnet_save_path): + from turicreate.toolkits import _pre_trained_models + mxnetResNet = _pre_trained_models.ResNetImageClassifier() + feature_extractor = MXFeatureExtractor(mxnetResNet) + mlModel = feature_extractor.get_coreml_model() + mlModel.save(mlmodel_resnet_save_path) result = extensions.__dict__["image_deep_feature_extractor"]() result.init_options({'model_name': model_name, 'download_path': download_path}) @@ -103,7 +104,7 @@ def _get_mx_module(mxmodel, data_layer, feature_layer, context, model.set_params(arg_params, aux_params) return model - def extract_features(self, dataset, feature, batch_size=512, verbose=False): + def extract_features(self, dataset, feature, batch_size=64, verbose=False): """ Parameters ---------- @@ -111,48 +112,74 @@ def 
extract_features(self, dataset, feature, batch_size=512, verbose=False): SFrame of images """ from ..mx import SFrameImageIter as _SFrameImageIter + from six.moves.queue import Queue as _Queue + from threading import Thread as _Thread import turicreate as _tc import array if len(dataset) == 0: return _tc.SArray([], array.array) - # Resize images if needed - preprocessed_dataset = _tc.SFrame() - if verbose: - print("Resizing images...") - preprocessed_dataset[feature] = _tc.image_analysis.resize( - dataset[feature], *tuple(reversed(self.image_shape))) - batch_size = min(len(dataset), batch_size) # Make a data iterator - dataIter = _SFrameImageIter(sframe=preprocessed_dataset, data_field=[feature], batch_size=batch_size) + dataIter = _SFrameImageIter(sframe=dataset, data_field=[feature], batch_size=batch_size, image_shape=self.image_shape) # Setup the MXNet model model = MXFeatureExtractor._get_mx_module(self.ptModel.mxmodel, self.data_layer, self.feature_layer, self.context, self.image_shape, batch_size) out = _tc.SArrayBuilder(dtype = array.array) - num_processed = 0 + progress = { 'num_processed' : 0, 'total' : len(dataset) } if verbose: print("Performing feature extraction on resized images...") - while dataIter.has_next: - if dataIter.data_shape[1:] != self.image_shape: - raise RuntimeError("Expected image of size %s. Got %s instead." % ( - self.image_shape, dataIter.data_shape[1:])) - model.forward(next(dataIter)) + + # Encapsulates the work done by the MXNet model for a single batch + def handle_request(batch): + model.forward(batch) mx_out = [array.array('d',m) for m in model.get_outputs()[0].asnumpy()] - if dataIter.getpad() != 0: + if batch.pad != 0: # If batch size is not evenly divisible by the length, it will loop back around. # We don't want that. - mx_out = mx_out[:-dataIter.getpad()] + mx_out = mx_out[:-batch.pad] + return mx_out + + # Copies the output from MXNet into the SArrayBuilder and emits progress + def consume_response(mx_out): out.append_multiple(mx_out) - num_processed += batch_size - num_processed = min(len(dataset), num_processed) + progress['num_processed'] += len(mx_out) if verbose: print('Completed {num_processed:{width}d}/{total:{width}d}'.format( - num_processed = num_processed, total=len(dataset), width = len(str(len(dataset))))) + width = len(str(progress['total'])), **progress)) + + # Create a dedicated thread for performing MXNet work, using two FIFO + # queues for communication back and forth with this thread, with the + # goal of keeping MXNet busy throughout. + request_queue = _Queue() + response_queue = _Queue() + def mx_worker(): + while True: + batch = request_queue.get() # Consume request + if batch is None: + # No more work remains. Allow the thread to finish. + return + response_queue.put(handle_request(batch)) # Produce response + mx_worker_thread = _Thread(target=mx_worker) + mx_worker_thread.start() + + try: + # Attempt to have two requests in progress at any one time (double + # buffering), so that the iterator is creating one batch while MXNet + # performs inference on the other. + if dataIter.has_next: + request_queue.put(next(dataIter)) # Produce request + while dataIter.has_next: + request_queue.put(next(dataIter)) # Produce request + consume_response(response_queue.get()) + consume_response(response_queue.get()) + finally: + # Tell the worker thread to shut down. 
+ request_queue.put(None) return out.close() diff --git a/src/unity/python/turicreate/toolkits/_mxnet_utils.py b/src/unity/python/turicreate/toolkits/_mxnet_utils.py index 170019e9fa..4ce96ffb19 100644 --- a/src/unity/python/turicreate/toolkits/_mxnet_utils.py +++ b/src/unity/python/turicreate/toolkits/_mxnet_utils.py @@ -127,19 +127,3 @@ def load_net_params_from_state(net_params, state, ctx = None): #net_params[k].set_data(net_params_dict[k]) net_params[k]._load_init(net_params_dict[k], ctx) return net_params - - -# mean subtraction -def subtract_imagenet_mean(batch): - """Subtract ImageNet mean from RGB image""" - from mxnet import nd - batch = batch * 255.0 - batch = nd.swapaxes(batch,0, 1) - (r, g, b) = nd.split(batch, num_outputs=3, axis=0) - mean_values = [123.68, 116.779, 103.939] - r = r - mean_values[0] - g = g - mean_values[1] - b = b - mean_values[2] - batch = nd.concat(r, g, b, dim=0) - batch = nd.swapaxes(batch,0, 1) - return batch diff --git a/src/unity/python/turicreate/toolkits/activity_classifier/_sframe_sequence_iterator.py b/src/unity/python/turicreate/toolkits/activity_classifier/_sframe_sequence_iterator.py index a436928fee..e5b24afd4f 100644 --- a/src/unity/python/turicreate/toolkits/activity_classifier/_sframe_sequence_iterator.py +++ b/src/unity/python/turicreate/toolkits/activity_classifier/_sframe_sequence_iterator.py @@ -51,7 +51,8 @@ def _load_into_numpy(sf, np_array, start, end, strides=None, shape=None): np_array[:] = 0.0 np_array_2d = np_array.reshape((np_array.shape[0], np_array.shape[1] * np_array.shape[2])) _extensions.sframe_load_to_numpy(sf, np_array.ctypes.data, - np_array_2d.strides, np_array_2d.shape[1:], start, end) + np_array_2d.strides, np_array_2d.shape, + start, end) class SFrameSequenceIter(_mx.io.DataIter): diff --git a/src/unity/python/turicreate/toolkits/image_analysis/image_analysis.py b/src/unity/python/turicreate/toolkits/image_analysis/image_analysis.py index 65142a7723..6d881d318c 100644 --- a/src/unity/python/turicreate/toolkits/image_analysis/image_analysis.py +++ b/src/unity/python/turicreate/toolkits/image_analysis/image_analysis.py @@ -56,7 +56,7 @@ def load_images(url, format='auto', with_path=True, recursive=True, ignore_failu """ from ... import extensions as _extensions from ...util import _make_internal_url - return _extensions.load_images(_make_internal_url(url), format, with_path, + return _extensions.load_images(url, format, with_path, recursive, ignore_failure, random_order) diff --git a/src/unity/python/turicreate/toolkits/image_classifier/image_classifier.py b/src/unity/python/turicreate/toolkits/image_classifier/image_classifier.py index 85f6f25e23..feaa3b21a9 100644 --- a/src/unity/python/turicreate/toolkits/image_classifier/image_classifier.py +++ b/src/unity/python/turicreate/toolkits/image_classifier/image_classifier.py @@ -20,6 +20,7 @@ from turicreate.toolkits._main import ToolkitError as _ToolkitError from turicreate.toolkits._model import PythonProxy as _PythonProxy from turicreate import config as _tc_config +from .._internal_utils import _mac_ver from .. import _mxnet_utils from .. import _pre_trained_models from .. import _image_feature_extractor @@ -27,7 +28,8 @@ _numeric_param_check_range) def create(dataset, target, feature = None, model = 'resnet-50', - max_iterations=10, verbose=True, seed=None): + validation_set='auto', max_iterations = 10, verbose = True, + seed = None, batch_size=64): """ Create a :class:`ImageClassifier` model. 
@@ -53,14 +55,30 @@ def create(dataset, target, feature = None, model = 'resnet-50', feature. model : string optional - Uses a pretrained model to bootstrap an image classifier + Uses a pretrained model to bootstrap an image classifier: - "resnet-50" : Uses a pretrained resnet model. + Exported Core ML model will be ~90M. + - "squeezenet_v1.1" : Uses a pretrained squeezenet model. + Exported Core ML model will be ~4.7M. + + - "VisionFeaturePrint_Screen": Uses an OS internal feature extractor. + Only available on iOS, tvOS 12.0+, + macOS 10.14+. + Exported Core ML model will be ~41K. Models are downloaded from the internet if not available locally. Once downloaded, the models are cached for future use. + validation_set : SFrame, optional + A dataset for monitoring the model's generalization performance. + The format of this SFrame must be the same as the training set. + By default this argument is set to 'auto' and a validation set is + automatically sampled and used for progress printing. If + validation_set is set to None, then no additional metrics + are computed. The default value is 'auto'. + max_iterations : float, optional The maximum number of allowed passes through the data. More passes over the data can result in a more accurately trained model. Consider @@ -74,6 +92,10 @@ def create(dataset, target, feature = None, model = 'resnet-50', Seed for random number generation. Set this value to ensure that the same model is created every time. + batch_size : int, optional + If you are getting memory errors, try decreasing this value. If you + have a powerful computer, increasing this value may improve performance. + Returns ------- out : ImageClassifier @@ -102,7 +124,7 @@ def create(dataset, target, feature = None, model = 'resnet-50', # Check model parameter allowed_models = list(_pre_trained_models.MODELS.keys()) if _mac_ver() >= (10,14): - allowed_models.append('sceneVisionFeaturePrint_v1') + allowed_models.append('VisionFeaturePrint_Screen') _tkutl._check_categorical_option_type('model', model, allowed_models) # Check dataset parameter @@ -113,6 +135,12 @@ def create(dataset, target, feature = None, model = 'resnet-50', if target not in dataset.column_names(): raise _ToolkitError("Target column '%s' does not exist" % target) + if(batch_size < 1): + raise ValueError("'batch_size' must be greater than or equal to 1") + + if not (isinstance(validation_set, _tc.SFrame) or validation_set == 'auto' or validation_set is None): + raise TypeError("Unrecognized value for 'validation_set'.") + if feature is None: feature = _tkutl._find_only_image_column(dataset) @@ -121,8 +149,15 @@ def create(dataset, target, feature = None, model = 'resnet-50', # Extract features extracted_features = _tc.SFrame({ target: dataset[target], - '__image_features__': feature_extractor.extract_features(dataset, feature, verbose=verbose), + '__image_features__': feature_extractor.extract_features(dataset, feature, verbose=verbose, batch_size=batch_size), }) + if isinstance(validation_set, _tc.SFrame): + extracted_features_validation = _tc.SFrame({ + target: validation_set[target], + '__image_features__': feature_extractor.extract_features(validation_set, feature, verbose=verbose, batch_size=batch_size), + }) + else: + extracted_features_validation = validation_set # Train a classifier using the extracted features extracted_features[target] = dataset[target] @@ -130,10 +165,15 @@ def create(dataset, target, feature = None, model = 'resnet-50', features=['__image_features__'], target=target,
max_iterations=max_iterations, + validation_set=extracted_features_validation, seed=seed, verbose=verbose) - input_image_shape = _pre_trained_models.MODELS[model].input_image_shape + # set input image shape + if model in _pre_trained_models.MODELS: + input_image_shape = _pre_trained_models.MODELS[model].input_image_shape + else: # model == VisionFeaturePrint_Screen + input_image_shape = (3, 299, 299) # Save the model state = { @@ -197,7 +237,11 @@ def _load_version(cls, state, version): state['classes'] = state['classifier'].classes # Load pre-trained model & feature extractor - state['feature_extractor'] = _image_feature_extractor._create_feature_extractor(state['model']) + model_name = state['model'] + if model_name == "VisionFeaturePrint_Screen" and _mac_ver() < (10,14): + raise _ToolkitError("Cannot load model on this operating system. This model uses VisionFeaturePrint_Screen, " \ + "which is only supported on macOS 10.14 and higher.") + state['feature_extractor'] = _image_feature_extractor._create_feature_extractor(model_name) state['input_image_shape'] = tuple([int(i) for i in state['input_image_shape']]) return ImageClassifier(state) @@ -256,7 +300,7 @@ def _get_summary_struct(self): section_titles = ['Schema', 'Training summary'] return([model_fields, training_fields], section_titles) - def predict(self, dataset, output_type='class'): + def predict(self, dataset, output_type='class', batch_size=64): """ Return predictions for ``dataset``, using the trained logistic regression model. Predictions can be generated as class labels, @@ -293,6 +337,10 @@ class as a vector. The probability of the first class (sorted - 'class': Class prediction. For multi-class classification, this returns the class with maximum probability. + batch_size : int, optional + If you are getting memory errors, try decreasing this value. If you + have a powerful computer, increasing this value may improve performance. + Returns ------- out : SArray @@ -311,16 +359,18 @@ class as a vector. The probability of the first class (sorted """ if not isinstance(dataset, (_tc.SFrame, _tc.SArray, _tc.Image)): raise TypeError('dataset must be either an SFrame, SArray or turicreate.Image') + if(batch_size < 1): + raise ValueError("'batch_size' must be greater than or equal to 1") + if isinstance(dataset, _tc.SArray): dataset = _tc.SFrame({self.feature: dataset}) elif isinstance(dataset, _tc.Image): dataset = _tc.SFrame({self.feature: [dataset]}) - extracted_features = self._extract_features(dataset) + extracted_features = self._extract_features(dataset, batch_size=batch_size) return self.classifier.predict(extracted_features, output_type=output_type) - def classify(self, dataset): + def classify(self, dataset, batch_size=64): """ Return a classification, for each example in the ``dataset``, using the trained logistic regression model. The output SFrame contains predictions @@ -335,6 +385,10 @@ def classify(self, dataset): names as the features used for model training, but does not require a target column. Additional columns are ignored. + batch_size : int, optional + If you are getting memory errors, try decreasing this value. If you + have a powerful computer, increasing this value may improve performance.
+ Returns ------- out : SFrame @@ -351,16 +405,18 @@ def classify(self, dataset): """ if not isinstance(dataset, (_tc.SFrame, _tc.SArray, _tc.Image)): raise TypeError('dataset must be either an SFrame, SArray or turicreate.Image') + if(batch_size < 1): + raise ValueError("'batch_size' must be greater than or equal to 1") if isinstance(dataset, _tc.SArray): dataset = _tc.SFrame({self.feature: dataset}) elif isinstance(dataset, _tc.Image): dataset = _tc.SFrame({self.feature: [dataset]}) - extracted_features = self._extract_features(dataset) + extracted_features = self._extract_features(dataset, batch_size=batch_size) return self.classifier.classify(extracted_features) - def predict_topk(self, dataset, output_type="probability", k=3): + def predict_topk(self, dataset, output_type="probability", k=3, batch_size=64): """ Return top-k predictions for the ``dataset``, using the trained model. Predictions are returned as an SFrame with three columns: `id`, @@ -417,6 +473,8 @@ def predict_topk(self, dataset, output_type="probability", k=3): """ if not isinstance(dataset, (_tc.SFrame, _tc.SArray, _tc.Image)): raise TypeError('dataset must be either an SFrame, SArray or turicreate.Image') + if(batch_size < 1): + raise ValueError("'batch_size' must be greater than or equal to 1") if isinstance(dataset, _tc.SArray): dataset = _tc.SFrame({self.feature: dataset}) @@ -426,7 +484,7 @@ def predict_topk(self, dataset, output_type="probability", k=3): extracted_features = self._extract_features(dataset) return self.classifier.predict_topk(extracted_features, output_type = output_type, k = k) - def evaluate(self, dataset, metric='auto', verbose=True): + def evaluate(self, dataset, metric='auto', verbose=True, batch_size=64): """ Evaluate the model by making predictions of target values and comparing these to actual values. @@ -457,6 +515,10 @@ def evaluate(self, dataset, metric='auto', verbose=True): verbose : bool, optional If True, prints progress updates and model details. + batch_size : int, optional + If you are getting memory errors, try decreasing this value. If you + have a powerful computer, increasing this value may improve performance. 
+ Returns ------- out : dict @@ -475,13 +537,16 @@ def evaluate(self, dataset, metric='auto', verbose=True): >>> results = model.evaluate(data) >>> print results['accuracy'] """ - extracted_features = self._extract_features(dataset, verbose=verbose) + if(batch_size < 1): + raise ValueError("'batch_size' must be greater than or equal to 1") + + extracted_features = self._extract_features(dataset, verbose=verbose, batch_size=batch_size) extracted_features[self.target] = dataset[self.target] return self.classifier.evaluate(extracted_features, metric = metric) - def _extract_features(self, dataset, verbose=False): + def _extract_features(self, dataset, verbose=False, batch_size=64): return _tc.SFrame({ - '__image_features__': self.feature_extractor.extract_features(dataset, self.feature, verbose=verbose) + '__image_features__': self.feature_extractor.extract_features(dataset, self.feature, verbose=verbose, batch_size=batch_size) }) def export_coreml(self, filename): @@ -496,79 +561,191 @@ def export_coreml(self, filename): -------- >>> model.export_coreml('myModel.mlmodel') """ - ptModel = _pre_trained_models.MODELS[self.model]() - feature_extractor = _image_feature_extractor.MXFeatureExtractor(ptModel) - - coreml_model = feature_extractor.get_coreml_model() - spec = coreml_model.get_spec() - nn_spec = spec.neuralNetworkClassifier - num_classes = self.num_classes - - # Replace the softmax layer with new coeffients - fc_layer = nn_spec.layers[-2] - fc_layer_params = fc_layer.innerProduct - fc_layer_params.outputChannels = self.classifier.num_classes - inputChannels = fc_layer_params.inputChannels - fc_layer_params.hasBias = True - - coefs = self.classifier.coefficients - weights = fc_layer_params.weights - bias = fc_layer_params.bias - del weights.floatValue[:] - del bias.floatValue[:] - - import numpy as np - W = np.array(coefs[coefs['index'] != None]['value'], ndmin = 2).reshape( - inputChannels, num_classes - 1, order = 'F') - b = coefs[coefs['index'] == None]['value'] - Wa = np.hstack((np.zeros((inputChannels, 1)), W)) - weights.floatValue.extend(Wa.flatten(order = 'F')) - bias.floatValue.extend([0.0] + list(b)) - - # Replace the classifier with the new classes - class_labels = self.classifier.classes - - probOutput = spec.description.output[0] - classLabel = spec.description.output[1] - probOutput.type.dictionaryType.MergeFromString(b'') - if type(class_labels[0]) == int: - nn_spec.ClearField('int64ClassLabels') - probOutput.type.dictionaryType.int64KeyType.MergeFromString(b'') - classLabel.type.int64Type.MergeFromString(b'') - del nn_spec.int64ClassLabels.vector[:] - for c in class_labels: - nn_spec.int64ClassLabels.vector.append(c) - else: - nn_spec.ClearField('stringClassLabels') - probOutput.type.dictionaryType.stringKeyType.MergeFromString(b'') - classLabel.type.stringType.MergeFromString(b'') - del nn_spec.stringClassLabels.vector[:] - for c in class_labels: - nn_spec.stringClassLabels.vector.append(c) - import coremltools - prob_name = self.target + 'Probability' - label_name = self.target - old_output_name = spec.neuralNetworkClassifier.layers[-1].name - coremltools.models.utils.rename_feature(spec, 'classLabel', label_name) - coremltools.models.utils.rename_feature(spec, old_output_name, prob_name) - if spec.neuralNetworkClassifier.layers[-1].name == old_output_name: - spec.neuralNetworkClassifier.layers[-1].name = prob_name - if spec.neuralNetworkClassifier.labelProbabilityLayerName == old_output_name: - spec.neuralNetworkClassifier.labelProbabilityLayerName = prob_name - 
coremltools.models.utils.rename_feature(spec, 'data', self.feature) - spec.neuralNetworkClassifier.preprocessing[0].featureName = self.feature - - mlmodel = coremltools.models.MLModel(spec) - model_type = 'image classifier (%s)' % self.model - mlmodel.short_description = _coreml_utils._mlmodel_short_description(model_type) - mlmodel.input_description[self.feature] = u'Input image' - mlmodel.output_description[prob_name] = 'Prediction probabilities' - mlmodel.output_description[label_name] = 'Class label of top prediction' - _coreml_utils._set_model_metadata(mlmodel, self.__class__.__name__, { + # First define three internal helper functions + + + # Internal helper function + def _create_vision_feature_print_screen(): + prob_name = self.target + 'Probability' + + # + # Setup the top level (pipeline classifier) spec + # + top_spec = coremltools.proto.Model_pb2.Model() + top_spec.specificationVersion = 3 + + desc = top_spec.description + desc.output.add().name = prob_name + desc.output.add().name = self.target + + desc.predictedFeatureName = self.target + desc.predictedProbabilitiesName = prob_name + + input = desc.input.add() + input.name = self.feature + input.type.imageType.width = 299 + input.type.imageType.height = 299 + BGR_VALUE = coremltools.proto.FeatureTypes_pb2.ImageFeatureType.ColorSpace.Value('BGR') + input.type.imageType.colorSpace = BGR_VALUE + + # + # Scene print feature extractor + # + pipelineClassifier = top_spec.pipelineClassifier + scene_print = pipelineClassifier.pipeline.models.add() + scene_print.specificationVersion = 3 + scene_print.visionFeaturePrint.scene.version = 1 + + input = scene_print.description.input.add() + input.name = self.feature + input.type.imageType.width = 299 + input.type.imageType.height = 299 + input.type.imageType.colorSpace = BGR_VALUE + + output = scene_print.description.output.add() + output.name = "output_name" + DOUBLE_ARRAY_VALUE = coremltools.proto.FeatureTypes_pb2.ArrayFeatureType.ArrayDataType.Value('DOUBLE') + output.type.multiArrayType.dataType = DOUBLE_ARRAY_VALUE + output.type.multiArrayType.shape.append(2048) + + # + # Neural Network Classifier, which is just logistic regression, in order to use GPUs + # + temp = top_spec.pipelineClassifier.pipeline.models.add() + temp.specificationVersion = 3 + + # Empty inner product layer + nn_spec = temp.neuralNetworkClassifier + feature_layer = nn_spec.layers.add() + feature_layer.name = "feature_layer" + feature_layer.input.append("output_name") + feature_layer.output.append("softmax_input") + fc_layer_params = feature_layer.innerProduct + fc_layer_params.inputChannels = 2048 + + # Softmax layer + softmax = nn_spec.layers.add() + softmax.name = "softmax" + softmax.softmax.MergeFromString(b'') + softmax.input.append("softmax_input") + softmax.output.append(prob_name) + + input = temp.description.input.add() + input.name = "output_name" + input.type.multiArrayType.dataType = DOUBLE_ARRAY_VALUE + input.type.multiArrayType.shape.append(2048) + + # Set outputs + desc = temp.description + prob_output = desc.output.add() + prob_output.name = prob_name + label_output = desc.output.add() + label_output.name = self.target + + if type(self.classifier.classes[0]) == int: + prob_output.type.dictionaryType.int64KeyType.MergeFromString(b'') + label_output.type.int64Type.MergeFromString(b'') + else: + prob_output.type.dictionaryType.stringKeyType.MergeFromString(b'') + label_output.type.stringType.MergeFromString(b'') + + temp.description.predictedFeatureName = self.target + 
temp.description.predictedProbabilitiesName = prob_name + + return top_spec + + + # Internal helper function + def _update_last_two_layers(nn_spec): + # Replace the softmax layer with new coeffients + num_classes = self.num_classes + fc_layer = nn_spec.layers[-2] + fc_layer_params = fc_layer.innerProduct + fc_layer_params.outputChannels = self.classifier.num_classes + inputChannels = fc_layer_params.inputChannels + fc_layer_params.hasBias = True + + coefs = self.classifier.coefficients + weights = fc_layer_params.weights + bias = fc_layer_params.bias + del weights.floatValue[:] + del bias.floatValue[:] + + import numpy as np + W = np.array(coefs[coefs['index'] != None]['value'], ndmin = 2).reshape( + inputChannels, num_classes - 1, order = 'F') + b = coefs[coefs['index'] == None]['value'] + Wa = np.hstack((np.zeros((inputChannels, 1)), W)) + weights.floatValue.extend(Wa.flatten(order = 'F')) + bias.floatValue.extend([0.0] + list(b)) + + # Internal helper function + def _set_inputs_outputs_and_metadata(spec, nn_spec): + # Replace the classifier with the new classes + class_labels = self.classifier.classes + + probOutput = spec.description.output[0] + classLabel = spec.description.output[1] + probOutput.type.dictionaryType.MergeFromString(b'') + if type(class_labels[0]) == int: + nn_spec.ClearField('int64ClassLabels') + probOutput.type.dictionaryType.int64KeyType.MergeFromString(b'') + classLabel.type.int64Type.MergeFromString(b'') + del nn_spec.int64ClassLabels.vector[:] + for c in class_labels: + nn_spec.int64ClassLabels.vector.append(c) + else: + nn_spec.ClearField('stringClassLabels') + probOutput.type.dictionaryType.stringKeyType.MergeFromString(b'') + classLabel.type.stringType.MergeFromString(b'') + del nn_spec.stringClassLabels.vector[:] + for c in class_labels: + nn_spec.stringClassLabels.vector.append(c) + + prob_name = self.target + 'Probability' + label_name = self.target + old_output_name = nn_spec.layers[-1].name + coremltools.models.utils.rename_feature(spec, 'classLabel', label_name) + coremltools.models.utils.rename_feature(spec, old_output_name, prob_name) + if nn_spec.layers[-1].name == old_output_name: + nn_spec.layers[-1].name = prob_name + if nn_spec.labelProbabilityLayerName == old_output_name: + nn_spec.labelProbabilityLayerName = prob_name + coremltools.models.utils.rename_feature(spec, 'data', self.feature) + if len(nn_spec.preprocessing) > 0: + nn_spec.preprocessing[0].featureName = self.feature + + mlmodel = coremltools.models.MLModel(spec) + model_type = 'image classifier (%s)' % self.model + mlmodel.short_description = _coreml_utils._mlmodel_short_description(model_type) + mlmodel.input_description[self.feature] = u'Input image' + mlmodel.output_description[prob_name] = 'Prediction probabilities' + mlmodel.output_description[label_name] = 'Class label of top prediction' + _coreml_utils._set_model_metadata(mlmodel, self.__class__.__name__, { 'model': self.model, 'target': self.target, 'features': self.feature, 'max_iterations': str(self.max_iterations), }, version=ImageClassifier._PYTHON_IMAGE_CLASSIFIER_VERSION) + + return mlmodel + + + # main part of the export_coreml function + if self.model in _pre_trained_models.MODELS: + ptModel = _pre_trained_models.MODELS[self.model]() + feature_extractor = _image_feature_extractor.MXFeatureExtractor(ptModel) + + coreml_model = feature_extractor.get_coreml_model() + spec = coreml_model.get_spec() + nn_spec = spec.neuralNetworkClassifier + else: # model == VisionFeaturePrint_Screen + spec = 
_create_vision_feature_print_screen() + nn_spec = spec.pipelineClassifier.pipeline.models[1].neuralNetworkClassifier + + _update_last_two_layers(nn_spec) + mlmodel = _set_inputs_outputs_and_metadata(spec, nn_spec) mlmodel.save(filename) + + diff --git a/src/unity/python/turicreate/toolkits/image_similarity/image_similarity.py b/src/unity/python/turicreate/toolkits/image_similarity/image_similarity.py index 64f42930f6..8dda98d8d8 100644 --- a/src/unity/python/turicreate/toolkits/image_similarity/image_similarity.py +++ b/src/unity/python/turicreate/toolkits/image_similarity/image_similarity.py @@ -23,7 +23,8 @@ _numeric_param_check_range) -def create(dataset, label = None, feature = None, model = 'resnet-50', verbose = True): +def create(dataset, label = None, feature = None, model = 'resnet-50', verbose = True, + batch_size = 64): """ Create a :class:`ImageSimilarityModel` model. @@ -47,6 +48,7 @@ def create(dataset, label = None, feature = None, model = 'resnet-50', verbose = Uses a pretrained model to bootstrap an image similarity model - "resnet-50" : Uses a pretrained resnet model. + - "squeezenet_v1.1" : Uses a pretrained squeezenet model. Models are downloaded from the internet if not available locally. Once downloaded, the models are cached for future use. @@ -54,6 +56,10 @@ def create(dataset, label = None, feature = None, model = 'resnet-50', verbose = verbose : bool, optional If True, print progress updates and model details. + batch_size : int, optional + If you are getting memory errors, try decreasing this value. If you + have a powerful computer, increasing this value may improve performance. + Returns ------- out : ImageSimilarityModel @@ -98,6 +104,8 @@ def create(dataset, label = None, feature = None, model = 'resnet-50', verbose = raise _ToolkitError("Row label column '%s' does not exist" % label) if (feature is not None) and (feature not in dataset.column_names()): raise _ToolkitError("Image feature column '%s' does not exist" % feature) + if(batch_size < 1): + raise ValueError("'batch_size' must be greater than or equal to 1") # Set defaults if feature is None: @@ -107,7 +115,8 @@ def create(dataset, label = None, feature = None, model = 'resnet-50', verbose = # Extract features extracted_features = _tc.SFrame({ - '__image_features__': feature_extractor.extract_features(dataset, feature, verbose=verbose), + '__image_features__': feature_extractor.extract_features(dataset, feature, verbose=verbose, + batch_size=batch_size), }) # Train a similarity model using the extracted features @@ -249,12 +258,13 @@ def _get_summary_struct(self): section_titles = ['Schema', 'Training summary'] return([model_fields, training_fields], section_titles) - def _extract_features(self, dataset, verbose): + def _extract_features(self, dataset, verbose, batch_size = 64): return _tc.SFrame({ - '__image_features__': self.feature_extractor.extract_features(dataset, self.feature, verbose=verbose) + '__image_features__': self.feature_extractor.extract_features(dataset, self.feature, verbose=verbose, + batch_size=batch_size) }) - def query(self, dataset, label=None, k=5, radius=None, verbose=True): + def query(self, dataset, label=None, k=5, radius=None, verbose=True, batch_size=64): """ For each image, retrieve the nearest neighbors from the model's stored data. In general, the query dataset does not need to be the same as @@ -288,6 +298,10 @@ def query(self, dataset, label=None, k=5, radius=None, verbose=True): verbose: bool, optional If True, print progress updates and model details. 
+ batch_size : int, optional + If you are getting memory errors, try decreasing this value. If you + have a powerful computer, increasing this value may improve performance. + Returns ------- out : SFrame @@ -325,13 +339,15 @@ def query(self, dataset, label=None, k=5, radius=None, verbose=True): """ if not isinstance(dataset, (_tc.SFrame, _tc.SArray, _tc.Image)): raise TypeError('dataset must be either an SFrame, SArray or turicreate.Image') + if(batch_size < 1): + raise ValueError("'batch_size' must be greater than or equal to 1") if isinstance(dataset, _tc.SArray): dataset = _tc.SFrame({self.feature: dataset}) elif isinstance(dataset, _tc.Image): dataset = _tc.SFrame({self.feature: [dataset]}) - extracted_features = self._extract_features(dataset, verbose=verbose) + extracted_features = self._extract_features(dataset, verbose=verbose, batch_size=batch_size) if label is not None: extracted_features[label] = dataset[label] return self.similarity_model.query(extracted_features, label, k, radius, verbose) diff --git a/src/unity/python/turicreate/toolkits/style_transfer/_model.py b/src/unity/python/turicreate/toolkits/style_transfer/_model.py index 598edae406..91536e53dd 100644 --- a/src/unity/python/turicreate/toolkits/style_transfer/_model.py +++ b/src/unity/python/turicreate/toolkits/style_transfer/_model.py @@ -17,13 +17,11 @@ class InstanceNorm(HybridBlock): """ Conditional Instance Norm """ - def __init__(self, epsilon=1e-5, center=True, scale=True, - beta_initializer='zeros', gamma_initializer='ones', - in_channels=0, num_styles=0, **kwargs): + def __init__(self, in_channels, num_styles, batch_size, epsilon=1e-5, + center=True, scale=True, beta_initializer='zeros', + gamma_initializer='ones', **kwargs): super(InstanceNorm, self).__init__(**kwargs) self._kwargs = {'eps': epsilon} - if in_channels != 0: - self.in_channels = in_channels self.gamma = self.params.get('gamma', grad_req='write' if scale else 'null', shape=(num_styles, in_channels, ), init=gamma_initializer, allow_deferred_init=True) @@ -32,19 +30,33 @@ def __init__(self, epsilon=1e-5, center=True, scale=True, allow_deferred_init=True) self.num_styles = num_styles self.in_channels = in_channels - + self.batch_size = batch_size def hybrid_forward(self, F, X, style_idx, gamma, beta): - if F == _mx.sym: # for coreml + if F == _mx.sym and self.batch_size == 0: # for coreml gamma = _mx.sym.Embedding(data=style_idx, input_dim=self.num_styles, output_dim=self.in_channels) beta = _mx.sym.Embedding(data=style_idx, input_dim=self.num_styles, output_dim=self.in_channels) return F.InstanceNorm(X, gamma, beta, name='_fwd', **self._kwargs) - res = [] - for idx, style in enumerate(style_idx): - res.append(F.InstanceNorm(X[idx:idx+1], gamma[int(style)], beta[int(style)], name='_fwd', **self._kwargs)) + em_gamma = F.take(gamma, indices=style_idx, axis=0) + em_beta = F.take(beta, indices=style_idx, axis=0) + + sp_gammas = F.split(em_gamma, axis=0, num_outputs=self.batch_size, squeeze_axis=True) + sp_betas = F.split(em_beta, axis=0, num_outputs=self.batch_size, squeeze_axis=True) + + if self.batch_size == 1: + return F.InstanceNorm(X, sp_gammas, sp_betas, name='_fwd', **self._kwargs) + else: + Xs = F.split(X, axis=0, num_outputs=self.batch_size) + + res = [] + for idx in range(self.batch_size): + gamma0 = sp_gammas[idx] + beta0 = sp_betas[idx] + X_slice = Xs[idx] + res.append(F.InstanceNorm(X_slice, gamma0, beta0, name='_fwd', **self._kwargs)) - return _mx.nd.concat(*res, dim=0) + return F.concat(*res, dim=0) class ResidualBlock(HybridBlock): @@ 
-52,14 +64,26 @@ class ResidualBlock(HybridBlock): Residual network """ - def __init__(self, num_styles): + def __init__(self, num_styles, batch_size): super(ResidualBlock, self).__init__() with self.name_scope(): self.conv1 = _nn.Conv2D(128, 3, 1, 1, in_channels=128, use_bias=False) - self.inst_norm1 = InstanceNorm(in_channels=128, num_styles=num_styles) + self.inst_norm1 = InstanceNorm(in_channels=128, num_styles=num_styles, batch_size=batch_size) self.conv2 = _nn.Conv2D(128, 3, 1, 1, in_channels=128, use_bias=False) - self.inst_norm2 = InstanceNorm(in_channels=128, num_styles=num_styles) + self.inst_norm2 = InstanceNorm(in_channels=128, num_styles=num_styles, batch_size=batch_size) + + self._batch_size = batch_size + + @property + def batch_size(self): + return self._batch_size + + @batch_size.setter + def batch_size(self, batch_size): + self.inst_norm1.batch_size = batch_size + self.inst_norm2.batch_size = batch_size + self._batch_size = batch_size def hybrid_forward(self, F, x, style_idx): h1 = self.conv1(x) @@ -80,35 +104,51 @@ def gram_matrix(y): class Transformer(HybridBlock): - def __init__(self, num_styles): + def __init__(self, num_styles, batch_size): super(Transformer, self).__init__(prefix='transformer_') self.num_styles = num_styles block = ResidualBlock + self.scale255 = False with self.name_scope(): self.conv1 = _nn.Conv2D(32, 9, 1, 4, in_channels=3, use_bias=False) - self.inst_norm1 = InstanceNorm(in_channels=32, num_styles=num_styles) + self.inst_norm1 = InstanceNorm(in_channels=32, num_styles=num_styles, batch_size=batch_size) self.conv2 = _nn.Conv2D(64, 3, 2, 1, in_channels=32, use_bias=False) - self.inst_norm2 = InstanceNorm(in_channels=64, num_styles=num_styles) + self.inst_norm2 = InstanceNorm(in_channels=64, num_styles=num_styles, batch_size=batch_size) self.conv3 = _nn.Conv2D(128, 3, 2, 1, in_channels=64, use_bias=False) - self.inst_norm3 = InstanceNorm(in_channels=128, num_styles=num_styles) + self.inst_norm3 = InstanceNorm(in_channels=128, num_styles=num_styles, batch_size=batch_size) - self.residual1 = block(num_styles) - self.residual2 = block(num_styles) - self.residual3 = block(num_styles) - self.residual4 = block(num_styles) - self.residual5 = block(num_styles) + self.residual1 = block(num_styles, batch_size=batch_size) + self.residual2 = block(num_styles, batch_size=batch_size) + self.residual3 = block(num_styles, batch_size=batch_size) + self.residual4 = block(num_styles, batch_size=batch_size) + self.residual5 = block(num_styles, batch_size=batch_size) self.decoder_conv1 = _nn.Conv2D(64, 3, 1, 1, in_channels=128, use_bias=False) - self.inst_norm4 = InstanceNorm(in_channels=64, num_styles=num_styles) + self.inst_norm4 = InstanceNorm(in_channels=64, num_styles=num_styles, batch_size=batch_size) self.decoder_conv2 = _nn.Conv2D(32, 3, 1, 1, in_channels=64, use_bias=False) - self.inst_norm5 = InstanceNorm(in_channels=32, num_styles=num_styles) + self.inst_norm5 = InstanceNorm(in_channels=32, num_styles=num_styles, batch_size=batch_size) self.decoder_conv3 = _nn.Conv2D(3, 9, 1, 4, in_channels=32, use_bias=False) - self.inst_norm6 = InstanceNorm(in_channels=3, num_styles=num_styles) + self.inst_norm6 = InstanceNorm(in_channels=3, num_styles=num_styles, batch_size=batch_size) + + @property + def batch_size(self): + return self._batch_size + + @batch_size.setter + def batch_size(self, batch_size): + inst_norm_layers = [ + self.inst_norm1, self.inst_norm2, self.inst_norm3, + self.inst_norm4, self.inst_norm5, self.inst_norm6, + self.residual1, self.residual2, 
self.residual3, + self.residual4, self.residual5, + ] + for layer in inst_norm_layers: + layer.batch_size = batch_size def hybrid_forward(self, F, X, style_idx): h1 = self.conv1(X) @@ -141,10 +181,12 @@ def hybrid_forward(self, F, X, style_idx): d3 = self.decoder_conv3(d2) d3 = self.inst_norm6(d3, style_idx) - if F == _mx.sym: - return F.Activation(d3, 'sigmoid') * 255.0 - return F.Activation(d3, 'sigmoid') + z = F.Activation(d3, 'sigmoid') + if self.scale255: + return z * 255 + else: + return z class Vgg16(HybridBlock): @@ -166,7 +208,6 @@ def __init__(self): self.conv4_2 = _nn.Conv2D(in_channels=512, channels=512, kernel_size=3, padding=1) self.conv4_3 = _nn.Conv2D(in_channels=512, channels=512, kernel_size=3, padding=1) - def hybrid_forward(self, F, X): h = F.Activation(self.conv1_1(X), act_type='relu') h = F.Activation(self.conv1_2(h), act_type='relu') diff --git a/src/unity/python/turicreate/toolkits/style_transfer/_sframe_loader.py b/src/unity/python/turicreate/toolkits/style_transfer/_sframe_loader.py index b814c164d5..e57ffab7ee 100644 --- a/src/unity/python/turicreate/toolkits/style_transfer/_sframe_loader.py +++ b/src/unity/python/turicreate/toolkits/style_transfer/_sframe_loader.py @@ -43,7 +43,7 @@ class SFrameSTIter(_mx.io.DataIter): def __init__(self, sframe, batch_size, shuffle, feature_column, input_shape, num_epochs=None, repeat_each_image=1, - loader_type='stretch', aug_params={}): + loader_type='stretch', aug_params={}, sequential=True): if sframe[feature_column].dtype != _tc.Image: raise _ToolkitError('Feature column must be of type Image') @@ -92,7 +92,13 @@ def __init__(self, sframe, batch_size, shuffle, feature_column, self.sframe = sframe.copy() # Convert images to raw to eliminate overhead of decoding - self.sframe[_TMP_COL_PREP_IMAGE] = self.sframe[self.feature_column].apply(img_prep_fn) + if sequential: + builder = _tc.SArrayBuilder(_tc.Image) + for img in self.sframe[self.feature_column]: + builder.append(img_prep_fn(img)) + self.sframe[_TMP_COL_PREP_IMAGE] = builder.close() + else: + self.sframe[_TMP_COL_PREP_IMAGE] = self.sframe[self.feature_column].apply(img_prep_fn) self._provide_data = [ _mx.io.DataDesc(name='image', diff --git a/src/unity/python/turicreate/toolkits/style_transfer/style_transfer.py b/src/unity/python/turicreate/toolkits/style_transfer/style_transfer.py index 1ee4fbbb64..1ba88c12cb 100644 --- a/src/unity/python/turicreate/toolkits/style_transfer/style_transfer.py +++ b/src/unity/python/turicreate/toolkits/style_transfer/style_transfer.py @@ -13,22 +13,26 @@ from turicreate.toolkits import _coreml_utils from turicreate.toolkits._internal_utils import _raise_error_if_not_sframe from .. import _mxnet_utils -from ._model import Transformer as _Transformer -from ._model import Vgg16 as _Vgg16 -from ._model import gram_matrix as _gram_matrix from ._utils import _seconds_as_string from .. 
import _pre_trained_models from turicreate.toolkits._model import CustomModel as _CustomModel from turicreate.toolkits._main import ToolkitError as _ToolkitError -from mxnet import gluon as _gluon from turicreate.toolkits._model import PythonProxy as _PythonProxy import turicreate as _tc import numpy as _np -import mxnet as _mx import math as _math import six as _six +def _vgg16_data_prep(batch): + """ + Takes images scaled to [0, 1] and returns them appropriately scaled and + mean-subtracted for VGG-16 + """ + from mxnet import nd + mean = nd.array([123.68, 116.779, 103.939], ctx=batch.context) + return nd.broadcast_sub(255 * batch, mean.reshape((-1, 1, 1))) + def create(style_dataset, content_dataset, style_feature=None, content_feature=None, max_iterations=None, model='resnet-16', verbose=True, batch_size = 6, **kwargs): @@ -40,20 +44,25 @@ def create(style_dataset, content_dataset, style_feature=None, style_dataset: SFrame Input style images. The columns named by the ``style_feature`` parameters will be extracted for training the model. + content_dataset : SFrame Input content images. The columns named by the ``content_feature`` parameters will be extracted for training the model. + style_feature: string Name of the column containing the input images in style SFrame. 'None' (the default) indicates the only image column in the style SFrame should be used as the feature. + content_feature: string Name of the column containing the input images in content SFrame. 'None' (the default) indicates the only image column in the content SFrame should be used as the feature. + max_iterations : int The number of training iterations. If 'None' (the default), then it will be automatically determined based on the amount of data you provide. + model : string optional Style transfer model to use: @@ -82,6 +91,10 @@ def create(style_dataset, content_dataset, style_feature=None, -------- .. 
sourcecode:: python + # Create datasets + >>> content_dataset = turicreate.image_analysis.load_images('content_images/') + >>> style_dataset = turicreate.image_analysis.load_images('style_images/') + # Train a style transfer model >>> model = turicreate.style_transfer.create(content_dataset, style_dataset) @@ -91,7 +104,6 @@ def create(style_dataset, content_dataset, style_feature=None, # Visualize the stylized images >>> stylized_images.explore() - """ if len(style_dataset) == 0: raise _ToolkitError("style_dataset SFrame cannot be empty") @@ -120,11 +132,12 @@ def create(style_dataset, content_dataset, style_feature=None, 'lr': 0.001, 'content_loss_mult': 1.0, 'style_loss_mult': [1e-4, 1e-4, 1e-4, 1e-4], # conv 1-4 layers - 'fine_tune_all_params': False, + 'finetune_all_params': False, 'print_loss_breakdown': False, 'input_shape': (256, 256), 'training_content_loader_type': 'stretch', 'use_augmentation': False, + 'sequential_image_processing': False, # Only used if use_augmentaion is True 'aug_resize': 0, 'aug_rand_crop': 0.9, @@ -175,25 +188,29 @@ def create(style_dataset, content_dataset, style_feature=None, content_images_loader = _SFrameSTIter(content_dataset, batch_size, shuffle=True, feature_column=content_feature, input_shape=input_shape, num_epochs=max_iterations, - loader_type='stretch', aug_params=params) - style_images_loader = _SFrameSTIter(style_dataset, batch_size, shuffle=False, - feature_column=style_feature, input_shape=input_shape, - loader_type='stretch') + loader_type='stretch', aug_params=params, + sequential=params['sequential_image_processing']) ctx = _mxnet_utils.get_mxnet_context(max_devices=params['batch_size']) num_styles = len(style_dataset) # TRANSFORMER MODEL + from ._model import Transformer as _Transformer transformer_model_path = _pre_trained_models.STYLE_TRANSFER_BASE_MODELS[model]().get_model_path() - transformer = _Transformer(num_styles) + transformer = _Transformer(num_styles, batch_size_each) transformer.collect_params().initialize(ctx=ctx) transformer.load_params(transformer_model_path, ctx, allow_missing=True) + # For some reason, the transformer fails to hybridize for training, so we + # avoid this until resolved + # transformer.hybridize() # VGG MODEL + from ._model import Vgg16 as _Vgg16 vgg_model_path = _pre_trained_models.STYLE_TRANSFER_BASE_MODELS['Vgg16']().get_model_path() vgg_model = _Vgg16() vgg_model.collect_params().initialize(ctx=ctx) vgg_model.load_params(vgg_model_path, ctx=ctx, ignore_extra=True) + vgg_model.hybridize() # TRAINER from mxnet import gluon as _gluon @@ -219,60 +236,102 @@ def create(style_dataset, content_dataset, style_feature=None, else: print('Using CPU to create model') - # Print progress table header - column_names = ['Iteration', 'Loss', 'Elapsed Time'] - num_columns = len(column_names) - column_width = max(map(lambda x: len(x), column_names)) + 2 - hr = '+' + '+'.join(['-' * column_width] * num_columns) + '+' - print(hr) - print(('| {:<{width}}' * num_columns + '|').format(*column_names, width=column_width-1)) - print(hr) + # + # Pre-compute gram matrices for style images + # + if verbose: + print('Analyzing visual features of the style images') + + style_images_loader = _SFrameSTIter(style_dataset, batch_size, shuffle=False, num_epochs=1, + feature_column=style_feature, input_shape=input_shape, + loader_type='stretch', + sequential=params['sequential_image_processing']) + num_layers = len(params['style_loss_mult']) + gram_chunks = [[] for _ in range(num_layers)] + for s_batch in style_images_loader: + s_data = 
_gluon.utils.split_and_load(s_batch.data[0], ctx_list=ctx, batch_axis=0) + results = [] + for s in s_data: + vgg16_s = _vgg16_data_prep(s) + ret = vgg_model(vgg16_s) + grams = [_gram_matrix(x) for x in ret] + for i, gram in enumerate(grams): + if gram.context != _mx.cpu(0): + gram = gram.as_in_context(_mx.cpu(0)) + gram_chunks[i].append(gram) + del style_images_loader + + grams = [ + # The concatenated styles may be padded, so we slice overflow + _mx.nd.concat(*chunks, dim=0)[:num_styles] + for chunks in gram_chunks + ] + + # A context->grams look-up table, where all the gram matrices have been + # distributed + ctx_grams = {} + if ctx[0] == _mx.cpu(0): + ctx_grams[_mx.cpu(0)] = grams + else: + for ctx0 in ctx: + ctx_grams[ctx0] = [gram.as_in_context(ctx0) for gram in grams] + + # + # Training loop + # + vgg_content_loss_layer = params['vgg16_content_loss_layer'] + rs = _np.random.RandomState(1234) while iterations < max_iterations: content_images_loader.reset() for c_batch in content_images_loader: - s_batch = style_images_loader.next() c_data = _gluon.utils.split_and_load(c_batch.data[0], ctx_list=ctx, batch_axis=0) - s_data = _gluon.utils.split_and_load(s_batch.data[0], ctx_list=ctx, batch_axis=0) - indices_data = _gluon.utils.split_and_load(_mx.nd.array(s_batch.indices, dtype=_np.int64), - ctx_list=[_mx.cpu(0)]*len(ctx), batch_axis=0) Ls = [] curr_content_loss = [] curr_style_loss = [] with _mx.autograd.record(): - for c, s, indices in zip(c_data, s_data, indices_data): - stylized = transformer(c, indices.asnumpy()) + for c in c_data: + # Randomize styles to train + indices = _mx.nd.array(rs.randint(num_styles, size=batch_size_each), + dtype=_np.int64, ctx=c.context) + + # Generate pastiche + p = transformer(c, indices) # mean subtraction - s = _mxnet_utils.subtract_imagenet_mean(s) - stylized = _mxnet_utils.subtract_imagenet_mean(stylized) - c = _mxnet_utils.subtract_imagenet_mean(c) + vgg16_p = _vgg16_data_prep(p) + vgg16_c = _vgg16_data_prep(c) # vgg forward - style_vgg_outputs = vgg_model(s) - stylized_vgg_outputs = vgg_model(stylized) - content_vgg_outputs = vgg_model(c) + p_vgg_outputs = vgg_model(vgg16_p) + + c_vgg_outputs = vgg_model(vgg16_c) + c_content_layer = c_vgg_outputs[vgg_content_loss_layer] + p_content_layer = p_vgg_outputs[vgg_content_loss_layer] # Calculate Loss # Style Loss between style image and stylized image # Ls = sum of L2 norm of gram matrix of vgg16's conv layers - style_loss = 0.0 - for style_vgg_output, stylized_vgg_output, style_loss_mult in zip(style_vgg_outputs, stylized_vgg_outputs, _style_loss_mult): - gram_style_vgg = _gram_matrix(style_vgg_output) - gram_stylized_vgg = _gram_matrix(stylized_vgg_output) + style_losses = [] + for gram, p_vgg_output, style_loss_mult in zip(ctx_grams[c.context], p_vgg_outputs, _style_loss_mult): + gram_s_vgg = gram[indices] + gram_p_vgg = _gram_matrix(p_vgg_output) - style_loss = style_loss + style_loss_mult * mse_loss(gram_style_vgg, gram_stylized_vgg) + style_losses.append(style_loss_mult * mse_loss(gram_s_vgg, gram_p_vgg)) + + style_loss = _mx.nd.add_n(*style_losses) # Content Loss between content image and stylized image - # Lc = L2 norm of vgg16's 3rd conv layer - vgg_content_loss_layer = params['vgg16_content_loss_layer'] - content_loss = _content_loss_mult * mse_loss(content_vgg_outputs[vgg_content_loss_layer], - stylized_vgg_outputs[vgg_content_loss_layer]) + # Lc = L2 norm at a single layer in vgg16 + content_loss = _content_loss_mult * mse_loss(c_content_layer, + p_content_layer) 
curr_content_loss.append(content_loss) curr_style_loss.append(style_loss) - total_loss = content_loss + style_loss + # Divide loss by large number to get into a more legible + # range + total_loss = (content_loss + style_loss) / 10000.0 Ls.append(total_loss) for L in Ls: L.backward() @@ -285,8 +344,19 @@ def create(style_dataset, content_dataset, style_feature=None, smoothed_loss = 0.9 * smoothed_loss + 0.1 * cur_loss iterations += 1 trainer.step(batch_size) + + if verbose and iterations == 1: + # Print progress table header + column_names = ['Iteration', 'Loss', 'Elapsed Time'] + num_columns = len(column_names) + column_width = max(map(lambda x: len(x), column_names)) + 2 + hr = '+' + '+'.join(['-' * column_width] * num_columns) + '+' + print(hr) + print(('| {:<{width}}' * num_columns + '|').format(*column_names, width=column_width-1)) + print(hr) + cur_time = _time.time() - if verbose and cur_time > last_time + 10: + if verbose and (cur_time > last_time + 10 or iterations == max_iterations): # Print progress table row elapsed_time = cur_time - start_time print("| {cur_iter:<{width}}| {loss:<{width}.3f}| {time:<{width}.1f}|".format( @@ -295,15 +365,13 @@ def create(style_dataset, content_dataset, style_feature=None, if params['print_loss_breakdown']: print_content_loss = _np.mean([L.asnumpy()[0] for L in curr_content_loss]) print_style_loss = _np.mean([L.asnumpy()[0] for L in curr_style_loss]) - print('Total Loss: {:6.3f}| Content Loss: {:6.3f} | Style Loss: {:6.3f}'.format(cur_loss, print_content_loss, print_style_loss)) + print('Total Loss: {:6.3f} | Content Loss: {:6.3f} | Style Loss: {:6.3f}'.format(cur_loss, print_content_loss, print_style_loss)) last_time = cur_time if iterations == max_iterations: + print(hr) break training_time = _time.time() - start_time - if verbose: - print(hr) - style_sa = style_dataset[style_feature] idx_column = _tc.SArray(range(0, style_sa.shape[0])) style_sframe = _tc.SFrame({"style": idx_column, style_feature: style_sa}) @@ -369,9 +437,10 @@ def _get_version(self): @classmethod def _load_version(cls, state, version): + from ._model import Transformer as _Transformer _tkutl._model_version_check(version, cls._PYTHON_STYLE_TRANSFER_VERSION) - net = _Transformer(state['num_styles']) + net = _Transformer(state['num_styles'], state['batch_size']) ctx = _mxnet_utils.get_mxnet_context(max_devices=state['batch_size']) net_params = net.collect_params() @@ -587,6 +656,9 @@ def stylize(self, images, style=None, verbose=True, max_size=800, batch_size = 4 # for smaller images loader_type = 'pad' + self._model.batch_size = batch_size_each + self._model.hybridize() + ctx = _mxnet_utils.get_mxnet_context(max_devices=batch_size_each) batch_size = max(num_mxnet_gpus, 1) * batch_size_each last_time = 0 @@ -609,6 +681,10 @@ def stylize(self, images, style=None, verbose=True, max_size=800, batch_size = 4 if input_shape[1] > max_w: input_shape = (input_shape[0], max_w) + # If we find large images, let's switch to sequential iterator + # pre-processing, to prevent memory issues. 
+ sequential = max(max_h, max_w) > 2000 + if verbose and output_size != 1: print('Stylizing {} image(s) using {} style(s)'.format(dataset_size, len(style))) if oversized_count > 0: @@ -620,7 +696,8 @@ def stylize(self, images, style=None, verbose=True, max_size=800, batch_size = 4 input_shape=input_shape, num_epochs=1, loader_type=loader_type, - repeat_each_image=len(style)) + repeat_each_image=len(style), + sequential=sequential) sb = _tc.SFrameBuilder([int, int, _tc.Image], column_names=['row_id', 'style', 'stylized_{}'.format(self.content_feature)]) @@ -632,10 +709,11 @@ def stylize(self, images, style=None, verbose=True, max_size=800, batch_size = 4 else: c_data = _gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) indices_data = _gluon.utils.split_and_load(_mx.nd.array(batch.repeat_indices, dtype=_np.int64), - ctx_list=[_mx.cpu(0)]*len(ctx), batch_axis=0) + ctx_list=ctx, batch_axis=0) outputs = [] for b_img, b_indices in zip(c_data, indices_data): - b_batch_styles = [style[idx] for idx in b_indices.asnumpy()] + mx_style = _mx.nd.array(style, dtype=_np.int64, ctx=b_indices.context) + b_batch_styles = mx_style[b_indices] output = self._model(b_img, b_batch_styles) outputs.append(output) @@ -715,7 +793,12 @@ def export_coreml(self, path, image_shape=(256, 256)): c_image = _mx.sym.Variable('image', shape=image_shape, dtype=_np.float32) + # signal that we want the transformer to prepare for coreml export + # using a zero batch size + transformer.batch_size = 0 + transformer.scale255 = True sym_out = transformer(c_image, index) + mod = _mx.mod.Module(symbol=sym_out, data_names=["image", "index"], label_names=None) mod.bind(data_shapes=zip(["image", "index"], [image_shape, (1,)]), for_training=False, @@ -732,10 +815,10 @@ def export_coreml(self, path, image_shape=(256, 256)): mod.set_params(sym_weight_dict, sym_weight_dict) index_dim = (1, self.num_styles) + coreml_model = _mxnet_converter.convert(mod, input_shape=[(self.content_feature, image_shape), ('index', index_dim)], + mode=None, preprocessor_args=None, builder=None, verbose=False) - coreml_model = _mxnet_converter.convert(mod, input_shape=[('image', image_shape), ('index', index_dim)], - mode=None, preprocessor_args=None, builder=None, verbose=True) - + transformer.scale255 = False spec = coreml_model.get_spec() image_input = spec.description.input[0] image_output = spec.description.output[0] diff --git a/src/unity/python/turicreate/version_info.py b/src/unity/python/turicreate/version_info.py index 63f71317bf..5e754a0593 100644 --- a/src/unity/python/turicreate/version_info.py +++ b/src/unity/python/turicreate/version_info.py @@ -11,7 +11,7 @@ from __future__ import absolute_import as _ # python egg version -__version__ = '4.3.2'#{{VERSION_STRING}} +__version__ = '5.0b1'#{{VERSION_STRING}} version = __version__ build_number = '0'#{{BUILD_NUMBER}} git_sha = 'NA'#{{GIT_SHA}} diff --git a/src/unity/toolkits/supervised_learning/supervised_learning.cpp b/src/unity/toolkits/supervised_learning/supervised_learning.cpp index fefd7ef47f..a6d5065f08 100644 --- a/src/unity/toolkits/supervised_learning/supervised_learning.cpp +++ b/src/unity/toolkits/supervised_learning/supervised_learning.cpp @@ -145,21 +145,24 @@ void supervised_learning_model_base::init(const sframe& X, const sframe& y, this->state["num_features"] = feature_column_names.size(); this->state["num_unpacked_features"] = feature_names.size(); + // Turned off temporarily until we can find a better way to hide for image classification + bool simple_mode = true; + 
   // Check the number of dimensions in this dataset is small, otherwise warn the
   // user. (see #3001 for context)
-  /*
-  // Turned off temporarily until we can find a better way to hide for image classification.
-  size_t num_dims = get_number_of_coefficients(this->ml_mdata);
-  if(num_dims >= X.num_rows()) {
-    std::stringstream ss;
-    ss << "WARNING: The number of feature dimensions in this problem is "
-       << "very large in comparison with the number of examples. Unless "
-       << "an appropriate regularization value is set, this model "
-       << "may not provide accurate predictions for a validation/test set."
-       << std::endl;
-    logprogress_stream << ss.str() << std::endl;
+  if (not simple_mode) {
+    size_t num_dims = get_number_of_coefficients(this->ml_mdata);
+    if(num_dims >= X.num_rows()) {
+      std::stringstream ss;
+      ss << "WARNING: The number of feature dimensions in this problem is "
+         << "very large in comparison with the number of examples. Unless "
+         << "an appropriate regularization value is set, this model "
+         << "may not provide accurate predictions for a validation/test set."
+         << std::endl;
+      logprogress_stream << ss.str() << std::endl;
+    }
   }
-  */

   ml_data valid_data;
   if (valid_X.num_rows() > 0) {
@@ -175,8 +178,9 @@ void supervised_learning_model_base::init(const sframe& X, const sframe& y,
   // Finally call the model-specific init function.
   model_specific_init(data, valid_data);

-  // Raise error if mean and variance are not finite.
-  check_feature_means_and_variances(this->ml_mdata, show_extra_warnings);
+  // Raise error if mean and variance are not finite
+  check_feature_means_and_variances(this->ml_mdata,
+                                    show_extra_warnings && (not simple_mode));

   // One class classification error message.
   if(this->is_classifier()) {
diff --git a/src/visualization/Turi Create Visualization.xcodeproj/project.pbxproj b/src/visualization/Turi Create Visualization.xcodeproj/project.pbxproj
index 51820f848a..610f3ddf88 100644
--- a/src/visualization/Turi Create Visualization.xcodeproj/project.pbxproj
+++ b/src/visualization/Turi Create Visualization.xcodeproj/project.pbxproj
@@ -16,29 +16,12 @@
     FC245BED1F4F541D009F54C6 /* VegaContainer.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC245BEC1F4F541D009F54C6 /* VegaContainer.swift */; };
     FC245BF31F4F7FFB009F54C6 /* JSON.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC245BF21F4F7FFB009F54C6 /* JSON.swift */; };
     FC245BF51F4F8110009F54C6 /* Pipe.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC245BF41F4F8110009F54C6 /* Pipe.swift */; };
-    FC3E34CD206EEF3100B6B69A /* package.json in Resources */ = {isa = PBXBuildFile; fileRef = FC3E34CC206EEF3100B6B69A /* package.json */; };
-    FC3E34D1206EF08500B6B69A /* index.js in Resources */ = {isa = PBXBuildFile; fileRef = FC3E34D0206EF08500B6B69A /* index.js */; };
-    FC3E34D3206EF09800B6B69A /* index.css in Resources */ = {isa = PBXBuildFile; fileRef = FC3E34D2206EF09800B6B69A /* index.css */; };
-    FC3E34D5206EF12300B6B69A /* build in Resources */ = {isa = PBXBuildFile; fileRef = FC3E34D4206EF12300B6B69A /* build */; };
-    FC3E34DD206EF4EB00B6B69A /* index.js in Resources */ = {isa = PBXBuildFile; fileRef = FC3E34DC206EF4EB00B6B69A /* index.js */; };
-    FC3E34DF206EF4F600B6B69A /* index.css in Resources */ = {isa = PBXBuildFile; fileRef = FC3E34DE206EF4F600B6B69A /* index.css */; };
-    FC3E34E1206EF50100B6B69A /* index.js in Resources */ = {isa = PBXBuildFile; fileRef = FC3E34E0206EF50100B6B69A /* index.js */; };
-    FC3E34E3206EF50E00B6B69A /* index.js in Resources */ = {isa = PBXBuildFile; fileRef = FC3E34E2206EF50E00B6B69A /* index.js */; };
-    FC3E34E6206EF52000B6B69A /* index.css in Resources */ = {isa = PBXBuildFile; fileRef = FC3E34E5206EF52000B6B69A /* index.css */; };
     FC5F0B3C1F5F3A3E0001BCEE /* Error.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC5F0B3B1F5F3A3E0001BCEE /* Error.swift */; };
-    FC78046020729FF9004BE45B /* index.js in Resources */ = {isa = PBXBuildFile; fileRef = FC78045F20729FF9004BE45B /* index.js */; };
-    FC7804672072A061004BE45B /* index.js in Resources */ = {isa = PBXBuildFile; fileRef = FC7804662072A061004BE45B /* index.js */; };
-    FC7804692072A075004BE45B /* index.js in Resources */ = {isa = PBXBuildFile; fileRef = FC7804682072A075004BE45B /* index.js */; };
-    FC78046C2072A0F6004BE45B /* index.js in Resources */ = {isa = PBXBuildFile; fileRef = FC78046B2072A0F6004BE45B /* index.js */; };
-    FC78046E2072A106004BE45B /* index.css in Resources */ = {isa = PBXBuildFile; fileRef = FC78046D2072A106004BE45B /* index.css */; };
-    FC7804702072A115004BE45B /* index.css in Resources */ = {isa = PBXBuildFile; fileRef = FC78046F2072A115004BE45B /* index.css */; };
-    FC7804722072A14B004BE45B /* index.css in Resources */ = {isa = PBXBuildFile; fileRef = FC7804712072A14B004BE45B /* index.css */; };
-    FC7804742072A168004BE45B /* index.css in Resources */ = {isa = PBXBuildFile; fileRef = FC7804732072A168004BE45B /* index.css */; };
     FC9066011F425B4800AE3881 /* AppData.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9066001F425B4800AE3881 /* AppData.swift */; };
-    FCB8DFDE206EF548001A7089 /* index.css in Resources */ = {isa = PBXBuildFile; fileRef = FCB8DFDD206EF548001A7089 /* index.css */; };
     FCB8DFE1206EF950001A7089 /* index.html in Resources */ = {isa = PBXBuildFile; fileRef = FCB8DFE0206EF950001A7089 /* index.html */; };
     FCB8DFE3206F1756001A7089 /* index.css in Resources */ = {isa = PBXBuildFile; fileRef = FCB8DFE2206F1756001A7089 /* index.css */; };
     FCB8DFE5206F187D001A7089 /* index.js in Resources */ = {isa = PBXBuildFile; fileRef = FCB8DFE4206F187D001A7089 /* index.js */; };
+    FCE3954420B4C2CB0029FDB5 /* build in Resources */ = {isa = PBXBuildFile; fileRef = FCE3954320B4C2CB0029FDB5 /* build */; };
 /* End PBXBuildFile section */

 /* Begin PBXFileReference section */
@@ -56,7 +39,6 @@
     FC3E34CC206EEF3100B6B69A /* package.json */ = {isa = PBXFileReference; lastKnownFileType = text.json; path = package.json; sourceTree = "<group>"; };
     FC3E34D0206EF08500B6B69A /* index.js */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.javascript; path = index.js; sourceTree = "<group>"; };
     FC3E34D2206EF09800B6B69A /* index.css */ = {isa = PBXFileReference; lastKnownFileType = text.css; path = index.css; sourceTree = "<group>"; };
-    FC3E34D4206EF12300B6B69A /* build */ = {isa = PBXFileReference; lastKnownFileType = folder; path = build; sourceTree = "<group>"; };
     FC3E34DC206EF4EB00B6B69A /* index.js */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.javascript; path = index.js; sourceTree = "<group>"; };
     FC3E34DE206EF4F600B6B69A /* index.css */ = {isa = PBXFileReference; lastKnownFileType = text.css; path = index.css; sourceTree = "<group>"; };
     FC3E34E0206EF50100B6B69A /* index.js */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.javascript; path = index.js; sourceTree = "<group>"; };
@@ -76,6 +58,7 @@
     FCB8DFE0206EF950001A7089 /* index.html */ = {isa = PBXFileReference; lastKnownFileType = text.html; path = index.html; sourceTree = "<group>"; };
     FCB8DFE2206F1756001A7089 /* index.css */ = {isa = PBXFileReference; lastKnownFileType = text.css; path = index.css; sourceTree = "<group>"; };
    FCB8DFE4206F187D001A7089 /* index.js */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.javascript; path = index.js; sourceTree = "<group>"; };
+    FCE3954320B4C2CB0029FDB5 /* build */ = {isa = PBXFileReference; lastKnownFileType = folder; path = build; sourceTree = "<group>"; };
 /* End PBXFileReference section */

 /* Begin PBXFrameworksBuildPhase section */
@@ -136,7 +119,7 @@
     FC1A87821FB1464D00A67DAD /* user_interface */ = {
       isa = PBXGroup;
       children = (
-        FC3E34D4206EF12300B6B69A /* build */,
+        FCE3954320B4C2CB0029FDB5 /* build */,
         FCB8DFDF206EF93A001A7089 /* public */,
         FC3E34CF206EF04F00B6B69A /* src */,
         FC3E34CC206EEF3100B6B69A /* package.json */,
@@ -318,27 +301,10 @@
       buildActionMask = 2147483647;
       files = (
         FCB8DFE5206F187D001A7089 /* index.js in Resources */,
-        FC7804742072A168004BE45B /* index.css in Resources */,
-        FC3E34D5206EF12300B6B69A /* build in Resources */,
-        FC7804722072A14B004BE45B /* index.css in Resources */,
         FCB8DFE1206EF950001A7089 /* index.html in Resources */,
-        FC3E34CD206EEF3100B6B69A /* package.json in Resources */,
-        FC3E34D1206EF08500B6B69A /* index.js in Resources */,
         FCB8DFE3206F1756001A7089 /* index.css in Resources */,
-        FC3E34E6206EF52000B6B69A /* index.css in Resources */,
+        FCE3954420B4C2CB0029FDB5 /* build in Resources */,
         9226E7C81F366F2E00C33A64 /* Assets.xcassets in Resources */,
-        FC3E34D3206EF09800B6B69A /* index.css in Resources */,
-        FC7804672072A061004BE45B /* index.js in Resources */,
-        FC3E34E1206EF50100B6B69A /* index.js in Resources */,
-        FC7804702072A115004BE45B /* index.css in Resources */,
-        FC78046C2072A0F6004BE45B /* index.js in Resources */,
-        FC7804692072A075004BE45B /* index.js in Resources */,
-        FCB8DFDE206EF548001A7089 /* index.css in Resources */,
-        FC3E34E3206EF50E00B6B69A /* index.js in Resources */,
-        FC78046020729FF9004BE45B /* index.js in Resources */,
-        FC78046E2072A106004BE45B /* index.css in Resources */,
-        FC3E34DF206EF4F600B6B69A /* index.css in Resources */,
-        FC3E34DD206EF4EB00B6B69A /* index.js in Resources */,
         9226E7CB1F366F2E00C33A64 /* Main.storyboard in Resources */,
       );
       runOnlyForDeploymentPostprocessing = 0;
diff --git a/src/visualization/Turi Create Visualization/src/user_interface/src/elements/Explore/Table/index.js b/src/visualization/Turi Create Visualization/src/user_interface/src/elements/Explore/Table/index.js
index 3da67081d7..e5b6c9b1c4 100644
--- a/src/visualization/Turi Create Visualization/src/user_interface/src/elements/Explore/Table/index.js
+++ b/src/visualization/Turi Create Visualization/src/user_interface/src/elements/Explore/Table/index.js
@@ -590,13 +590,24 @@ class TcTable extends Component {
          rows.push(
            {empty_cells_2}
          );
+
+         var empty_cells_3 = [];
+         empty_cells_3.push( );
+
+         for(var x = 1; x < cells.length;x++){
+           empty_cells_3.push( );
+         }
+
+         rows.push(
+           {empty_cells_3}
+         );
        }
      }

      var n = Math.floor(Math.min(...row_ids)/this.step_size);
-      this.set_higher = n + 2;
+      this.set_higher = n + 3;
      this.set_lower = n;

      var parent_context = this;
diff --git a/src/visualization/Turi Create Visualization/src/user_interface/src/elements/Explore/Table/sticky-table/index.js b/src/visualization/Turi Create Visualization/src/user_interface/src/elements/Explore/Table/sticky-table/index.js
index 085b8be567..daba80fc92 100644
--- a/src/visualization/Turi Create Visualization/src/user_interface/src/elements/Explore/Table/sticky-table/index.js
+++ b/src/visualization/Turi Create Visualization/src/user_interface/src/elements/Explore/Table/sticky-table/index.js
@@ -329,7 +329,7 @@ class StickyTable extends PureComponent {
     if(document.getElementById("data_container")){
-      document.getElementById("data_container").style.height = this.getModeHeights()*3 - 30 + "px"
+      document.getElementById("data_container").style.height = this.getModeHeights()*4 - 30 + "px"
       document.getElementById("data_container").style.width = (this.xScrollbar.clientWidth - 30) + "px";
       document.getElementById("data_container").style.left = 15 + "px";
       document.getElementById("data_container").style.top = column_offset_top + this.getModeHeights() + 15 + "px";
diff --git a/userguide/style_transfer/export-coreml.md b/userguide/style_transfer/export-coreml.md
index 3c625f5530..dc239c9788 100644
--- a/userguide/style_transfer/export-coreml.md
+++ b/userguide/style_transfer/export-coreml.md
@@ -39,7 +39,7 @@ Now, you can stylize your images using:

 let mlModel = MyStyleTransferModel()
 let visionModel = try VNCoreMLModel(for: mlModel)

-let styleTranser = VNCoreMLRequest(model: visionModel, completionHandler: { (request, error) in
+let styleTransfer = VNCoreMLRequest(model: visionModel, completionHandler: { (request, error) in
     guard let results = request.results else { return }
     for case let styleTransferedImage as VNPixelBufferObservation in results {
@@ -47,4 +47,4 @@ let styleTranser = VNCoreMLRequest(model: visionModel, completionHandler: { (req
         imageLayer.contents = CIImage(cvPixelBuffer: styleTransferedImage.pixelBuffer, options: [:])
     }
 })
-```
+```
\ No newline at end of file
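
Addendum (not part of the patch above): a minimal sketch of how the `styleTransfer` request from the user-guide snippet might be driven for a single frame. It assumes an input `CVPixelBuffer` named `inputBuffer`; everything except the Vision API calls is illustrative.

```swift
import Vision
import CoreVideo

/// Runs the `styleTransfer` VNCoreMLRequest (built in the snippet above) on one frame.
/// The completion handler attached to `styleTransfer` receives the
/// VNPixelBufferObservation results and updates `imageLayer`.
func stylize(_ inputBuffer: CVPixelBuffer) {
    let handler = VNImageRequestHandler(cvPixelBuffer: inputBuffer, options: [:])
    do {
        try handler.perform([styleTransfer])
    } catch {
        print("Style transfer failed: \(error)")
    }
}
```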