diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h
index 85f1044..435e906 100644
--- a/include/hashinator/hashinator.h
+++ b/include/hashinator/hashinator.h
@@ -63,6 +63,7 @@ class Hashmap {
 private:
    // CUDA device handle
    Hashmap* device_map;
+   split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>* device_buckets;
    //~CUDA device handle

    // Host members
@@ -83,6 +84,7 @@ class Hashmap {
    void preallocate_device_handles() {
 #ifndef HASHINATOR_CPU_ONLY_MODE
       SPLIT_CHECK_ERR(split_gpuMalloc((void**)&device_map, sizeof(Hashmap)));
+      device_buckets = (split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>*)((char*)device_map + offsetof(Hashmap, buckets));
 #endif
    }
@@ -94,6 +96,7 @@ class Hashmap {
 #ifndef HASHINATOR_CPU_ONLY_MODE
       SPLIT_CHECK_ERR(split_gpuFree(device_map));
       device_map = nullptr;
+      device_buckets = nullptr;
 #endif
    }
@@ -107,6 +110,7 @@ class Hashmap {
       *_mapInfo = MapInfo(5);
       buckets = split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>(
          1 << _mapInfo->sizePower, hash_pair<KEY_TYPE, VAL_TYPE>(EMPTYBUCKET, VAL_TYPE()));
+      SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
    };

    Hashmap(int sizepower) {
@@ -115,6 +119,7 @@ class Hashmap {
       *_mapInfo = MapInfo(sizepower);
       buckets = split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>(
          1 << _mapInfo->sizePower, hash_pair<KEY_TYPE, VAL_TYPE>(EMPTYBUCKET, VAL_TYPE()));
+      SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
    };

    Hashmap(const Hashmap& other) {
@@ -122,6 +127,7 @@ class Hashmap {
       _mapInfo = _metaAllocator.allocate(1);
       *_mapInfo = *(other._mapInfo);
       buckets = other.buckets;
+      SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
    };

    Hashmap(Hashmap&& other) {
@@ -129,6 +135,7 @@ class Hashmap {
       _mapInfo = other._mapInfo;
       other._mapInfo=nullptr;
       buckets = std::move(other.buckets);
+      SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
    };

    Hashmap& operator=(const Hashmap& other) {
@@ -137,6 +144,7 @@ class Hashmap {
       }
       *_mapInfo = *(other._mapInfo);
       buckets = other.buckets;
+      SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
       return *this;
    }
@@ -146,7 +154,8 @@ class Hashmap {
          return;
       }
       SPLIT_CHECK_ERR(split_gpuMemcpyAsync(_mapInfo,other._mapInfo, sizeof(MapInfo), split_gpuMemcpyDeviceToDevice, stream));
-      buckets.overwrite(other.buckets);
+      buckets.overwrite(other.buckets, stream);
+      SPLIT_CHECK_ERR(split_gpuMemcpyAsync(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice, stream));
       return;
    }
@@ -157,7 +166,8 @@ class Hashmap {
       _metaAllocator.deallocate(_mapInfo, 1);
       _mapInfo = other._mapInfo;
       other._mapInfo=nullptr;
-      buckets =std::move(other.buckets);
+      buckets = std::move(other.buckets);
+      SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
       return *this;
    }
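Note on the hunks above: the Hashmap now keeps a full copy of itself in device memory (device_map) and re-uploads it whenever the host-side object changes, while device_buckets is not a separate allocation at all — it merely aliases the buckets member inside that device copy via offsetof. A minimal standalone sketch of the same idea follows; it uses plain CUDA runtime calls instead of the split_gpu* wrappers, and the names (Table, loadFactor, device_copy) are illustrative only, not part of Hashinator.

    #include <cstddef>
    #include <cuda_runtime.h>

    struct Table {
       int sizePower;
       float loadFactor;
       Table* device_copy = nullptr;       // device-resident mirror of *this
       float* device_loadFactor = nullptr; // alias into the mirror, not a second buffer

       Table(int sp, float lf) : sizePower(sp), loadFactor(lf) {
          cudaMalloc((void**)&device_copy, sizeof(Table));
          // Derived once; the alias stays valid for the lifetime of device_copy.
          device_loadFactor = (float*)((char*)device_copy + offsetof(Table, loadFactor));
          sync();
       }
       ~Table() { cudaFree(device_copy); }

       // Push host-side changes so kernels reading the mirror see current state,
       // mirroring the split_gpuMemcpy(device_map, this, ...) calls added above.
       void sync() { cudaMemcpy(device_copy, this, sizeof(Table), cudaMemcpyHostToDevice); }
    };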
@@ -246,11 +256,15 @@ class Hashmap {
       buckets = newBuckets;
       _mapInfo->currentMaxBucketOverflow = Hashinator::defaults::BUCKET_OVERFLOW;
       _mapInfo->tombstoneCounter = 0;
+      #ifndef HASHINATOR_CPU_ONLY_MODE
+      SPLIT_CHECK_ERR(split_gpuMemcpy(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice));
+      #endif
    }

 #ifndef HASHINATOR_CPU_ONLY_MODE
    // Resize the table to fit more things. This is automatically invoked once
    // maxBucketOverflow has triggered. This can only be done on host (so far)
+   template <bool prefetches = true>
    void device_rehash(int newSizePower, split_gpuStream_t s = 0) {
       if (newSizePower > 32) {
          throw std::out_of_range("Hashmap ran into rehashing catastrophe and exceeded 32bit buckets.");
       }
@@ -261,8 +275,10 @@ class Hashmap {
       hash_pair<KEY_TYPE, VAL_TYPE>* validElements;
       SPLIT_CHECK_ERR(split_gpuMallocAsync((void**)&validElements, (_mapInfo->fill + 1) * sizeof(hash_pair<KEY_TYPE, VAL_TYPE>), s));
-      optimizeGPU(s);
-      SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
+      if (prefetches) {
+         optimizeGPU(s);
+         SPLIT_CHECK_ERR(split_gpuStreamSynchronize(s));
+      }

       auto isValidKey = [] __host__ __device__(hash_pair<KEY_TYPE, VAL_TYPE>& element) {
          if (element.first != TOMBSTONE && element.first != EMPTYBUCKET) {
          ...
@@ -283,10 +299,16 @@ class Hashmap {
          split_gpuFreeAsync(validElements, s);
          return;
       }
-      optimizeCPU(s);
-      buckets = std::move(split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>(
-         1 << newSizePower, hash_pair<KEY_TYPE, VAL_TYPE>(EMPTYBUCKET, VAL_TYPE())));
-      optimizeGPU(s);
+      if (newSizePower == _mapInfo->sizePower) {
+         // Just clear the current contents
+         DeviceHasher::reset_all(buckets.data(),_mapInfo, buckets.size(), s);
+      } else {
+         // Need new buckets
+         buckets = std::move(split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>(
+            1 << newSizePower, hash_pair<KEY_TYPE, VAL_TYPE>(EMPTYBUCKET, VAL_TYPE())));
+         SPLIT_CHECK_ERR(split_gpuMemcpyAsync(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice, s));
+         optimizeGPU(s);
+      }
       *_mapInfo = Info(newSizePower);
       // Insert valid elements to now larger buckets
       insert(validElements, nValidElements, 1, s);
@@ -430,7 +452,8 @@ class Hashmap {
       return;
    }
 #else
-   void clear(targets t = targets::host, split_gpuStream_t s = 0, bool prefetches = true) {
+   template <bool prefetches = true>
+   void clear(targets t = targets::host, split_gpuStream_t s = 0, size_t len = 0) {
       switch (t) {
       case targets::host:
          buckets =
          ...
@@ -440,9 +463,12 @@ class Hashmap {
       case targets::device:
          if (prefetches) {
-            buckets.optimizeGPU(s);
+            optimizeGPU(s);
          }
-         DeviceHasher::reset_all(buckets.data(),_mapInfo, buckets.size(), s);
+         if (len==0) { // If size is provided, no need to page fault size information.
+            len = buckets.size();
+         }
+         DeviceHasher::reset_all(buckets.data(),_mapInfo, len, s);
 #ifdef HASHINATOR_DEBUG
         set_status((_mapInfo->fill == 0) ? success : fail);
 #endif
         ...
@@ -561,6 +587,7 @@ class Hashmap {
       buckets.swap(other.buckets);
       std::swap(_mapInfo, other._mapInfo);
       std::swap(device_map, other.device_map);
+      std::swap(device_buckets, other.device_buckets);
    }

 #ifdef HASHINATOR_CPU_ONLY_MODE
    ...
@@ -576,12 +603,13 @@ class Hashmap {
    }
 #else
    // Try to get the overflow back to the original one
+   template <bool prefetches = true>
    void performCleanupTasks(split_gpuStream_t s = 0) {
-      while (_mapInfo->currentMaxBucketOverflow > Hashinator::defaults::BUCKET_OVERFLOW) {
-         device_rehash(_mapInfo->sizePower + 1, s);
-      }
       if (tombstone_ratio() > 0.025) {
-         clean_tombstones(s);
+         clean_tombstones(s);
+      }
+      while (_mapInfo->currentMaxBucketOverflow > Hashinator::defaults::BUCKET_OVERFLOW) {
+         device_rehash(_mapInfo->sizePower + 1, s);
       }
    }
@@ -1080,9 +1108,9 @@ class Hashmap {
    * Then call this:
    * hmap.extractPattern(elements,Rule());
    * */
-   template <typename Rule>
+   template <typename Rule, bool prefetches = true>
    size_t extractPattern(split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>& elements, Rule rule,
-                         split_gpuStream_t s = 0, bool prefetches = true) {
+                         split_gpuStream_t s = 0) {
       elements.resize(_mapInfo->fill + 1, true);
       if (prefetches) {
          elements.optimizeGPU(s);
       }
       ...
@@ -1116,7 +1144,7 @@ class Hashmap {
    void extractPatternLoop(split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>& elements, Rule rule, split_gpuStream_t s = 0) {
       // Extract elements matching the Pattern Rule(element)==true;
       split::tools::copy_if_loop<hash_pair<KEY_TYPE, VAL_TYPE>, Rule, defaults::MAX_BLOCKSIZE,
-                                 defaults::WARPSIZE>(buckets, elements, rule, s);
+                                 defaults::WARPSIZE>(*device_buckets, elements, rule, s);
    }
    void extractLoop(split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>& elements, split_gpuStream_t s = 0) {
       // Extract all valid elements
       ...
       extractPatternLoop(elements, rule, s);
    }
@@ -1126,9 +1154,8 @@ class Hashmap {
-   template <typename Rule>
-   size_t extractKeysByPattern(split::SplitVector<KEY_TYPE>& elements, Rule rule, split_gpuStream_t s = 0,
-                               bool prefetches = true) {
+   template <typename Rule, bool prefetches = true>
+   size_t extractKeysByPattern(split::SplitVector<KEY_TYPE>& elements, Rule rule, split_gpuStream_t s = 0) {
       elements.resize(_mapInfo->fill + 1, true);
       if (prefetches) {
          elements.optimizeGPU(s);
       }
       ...
                                 defaults::WARPSIZE>(buckets, elements, rule, s);
       //FIXME: there is an issue where paging to host occurs and following calls to hashmap operations take a hit.
       //temp fix: call optimizeGPU() here
-      optimizeGPU(s);
+      if (prefetches) {
+         optimizeGPU(s);
+      }
       return elements.size();
    }

-   template <typename Rule>
-   size_t extractKeysByPattern(split::SplitVector<KEY_TYPE>& elements, Rule rule, void *stack, size_t max_size, split_gpuStream_t s = 0,
-                               bool prefetches = true) {
+   template <typename Rule, bool prefetches = true>
+   size_t extractKeysByPattern(split::SplitVector<KEY_TYPE>& elements, Rule rule, void *stack, size_t max_size, split_gpuStream_t s = 0) {
       elements.resize(_mapInfo->fill + 1, true);
       if (prefetches) {
          elements.optimizeGPU(s);
       }
       ...
@@ -1157,17 +1185,19 @@ class Hashmap {
    void extractKeysByPatternLoop(split::SplitVector<KEY_TYPE>& elements, Rule rule, split_gpuStream_t s = 0) {
       // Extract element **keys** matching the Pattern Rule(element)==true;
       split::tools::copy_if_keys_loop<hash_pair<KEY_TYPE, VAL_TYPE>, KEY_TYPE, Rule, defaults::MAX_BLOCKSIZE,
-                                      defaults::WARPSIZE>(buckets, elements, rule, s);
+                                      defaults::WARPSIZE>(*device_buckets, elements, rule, s);
    }

-   size_t extractAllKeys(split::SplitVector<KEY_TYPE>& elements, split_gpuStream_t s = 0, bool prefetches = true) {
+   template <bool prefetches = true>
+   size_t extractAllKeys(split::SplitVector<KEY_TYPE>& elements, split_gpuStream_t s = 0) {
       // Extract all keys
       auto rule = [] __host__ __device__(const hash_pair<KEY_TYPE, VAL_TYPE>& kval) -> bool {
          return kval.first != EMPTYBUCKET && kval.first != TOMBSTONE;
       };
       return extractKeysByPattern(elements, rule, s, prefetches);
    }
-   size_t extractAllKeys(split::SplitVector<KEY_TYPE>& elements, void *stack, size_t max_size, split_gpuStream_t s = 0, bool prefetches = true) {
+   template <bool prefetches = true>
+   size_t extractAllKeys(split::SplitVector<KEY_TYPE>& elements, void *stack, size_t max_size, split_gpuStream_t s = 0) {
       // Extract all keys
       auto rule = [] __host__ __device__(const hash_pair<KEY_TYPE, VAL_TYPE>& kval) -> bool {
          return kval.first != EMPTYBUCKET && kval.first != TOMBSTONE;
       ...
@@ -1182,7 +1212,8 @@ class Hashmap {
       extractKeysByPatternLoop(elements, rule, s);
    }

-   void clean_tombstones(split_gpuStream_t s = 0, bool prefetches = false) {
+   template <bool prefetches = false>
+   void clean_tombstones(split_gpuStream_t s = 0) {
       if (_mapInfo->tombstoneCounter == 0) {
          return;
       }
       ...
@@ -1240,8 +1271,8 @@ class Hashmap {
    }

    // Uses Hasher's insert_kernel to insert all elements
-   void insert(KEY_TYPE* keys, VAL_TYPE* vals, size_t len, float targetLF = 0.5, split_gpuStream_t s = 0,
-               bool prefetches = true) {
+   template <bool prefetches = true>
+   void insert(KEY_TYPE* keys, VAL_TYPE* vals, size_t len, float targetLF = 0.5, split_gpuStream_t s = 0) {
       // Here we do some calculations to estimate how much if any we need to grow our buckets
       // TODO fix these if paths or at least annotate them .
       if (len == 0) {
       ...
@@ -1262,7 +1293,8 @@ class Hashmap {
    }

    // Uses Hasher's insert_index_kernel to insert all elements, with the index as the value
-   void insertIndex(KEY_TYPE* keys, size_t len, float targetLF = 0.5, split_gpuStream_t s = 0, bool prefetches = true) {
+   template <bool prefetches = true>
+   void insertIndex(KEY_TYPE* keys, size_t len, float targetLF = 0.5, split_gpuStream_t s = 0) {
       // Here we do some calculations to estimate how much if any we need to grow our buckets
       // TODO fix these if paths or at least annotate them .
       if (len == 0) {
       ...
@@ -1283,8 +1315,8 @@ class Hashmap {
    }

    // Uses Hasher's insert_kernel to insert all elements
-   void insert(hash_pair<KEY_TYPE, VAL_TYPE>* src, size_t len, float targetLF = 0.5, split_gpuStream_t s = 0,
-               bool prefetches = true) {
+   template <bool prefetches = true>
+   void insert(hash_pair<KEY_TYPE, VAL_TYPE>* src, size_t len, float targetLF = 0.5, split_gpuStream_t s = 0) {
       if (len == 0) {
          set_status(status::success);
          return;
       }
       ...
@@ -1303,7 +1335,8 @@ class Hashmap {
    }

    // Uses Hasher's retrieve_kernel to read all elements
-   void retrieve(KEY_TYPE* keys, VAL_TYPE* vals, size_t len, split_gpuStream_t s = 0,bool prefetches=true) {
+   template <bool prefetches = true>
+   void retrieve(KEY_TYPE* keys, VAL_TYPE* vals, size_t len, split_gpuStream_t s = 0) {
       if (prefetches){
          buckets.optimizeGPU(s);
       }
       ...
@@ -1313,7 +1346,8 @@ class Hashmap {
    }

    // Uses Hasher's retrieve_kernel to read all elements
-   void retrieve(hash_pair<KEY_TYPE, VAL_TYPE>* src, size_t len, split_gpuStream_t s = 0, bool prefetches=true) {
+   template <bool prefetches = true>
+   void retrieve(hash_pair<KEY_TYPE, VAL_TYPE>* src, size_t len, split_gpuStream_t s = 0) {
       if (prefetches){
          buckets.optimizeGPU(s);
       }
       ...
@@ -1322,7 +1356,8 @@ class Hashmap {
    }

    // Uses Hasher's erase_kernel to delete all elements
-   void erase(KEY_TYPE* keys, size_t len, split_gpuStream_t s = 0,bool prefetches=true) {
+   template <bool prefetches = true>
+   void erase(KEY_TYPE* keys, size_t len, split_gpuStream_t s = 0) {
       if (prefetches){
          buckets.optimizeGPU(s);
       }
       ...
@@ -1341,8 +1376,11 @@ class Hashmap {
    * The pointer is internally cleaned up by the destructors, however the user **must**
    * call download() after usage on device.
    */
+   template <bool prefetches = true>
    Hashmap* upload(split_gpuStream_t stream = 0) {
-      optimizeGPU(stream);
+      if (prefetches) {
+         optimizeGPU(stream);
+      }
       SPLIT_CHECK_ERR(split_gpuMemcpyAsync(device_map, this, sizeof(Hashmap), split_gpuMemcpyHostToDevice, stream));
       return device_map;
    }
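Throughout the hashinator.h hunks above, the runtime `bool prefetches` argument is promoted to a template parameter; the stripped template headers are reconstructed here as `template <bool prefetches = true>` (or `= false` for clean_tombstones), which is what the bodies' continued use of `prefetches` implies. A minimal standalone sketch of that pattern, using plain CUDA calls rather than the split_gpu* wrappers, with `Map`/`touch` as illustrative names that are not part of Hashinator:

    #include <cuda_runtime.h>

    struct Map {
       int* data = nullptr; // assumed to be a managed (unified-memory) allocation
       size_t n = 0;

       template <bool prefetches = true>
       void touch(cudaStream_t s = 0) {
          if constexpr (prefetches) {
             // Compiled only into the <true> instantiation; the <false> one
             // carries neither the branch nor an unused default argument.
             cudaMemPrefetchAsync(data, n * sizeof(int), 0 /*device id*/, s);
          }
          // ... launch the kernel that actually uses data here ...
       }
    };

    // Usage: the default keeps the old prefetching behaviour, while callers whose
    // data is already resident on the device opt out at compile time:
    //    m.touch(stream);         // prefetches first
    //    m.touch<false>(stream);  // skips the prefetch entirely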
diff --git a/include/splitvector/split_tools.h b/include/splitvector/split_tools.h
index d2f8c78..1779444 100644
--- a/include/splitvector/split_tools.h
+++ b/include/splitvector/split_tools.h
@@ -1005,7 +1005,7 @@ void copy_if_loop(
     split::SplitVector<T, split::split_unified_allocator<T>>& input,
     split::SplitVector<T, split::split_unified_allocator<T>>& output, Rule rule,
     split_gpuStream_t s = 0) {
-   #ifndef NDEBUG
+   #ifdef HASHINATOR_DEBUG
    bool input_ok = isDeviceAccessible( reinterpret_cast<void*>(&input));
    bool output_ok= isDeviceAccessible( reinterpret_cast<void*>(&output));
    assert( (input_ok && output_ok) && "This method supports splitvectors dynamically allocated on device or unified memory!");
@@ -1018,7 +1018,7 @@ void copy_if_keys_loop(
     split::SplitVector<T, split::split_unified_allocator<T>>& input,
     split::SplitVector<U, split::split_unified_allocator<U>>& output, Rule rule,
     split_gpuStream_t s = 0) {
-   #ifndef NDEBUG
+   #ifdef HASHINATOR_DEBUG
    bool input_ok = isDeviceAccessible( reinterpret_cast<void*>(&input));
    bool output_ok= isDeviceAccessible( reinterpret_cast<void*>(&output));
    assert( (input_ok && output_ok) && "This method supports splitvectors dynamically allocated on device or unified memory!");
diff --git a/include/splitvector/splitvec.h b/include/splitvector/splitvec.h
index 84bb2a3..fa59582 100644
--- a/include/splitvector/splitvec.h
+++ b/include/splitvector/splitvec.h
@@ -89,6 +89,7 @@ class SplitVector {
    size_t _alloc_multiplier = 2; // host variable; multiplier for when reserving more space
    Allocator _allocator;         // Allocator used to allocate and deallocate memory;
    Residency _location;          // Flags that describes the current residency of our data
+   SplitVector* d_vec;           // device copy pointer

    /**
     * @brief Checks if a pointer is valid and throws an exception if it's null.
@@ -213,6 +214,7 @@ class SplitVector {
     */
    HOSTONLY explicit SplitVector() : _location(Residency::host) {
       this->_allocate(0); // seems counter-intuitive based on stl but it is not!
+      d_vec = NULL;
    }

    /**
@@ -220,7 +222,10 @@ class SplitVector {
     *
     * @param size The size of the SplitVector to be created.
     */
-   HOSTONLY explicit SplitVector(size_t size) : _location(Residency::host) { this->_allocate(size); }
+   HOSTONLY explicit SplitVector(size_t size) : _location(Residency::host) {
+      this->_allocate(size);
+      d_vec = NULL;
+   }

    /**
     * @brief Constructor to create a SplitVector of a specified size with initial values.
@@ -233,6 +238,7 @@ class SplitVector {
       for (size_t i = 0; i < size; i++) {
          _data[i] = val;
       }
+      d_vec = NULL;
    }

    /**
@@ -269,6 +275,7 @@ class SplitVector {
       }
       copySafe();
       _location = Residency::host;
+      d_vec = NULL;
    }
 #endif
    /**
@@ -284,6 +291,7 @@ class SplitVector {
       *(other._size) = 0;
       other._data = nullptr;
       _location = other._location;
+      d_vec = NULL;
    }

    /**
@@ -296,6 +304,7 @@ class SplitVector {
       for (size_t i = 0; i < size(); i++) {
          _data[i] = init_list.begin()[i];
       }
+      d_vec = NULL;
    }

    /**
@@ -308,12 +317,18 @@ class SplitVector {
       for (size_t i = 0; i < size(); i++) {
          _data[i] = other[i];
       }
+      d_vec = NULL;
    }

    /**
     * @brief Destructor for the SplitVector. Deallocates memory.
     */
-   HOSTONLY ~SplitVector() { _deallocate(); }
+   HOSTONLY ~SplitVector() {
+      _deallocate();
+      if (d_vec != NULL) {
+         SPLIT_CHECK_ERR(split_gpuFree(d_vec));
+      }
+   }

    /**
     * @brief Custom assignment operator to assign the content of another SplitVector.
@@ -357,6 +372,7 @@ class SplitVector {
       }
       copySafe();
       _location = Residency::host;
+      d_vec = NULL;
       return *this;
    }
@@ -376,13 +392,17 @@ class SplitVector {
       if constexpr (std::is_trivially_copyable<T>::value) {
          if (other._location == Residency::device) {
             _location = Residency::device;
-            optimizeGPU(stream);
             SPLIT_CHECK_ERR(split_gpuMemcpyAsync(_data, other._data, size() * sizeof(T), split_gpuMemcpyDeviceToDevice,stream));
+            int device;
+            SPLIT_CHECK_ERR(split_gpuGetDevice(&device));
+            SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_size, sizeof(size_t), device, stream));
+            SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(_capacity, sizeof(size_t), device, stream));
             return;
          }
       }
       copySafe();
       _location = Residency::host;
+      d_vec = NULL;
       return;
    }
@@ -407,6 +427,7 @@ class SplitVector {
       *(other._size) = 0;
       other._data = nullptr;
       _location = other._location;
+      d_vec = NULL;
       return *this;
    }
@@ -456,16 +477,24 @@ class SplitVector {
     *
     * @param stream The GPU stream to perform the upload on.
     * @return Pointer to the uploaded SplitVector on the GPU.
-    *         Has to be split_gpuFree'd after use otherwise memleak (small one but still)!
     */
    HOSTONLY
    SplitVector* upload(split_gpuStream_t stream = 0) {
-      SplitVector* d_vec;
-      optimizeGPU(stream);
-      SPLIT_CHECK_ERR(split_gpuMallocAsync((void**)&d_vec, sizeof(SplitVector), stream));
+      if (d_vec == NULL) {
+         SPLIT_CHECK_ERR(split_gpuMallocAsync((void**)&d_vec, sizeof(SplitVector), stream));
+      }
       SPLIT_CHECK_ERR(split_gpuMemcpyAsync(d_vec, this, sizeof(SplitVector), split_gpuMemcpyHostToDevice, stream));
       return d_vec;
    }

+   /**
+    * @brief Returns pre-uploaded pointer to the SplitVector on the GPU.
+    *
+    * @return Pointer to the uploaded SplitVector on the GPU.
+    */
+   HOSTONLY
+   SplitVector* device_pointer() {
+      return d_vec;
+   }

    /**
     * @brief Manually prefetches data to the GPU.
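The upload()/device_pointer() hunk above changes the ownership model: the device-side copy of the SplitVector header is allocated once, cached in d_vec, refreshed on every upload(), and released by the destructor, so callers no longer split_gpuFree the returned pointer themselves. A condensed sketch of that lifecycle, with plain CUDA calls standing in for the split_gpu* wrappers and Vec as an illustrative stand-in for SplitVector:

    #include <cuda_runtime.h>

    struct Vec {
       int* _data = nullptr;
       size_t _size = 0;
       Vec* d_vec = nullptr; // cached device copy of this header object

       Vec* upload(cudaStream_t s = 0) {
          if (d_vec == nullptr) {
             cudaMallocAsync((void**)&d_vec, sizeof(Vec), s); // first call only
          }
          // Later calls just refresh the existing device copy.
          cudaMemcpyAsync(d_vec, this, sizeof(Vec), cudaMemcpyHostToDevice, s);
          return d_vec;
       }
       Vec* device_pointer() const { return d_vec; } // valid after the first upload()
       ~Vec() {
          if (d_vec) { cudaFree(d_vec); } // owner frees; callers must not
       }
    };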
@@ -731,6 +760,7 @@ class SplitVector {
       }
       reserve(newSize, eco);
       *_size = newSize;
+      // TODO: should it set entries to zero?
    }

    /**
@@ -756,15 +786,26 @@ class SplitVector {
 #else
    /**
-    * @brief Reallocates data to a bigger chunk of memory.
+    * @brief Reallocates data to a bigger (or smaller) chunk of memory.
     *
     * @param requested_space The size of the requested space.
     */
    HOSTONLY
    void reallocate(size_t requested_space, split_gpuStream_t stream = 0) {
+      // Store addresses
+      const size_t __size = *_size;
+      const size_t __old_capacity = *_capacity;
+      T* __old_data = _data;
+      // Verify allocation sufficiency
+      if (__size > requested_space) {
+         printf("Tried reallocating to capacity %d with size %d\n", (int)requested_space, (int)__size);
+         this->_deallocate();
+         throw std::bad_alloc();
+      }
+      // Check for complete deallocation
       if (requested_space == 0) {
          if (_data != nullptr) {
-            _deallocate_and_destroy(capacity(), _data);
+            _deallocate_and_destroy(__old_capacity, __old_data);
          }
          _data = nullptr;
          *_capacity = 0;
@@ -778,28 +819,18 @@ class SplitVector {
         ...
         this->_deallocate();
         throw std::bad_alloc();
      }
-      // Store addresses
-      const size_t __size = *_size;
-      const size_t __old_capacity = *_capacity;
       T* __new_data = _new_data;
-      T* __data = _data;
       // Swap pointers & update capacity
-      // Size remains the same ofc
       _data = _new_data;
       *_capacity = requested_space;
       // Perform copy on device
       if (__size>0) {
-         int device;
-         SPLIT_CHECK_ERR(split_gpuGetDevice(&device));
-         SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(__data, __size * sizeof(T), device, stream));
-         SPLIT_CHECK_ERR(split_gpuMemPrefetchAsync(__new_data, requested_space * sizeof(T), device, stream));
-         SPLIT_CHECK_ERR(split_gpuStreamSynchronize(stream));
-         SPLIT_CHECK_ERR(split_gpuMemcpy(__new_data, __data, __size * sizeof(T), split_gpuMemcpyDeviceToDevice));
+         SPLIT_CHECK_ERR(split_gpuMemcpyAsync(__new_data, __old_data, __size * sizeof(T), split_gpuMemcpyDeviceToDevice,stream));
          SPLIT_CHECK_ERR(split_gpuStreamSynchronize(stream));
       }
       // Deallocate old space
-      _deallocate_and_destroy(__old_capacity, __data);
+      _deallocate_and_destroy(__old_capacity, __old_data);
       return;
    }
@@ -815,7 +846,11 @@ class SplitVector {
     */
    HOSTONLY
    void reserve(size_t requested_space, bool eco = false, split_gpuStream_t stream = 0) {
-      const size_t current_space = *_capacity;
+      // If the users passes eco=true we allocate
+      // exactly what was requested
+      if (!eco) {
+         requested_space *= _alloc_multiplier;
+      }
       // Vector was default initialized
       if (_data == nullptr) {
          _deallocate();
          ...
          *_size = 0;
          return;
       }
@@ -823,23 +858,12 @@ class SplitVector {
-      // Nope.
-      const size_t currentSize = size();
+      // Already has sufficient capacity?
+      const size_t current_space = *_capacity;
       if (requested_space <= current_space) {
-         if (std::is_trivially_constructible<T>::value && _location == Residency::device) {
-            SPLIT_CHECK_ERR( split_gpuMemsetAsync(&_data[currentSize],0,(requested_space-currentSize)*sizeof(T), stream) );
-         } else {
-            for (size_t i = currentSize; i < requested_space; ++i) {
-               _allocator.construct(&_data[i], T());
-            }
-         }
          return;
       }
-      // If the users passes eco=true we allocate
-      // exactly what was requested
-      if (!eco) {
-         requested_space *= _alloc_multiplier;
-      }
+      // Reallocate.
       reallocate(requested_space,stream);
       return;
    }
@@ -864,6 +888,7 @@ class SplitVector {
       }
       reserve(newSize, eco, stream);
       *_size = newSize;
+      // TODO: should it set entries to zero?
    }

    /**
     * @brief ...
@@ -872,12 +897,14 @@ class SplitVector {
     * @param newSize The new size of the SplitVector.
     */
    DEVICEONLY
-   void device_resize(size_t newSize) {
+   void device_resize(size_t newSize, bool construct=true) {
       if (newSize > capacity()) {
          assert(0 && "Splitvector has a catastrophic failure trying to resize on device.");
       }
-      for (size_t i = size(); i < newSize; ++i) {
-         _allocator.construct(&_data[i], T());
+      if (construct) {
+         for (size_t i = size(); i < newSize; ++i) {
+            _allocator.construct(&_data[i], T());
+         }
       }
       *_size = newSize;
    }
    ...
@@ -901,7 +928,6 @@ class SplitVector {
       if (curr_cap == curr_size) {
          return;
       }
-      reallocate(curr_size,stream);
       return;
    }
    ...
@@ -1350,7 +1376,7 @@ class SplitVector {
       }
       // Increase size;
-      device_resize(size() + count);
+      device_resize(size() + count, false); // false means don't construct base objects
       for (size_t i = 0; i < count; ++i) {
          _data[index + i] = *(p0.data() + i);
       }
@@ -1386,11 +1412,11 @@ class SplitVector {
       }
       // Increase size;
+      device_resize(size() + 1, false); // false means don't construct base objects
       for (int64_t i = size() - 1; i >= index; i--) {
          _data[i + 1] = _data[i];
       }
       _data[index] = val;
-      device_resize(size() + 1);
       return iterator(_data + index);
    }
@@ -1414,7 +1440,7 @@ class SplitVector {
          "space available.");
       }

-      device_resize(newSize);
+      device_resize(newSize, false); // false means don't construct base objects
       it = begin().data() + index;
       iterator last = it.data() + oldsize;
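The last three hunks pass construct=false because every slot exposed by the resize is written immediately afterwards, so default-constructing those elements inside a __device__ function is redundant work. A toy sketch of that reasoning; DeviceBuffer and device_push_back are illustrative names, not SplitVector's API, and capacity/concurrency handling is left to the caller:

    #include <cassert>
    #include <cstddef>

    template <typename T>
    struct DeviceBuffer {
       T* _data;
       size_t* _size;
       size_t* _capacity;

       __device__ void device_resize(size_t newSize, bool construct = true) {
          assert(newSize <= *_capacity && "resize beyond capacity is not allowed on device");
          if (construct) {
             for (size_t i = *_size; i < newSize; ++i) {
                _data[i] = T(); // only needed when the new slots are *not* written right away
             }
          }
          *_size = newSize;
       }

       __device__ void device_push_back(const T& val) {
          const size_t idx = *_size;
          device_resize(idx + 1, false); // skip construction: the slot is assigned next
          _data[idx] = val;
       }
    };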