diff --git a/CMakeLists.txt b/CMakeLists.txt
index 16ebf4b8b..277fe0659 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,6 +58,11 @@ if(USE_CUDA AND CMAKE_CUDA_COMPILER)
     "${MODELS_ROOT}/*.cu"
     "${MODELS_ROOT}/*.cuh"
   )
+  file(GLOB test_cuda_srcs CONFIGURE_DEPENDS
+    "${TESTS_ROOT}/*.cu"
+    "${TESTS_ROOT}/*.cuh"
+  )
+  list(APPEND test_srcs ${test_cuda_srcs})
   list(APPEND generator_srcs ${generator_cuda_srcs})
   add_compile_definitions(USE_CUDA=1)
   include_directories("${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}")
diff --git a/README.md b/README.md
index 2cf827ba5..314afbaa3 100644
--- a/README.md
+++ b/README.md
@@ -6,44 +6,10 @@ This library provides the generative AI loop for ONNX models, including inferenc
 Users can call a high level `generate()` method, or run each iteration of the model in a loop.
 
-* Search techniques like greedy/beam search to generate token sequences
-* Built in scoring tools like repetition penalties
+* Greedy/beam search and top-p/top-k sampling to generate token sequences
+* Built-in logits processing, such as repetition penalties
 * Easy custom scoring
 
-## Sample code for phi-2 in Python
-
-Install onnxruntime-genai.
-
-(Temporary) Build and install from source according to the instructions below.
-
-
-```python
-import onnxruntime_genai as og
-
-model=og.Model(f'models/microsoft/phi-2', device_type)
-
-tokenizer = model.create_tokenizer()
-
-prompt = '''def print_prime(n):
-    """
-    Print all primes between 1 and n
-    """'''
-
-tokens = tokenizer.encode(prompt)
-
-params=og.SearchParams(model)
-params.max_length = 200
-params.input_ids = tokens
-
-output_tokens=model.generate(params)
-
-text = tokenizer.decode(output_tokens)
-
-print("Output:")
-print(text)
-```
-
-
 ## Features
 
 * Supported model architectures:
@@ -126,6 +92,39 @@ huggingface-cli login --token 
 python export.py -m microsoft/phi-2 -p int4 -e cpu -o phi2-int4-cpu.onnx
 ```
 
+## Sample code for phi-2 in Python
+
+Install onnxruntime-genai.
+
+(Temporary) Build and install from source according to the instructions below.
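+
+Note: `device_type` in the example below is a placeholder and must be defined before constructing the model. As a sketch (run after the import in the example below; this assumes the package exposes a `DeviceType` enum, so adjust to whatever device selector your installed build expects):
+
+```python
+# Hypothetical device selector; check the API of the build you installed.
+device_type = og.DeviceType.CPU  # or og.DeviceType.CUDA for GPU builds
+```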
+
+```python
+import onnxruntime_genai as og
+
+model = og.Model('models/microsoft/phi-2', device_type)
+
+tokenizer = model.create_tokenizer()
+
+prompt = '''def print_prime(n):
+    """
+    Print all primes between 1 and n
+    """'''
+
+tokens = tokenizer.encode(prompt)
+
+params = og.SearchParams(model)
+params.max_length = 200
+params.input_ids = tokens
+
+output_tokens = model.generate(params)
+
+text = tokenizer.decode(output_tokens)
+
+print("Output:")
+print(text)
+```
+
 ## Contributing
diff --git a/src/beam_search_topk.cu b/src/beam_search_topk.cu
index 1dae92cc4..222561ce8 100644
--- a/src/beam_search_topk.cu
+++ b/src/beam_search_topk.cu
@@ -6,51 +6,7 @@
 namespace Generators {
 namespace cuda {
 
-template <typename T, int max_k>
-struct TopK {
-  int32_t key[max_k];
-  T value[max_k];
-
-  __device__ __forceinline__ void Insert(T elem, int elem_id) {
-    T v = value[max_k - 1];
-    if (v < elem ||
-        (key[max_k - 1] == -1) ||
-        ((elem == value[max_k - 1]) && (elem_id < key[max_k - 1]))) {
-      value[max_k - 1] = elem;
-      key[max_k - 1] = elem_id;
-    }
-
-    for (int k = max_k - 2; k >= 0; --k) {
-      if (value[k + 1] > value[k] ||
-          key[k] == -1 ||
-          ((value[k + 1] == value[k]) && (key[k + 1] < key[k]))) {
-        T u2 = value[k];
-        int p2 = key[k];
-        value[k] = value[k + 1];
-        key[k] = key[k + 1];
-        value[k + 1] = u2;
-        key[k + 1] = p2;
-      }
-    }
-  }
-
-  __device__ __forceinline__ void Init() {
-    for (int i = 0; i < max_k; i++) {
-      key[i] = -1;
-      value[i] = -std::numeric_limits<T>::infinity();
-    }
-  }
-};
-
-template <typename T, int max_k>
-__device__ __forceinline__ TopK<T, max_k> reduce_topk_op(const TopK<T, max_k>& a, const TopK<T, max_k>& b) {
-  TopK<T, max_k> res = a;
-  for (int i = 0; i < max_k; ++i)
-    res.Insert(b.value[i], b.key[i]);
-  return res;
-}
-
-// kernel to compute the top k on last axis for tensor with shape: [batch, beam_size, parts_of_vocab, vacab_part_size]
+// kernel to compute the top k on last axis for tensor with shape: [batch, beam_size, parts_of_vocab, vocab_part_size]
 // Its grid is [batch * beam_size, parts_of_vocab]
 template <typename T, int max_k, int thread_block_size>
 __launch_bounds__(thread_block_size) __global__ void BeamSearchOnlineTopKStage1Kernel(
@@ -319,18 +275,18 @@ void BeamSearchTopK(
       tmp_indices_2nd_stage, \
       tmp_values_1st_stage, \
       tmp_indices_1st_stage, \
-      stream);
+      stream)
 
   if (k <= 4) {
-    TopKLauncher(4)
+    TopKLauncher(4);
   } else if (k <= 8) {
-    TopKLauncher(8)
+    TopKLauncher(8);
   } else if (k <= 16) {
-    TopKLauncher(16)
+    TopKLauncher(16);
   } else if (k <= 32) {
-    TopKLauncher(32)
+    TopKLauncher(32);
   } else {
-    TopKLauncher(64)
+    TopKLauncher(64);
   }
 
   LaunchBatchTopKKernel(tmp_values_2nd_stage,
diff --git a/src/beam_search_topk.h b/src/beam_search_topk.h
index 4bf6210ce..4cd3f4b62 100644
--- a/src/beam_search_topk.h
+++ b/src/beam_search_topk.h
@@ -20,5 +20,49 @@ void BeamSearchTopK(
     int32_t* output_indices,
     cudaStream_t stream);
 
+template <typename T, int max_k>
+struct TopK {
+  int32_t key[max_k];
+  T value[max_k];
+
+  __device__ __forceinline__ void Insert(T elem, int elem_id) {
+    // If elem beats the current smallest entry (or a slot is still empty),
+    // overwrite the last slot, then bubble it up to keep the list sorted.
+    T v = value[max_k - 1];
+    if (v < elem ||
+        (key[max_k - 1] == -1) ||
+        ((elem == value[max_k - 1]) && (elem_id < key[max_k - 1]))) {
+      value[max_k - 1] = elem;
+      key[max_k - 1] = elem_id;
+    }
+
+    for (int k = max_k - 2; k >= 0; --k) {
+      if (value[k + 1] > value[k] ||
+          key[k] == -1 ||
+          ((value[k + 1] == value[k]) && (key[k + 1] < key[k]))) {
+        T u2 = value[k];
+        int p2 = key[k];
+        value[k] = value[k + 1];
+        key[k] = key[k + 1];
+        value[k + 1] = u2;
+        key[k + 1] = p2;
+      }
+    }
+  }
+
+  __device__ __forceinline__ void Init() {
+    for (int i = 0; i < max_k; i++) {
+      key[i] = -1;
+      value[i] = -std::numeric_limits<T>::infinity();
+    }
+  }
+};
+
+template <typename T, int max_k>
+__device__ __forceinline__ TopK<T, max_k> reduce_topk_op(const TopK<T, max_k>& a, const TopK<T, max_k>& b) {
+  TopK<T, max_k> res = a;
+  for (int i = 0; i < max_k; ++i)
+    res.Insert(b.value[i], b.key[i]);
+  return res;
+}
+
 }  // namespace cuda
 }  // namespace Generators
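In the header above, `TopK` keeps a small list sorted by score that `Insert` updates in place, and `reduce_topk_op` merges two such lists, which is what lets a kernel reduce per-thread top-k candidates down to a single result. A minimal sketch of that pattern (illustrative only, not part of this patch; the kernel name and the `scores`/`num_items` parameters are hypothetical):

```cuda
#include <cub/cub.cuh>
#include "beam_search_topk.h"

// Each thread scans a strided slice of `scores`, keeping a private top-k;
// cub::BlockReduce then merges the per-thread lists with reduce_topk_op.
template <typename T, int max_k, int block_size>
__global__ void BlockTopKSketch(const T* scores, int num_items,
                                T* out_values, int32_t* out_indices) {
  using TopKT = Generators::cuda::TopK<T, max_k>;
  using BlockReduce = cub::BlockReduce<TopKT, block_size>;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  TopKT thread_top_k;
  thread_top_k.Init();
  for (int i = threadIdx.x; i < num_items; i += block_size)
    thread_top_k.Insert(scores[i], i);

  // Merge all per-thread candidate lists into one block-wide top-k.
  TopKT block_top_k = BlockReduce(temp_storage)
                          .Reduce(thread_top_k, Generators::cuda::reduce_topk_op<T, max_k>);

  if (threadIdx.x == 0) {
    for (int i = 0; i < max_k; ++i) {
      out_values[i] = block_top_k.value[i];
      out_indices[i] = block_top_k.key[i];
    }
  }
}
```

A launch such as `BlockTopKSketch<float, 4, 256><<<1, 256>>>(scores, vocab_size, values, indices);` would then leave the four best scores and their indices in `values`/`indices`.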
diff --git a/src/cuda_sampling.cu b/src/cuda_sampling.cu
new file mode 100644
index 000000000..15636b928
--- /dev/null
+++ b/src/cuda_sampling.cu
@@ -0,0 +1,596 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include <cuda_runtime.h>
+#include <cub/cub.cuh>
+#include <curand_kernel.h>
+#include <cmath>
+#include <cassert>
+#include "beam_search_topk.h"
+#include "cuda_sampling.cuh"
+#include "smartptrs.h"
+#include <algorithm>
+#include <iostream>
+#include <limits>
+#include <numeric>
+
+namespace Generators {
+namespace cuda {
+
+constexpr int kMaxThreads = 1024;
+constexpr int kGPUWarpSize = 32;
+
+SamplingData::SamplingData(int batch_size, int vocab_size, cudaStream_t stream) {
+  indices_sorted = CudaMallocArray<int>(vocab_size * batch_size);
+  scores_sorted = CudaMallocArray<float>(vocab_size * batch_size);
+  scores_softmaxed = CudaMallocArray<float>(vocab_size * batch_size);
+  prefix_sums = CudaMallocArray<float>(vocab_size * batch_size);
+  thresholds = CudaMallocArray<float>(batch_size);
+  indices_in = CudaMallocArray<int>(vocab_size * batch_size);
+  offsets = CudaMallocArray<int>(batch_size + 1);
+  temp_storage_bytes = 0;
+  // Query pass: with a null temp-storage pointer, CUB only computes the
+  // scratch size needed for the segmented sort and writes it to
+  // temp_storage_bytes; no sorting happens here.
+  cub::DeviceSegmentedRadixSort::SortPairsDescending(nullptr, temp_storage_bytes, (float*)nullptr, (float*)nullptr,
+                                                     (int*)nullptr, (int*)nullptr, vocab_size * batch_size, batch_size,
+                                                     (int*)nullptr, (int*)nullptr, 0, sizeof(float) * 8, stream);
+  temp_buffer = CudaMallocArray<float>(temp_storage_bytes / sizeof(float));
+}
+
+// Softmax Kernels and Launchers
+
+template <typename T, typename AccumT>
+struct MaxFloat {
+  __device__ __forceinline__ AccumT operator()(AccumT max, T v) const {
+    return ::max(max, (AccumT)v);
+  }
+};
+
+template <typename T>
+struct Max {
+  __device__ __forceinline__ T operator()(T a, T b) const {
+    return a < b ? b : a;
+  }
+};
+
+template <typename T, typename AccumT>
+struct SumExpFloat {
+  __device__ __forceinline__ SumExpFloat(AccumT v)
+      : max_k(v) {}
+
+  __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const {
+    return sum + exp((AccumT)v - max_k);
+  }
+
+  const AccumT max_k;
+};
+
+template <typename T>
+struct Add {
+  __device__ __forceinline__ T operator()(T a, T b) const {
+    return a + b;
+  }
+};
+
+// aligned vector generates vectorized load/store on CUDA
+template <typename T, int vec_size>
+struct alignas(sizeof(T) * vec_size) aligned_vector {
+  T val[vec_size];
+};
+
+template