diff --git a/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj b/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj
index abe6491..3838150 100644
--- a/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj
+++ b/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj
@@ -92,13 +92,14 @@
-
+
+
-
+
diff --git a/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj.filters b/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj.filters
index fea601e..5034b1c 100644
--- a/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj.filters
+++ b/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj.filters
@@ -5,9 +5,10 @@
       Objects
-
+
       Objects
+
@@ -17,7 +18,7 @@
       Objects
-
+
       Objects
diff --git a/CudaMaxCutPlanted/CudaSparseMatrix.cu b/CudaMaxCutPlanted/CudaSparseMatrix.cu
index 5c1dd41..deefa06 100644
--- a/CudaMaxCutPlanted/CudaSparseMatrix.cu
+++ b/CudaMaxCutPlanted/CudaSparseMatrix.cu
@@ -5,30 +5,9 @@
 #include <iostream>
 #include <vector>
+#include <iomanip>
+#include <thrust/sort.h>
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/tuple.h>
-#include "SparseMatrixSumKernel.cuh"
-
-#define CHECK_CUDA(call) \
-{ \
-    cudaError_t err = call; \
-    if (err != cudaSuccess) { \
-        std::cerr << "CUDA error in file " << __FILE__ \
-                  << " at line " << __LINE__ << ": " \
-                  << cudaGetErrorString(err) << std::endl; \
-        exit(EXIT_FAILURE); \
-    } \
-}
-
-#define CHECK_CUSPARSE(call) \
-{ \
-    cusparseStatus_t err = call; \
-    if (err != CUSPARSE_STATUS_SUCCESS) { \
-        std::cerr << "CUSPARSE error in file " << __FILE__ \
-                  << " at line " << __LINE__ << ": " \
-                  << cusparseGetErrorString(err) << std::endl; \
-        exit(EXIT_FAILURE); \
-    } \
-}
 
 const char* cusparseGetErrorString(cusparseStatus_t status) {
     switch (status) {
@@ -62,7 +41,7 @@ CudaSparseMatrix::CudaSparseMatrix(int* I, int* J, float* V, int n, int nnz, Spa
 }
 
 CudaSparseMatrix::CudaSparseMatrix(const CudaSparseMatrix& other)
-    : n_(other.n_), nnz_(other.nnz_) {
+    : n_(other.n_), nnz_(other.nnz_), matDescr_(nullptr) {
     cusparseHandle_t& cusparseHandle_ = CusparseHandle::getInstance();
     CHECK_CUDA(cudaMalloc((void**)&d_csrOffsets_, (n_ + 1) * sizeof(int)));
     CHECK_CUDA(cudaMalloc((void**)&d_cols_, nnz_ * sizeof(int)));
@@ -79,21 +58,156 @@ CudaSparseMatrix::CudaSparseMatrix(const CudaSparseMatrix& other)
 }
 
 CudaSparseMatrix::~CudaSparseMatrix() {
-    CHECK_CUSPARSE(cusparseDestroySpMat(matDescr_));
-    CHECK_CUDA(cudaFree(d_csrOffsets_));
-    CHECK_CUDA(cudaFree(d_cols_));
-    CHECK_CUDA(cudaFree(d_vals_));
+    clear();
 }
 
 void CudaSparseMatrix::updateData(const int* rows, const int* cols, const float* vals, int new_nnz, SparseType sparseType, MemoryType memType)
 {
     nnz_ = new_nnz;
     CHECK_CUDA(cudaFree(d_cols_));
     CHECK_CUDA(cudaFree(d_vals_));
+    CHECK_CUSPARSE(cusparseDestroySpMat(matDescr_));
 
     CHECK_CUDA(cudaMalloc((void**)&d_cols_, nnz_ * sizeof(int)));
     CHECK_CUDA(cudaMalloc((void**)&d_vals_, nnz_ * sizeof(float)));
 
     allocateAndCopy(rows, cols, vals, sparseType, memType);
+
+    CHECK_CUSPARSE(cusparseCreateCsr(&matDescr_, n_, n_, nnz_,
+        d_csrOffsets_, d_cols_, d_vals_,
+        csr_row_ind_type_, csr_col_ind_type_,
+        index_base_, valueType_));
+}
+
+bool* CudaSparseMatrix::zero_elements_in_vector(const float* input_vect, int& zero_sum, int n) {
+    bool* zero_elements_vect;
+    int* d_zero_sum;
+
+    zero_sum = 0;
+
+    // Allocate memory on the device
+    CHECK_CUDA(cudaMalloc((void**)&zero_elements_vect, n * sizeof(bool)));
+    CHECK_CUDA(cudaMalloc((void**)&d_zero_sum, sizeof(int)));
+
+    // Initialize memory
+    CHECK_CUDA(cudaMemset(zero_elements_vect, 0, n * sizeof(bool)));
+    CHECK_CUDA(cudaMemset(d_zero_sum, 0, sizeof(int)));
+
+    int gridSize = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    zero_elements<<<gridSize, BLOCK_SIZE>>>(input_vect, zero_elements_vect, d_zero_sum, n);
+    CHECK_CUDA(cudaDeviceSynchronize());
+
+    // Copy the result back to host
+    CHECK_CUDA(cudaMemcpy(&zero_sum, d_zero_sum, sizeof(int), cudaMemcpyDeviceToHost));
+
+    // Free device memory
+    CHECK_CUDA(cudaFree(d_zero_sum));
+
+    return zero_elements_vect;
+}
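Aside, not part of the patch: the flag-and-count pass above can also be expressed with Thrust's count_if, which drops the hand-rolled atomic counter (the flag vector would still need its own pass). A minimal sketch; the helper name and the 1e-6 threshold mirror the kernel above:

    #include <thrust/count.h>
    #include <thrust/device_ptr.h>
    #include <cmath>

    struct is_near_zero {
        __host__ __device__ bool operator()(float v) const {
            return fabsf(v) < 1e-6f;
        }
    };

    // Counts entries of an n-element device array that are numerically zero.
    int count_zeros(const float* d_vec, int n) {
        thrust::device_ptr<const float> p(d_vec);
        return static_cast<int>(thrust::count_if(p, p + n, is_near_zero()));
    }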
+
+void CudaSparseMatrix::fill_diagonal(const float* diagonal_vect)
+{
+    int nnz_sum = 0;
+    int zero_sum = 0;
+    int diag_nnz = n_;
+    int resize_n = diag_nnz;
+    // TODO: Flip to a non-zero vector and copy only the non-zero elements to the new vector
+    bool* zeros_in_diag = zero_elements_in_vector(diagonal_vect, zero_sum, n_);
+    bool* non_zero = non_zero_diagonal(nnz_sum);
+
+    bool* h_zeros_in_diag = new bool[n_];
+    CHECK_CUDA(cudaMemcpy(h_zeros_in_diag, zeros_in_diag, n_ * sizeof(bool), cudaMemcpyDeviceToHost));
+
+    for (int i = 0; i < n_; i++)
+    {
+        std::cout << "zero_in_diag_" << i << ": " << h_zeros_in_diag[i] << std::endl;
+    }
+    delete[] h_zeros_in_diag;
+
+    diag_nnz -= zero_sum;
+    resize_n = diag_nnz - nnz_sum;
+
+    int* original_I, * new_I, * new_J;
+    float* new_V;
+
+    CHECK_CUDA(cudaMalloc((void**)&original_I, nnz_ * sizeof(int)));
+    CHECK_CUDA(cudaMalloc((void**)&new_I, (nnz_ + resize_n) * sizeof(int)));
+    CHECK_CUDA(cudaMalloc((void**)&new_J, (nnz_ + resize_n) * sizeof(int)));
+    CHECK_CUDA(cudaMalloc((void**)&new_V, (nnz_ + resize_n) * sizeof(float)));
+
+    csrTorows(d_csrOffsets_, original_I, n_, nnz_, SparseType::CSR);
+
+    CHECK_CUDA(cudaMemcpy(new_I, original_I, nnz_ * sizeof(int), cudaMemcpyDeviceToDevice));
+    CHECK_CUDA(cudaMemcpy(new_J, d_cols_, nnz_ * sizeof(int), cudaMemcpyDeviceToDevice));
+    CHECK_CUDA(cudaMemcpy(new_V, d_vals_, nnz_ * sizeof(float), cudaMemcpyDeviceToDevice));
+
+    int gridSize = ((nnz_ + n_) + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    set_diagonal<<<gridSize, BLOCK_SIZE>>>(new_I, new_J, new_V, non_zero, diagonal_vect, nnz_, resize_n);
+    CHECK_CUDA(cudaDeviceSynchronize());
+
+    thrust::device_ptr<int> dev_I(new_I);
+    thrust::device_ptr<int> dev_J(new_J);
+    thrust::device_ptr<float> dev_V(new_V);
+
+    // First, sort by the secondary key (J) using a stable sort
+    thrust::stable_sort_by_key(dev_J, dev_J + (nnz_ + resize_n), thrust::make_zip_iterator(thrust::make_tuple(dev_I, dev_V)));
+
+    // Then sort by the primary key (I); the stable sort maintains the order of the secondary key
+    thrust::stable_sort_by_key(dev_I, dev_I + (nnz_ + resize_n), thrust::make_zip_iterator(thrust::make_tuple(dev_J, dev_V)));
+
+    float* h_new_V = new float[nnz_ + resize_n];
+    CHECK_CUDA(cudaMemcpy(h_new_V, new_V, (nnz_ + resize_n) * sizeof(float), cudaMemcpyDeviceToHost));
+
+    for (int i = 0; i < nnz_ + resize_n; i++)
+    {
+        std::cout << "Copied V_" << i << ": " << h_new_V[i] << std::endl;
+    }
+    delete[] h_new_V;
+
+    updateData(new_I, new_J, new_V, nnz_ + resize_n, SparseType::COO, MemoryType::Device);
+
+    CHECK_CUDA(cudaFree(zeros_in_diag));
+    CHECK_CUDA(cudaFree(non_zero));
+    CHECK_CUDA(cudaFree(original_I));
+    CHECK_CUDA(cudaFree(new_I));
+    CHECK_CUDA(cudaFree(new_J));
+    CHECK_CUDA(cudaFree(new_V));
+
+    std::cout << "Total non-zero elements on the diagonal: " << nnz_sum << std::endl;
+}
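The two stable sorts above implement a lexicographic (row-major) reorder of the COO triplets: sorting by the secondary key J first, then stably by the primary key I, leaves entries with equal I ordered by J. A self-contained toy example of the pattern (illustrative only, not part of the patch):

    #include <thrust/device_vector.h>
    #include <thrust/sort.h>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/tuple.h>
    #include <cstdio>

    int main() {
        // Unsorted COO triplets of a 3x3 matrix.
        int   h_I[] = { 2, 0, 1, 0 };
        int   h_J[] = { 0, 2, 1, 0 };
        float h_V[] = { 5.f, 3.f, 4.f, 1.f };
        thrust::device_vector<int>   I(h_I, h_I + 4), J(h_J, h_J + 4);
        thrust::device_vector<float> V(h_V, h_V + 4);

        // Pass 1: stable sort by the secondary key (column).
        thrust::stable_sort_by_key(J.begin(), J.end(),
            thrust::make_zip_iterator(thrust::make_tuple(I.begin(), V.begin())));
        // Pass 2: stable sort by the primary key (row); stability keeps
        // equal rows ordered by column, i.e. row-major COO order.
        thrust::stable_sort_by_key(I.begin(), I.end(),
            thrust::make_zip_iterator(thrust::make_tuple(J.begin(), V.begin())));

        for (int k = 0; k < 4; ++k)  // prints (0,0) (0,2) (1,1) (2,0)
            printf("(%d, %d) = %.1f\n", (int)I[k], (int)J[k], (float)V[k]);
        return 0;
    }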
+
+bool* CudaSparseMatrix::non_zero_diagonal(int& nnz_diag_sum)
+{
+    cusparseHandle_t& handle = CusparseHandle::getInstance();
+    bool* nnz_diag;
+    int* I;
+    int* d_nnz_diag_sum;
+    nnz_diag_sum = 0;
+
+    // Allocate memory on the device
+    CHECK_CUDA(cudaMalloc((void**)&d_nnz_diag_sum, sizeof(int)));
+
+    // Initialize memory
+    CHECK_CUDA(cudaMemset(d_nnz_diag_sum, 0, sizeof(int)));
+
+    CHECK_CUDA(cudaMalloc((void**)&I, nnz_ * sizeof(int)));
+    CHECK_CUSPARSE(cusparseXcsr2coo(handle,
+        d_csrOffsets_,
+        nnz_,
+        n_,
+        I,
+        CUSPARSE_INDEX_BASE_ZERO));
+
+    CHECK_CUDA(cudaMalloc((void**)&nnz_diag, n_ * sizeof(bool)));
+    CHECK_CUDA(cudaMemset(nnz_diag, 0, n_ * sizeof(bool)));
+
+    int gridSize = (nnz_ + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    non_zero_elements<<<gridSize, BLOCK_SIZE>>>(I, d_cols_, nnz_diag, d_nnz_diag_sum, nnz_);
+    CHECK_CUDA(cudaDeviceSynchronize());
+
+    CHECK_CUDA(cudaMemcpy(&nnz_diag_sum, d_nnz_diag_sum, sizeof(int), cudaMemcpyDeviceToHost));
+
+    // Free device memory
+    CHECK_CUDA(cudaFree(d_nnz_diag_sum));
+    CHECK_CUDA(cudaFree(I));
+
+    return nnz_diag;
+}
 
 CudaDenseVector CudaSparseMatrix::dot(const float* d_vec)
@@ -122,6 +236,63 @@ CudaDenseVector CudaSparseMatrix::dot(const float* d_vec)
     return result_vector;
 }
 
+void CudaSparseMatrix::multiply(float value)
+{
+    cusparseHandle_t& handle = CusparseHandle::getInstance();
+    // Set scaling factors
+    const float beta = 0.0f;
+    size_t bufferSize = 0;
+
+    cusparseMatDescr_t input_desc;
+    CHECK_CUSPARSE(cusparseCreateMatDescr(&input_desc));
+
+    // Create matrix descriptor for the result matrix C
+    cusparseMatDescr_t result_desc;
+    CHECK_CUSPARSE(cusparseCreateMatDescr(&result_desc));
+
+    // Get buffer size for the operation (B aliases A, since beta == 0)
+    CHECK_CUSPARSE(cusparseScsrgeam2_bufferSizeExt(handle,
+        n_, n_,
+        &value, input_desc, nnz_,
+        d_vals_,
+        d_csrOffsets_,
+        d_cols_,
+        &beta, input_desc, nnz_,
+        d_vals_,
+        d_csrOffsets_,
+        d_cols_,
+        result_desc,
+        d_vals_,
+        d_csrOffsets_,
+        d_cols_,
+        &bufferSize));
+
+    void* dBuffer;
+    CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize));
+
+    // Perform the scaling operation: A := value * A + 0 * A, written in place
+    CHECK_CUSPARSE(cusparseScsrgeam2(handle,
+        n_, n_,
+        &value, input_desc, nnz_,
+        d_vals_,
+        d_csrOffsets_,
+        d_cols_,
+        &beta, input_desc, nnz_,
+        d_vals_,
+        d_csrOffsets_,
+        d_cols_,
+        result_desc,
+        d_vals_,
+        d_csrOffsets_,
+        d_cols_,
+        dBuffer));
+
+    // Clean up
+    CHECK_CUDA(cudaFree(dBuffer));
+    CHECK_CUSPARSE(cusparseDestroyMatDescr(input_desc));
+    CHECK_CUSPARSE(cusparseDestroyMatDescr(result_desc));
+}
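Because beta is 0 and B aliases A, the csrgeam2 call above amounts to an in-place rescale of the values array. A lighter alternative (sketch, not part of the patch) transforms d_vals_ directly and needs neither descriptors nor a workspace buffer; the trade-off is that csrgeam2 generalizes to true A + B sums later:

    #include <thrust/device_ptr.h>
    #include <thrust/transform.h>

    struct scale_by {
        float s;
        __host__ __device__ float operator()(float v) const { return s * v; }
    };

    // In-place scale of the nnz CSR values; the sparsity pattern is untouched.
    void scale_csr_values(float* d_vals, int nnz, float value) {
        thrust::device_ptr<float> v(d_vals);
        thrust::transform(v, v + nnz, v, scale_by{ value });
    }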
 void CudaSparseMatrix::allocateAndCopy(const int* rows, const int* cols, const float* vals, SparseType sparseType, MemoryType memType)
 {
     cudaMemcpyKind copyType;
@@ -167,6 +338,19 @@ void CudaSparseMatrix::rowsToCsr(const int* d_rows, int* d_csr_offset, int n, in
 }
 
+void CudaSparseMatrix::csrTorows(const int* d_csr_offset, int* d_rows, int n, int nnz, SparseType sparseType)
+{
+    if (sparseType == SparseType::CSR) {
+        cusparseHandle_t& handle = CusparseHandle::getInstance();
+        CHECK_CUSPARSE(cusparseXcsr2coo(handle,
+            d_csr_offset,
+            nnz,
+            n,
+            d_rows,
+            CUSPARSE_INDEX_BASE_ZERO));
+    }
+}
+
 float* CudaSparseMatrix::sumRows()
 {
     cusparseHandle_t& handle = CusparseHandle::getInstance();
@@ -179,7 +363,8 @@ float* CudaSparseMatrix::sumRows()
     CHECK_CUDA(cudaMalloc((void**)&cscRowInd, nnz_ * sizeof(int)));
     CHECK_CUDA(cudaMalloc((void**)&cscVal, nnz_ * sizeof(float)));
 
-    CHECK_CUDA(cudaMemset((void*)diagonal, 0, nnz_ * sizeof(float)));
+    CHECK_CUDA(cudaMalloc((void**)&diagonal, n_ * sizeof(float)));
+    CHECK_CUDA(cudaMemset((void*)diagonal, 0, n_ * sizeof(float)));
 
     cusparseCsr2cscEx2_bufferSize(handle,
         n_, n_, nnz_,
@@ -201,10 +386,9 @@ float* CudaSparseMatrix::sumRows()
     // Clean up
     cudaFree(dBuffer);
 
-    int blockSize = 512;
-    int gridSize = (nnz_ + blockSize - 1) / blockSize;
+    int gridSize = (nnz_ + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
-    sum_axis<<<gridSize, blockSize>>>(nnz_, cscRowInd, cscVal, diagonal);
+    sum_axis<<<gridSize, BLOCK_SIZE>>>(nnz_, cscRowInd, cscVal, diagonal);
 
     CHECK_CUDA(cudaFree(cscColPtr));
     CHECK_CUDA(cudaFree(cscRowInd));
@@ -218,7 +402,8 @@ float* CudaSparseMatrix::sumCols()
     cusparseHandle_t& handle = CusparseHandle::getInstance();
     float* diagonal;
 
-    CHECK_CUDA(cudaMemset((void*)diagonal, 0, nnz_ * sizeof(float)));
+    CHECK_CUDA(cudaMalloc((void**)&diagonal, n_ * sizeof(float)));
+    CHECK_CUDA(cudaMemset((void*)diagonal, 0, n_ * sizeof(float)));
 
     int blockSize = 512;
     int gridSize = (nnz_ + blockSize - 1) / blockSize;
@@ -230,6 +415,14 @@ float* CudaSparseMatrix::sumCols()
 
 float* CudaSparseMatrix::sum(int axis)
 {
+    if (axis == 0) {
+        return sumRows();
+    }
+
+    if (axis == 1) {
+        return sumCols();
+    }
+
     return nullptr;
 }
 
@@ -274,7 +467,7 @@ void CudaSparseMatrix::display()
     std::cout << "Dense matrix:" << std::endl;
     for (int i = 0; i < n_; ++i) {
         for (int j = 0; j < n_; ++j) {
-            std::cout << h_denseMat[i * n_ + j] << " ";
+            std::cout << std::setw(7) << h_denseMat[i * n_ + j] << " ";
         }
         std::cout << std::endl;
     }
@@ -295,6 +488,33 @@ int CudaSparseMatrix::size() const
     return n_;
 }
 
+void CudaSparseMatrix::clear()
+{
+    if (d_csrOffsets_) {
+        CHECK_CUDA(cudaFree(d_csrOffsets_));
+        std::cout << "d_csrOffsets_ cleared" << std::endl;
+        d_csrOffsets_ = nullptr;
+    }
+    if (d_cols_) {
+        CHECK_CUDA(cudaFree(d_cols_));
+        std::cout << "d_cols_ cleared" << std::endl;
+        d_cols_ = nullptr;
+    }
+    if (d_vals_) {
+        CHECK_CUDA(cudaFree(d_vals_));
+        std::cout << "d_vals_ cleared" << std::endl;
+        d_vals_ = nullptr;
+    }
+    if (matDescr_) {
+        CHECK_CUSPARSE(cusparseDestroySpMat(matDescr_));
+        std::cout << "matDescr_ cleared" << std::endl;
+        matDescr_ = nullptr;
+    }
+
+    nnz_ = 0;
+    n_ = 0;
+}
+
 CudaDenseVector::CudaDenseVector(int size, const float* V, MemoryType memType): size_(size)
 {
     cudaMemcpyKind copyType = memType == MemoryType::Host ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
diff --git a/CudaMaxCutPlanted/CudaSparseMatrix.hpp b/CudaMaxCutPlanted/CudaSparseMatrix.hpp
index de410b1..047c64e 100644
--- a/CudaMaxCutPlanted/CudaSparseMatrix.hpp
+++ b/CudaMaxCutPlanted/CudaSparseMatrix.hpp
@@ -4,6 +4,30 @@
 #include <cuda_runtime.h>
 #include <cusparse.h>
 #include <iostream>
+#include "Kernels.cuh"
+
+
+#define CHECK_CUDA(call) \
+{ \
+    cudaError_t err = call; \
+    if (err != cudaSuccess) { \
+        std::cerr << "CUDA error in file " << __FILE__ \
+                  << " at line " << __LINE__ << ": " \
+                  << cudaGetErrorString(err) << std::endl; \
+        exit(EXIT_FAILURE); \
+    } \
+}
+
+const char* cusparseGetErrorString(cusparseStatus_t status);
+
+#define CHECK_CUSPARSE(call) \
+{ \
+    cusparseStatus_t err = call; \
+    if (err != CUSPARSE_STATUS_SUCCESS) { \
+        std::cerr << "CUSPARSE error in file " << __FILE__ \
+                  << " at line " << __LINE__ << ": " \
+                  << cusparseGetErrorString(err) << std::endl; \
+        exit(EXIT_FAILURE); \
+    } \
+}
 
 enum class MemoryType {
     Host,
@@ -59,11 +83,15 @@ class CudaSparseMatrix {
     ~CudaSparseMatrix();
 
     void updateData(const int* rows, const int* cols, const float* vals, int new_nnz, SparseType sparseType, MemoryType memType);
+    void fill_diagonal(const float* diagonal_vect);
+    bool* non_zero_diagonal(int& nnz_diag_sum);
     CudaDenseVector dot(const float* d_vec);
+    void multiply(float value);
     float* sum(int axis);
     void display();
     int getNnz() const;
     int size() const;
+    void clear();
 
     // Other useful methods can be added here
 
@@ -80,6 +108,8 @@ class CudaSparseMatrix {
     cudaDataType valueType_ = CUDA_R_32F;
     void allocateAndCopy(const int* rows, const int* cols, const float* vals, SparseType sparseType, MemoryType memType);
     void rowsToCsr(const int* d_rows, int* d_csr_offset, int n, int nnz, SparseType sparseType);
+    void csrTorows(const int* d_csr_offset, int* d_rows, int n, int nnz, SparseType sparseType);
+    bool* zero_elements_in_vector(const float* input_vect, int& zero_sum, int n);
     float* sumRows();
     float* sumCols();
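A hypothetical end-to-end driver for the API declared above (the driver itself and the caller-frees convention for sum()'s device pointer are assumptions, not code from this change-set); it mirrors the graph-to-QUBO flow in kernel.cu: scale, row-sum, then splice the sums onto the diagonal:

    #include "CudaSparseMatrix.hpp"

    int main() {
        // 3x3 matrix with one symmetric off-diagonal pair, given in COO form.
        int   I[] = { 0, 1 };
        int   J[] = { 1, 0 };
        float V[] = { 2.0f, 2.0f };

        CudaSparseMatrix A(I, J, V, /*n=*/3, /*nnz=*/2,
                           SparseType::COO, MemoryType::Host);

        A.multiply(-1.0f);            // in-place scale of the stored values
        float* d_rowSums = A.sum(0);  // device array of n row sums
        A.fill_diagonal(d_rowSums);   // write the sums onto the diagonal
        A.display();

        CHECK_CUDA(cudaFree(d_rowSums));
        return 0;
    }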
+#include "Kernels.cuh" + +#define NUM_BANKS 16 +#define LOG_NUM_BANKS 4 +#define CONFLICT_FREE_OFFSET(n) ((n) >> LOG_NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS)) + +/// +/// This function sums either rows for matrix in csc format or columns for matrix in csr format +/// +/// Number on non zero elements +/// Array with nnz elements that is NOT in compressed format +/// Array with values +/// Output array that will contain output of summed rows/cols +/// +__global__ void sum_axis(int nnz, const int* d_non_offset_axis_ind, const float* d_vals, float* d_axis_sum) { + int element_ind = blockIdx.x * blockDim.x + threadIdx.x; + if (element_ind < nnz) { + int idx = d_non_offset_axis_ind[element_ind]; + atomicAdd(&d_axis_sum[idx], d_vals[idx]); + } +} + +__global__ void create_random_matrix(int n, int nnz, int split, const int* p, int* d_rows, int* d_cols, float* d_vals, curandState* states) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < nnz) { + curand_init(1234, idx, 0, &states[idx]); + int i = static_cast(floorf(idx / (n - split))); + int j = static_cast((idx % (n - split)) + split); + d_rows[idx] = p[i]; + d_cols[idx] = p[j]; + d_vals[idx] = curand_uniform(&states[idx]) * 0.99f + 0.01f; + } +} + +__global__ void set_true_elements(int split, const int* p, char* x) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < split) { + x[p[idx]] = 1; + } +} + +__global__ void non_zero_elements(const int* I, const int* J, bool* non_zero_elements, int *nnz_sum, int nnz) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < nnz) { + if (I[idx] == J[idx]) { + atomicAdd(nnz_sum, 1); + non_zero_elements[idx] == true; + } + + } +} + +__global__ void zero_elements(const float *input_vect, bool* zero_elements_vect, int* zero_sum, int n) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + if (fabsf(input_vect[idx]) < 1e-6) { + atomicAdd(zero_sum, 1); + zero_elements_vect[idx] = true; + } + + } +} + + +__global__ void set_diagonal(int* I, int* J, float* V, bool* non_zero_elements, const float* diagonal, int initial_n, int resize_n) { + int offset = initial_n; + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < initial_n) { + if (I[idx] == J[idx] && non_zero_elements[idx]) { + + V[idx] = diagonal[idx]; + } + + } + else if (idx >= initial_n && idx < (initial_n + resize_n)) { + int index = idx - offset; + V[idx] = diagonal[index]; + I[idx] = index; + J[idx] = index; + } +} + +__global__ void prescan(float* g_odata, float* g_idata, int n) { + extern __shared__ float temp[]; + + int thid = threadIdx.x; + int offset = 1; + + // Load input into shared memory with padding to avoid bank conflicts + int ai = thid; + int bi = thid + (n / 2); + int bankOffsetA = CONFLICT_FREE_OFFSET(ai); + int bankOffsetB = CONFLICT_FREE_OFFSET(bi); + + temp[ai + bankOffsetA] = g_idata[ai]; + temp[bi + bankOffsetB] = g_idata[bi]; + + // Build sum in place up the tree + for (int d = n >> 1; d > 0; d >>= 1) { + __syncthreads(); + if (thid < d) { + int ai = offset * (2 * thid + 1) - 1; + int bi = offset * (2 * thid + 2) - 1; + ai += CONFLICT_FREE_OFFSET(ai); + bi += CONFLICT_FREE_OFFSET(bi); + temp[bi] += temp[ai]; + } + offset *= 2; + } + + // Clear the last element + if (thid == 0) { + temp[n - 1 + CONFLICT_FREE_OFFSET(n - 1)] = 0; + } + + // Traverse down tree & build scan + for (int d = 1; d < n; d *= 2) { + offset >>= 1; + __syncthreads(); + if (thid < d) { + int ai = offset * (2 * thid + 1) - 1; + int bi = offset * (2 * thid + 2) - 1; + ai += CONFLICT_FREE_OFFSET(ai); + bi 
diff --git a/CudaMaxCutPlanted/Kernels.cuh b/CudaMaxCutPlanted/Kernels.cuh
new file mode 100644
index 0000000..856e5f4
--- /dev/null
+++ b/CudaMaxCutPlanted/Kernels.cuh
@@ -0,0 +1,25 @@
+#ifndef KERNELS_CUH
+#define KERNELS_CUH
+
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+#include <curand_kernel.h>
+#ifdef __INTELLISENSE__
+#include "intellisense_cuda_intrinsics.h"
+#endif
+#define BLOCK_SIZE 512
+
+
+// Kernel to sum all elements along an axis
+__global__ void sum_axis(int nnz, const int* d_non_offset_axis_ind, const float* d_vals, float* d_axis_sum);
+// Fills a sparse graph with random values
+__global__ void create_random_matrix(int n, int nnz, int split, const int* p, int* d_rows, int* d_cols, float* d_vals, curandState* states);
+// Sets true/1 in the zeroed vector x at the positions provided by p
+__global__ void set_true_elements(int split, const int* p, char* x);
+// Counts all non-zero elements on the diagonal and fills a bool vector with true at each position where the diagonal is non-zero
+__global__ void non_zero_elements(const int* I, const int* J, bool* non_zero_elements, int* nnz_sum, int nnz);
+// Sets values on the diagonal, taking into account that the diagonal can already have non-zero values
+__global__ void set_diagonal(int* I, int* J, float* V, bool* non_zero_elements, const float* diagonal, int initial_n, int resize_n);
+// Counts and sets a bool vector for the zero elements of a float device vector
+__global__ void zero_elements(const float* input_vect, bool* zero_elements_vect, int* zero_sum, int n);
+// Work-efficient single-block exclusive prefix sum (Blelloch scan)
+__global__ void prescan(float* g_odata, float* g_idata, int n);
+#endif // KERNELS_CUH
\ No newline at end of file
diff --git a/CudaMaxCutPlanted/SparseMatrixSumKernel.cu b/CudaMaxCutPlanted/SparseMatrixSumKernel.cu
deleted file mode 100644
index 22fa399..0000000
--- a/CudaMaxCutPlanted/SparseMatrixSumKernel.cu
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "SparseMatrixSumKernel.cuh"
-
-
-/// <summary>
-/// This function sums either rows for matrix in csc format or columns for matrix in csr format
-/// </summary>
-/// <param name="nnz">Number on non zero elements</param>
-/// <param name="d_non_offset_axis_ind">Array with nnz elements that is NOT in compressed format</param>
-/// <param name="d_vals">Array with values</param>
-/// <param name="d_axis_sum">Output array that will contain output of summed rows/cols</param>
-/// <returns></returns>
-__global__ void sum_axis(int nnz, const int* d_non_offset_axis_ind, const float* d_vals, float* d_axis_sum) {
-    int element_ind = blockIdx.x * blockDim.x + threadIdx.x;
-    if (element_ind < nnz) {
-        int idx = d_non_offset_axis_ind[element_ind];
-        atomicAdd(&d_axis_sum[idx], d_vals[idx]);
-    }
-}
\ No newline at end of file
diff --git a/CudaMaxCutPlanted/SparseMatrixSumKernel.cuh b/CudaMaxCutPlanted/SparseMatrixSumKernel.cuh
deleted file mode 100644
index 42d4502..0000000
--- a/CudaMaxCutPlanted/SparseMatrixSumKernel.cuh
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef SPARSEMATRIXSUMKERNEL_CUH
-#define SPARSEMATRIXSUMKERNEL_CUH
-
-#include <cuda_runtime.h>
-#ifdef __INTELLISENSE__
-#include "intellisense_cuda_intrinsics.h"
-#endif
-
-
-// Kernel to mark zero elements
-__global__ void sum_axis(int nnz, const int* d_non_offset_axis_ind, const float* d_vals, float* d_axis_sum);
-
-#endif // SPARSEMATRIXSUMKERNEL_CUH
\ No newline at end of file
diff --git a/CudaMaxCutPlanted/kernel.cu b/CudaMaxCutPlanted/kernel.cu
index 3e93ae4..6128890 100644
--- a/CudaMaxCutPlanted/kernel.cu
+++ b/CudaMaxCutPlanted/kernel.cu
@@ -1,429 +1,583 @@
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "indicators.hpp" -#ifdef __INTELLISENSE__ -#include "intellisense_cuda_intrinsics.h" -#endif - - -#define CHECK_CUSPARSE(call) \ -{ \ - cusparseStatus_t err = call; \ - if (err != CUSPARSE_STATUS_SUCCESS) { \ - std::cerr << "CUSPARSE error in file " << __FILE__ \ - << " at line " << __LINE__ << ": " \ - << cusparseGetErrorString(err) << std::endl; \ - exit(EXIT_FAILURE); \ - } \ -} - -#define CHECK_CUDA(call) \ -{ \ - cudaError_t err = call; \ - if (err != cudaSuccess) { \ - std::cerr << "CUDA error in file " << __FILE__ \ - << " at line " << __LINE__ << ": " \ - << cudaGetErrorString(err) << std::endl; \ - exit(EXIT_FAILURE); \ - } \ -} - -#define CHECK_CUBLAS(call) \ -{ \ - cublasStatus_t err = call; \ - if (err != CUBLAS_STATUS_SUCCESS) { \ - std::cerr << "CUBLAS error in file " << __FILE__ \ - << " at line " << __LINE__ << ": " \ - << cublasGetErrorString(err) << std::endl; \ - exit(EXIT_FAILURE); \ - } \ -} - - -struct csr_data { - int* rowPointer; - int* cols; - float* vals; -}; - -const char* cusparseGetErrorString(cusparseStatus_t status) { - switch (status) { - case CUSPARSE_STATUS_SUCCESS: return "CUSPARSE_STATUS_SUCCESS"; - case CUSPARSE_STATUS_NOT_INITIALIZED: return "CUSPARSE_STATUS_NOT_INITIALIZED"; - case CUSPARSE_STATUS_ALLOC_FAILED: return "CUSPARSE_STATUS_ALLOC_FAILED"; - case CUSPARSE_STATUS_INVALID_VALUE: return "CUSPARSE_STATUS_INVALID_VALUE"; - case CUSPARSE_STATUS_ARCH_MISMATCH: return "CUSPARSE_STATUS_ARCH_MISMATCH"; - case CUSPARSE_STATUS_MAPPING_ERROR: return "CUSPARSE_STATUS_MAPPING_ERROR"; - case CUSPARSE_STATUS_EXECUTION_FAILED: return "CUSPARSE_STATUS_EXECUTION_FAILED"; - case CUSPARSE_STATUS_INTERNAL_ERROR: return "CUSPARSE_STATUS_INTERNAL_ERROR"; - case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - case CUSPARSE_STATUS_ZERO_PIVOT: return "CUSPARSE_STATUS_ZERO_PIVOT"; - default: return "UNKNOWN CUSPARSE STATUS"; - } -} - -const char* cublasGetErrorString(cublasStatus_t status) { - switch (status) { - case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; - case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; - case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR"; - default: return "UNKNOWN CUBLAS STATUS"; - } -} - -__global__ void cols_sum(int n, const int* d_csrOffsets, const float* d_vals, float* d_rowSums) { - int row = blockIdx.x * blockDim.x + threadIdx.x; - if (row < n) { - int row_start = d_csrOffsets[row]; - int row_end = d_csrOffsets[row + 1]; - for (int j = row_start; j < row_end; j++) { - atomicAdd(&d_rowSums[row], d_vals[j]); - } - } -} - -void convert_sparse_to_dense_and_display(cusparseHandle_t handle, const cusparseSpMatDescr_t& matDescr, int n) { - // Allocate memory for the dense matrix on the device - float* d_denseMat; - cudaMalloc((void**)&d_denseMat, n * n * sizeof(float)); - - // Create a dense matrix 
descriptor - cusparseDnMatDescr_t denseDescr; - CHECK_CUSPARSE(cusparseCreateDnMat(&denseDescr, - n, // number of rows - n, // number of columns - n, // leading dimension - d_denseMat, // pointer to dense matrix data - CUDA_R_32F, // data type - CUSPARSE_ORDER_ROW)); // row-major order - - // Convert sparse matrix to dense matrix - void* dBuffer = NULL; - size_t bufferSize = 0; - CHECK_CUSPARSE(cusparseSparseToDense_bufferSize(handle, - matDescr, - denseDescr, - CUSPARSE_SPARSETODENSE_ALG_DEFAULT, - &bufferSize)); - - CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize)); - - CHECK_CUSPARSE(cusparseSparseToDense(handle, - matDescr, - denseDescr, - CUSPARSE_SPARSETODENSE_ALG_DEFAULT, - dBuffer)); - - // Copy the dense matrix from device to host - std::vector h_denseMat(n * n); - CHECK_CUDA(cudaMemcpy(h_denseMat.data(), d_denseMat, n * n * sizeof(float), cudaMemcpyDeviceToHost)); - - std::cout << std::fixed << std::setprecision(4); // Set precision to 2 decimal places - std::cout << "Dense matrix:" << std::endl; - for (int i = 0; i < n; ++i) { - for (int j = 0; j < n; ++j) { - std::cout << h_denseMat[i * n + j] << " "; - } - std::cout << std::endl; - } - - // Clean up - CHECK_CUDA(cudaFree(d_denseMat)); - CHECK_CUDA(cudaFree(dBuffer)); - CHECK_CUSPARSE(cusparseDestroyDnMat(denseDescr)); -} - - -void fill_diagonal(cusparseHandle_t handle, cusparseSpMatDescr_t& input, thrust::device_vector diag, csr_data& extended_pointers) { - int64_t n, nnz; - int* d_csrOffsets; - int* d_cols; - float* d_vals; - - cusparseIndexType_t csrRowOffsetsType; - cusparseIndexType_t csrColIndType; - cusparseIndexBase_t idxBase; - cudaDataType valueTyp; - - CHECK_CUSPARSE(cusparseCsrGet(input, &n, &n, &nnz, (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp)); - - thrust::device_vector d_rows_tdv(nnz); - thrust::device_vector d_cols_tdv(d_cols, d_cols+nnz); - thrust::device_vector d_vals_tdv(d_vals, d_vals+nnz); - - CHECK_CUSPARSE(cusparseXcsr2coo(handle, - d_csrOffsets, - nnz, - n, - thrust::raw_pointer_cast(d_rows_tdv.data()), - CUSPARSE_INDEX_BASE_ZERO)); - - d_rows_tdv.resize(nnz + n); - d_cols_tdv.resize(nnz + n); - d_vals_tdv.resize(nnz + n); - - int nnz_n = nnz + n; - - thrust::device_vector d_vec(n); - - // Fill the vector with values from 0 to n-1 - thrust::sequence(d_vec.begin(), d_vec.end()); - - thrust::copy(d_vec.begin(), d_vec.end(), d_rows_tdv.begin() + nnz); - thrust::copy(d_vec.begin(), d_vec.end(), d_cols_tdv.begin() + nnz); - thrust::copy(diag.begin(), diag.end(), d_vals_tdv.begin() + nnz); - - /*thrust::copy(d_rows_tdv.begin(), d_rows_tdv.end(), ); - thrust::copy(d_cols_tdv.begin(), d_cols_tdv.end(), extended_pointers.cols); - thrust::copy(d_vals_tdv.begin(), d_vals_tdv.end(), extended_pointers.vals);*/ - - - CHECK_CUSPARSE(cusparseXcsr2coo(handle, - d_csrOffsets, - nnz, - n, - thrust::raw_pointer_cast(d_rows_tdv.data()), - CUSPARSE_INDEX_BASE_ZERO)); - thrust::device_vector d_csrOffsets_o(n + 1); - - thrust::sort_by_key(thrust::make_zip_iterator(thrust::make_tuple(d_rows_tdv.begin(), d_cols_tdv.begin())), - thrust::make_zip_iterator(thrust::make_tuple(d_rows_tdv.end(), d_cols_tdv.end())), - d_vals_tdv.begin()); - - CHECK_CUSPARSE(cusparseXcoo2csr(handle, - thrust::raw_pointer_cast(d_rows_tdv.data()), - nnz + n, - n, - thrust::raw_pointer_cast(d_csrOffsets_o.data()), - CUSPARSE_INDEX_BASE_ZERO)); - - CHECK_CUDA(cudaMemcpy(extended_pointers.rowPointer, thrust::raw_pointer_cast(d_csrOffsets_o.data()), (n + 1) * sizeof(int), 
cudaMemcpyDeviceToDevice)); - CHECK_CUDA(cudaMemcpy(extended_pointers.cols, thrust::raw_pointer_cast(d_cols_tdv.data()), nnz_n * sizeof(int), cudaMemcpyDeviceToDevice)); - CHECK_CUDA(cudaMemcpy(extended_pointers.vals, thrust::raw_pointer_cast(d_vals_tdv.data()), nnz_n * sizeof(float), cudaMemcpyDeviceToDevice)); - - CHECK_CUSPARSE(cusparseCsrSetPointers(input, - extended_pointers.rowPointer, - extended_pointers.cols, - extended_pointers.vals)); - -} - - -void scale_csr_matrix(cusparseHandle_t handle, - float alpha, - cusparseSpMatDescr_t& input, - cusparseSpMatDescr_t& result) { - // Extract the dimensions and the number of non-zero elements - int64_t n, nnz; - int* d_csrOffsets; - int* d_cols; - float* d_vals; - - int64_t n_r, nnz_r; - int* d_csrOffsets_r; - int* d_cols_r; - float* d_vals_r; - - cusparseIndexType_t csrRowOffsetsType; - cusparseIndexType_t csrColIndType; - cusparseIndexBase_t idxBase; - cudaDataType valueTyp; - - cusparseCsrGet(input, &n, &n, &nnz, (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp); - cusparseCsrGet(result, &n_r, &n_r, &nnz_r, (void**)&d_csrOffsets_r, (void**)&d_cols_r, (void**)&d_vals_r, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp); - - // Set scaling factors - const float beta = 0.0f; - size_t bufferSize = 0; - - cusparseMatDescr_t input_desc; - cusparseCreateMatDescr(&input_desc); - - // Create matrix descriptor for the result matrix C - cusparseMatDescr_t result_desc; - cusparseCreateMatDescr(&result_desc); - - // Get buffer size for the operation - cusparseScsrgeam2_bufferSizeExt(handle, - n, n, - &alpha, input_desc, nnz, - d_vals, - d_csrOffsets, - d_cols, - &beta, input_desc, nnz, - d_vals, - d_csrOffsets, - d_cols, - result_desc, - d_vals_r, - d_csrOffsets_r, - d_cols_r, - &bufferSize); - - void* dBuffer; - cudaMalloc(&dBuffer, bufferSize); - - // Perform the scaling operation - cusparseScsrgeam2(handle, - n, n, - &alpha, input_desc, nnz, - d_vals, - d_csrOffsets, - d_cols, - &beta, input_desc, nnz, - d_vals, - d_csrOffsets, - d_cols, - result_desc, - d_vals_r, - d_csrOffsets_r, - d_cols_r, - dBuffer); - // Clean up - cudaFree(dBuffer); - cusparseDestroyMatDescr(input_desc); - cusparseDestroyMatDescr(result_desc); -} - -thrust::device_vector sum_rows_csr_matrix(cusparseHandle_t handle, cusparseSpMatDescr_t input) { - // Extract CSR matrix information - int64_t rows, cols, nnz; - int* d_csrOffsets, * d_cols; - float* d_vals; - - cusparseIndexType_t csrRowOffsetsType; - cusparseIndexType_t csrColIndType; - cusparseIndexBase_t idxBase; - cudaDataType valueTyp; - - cusparseCsrGet(input, &rows, &cols, &nnz, - (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, - &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp); - - // Allocate memory for the row sums - thrust::device_vector d_rowSums(rows); - - // Launch kernel to sum elements of each row using atomicAdd - int blockSize = 256; - int gridSize = (rows + blockSize - 1) / blockSize; - cols_sum << > > (rows, d_csrOffsets, d_vals, thrust::raw_pointer_cast(d_rowSums.data())); - cudaDeviceSynchronize(); - - return d_rowSums; -} - -int estimate_split(float density, int n) { - if (density > 0.5f) { - std::cout << "Error, density can not be bigger than 0.5!" 
<< std::endl; - } - - float sparsity = 1.0f - density; - float inside = 2.0f * sparsity - 1.0f; - float smaller_set_size = 0.5f * n * (1 + sqrt(inside)); - return static_cast(smaller_set_size) - 1; -} - -// Function to generate initial permutation -std::vector generate_initial_permutation(std::mt19937 & rng, int n) { - std::vector permutation(n); - for (int i = 0; i < n; ++i) { - permutation[i] = i; - } - std::shuffle(permutation.begin(), permutation.end(), rng); - return permutation; -} - - -void create_graph_sparse(int n, int split, std::vector& p, thrust::device_vector& d_rows, thrust::device_vector& d_cols, thrust::device_vector& d_vals) { - thrust::host_vector h_rows, h_cols; - thrust::host_vector h_vals; - - thrust::default_random_engine rng; - thrust::random::uniform_real_distribution dist(0.01f, 1.0f); - - std::vector> combined; - - for (int i = 0; i <= split; ++i) { - for (int j = split + 1; j < n; ++j) { - float rnd_val = dist(rng); - combined.push_back(std::make_tuple(p[i], p[j], rnd_val)); - combined.push_back(std::make_tuple(p[j], p[i], rnd_val)); - } - } - - std::sort(combined.begin(), combined.end(), [](const auto& a, const auto& b) { - if (std::get<0>(a) == std::get<0>(b)) { - return std::get<1>(a) < std::get<1>(b); - } - return std::get<0>(a) < std::get<0>(b); - }); - - for (size_t i = 0; i < combined.size(); ++i) { - h_rows.push_back(std::get<0>(combined[i])); - h_cols.push_back(std::get<1>(combined[i])); - h_vals.push_back(std::get<2>(combined[i])); - } - - d_rows = h_rows; - d_cols = h_cols; - d_vals = h_vals; -} - -thrust::device_vector generate_solution(const std::vector& p, int split) { - int size = p.size(); - - // Initialize a device vector of bool type with false (equivalent to CUDA.zeros(Bool, size(p))) - thrust::device_vector x(size, 0); - - - // Set the first split elements to true (equivalent to x[p[1:split]] .= 1) - for (int i = 0; i <= split ; ++i) { - x[p[i]] = 1; - } - - return x; -} - - -void graph_to_qubo(cusparseHandle_t handle, cusparseSpMatDescr_t& graph, cusparseSpMatDescr_t& Q, csr_data& extended_pointers) { - scale_csr_matrix(handle, -1.0f, graph, Q); - thrust::device_vector sum = sum_rows_csr_matrix(handle, graph); - fill_diagonal(handle, Q, sum, extended_pointers); - scale_csr_matrix(handle, -0.25f, Q, Q); - - for (int i = 0; i < sum.size(); i++) { - std::cout << sum[i] << std::endl; - } -} - -//float calculate_qubo_energy(cublasHandle_t cublasHandle, +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include "indicators.hpp" +//#ifdef __INTELLISENSE__ +//#include "intellisense_cuda_intrinsics.h" +//#endif +// +// +//#define CHECK_CUSPARSE(call) \ +//{ \ +// cusparseStatus_t err = call; \ +// if (err != CUSPARSE_STATUS_SUCCESS) { \ +// std::cerr << "CUSPARSE error in file " << __FILE__ \ +// << " at line " << __LINE__ << ": " \ +// << cusparseGetErrorString(err) << std::endl; \ +// exit(EXIT_FAILURE); \ +// } \ +//} +// +//#define CHECK_CUDA(call) \ +//{ \ +// cudaError_t err = call; \ +// if (err != cudaSuccess) { \ +// std::cerr << "CUDA error in file " << __FILE__ \ +// << " at line " << __LINE__ << ": " \ +// << cudaGetErrorString(err) << std::endl; \ +// exit(EXIT_FAILURE); \ +// } \ +//} +// +//#define CHECK_CUBLAS(call) \ +//{ \ +// cublasStatus_t err = call; \ +// if (err != CUBLAS_STATUS_SUCCESS) { \ +// std::cerr << "CUBLAS error in file " << __FILE__ \ +// << " at line 
" << __LINE__ << ": " \ +// << cublasGetErrorString(err) << std::endl; \ +// exit(EXIT_FAILURE); \ +// } \ +//} +// +// +//struct csr_data { +// int* rowPointer; +// int* cols; +// float* vals; +//}; +// +//const char* cusparseGetErrorString(cusparseStatus_t status) { +// switch (status) { +// case CUSPARSE_STATUS_SUCCESS: return "CUSPARSE_STATUS_SUCCESS"; +// case CUSPARSE_STATUS_NOT_INITIALIZED: return "CUSPARSE_STATUS_NOT_INITIALIZED"; +// case CUSPARSE_STATUS_ALLOC_FAILED: return "CUSPARSE_STATUS_ALLOC_FAILED"; +// case CUSPARSE_STATUS_INVALID_VALUE: return "CUSPARSE_STATUS_INVALID_VALUE"; +// case CUSPARSE_STATUS_ARCH_MISMATCH: return "CUSPARSE_STATUS_ARCH_MISMATCH"; +// case CUSPARSE_STATUS_MAPPING_ERROR: return "CUSPARSE_STATUS_MAPPING_ERROR"; +// case CUSPARSE_STATUS_EXECUTION_FAILED: return "CUSPARSE_STATUS_EXECUTION_FAILED"; +// case CUSPARSE_STATUS_INTERNAL_ERROR: return "CUSPARSE_STATUS_INTERNAL_ERROR"; +// case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; +// case CUSPARSE_STATUS_ZERO_PIVOT: return "CUSPARSE_STATUS_ZERO_PIVOT"; +// default: return "UNKNOWN CUSPARSE STATUS"; +// } +//} +// +//const char* cublasGetErrorString(cublasStatus_t status) { +// switch (status) { +// case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; +// case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; +// case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; +// case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; +// case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; +// case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; +// case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; +// case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; +// case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; +// case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR"; +// default: return "UNKNOWN CUBLAS STATUS"; +// } +//} +// +//__global__ void cols_sum(int n, const int* d_csrOffsets, const float* d_vals, float* d_rowSums) { +// int row = blockIdx.x * blockDim.x + threadIdx.x; +// if (row < n) { +// int row_start = d_csrOffsets[row]; +// int row_end = d_csrOffsets[row + 1]; +// for (int j = row_start; j < row_end; j++) { +// atomicAdd(&d_rowSums[row], d_vals[j]); +// } +// } +//} +// +//void convert_sparse_to_dense_and_display(cusparseHandle_t handle, const cusparseSpMatDescr_t& matDescr, int n) { +// // Allocate memory for the dense matrix on the device +// float* d_denseMat; +// cudaMalloc((void**)&d_denseMat, n * n * sizeof(float)); +// +// // Create a dense matrix descriptor +// cusparseDnMatDescr_t denseDescr; +// CHECK_CUSPARSE(cusparseCreateDnMat(&denseDescr, +// n, // number of rows +// n, // number of columns +// n, // leading dimension +// d_denseMat, // pointer to dense matrix data +// CUDA_R_32F, // data type +// CUSPARSE_ORDER_ROW)); // row-major order +// +// // Convert sparse matrix to dense matrix +// void* dBuffer = NULL; +// size_t bufferSize = 0; +// CHECK_CUSPARSE(cusparseSparseToDense_bufferSize(handle, +// matDescr, +// denseDescr, +// CUSPARSE_SPARSETODENSE_ALG_DEFAULT, +// &bufferSize)); +// +// CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize)); +// +// CHECK_CUSPARSE(cusparseSparseToDense(handle, +// matDescr, +// denseDescr, +// CUSPARSE_SPARSETODENSE_ALG_DEFAULT, +// dBuffer)); +// +// // Copy the dense matrix from device to host +// std::vector h_denseMat(n * 
n); +// CHECK_CUDA(cudaMemcpy(h_denseMat.data(), d_denseMat, n * n * sizeof(float), cudaMemcpyDeviceToHost)); +// +// std::cout << std::fixed << std::setprecision(4); // Set precision to 2 decimal places +// std::cout << "Dense matrix:" << std::endl; +// for (int i = 0; i < n; ++i) { +// for (int j = 0; j < n; ++j) { +// std::cout << h_denseMat[i * n + j] << " "; +// } +// std::cout << std::endl; +// } +// +// // Clean up +// CHECK_CUDA(cudaFree(d_denseMat)); +// CHECK_CUDA(cudaFree(dBuffer)); +// CHECK_CUSPARSE(cusparseDestroyDnMat(denseDescr)); +//} +// +// +//void fill_diagonal(cusparseHandle_t handle, cusparseSpMatDescr_t& input, thrust::device_vector diag, csr_data& extended_pointers) { +// int64_t n, nnz; +// int* d_csrOffsets; +// int* d_cols; +// float* d_vals; +// +// cusparseIndexType_t csrRowOffsetsType; +// cusparseIndexType_t csrColIndType; +// cusparseIndexBase_t idxBase; +// cudaDataType valueTyp; +// +// CHECK_CUSPARSE(cusparseCsrGet(input, &n, &n, &nnz, (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp)); +// +// thrust::device_vector d_rows_tdv(nnz); +// thrust::device_vector d_cols_tdv(d_cols, d_cols+nnz); +// thrust::device_vector d_vals_tdv(d_vals, d_vals+nnz); +// +// CHECK_CUSPARSE(cusparseXcsr2coo(handle, +// d_csrOffsets, +// nnz, +// n, +// thrust::raw_pointer_cast(d_rows_tdv.data()), +// CUSPARSE_INDEX_BASE_ZERO)); +// +// d_rows_tdv.resize(nnz + n); +// d_cols_tdv.resize(nnz + n); +// d_vals_tdv.resize(nnz + n); +// +// int nnz_n = nnz + n; +// +// thrust::device_vector d_vec(n); +// +// // Fill the vector with values from 0 to n-1 +// thrust::sequence(d_vec.begin(), d_vec.end()); +// +// thrust::copy(d_vec.begin(), d_vec.end(), d_rows_tdv.begin() + nnz); +// thrust::copy(d_vec.begin(), d_vec.end(), d_cols_tdv.begin() + nnz); +// thrust::copy(diag.begin(), diag.end(), d_vals_tdv.begin() + nnz); +// +// /*thrust::copy(d_rows_tdv.begin(), d_rows_tdv.end(), ); +// thrust::copy(d_cols_tdv.begin(), d_cols_tdv.end(), extended_pointers.cols); +// thrust::copy(d_vals_tdv.begin(), d_vals_tdv.end(), extended_pointers.vals);*/ +// +// +// CHECK_CUSPARSE(cusparseXcsr2coo(handle, +// d_csrOffsets, +// nnz, +// n, +// thrust::raw_pointer_cast(d_rows_tdv.data()), +// CUSPARSE_INDEX_BASE_ZERO)); +// thrust::device_vector d_csrOffsets_o(n + 1); +// +// thrust::sort_by_key(thrust::make_zip_iterator(thrust::make_tuple(d_rows_tdv.begin(), d_cols_tdv.begin())), +// thrust::make_zip_iterator(thrust::make_tuple(d_rows_tdv.end(), d_cols_tdv.end())), +// d_vals_tdv.begin()); +// +// CHECK_CUSPARSE(cusparseXcoo2csr(handle, +// thrust::raw_pointer_cast(d_rows_tdv.data()), +// nnz + n, +// n, +// thrust::raw_pointer_cast(d_csrOffsets_o.data()), +// CUSPARSE_INDEX_BASE_ZERO)); +// +// CHECK_CUDA(cudaMemcpy(extended_pointers.rowPointer, thrust::raw_pointer_cast(d_csrOffsets_o.data()), (n + 1) * sizeof(int), cudaMemcpyDeviceToDevice)); +// CHECK_CUDA(cudaMemcpy(extended_pointers.cols, thrust::raw_pointer_cast(d_cols_tdv.data()), nnz_n * sizeof(int), cudaMemcpyDeviceToDevice)); +// CHECK_CUDA(cudaMemcpy(extended_pointers.vals, thrust::raw_pointer_cast(d_vals_tdv.data()), nnz_n * sizeof(float), cudaMemcpyDeviceToDevice)); +// +// CHECK_CUSPARSE(cusparseCsrSetPointers(input, +// extended_pointers.rowPointer, +// extended_pointers.cols, +// extended_pointers.vals)); +// +//} +// +// +//void scale_csr_matrix(cusparseHandle_t handle, +// float alpha, +// cusparseSpMatDescr_t& input, +// cusparseSpMatDescr_t& result) { +// // Extract the 
dimensions and the number of non-zero elements +// int64_t n, nnz; +// int* d_csrOffsets; +// int* d_cols; +// float* d_vals; +// +// int64_t n_r, nnz_r; +// int* d_csrOffsets_r; +// int* d_cols_r; +// float* d_vals_r; +// +// cusparseIndexType_t csrRowOffsetsType; +// cusparseIndexType_t csrColIndType; +// cusparseIndexBase_t idxBase; +// cudaDataType valueTyp; +// +// cusparseCsrGet(input, &n, &n, &nnz, (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp); +// cusparseCsrGet(result, &n_r, &n_r, &nnz_r, (void**)&d_csrOffsets_r, (void**)&d_cols_r, (void**)&d_vals_r, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp); +// +// // Set scaling factors +// const float beta = 0.0f; +// size_t bufferSize = 0; +// +// cusparseMatDescr_t input_desc; +// cusparseCreateMatDescr(&input_desc); +// +// // Create matrix descriptor for the result matrix C +// cusparseMatDescr_t result_desc; +// cusparseCreateMatDescr(&result_desc); +// +// // Get buffer size for the operation +// cusparseScsrgeam2_bufferSizeExt(handle, +// n, n, +// &alpha, input_desc, nnz, +// d_vals, +// d_csrOffsets, +// d_cols, +// &beta, input_desc, nnz, +// d_vals, +// d_csrOffsets, +// d_cols, +// result_desc, +// d_vals_r, +// d_csrOffsets_r, +// d_cols_r, +// &bufferSize); +// +// void* dBuffer; +// cudaMalloc(&dBuffer, bufferSize); +// +// // Perform the scaling operation +// cusparseScsrgeam2(handle, +// n, n, +// &alpha, input_desc, nnz, +// d_vals, +// d_csrOffsets, +// d_cols, +// &beta, input_desc, nnz, +// d_vals, +// d_csrOffsets, +// d_cols, +// result_desc, +// d_vals_r, +// d_csrOffsets_r, +// d_cols_r, +// dBuffer); +// // Clean up +// cudaFree(dBuffer); +// cusparseDestroyMatDescr(input_desc); +// cusparseDestroyMatDescr(result_desc); +//} +// +//thrust::device_vector sum_rows_csr_matrix(cusparseHandle_t handle, cusparseSpMatDescr_t input) { +// // Extract CSR matrix information +// int64_t rows, cols, nnz; +// int* d_csrOffsets, * d_cols; +// float* d_vals; +// +// cusparseIndexType_t csrRowOffsetsType; +// cusparseIndexType_t csrColIndType; +// cusparseIndexBase_t idxBase; +// cudaDataType valueTyp; +// +// cusparseCsrGet(input, &rows, &cols, &nnz, +// (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, +// &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp); +// +// // Allocate memory for the row sums +// thrust::device_vector d_rowSums(rows); +// +// // Launch kernel to sum elements of each row using atomicAdd +// int blockSize = 256; +// int gridSize = (rows + blockSize - 1) / blockSize; +// cols_sum << > > (rows, d_csrOffsets, d_vals, thrust::raw_pointer_cast(d_rowSums.data())); +// cudaDeviceSynchronize(); +// +// return d_rowSums; +//} +// +//int estimate_split(float density, int n) { +// if (density > 0.5f) { +// std::cout << "Error, density can not be bigger than 0.5!" 
<< std::endl; +// } +// +// float sparsity = 1.0f - density; +// float inside = 2.0f * sparsity - 1.0f; +// float smaller_set_size = 0.5f * n * (1 + sqrt(inside)); +// return static_cast(smaller_set_size) - 1; +//} +// +//// Function to generate initial permutation +//std::vector generate_initial_permutation(std::mt19937 & rng, int n) { +// std::vector permutation(n); +// for (int i = 0; i < n; ++i) { +// permutation[i] = i; +// } +// std::shuffle(permutation.begin(), permutation.end(), rng); +// return permutation; +//} +// +// +//void create_graph_sparse(int n, int split, std::vector& p, thrust::device_vector& d_rows, thrust::device_vector& d_cols, thrust::device_vector& d_vals) { +// thrust::host_vector h_rows, h_cols; +// thrust::host_vector h_vals; +// +// thrust::default_random_engine rng; +// thrust::random::uniform_real_distribution dist(0.01f, 1.0f); +// +// std::vector> combined; +// +// for (int i = 0; i <= split; ++i) { +// for (int j = split + 1; j < n; ++j) { +// float rnd_val = dist(rng); +// combined.push_back(std::make_tuple(p[i], p[j], rnd_val)); +// combined.push_back(std::make_tuple(p[j], p[i], rnd_val)); +// } +// } +// +// std::sort(combined.begin(), combined.end(), [](const auto& a, const auto& b) { +// if (std::get<0>(a) == std::get<0>(b)) { +// return std::get<1>(a) < std::get<1>(b); +// } +// return std::get<0>(a) < std::get<0>(b); +// }); +// +// for (size_t i = 0; i < combined.size(); ++i) { +// h_rows.push_back(std::get<0>(combined[i])); +// h_cols.push_back(std::get<1>(combined[i])); +// h_vals.push_back(std::get<2>(combined[i])); +// } +// +// d_rows = h_rows; +// d_cols = h_cols; +// d_vals = h_vals; +//} +// +//thrust::device_vector generate_solution(const std::vector& p, int split) { +// int size = p.size(); +// +// // Initialize a device vector of bool type with false (equivalent to CUDA.zeros(Bool, size(p))) +// thrust::device_vector x(size, 0); +// +// +// // Set the first split elements to true (equivalent to x[p[1:split]] .= 1) +// for (int i = 0; i <= split ; ++i) { +// x[p[i]] = 1; +// } +// +// return x; +//} +// +// +//void graph_to_qubo(cusparseHandle_t handle, cusparseSpMatDescr_t& graph, cusparseSpMatDescr_t& Q, csr_data& extended_pointers) { +// scale_csr_matrix(handle, -1.0f, graph, Q); +// thrust::device_vector sum = sum_rows_csr_matrix(handle, graph); +// fill_diagonal(handle, Q, sum, extended_pointers); +// scale_csr_matrix(handle, -0.25f, Q, Q); +// +// for (int i = 0; i < sum.size(); i++) { +// std::cout << sum[i] << std::endl; +// } +//} +// +////float calculate_qubo_energy(cublasHandle_t cublasHandle, +//// cusparseHandle_t handle, +//// int n, +//// const cusparseSpMatDescr_t& Q, +//// const thrust::device_vector& x) { +//// float alpha = 1.0f; +//// float beta = 0.0f; +//// size_t bufferSize = 0; +//// void* dBuffer = nullptr; +//// +//// // Create dense vector descriptors +//// cusparseDnVecDescr_t vecX, vecY; +//// float* Qx; +//// float* in_x; +//// +//// cudaMalloc((void**)&Qx, n * sizeof(float)); +//// cudaMalloc((void**)&in_x, n * sizeof(float)); +//// +//// cudaMemcpy(in_x, thrust::raw_pointer_cast(x.data()), n * sizeof(float), cudaMemcpyDeviceToDevice); +//// +//// cusparseCreateDnVec(&vecX, n, in_x, CUDA_R_32F); +//// cusparseCreateDnVec(&vecY, n, Qx, CUDA_R_32F); +//// // Allocate buffer +//// cusparseSpMV_bufferSize(handle, +//// CUSPARSE_OPERATION_NON_TRANSPOSE, +//// &alpha, +//// Q, +//// vecX, +//// &beta, +//// vecY, +//// CUDA_R_32F, +//// CUSPARSE_SPMV_ALG_DEFAULT, +//// &bufferSize); +//// cudaMalloc(&dBuffer, 
bufferSize); +//// // Perform Q * x +//// cusparseSpMV(handle, +//// CUSPARSE_OPERATION_NON_TRANSPOSE, +//// &alpha, +//// Q, +//// vecX, +//// &beta, +//// vecY, +//// CUDA_R_32F, +//// CUSPARSE_SPMV_ALG_DEFAULT, +//// dBuffer); +//// +//// float* Qx_ptr; +//// cusparseDnVecGetValues(vecY, (void**)&Qx_ptr); +//// +//// // Compute the dot product x^T * (Q * x) using cuBLAS +//// float result; +//// cublasSdot(cublasHandle, n, in_x, 1, Qx_ptr, 1, &result); +//// // Clean up +//// cusparseDestroyDnVec(vecX); +//// cusparseDestroyDnVec(vecY); +//// cudaFree(dBuffer); +//// cudaFree(Qx); +//// cudaFree(in_x); +//// return result; +////} +// +// +//float qubo_eng(cublasHandle_t cublasHandle, // cusparseHandle_t handle, +// const cusparseSpMatDescr_t& Q, +// float* sol_vector) { +// int64_t rows, cols, nnz; +// int* d_csrOffsets, * d_cols; +// float* d_vals; +// int* dA_csrOffsets, * dA_columns; +// float* dA_values; +// +// cusparseIndexType_t csrRowOffsetsType; +// cusparseIndexType_t csrColIndType; +// cusparseIndexBase_t idxBase; +// cudaDataType valueTyp; +// +// CHECK_CUSPARSE(cusparseSpMatGetSize(Q, &rows, &cols, &nnz)); +// +// CHECK_CUDA(cudaMalloc((void**)&d_csrOffsets, (rows + 1) * sizeof(int))); +// CHECK_CUDA(cudaMalloc((void**)&d_cols, nnz * sizeof(int))); +// CHECK_CUDA(cudaMalloc((void**)&d_vals, nnz * sizeof(float))); +// +// CHECK_CUSPARSE(cusparseCsrGet(Q, &rows, &cols, &nnz, +// (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, +// &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp)); +// // Host problem definition +// const int A_num_rows = rows; +// const int A_num_cols = cols; +// const int A_nnz = nnz; +// //int h_test[11]; +// //int* h_test = (int*)calloc(rows+1, sizeof(int)); +// float alpha = 1.0f; +// float beta = 0.0f; +// //CHECK_CUDA(cudaMemcpy(h_test, d_csrOffsets, (rows + 1) * sizeof(int), cudaMemcpyDeviceToHost)); +// ////cudaMemcpy(h_test, d_csrOffsets, (A_num_rows + 1) * sizeof(int), cudaMemcpyDeviceToHost); +// ////CHECK_CUDA(cudaMemcpy(hCsrRowOffsets_toverify, dCsrRowOffsets_toverify, (rows + 1) * sizeof(int), cudaMemcpyDeviceToHost)); +// //for (int i = 0; i < A_num_rows+1; i++) { +// // std::cout << h_test[i] << std::endl; +// //} +// //-------------------------------------------------------------------------- +// // Device memory management +// float * dX, * dY; +// CHECK_CUDA(cudaMalloc((void**)&dA_csrOffsets, (A_num_rows + 1) * sizeof(int))); +// CHECK_CUDA(cudaMalloc((void**)&dA_columns, A_nnz * sizeof(int))); +// CHECK_CUDA(cudaMalloc((void**)&dA_values, A_nnz * sizeof(float))); +// CHECK_CUDA(cudaMalloc((void**)&dX, A_num_cols * sizeof(float))); +// CHECK_CUDA(cudaMalloc((void**)&dY, A_num_rows * sizeof(float))); +// +// CHECK_CUDA(cudaMemcpy(dA_csrOffsets, d_csrOffsets, (A_num_rows + 1) * sizeof(int), cudaMemcpyDeviceToDevice)); +// CHECK_CUDA(cudaMemcpy(dA_columns, d_cols, A_nnz * sizeof(int), cudaMemcpyDeviceToDevice)); +// CHECK_CUDA(cudaMemcpy(dA_values, d_vals, A_nnz * sizeof(float), cudaMemcpyDeviceToDevice)); +// CHECK_CUDA(cudaMemcpy(dX, sol_vector, A_num_cols * sizeof(float), cudaMemcpyDeviceToDevice)); +// cusparseSpMatDescr_t matA; +// cusparseDnVecDescr_t vecX, vecY; +// void* dBuffer = NULL; +// size_t bufferSize = 0; +// CHECK_CUSPARSE(cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_nnz, +// dA_csrOffsets, dA_columns, dA_values, +// CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, +// CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); +// // Create dense vector X +// CHECK_CUSPARSE(cusparseCreateDnVec(&vecX, A_num_cols, dX, CUDA_R_32F)); 
+// // Create dense vector y +// CHECK_CUSPARSE(cusparseCreateDnVec(&vecY, A_num_rows, dY, CUDA_R_32F)); +// // allocate an external buffer if needed +// CHECK_CUSPARSE(cusparseSpMV_bufferSize( +// handle, CUSPARSE_OPERATION_NON_TRANSPOSE, +// &alpha, matA, vecX, &beta, vecY, CUDA_R_32F, +// CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)); +// CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize)); +// +// // execute SpMV +// CHECK_CUSPARSE(cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, +// &alpha, matA, vecX, &beta, vecY, CUDA_R_32F, +// CUSPARSE_SPMV_ALG_DEFAULT, dBuffer)); +// +// // destroy matrix/vector descriptors +// CHECK_CUSPARSE(cusparseDestroySpMat(matA)); +// CHECK_CUSPARSE(cusparseDestroyDnVec(vecX)); +// CHECK_CUSPARSE(cusparseDestroyDnVec(vecY)); +// +// /*CHECK_CUDA(cudaMemcpy(hY, dY, A_num_rows * sizeof(float), cudaMemcpyDeviceToHost)); +// std::cout << "Result of Q*x: " << std::endl; +// for (int i = 0; i < A_num_rows; i++) { +// std::cout << hY[i] << std::endl; +// }*/ +// +// float result; +// CHECK_CUBLAS(cublasSdot(cublasHandle, A_num_rows, dX, 1, dY, 1, &result)); +// //std::cout << "Energy: " << result << std::endl; +// return result; +//} +// +//float calculate_qubo_energy(cublasHandle_t cublasHandle, +// cusparseHandle_t cusparseHandle, // int n, // const cusparseSpMatDescr_t& Q, -// const thrust::device_vector& x) { +// thrust::device_vector& x) { // float alpha = 1.0f; // float beta = 0.0f; // size_t bufferSize = 0; @@ -432,17 +586,26 @@ void graph_to_qubo(cusparseHandle_t handle, cusparseSpMatDescr_t& graph, cuspars // // Create dense vector descriptors // cusparseDnVecDescr_t vecX, vecY; // float* Qx; +// float* Qx_ptr; +// //float* hY = (float*)calloc(n, sizeof(float)); // float* in_x; // -// cudaMalloc((void**)&Qx, n * sizeof(float)); -// cudaMalloc((void**)&in_x, n * sizeof(float)); +// /* for (int i = 0; i < n; i++) { +// hY[i] = 0.0f; +// }*/ // -// cudaMemcpy(in_x, thrust::raw_pointer_cast(x.data()), n * sizeof(float), cudaMemcpyDeviceToDevice); +// CHECK_CUDA(cudaMalloc((void**)&Qx, n * sizeof(float))); +// CHECK_CUDA(cudaMalloc((void**)&Qx_ptr, n * sizeof(float))); +// CHECK_CUDA(cudaMalloc((void**)&in_x, n * sizeof(float))); +// +// CHECK_CUDA(cudaMemcpy(in_x, thrust::raw_pointer_cast(x.data()), n * sizeof(float), cudaMemcpyDeviceToDevice)); +// //CHECK_CUDA(cudaMemcpy(Qx, hY, n * sizeof(float), cudaMemcpyHostToDevice)); +// +// CHECK_CUSPARSE(cusparseCreateDnVec(&vecX, n, in_x, CUDA_R_32F)); +// CHECK_CUSPARSE(cusparseCreateDnVec(&vecY, n, Qx, CUDA_R_32F)); // -// cusparseCreateDnVec(&vecX, n, in_x, CUDA_R_32F); -// cusparseCreateDnVec(&vecY, n, Qx, CUDA_R_32F); // // Allocate buffer -// cusparseSpMV_bufferSize(handle, +// CHECK_CUSPARSE(cusparseSpMV_bufferSize(cusparseHandle, // CUSPARSE_OPERATION_NON_TRANSPOSE, // &alpha, // Q, @@ -451,10 +614,11 @@ void graph_to_qubo(cusparseHandle_t handle, cusparseSpMatDescr_t& graph, cuspars // vecY, // CUDA_R_32F, // CUSPARSE_SPMV_ALG_DEFAULT, -// &bufferSize); -// cudaMalloc(&dBuffer, bufferSize); +// &bufferSize)); +// CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize)); +// // // Perform Q * x -// cusparseSpMV(handle, +// CHECK_CUSPARSE(cusparseSpMV(cusparseHandle, // CUSPARSE_OPERATION_NON_TRANSPOSE, // &alpha, // Q, @@ -463,223 +627,131 @@ void graph_to_qubo(cusparseHandle_t handle, cusparseSpMatDescr_t& graph, cuspars // vecY, // CUDA_R_32F, // CUSPARSE_SPMV_ALG_DEFAULT, -// dBuffer); +// dBuffer)); +// +// // Extract the raw pointers to the data in the dense vector descriptors +// //float* Qx_ptr; +// 
//CHECK_CUSPARSE(cusparseDnVecGetValues(vecY, (void**)&Qx_ptr)); +// // Ensure the computation is complete before accessing the result +// CHECK_CUDA(cudaDeviceSynchronize()); +// +// /*CHECK_CUDA(cudaMemcpy(hY, Qx, n * sizeof(float), cudaMemcpyDeviceToHost)); +// for (int i = 0; i < n; i++) { +// std::cout << hY[i] << std::endl; +// }*/ // -// float* Qx_ptr; -// cusparseDnVecGetValues(vecY, (void**)&Qx_ptr); -// // // Compute the dot product x^T * (Q * x) using cuBLAS // float result; -// cublasSdot(cublasHandle, n, in_x, 1, Qx_ptr, 1, &result); +// CHECK_CUBLAS(cublasSdot(cublasHandle, n, in_x, 1, Qx, 1, &result)); +// // // Clean up -// cusparseDestroyDnVec(vecX); -// cusparseDestroyDnVec(vecY); -// cudaFree(dBuffer); -// cudaFree(Qx); -// cudaFree(in_x); +// CHECK_CUSPARSE(cusparseDestroyDnVec(vecX)); +// CHECK_CUSPARSE(cusparseDestroyDnVec(vecY)); +// CHECK_CUDA(cudaFree(dBuffer)); +// CHECK_CUDA(cudaFree(Qx)); +// CHECK_CUDA(cudaFree(in_x)); +// // return result; //} - - -float qubo_eng(cublasHandle_t cublasHandle, - cusparseHandle_t handle, - const cusparseSpMatDescr_t& Q, - float* sol_vector) { - int64_t rows, cols, nnz; - int* d_csrOffsets, * d_cols; - float* d_vals; - int* dA_csrOffsets, * dA_columns; - float* dA_values; - - cusparseIndexType_t csrRowOffsetsType; - cusparseIndexType_t csrColIndType; - cusparseIndexBase_t idxBase; - cudaDataType valueTyp; - - CHECK_CUSPARSE(cusparseSpMatGetSize(Q, &rows, &cols, &nnz)); - - CHECK_CUDA(cudaMalloc((void**)&d_csrOffsets, (rows + 1) * sizeof(int))); - CHECK_CUDA(cudaMalloc((void**)&d_cols, nnz * sizeof(int))); - CHECK_CUDA(cudaMalloc((void**)&d_vals, nnz * sizeof(float))); - - CHECK_CUSPARSE(cusparseCsrGet(Q, &rows, &cols, &nnz, - (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, - &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp)); - // Host problem definition - const int A_num_rows = rows; - const int A_num_cols = cols; - const int A_nnz = nnz; - //int h_test[11]; - //int* h_test = (int*)calloc(rows+1, sizeof(int)); - float alpha = 1.0f; - float beta = 0.0f; - //CHECK_CUDA(cudaMemcpy(h_test, d_csrOffsets, (rows + 1) * sizeof(int), cudaMemcpyDeviceToHost)); - ////cudaMemcpy(h_test, d_csrOffsets, (A_num_rows + 1) * sizeof(int), cudaMemcpyDeviceToHost); - ////CHECK_CUDA(cudaMemcpy(hCsrRowOffsets_toverify, dCsrRowOffsets_toverify, (rows + 1) * sizeof(int), cudaMemcpyDeviceToHost)); - //for (int i = 0; i < A_num_rows+1; i++) { - // std::cout << h_test[i] << std::endl; - //} - //-------------------------------------------------------------------------- - // Device memory management - float * dX, * dY; - CHECK_CUDA(cudaMalloc((void**)&dA_csrOffsets, (A_num_rows + 1) * sizeof(int))); - CHECK_CUDA(cudaMalloc((void**)&dA_columns, A_nnz * sizeof(int))); - CHECK_CUDA(cudaMalloc((void**)&dA_values, A_nnz * sizeof(float))); - CHECK_CUDA(cudaMalloc((void**)&dX, A_num_cols * sizeof(float))); - CHECK_CUDA(cudaMalloc((void**)&dY, A_num_rows * sizeof(float))); - - CHECK_CUDA(cudaMemcpy(dA_csrOffsets, d_csrOffsets, (A_num_rows + 1) * sizeof(int), cudaMemcpyDeviceToDevice)); - CHECK_CUDA(cudaMemcpy(dA_columns, d_cols, A_nnz * sizeof(int), cudaMemcpyDeviceToDevice)); - CHECK_CUDA(cudaMemcpy(dA_values, d_vals, A_nnz * sizeof(float), cudaMemcpyDeviceToDevice)); - CHECK_CUDA(cudaMemcpy(dX, sol_vector, A_num_cols * sizeof(float), cudaMemcpyDeviceToDevice)); - cusparseSpMatDescr_t matA; - cusparseDnVecDescr_t vecX, vecY; - void* dBuffer = NULL; - size_t bufferSize = 0; - CHECK_CUSPARSE(cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_nnz, - 
dA_csrOffsets, dA_columns, dA_values, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); - // Create dense vector X - CHECK_CUSPARSE(cusparseCreateDnVec(&vecX, A_num_cols, dX, CUDA_R_32F)); - // Create dense vector y - CHECK_CUSPARSE(cusparseCreateDnVec(&vecY, A_num_rows, dY, CUDA_R_32F)); - // allocate an external buffer if needed - CHECK_CUSPARSE(cusparseSpMV_bufferSize( - handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, matA, vecX, &beta, vecY, CUDA_R_32F, - CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)); - CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize)); - - // execute SpMV - CHECK_CUSPARSE(cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, matA, vecX, &beta, vecY, CUDA_R_32F, - CUSPARSE_SPMV_ALG_DEFAULT, dBuffer)); - - // destroy matrix/vector descriptors - CHECK_CUSPARSE(cusparseDestroySpMat(matA)); - CHECK_CUSPARSE(cusparseDestroyDnVec(vecX)); - CHECK_CUSPARSE(cusparseDestroyDnVec(vecY)); - - /*CHECK_CUDA(cudaMemcpy(hY, dY, A_num_rows * sizeof(float), cudaMemcpyDeviceToHost)); - std::cout << "Result of Q*x: " << std::endl; - for (int i = 0; i < A_num_rows; i++) { - std::cout << hY[i] << std::endl; - }*/ - - float result; - CHECK_CUBLAS(cublasSdot(cublasHandle, A_num_rows, dX, 1, dY, 1, &result)); - //std::cout << "Energy: " << result << std::endl; - return result; -} - -float calculate_qubo_energy(cublasHandle_t cublasHandle, - cusparseHandle_t cusparseHandle, - int n, - const cusparseSpMatDescr_t& Q, - thrust::device_vector& x) { - float alpha = 1.0f; - float beta = 0.0f; - size_t bufferSize = 0; - void* dBuffer = nullptr; - - // Create dense vector descriptors - cusparseDnVecDescr_t vecX, vecY; - float* Qx; - float* Qx_ptr; - //float* hY = (float*)calloc(n, sizeof(float)); - float* in_x; - - /* for (int i = 0; i < n; i++) { - hY[i] = 0.0f; - }*/ - - CHECK_CUDA(cudaMalloc((void**)&Qx, n * sizeof(float))); - CHECK_CUDA(cudaMalloc((void**)&Qx_ptr, n * sizeof(float))); - CHECK_CUDA(cudaMalloc((void**)&in_x, n * sizeof(float))); - - CHECK_CUDA(cudaMemcpy(in_x, thrust::raw_pointer_cast(x.data()), n * sizeof(float), cudaMemcpyDeviceToDevice)); - //CHECK_CUDA(cudaMemcpy(Qx, hY, n * sizeof(float), cudaMemcpyHostToDevice)); - - CHECK_CUSPARSE(cusparseCreateDnVec(&vecX, n, in_x, CUDA_R_32F)); - CHECK_CUSPARSE(cusparseCreateDnVec(&vecY, n, Qx, CUDA_R_32F)); - - // Allocate buffer - CHECK_CUSPARSE(cusparseSpMV_bufferSize(cusparseHandle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, - Q, - vecX, - &beta, - vecY, - CUDA_R_32F, - CUSPARSE_SPMV_ALG_DEFAULT, - &bufferSize)); - CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize)); - - // Perform Q * x - CHECK_CUSPARSE(cusparseSpMV(cusparseHandle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, - Q, - vecX, - &beta, - vecY, - CUDA_R_32F, - CUSPARSE_SPMV_ALG_DEFAULT, - dBuffer)); - - // Extract the raw pointers to the data in the dense vector descriptors - //float* Qx_ptr; - //CHECK_CUSPARSE(cusparseDnVecGetValues(vecY, (void**)&Qx_ptr)); - // Ensure the computation is complete before accessing the result - CHECK_CUDA(cudaDeviceSynchronize()); - - /*CHECK_CUDA(cudaMemcpy(hY, Qx, n * sizeof(float), cudaMemcpyDeviceToHost)); - for (int i = 0; i < n; i++) { - std::cout << hY[i] << std::endl; - }*/ - - // Compute the dot product x^T * (Q * x) using cuBLAS - float result; - CHECK_CUBLAS(cublasSdot(cublasHandle, n, in_x, 1, Qx, 1, &result)); - - // Clean up - CHECK_CUSPARSE(cusparseDestroyDnVec(vecX)); - CHECK_CUSPARSE(cusparseDestroyDnVec(vecY)); - CHECK_CUDA(cudaFree(dBuffer)); - CHECK_CUDA(cudaFree(Qx)); - 
CHECK_CUDA(cudaFree(in_x)); - - return result; -} - +// +////void brute_force_solutions(cublasHandle_t cublasHandle, +//// cusparseHandle_t handle, +//// int n, +//// const cusparseSpMatDescr_t& Q) { +//// +//// int num_solutions = 1 << n; // 2^n +//// thrust::device_vector d_solutions(num_solutions * n); +//// for (int i = 0; i < num_solutions; ++i) { +//// for (int j = 0; j < n; ++j) { +//// float val = static_cast((i & (1 << j)) != 0); +//// d_solutions[i * n + j] = val; +//// } +//// } +//// +//// +//// // Allocate memory for energies +//// thrust::device_vector d_energies(num_solutions); +//// thrust::device_vector sol_x(n); +//// for (int i = 0; i < num_solutions; ++i) { +//// thrust::copy(d_solutions.begin() + (i * n), d_solutions.begin() + ((i + 1) * n), sol_x.begin()); +//// float eng = calculate_qubo_energy(cublasHandle, handle, n, Q, sol_x); +//// d_energies[i] = eng; +//// std::cout << "Energy i: " << eng << std::endl; +//// } +//// +//// +//// // Find the minimum energy +//// auto min_energy_iter = thrust::min_element(d_energies.begin(), d_energies.end()); +//// float min_energy = *min_energy_iter; +//// int min_index = min_energy_iter - d_energies.begin(); +//// +//// // Copy the best solution to host +//// thrust::host_vector h_best_solution(n); +//// thrust::copy(d_solutions.begin() + min_index * n, d_solutions.begin() + (min_index + 1) * n, h_best_solution.begin()); +//// +//// // Print the result +//// std::cout << "Best energy: " << min_energy << std::endl; +//// std::cout << "Best solution: "; +//// for (float bit : h_best_solution) { +//// std::cout << bit << " "; +//// } +//// std::cout << std::endl; +//// convert_sparse_to_dense_and_display(handle, Q, n); +////} +// //void brute_force_solutions(cublasHandle_t cublasHandle, // cusparseHandle_t handle, // int n, // const cusparseSpMatDescr_t& Q) { -// +// indicators::show_console_cursor(false); // int num_solutions = 1 << n; // 2^n // thrust::device_vector d_solutions(num_solutions * n); +// /* // for (int i = 0; i < num_solutions; ++i) { // for (int j = 0; j < n; ++j) { // float val = static_cast((i & (1 << j)) != 0); // d_solutions[i * n + j] = val; // } -// } -// +// }*/ // +// //convert_sparse_to_dense_and_display(handle, Q, n); // // Allocate memory for energies // thrust::device_vector d_energies(num_solutions); // thrust::device_vector sol_x(n); +// +// indicators::ProgressBar bar{ +// indicators::option::BarWidth{40}, +// indicators::option::Start{"["}, +// indicators::option::Fill{"="}, +// indicators::option::Lead{">"}, +// indicators::option::Remainder{" "}, +// indicators::option::End{" ]"}, +// indicators::option::ForegroundColor{indicators::Color::white}, +// indicators::option::FontStyles{ +// std::vector{indicators::FontStyle::bold}}, +// indicators::option::MaxProgress{num_solutions} +// }; +// int smallest_ind = -1; +// float smallest_eng = 1000; +// // for (int i = 0; i < num_solutions; ++i) { -// thrust::copy(d_solutions.begin() + (i * n), d_solutions.begin() + ((i + 1) * n), sol_x.begin()); -// float eng = calculate_qubo_energy(cublasHandle, handle, n, Q, sol_x); +// for (int j = 0; j < n; ++j) { +// float val = static_cast((i & (1 << j)) != 0); +// sol_x[j] = val; +// d_solutions[i * n + j] = val; +// } +// float eng = qubo_eng(cublasHandle, handle, Q, thrust::raw_pointer_cast(sol_x.data())); +// //float eng = calculate_qubo_energy(cublasHandle, handle, n, Q, sol_x); // d_energies[i] = eng; -// std::cout << "Energy i: " << eng << std::endl; +// if (eng <= smallest_eng) { +// smallest_ind = i; +// 
smallest_eng = eng; +// } +// //std::cout << "Energy i: " << eng << std::endl; +// bar.set_option(indicators::option::PostfixText{ std::to_string(i) + "/" + std::to_string(num_solutions) }); +// bar.tick(); // } -// -// +// std::cout << "Tested all solutions, now sorting." << std::endl; // // Find the minimum energy // auto min_energy_iter = thrust::min_element(d_energies.begin(), d_energies.end()); // float min_energy = *min_energy_iter; @@ -687,10 +759,11 @@ float calculate_qubo_energy(cublasHandle_t cublasHandle, // // // Copy the best solution to host // thrust::host_vector h_best_solution(n); -// thrust::copy(d_solutions.begin() + min_index * n, d_solutions.begin() + (min_index + 1) * n, h_best_solution.begin()); +// thrust::copy(d_solutions.begin() + smallest_ind * n, d_solutions.begin() + (smallest_ind + 1) * n, h_best_solution.begin()); // // // Print the result -// std::cout << "Best energy: " << min_energy << std::endl; +// std::cout << "Best energy my: " << smallest_eng << " and id: " << smallest_ind << std::endl; +// std::cout << "Best energy: " << min_energy << " and id: " << min_index << std::endl; // std::cout << "Best solution: "; // for (float bit : h_best_solution) { // std::cout << bit << " "; @@ -698,198 +771,125 @@ float calculate_qubo_energy(cublasHandle_t cublasHandle, // std::cout << std::endl; // convert_sparse_to_dense_and_display(handle, Q, n); //} - -void brute_force_solutions(cublasHandle_t cublasHandle, - cusparseHandle_t handle, - int n, - const cusparseSpMatDescr_t& Q) { - indicators::show_console_cursor(false); - int num_solutions = 1 << n; // 2^n - thrust::device_vector d_solutions(num_solutions * n); - /* - for (int i = 0; i < num_solutions; ++i) { - for (int j = 0; j < n; ++j) { - float val = static_cast((i & (1 << j)) != 0); - d_solutions[i * n + j] = val; - } - }*/ - - //convert_sparse_to_dense_and_display(handle, Q, n); - // Allocate memory for energies - thrust::device_vector d_energies(num_solutions); - thrust::device_vector sol_x(n); - - indicators::ProgressBar bar{ - indicators::option::BarWidth{40}, - indicators::option::Start{"["}, - indicators::option::Fill{"="}, - indicators::option::Lead{">"}, - indicators::option::Remainder{" "}, - indicators::option::End{" ]"}, - indicators::option::ForegroundColor{indicators::Color::white}, - indicators::option::FontStyles{ - std::vector{indicators::FontStyle::bold}}, - indicators::option::MaxProgress{num_solutions} - }; - int smallest_ind = -1; - float smallest_eng = 1000; - - for (int i = 0; i < num_solutions; ++i) { - for (int j = 0; j < n; ++j) { - float val = static_cast((i & (1 << j)) != 0); - sol_x[j] = val; - d_solutions[i * n + j] = val; - } - float eng = qubo_eng(cublasHandle, handle, Q, thrust::raw_pointer_cast(sol_x.data())); - //float eng = calculate_qubo_energy(cublasHandle, handle, n, Q, sol_x); - d_energies[i] = eng; - if (eng <= smallest_eng) { - smallest_ind = i; - smallest_eng = eng; - } - //std::cout << "Energy i: " << eng << std::endl; - bar.set_option(indicators::option::PostfixText{ std::to_string(i) + "/" + std::to_string(num_solutions) }); - bar.tick(); - } - std::cout << "Tested all solutions, now sorting." 
<< std::endl; - // Find the minimum energy - auto min_energy_iter = thrust::min_element(d_energies.begin(), d_energies.end()); - float min_energy = *min_energy_iter; - int min_index = min_energy_iter - d_energies.begin(); - - // Copy the best solution to host - thrust::host_vector h_best_solution(n); - thrust::copy(d_solutions.begin() + smallest_ind * n, d_solutions.begin() + (smallest_ind + 1) * n, h_best_solution.begin()); - - // Print the result - std::cout << "Best energy my: " << smallest_eng << " and id: " << smallest_ind << std::endl; - std::cout << "Best energy: " << min_energy << " and id: " << min_index << std::endl; - std::cout << "Best solution: "; - for (float bit : h_best_solution) { - std::cout << bit << " "; - } - std::cout << std::endl; - convert_sparse_to_dense_and_display(handle, Q, n); -} - - -int main() { - int n = 12; - int seed = 14; - float density = 0.5; - std::mt19937 rng(seed); - - std::vector p = generate_initial_permutation(rng, n); - - int split = estimate_split(density, n); // Example split, can be computed as needed - std::cout << "Split: " << split << std::endl; - - thrust::device_vector d_rows; - thrust::device_vector d_cols; - thrust::device_vector d_vals; - - create_graph_sparse(n, split, p, d_rows, d_cols, d_vals); - thrust::device_vector d_x = generate_solution(p, split); - std::cout << "Graph created."; - // Print the result (for debugging purposes) - thrust::host_vector h_rows = d_rows; - thrust::host_vector h_cols = d_cols; - thrust::host_vector h_vals = d_vals; - thrust::host_vector h_x = d_x; - int print_size = 2*n; - std::cout << "Rows: "; - for (int i = 0; i < print_size; ++i) { - std::cout << h_rows[i] << " "; - } - std::cout << std::endl; - - std::cout << "Cols: "; - for (int i = 0; i < print_size; ++i) { - std::cout << h_cols[i] << " "; - } - std::cout << std::endl; - - std::cout << "Vals: "; - for (int i = 0; i < print_size; ++i) { - std::cout << h_vals[i] << " "; - } - std::cout << std::endl; - - std::cout << "Solution: "; - for (int i = 0; i < n; ++i) { - std::cout << h_x[i] << " "; - } - std::cout << std::endl; - - cusparseHandle_t handle; - cublasHandle_t cublasHandle; - cusparseSpMatDescr_t graph_csr, Q; - - // Initialize cuSPARSE - cublasCreate(&cublasHandle); - CHECK_CUSPARSE(cusparseCreate(&handle)); - int nnz = d_vals.size(); - thrust::device_vector d_csrOffsets(n + 1); - cusparseXcoo2csr(handle, - thrust::raw_pointer_cast(d_rows.data()), - nnz, - n, - thrust::raw_pointer_cast(d_csrOffsets.data()), - CUSPARSE_INDEX_BASE_ZERO); - - CHECK_CUSPARSE(cusparseCreateCsr(&graph_csr, - n, - n, - nnz, - thrust::raw_pointer_cast(d_csrOffsets.data()), - thrust::raw_pointer_cast(d_cols.data()), // column indices - thrust::raw_pointer_cast(d_vals.data()), // values - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); - - thrust::device_vector d_QVals(nnz); - CHECK_CUSPARSE(cusparseCreateCsr(&Q, - n, - n, - nnz, - thrust::raw_pointer_cast(d_csrOffsets.data()), - thrust::raw_pointer_cast(d_cols.data()), // column indices - thrust::raw_pointer_cast(d_QVals.data()), // values - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); - // Now matDescr can be used in further cuSPARSE operations - - int* newRowsPtr; - int* newCols; - float* newVals; - csr_data extended_sparse_data; - - CHECK_CUDA(cudaMalloc((void**)&newRowsPtr, (n + 1) * sizeof(int))); - CHECK_CUDA(cudaMalloc((void**)&newCols, (nnz + n) * sizeof(int))); - CHECK_CUDA(cudaMalloc((void**)&newVals, (nnz + n) * sizeof(float))); - - 
extended_sparse_data.rowPointer = newRowsPtr; - extended_sparse_data.cols = newCols; - extended_sparse_data.vals = newVals; - - - std::cout << "Sparse matrix created successfully!" << std::endl; - graph_to_qubo(handle, graph_csr, Q, extended_sparse_data); - /*convert_sparse_to_dense_and_display(handle, graph_csr, n); */ - //convert_sparse_to_dense_and_display(handle, Q, n); - - float planted_energy = qubo_eng(cublasHandle, handle, Q, thrust::raw_pointer_cast(d_x.data())); - - std::cout << "Qubo energy of planted solution: " << planted_energy << std::endl; - - /*float energy = calculate_qubo_energy(cublasHandle, handle, n, Q, d_x); - std::cout << "Qubo energy: " << energy << std::endl;*/ - brute_force_solutions(cublasHandle, handle, n, Q); - - // Clean up - CHECK_CUSPARSE(cusparseDestroySpMat(graph_csr)); - CHECK_CUSPARSE(cusparseDestroySpMat(Q)); - CHECK_CUSPARSE(cusparseDestroy(handle)); - - return 0; -}; +// +// +//int main() { +// int n = 12; +// int seed = 14; +// float density = 0.5; +// std::mt19937 rng(seed); +// +// std::vector p = generate_initial_permutation(rng, n); +// +// int split = estimate_split(density, n); // Example split, can be computed as needed +// std::cout << "Split: " << split << std::endl; +// +// thrust::device_vector d_rows; +// thrust::device_vector d_cols; +// thrust::device_vector d_vals; +// +// create_graph_sparse(n, split, p, d_rows, d_cols, d_vals); +// thrust::device_vector d_x = generate_solution(p, split); +// std::cout << "Graph created."; +// // Print the result (for debugging purposes) +// thrust::host_vector h_rows = d_rows; +// thrust::host_vector h_cols = d_cols; +// thrust::host_vector h_vals = d_vals; +// thrust::host_vector h_x = d_x; +// int print_size = 2*n; +// std::cout << "Rows: "; +// for (int i = 0; i < print_size; ++i) { +// std::cout << h_rows[i] << " "; +// } +// std::cout << std::endl; +// +// std::cout << "Cols: "; +// for (int i = 0; i < print_size; ++i) { +// std::cout << h_cols[i] << " "; +// } +// std::cout << std::endl; +// +// std::cout << "Vals: "; +// for (int i = 0; i < print_size; ++i) { +// std::cout << h_vals[i] << " "; +// } +// std::cout << std::endl; +// +// std::cout << "Solution: "; +// for (int i = 0; i < n; ++i) { +// std::cout << h_x[i] << " "; +// } +// std::cout << std::endl; +// +// cusparseHandle_t handle; +// cublasHandle_t cublasHandle; +// cusparseSpMatDescr_t graph_csr, Q; +// +// // Initialize cuSPARSE +// cublasCreate(&cublasHandle); +// CHECK_CUSPARSE(cusparseCreate(&handle)); +// int nnz = d_vals.size(); +// thrust::device_vector d_csrOffsets(n + 1); +// cusparseXcoo2csr(handle, +// thrust::raw_pointer_cast(d_rows.data()), +// nnz, +// n, +// thrust::raw_pointer_cast(d_csrOffsets.data()), +// CUSPARSE_INDEX_BASE_ZERO); +// +// CHECK_CUSPARSE(cusparseCreateCsr(&graph_csr, +// n, +// n, +// nnz, +// thrust::raw_pointer_cast(d_csrOffsets.data()), +// thrust::raw_pointer_cast(d_cols.data()), // column indices +// thrust::raw_pointer_cast(d_vals.data()), // values +// CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, +// CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); +// +// thrust::device_vector d_QVals(nnz); +// CHECK_CUSPARSE(cusparseCreateCsr(&Q, +// n, +// n, +// nnz, +// thrust::raw_pointer_cast(d_csrOffsets.data()), +// thrust::raw_pointer_cast(d_cols.data()), // column indices +// thrust::raw_pointer_cast(d_QVals.data()), // values +// CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, +// CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); +// // Now matDescr can be used in further cuSPARSE operations +// +// int* newRowsPtr; +// int* 
newCols; +// float* newVals; +// csr_data extended_sparse_data; +// +// CHECK_CUDA(cudaMalloc((void**)&newRowsPtr, (n + 1) * sizeof(int))); +// CHECK_CUDA(cudaMalloc((void**)&newCols, (nnz + n) * sizeof(int))); +// CHECK_CUDA(cudaMalloc((void**)&newVals, (nnz + n) * sizeof(float))); +// +// extended_sparse_data.rowPointer = newRowsPtr; +// extended_sparse_data.cols = newCols; +// extended_sparse_data.vals = newVals; +// +// +// std::cout << "Sparse matrix created successfully!" << std::endl; +// graph_to_qubo(handle, graph_csr, Q, extended_sparse_data); +// /*convert_sparse_to_dense_and_display(handle, graph_csr, n); */ +// //convert_sparse_to_dense_and_display(handle, Q, n); +// +// float planted_energy = qubo_eng(cublasHandle, handle, Q, thrust::raw_pointer_cast(d_x.data())); +// +// std::cout << "Qubo energy of planted solution: " << planted_energy << std::endl; +// +// /*float energy = calculate_qubo_energy(cublasHandle, handle, n, Q, d_x); +// std::cout << "Qubo energy: " << energy << std::endl;*/ +// brute_force_solutions(cublasHandle, handle, n, Q); +// +// // Clean up +// CHECK_CUSPARSE(cusparseDestroySpMat(graph_csr)); +// CHECK_CUSPARSE(cusparseDestroySpMat(Q)); +// CHECK_CUSPARSE(cusparseDestroy(handle)); +// +// return 0; +//}; diff --git a/CudaMaxCutPlanted/main.cu b/CudaMaxCutPlanted/main.cu new file mode 100644 index 0000000..cef2f1c --- /dev/null +++ b/CudaMaxCutPlanted/main.cu @@ -0,0 +1,636 @@ +//#include +//#include +#include "CudaSparseMatrix.hpp" +//#include +#include +#include +//#include +//#include +#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +#include +#include +//#include +//#include +//#include +#include "indicators.hpp" +#ifdef __INTELLISENSE__ +#include "intellisense_cuda_intrinsics.h" +#endif + + + +void scale_csr_matrix(cusparseHandle_t handle, + float alpha, + cusparseSpMatDescr_t& input, + cusparseSpMatDescr_t& result) { + // Extract the dimensions and the number of non-zero elements + int64_t n, nnz; + int* d_csrOffsets; + int* d_cols; + float* d_vals; + + int64_t n_r, nnz_r; + int* d_csrOffsets_r; + int* d_cols_r; + float* d_vals_r; + + cusparseIndexType_t csrRowOffsetsType; + cusparseIndexType_t csrColIndType; + cusparseIndexBase_t idxBase; + cudaDataType valueTyp; + + cusparseCsrGet(input, &n, &n, &nnz, (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp); + cusparseCsrGet(result, &n_r, &n_r, &nnz_r, (void**)&d_csrOffsets_r, (void**)&d_cols_r, (void**)&d_vals_r, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp); + + // Set scaling factors + const float beta = 0.0f; + size_t bufferSize = 0; + + cusparseMatDescr_t input_desc; + cusparseCreateMatDescr(&input_desc); + + // Create matrix descriptor for the result matrix C + cusparseMatDescr_t result_desc; + cusparseCreateMatDescr(&result_desc); + + // Get buffer size for the operation + cusparseScsrgeam2_bufferSizeExt(handle, + n, n, + &alpha, input_desc, nnz, + d_vals, + d_csrOffsets, + d_cols, + &beta, input_desc, nnz, + d_vals, + d_csrOffsets, + d_cols, + result_desc, + d_vals_r, + d_csrOffsets_r, + d_cols_r, + &bufferSize); + + void* dBuffer; + cudaMalloc(&dBuffer, bufferSize); + + // Perform the scaling operation + cusparseScsrgeam2(handle, + n, n, + &alpha, input_desc, nnz, + d_vals, + d_csrOffsets, + d_cols, + &beta, input_desc, nnz, + d_vals, + d_csrOffsets, + d_cols, + result_desc, + d_vals_r, + d_csrOffsets_r, + d_cols_r, + dBuffer); + // Clean up + cudaFree(dBuffer); + 
cusparseDestroyMatDescr(input_desc);
+    cusparseDestroyMatDescr(result_desc);
+}
+
+// Generate a random permutation of {0, ..., n-1} on the host and copy it to the device
+int* generate_initial_permutation(std::mt19937& rng, int n) {
+    std::vector<int> permutation(n);
+    for (int i = 0; i < n; ++i) {
+        permutation[i] = i;
+    }
+    std::shuffle(permutation.begin(), permutation.end(), rng);
+
+    int* p;
+    CHECK_CUDA(cudaMalloc((void**)&p, n * sizeof(int)));
+    CHECK_CUDA(cudaMemcpy(p, permutation.data(), n * sizeof(int), cudaMemcpyHostToDevice));
+
+    return p;
+}
+
+
+void create_graph_sparse(int n, int nnz, int split, const int* p, int* I, int* J, float* V) {
+
+    curandState* states;
+    // One curand state per nonzero; note sizeof(curandState), not sizeof(states),
+    // which would only be the size of the pointer.
+    CHECK_CUDA(cudaMalloc((void**)&states, nnz * sizeof(curandState)));
+    int gridSize = (nnz + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+    create_random_matrix<<<gridSize, BLOCK_SIZE>>>(n, nnz, split, p, I, J, V, states);
+
+    CHECK_CUDA(cudaDeviceSynchronize());
+
+    // Wrap raw device pointers in thrust device pointers for sorting
+    thrust::device_ptr<int> dev_I(I);
+    thrust::device_ptr<int> dev_J(J);
+    thrust::device_ptr<float> dev_V(V);
+
+    // First, sort by the secondary key (J) using stable sort
+    thrust::stable_sort_by_key(dev_J, dev_J + nnz, thrust::make_zip_iterator(thrust::make_tuple(dev_I, dev_V)));
+
+    // Then, sort by the primary key (I) using stable sort to maintain the order of the secondary key
+    thrust::stable_sort_by_key(dev_I, dev_I + nnz, thrust::make_zip_iterator(thrust::make_tuple(dev_J, dev_V)));
+
+    // Debug dump of the generated COO triplets
+    int* h_rows = new int[nnz];
+    int* h_cols = new int[nnz];
+    float* h_vals = new float[nnz];
+
+    CHECK_CUDA(cudaMemcpy(h_rows, I, nnz * sizeof(int), cudaMemcpyDeviceToHost));
+    CHECK_CUDA(cudaMemcpy(h_cols, J, nnz * sizeof(int), cudaMemcpyDeviceToHost));
+    CHECK_CUDA(cudaMemcpy(h_vals, V, nnz * sizeof(float), cudaMemcpyDeviceToHost));
+
+    for (int i = 0; i < nnz; ++i) {
+        std::cout << "I: " << h_rows[i] << " J: " << h_cols[i] << " V: " << h_vals[i] << std::endl;
+    }
+
+    CHECK_CUDA(cudaFree(states));
+    delete[] h_rows;
+    delete[] h_cols;
+    delete[] h_vals;
+
+    //thrust::host_vector<int> h_rows, h_cols;
+    //thrust::host_vector<float> h_vals;
+
+    //thrust::default_random_engine rng;
+    //thrust::random::uniform_real_distribution<float> dist(0.01f, 1.0f);
+
+    //std::vector<std::tuple<int, int, float>> combined;
+
+    //for (int i = 0; i < split; ++i) {
+    //    std::cout << "P_" << i << ":" << p[i] << std::endl;
+    //    for (int j = split; j < n; ++j) {
+    //        std::cout << "I" << ": " << i << " ";
+    //        std::cout << "J" << ":" << j << std::endl;
+    //        float rnd_val = dist(rng);
+    //        combined.push_back(std::make_tuple(p[i], p[j], rnd_val));
+    //        //combined.push_back(std::make_tuple(p[j], p[i], rnd_val));
+    //    }
+    //}
+
+    //std::sort(combined.begin(), combined.end(), [](const auto& a, const auto& b) {
+    //    if (std::get<0>(a) == std::get<0>(b)) {
+    //        return std::get<1>(a) < std::get<1>(b);
+    //    }
+    //    return std::get<0>(a) < std::get<0>(b);
+    //    });
+
+    //for (size_t i = 0; i < combined.size(); ++i) {
+    //    h_rows.push_back(std::get<0>(combined[i]));
+    //    h_cols.push_back(std::get<1>(combined[i]));
+    //    h_vals.push_back(std::get<2>(combined[i]));
+    //}
+
+    //d_rows = h_rows;
+    //d_cols = h_cols;
+    //d_vals = h_vals;
+}
+
+char* generate_solution(const int* p, int split, int n) {
+    char* x;
+    CHECK_CUDA(cudaMalloc((void**)&x, n * sizeof(char)));
+    // Zero-initialize: every vertex starts outside the planted set
+    CHECK_CUDA(cudaMemset(x, 0, n));
+    int gridSize = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    set_true_elements<<<gridSize, BLOCK_SIZE>>>(split, p, x);
+    CHECK_CUDA(cudaDeviceSynchronize());
+
+    return x;
+}
+
+void graph_to_qubo(CudaSparseMatrix& Q) {
+    Q.multiply(-1.0f);
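+    // Build the QUBO in place: the adjacency was negated above, and the per-row
+    // sums are written onto the diagonal below -- the usual Max-Cut-to-QUBO
+    // construction, so that the planted cut should minimize x^T Q x. This assumes
+    // CudaSparseMatrix::sum(0) returns the row sums as a device array that
+    // fill_diagonal() consumes (project API inferred from usage here; note the
+    // old path, kept commented below, also applied a -0.25 scale).
+    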
float* row_sum = Q.sum(0); + Q.fill_diagonal(row_sum); + /*scale_csr_matrix(handle, -1.0f, graph, Q); + thrust::device_vector sum = sum_rows_csr_matrix(handle, graph); + fill_diagonal(handle, Q, sum, extended_pointers); + scale_csr_matrix(handle, -0.25f, Q, Q); + + for (int i = 0; i < sum.size(); i++) { + std::cout << sum[i] << std::endl; + }*/ +} + +// +//float qubo_eng(cublasHandle_t cublasHandle, +// cusparseHandle_t handle, +// const cusparseSpMatDescr_t& Q, +// float* sol_vector) { +// int64_t rows, cols, nnz; +// int* d_csrOffsets, * d_cols; +// float* d_vals; +// int* dA_csrOffsets, * dA_columns; +// float* dA_values; +// +// cusparseIndexType_t csrRowOffsetsType; +// cusparseIndexType_t csrColIndType; +// cusparseIndexBase_t idxBase; +// cudaDataType valueTyp; +// +// CHECK_CUSPARSE(cusparseSpMatGetSize(Q, &rows, &cols, &nnz)); +// +// CHECK_CUDA(cudaMalloc((void**)&d_csrOffsets, (rows + 1) * sizeof(int))); +// CHECK_CUDA(cudaMalloc((void**)&d_cols, nnz * sizeof(int))); +// CHECK_CUDA(cudaMalloc((void**)&d_vals, nnz * sizeof(float))); +// +// CHECK_CUSPARSE(cusparseCsrGet(Q, &rows, &cols, &nnz, +// (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, +// &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp)); +// // Host problem definition +// const int A_num_rows = rows; +// const int A_num_cols = cols; +// const int A_nnz = nnz; +// //int h_test[11]; +// //int* h_test = (int*)calloc(rows+1, sizeof(int)); +// float alpha = 1.0f; +// float beta = 0.0f; +// //CHECK_CUDA(cudaMemcpy(h_test, d_csrOffsets, (rows + 1) * sizeof(int), cudaMemcpyDeviceToHost)); +// ////cudaMemcpy(h_test, d_csrOffsets, (A_num_rows + 1) * sizeof(int), cudaMemcpyDeviceToHost); +// ////CHECK_CUDA(cudaMemcpy(hCsrRowOffsets_toverify, dCsrRowOffsets_toverify, (rows + 1) * sizeof(int), cudaMemcpyDeviceToHost)); +// //for (int i = 0; i < A_num_rows+1; i++) { +// // std::cout << h_test[i] << std::endl; +// //} +// //-------------------------------------------------------------------------- +// // Device memory management +// float* dX, * dY; +// CHECK_CUDA(cudaMalloc((void**)&dA_csrOffsets, (A_num_rows + 1) * sizeof(int))); +// CHECK_CUDA(cudaMalloc((void**)&dA_columns, A_nnz * sizeof(int))); +// CHECK_CUDA(cudaMalloc((void**)&dA_values, A_nnz * sizeof(float))); +// CHECK_CUDA(cudaMalloc((void**)&dX, A_num_cols * sizeof(float))); +// CHECK_CUDA(cudaMalloc((void**)&dY, A_num_rows * sizeof(float))); +// +// CHECK_CUDA(cudaMemcpy(dA_csrOffsets, d_csrOffsets, (A_num_rows + 1) * sizeof(int), cudaMemcpyDeviceToDevice)); +// CHECK_CUDA(cudaMemcpy(dA_columns, d_cols, A_nnz * sizeof(int), cudaMemcpyDeviceToDevice)); +// CHECK_CUDA(cudaMemcpy(dA_values, d_vals, A_nnz * sizeof(float), cudaMemcpyDeviceToDevice)); +// CHECK_CUDA(cudaMemcpy(dX, sol_vector, A_num_cols * sizeof(float), cudaMemcpyDeviceToDevice)); +// cusparseSpMatDescr_t matA; +// cusparseDnVecDescr_t vecX, vecY; +// void* dBuffer = NULL; +// size_t bufferSize = 0; +// CHECK_CUSPARSE(cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_nnz, +// dA_csrOffsets, dA_columns, dA_values, +// CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, +// CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); +// // Create dense vector X +// CHECK_CUSPARSE(cusparseCreateDnVec(&vecX, A_num_cols, dX, CUDA_R_32F)); +// // Create dense vector y +// CHECK_CUSPARSE(cusparseCreateDnVec(&vecY, A_num_rows, dY, CUDA_R_32F)); +// // allocate an external buffer if needed +// CHECK_CUSPARSE(cusparseSpMV_bufferSize( +// handle, CUSPARSE_OPERATION_NON_TRANSPOSE, +// &alpha, matA, vecX, &beta, vecY, CUDA_R_32F, +// 
CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)); +// CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize)); +// +// // execute SpMV +// CHECK_CUSPARSE(cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, +// &alpha, matA, vecX, &beta, vecY, CUDA_R_32F, +// CUSPARSE_SPMV_ALG_DEFAULT, dBuffer)); +// +// // destroy matrix/vector descriptors +// CHECK_CUSPARSE(cusparseDestroySpMat(matA)); +// CHECK_CUSPARSE(cusparseDestroyDnVec(vecX)); +// CHECK_CUSPARSE(cusparseDestroyDnVec(vecY)); +// +// /*CHECK_CUDA(cudaMemcpy(hY, dY, A_num_rows * sizeof(float), cudaMemcpyDeviceToHost)); +// std::cout << "Result of Q*x: " << std::endl; +// for (int i = 0; i < A_num_rows; i++) { +// std::cout << hY[i] << std::endl; +// }*/ +// +// float result; +// CHECK_CUBLAS(cublasSdot(cublasHandle, A_num_rows, dX, 1, dY, 1, &result)); +// //std::cout << "Energy: " << result << std::endl; +// return result; +//} + +//float calculate_qubo_energy(cublasHandle_t cublasHandle, +// cusparseHandle_t cusparseHandle, +// int n, +// const cusparseSpMatDescr_t& Q, +// thrust::device_vector& x) { +// float alpha = 1.0f; +// float beta = 0.0f; +// size_t bufferSize = 0; +// void* dBuffer = nullptr; +// +// // Create dense vector descriptors +// cusparseDnVecDescr_t vecX, vecY; +// float* Qx; +// float* Qx_ptr; +// //float* hY = (float*)calloc(n, sizeof(float)); +// float* in_x; +// +// /* for (int i = 0; i < n; i++) { +// hY[i] = 0.0f; +// }*/ +// +// CHECK_CUDA(cudaMalloc((void**)&Qx, n * sizeof(float))); +// CHECK_CUDA(cudaMalloc((void**)&Qx_ptr, n * sizeof(float))); +// CHECK_CUDA(cudaMalloc((void**)&in_x, n * sizeof(float))); +// +// CHECK_CUDA(cudaMemcpy(in_x, thrust::raw_pointer_cast(x.data()), n * sizeof(float), cudaMemcpyDeviceToDevice)); +// //CHECK_CUDA(cudaMemcpy(Qx, hY, n * sizeof(float), cudaMemcpyHostToDevice)); +// +// CHECK_CUSPARSE(cusparseCreateDnVec(&vecX, n, in_x, CUDA_R_32F)); +// CHECK_CUSPARSE(cusparseCreateDnVec(&vecY, n, Qx, CUDA_R_32F)); +// +// // Allocate buffer +// CHECK_CUSPARSE(cusparseSpMV_bufferSize(cusparseHandle, +// CUSPARSE_OPERATION_NON_TRANSPOSE, +// &alpha, +// Q, +// vecX, +// &beta, +// vecY, +// CUDA_R_32F, +// CUSPARSE_SPMV_ALG_DEFAULT, +// &bufferSize)); +// CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize)); +// +// // Perform Q * x +// CHECK_CUSPARSE(cusparseSpMV(cusparseHandle, +// CUSPARSE_OPERATION_NON_TRANSPOSE, +// &alpha, +// Q, +// vecX, +// &beta, +// vecY, +// CUDA_R_32F, +// CUSPARSE_SPMV_ALG_DEFAULT, +// dBuffer)); +// +// // Extract the raw pointers to the data in the dense vector descriptors +// //float* Qx_ptr; +// //CHECK_CUSPARSE(cusparseDnVecGetValues(vecY, (void**)&Qx_ptr)); +// // Ensure the computation is complete before accessing the result +// CHECK_CUDA(cudaDeviceSynchronize()); +// +// /*CHECK_CUDA(cudaMemcpy(hY, Qx, n * sizeof(float), cudaMemcpyDeviceToHost)); +// for (int i = 0; i < n; i++) { +// std::cout << hY[i] << std::endl; +// }*/ +// +// // Compute the dot product x^T * (Q * x) using cuBLAS +// float result; +// CHECK_CUBLAS(cublasSdot(cublasHandle, n, in_x, 1, Qx, 1, &result)); +// +// // Clean up +// CHECK_CUSPARSE(cusparseDestroyDnVec(vecX)); +// CHECK_CUSPARSE(cusparseDestroyDnVec(vecY)); +// CHECK_CUDA(cudaFree(dBuffer)); +// CHECK_CUDA(cudaFree(Qx)); +// CHECK_CUDA(cudaFree(in_x)); +// +// return result; +//} +// +//void brute_force_solutions(cublasHandle_t cublasHandle, +// cusparseHandle_t handle, +// int n, +// const cusparseSpMatDescr_t& Q) { +// indicators::show_console_cursor(false); +// int num_solutions = 1 << n; // 2^n +// thrust::device_vector 
<float> d_solutions(num_solutions * n);
+//    /*
+//    for (int i = 0; i < num_solutions; ++i) {
+//        for (int j = 0; j < n; ++j) {
+//            float val = static_cast<float>((i & (1 << j)) != 0);
+//            d_solutions[i * n + j] = val;
+//        }
+//    }*/
+//
+//    //convert_sparse_to_dense_and_display(handle, Q, n);
+//    // Allocate memory for energies
+//    thrust::device_vector<float> d_energies(num_solutions);
+//    thrust::device_vector<float> sol_x(n);
+//
+//    indicators::ProgressBar bar{
+//        indicators::option::BarWidth{40},
+//        indicators::option::Start{"["},
+//        indicators::option::Fill{"="},
+//        indicators::option::Lead{">"},
+//        indicators::option::Remainder{" "},
+//        indicators::option::End{" ]"},
+//        indicators::option::ForegroundColor{indicators::Color::white},
+//        indicators::option::FontStyles{
+//            std::vector<indicators::FontStyle>{indicators::FontStyle::bold}},
+//        indicators::option::MaxProgress{num_solutions}
+//    };
+//    int smallest_ind = -1;
+//    float smallest_eng = 1000;
+//
+//    for (int i = 0; i < num_solutions; ++i) {
+//        for (int j = 0; j < n; ++j) {
+//            float val = static_cast<float>((i & (1 << j)) != 0);
+//            sol_x[j] = val;
+//            d_solutions[i * n + j] = val;
+//        }
+//        float eng = qubo_eng(cublasHandle, handle, Q, thrust::raw_pointer_cast(sol_x.data()));
+//        //float eng = calculate_qubo_energy(cublasHandle, handle, n, Q, sol_x);
+//        d_energies[i] = eng;
+//        if (eng <= smallest_eng) {
+//            smallest_ind = i;
+//            smallest_eng = eng;
+//        }
+//        //std::cout << "Energy i: " << eng << std::endl;
+//        bar.set_option(indicators::option::PostfixText{ std::to_string(i) + "/" + std::to_string(num_solutions) });
+//        bar.tick();
+//    }
+//    std::cout << "Tested all solutions, now sorting." << std::endl;
+//    // Find the minimum energy
+//    auto min_energy_iter = thrust::min_element(d_energies.begin(), d_energies.end());
+//    float min_energy = *min_energy_iter;
+//    int min_index = min_energy_iter - d_energies.begin();
+//
+//    // Copy the best solution to host
+//    thrust::host_vector<float> h_best_solution(n);
+//    thrust::copy(d_solutions.begin() + smallest_ind * n, d_solutions.begin() + (smallest_ind + 1) * n, h_best_solution.begin());
+//
+//    // Print the result
+//    std::cout << "Best energy my: " << smallest_eng << " and id: " << smallest_ind << std::endl;
+//    std::cout << "Best energy: " << min_energy << " and id: " << min_index << std::endl;
+//    std::cout << "Best solution: ";
+//    for (float bit : h_best_solution) {
+//        std::cout << bit << " ";
+//    }
+//    std::cout << std::endl;
+//    convert_sparse_to_dense_and_display(handle, Q, n);
+//}
+
+int estimate_split(float density, int n) {
+    if (density > 0.5f) {
+        std::cout << "Error: density cannot exceed 0.5!"
<< std::endl;
+        exit(EXIT_FAILURE); // fail fast: sqrt(inside) below would be NaN for density > 0.5
+    }
+
+    // The planted bipartition (s, n - s) contributes s * (n - s) edges; setting
+    // s * (n - s) = density * n^2 / 2 and solving the quadratic for s gives
+    // s = n * (1 + sqrt(1 - 2 * density)) / 2, where 1 - 2 * density == 2 * sparsity - 1.
+    float sparsity = 1.0f - density;
+    float inside = 2.0f * sparsity - 1.0f;
+    float smaller_set_size = 0.5f * n * (1.0f + sqrt(inside));
+    return static_cast<int>(smaller_set_size) - 1;
+}
+
+
+int main() {
+    int n = 12;
+    int seed = 14;
+    float density = 0.5f;
+    int* I, * J;
+    float* V;
+    std::mt19937 rng(seed);
+
+    int* p = generate_initial_permutation(rng, n);
+
+    int split = estimate_split(density, n); // split size derived from the target density
+    int nnz = split * (n - split);
+    std::cout << "Split: " << split << " nnz: " << nnz << std::endl;
+
+    CHECK_CUDA(cudaMalloc((void**)&I, nnz * sizeof(int)));
+    CHECK_CUDA(cudaMalloc((void**)&J, nnz * sizeof(int)));
+    CHECK_CUDA(cudaMalloc((void**)&V, nnz * sizeof(float)));
+
+    create_graph_sparse(n, nnz, split, p, I, J, V);
+
+    CudaSparseMatrix graph(I, J, V, n, nnz, SparseType::COO, MemoryType::Device);
+    graph.display();
+
+    char* x = generate_solution(p, split, n);
+
+    char* h_x = new char[n];
+
+    CHECK_CUDA(cudaMemcpy(h_x, x, n * sizeof(char), cudaMemcpyDeviceToHost));
+
+    for (int i = 0; i < n; i++) {
+        std::cout << "X_" << i << " " << static_cast<int>(h_x[i]) << std::endl;
+    }
+
+    CudaSparseMatrix Q(graph);
+    graph.clear();
+
+    graph_to_qubo(Q);
+
+    Q.display();
+
+    CHECK_CUDA(cudaFree(I));
+    CHECK_CUDA(cudaFree(J));
+    CHECK_CUDA(cudaFree(V));
+    CHECK_CUDA(cudaFree(p));
+    CHECK_CUDA(cudaFree(x));
+
+    delete[] h_x;
+
+    //std::cout << "Graph created.";
+    //// Print the result (for debugging purposes)
+    //thrust::host_vector<int> h_rows = d_rows;
+    //thrust::host_vector<int> h_cols = d_cols;
+    //thrust::host_vector<float> h_vals = d_vals;
+    //thrust::host_vector<float> h_x = d_x;
+    //int print_size = 2 * n;
+    //std::cout << "Rows: ";
+    //for (int i = 0; i < print_size; ++i) {
+    //    std::cout << h_rows[i] << " ";
+    //}
+    //std::cout << std::endl;
+
+    //std::cout << "Cols: ";
+    //for (int i = 0; i < print_size; ++i) {
+    //    std::cout << h_cols[i] << " ";
+    //}
+    //std::cout << std::endl;
+
+    //std::cout << "Vals: ";
+    //for (int i = 0; i < print_size; ++i) {
+    //    std::cout << h_vals[i] << " ";
+    //}
+    //std::cout << std::endl;
+
+    //std::cout << "Solution: ";
+    //for (int i = 0; i < n; ++i) {
+    //    std::cout << h_x[i] << " ";
+    //}
+    //std::cout << std::endl;
+
+    //cusparseHandle_t handle;
+    //cublasHandle_t cublasHandle;
+    //cusparseSpMatDescr_t graph_csr, Q;
+
+    //// Initialize cuSPARSE
+    //cublasCreate(&cublasHandle);
+    //CHECK_CUSPARSE(cusparseCreate(&handle));
+    //int nnz = d_vals.size();
+    //thrust::device_vector<int> d_csrOffsets(n + 1);
+    //cusparseXcoo2csr(handle,
+    //    thrust::raw_pointer_cast(d_rows.data()),
+    //    nnz,
+    //    n,
+    //    thrust::raw_pointer_cast(d_csrOffsets.data()),
+    //    CUSPARSE_INDEX_BASE_ZERO);
+
+    //CHECK_CUSPARSE(cusparseCreateCsr(&graph_csr,
+    //    n,
+    //    n,
+    //    nnz,
+    //    thrust::raw_pointer_cast(d_csrOffsets.data()),
+    //    thrust::raw_pointer_cast(d_cols.data()), // column indices
+    //    thrust::raw_pointer_cast(d_vals.data()), // values
+    //    CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+    //    CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
+
+    //thrust::device_vector<float> d_QVals(nnz);
+    //CHECK_CUSPARSE(cusparseCreateCsr(&Q,
+    //    n,
+    //    n,
+    //    nnz,
+    //    thrust::raw_pointer_cast(d_csrOffsets.data()),
+    //    thrust::raw_pointer_cast(d_cols.data()), // column indices
+    //    thrust::raw_pointer_cast(d_QVals.data()), // values
+    //    CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+    //    CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
+    //// Now matDescr can be used in further cuSPARSE operations
+
+    //int* newRowsPtr;
+    //int* newCols;
+    //float* newVals;
+    //csr_data extended_sparse_data;
+
+    //CHECK_CUDA(cudaMalloc((void**)&newRowsPtr, (n + 1) * sizeof(int)));
+    //CHECK_CUDA(cudaMalloc((void**)&newCols, (nnz + n) * sizeof(int)));
+    //CHECK_CUDA(cudaMalloc((void**)&newVals, (nnz + n) * sizeof(float)));
+
+    //extended_sparse_data.rowPointer = newRowsPtr;
+    //extended_sparse_data.cols = newCols;
+    //extended_sparse_data.vals = newVals;
+
+
+    //std::cout << "Sparse matrix created successfully!" << std::endl;
+    //graph_to_qubo(handle, graph_csr, Q, extended_sparse_data);
+    ///*convert_sparse_to_dense_and_display(handle, graph_csr, n); */
+    ////convert_sparse_to_dense_and_display(handle, Q, n);
+
+    //float planted_energy = qubo_eng(cublasHandle, handle, Q, thrust::raw_pointer_cast(d_x.data()));
+
+    //std::cout << "Qubo energy of planted solution: " << planted_energy << std::endl;
+
+    ///*float energy = calculate_qubo_energy(cublasHandle, handle, n, Q, d_x);
+    //std::cout << "Qubo energy: " << energy << std::endl;*/
+    //brute_force_solutions(cublasHandle, handle, n, Q);
+
+    //// Clean up
+    //CHECK_CUSPARSE(cusparseDestroySpMat(graph_csr));
+    //CHECK_CUSPARSE(cusparseDestroySpMat(Q));
+    //CHECK_CUSPARSE(cusparseDestroy(handle));
+
+    return 0;
+}
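
For reference: after this patch, main.cu builds Q and prints it, but the x^T Q x energy check remains commented out. Below is a minimal sketch of how that check could be reintroduced on top of raw CSR device arrays. It is a sketch under assumptions, not part of the patch: qubo_energy, its parameter names, and the caller-provided scratch vector d_y are illustrative, while the cuSPARSE generic SpMV + cuBLAS dot pattern mirrors the commented-out qubo_eng above (the planted solution, stored as char, would need to be converted to float first).

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusparse.h>

// Sketch: energy = x^T (Q x) for an n x n CSR matrix already on the device.
// d_y is caller-provided scratch of length n; error checking elided for brevity.
float qubo_energy(cublasHandle_t blas, cusparseHandle_t sparse,
                  int n, int nnz,
                  int* d_offsets, int* d_cols, float* d_vals,
                  const float* d_x, float* d_y) {
    float alpha = 1.0f, beta = 0.0f, energy = 0.0f;

    cusparseSpMatDescr_t Q;
    cusparseDnVecDescr_t x, y;
    cusparseCreateCsr(&Q, n, n, nnz, d_offsets, d_cols, d_vals,
                      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                      CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
    cusparseCreateDnVec(&x, n, (void*)d_x, CUDA_R_32F);
    cusparseCreateDnVec(&y, n, d_y, CUDA_R_32F);

    // y = Q * x via the generic SpMV API (workspace size queried first)
    size_t bufferSize = 0;
    void* dBuffer = nullptr;
    cusparseSpMV_bufferSize(sparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
                            &alpha, Q, x, &beta, y, CUDA_R_32F,
                            CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize);
    cudaMalloc(&dBuffer, bufferSize);
    cusparseSpMV(sparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
                 &alpha, Q, x, &beta, y, CUDA_R_32F,
                 CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);

    // energy = x . y; with a host result pointer, cublasSdot blocks until done
    cublasSdot(blas, n, d_x, 1, d_y, 1, &energy);

    cudaFree(dBuffer);
    cusparseDestroySpMat(Q);
    cusparseDestroyDnVec(x);
    cusparseDestroyDnVec(y);
    return energy;
}

Called with Q's CSR arrays and the planted solution cast to float, this would reproduce the "Qubo energy of planted solution" check from the commented-out main, without rebuilding descriptors and device copies on every call the way the old qubo_eng did.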