diff --git a/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj b/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj
index abe6491..3838150 100644
--- a/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj
+++ b/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj
@@ -92,13 +92,14 @@
-
+
+
-
+
diff --git a/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj.filters b/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj.filters
index fea601e..5034b1c 100644
--- a/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj.filters
+++ b/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj.filters
@@ -5,9 +5,10 @@
Objects
-
+
Objects
+
@@ -17,7 +18,7 @@
Objects
-
+
Objects
diff --git a/CudaMaxCutPlanted/CudaSparseMatrix.cu b/CudaMaxCutPlanted/CudaSparseMatrix.cu
index 5c1dd41..deefa06 100644
--- a/CudaMaxCutPlanted/CudaSparseMatrix.cu
+++ b/CudaMaxCutPlanted/CudaSparseMatrix.cu
@@ -5,30 +5,9 @@
#include
#include
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
-#include "SparseMatrixSumKernel.cuh"
-
-#define CHECK_CUDA(call) \
-{ \
- cudaError_t err = call; \
- if (err != cudaSuccess) { \
- std::cerr << "CUDA error in file " << __FILE__ \
- << " at line " << __LINE__ << ": " \
- << cudaGetErrorString(err) << std::endl; \
- exit(EXIT_FAILURE); \
- } \
-}
-
-#define CHECK_CUSPARSE(call) \
-{ \
- cusparseStatus_t err = call; \
- if (err != CUSPARSE_STATUS_SUCCESS) { \
- std::cerr << "CUSPARSE error in file " << __FILE__ \
- << " at line " << __LINE__ << ": " \
- << cusparseGetErrorString(err) << std::endl; \
- exit(EXIT_FAILURE); \
- } \
-}
const char* cusparseGetErrorString(cusparseStatus_t status) {
switch (status) {
@@ -62,7 +41,7 @@ CudaSparseMatrix::CudaSparseMatrix(int* I, int* J, float* V, int n, int nnz, Spa
}
CudaSparseMatrix::CudaSparseMatrix(const CudaSparseMatrix& other)
- : n_(other.n_), nnz_(other.nnz_) {
+ : n_(other.n_), nnz_(other.nnz_), matDescr_(nullptr) {
cusparseHandle_t& cusparseHandle_ = CusparseHandle::getInstance();
CHECK_CUDA(cudaMalloc((void**)&d_csrOffsets_, (n_ + 1) * sizeof(int)));
CHECK_CUDA(cudaMalloc((void**)&d_cols_, nnz_ * sizeof(int)));
@@ -79,21 +58,156 @@ CudaSparseMatrix::CudaSparseMatrix(const CudaSparseMatrix& other)
}
CudaSparseMatrix::~CudaSparseMatrix() {
- CHECK_CUSPARSE(cusparseDestroySpMat(matDescr_));
- CHECK_CUDA(cudaFree(d_csrOffsets_));
- CHECK_CUDA(cudaFree(d_cols_));
- CHECK_CUDA(cudaFree(d_vals_));
+ clear();
}
void CudaSparseMatrix::updateData(const int* rows, const int* cols, const float* vals, int new_nnz, SparseType sparseType, MemoryType memType) {
nnz_ = new_nnz;
CHECK_CUDA(cudaFree(d_cols_));
CHECK_CUDA(cudaFree(d_vals_));
+ CHECK_CUSPARSE(cusparseDestroySpMat(matDescr_));
CHECK_CUDA(cudaMalloc((void**)&d_cols_, nnz_ * sizeof(int)));
CHECK_CUDA(cudaMalloc((void**)&d_vals_, nnz_ * sizeof(float)));
allocateAndCopy(rows, cols, vals, sparseType, memType);
+
+ CHECK_CUSPARSE(cusparseCreateCsr(&matDescr_, n_, n_, nnz_,
+ d_csrOffsets_, d_cols_, d_vals_,
+ csr_row_ind_type_, csr_col_ind_type_,
+ index_base_, valueType_));
+}
+
+bool* CudaSparseMatrix::zero_elements_in_vector(const float* input_vect, int& zero_sum, int n) {
+ cusparseHandle_t& handle = CusparseHandle::getInstance();
+ bool* zero_elements_vect;
+ int* d_zero_sum;
+
+ zero_sum = 0;
+
+ // Allocate memory on the device
+ CHECK_CUDA(cudaMalloc((void**)&zero_elements_vect, n * sizeof(bool)));
+ CHECK_CUDA(cudaMalloc((void**)&d_zero_sum, sizeof(int)));
+
+ // Initialize memory
+ CHECK_CUDA(cudaMemset(zero_elements_vect, 0, n * sizeof(bool)));
+ CHECK_CUDA(cudaMemset(d_zero_sum, 0, sizeof(int)));
+
+ int gridSize = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
+ zero_elements<<<gridSize, BLOCK_SIZE>>>(input_vect, zero_elements_vect, d_zero_sum, n);
+ CHECK_CUDA(cudaDeviceSynchronize());
+
+ // Copy the result back to host
+ CHECK_CUDA(cudaMemcpy(&zero_sum, d_zero_sum, sizeof(int), cudaMemcpyDeviceToHost));
+
+ // Free device memory
+ CHECK_CUDA(cudaFree(d_zero_sum));
+
+ return zero_elements_vect;
+}
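Reviewer aside (not part of the patch): a hedged sketch of the memset-then-atomicAdd counting idiom this helper wraps, launched directly against the zero_elements kernel; buffer names here are illustrative, not from the patch.

    int n = 4;
    float h_v[4] = { 0.0f, 1.5f, 0.0f, 2.0f };
    float* d_v;  bool* d_mask;  int* d_cnt;  int h_cnt = 0;
    CHECK_CUDA(cudaMalloc((void**)&d_v, n * sizeof(float)));
    CHECK_CUDA(cudaMemcpy(d_v, h_v, n * sizeof(float), cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMalloc((void**)&d_mask, n * sizeof(bool)));
    CHECK_CUDA(cudaMemset(d_mask, 0, n * sizeof(bool)));
    CHECK_CUDA(cudaMalloc((void**)&d_cnt, sizeof(int)));
    CHECK_CUDA(cudaMemset(d_cnt, 0, sizeof(int)));
    zero_elements<<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(d_v, d_mask, d_cnt, n);
    CHECK_CUDA(cudaMemcpy(&h_cnt, d_cnt, sizeof(int), cudaMemcpyDeviceToHost));  // h_cnt == 2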
+
+void CudaSparseMatrix::fill_diagonal(const float* diagonal_vect)
+{
+ int nnz_sum = 0;
+ int zero_sum = 0;
+ int diag_nnz = n_;
+ int resize_n = diag_nnz;
+ // TODO: Flip to non zero vector and copy only the non zero elements to new vector
+ bool* zeros_in_diag_sum = zero_elements_in_vector(diagonal_vect, zero_sum, n_);
+ bool* non_zero = non_zero_diagonal(nnz_sum);
+
+ bool* h_non_zero = new bool[n_];
+ CHECK_CUDA(cudaMemcpy(h_non_zero, zeros_in_diag_sum, n_ * sizeof(bool), cudaMemcpyDeviceToHost));
+
+ for (int i = 0; i < n_; i++)
+ {
+ std::cout << "zero_in_diag_" << i << ": " << h_non_zero[i] << std::endl;
+ }
+ delete[] h_non_zero;
+
+ diag_nnz -= zero_sum;
+ resize_n = diag_nnz - nnz_sum;
+
+ int* original_I, * new_I, * new_J;
+ float* new_V;
+
+ CHECK_CUDA(cudaMalloc((void**)&original_I, (nnz_) * sizeof(int)));
+ CHECK_CUDA(cudaMalloc((void**)&new_I, (nnz_ + resize_n) * sizeof(int)));
+ CHECK_CUDA(cudaMalloc((void**)&new_J, (nnz_ + resize_n) * sizeof(int)));
+ CHECK_CUDA(cudaMalloc((void**)&new_V, (nnz_ + resize_n) * sizeof(float)));
+
+ csrTorows(d_csrOffsets_, original_I, n_, nnz_, SparseType::CSR);
+
+ CHECK_CUDA(cudaMemcpy(new_I, original_I, nnz_ * sizeof(int), cudaMemcpyDeviceToDevice));
+ CHECK_CUDA(cudaMemcpy(new_J, d_cols_, nnz_ * sizeof(int), cudaMemcpyDeviceToDevice));
+ CHECK_CUDA(cudaMemcpy(new_V, d_vals_, nnz_ * sizeof(float), cudaMemcpyDeviceToDevice));
+
+ int gridSize = ((nnz_+n_)+BLOCK_SIZE - 1) / BLOCK_SIZE;
+ set_diagonal<<<gridSize, BLOCK_SIZE>>>(new_I, new_J, new_V, non_zero, diagonal_vect, nnz_, resize_n);
+ CHECK_CUDA(cudaDeviceSynchronize());
+ thrust::device_ptr<int> dev_I(new_I);
+ thrust::device_ptr<int> dev_J(new_J);
+ thrust::device_ptr<float> dev_V(new_V);
+
+ // First, sort by the secondary key (J) using stable sort
+ thrust::stable_sort_by_key(dev_J, dev_J + (nnz_ + resize_n), thrust::make_zip_iterator(thrust::make_tuple(dev_I, dev_V)));
+
+ // Then, sort by the primary key (I) using stable sort to maintain the order of the secondary key
+ thrust::stable_sort_by_key(dev_I, dev_I + (nnz_ + resize_n), thrust::make_zip_iterator(thrust::make_tuple(dev_J, dev_V)));
+
+ float* h_new_V = new float[nnz_ + resize_n];
+ CHECK_CUDA(cudaMemcpy(h_new_V, new_V, (nnz_ + resize_n) * sizeof(float), cudaMemcpyDeviceToHost));
+
+ for (int i = 0; i < nnz_ + resize_n; i++)
+ {
+ std::cout << "Copied V_" << i << ": " << h_new_V[i] << std::endl;
+ }
+ delete[] h_new_V;
+
+ updateData(new_I, new_J, new_V, nnz_ + resize_n, SparseType::COO, MemoryType::Device);
+
+ CHECK_CUDA(cudaFree(original_I));
+ CHECK_CUDA(cudaFree(new_I));
+ CHECK_CUDA(cudaFree(new_J));
+ CHECK_CUDA(cudaFree(new_V));
+ CHECK_CUDA(cudaFree(zeros_in_diag_sum));
+ CHECK_CUDA(cudaFree(non_zero));
+
+ std::cout << "Total non zero elements on the diagonal: " << nnz_sum << std::endl;
+}
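Reviewer aside (not part of the patch): the two stable sorts above realize a lexicographic (row, col) ordering of COO triplets; sorting by the secondary key first, then stably by the primary key, is the standard thrust idiom. A self-contained sketch of the same technique:

    #include <thrust/device_vector.h>
    #include <thrust/sort.h>
    #include <thrust/iterator/zip_iterator.h>

    void sort_coo(thrust::device_vector<int>& I,
                  thrust::device_vector<int>& J,
                  thrust::device_vector<float>& V) {
        // Pass 1: order by the secondary key (columns); carry (I, V) along.
        thrust::stable_sort_by_key(J.begin(), J.end(),
            thrust::make_zip_iterator(thrust::make_tuple(I.begin(), V.begin())));
        // Pass 2: stable sort by the primary key (rows) keeps columns ordered within a row.
        thrust::stable_sort_by_key(I.begin(), I.end(),
            thrust::make_zip_iterator(thrust::make_tuple(J.begin(), V.begin())));
    }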
+
+bool* CudaSparseMatrix::non_zero_diagonal(int& nnz_diag_sum)
+{
+ cusparseHandle_t& handle = CusparseHandle::getInstance();
+ bool* nnz_diag;
+ int* I;
+ int* d_nnz_diag_sum;
+ nnz_diag_sum = 0;
+
+ // Allocate memory on the device
+ CHECK_CUDA(cudaMalloc((void**)&d_nnz_diag_sum, sizeof(int)));
+
+ // Initialize memory
+ CHECK_CUDA(cudaMemset(d_nnz_diag_sum, 0, sizeof(int)));
+
+ CHECK_CUDA(cudaMalloc((void**)&I, nnz_ * sizeof(int)));
+ CHECK_CUSPARSE(cusparseXcsr2coo(handle,
+ d_csrOffsets_,
+ nnz_,
+ n_,
+ I,
+ CUSPARSE_INDEX_BASE_ZERO));
+
+
+ // The mask is consumed per nonzero element by set_diagonal, so size it by nnz_
+ CHECK_CUDA(cudaMalloc((void**)&nnz_diag, nnz_ * sizeof(bool)));
+ CHECK_CUDA(cudaMemset(nnz_diag, 0, nnz_ * sizeof(bool)));
+ int gridSize = (nnz_ + BLOCK_SIZE - 1) / BLOCK_SIZE;
+ non_zero_elements<<<gridSize, BLOCK_SIZE>>>(I, d_cols_, nnz_diag, d_nnz_diag_sum, nnz_);
+ CHECK_CUDA(cudaDeviceSynchronize());
+
+ CHECK_CUDA(cudaMemcpy(&nnz_diag_sum, d_nnz_diag_sum, sizeof(int), cudaMemcpyDeviceToHost));
+
+ // Free device memory
+ CHECK_CUDA(cudaFree(d_nnz_diag_sum));
+ CHECK_CUDA(cudaFree(I));
+
+ return nnz_diag;
}
CudaDenseVector CudaSparseMatrix::dot(const float* d_vec)
@@ -122,6 +236,63 @@ CudaDenseVector CudaSparseMatrix::dot(const float* d_vec)
return result_vector;
}
+void CudaSparseMatrix::multiply(float value)
+{
+ cusparseHandle_t& handle = CusparseHandle::getInstance();
+ // Set scaling factors
+ const float beta = 0.0f;
+ size_t bufferSize = 0;
+
+ cusparseMatDescr_t input_desc;
+ CHECK_CUSPARSE(cusparseCreateMatDescr(&input_desc));
+
+ // Create matrix descriptor for the result matrix C
+ cusparseMatDescr_t result_desc;
+ CHECK_CUSPARSE(cusparseCreateMatDescr(&result_desc));
+
+ // Get buffer size for the operation
+ CHECK_CUSPARSE(cusparseScsrgeam2_bufferSizeExt(handle,
+ n_, n_,
+ &value, input_desc, nnz_,
+ d_vals_,
+ d_csrOffsets_,
+ d_cols_,
+ &beta, input_desc, nnz_,
+ d_vals_,
+ d_csrOffsets_,
+ d_cols_,
+ result_desc,
+ d_vals_,
+ d_csrOffsets_,
+ d_cols_,
+ &bufferSize));
+
+ void* dBuffer;
+ CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize));
+
+ // Perform the scaling operation (computes value * A + 0 * A, written back in place)
+ CHECK_CUSPARSE(cusparseScsrgeam2(handle,
+ n_, n_,
+ &value, input_desc, nnz_,
+ d_vals_,
+ d_csrOffsets_,
+ d_cols_,
+ &beta, input_desc, nnz_,
+ d_vals_,
+ d_csrOffsets_,
+ d_cols_,
+ result_desc,
+ d_vals_,
+ d_csrOffsets_,
+ d_cols_,
+ dBuffer));
+ // Clean up
+ CHECK_CUDA(cudaFree(dBuffer));
+ CHECK_CUSPARSE(cusparseDestroyMatDescr(input_desc));
+ CHECK_CUSPARSE(cusparseDestroyMatDescr(result_desc));
+
+}
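Reviewer aside (not part of the patch): since scaling a CSR matrix only touches its values array, the csrgeam2 route above could also be expressed as a plain element-wise transform. A hedged alternative sketch, with a hypothetical scale_values helper:

    #include <thrust/device_ptr.h>
    #include <thrust/transform.h>

    struct ScaleBy {
        float a;
        __host__ __device__ float operator()(float x) const { return a * x; }
    };

    // Multiply all nnz stored values by `value` in place; offsets and columns are untouched.
    void scale_values(float* d_vals, int nnz, float value) {
        thrust::device_ptr<float> v(d_vals);
        thrust::transform(v, v + nnz, v, ScaleBy{ value });
    }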
+
void CudaSparseMatrix::allocateAndCopy(const int* rows, const int* cols, const float* vals, SparseType sparseType, MemoryType memType) {
cudaMemcpyKind copyType;
@@ -167,6 +338,19 @@ void CudaSparseMatrix::rowsToCsr(const int* d_rows, int* d_csr_offset, int n, in
}
+void CudaSparseMatrix::csrTorows(const int* d_csr_offset, int* d_rows, int n, int nnz, SparseType sparseType)
+{
+ if (sparseType == SparseType::CSR) {
+ cusparseHandle_t& handle = CusparseHandle::getInstance();
+ CHECK_CUSPARSE(cusparseXcsr2coo(handle,
+ d_csr_offset,
+ nnz,
+ n,
+ d_rows,
+ CUSPARSE_INDEX_BASE_ZERO));
+ }
+}
+
float* CudaSparseMatrix::sumRows()
{
cusparseHandle_t& handle = CusparseHandle::getInstance();
@@ -179,7 +363,8 @@ float* CudaSparseMatrix::sumRows()
CHECK_CUDA(cudaMalloc((void**)&cscRowInd, nnz_ * sizeof(int)));
CHECK_CUDA(cudaMalloc((void**)&cscVal, nnz_ * sizeof(float)));
- CHECK_CUDA(cudaMemset((void*)diagonal, 0, nnz_ * sizeof(float)));
+ CHECK_CUDA(cudaMalloc((void**)&diagonal, n_ * sizeof(float)));
+ CHECK_CUDA(cudaMemset((void*)diagonal, 0, n_ * sizeof(float)));
cusparseCsr2cscEx2_bufferSize(handle, n_, n_, nnz_,
@@ -201,10 +386,9 @@ float* CudaSparseMatrix::sumRows()
// Clean up
cudaFree(dBuffer);
- int blockSize = 512;
- int gridSize = (nnz_ + blockSize - 1) / blockSize;
+ int gridSize = (nnz_ + BLOCK_SIZE - 1) / BLOCK_SIZE;
- sum_axis<<<gridSize, blockSize>>>(nnz_, cscRowInd, cscVal, diagonal);
+ sum_axis<<<gridSize, BLOCK_SIZE>>>(nnz_, cscRowInd, cscVal, diagonal);
CHECK_CUDA(cudaFree(cscColPtr));
CHECK_CUDA(cudaFree(cscRowInd));
@@ -218,7 +402,8 @@ float* CudaSparseMatrix::sumCols()
cusparseHandle_t& handle = CusparseHandle::getInstance();
float* diagonal;
- CHECK_CUDA(cudaMemset((void*)diagonal, 0, nnz_ * sizeof(float)));
+ CHECK_CUDA(cudaMalloc((void**)&diagonal, n_ * sizeof(float)));
+ CHECK_CUDA(cudaMemset((void*)diagonal, 0, n_ * sizeof(float)));
int blockSize = 512;
int gridSize = (nnz_ + blockSize - 1) / blockSize;
@@ -230,6 +415,14 @@ float* CudaSparseMatrix::sumCols()
float* CudaSparseMatrix::sum(int axis)
{
+ if (axis == 0) {
+ return sumRows();
+ }
+
+ if (axis == 1) {
+ return sumCols();
+ }
+
return nullptr;
}
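Reviewer aside (not part of the patch): a hedged usage sketch of sum(); judging from the allocations in sumRows/sumCols, the returned pointer is a freshly allocated device array of size() floats that the caller must free. A is an existing CudaSparseMatrix, names illustrative:

    float* d_sums = A.sum(0);   // axis 0 -> sumRows(), axis 1 -> sumCols(), else nullptr
    std::vector<float> h_sums(A.size());
    CHECK_CUDA(cudaMemcpy(h_sums.data(), d_sums, A.size() * sizeof(float), cudaMemcpyDeviceToHost));
    CHECK_CUDA(cudaFree(d_sums));   // caller owns the result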
@@ -274,7 +467,7 @@ void CudaSparseMatrix::display()
std::cout << "Dense matrix:" << std::endl;
for (int i = 0; i < n_; ++i) {
for (int j = 0; j < n_; ++j) {
- std::cout << h_denseMat[i * n_ + j] << " ";
+ std::cout << std::setw(7) << h_denseMat[i * n_ + j] << " ";
}
std::cout << std::endl;
}
@@ -295,6 +488,33 @@ int CudaSparseMatrix::size() const
return n_;
}
+void CudaSparseMatrix::clear()
+{
+ if (d_csrOffsets_) {
+ CHECK_CUDA(cudaFree(d_csrOffsets_));
+ std::cout << "d_csrOffsets_ cleared" << std::endl;
+ d_csrOffsets_ = nullptr;
+ }
+ if (d_cols_) {
+ CHECK_CUDA(cudaFree(d_cols_));
+ std::cout << "d_cols_ cleared" << std::endl;
+ d_cols_ = nullptr;
+ }
+ if (d_vals_) {
+ CHECK_CUDA(cudaFree(d_vals_));
+ std::cout << "d_vals_ cleared" << std::endl;
+ d_vals_ = nullptr;
+ }
+ if (matDescr_) {
+ CHECK_CUSPARSE(cusparseDestroySpMat(matDescr_));
+ std::cout << "matDescr_ cleared" << std::endl;
+ matDescr_ = nullptr;
+ }
+
+ nnz_ = 0;
+ n_ = 0;
+}
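Reviewer aside (not part of the patch): because every branch in clear() null-checks and then resets its pointer, the method is idempotent, which is what lets the destructor delegate to it safely. A hedged illustration (A is a hypothetical instance):

    A.clear();   // releases device buffers and the cuSPARSE descriptor
    A.clear();   // safe no-op: every pointer was reset to nullptr above
    // ~CudaSparseMatrix() calls clear() once more; still a no-op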
+
CudaDenseVector::CudaDenseVector(int size, const float* V, MemoryType memType): size_(size)
{
cudaMemcpyKind copyType = memType == MemoryType::Host ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
diff --git a/CudaMaxCutPlanted/CudaSparseMatrix.hpp b/CudaMaxCutPlanted/CudaSparseMatrix.hpp
index de410b1..047c64e 100644
--- a/CudaMaxCutPlanted/CudaSparseMatrix.hpp
+++ b/CudaMaxCutPlanted/CudaSparseMatrix.hpp
@@ -4,6 +4,30 @@
#include
#include
#include
+#include "Kernels.cuh"
+
+// Forward declaration (defined in CudaSparseMatrix.cu); used by CHECK_CUSPARSE below
+const char* cusparseGetErrorString(cusparseStatus_t status);
+
+#define CHECK_CUDA(call) \
+{ \
+ cudaError_t err = call; \
+ if (err != cudaSuccess) { \
+ std::cerr << "CUDA error in file " << __FILE__ \
+ << " at line " << __LINE__ << ": " \
+ << cudaGetErrorString(err) << std::endl; \
+ exit(EXIT_FAILURE); \
+ } \
+}
+
+#define CHECK_CUSPARSE(call) \
+{ \
+ cusparseStatus_t err = call; \
+ if (err != CUSPARSE_STATUS_SUCCESS) { \
+ std::cerr << "CUSPARSE error in file " << __FILE__ \
+ << " at line " << __LINE__ << ": " \
+ << cusparseGetErrorString(err) << std::endl; \
+ exit(EXIT_FAILURE); \
+ } \
+}
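Reviewer aside (not part of the patch): these macros expand to a bare block, which mis-parses after an unbraced if/else; the usual hardening is a do { ... } while (0) wrapper. A sketch of that hypothetical variant:

    #define CHECK_CUDA_SAFE(call)                                  \
    do {                                                           \
        cudaError_t err_ = (call);                                 \
        if (err_ != cudaSuccess) {                                 \
            std::cerr << "CUDA error in file " << __FILE__         \
                      << " at line " << __LINE__ << ": "           \
                      << cudaGetErrorString(err_) << std::endl;    \
            exit(EXIT_FAILURE);                                    \
        }                                                          \
    } while (0)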
enum class MemoryType {
Host,
@@ -59,11 +83,15 @@ class CudaSparseMatrix {
~CudaSparseMatrix();
void updateData(const int* rows, const int* cols, const float* vals, int new_nnz, SparseType sparseType, MemoryType memType);
+ void fill_diagonal(const float* diagonal_vect);
+ bool* non_zero_diagonal(int& nnz_diag_sum);
CudaDenseVector dot(const float* d_vec);
+ void multiply(float value);
float* sum(int axis);
void display();
int getNnz() const;
int size() const;
+ void clear();
// Other useful methods can be added here
@@ -80,6 +108,8 @@ class CudaSparseMatrix {
cudaDataType valueType_ = CUDA_R_32F;
void allocateAndCopy(const int* rows, const int* cols, const float* vals, SparseType sparseType, MemoryType memType);
void rowsToCsr(const int* d_rows, int* d_csr_offset, int n, int nnz, SparseType sparseType);
+ void csrTorows(const int* d_csr_offset, int* d_rows, int n, int nnz, SparseType sparseType);
+ bool* zero_elements_in_vector(const float* input_vect, int& zero_sum, int n);
float* sumRows();
float* sumCols();
diff --git a/CudaMaxCutPlanted/Kernels.cu b/CudaMaxCutPlanted/Kernels.cu
new file mode 100644
index 0000000..6e7ceae
--- /dev/null
+++ b/CudaMaxCutPlanted/Kernels.cu
@@ -0,0 +1,137 @@
+#include "Kernels.cuh"
+
+#define NUM_BANKS 16
+#define LOG_NUM_BANKS 4
+#define CONFLICT_FREE_OFFSET(n) (((n) >> LOG_NUM_BANKS) + ((n) >> (2 * LOG_NUM_BANKS)))
+
+/// <summary>
+/// This function sums either rows for a matrix in CSC format or columns for a matrix in CSR format
+/// </summary>
+/// <param name="nnz">Number of non zero elements</param>
+/// <param name="d_non_offset_axis_ind">Array with nnz elements that is NOT in compressed format</param>
+/// <param name="d_vals">Array with values</param>
+/// <param name="d_axis_sum">Output array that will contain output of summed rows/cols</param>
+/// <returns></returns>
+__global__ void sum_axis(int nnz, const int* d_non_offset_axis_ind, const float* d_vals, float* d_axis_sum) {
+ int element_ind = blockIdx.x * blockDim.x + threadIdx.x;
+ if (element_ind < nnz) {
+ int idx = d_non_offset_axis_ind[element_ind];
+ atomicAdd(&d_axis_sum[idx], d_vals[element_ind]); // add this element's value to its row/col bucket
+ }
+}
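Reviewer aside (not part of the patch): a hedged launch sketch for sum_axis over a CSC matrix, names illustrative; d_sums must be zero-initialised beforehand since the kernel accumulates with atomicAdd:

    int gridSize = (nnz + BLOCK_SIZE - 1) / BLOCK_SIZE;
    sum_axis<<<gridSize, BLOCK_SIZE>>>(nnz, d_cscRowInd, d_cscVal, d_sums);
    CHECK_CUDA(cudaDeviceSynchronize());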
+
+__global__ void create_random_matrix(int n, int nnz, int split, const int* p, int* d_rows, int* d_cols, float* d_vals, curandState* states) {
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ if (idx < nnz) {
+ curand_init(1234, idx, 0, &states[idx]);
+ int i = static_cast<int>(floorf(idx / (n - split)));
+ int j = static_cast<int>((idx % (n - split)) + split);
+ d_rows[idx] = p[i];
+ d_cols[idx] = p[j];
+ d_vals[idx] = curand_uniform(&states[idx]) * 0.99f + 0.01f;
+ }
+}
+
+__global__ void set_true_elements(int split, const int* p, char* x) {
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ if (idx < split) {
+ x[p[idx]] = 1;
+ }
+}
+
+__global__ void non_zero_elements(const int* I, const int* J, bool* non_zero_elements, int *nnz_sum, int nnz) {
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ if (idx < nnz) {
+ if (I[idx] == J[idx]) {
+ atomicAdd(nnz_sum, 1);
+ non_zero_elements[idx] = true; // assignment; '==' here was a no-op comparison
+ }
+
+ }
+}
+
+__global__ void zero_elements(const float *input_vect, bool* zero_elements_vect, int* zero_sum, int n) {
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ if (idx < n) {
+ if (fabsf(input_vect[idx]) < 1e-6) {
+ atomicAdd(zero_sum, 1);
+ zero_elements_vect[idx] = true;
+ }
+
+ }
+}
+
+
+__global__ void set_diagonal(int* I, int* J, float* V, bool* non_zero_elements, const float* diagonal, int initial_n, int resize_n) {
+ int offset = initial_n;
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ if (idx < initial_n) {
+ if (I[idx] == J[idx] && non_zero_elements[idx]) {
+ // Existing diagonal entry: take the value for this row, not for this element index
+ V[idx] = diagonal[I[idx]];
+ }
+ }
+ else if (idx >= initial_n && idx < (initial_n + resize_n)) {
+ int index = idx - offset;
+ V[idx] = diagonal[index];
+ I[idx] = index;
+ J[idx] = index;
+ }
+}
+
+__global__ void prescan(float* g_odata, float* g_idata, int n) {
+ extern __shared__ float temp[];
+
+ int thid = threadIdx.x;
+ int offset = 1;
+
+ // Load input into shared memory with padding to avoid bank conflicts
+ int ai = thid;
+ int bi = thid + (n / 2);
+ int bankOffsetA = CONFLICT_FREE_OFFSET(ai);
+ int bankOffsetB = CONFLICT_FREE_OFFSET(bi);
+
+ temp[ai + bankOffsetA] = g_idata[ai];
+ temp[bi + bankOffsetB] = g_idata[bi];
+
+ // Build sum in place up the tree
+ for (int d = n >> 1; d > 0; d >>= 1) {
+ __syncthreads();
+ if (thid < d) {
+ int ai = offset * (2 * thid + 1) - 1;
+ int bi = offset * (2 * thid + 2) - 1;
+ ai += CONFLICT_FREE_OFFSET(ai);
+ bi += CONFLICT_FREE_OFFSET(bi);
+ temp[bi] += temp[ai];
+ }
+ offset *= 2;
+ }
+
+ // Clear the last element
+ if (thid == 0) {
+ temp[n - 1 + CONFLICT_FREE_OFFSET(n - 1)] = 0;
+ }
+
+ // Traverse down tree & build scan
+ for (int d = 1; d < n; d *= 2) {
+ offset >>= 1;
+ __syncthreads();
+ if (thid < d) {
+ int ai = offset * (2 * thid + 1) - 1;
+ int bi = offset * (2 * thid + 2) - 1;
+ ai += CONFLICT_FREE_OFFSET(ai);
+ bi += CONFLICT_FREE_OFFSET(bi);
+ float t = temp[ai];
+ temp[ai] = temp[bi];
+ temp[bi] += t;
+ }
+ }
+ __syncthreads();
+
+ // Write results to device memory with proper offsets
+ g_odata[ai] = temp[ai + bankOffsetA];
+ g_odata[bi] = temp[bi + bankOffsetB];
+}
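Reviewer aside (not part of the patch): prescan is the classic single-block Blelloch exclusive scan (GPU Gems 3, ch. 39); it assumes n is a power of two processed by one block of n/2 threads, with shared memory sized for the bank-conflict padding. A hedged launch sketch (d_out/d_in are device arrays of n floats):

    int n = 1024;   // power of two, n/2 within the per-block thread limit
    size_t shmem = (n + (n >> LOG_NUM_BANKS) + (n >> (2 * LOG_NUM_BANKS))) * sizeof(float);
    prescan<<<1, n / 2, shmem>>>(d_out, d_in, n);
    CHECK_CUDA(cudaDeviceSynchronize());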
+
+
diff --git a/CudaMaxCutPlanted/Kernels.cuh b/CudaMaxCutPlanted/Kernels.cuh
new file mode 100644
index 0000000..856e5f4
--- /dev/null
+++ b/CudaMaxCutPlanted/Kernels.cuh
@@ -0,0 +1,25 @@
+#ifndef KERNELS_CUH
+#define KERNELS_CUH
+
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+#include <curand_kernel.h>
+#ifdef __INTELLISENSE__
+#include "intellisense_cuda_intrinsics.h"
+#endif
+#define BLOCK_SIZE 512
+
+
+// Kernel to sum all elements along axis
+__global__ void sum_axis(int nnz, const int* d_non_offset_axis_ind, const float* d_vals, float* d_axis_sum);
+// Fills sparse graph with random values
+__global__ void create_random_matrix(int n, int nnz, int split, const int* p, int* d_rows, int* d_cols, float* d_vals, curandState* states);
+//Sets true/1 in the zeros vector x at positions provided by p
+__global__ void set_true_elements(int split, const int* p, char* x);
+//Counts the non zero elements on the diagonal and returns a bool vector with true wherever the diagonal entry is non zero
+__global__ void non_zero_elements(const int* I, const int* J, bool* non_zero_elements, int* nnz_sum, int n);
+// Sets values on the diagonal, taking into account that the diagonal can already hold non zero values
+__global__ void set_diagonal(int* I, int* J, float* V, bool* non_zero_elements, const float* diagonal, int initial_n, int resize_n);
+// Counts zero elements of a float device vector and sets a bool mask marking them
+__global__ void zero_elements(const float* input_vect, bool* zero_elements_vect, int* zero_sum, int n);
+#endif // KERNELS_CUH
\ No newline at end of file
diff --git a/CudaMaxCutPlanted/SparseMatrixSumKernel.cu b/CudaMaxCutPlanted/SparseMatrixSumKernel.cu
deleted file mode 100644
index 22fa399..0000000
--- a/CudaMaxCutPlanted/SparseMatrixSumKernel.cu
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "SparseMatrixSumKernel.cuh"
-
-
-/// <summary>
-/// This function sums either rows for matrix in csc format or columns for matrix in csr format
-/// </summary>
-/// <param name="nnz">Number on non zero elements</param>
-/// <param name="d_non_offset_axis_ind">Array with nnz elements that is NOT in compressed format</param>
-/// <param name="d_vals">Array with values</param>
-/// <param name="d_axis_sum">Output array that will contain output of summed rows/cols</param>
-/// <returns></returns>
-__global__ void sum_axis(int nnz, const int* d_non_offset_axis_ind, const float* d_vals, float* d_axis_sum) {
- int element_ind = blockIdx.x * blockDim.x + threadIdx.x;
- if (element_ind < nnz) {
- int idx = d_non_offset_axis_ind[element_ind];
- atomicAdd(&d_axis_sum[idx], d_vals[idx]);
- }
-}
\ No newline at end of file
diff --git a/CudaMaxCutPlanted/SparseMatrixSumKernel.cuh b/CudaMaxCutPlanted/SparseMatrixSumKernel.cuh
deleted file mode 100644
index 42d4502..0000000
--- a/CudaMaxCutPlanted/SparseMatrixSumKernel.cuh
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef SPARSEMATRIXSUMKERNEL_CUH
-#define SPARSEMATRIXSUMKERNEL_CUH
-
-#include
-#ifdef __INTELLISENSE__
-#include "intellisense_cuda_intrinsics.h"
-#endif
-
-
-// Kernel to mark zero elements
-__global__ void sum_axis(int nnz, const int* d_non_offset_axis_ind, const float* d_vals, float* d_axis_sum);
-
-#endif // SPARSEMATRIXSUMKERNEL_CUH
\ No newline at end of file
diff --git a/CudaMaxCutPlanted/kernel.cu b/CudaMaxCutPlanted/kernel.cu
index 3e93ae4..6128890 100644
--- a/CudaMaxCutPlanted/kernel.cu
+++ b/CudaMaxCutPlanted/kernel.cu
@@ -1,429 +1,583 @@
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include "indicators.hpp"
-#ifdef __INTELLISENSE__
-#include "intellisense_cuda_intrinsics.h"
-#endif
-
-
-#define CHECK_CUSPARSE(call) \
-{ \
- cusparseStatus_t err = call; \
- if (err != CUSPARSE_STATUS_SUCCESS) { \
- std::cerr << "CUSPARSE error in file " << __FILE__ \
- << " at line " << __LINE__ << ": " \
- << cusparseGetErrorString(err) << std::endl; \
- exit(EXIT_FAILURE); \
- } \
-}
-
-#define CHECK_CUDA(call) \
-{ \
- cudaError_t err = call; \
- if (err != cudaSuccess) { \
- std::cerr << "CUDA error in file " << __FILE__ \
- << " at line " << __LINE__ << ": " \
- << cudaGetErrorString(err) << std::endl; \
- exit(EXIT_FAILURE); \
- } \
-}
-
-#define CHECK_CUBLAS(call) \
-{ \
- cublasStatus_t err = call; \
- if (err != CUBLAS_STATUS_SUCCESS) { \
- std::cerr << "CUBLAS error in file " << __FILE__ \
- << " at line " << __LINE__ << ": " \
- << cublasGetErrorString(err) << std::endl; \
- exit(EXIT_FAILURE); \
- } \
-}
-
-
-struct csr_data {
- int* rowPointer;
- int* cols;
- float* vals;
-};
-
-const char* cusparseGetErrorString(cusparseStatus_t status) {
- switch (status) {
- case CUSPARSE_STATUS_SUCCESS: return "CUSPARSE_STATUS_SUCCESS";
- case CUSPARSE_STATUS_NOT_INITIALIZED: return "CUSPARSE_STATUS_NOT_INITIALIZED";
- case CUSPARSE_STATUS_ALLOC_FAILED: return "CUSPARSE_STATUS_ALLOC_FAILED";
- case CUSPARSE_STATUS_INVALID_VALUE: return "CUSPARSE_STATUS_INVALID_VALUE";
- case CUSPARSE_STATUS_ARCH_MISMATCH: return "CUSPARSE_STATUS_ARCH_MISMATCH";
- case CUSPARSE_STATUS_MAPPING_ERROR: return "CUSPARSE_STATUS_MAPPING_ERROR";
- case CUSPARSE_STATUS_EXECUTION_FAILED: return "CUSPARSE_STATUS_EXECUTION_FAILED";
- case CUSPARSE_STATUS_INTERNAL_ERROR: return "CUSPARSE_STATUS_INTERNAL_ERROR";
- case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
- case CUSPARSE_STATUS_ZERO_PIVOT: return "CUSPARSE_STATUS_ZERO_PIVOT";
- default: return "UNKNOWN CUSPARSE STATUS";
- }
-}
-
-const char* cublasGetErrorString(cublasStatus_t status) {
- switch (status) {
- case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
- case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
- case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
- case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
- case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
- case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
- case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
- case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
- case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
- case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR";
- default: return "UNKNOWN CUBLAS STATUS";
- }
-}
-
-__global__ void cols_sum(int n, const int* d_csrOffsets, const float* d_vals, float* d_rowSums) {
- int row = blockIdx.x * blockDim.x + threadIdx.x;
- if (row < n) {
- int row_start = d_csrOffsets[row];
- int row_end = d_csrOffsets[row + 1];
- for (int j = row_start; j < row_end; j++) {
- atomicAdd(&d_rowSums[row], d_vals[j]);
- }
- }
-}
-
-void convert_sparse_to_dense_and_display(cusparseHandle_t handle, const cusparseSpMatDescr_t& matDescr, int n) {
- // Allocate memory for the dense matrix on the device
- float* d_denseMat;
- cudaMalloc((void**)&d_denseMat, n * n * sizeof(float));
-
- // Create a dense matrix descriptor
- cusparseDnMatDescr_t denseDescr;
- CHECK_CUSPARSE(cusparseCreateDnMat(&denseDescr,
- n, // number of rows
- n, // number of columns
- n, // leading dimension
- d_denseMat, // pointer to dense matrix data
- CUDA_R_32F, // data type
- CUSPARSE_ORDER_ROW)); // row-major order
-
- // Convert sparse matrix to dense matrix
- void* dBuffer = NULL;
- size_t bufferSize = 0;
- CHECK_CUSPARSE(cusparseSparseToDense_bufferSize(handle,
- matDescr,
- denseDescr,
- CUSPARSE_SPARSETODENSE_ALG_DEFAULT,
- &bufferSize));
-
- CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize));
-
- CHECK_CUSPARSE(cusparseSparseToDense(handle,
- matDescr,
- denseDescr,
- CUSPARSE_SPARSETODENSE_ALG_DEFAULT,
- dBuffer));
-
- // Copy the dense matrix from device to host
- std::vector<float> h_denseMat(n * n);
- CHECK_CUDA(cudaMemcpy(h_denseMat.data(), d_denseMat, n * n * sizeof(float), cudaMemcpyDeviceToHost));
-
- std::cout << std::fixed << std::setprecision(4); // Set precision to 2 decimal places
- std::cout << "Dense matrix:" << std::endl;
- for (int i = 0; i < n; ++i) {
- for (int j = 0; j < n; ++j) {
- std::cout << h_denseMat[i * n + j] << " ";
- }
- std::cout << std::endl;
- }
-
- // Clean up
- CHECK_CUDA(cudaFree(d_denseMat));
- CHECK_CUDA(cudaFree(dBuffer));
- CHECK_CUSPARSE(cusparseDestroyDnMat(denseDescr));
-}
-
-
-void fill_diagonal(cusparseHandle_t handle, cusparseSpMatDescr_t& input, thrust::device_vector<float> diag, csr_data& extended_pointers) {
- int64_t n, nnz;
- int* d_csrOffsets;
- int* d_cols;
- float* d_vals;
-
- cusparseIndexType_t csrRowOffsetsType;
- cusparseIndexType_t csrColIndType;
- cusparseIndexBase_t idxBase;
- cudaDataType valueTyp;
-
- CHECK_CUSPARSE(cusparseCsrGet(input, &n, &n, &nnz, (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp));
-
- thrust::device_vector<int> d_rows_tdv(nnz);
- thrust::device_vector<int> d_cols_tdv(d_cols, d_cols+nnz);
- thrust::device_vector<float> d_vals_tdv(d_vals, d_vals+nnz);
-
- CHECK_CUSPARSE(cusparseXcsr2coo(handle,
- d_csrOffsets,
- nnz,
- n,
- thrust::raw_pointer_cast(d_rows_tdv.data()),
- CUSPARSE_INDEX_BASE_ZERO));
-
- d_rows_tdv.resize(nnz + n);
- d_cols_tdv.resize(nnz + n);
- d_vals_tdv.resize(nnz + n);
-
- int nnz_n = nnz + n;
-
- thrust::device_vector<int> d_vec(n);
-
- // Fill the vector with values from 0 to n-1
- thrust::sequence(d_vec.begin(), d_vec.end());
-
- thrust::copy(d_vec.begin(), d_vec.end(), d_rows_tdv.begin() + nnz);
- thrust::copy(d_vec.begin(), d_vec.end(), d_cols_tdv.begin() + nnz);
- thrust::copy(diag.begin(), diag.end(), d_vals_tdv.begin() + nnz);
-
- /*thrust::copy(d_rows_tdv.begin(), d_rows_tdv.end(), );
- thrust::copy(d_cols_tdv.begin(), d_cols_tdv.end(), extended_pointers.cols);
- thrust::copy(d_vals_tdv.begin(), d_vals_tdv.end(), extended_pointers.vals);*/
-
-
- CHECK_CUSPARSE(cusparseXcsr2coo(handle,
- d_csrOffsets,
- nnz,
- n,
- thrust::raw_pointer_cast(d_rows_tdv.data()),
- CUSPARSE_INDEX_BASE_ZERO));
- thrust::device_vector<int> d_csrOffsets_o(n + 1);
-
- thrust::sort_by_key(thrust::make_zip_iterator(thrust::make_tuple(d_rows_tdv.begin(), d_cols_tdv.begin())),
- thrust::make_zip_iterator(thrust::make_tuple(d_rows_tdv.end(), d_cols_tdv.end())),
- d_vals_tdv.begin());
-
- CHECK_CUSPARSE(cusparseXcoo2csr(handle,
- thrust::raw_pointer_cast(d_rows_tdv.data()),
- nnz + n,
- n,
- thrust::raw_pointer_cast(d_csrOffsets_o.data()),
- CUSPARSE_INDEX_BASE_ZERO));
-
- CHECK_CUDA(cudaMemcpy(extended_pointers.rowPointer, thrust::raw_pointer_cast(d_csrOffsets_o.data()), (n + 1) * sizeof(int), cudaMemcpyDeviceToDevice));
- CHECK_CUDA(cudaMemcpy(extended_pointers.cols, thrust::raw_pointer_cast(d_cols_tdv.data()), nnz_n * sizeof(int), cudaMemcpyDeviceToDevice));
- CHECK_CUDA(cudaMemcpy(extended_pointers.vals, thrust::raw_pointer_cast(d_vals_tdv.data()), nnz_n * sizeof(float), cudaMemcpyDeviceToDevice));
-
- CHECK_CUSPARSE(cusparseCsrSetPointers(input,
- extended_pointers.rowPointer,
- extended_pointers.cols,
- extended_pointers.vals));
-
-}
-
-
-void scale_csr_matrix(cusparseHandle_t handle,
- float alpha,
- cusparseSpMatDescr_t& input,
- cusparseSpMatDescr_t& result) {
- // Extract the dimensions and the number of non-zero elements
- int64_t n, nnz;
- int* d_csrOffsets;
- int* d_cols;
- float* d_vals;
-
- int64_t n_r, nnz_r;
- int* d_csrOffsets_r;
- int* d_cols_r;
- float* d_vals_r;
-
- cusparseIndexType_t csrRowOffsetsType;
- cusparseIndexType_t csrColIndType;
- cusparseIndexBase_t idxBase;
- cudaDataType valueTyp;
-
- cusparseCsrGet(input, &n, &n, &nnz, (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp);
- cusparseCsrGet(result, &n_r, &n_r, &nnz_r, (void**)&d_csrOffsets_r, (void**)&d_cols_r, (void**)&d_vals_r, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp);
-
- // Set scaling factors
- const float beta = 0.0f;
- size_t bufferSize = 0;
-
- cusparseMatDescr_t input_desc;
- cusparseCreateMatDescr(&input_desc);
-
- // Create matrix descriptor for the result matrix C
- cusparseMatDescr_t result_desc;
- cusparseCreateMatDescr(&result_desc);
-
- // Get buffer size for the operation
- cusparseScsrgeam2_bufferSizeExt(handle,
- n, n,
- &alpha, input_desc, nnz,
- d_vals,
- d_csrOffsets,
- d_cols,
- &beta, input_desc, nnz,
- d_vals,
- d_csrOffsets,
- d_cols,
- result_desc,
- d_vals_r,
- d_csrOffsets_r,
- d_cols_r,
- &bufferSize);
-
- void* dBuffer;
- cudaMalloc(&dBuffer, bufferSize);
-
- // Perform the scaling operation
- cusparseScsrgeam2(handle,
- n, n,
- &alpha, input_desc, nnz,
- d_vals,
- d_csrOffsets,
- d_cols,
- &beta, input_desc, nnz,
- d_vals,
- d_csrOffsets,
- d_cols,
- result_desc,
- d_vals_r,
- d_csrOffsets_r,
- d_cols_r,
- dBuffer);
- // Clean up
- cudaFree(dBuffer);
- cusparseDestroyMatDescr(input_desc);
- cusparseDestroyMatDescr(result_desc);
-}
-
-thrust::device_vector<float> sum_rows_csr_matrix(cusparseHandle_t handle, cusparseSpMatDescr_t input) {
- // Extract CSR matrix information
- int64_t rows, cols, nnz;
- int* d_csrOffsets, * d_cols;
- float* d_vals;
-
- cusparseIndexType_t csrRowOffsetsType;
- cusparseIndexType_t csrColIndType;
- cusparseIndexBase_t idxBase;
- cudaDataType valueTyp;
-
- cusparseCsrGet(input, &rows, &cols, &nnz,
- (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals,
- &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp);
-
- // Allocate memory for the row sums
- thrust::device_vector<float> d_rowSums(rows);
-
- // Launch kernel to sum elements of each row using atomicAdd
- int blockSize = 256;
- int gridSize = (rows + blockSize - 1) / blockSize;
- cols_sum<<<gridSize, blockSize>>>(rows, d_csrOffsets, d_vals, thrust::raw_pointer_cast(d_rowSums.data()));
- cudaDeviceSynchronize();
-
- return d_rowSums;
-}
-
-int estimate_split(float density, int n) {
- if (density > 0.5f) {
- std::cout << "Error, density can not be bigger than 0.5!" << std::endl;
- }
-
- float sparsity = 1.0f - density;
- float inside = 2.0f * sparsity - 1.0f;
- float smaller_set_size = 0.5f * n * (1 + sqrt(inside));
- return static_cast<int>(smaller_set_size) - 1;
-}
-
-// Function to generate initial permutation
-std::vector<int> generate_initial_permutation(std::mt19937 & rng, int n) {
- std::vector<int> permutation(n);
- for (int i = 0; i < n; ++i) {
- permutation[i] = i;
- }
- std::shuffle(permutation.begin(), permutation.end(), rng);
- return permutation;
-}
-
-
-void create_graph_sparse(int n, int split, std::vector<int>& p, thrust::device_vector<int>& d_rows, thrust::device_vector<int>& d_cols, thrust::device_vector<float>& d_vals) {
- thrust::host_vector<int> h_rows, h_cols;
- thrust::host_vector<float> h_vals;
-
- thrust::default_random_engine rng;
- thrust::random::uniform_real_distribution<float> dist(0.01f, 1.0f);
-
- std::vector<std::tuple<int, int, float>> combined;
-
- for (int i = 0; i <= split; ++i) {
- for (int j = split + 1; j < n; ++j) {
- float rnd_val = dist(rng);
- combined.push_back(std::make_tuple(p[i], p[j], rnd_val));
- combined.push_back(std::make_tuple(p[j], p[i], rnd_val));
- }
- }
-
- std::sort(combined.begin(), combined.end(), [](const auto& a, const auto& b) {
- if (std::get<0>(a) == std::get<0>(b)) {
- return std::get<1>(a) < std::get<1>(b);
- }
- return std::get<0>(a) < std::get<0>(b);
- });
-
- for (size_t i = 0; i < combined.size(); ++i) {
- h_rows.push_back(std::get<0>(combined[i]));
- h_cols.push_back(std::get<1>(combined[i]));
- h_vals.push_back(std::get<2>(combined[i]));
- }
-
- d_rows = h_rows;
- d_cols = h_cols;
- d_vals = h_vals;
-}
-
-thrust::device_vector<float> generate_solution(const std::vector<int>& p, int split) {
- int size = p.size();
-
- // Initialize a device vector of bool type with false (equivalent to CUDA.zeros(Bool, size(p)))
- thrust::device_vector<float> x(size, 0);
-
-
- // Set the first split elements to true (equivalent to x[p[1:split]] .= 1)
- for (int i = 0; i <= split ; ++i) {
- x[p[i]] = 1;
- }
-
- return x;
-}
-
-
-void graph_to_qubo(cusparseHandle_t handle, cusparseSpMatDescr_t& graph, cusparseSpMatDescr_t& Q, csr_data& extended_pointers) {
- scale_csr_matrix(handle, -1.0f, graph, Q);
- thrust::device_vector<float> sum = sum_rows_csr_matrix(handle, graph);
- fill_diagonal(handle, Q, sum, extended_pointers);
- scale_csr_matrix(handle, -0.25f, Q, Q);
-
- for (int i = 0; i < sum.size(); i++) {
- std::cout << sum[i] << std::endl;
- }
-}
-
-//float calculate_qubo_energy(cublasHandle_t cublasHandle,
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include "indicators.hpp"
+//#ifdef __INTELLISENSE__
+//#include "intellisense_cuda_intrinsics.h"
+//#endif
+//
+//
+//#define CHECK_CUSPARSE(call) \
+//{ \
+// cusparseStatus_t err = call; \
+// if (err != CUSPARSE_STATUS_SUCCESS) { \
+// std::cerr << "CUSPARSE error in file " << __FILE__ \
+// << " at line " << __LINE__ << ": " \
+// << cusparseGetErrorString(err) << std::endl; \
+// exit(EXIT_FAILURE); \
+// } \
+//}
+//
+//#define CHECK_CUDA(call) \
+//{ \
+// cudaError_t err = call; \
+// if (err != cudaSuccess) { \
+// std::cerr << "CUDA error in file " << __FILE__ \
+// << " at line " << __LINE__ << ": " \
+// << cudaGetErrorString(err) << std::endl; \
+// exit(EXIT_FAILURE); \
+// } \
+//}
+//
+//#define CHECK_CUBLAS(call) \
+//{ \
+// cublasStatus_t err = call; \
+// if (err != CUBLAS_STATUS_SUCCESS) { \
+// std::cerr << "CUBLAS error in file " << __FILE__ \
+// << " at line " << __LINE__ << ": " \
+// << cublasGetErrorString(err) << std::endl; \
+// exit(EXIT_FAILURE); \
+// } \
+//}
+//
+//
+//struct csr_data {
+// int* rowPointer;
+// int* cols;
+// float* vals;
+//};
+//
+//const char* cusparseGetErrorString(cusparseStatus_t status) {
+// switch (status) {
+// case CUSPARSE_STATUS_SUCCESS: return "CUSPARSE_STATUS_SUCCESS";
+// case CUSPARSE_STATUS_NOT_INITIALIZED: return "CUSPARSE_STATUS_NOT_INITIALIZED";
+// case CUSPARSE_STATUS_ALLOC_FAILED: return "CUSPARSE_STATUS_ALLOC_FAILED";
+// case CUSPARSE_STATUS_INVALID_VALUE: return "CUSPARSE_STATUS_INVALID_VALUE";
+// case CUSPARSE_STATUS_ARCH_MISMATCH: return "CUSPARSE_STATUS_ARCH_MISMATCH";
+// case CUSPARSE_STATUS_MAPPING_ERROR: return "CUSPARSE_STATUS_MAPPING_ERROR";
+// case CUSPARSE_STATUS_EXECUTION_FAILED: return "CUSPARSE_STATUS_EXECUTION_FAILED";
+// case CUSPARSE_STATUS_INTERNAL_ERROR: return "CUSPARSE_STATUS_INTERNAL_ERROR";
+// case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+// case CUSPARSE_STATUS_ZERO_PIVOT: return "CUSPARSE_STATUS_ZERO_PIVOT";
+// default: return "UNKNOWN CUSPARSE STATUS";
+// }
+//}
+//
+//const char* cublasGetErrorString(cublasStatus_t status) {
+// switch (status) {
+// case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
+// case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
+// case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
+// case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
+// case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
+// case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
+// case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
+// case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
+// case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
+// case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR";
+// default: return "UNKNOWN CUBLAS STATUS";
+// }
+//}
+//
+//__global__ void cols_sum(int n, const int* d_csrOffsets, const float* d_vals, float* d_rowSums) {
+// int row = blockIdx.x * blockDim.x + threadIdx.x;
+// if (row < n) {
+// int row_start = d_csrOffsets[row];
+// int row_end = d_csrOffsets[row + 1];
+// for (int j = row_start; j < row_end; j++) {
+// atomicAdd(&d_rowSums[row], d_vals[j]);
+// }
+// }
+//}
+//
+//void convert_sparse_to_dense_and_display(cusparseHandle_t handle, const cusparseSpMatDescr_t& matDescr, int n) {
+// // Allocate memory for the dense matrix on the device
+// float* d_denseMat;
+// cudaMalloc((void**)&d_denseMat, n * n * sizeof(float));
+//
+// // Create a dense matrix descriptor
+// cusparseDnMatDescr_t denseDescr;
+// CHECK_CUSPARSE(cusparseCreateDnMat(&denseDescr,
+// n, // number of rows
+// n, // number of columns
+// n, // leading dimension
+// d_denseMat, // pointer to dense matrix data
+// CUDA_R_32F, // data type
+// CUSPARSE_ORDER_ROW)); // row-major order
+//
+// // Convert sparse matrix to dense matrix
+// void* dBuffer = NULL;
+// size_t bufferSize = 0;
+// CHECK_CUSPARSE(cusparseSparseToDense_bufferSize(handle,
+// matDescr,
+// denseDescr,
+// CUSPARSE_SPARSETODENSE_ALG_DEFAULT,
+// &bufferSize));
+//
+// CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize));
+//
+// CHECK_CUSPARSE(cusparseSparseToDense(handle,
+// matDescr,
+// denseDescr,
+// CUSPARSE_SPARSETODENSE_ALG_DEFAULT,
+// dBuffer));
+//
+// // Copy the dense matrix from device to host
+// std::vector<float> h_denseMat(n * n);
+// CHECK_CUDA(cudaMemcpy(h_denseMat.data(), d_denseMat, n * n * sizeof(float), cudaMemcpyDeviceToHost));
+//
+// std::cout << std::fixed << std::setprecision(4); // Set precision to 2 decimal places
+// std::cout << "Dense matrix:" << std::endl;
+// for (int i = 0; i < n; ++i) {
+// for (int j = 0; j < n; ++j) {
+// std::cout << h_denseMat[i * n + j] << " ";
+// }
+// std::cout << std::endl;
+// }
+//
+// // Clean up
+// CHECK_CUDA(cudaFree(d_denseMat));
+// CHECK_CUDA(cudaFree(dBuffer));
+// CHECK_CUSPARSE(cusparseDestroyDnMat(denseDescr));
+//}
+//
+//
+//void fill_diagonal(cusparseHandle_t handle, cusparseSpMatDescr_t& input, thrust::device_vector<float> diag, csr_data& extended_pointers) {
+// int64_t n, nnz;
+// int* d_csrOffsets;
+// int* d_cols;
+// float* d_vals;
+//
+// cusparseIndexType_t csrRowOffsetsType;
+// cusparseIndexType_t csrColIndType;
+// cusparseIndexBase_t idxBase;
+// cudaDataType valueTyp;
+//
+// CHECK_CUSPARSE(cusparseCsrGet(input, &n, &n, &nnz, (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp));
+//
+// thrust::device_vector<int> d_rows_tdv(nnz);
+// thrust::device_vector<int> d_cols_tdv(d_cols, d_cols+nnz);
+// thrust::device_vector<float> d_vals_tdv(d_vals, d_vals+nnz);
+//
+// CHECK_CUSPARSE(cusparseXcsr2coo(handle,
+// d_csrOffsets,
+// nnz,
+// n,
+// thrust::raw_pointer_cast(d_rows_tdv.data()),
+// CUSPARSE_INDEX_BASE_ZERO));
+//
+// d_rows_tdv.resize(nnz + n);
+// d_cols_tdv.resize(nnz + n);
+// d_vals_tdv.resize(nnz + n);
+//
+// int nnz_n = nnz + n;
+//
+// thrust::device_vector<int> d_vec(n);
+//
+// // Fill the vector with values from 0 to n-1
+// thrust::sequence(d_vec.begin(), d_vec.end());
+//
+// thrust::copy(d_vec.begin(), d_vec.end(), d_rows_tdv.begin() + nnz);
+// thrust::copy(d_vec.begin(), d_vec.end(), d_cols_tdv.begin() + nnz);
+// thrust::copy(diag.begin(), diag.end(), d_vals_tdv.begin() + nnz);
+//
+// /*thrust::copy(d_rows_tdv.begin(), d_rows_tdv.end(), );
+// thrust::copy(d_cols_tdv.begin(), d_cols_tdv.end(), extended_pointers.cols);
+// thrust::copy(d_vals_tdv.begin(), d_vals_tdv.end(), extended_pointers.vals);*/
+//
+//
+// CHECK_CUSPARSE(cusparseXcsr2coo(handle,
+// d_csrOffsets,
+// nnz,
+// n,
+// thrust::raw_pointer_cast(d_rows_tdv.data()),
+// CUSPARSE_INDEX_BASE_ZERO));
+// thrust::device_vector<int> d_csrOffsets_o(n + 1);
+//
+// thrust::sort_by_key(thrust::make_zip_iterator(thrust::make_tuple(d_rows_tdv.begin(), d_cols_tdv.begin())),
+// thrust::make_zip_iterator(thrust::make_tuple(d_rows_tdv.end(), d_cols_tdv.end())),
+// d_vals_tdv.begin());
+//
+// CHECK_CUSPARSE(cusparseXcoo2csr(handle,
+// thrust::raw_pointer_cast(d_rows_tdv.data()),
+// nnz + n,
+// n,
+// thrust::raw_pointer_cast(d_csrOffsets_o.data()),
+// CUSPARSE_INDEX_BASE_ZERO));
+//
+// CHECK_CUDA(cudaMemcpy(extended_pointers.rowPointer, thrust::raw_pointer_cast(d_csrOffsets_o.data()), (n + 1) * sizeof(int), cudaMemcpyDeviceToDevice));
+// CHECK_CUDA(cudaMemcpy(extended_pointers.cols, thrust::raw_pointer_cast(d_cols_tdv.data()), nnz_n * sizeof(int), cudaMemcpyDeviceToDevice));
+// CHECK_CUDA(cudaMemcpy(extended_pointers.vals, thrust::raw_pointer_cast(d_vals_tdv.data()), nnz_n * sizeof(float), cudaMemcpyDeviceToDevice));
+//
+// CHECK_CUSPARSE(cusparseCsrSetPointers(input,
+// extended_pointers.rowPointer,
+// extended_pointers.cols,
+// extended_pointers.vals));
+//
+//}
+//
+//
+//void scale_csr_matrix(cusparseHandle_t handle,
+// float alpha,
+// cusparseSpMatDescr_t& input,
+// cusparseSpMatDescr_t& result) {
+// // Extract the dimensions and the number of non-zero elements
+// int64_t n, nnz;
+// int* d_csrOffsets;
+// int* d_cols;
+// float* d_vals;
+//
+// int64_t n_r, nnz_r;
+// int* d_csrOffsets_r;
+// int* d_cols_r;
+// float* d_vals_r;
+//
+// cusparseIndexType_t csrRowOffsetsType;
+// cusparseIndexType_t csrColIndType;
+// cusparseIndexBase_t idxBase;
+// cudaDataType valueTyp;
+//
+// cusparseCsrGet(input, &n, &n, &nnz, (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp);
+// cusparseCsrGet(result, &n_r, &n_r, &nnz_r, (void**)&d_csrOffsets_r, (void**)&d_cols_r, (void**)&d_vals_r, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp);
+//
+// // Set scaling factors
+// const float beta = 0.0f;
+// size_t bufferSize = 0;
+//
+// cusparseMatDescr_t input_desc;
+// cusparseCreateMatDescr(&input_desc);
+//
+// // Create matrix descriptor for the result matrix C
+// cusparseMatDescr_t result_desc;
+// cusparseCreateMatDescr(&result_desc);
+//
+// // Get buffer size for the operation
+// cusparseScsrgeam2_bufferSizeExt(handle,
+// n, n,
+// &alpha, input_desc, nnz,
+// d_vals,
+// d_csrOffsets,
+// d_cols,
+// &beta, input_desc, nnz,
+// d_vals,
+// d_csrOffsets,
+// d_cols,
+// result_desc,
+// d_vals_r,
+// d_csrOffsets_r,
+// d_cols_r,
+// &bufferSize);
+//
+// void* dBuffer;
+// cudaMalloc(&dBuffer, bufferSize);
+//
+// // Perform the scaling operation
+// cusparseScsrgeam2(handle,
+// n, n,
+// &alpha, input_desc, nnz,
+// d_vals,
+// d_csrOffsets,
+// d_cols,
+// &beta, input_desc, nnz,
+// d_vals,
+// d_csrOffsets,
+// d_cols,
+// result_desc,
+// d_vals_r,
+// d_csrOffsets_r,
+// d_cols_r,
+// dBuffer);
+// // Clean up
+// cudaFree(dBuffer);
+// cusparseDestroyMatDescr(input_desc);
+// cusparseDestroyMatDescr(result_desc);
+//}
+//
+//thrust::device_vector<float> sum_rows_csr_matrix(cusparseHandle_t handle, cusparseSpMatDescr_t input) {
+// // Extract CSR matrix information
+// int64_t rows, cols, nnz;
+// int* d_csrOffsets, * d_cols;
+// float* d_vals;
+//
+// cusparseIndexType_t csrRowOffsetsType;
+// cusparseIndexType_t csrColIndType;
+// cusparseIndexBase_t idxBase;
+// cudaDataType valueTyp;
+//
+// cusparseCsrGet(input, &rows, &cols, &nnz,
+// (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals,
+// &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp);
+//
+// // Allocate memory for the row sums
+// thrust::device_vector<float> d_rowSums(rows);
+//
+// // Launch kernel to sum elements of each row using atomicAdd
+// int blockSize = 256;
+// int gridSize = (rows + blockSize - 1) / blockSize;
+// cols_sum<<<gridSize, blockSize>>>(rows, d_csrOffsets, d_vals, thrust::raw_pointer_cast(d_rowSums.data()));
+// cudaDeviceSynchronize();
+//
+// return d_rowSums;
+//}
+//
+//int estimate_split(float density, int n) {
+// if (density > 0.5f) {
+// std::cout << "Error, density can not be bigger than 0.5!" << std::endl;
+// }
+//
+// float sparsity = 1.0f - density;
+// float inside = 2.0f * sparsity - 1.0f;
+// float smaller_set_size = 0.5f * n * (1 + sqrt(inside));
+// return static_cast<int>(smaller_set_size) - 1;
+//}
+//
+//// Function to generate initial permutation
+//std::vector<int> generate_initial_permutation(std::mt19937 & rng, int n) {
+// std::vector<int> permutation(n);
+// for (int i = 0; i < n; ++i) {
+// permutation[i] = i;
+// }
+// std::shuffle(permutation.begin(), permutation.end(), rng);
+// return permutation;
+//}
+//
+//
+//void create_graph_sparse(int n, int split, std::vector<int>& p, thrust::device_vector<int>& d_rows, thrust::device_vector<int>& d_cols, thrust::device_vector<float>& d_vals) {
+// thrust::host_vector<int> h_rows, h_cols;
+// thrust::host_vector<float> h_vals;
+//
+// thrust::default_random_engine rng;
+// thrust::random::uniform_real_distribution<float> dist(0.01f, 1.0f);
+//
+// std::vector<std::tuple<int, int, float>> combined;
+//
+// for (int i = 0; i <= split; ++i) {
+// for (int j = split + 1; j < n; ++j) {
+// float rnd_val = dist(rng);
+// combined.push_back(std::make_tuple(p[i], p[j], rnd_val));
+// combined.push_back(std::make_tuple(p[j], p[i], rnd_val));
+// }
+// }
+//
+// std::sort(combined.begin(), combined.end(), [](const auto& a, const auto& b) {
+// if (std::get<0>(a) == std::get<0>(b)) {
+// return std::get<1>(a) < std::get<1>(b);
+// }
+// return std::get<0>(a) < std::get<0>(b);
+// });
+//
+// for (size_t i = 0; i < combined.size(); ++i) {
+// h_rows.push_back(std::get<0>(combined[i]));
+// h_cols.push_back(std::get<1>(combined[i]));
+// h_vals.push_back(std::get<2>(combined[i]));
+// }
+//
+// d_rows = h_rows;
+// d_cols = h_cols;
+// d_vals = h_vals;
+//}
+//
+//thrust::device_vector<float> generate_solution(const std::vector<int>& p, int split) {
+// int size = p.size();
+//
+// // Initialize a device vector of bool type with false (equivalent to CUDA.zeros(Bool, size(p)))
+// thrust::device_vector<float> x(size, 0);
+//
+//
+// // Set the first split elements to true (equivalent to x[p[1:split]] .= 1)
+// for (int i = 0; i <= split ; ++i) {
+// x[p[i]] = 1;
+// }
+//
+// return x;
+//}
+//
+//
+//void graph_to_qubo(cusparseHandle_t handle, cusparseSpMatDescr_t& graph, cusparseSpMatDescr_t& Q, csr_data& extended_pointers) {
+// scale_csr_matrix(handle, -1.0f, graph, Q);
+// thrust::device_vector<float> sum = sum_rows_csr_matrix(handle, graph);
+// fill_diagonal(handle, Q, sum, extended_pointers);
+// scale_csr_matrix(handle, -0.25f, Q, Q);
+//
+// for (int i = 0; i < sum.size(); i++) {
+// std::cout << sum[i] << std::endl;
+// }
+//}
+//
+////float calculate_qubo_energy(cublasHandle_t cublasHandle,
+//// cusparseHandle_t handle,
+//// int n,
+//// const cusparseSpMatDescr_t& Q,
+//// const thrust::device_vector<float>& x) {
+//// float alpha = 1.0f;
+//// float beta = 0.0f;
+//// size_t bufferSize = 0;
+//// void* dBuffer = nullptr;
+////
+//// // Create dense vector descriptors
+//// cusparseDnVecDescr_t vecX, vecY;
+//// float* Qx;
+//// float* in_x;
+////
+//// cudaMalloc((void**)&Qx, n * sizeof(float));
+//// cudaMalloc((void**)&in_x, n * sizeof(float));
+////
+//// cudaMemcpy(in_x, thrust::raw_pointer_cast(x.data()), n * sizeof(float), cudaMemcpyDeviceToDevice);
+////
+//// cusparseCreateDnVec(&vecX, n, in_x, CUDA_R_32F);
+//// cusparseCreateDnVec(&vecY, n, Qx, CUDA_R_32F);
+//// // Allocate buffer
+//// cusparseSpMV_bufferSize(handle,
+//// CUSPARSE_OPERATION_NON_TRANSPOSE,
+//// &alpha,
+//// Q,
+//// vecX,
+//// &beta,
+//// vecY,
+//// CUDA_R_32F,
+//// CUSPARSE_SPMV_ALG_DEFAULT,
+//// &bufferSize);
+//// cudaMalloc(&dBuffer, bufferSize);
+//// // Perform Q * x
+//// cusparseSpMV(handle,
+//// CUSPARSE_OPERATION_NON_TRANSPOSE,
+//// &alpha,
+//// Q,
+//// vecX,
+//// &beta,
+//// vecY,
+//// CUDA_R_32F,
+//// CUSPARSE_SPMV_ALG_DEFAULT,
+//// dBuffer);
+////
+//// float* Qx_ptr;
+//// cusparseDnVecGetValues(vecY, (void**)&Qx_ptr);
+////
+//// // Compute the dot product x^T * (Q * x) using cuBLAS
+//// float result;
+//// cublasSdot(cublasHandle, n, in_x, 1, Qx_ptr, 1, &result);
+//// // Clean up
+//// cusparseDestroyDnVec(vecX);
+//// cusparseDestroyDnVec(vecY);
+//// cudaFree(dBuffer);
+//// cudaFree(Qx);
+//// cudaFree(in_x);
+//// return result;
+////}
+//
+//
+//float qubo_eng(cublasHandle_t cublasHandle,
// cusparseHandle_t handle,
+// const cusparseSpMatDescr_t& Q,
+// float* sol_vector) {
+// int64_t rows, cols, nnz;
+// int* d_csrOffsets, * d_cols;
+// float* d_vals;
+// int* dA_csrOffsets, * dA_columns;
+// float* dA_values;
+//
+// cusparseIndexType_t csrRowOffsetsType;
+// cusparseIndexType_t csrColIndType;
+// cusparseIndexBase_t idxBase;
+// cudaDataType valueTyp;
+//
+// CHECK_CUSPARSE(cusparseSpMatGetSize(Q, &rows, &cols, &nnz));
+//
+// CHECK_CUDA(cudaMalloc((void**)&d_csrOffsets, (rows + 1) * sizeof(int)));
+// CHECK_CUDA(cudaMalloc((void**)&d_cols, nnz * sizeof(int)));
+// CHECK_CUDA(cudaMalloc((void**)&d_vals, nnz * sizeof(float)));
+//
+// CHECK_CUSPARSE(cusparseCsrGet(Q, &rows, &cols, &nnz,
+// (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals,
+// &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp));
+// // Host problem definition
+// const int A_num_rows = rows;
+// const int A_num_cols = cols;
+// const int A_nnz = nnz;
+// //int h_test[11];
+// //int* h_test = (int*)calloc(rows+1, sizeof(int));
+// float alpha = 1.0f;
+// float beta = 0.0f;
+// //CHECK_CUDA(cudaMemcpy(h_test, d_csrOffsets, (rows + 1) * sizeof(int), cudaMemcpyDeviceToHost));
+// ////cudaMemcpy(h_test, d_csrOffsets, (A_num_rows + 1) * sizeof(int), cudaMemcpyDeviceToHost);
+// ////CHECK_CUDA(cudaMemcpy(hCsrRowOffsets_toverify, dCsrRowOffsets_toverify, (rows + 1) * sizeof(int), cudaMemcpyDeviceToHost));
+// //for (int i = 0; i < A_num_rows+1; i++) {
+// // std::cout << h_test[i] << std::endl;
+// //}
+// //--------------------------------------------------------------------------
+// // Device memory management
+// float * dX, * dY;
+// CHECK_CUDA(cudaMalloc((void**)&dA_csrOffsets, (A_num_rows + 1) * sizeof(int)));
+// CHECK_CUDA(cudaMalloc((void**)&dA_columns, A_nnz * sizeof(int)));
+// CHECK_CUDA(cudaMalloc((void**)&dA_values, A_nnz * sizeof(float)));
+// CHECK_CUDA(cudaMalloc((void**)&dX, A_num_cols * sizeof(float)));
+// CHECK_CUDA(cudaMalloc((void**)&dY, A_num_rows * sizeof(float)));
+//
+// CHECK_CUDA(cudaMemcpy(dA_csrOffsets, d_csrOffsets, (A_num_rows + 1) * sizeof(int), cudaMemcpyDeviceToDevice));
+// CHECK_CUDA(cudaMemcpy(dA_columns, d_cols, A_nnz * sizeof(int), cudaMemcpyDeviceToDevice));
+// CHECK_CUDA(cudaMemcpy(dA_values, d_vals, A_nnz * sizeof(float), cudaMemcpyDeviceToDevice));
+// CHECK_CUDA(cudaMemcpy(dX, sol_vector, A_num_cols * sizeof(float), cudaMemcpyDeviceToDevice));
+// cusparseSpMatDescr_t matA;
+// cusparseDnVecDescr_t vecX, vecY;
+// void* dBuffer = NULL;
+// size_t bufferSize = 0;
+// CHECK_CUSPARSE(cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_nnz,
+// dA_csrOffsets, dA_columns, dA_values,
+// CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+// CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
+// // Create dense vector X
+// CHECK_CUSPARSE(cusparseCreateDnVec(&vecX, A_num_cols, dX, CUDA_R_32F));
+// // Create dense vector y
+// CHECK_CUSPARSE(cusparseCreateDnVec(&vecY, A_num_rows, dY, CUDA_R_32F));
+// // allocate an external buffer if needed
+// CHECK_CUSPARSE(cusparseSpMV_bufferSize(
+// handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+// &alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
+// CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize));
+// CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize));
+//
+// // execute SpMV
+// CHECK_CUSPARSE(cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+// &alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
+// CUSPARSE_SPMV_ALG_DEFAULT, dBuffer));
+//
+// // destroy matrix/vector descriptors
+// CHECK_CUSPARSE(cusparseDestroySpMat(matA));
+// CHECK_CUSPARSE(cusparseDestroyDnVec(vecX));
+// CHECK_CUSPARSE(cusparseDestroyDnVec(vecY));
+//
+// /*CHECK_CUDA(cudaMemcpy(hY, dY, A_num_rows * sizeof(float), cudaMemcpyDeviceToHost));
+// std::cout << "Result of Q*x: " << std::endl;
+// for (int i = 0; i < A_num_rows; i++) {
+// std::cout << hY[i] << std::endl;
+// }*/
+//
+// float result;
+// CHECK_CUBLAS(cublasSdot(cublasHandle, A_num_rows, dX, 1, dY, 1, &result));
+// //std::cout << "Energy: " << result << std::endl;
+// return result;
+//}
+//
+//float calculate_qubo_energy(cublasHandle_t cublasHandle,
+// cusparseHandle_t cusparseHandle,
// int n,
// const cusparseSpMatDescr_t& Q,
-// const thrust::device_vector<float>& x) {
+// thrust::device_vector<float>& x) {
// float alpha = 1.0f;
// float beta = 0.0f;
// size_t bufferSize = 0;
@@ -432,17 +586,26 @@ void graph_to_qubo(cusparseHandle_t handle, cusparseSpMatDescr_t& graph, cuspars
// // Create dense vector descriptors
// cusparseDnVecDescr_t vecX, vecY;
// float* Qx;
+// float* Qx_ptr;
+// //float* hY = (float*)calloc(n, sizeof(float));
// float* in_x;
//
-// cudaMalloc((void**)&Qx, n * sizeof(float));
-// cudaMalloc((void**)&in_x, n * sizeof(float));
+// /* for (int i = 0; i < n; i++) {
+// hY[i] = 0.0f;
+// }*/
//
-// cudaMemcpy(in_x, thrust::raw_pointer_cast(x.data()), n * sizeof(float), cudaMemcpyDeviceToDevice);
+// CHECK_CUDA(cudaMalloc((void**)&Qx, n * sizeof(float)));
+// CHECK_CUDA(cudaMalloc((void**)&Qx_ptr, n * sizeof(float)));
+// CHECK_CUDA(cudaMalloc((void**)&in_x, n * sizeof(float)));
+//
+// CHECK_CUDA(cudaMemcpy(in_x, thrust::raw_pointer_cast(x.data()), n * sizeof(float), cudaMemcpyDeviceToDevice));
+// //CHECK_CUDA(cudaMemcpy(Qx, hY, n * sizeof(float), cudaMemcpyHostToDevice));
+//
+// CHECK_CUSPARSE(cusparseCreateDnVec(&vecX, n, in_x, CUDA_R_32F));
+// CHECK_CUSPARSE(cusparseCreateDnVec(&vecY, n, Qx, CUDA_R_32F));
//
-// cusparseCreateDnVec(&vecX, n, in_x, CUDA_R_32F);
-// cusparseCreateDnVec(&vecY, n, Qx, CUDA_R_32F);
// // Allocate buffer
-// cusparseSpMV_bufferSize(handle,
+// CHECK_CUSPARSE(cusparseSpMV_bufferSize(cusparseHandle,
// CUSPARSE_OPERATION_NON_TRANSPOSE,
// &alpha,
// Q,
@@ -451,10 +614,11 @@ void graph_to_qubo(cusparseHandle_t handle, cusparseSpMatDescr_t& graph, cuspars
// vecY,
// CUDA_R_32F,
// CUSPARSE_SPMV_ALG_DEFAULT,
-// &bufferSize);
-// cudaMalloc(&dBuffer, bufferSize);
+// &bufferSize));
+// CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize));
+//
// // Perform Q * x
-// cusparseSpMV(handle,
+// CHECK_CUSPARSE(cusparseSpMV(cusparseHandle,
// CUSPARSE_OPERATION_NON_TRANSPOSE,
// &alpha,
// Q,
@@ -463,223 +627,131 @@ void graph_to_qubo(cusparseHandle_t handle, cusparseSpMatDescr_t& graph, cuspars
// vecY,
// CUDA_R_32F,
// CUSPARSE_SPMV_ALG_DEFAULT,
-// dBuffer);
+// dBuffer));
+//
+// // Extract the raw pointers to the data in the dense vector descriptors
+// //float* Qx_ptr;
+// //CHECK_CUSPARSE(cusparseDnVecGetValues(vecY, (void**)&Qx_ptr));
+// // Ensure the computation is complete before accessing the result
+// CHECK_CUDA(cudaDeviceSynchronize());
+//
+// /*CHECK_CUDA(cudaMemcpy(hY, Qx, n * sizeof(float), cudaMemcpyDeviceToHost));
+// for (int i = 0; i < n; i++) {
+// std::cout << hY[i] << std::endl;
+// }*/
//
-// float* Qx_ptr;
-// cusparseDnVecGetValues(vecY, (void**)&Qx_ptr);
-//
// // Compute the dot product x^T * (Q * x) using cuBLAS
// float result;
-// cublasSdot(cublasHandle, n, in_x, 1, Qx_ptr, 1, &result);
+// CHECK_CUBLAS(cublasSdot(cublasHandle, n, in_x, 1, Qx, 1, &result));
+//
// // Clean up
-// cusparseDestroyDnVec(vecX);
-// cusparseDestroyDnVec(vecY);
-// cudaFree(dBuffer);
-// cudaFree(Qx);
-// cudaFree(in_x);
+// CHECK_CUSPARSE(cusparseDestroyDnVec(vecX));
+// CHECK_CUSPARSE(cusparseDestroyDnVec(vecY));
+// CHECK_CUDA(cudaFree(dBuffer));
+// CHECK_CUDA(cudaFree(Qx));
+// CHECK_CUDA(cudaFree(in_x));
+//
// return result;
//}
-
-
-float qubo_eng(cublasHandle_t cublasHandle,
- cusparseHandle_t handle,
- const cusparseSpMatDescr_t& Q,
- float* sol_vector) {
- int64_t rows, cols, nnz;
- int* d_csrOffsets, * d_cols;
- float* d_vals;
- int* dA_csrOffsets, * dA_columns;
- float* dA_values;
-
- cusparseIndexType_t csrRowOffsetsType;
- cusparseIndexType_t csrColIndType;
- cusparseIndexBase_t idxBase;
- cudaDataType valueTyp;
-
- CHECK_CUSPARSE(cusparseSpMatGetSize(Q, &rows, &cols, &nnz));
-
- CHECK_CUDA(cudaMalloc((void**)&d_csrOffsets, (rows + 1) * sizeof(int)));
- CHECK_CUDA(cudaMalloc((void**)&d_cols, nnz * sizeof(int)));
- CHECK_CUDA(cudaMalloc((void**)&d_vals, nnz * sizeof(float)));
-
- CHECK_CUSPARSE(cusparseCsrGet(Q, &rows, &cols, &nnz,
- (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals,
- &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp));
- // Host problem definition
- const int A_num_rows = rows;
- const int A_num_cols = cols;
- const int A_nnz = nnz;
- //int h_test[11];
- //int* h_test = (int*)calloc(rows+1, sizeof(int));
- float alpha = 1.0f;
- float beta = 0.0f;
- //CHECK_CUDA(cudaMemcpy(h_test, d_csrOffsets, (rows + 1) * sizeof(int), cudaMemcpyDeviceToHost));
- ////cudaMemcpy(h_test, d_csrOffsets, (A_num_rows + 1) * sizeof(int), cudaMemcpyDeviceToHost);
- ////CHECK_CUDA(cudaMemcpy(hCsrRowOffsets_toverify, dCsrRowOffsets_toverify, (rows + 1) * sizeof(int), cudaMemcpyDeviceToHost));
- //for (int i = 0; i < A_num_rows+1; i++) {
- // std::cout << h_test[i] << std::endl;
- //}
- //--------------------------------------------------------------------------
- // Device memory management
- float * dX, * dY;
- CHECK_CUDA(cudaMalloc((void**)&dA_csrOffsets, (A_num_rows + 1) * sizeof(int)));
- CHECK_CUDA(cudaMalloc((void**)&dA_columns, A_nnz * sizeof(int)));
- CHECK_CUDA(cudaMalloc((void**)&dA_values, A_nnz * sizeof(float)));
- CHECK_CUDA(cudaMalloc((void**)&dX, A_num_cols * sizeof(float)));
- CHECK_CUDA(cudaMalloc((void**)&dY, A_num_rows * sizeof(float)));
-
- CHECK_CUDA(cudaMemcpy(dA_csrOffsets, d_csrOffsets, (A_num_rows + 1) * sizeof(int), cudaMemcpyDeviceToDevice));
- CHECK_CUDA(cudaMemcpy(dA_columns, d_cols, A_nnz * sizeof(int), cudaMemcpyDeviceToDevice));
- CHECK_CUDA(cudaMemcpy(dA_values, d_vals, A_nnz * sizeof(float), cudaMemcpyDeviceToDevice));
- CHECK_CUDA(cudaMemcpy(dX, sol_vector, A_num_cols * sizeof(float), cudaMemcpyDeviceToDevice));
- cusparseSpMatDescr_t matA;
- cusparseDnVecDescr_t vecX, vecY;
- void* dBuffer = NULL;
- size_t bufferSize = 0;
- CHECK_CUSPARSE(cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_nnz,
- dA_csrOffsets, dA_columns, dA_values,
- CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
- CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
- // Create dense vector X
- CHECK_CUSPARSE(cusparseCreateDnVec(&vecX, A_num_cols, dX, CUDA_R_32F));
- // Create dense vector y
- CHECK_CUSPARSE(cusparseCreateDnVec(&vecY, A_num_rows, dY, CUDA_R_32F));
- // allocate an external buffer if needed
- CHECK_CUSPARSE(cusparseSpMV_bufferSize(
- handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
- &alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
- CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize));
- CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize));
-
- // execute SpMV
- CHECK_CUSPARSE(cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
- &alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
- CUSPARSE_SPMV_ALG_DEFAULT, dBuffer));
-
- // destroy matrix/vector descriptors
- CHECK_CUSPARSE(cusparseDestroySpMat(matA));
- CHECK_CUSPARSE(cusparseDestroyDnVec(vecX));
- CHECK_CUSPARSE(cusparseDestroyDnVec(vecY));
-
- /*CHECK_CUDA(cudaMemcpy(hY, dY, A_num_rows * sizeof(float), cudaMemcpyDeviceToHost));
- std::cout << "Result of Q*x: " << std::endl;
- for (int i = 0; i < A_num_rows; i++) {
- std::cout << hY[i] << std::endl;
- }*/
-
- float result;
- CHECK_CUBLAS(cublasSdot(cublasHandle, A_num_rows, dX, 1, dY, 1, &result));
- //std::cout << "Energy: " << result << std::endl;
- return result;
-}
-
-float calculate_qubo_energy(cublasHandle_t cublasHandle,
- cusparseHandle_t cusparseHandle,
- int n,
- const cusparseSpMatDescr_t& Q,
-                            thrust::device_vector<float>& x) {
- float alpha = 1.0f;
- float beta = 0.0f;
- size_t bufferSize = 0;
- void* dBuffer = nullptr;
-
- // Create dense vector descriptors
- cusparseDnVecDescr_t vecX, vecY;
- float* Qx;
- float* Qx_ptr;
- //float* hY = (float*)calloc(n, sizeof(float));
- float* in_x;
-
- /* for (int i = 0; i < n; i++) {
- hY[i] = 0.0f;
- }*/
-
- CHECK_CUDA(cudaMalloc((void**)&Qx, n * sizeof(float)));
- CHECK_CUDA(cudaMalloc((void**)&Qx_ptr, n * sizeof(float)));
- CHECK_CUDA(cudaMalloc((void**)&in_x, n * sizeof(float)));
-
- CHECK_CUDA(cudaMemcpy(in_x, thrust::raw_pointer_cast(x.data()), n * sizeof(float), cudaMemcpyDeviceToDevice));
- //CHECK_CUDA(cudaMemcpy(Qx, hY, n * sizeof(float), cudaMemcpyHostToDevice));
-
- CHECK_CUSPARSE(cusparseCreateDnVec(&vecX, n, in_x, CUDA_R_32F));
- CHECK_CUSPARSE(cusparseCreateDnVec(&vecY, n, Qx, CUDA_R_32F));
-
- // Allocate buffer
- CHECK_CUSPARSE(cusparseSpMV_bufferSize(cusparseHandle,
- CUSPARSE_OPERATION_NON_TRANSPOSE,
- &alpha,
- Q,
- vecX,
- &beta,
- vecY,
- CUDA_R_32F,
- CUSPARSE_SPMV_ALG_DEFAULT,
- &bufferSize));
- CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize));
-
- // Perform Q * x
- CHECK_CUSPARSE(cusparseSpMV(cusparseHandle,
- CUSPARSE_OPERATION_NON_TRANSPOSE,
- &alpha,
- Q,
- vecX,
- &beta,
- vecY,
- CUDA_R_32F,
- CUSPARSE_SPMV_ALG_DEFAULT,
- dBuffer));
-
- // Extract the raw pointers to the data in the dense vector descriptors
- //float* Qx_ptr;
- //CHECK_CUSPARSE(cusparseDnVecGetValues(vecY, (void**)&Qx_ptr));
- // Ensure the computation is complete before accessing the result
- CHECK_CUDA(cudaDeviceSynchronize());
-
- /*CHECK_CUDA(cudaMemcpy(hY, Qx, n * sizeof(float), cudaMemcpyDeviceToHost));
- for (int i = 0; i < n; i++) {
- std::cout << hY[i] << std::endl;
- }*/
-
- // Compute the dot product x^T * (Q * x) using cuBLAS
- float result;
- CHECK_CUBLAS(cublasSdot(cublasHandle, n, in_x, 1, Qx, 1, &result));
-
- // Clean up
- CHECK_CUSPARSE(cusparseDestroyDnVec(vecX));
- CHECK_CUSPARSE(cusparseDestroyDnVec(vecY));
- CHECK_CUDA(cudaFree(dBuffer));
- CHECK_CUDA(cudaFree(Qx));
- CHECK_CUDA(cudaFree(in_x));
-
- return result;
-}
-
+//
+////void brute_force_solutions(cublasHandle_t cublasHandle,
+//// cusparseHandle_t handle,
+//// int n,
+//// const cusparseSpMatDescr_t& Q) {
+////
+//// int num_solutions = 1 << n; // 2^n
+////    thrust::device_vector<float> d_solutions(num_solutions * n);
+//// for (int i = 0; i < num_solutions; ++i) {
+//// for (int j = 0; j < n; ++j) {
+////            float val = static_cast<float>((i & (1 << j)) != 0);
+//// d_solutions[i * n + j] = val;
+//// }
+//// }
+////
+////
+//// // Allocate memory for energies
+////    thrust::device_vector<float> d_energies(num_solutions);
+////    thrust::device_vector<float> sol_x(n);
+//// for (int i = 0; i < num_solutions; ++i) {
+//// thrust::copy(d_solutions.begin() + (i * n), d_solutions.begin() + ((i + 1) * n), sol_x.begin());
+//// float eng = calculate_qubo_energy(cublasHandle, handle, n, Q, sol_x);
+//// d_energies[i] = eng;
+//// std::cout << "Energy i: " << eng << std::endl;
+//// }
+////
+////
+//// // Find the minimum energy
+//// auto min_energy_iter = thrust::min_element(d_energies.begin(), d_energies.end());
+//// float min_energy = *min_energy_iter;
+//// int min_index = min_energy_iter - d_energies.begin();
+////
+//// // Copy the best solution to host
+////    thrust::host_vector<float> h_best_solution(n);
+//// thrust::copy(d_solutions.begin() + min_index * n, d_solutions.begin() + (min_index + 1) * n, h_best_solution.begin());
+////
+//// // Print the result
+//// std::cout << "Best energy: " << min_energy << std::endl;
+//// std::cout << "Best solution: ";
+//// for (float bit : h_best_solution) {
+//// std::cout << bit << " ";
+//// }
+//// std::cout << std::endl;
+//// convert_sparse_to_dense_and_display(handle, Q, n);
+////}
+//
//void brute_force_solutions(cublasHandle_t cublasHandle,
// cusparseHandle_t handle,
// int n,
// const cusparseSpMatDescr_t& Q) {
-//
+// indicators::show_console_cursor(false);
// int num_solutions = 1 << n; // 2^n
// thrust::device_vector d_solutions(num_solutions * n);
+// /*
// for (int i = 0; i < num_solutions; ++i) {
// for (int j = 0; j < n; ++j) {
//            float val = static_cast<float>((i & (1 << j)) != 0);
// d_solutions[i * n + j] = val;
// }
-// }
-//
+// }*/
//
+// //convert_sparse_to_dense_and_display(handle, Q, n);
// // Allocate memory for energies
//    thrust::device_vector<float> d_energies(num_solutions);
//    thrust::device_vector<float> sol_x(n);
+//
+// indicators::ProgressBar bar{
+// indicators::option::BarWidth{40},
+// indicators::option::Start{"["},
+// indicators::option::Fill{"="},
+// indicators::option::Lead{">"},
+// indicators::option::Remainder{" "},
+// indicators::option::End{" ]"},
+// indicators::option::ForegroundColor{indicators::Color::white},
+// indicators::option::FontStyles{
+//            std::vector<indicators::FontStyle>{indicators::FontStyle::bold}},
+// indicators::option::MaxProgress{num_solutions}
+// };
+// int smallest_ind = -1;
+// float smallest_eng = 1000;
+//
// for (int i = 0; i < num_solutions; ++i) {
-// thrust::copy(d_solutions.begin() + (i * n), d_solutions.begin() + ((i + 1) * n), sol_x.begin());
-// float eng = calculate_qubo_energy(cublasHandle, handle, n, Q, sol_x);
+// for (int j = 0; j < n; ++j) {
+//            float val = static_cast<float>((i & (1 << j)) != 0);
+// sol_x[j] = val;
+// d_solutions[i * n + j] = val;
+// }
+// float eng = qubo_eng(cublasHandle, handle, Q, thrust::raw_pointer_cast(sol_x.data()));
+// //float eng = calculate_qubo_energy(cublasHandle, handle, n, Q, sol_x);
// d_energies[i] = eng;
-// std::cout << "Energy i: " << eng << std::endl;
+// if (eng <= smallest_eng) {
+// smallest_ind = i;
+// smallest_eng = eng;
+// }
+// //std::cout << "Energy i: " << eng << std::endl;
+// bar.set_option(indicators::option::PostfixText{ std::to_string(i) + "/" + std::to_string(num_solutions) });
+// bar.tick();
// }
-//
-//
+// std::cout << "Tested all solutions, now sorting." << std::endl;
// // Find the minimum energy
// auto min_energy_iter = thrust::min_element(d_energies.begin(), d_energies.end());
// float min_energy = *min_energy_iter;
@@ -687,10 +759,11 @@ float calculate_qubo_energy(cublasHandle_t cublasHandle,
//
// // Copy the best solution to host
//    thrust::host_vector<float> h_best_solution(n);
-// thrust::copy(d_solutions.begin() + min_index * n, d_solutions.begin() + (min_index + 1) * n, h_best_solution.begin());
+// thrust::copy(d_solutions.begin() + smallest_ind * n, d_solutions.begin() + (smallest_ind + 1) * n, h_best_solution.begin());
//
// // Print the result
-// std::cout << "Best energy: " << min_energy << std::endl;
+// std::cout << "Best energy my: " << smallest_eng << " and id: " << smallest_ind << std::endl;
+// std::cout << "Best energy: " << min_energy << " and id: " << min_index << std::endl;
// std::cout << "Best solution: ";
// for (float bit : h_best_solution) {
// std::cout << bit << " ";
@@ -698,198 +771,125 @@ float calculate_qubo_energy(cublasHandle_t cublasHandle,
// std::cout << std::endl;
// convert_sparse_to_dense_and_display(handle, Q, n);
//}
-
-void brute_force_solutions(cublasHandle_t cublasHandle,
- cusparseHandle_t handle,
- int n,
- const cusparseSpMatDescr_t& Q) {
- indicators::show_console_cursor(false);
- int num_solutions = 1 << n; // 2^n
-    thrust::device_vector<float> d_solutions(num_solutions * n);
- /*
- for (int i = 0; i < num_solutions; ++i) {
- for (int j = 0; j < n; ++j) {
-            float val = static_cast<float>((i & (1 << j)) != 0);
- d_solutions[i * n + j] = val;
- }
- }*/
-
- //convert_sparse_to_dense_and_display(handle, Q, n);
- // Allocate memory for energies
-    thrust::device_vector<float> d_energies(num_solutions);
-    thrust::device_vector<float> sol_x(n);
-
- indicators::ProgressBar bar{
- indicators::option::BarWidth{40},
- indicators::option::Start{"["},
- indicators::option::Fill{"="},
- indicators::option::Lead{">"},
- indicators::option::Remainder{" "},
- indicators::option::End{" ]"},
- indicators::option::ForegroundColor{indicators::Color::white},
- indicators::option::FontStyles{
-            std::vector<indicators::FontStyle>{indicators::FontStyle::bold}},
- indicators::option::MaxProgress{num_solutions}
- };
- int smallest_ind = -1;
- float smallest_eng = 1000;
-
- for (int i = 0; i < num_solutions; ++i) {
- for (int j = 0; j < n; ++j) {
-            float val = static_cast<float>((i & (1 << j)) != 0);
- sol_x[j] = val;
- d_solutions[i * n + j] = val;
- }
- float eng = qubo_eng(cublasHandle, handle, Q, thrust::raw_pointer_cast(sol_x.data()));
- //float eng = calculate_qubo_energy(cublasHandle, handle, n, Q, sol_x);
- d_energies[i] = eng;
- if (eng <= smallest_eng) {
- smallest_ind = i;
- smallest_eng = eng;
- }
- //std::cout << "Energy i: " << eng << std::endl;
- bar.set_option(indicators::option::PostfixText{ std::to_string(i) + "/" + std::to_string(num_solutions) });
- bar.tick();
- }
- std::cout << "Tested all solutions, now sorting." << std::endl;
- // Find the minimum energy
- auto min_energy_iter = thrust::min_element(d_energies.begin(), d_energies.end());
- float min_energy = *min_energy_iter;
- int min_index = min_energy_iter - d_energies.begin();
-
- // Copy the best solution to host
-    thrust::host_vector<float> h_best_solution(n);
- thrust::copy(d_solutions.begin() + smallest_ind * n, d_solutions.begin() + (smallest_ind + 1) * n, h_best_solution.begin());
-
- // Print the result
- std::cout << "Best energy my: " << smallest_eng << " and id: " << smallest_ind << std::endl;
- std::cout << "Best energy: " << min_energy << " and id: " << min_index << std::endl;
- std::cout << "Best solution: ";
- for (float bit : h_best_solution) {
- std::cout << bit << " ";
- }
- std::cout << std::endl;
- convert_sparse_to_dense_and_display(handle, Q, n);
-}
-
-
-int main() {
- int n = 12;
- int seed = 14;
- float density = 0.5;
- std::mt19937 rng(seed);
-
-    std::vector<int> p = generate_initial_permutation(rng, n);
-
- int split = estimate_split(density, n); // Example split, can be computed as needed
- std::cout << "Split: " << split << std::endl;
-
-    thrust::device_vector<int> d_rows;
-    thrust::device_vector<int> d_cols;
-    thrust::device_vector<float> d_vals;
-
- create_graph_sparse(n, split, p, d_rows, d_cols, d_vals);
-    thrust::device_vector<float> d_x = generate_solution(p, split);
- std::cout << "Graph created.";
- // Print the result (for debugging purposes)
-    thrust::host_vector<int> h_rows = d_rows;
-    thrust::host_vector<int> h_cols = d_cols;
-    thrust::host_vector<float> h_vals = d_vals;
-    thrust::host_vector<float> h_x = d_x;
- int print_size = 2*n;
- std::cout << "Rows: ";
- for (int i = 0; i < print_size; ++i) {
- std::cout << h_rows[i] << " ";
- }
- std::cout << std::endl;
-
- std::cout << "Cols: ";
- for (int i = 0; i < print_size; ++i) {
- std::cout << h_cols[i] << " ";
- }
- std::cout << std::endl;
-
- std::cout << "Vals: ";
- for (int i = 0; i < print_size; ++i) {
- std::cout << h_vals[i] << " ";
- }
- std::cout << std::endl;
-
- std::cout << "Solution: ";
- for (int i = 0; i < n; ++i) {
- std::cout << h_x[i] << " ";
- }
- std::cout << std::endl;
-
- cusparseHandle_t handle;
- cublasHandle_t cublasHandle;
- cusparseSpMatDescr_t graph_csr, Q;
-
- // Initialize cuSPARSE
- cublasCreate(&cublasHandle);
- CHECK_CUSPARSE(cusparseCreate(&handle));
- int nnz = d_vals.size();
-    thrust::device_vector<int> d_csrOffsets(n + 1);
- cusparseXcoo2csr(handle,
- thrust::raw_pointer_cast(d_rows.data()),
- nnz,
- n,
- thrust::raw_pointer_cast(d_csrOffsets.data()),
- CUSPARSE_INDEX_BASE_ZERO);
-
- CHECK_CUSPARSE(cusparseCreateCsr(&graph_csr,
- n,
- n,
- nnz,
- thrust::raw_pointer_cast(d_csrOffsets.data()),
- thrust::raw_pointer_cast(d_cols.data()), // column indices
- thrust::raw_pointer_cast(d_vals.data()), // values
- CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
- CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
-
-    thrust::device_vector<float> d_QVals(nnz);
- CHECK_CUSPARSE(cusparseCreateCsr(&Q,
- n,
- n,
- nnz,
- thrust::raw_pointer_cast(d_csrOffsets.data()),
- thrust::raw_pointer_cast(d_cols.data()), // column indices
- thrust::raw_pointer_cast(d_QVals.data()), // values
- CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
- CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
- // Now matDescr can be used in further cuSPARSE operations
-
- int* newRowsPtr;
- int* newCols;
- float* newVals;
- csr_data extended_sparse_data;
-
- CHECK_CUDA(cudaMalloc((void**)&newRowsPtr, (n + 1) * sizeof(int)));
- CHECK_CUDA(cudaMalloc((void**)&newCols, (nnz + n) * sizeof(int)));
- CHECK_CUDA(cudaMalloc((void**)&newVals, (nnz + n) * sizeof(float)));
-
- extended_sparse_data.rowPointer = newRowsPtr;
- extended_sparse_data.cols = newCols;
- extended_sparse_data.vals = newVals;
-
-
- std::cout << "Sparse matrix created successfully!" << std::endl;
- graph_to_qubo(handle, graph_csr, Q, extended_sparse_data);
- /*convert_sparse_to_dense_and_display(handle, graph_csr, n); */
- //convert_sparse_to_dense_and_display(handle, Q, n);
-
- float planted_energy = qubo_eng(cublasHandle, handle, Q, thrust::raw_pointer_cast(d_x.data()));
-
- std::cout << "Qubo energy of planted solution: " << planted_energy << std::endl;
-
- /*float energy = calculate_qubo_energy(cublasHandle, handle, n, Q, d_x);
- std::cout << "Qubo energy: " << energy << std::endl;*/
- brute_force_solutions(cublasHandle, handle, n, Q);
-
- // Clean up
- CHECK_CUSPARSE(cusparseDestroySpMat(graph_csr));
- CHECK_CUSPARSE(cusparseDestroySpMat(Q));
- CHECK_CUSPARSE(cusparseDestroy(handle));
-
- return 0;
-};
+//
+//
+//int main() {
+// int n = 12;
+// int seed = 14;
+// float density = 0.5;
+// std::mt19937 rng(seed);
+//
+//    std::vector<int> p = generate_initial_permutation(rng, n);
+//
+// int split = estimate_split(density, n); // Example split, can be computed as needed
+// std::cout << "Split: " << split << std::endl;
+//
+//    thrust::device_vector<int> d_rows;
+//    thrust::device_vector<int> d_cols;
+//    thrust::device_vector<float> d_vals;
+//
+// create_graph_sparse(n, split, p, d_rows, d_cols, d_vals);
+//    thrust::device_vector<float> d_x = generate_solution(p, split);
+// std::cout << "Graph created.";
+// // Print the result (for debugging purposes)
+//    thrust::host_vector<int> h_rows = d_rows;
+//    thrust::host_vector<int> h_cols = d_cols;
+//    thrust::host_vector<float> h_vals = d_vals;
+//    thrust::host_vector<float> h_x = d_x;
+// int print_size = 2*n;
+// std::cout << "Rows: ";
+// for (int i = 0; i < print_size; ++i) {
+// std::cout << h_rows[i] << " ";
+// }
+// std::cout << std::endl;
+//
+// std::cout << "Cols: ";
+// for (int i = 0; i < print_size; ++i) {
+// std::cout << h_cols[i] << " ";
+// }
+// std::cout << std::endl;
+//
+// std::cout << "Vals: ";
+// for (int i = 0; i < print_size; ++i) {
+// std::cout << h_vals[i] << " ";
+// }
+// std::cout << std::endl;
+//
+// std::cout << "Solution: ";
+// for (int i = 0; i < n; ++i) {
+// std::cout << h_x[i] << " ";
+// }
+// std::cout << std::endl;
+//
+// cusparseHandle_t handle;
+// cublasHandle_t cublasHandle;
+// cusparseSpMatDescr_t graph_csr, Q;
+//
+// // Initialize cuSPARSE
+// cublasCreate(&cublasHandle);
+// CHECK_CUSPARSE(cusparseCreate(&handle));
+// int nnz = d_vals.size();
+//    thrust::device_vector<int> d_csrOffsets(n + 1);
+// cusparseXcoo2csr(handle,
+// thrust::raw_pointer_cast(d_rows.data()),
+// nnz,
+// n,
+// thrust::raw_pointer_cast(d_csrOffsets.data()),
+// CUSPARSE_INDEX_BASE_ZERO);
+//
+// CHECK_CUSPARSE(cusparseCreateCsr(&graph_csr,
+// n,
+// n,
+// nnz,
+// thrust::raw_pointer_cast(d_csrOffsets.data()),
+// thrust::raw_pointer_cast(d_cols.data()), // column indices
+// thrust::raw_pointer_cast(d_vals.data()), // values
+// CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+// CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
+//
+//    thrust::device_vector<float> d_QVals(nnz);
+// CHECK_CUSPARSE(cusparseCreateCsr(&Q,
+// n,
+// n,
+// nnz,
+// thrust::raw_pointer_cast(d_csrOffsets.data()),
+// thrust::raw_pointer_cast(d_cols.data()), // column indices
+// thrust::raw_pointer_cast(d_QVals.data()), // values
+// CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+// CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
+// // Now matDescr can be used in further cuSPARSE operations
+//
+// int* newRowsPtr;
+// int* newCols;
+// float* newVals;
+// csr_data extended_sparse_data;
+//
+// CHECK_CUDA(cudaMalloc((void**)&newRowsPtr, (n + 1) * sizeof(int)));
+// CHECK_CUDA(cudaMalloc((void**)&newCols, (nnz + n) * sizeof(int)));
+// CHECK_CUDA(cudaMalloc((void**)&newVals, (nnz + n) * sizeof(float)));
+//
+// extended_sparse_data.rowPointer = newRowsPtr;
+// extended_sparse_data.cols = newCols;
+// extended_sparse_data.vals = newVals;
+//
+//
+// std::cout << "Sparse matrix created successfully!" << std::endl;
+// graph_to_qubo(handle, graph_csr, Q, extended_sparse_data);
+// /*convert_sparse_to_dense_and_display(handle, graph_csr, n); */
+// //convert_sparse_to_dense_and_display(handle, Q, n);
+//
+// float planted_energy = qubo_eng(cublasHandle, handle, Q, thrust::raw_pointer_cast(d_x.data()));
+//
+// std::cout << "Qubo energy of planted solution: " << planted_energy << std::endl;
+//
+// /*float energy = calculate_qubo_energy(cublasHandle, handle, n, Q, d_x);
+// std::cout << "Qubo energy: " << energy << std::endl;*/
+// brute_force_solutions(cublasHandle, handle, n, Q);
+//
+// // Clean up
+// CHECK_CUSPARSE(cusparseDestroySpMat(graph_csr));
+// CHECK_CUSPARSE(cusparseDestroySpMat(Q));
+// CHECK_CUSPARSE(cusparseDestroy(handle));
+//
+// return 0;
+//};
diff --git a/CudaMaxCutPlanted/main.cu b/CudaMaxCutPlanted/main.cu
new file mode 100644
index 0000000..cef2f1c
--- /dev/null
+++ b/CudaMaxCutPlanted/main.cu
@@ -0,0 +1,636 @@
+//#include
+//#include
+#include "CudaSparseMatrix.hpp"
+//#include
+#include <iostream>
+#include <vector>
+#include <random>
+#include <algorithm>
+#include <cmath>
+//#include
+//#include
+#include <cusparse.h>
+#include <curand_kernel.h>
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/tuple.h>
+//#include
+//#include
+//#include
+#include "indicators.hpp"
+#ifdef __INTELLISENSE__
+#include "intellisense_cuda_intrinsics.h"
+#endif
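+
+// Assumed to come from CudaSparseMatrix.hpp and its kernel headers: BLOCK_SIZE,
+// the CHECK_CUDA / CHECK_CUSPARSE macros, and the kernels launched below
+// (create_random_matrix, set_true_elements).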
+
+
+
+void scale_csr_matrix(cusparseHandle_t handle,
+ float alpha,
+ cusparseSpMatDescr_t& input,
+ cusparseSpMatDescr_t& result) {
+ // Extract the dimensions and the number of non-zero elements
+ int64_t n, nnz;
+ int* d_csrOffsets;
+ int* d_cols;
+ float* d_vals;
+
+ int64_t n_r, nnz_r;
+ int* d_csrOffsets_r;
+ int* d_cols_r;
+ float* d_vals_r;
+
+ cusparseIndexType_t csrRowOffsetsType;
+ cusparseIndexType_t csrColIndType;
+ cusparseIndexBase_t idxBase;
+ cudaDataType valueTyp;
+
+    CHECK_CUSPARSE(cusparseCsrGet(input, &n, &n, &nnz, (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp));
+    CHECK_CUSPARSE(cusparseCsrGet(result, &n_r, &n_r, &nnz_r, (void**)&d_csrOffsets_r, (void**)&d_cols_r, (void**)&d_vals_r, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp));
+
+ // Set scaling factors
+ const float beta = 0.0f;
+ size_t bufferSize = 0;
+
+    cusparseMatDescr_t input_desc;
+    CHECK_CUSPARSE(cusparseCreateMatDescr(&input_desc));
+
+    // Create matrix descriptor for the result matrix C
+    cusparseMatDescr_t result_desc;
+    CHECK_CUSPARSE(cusparseCreateMatDescr(&result_desc));
+
+    // Get buffer size for the operation
+    CHECK_CUSPARSE(cusparseScsrgeam2_bufferSizeExt(handle,
+        n, n,
+        &alpha, input_desc, nnz,
+        d_vals,
+        d_csrOffsets,
+        d_cols,
+        &beta, input_desc, nnz,
+        d_vals,
+        d_csrOffsets,
+        d_cols,
+        result_desc,
+        d_vals_r,
+        d_csrOffsets_r,
+        d_cols_r,
+        &bufferSize));
+
+    void* dBuffer;
+    CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize));
+
+    // Perform the scaling operation
+    CHECK_CUSPARSE(cusparseScsrgeam2(handle,
+        n, n,
+        &alpha, input_desc, nnz,
+        d_vals,
+        d_csrOffsets,
+        d_cols,
+        &beta, input_desc, nnz,
+        d_vals,
+        d_csrOffsets,
+        d_cols,
+        result_desc,
+        d_vals_r,
+        d_csrOffsets_r,
+        d_cols_r,
+        dBuffer));
+    // Clean up
+    CHECK_CUDA(cudaFree(dBuffer));
+    CHECK_CUSPARSE(cusparseDestroyMatDescr(input_desc));
+    CHECK_CUSPARSE(cusparseDestroyMatDescr(result_desc));
+}
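+
+// scale_csr_matrix relies on cusparseScsrgeam2 computing C = alpha * A + beta * B;
+// with beta = 0 and B aliased to A, this reduces to a pure scaling. The `result`
+// descriptor must already own buffers for the same sparsity pattern, which is why
+// both matrices are fetched with cusparseCsrGet up front.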
+
+// Function to generate initial permutation
+int* generate_initial_permutation(std::mt19937& rng, int n) {
+    std::vector<int> permutation(n);
+ for (int i = 0; i < n; ++i) {
+ permutation[i] = i;
+ }
+ std::shuffle(permutation.begin(), permutation.end(), rng);
+
+ int* p;
+ CHECK_CUDA(cudaMalloc((void**)&p, n * sizeof(int)));
+ CHECK_CUDA(cudaMemcpy(p, permutation.data(), n * sizeof(int), cudaMemcpyHostToDevice));
+
+ return p;
+}
+
+
+void create_graph_sparse(int n, int nnz, int split, const int* p, int* I, int* J, float* V) {
+
+    curandState* states;
+    CHECK_CUDA(cudaMalloc((void**)&states, nnz * sizeof(curandState)));
+    int gridSize = (nnz + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+    create_random_matrix<<<gridSize, BLOCK_SIZE>>>(n, nnz, split, p, I, J, V, states);
+
+ CHECK_CUDA(cudaDeviceSynchronize());
+
+ // Wrap raw device pointers in thrust device pointers for sorting
+    thrust::device_ptr<int> dev_I(I);
+    thrust::device_ptr<int> dev_J(J);
+    thrust::device_ptr<float> dev_V(V);
+
+ // First, sort by the secondary key (J) using stable sort
+ thrust::stable_sort_by_key(dev_J, dev_J + nnz, thrust::make_zip_iterator(thrust::make_tuple(dev_I, dev_V)));
+
+ // Then, sort by the primary key (I) using stable sort to maintain the order of the secondary key
+ thrust::stable_sort_by_key(dev_I, dev_I + nnz, thrust::make_zip_iterator(thrust::make_tuple(dev_J, dev_V)));
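+
+    // Net effect of the two stable passes: entries end up in lexicographic (I, J)
+    // order, the row-major layout the COO -> CSR conversion is assumed to expect.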
+
+
+ int* h_rows = new int[nnz];
+ int* h_cols = new int[nnz];
+ float* h_vals = new float[nnz];
+
+
+ CHECK_CUDA(cudaMemcpy(h_rows, I, nnz * sizeof(int), cudaMemcpyDeviceToHost));
+ CHECK_CUDA(cudaMemcpy(h_cols, J, nnz * sizeof(int), cudaMemcpyDeviceToHost));
+ CHECK_CUDA(cudaMemcpy(h_vals, V, nnz * sizeof(float), cudaMemcpyDeviceToHost));
+
+ for (int i = 0; i < nnz; ++i) {
+ std::cout << "I: " << h_rows[i] << " J: " << h_cols[i] << " V: " << h_vals[i] << std::endl;
+ }
+
+ CHECK_CUDA(cudaFree(states));
+ delete[] h_rows;
+ delete[] h_cols;
+ delete[] h_vals;
+
+    //thrust::host_vector<int> h_rows, h_cols;
+    //thrust::host_vector<float> h_vals;
+
+ //thrust::default_random_engine rng;
+    //thrust::random::uniform_real_distribution<float> dist(0.01f, 1.0f);
+
+    //std::vector<std::tuple<int, int, float>> combined;
+
+ //for (int i = 0; i < split; ++i) {
+ // std::cout << "P_" << i << ":" << p[i] << std::endl;
+ // for (int j = split; j < n; ++j) {
+ // std::cout << "I" << ": " << i << " ";
+ // std::cout << "J" << ":" << j << std::endl;
+ // float rnd_val = dist(rng);
+ // combined.push_back(std::make_tuple(p[i], p[j], rnd_val));
+ // //combined.push_back(std::make_tuple(p[j], p[i], rnd_val));
+ // }
+ //}
+
+ //std::sort(combined.begin(), combined.end(), [](const auto& a, const auto& b) {
+ // if (std::get<0>(a) == std::get<0>(b)) {
+ // return std::get<1>(a) < std::get<1>(b);
+ // }
+ // return std::get<0>(a) < std::get<0>(b);
+ // });
+
+ //for (size_t i = 0; i < combined.size(); ++i) {
+ //
+ //
+ // h_rows.push_back(std::get<0>(combined[i]));
+ // h_cols.push_back(std::get<1>(combined[i]));
+ // h_vals.push_back(std::get<2>(combined[i]));
+ //}
+
+ //d_rows = h_rows;
+ //d_cols = h_cols;
+ //d_vals = h_vals;
+}
+
+char* generate_solution(const int* p, int split, int n) {
+ char* x;
+ CHECK_CUDA(cudaMalloc((void**)&x, n * sizeof(char)));
+    // Initialize the solution vector to all zeros
+    CHECK_CUDA(cudaMemset(x, 0, n * sizeof(char)));
+    int gridSize = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    set_true_elements<<<gridSize, BLOCK_SIZE>>>(split, p, x);
+ CHECK_CUDA(cudaDeviceSynchronize());
+
+ return x;
+}
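+
+// Intent sketch (the kernel body lives in the shared headers): set_true_elements
+// is assumed to mark x[p[i]] = 1 for i < split, so the first `split` entries of
+// the permutation form one side of the planted cut and every edge generated by
+// create_graph_sparse crosses it.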
+
+void graph_to_qubo(CudaSparseMatrix& Q) {
+    float* row_sum = Q.sum(0);   // row sums (weighted degrees) of the still-positive adjacency
+    Q.multiply(-1.0f);           // off-diagonal entries become -w_ij
+    Q.fill_diagonal(row_sum);    // diagonal becomes +d_i
+    Q.multiply(-0.25f);          // Q = (W - diag(d)) / 4, matching the reference kept below
+    /*scale_csr_matrix(handle, -1.0f, graph, Q);
+    thrust::device_vector<float> sum = sum_rows_csr_matrix(handle, graph);
+    fill_diagonal(handle, Q, sum, extended_pointers);
+    scale_csr_matrix(handle, -0.25f, Q, Q);
+
+    for (int i = 0; i < sum.size(); i++) {
+        std::cout << sum[i] << std::endl;
+    }*/
+}
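+
+// Mapping sketch (standard Max-Cut -> QUBO, assuming symmetric weights W with
+// weighted degrees d = W * 1): for x in {0,1}^n,
+//     cut(x) = (1/2) * sum_{i,j} w_ij * (x_i + x_j - 2 * x_i * x_j) = -x^T (W - diag(d)) x,
+// so minimizing x^T Q x with Q proportional to W - diag(d) maximizes the cut; the
+// -0.25f factor only rescales energies and leaves the minimizer unchanged.
+// A minimal host-side cross-check for small instances (hypothetical helper, not
+// wired into the build; assumes host-resident COO arrays rowsH/colsH/valsH of Q):
+static inline float qubo_energy_ref(const int* rowsH, const int* colsH,
+                                    const float* valsH, int nnz, const char* x) {
+    float e = 0.0f;
+    for (int k = 0; k < nnz; ++k) {
+        // x is 0/1, so diagonal terms contribute x_i and off-diagonal terms x_i * x_j
+        e += valsH[k] * x[rowsH[k]] * x[colsH[k]];
+    }
+    return e;
+}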
+
+//
+//float qubo_eng(cublasHandle_t cublasHandle,
+// cusparseHandle_t handle,
+// const cusparseSpMatDescr_t& Q,
+// float* sol_vector) {
+// int64_t rows, cols, nnz;
+// int* d_csrOffsets, * d_cols;
+// float* d_vals;
+// int* dA_csrOffsets, * dA_columns;
+// float* dA_values;
+//
+// cusparseIndexType_t csrRowOffsetsType;
+// cusparseIndexType_t csrColIndType;
+// cusparseIndexBase_t idxBase;
+// cudaDataType valueTyp;
+//
+// CHECK_CUSPARSE(cusparseSpMatGetSize(Q, &rows, &cols, &nnz));
+//
+// CHECK_CUDA(cudaMalloc((void**)&d_csrOffsets, (rows + 1) * sizeof(int)));
+// CHECK_CUDA(cudaMalloc((void**)&d_cols, nnz * sizeof(int)));
+// CHECK_CUDA(cudaMalloc((void**)&d_vals, nnz * sizeof(float)));
+//
+// CHECK_CUSPARSE(cusparseCsrGet(Q, &rows, &cols, &nnz,
+// (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals,
+// &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp));
+// // Host problem definition
+// const int A_num_rows = rows;
+// const int A_num_cols = cols;
+// const int A_nnz = nnz;
+// //int h_test[11];
+// //int* h_test = (int*)calloc(rows+1, sizeof(int));
+// float alpha = 1.0f;
+// float beta = 0.0f;
+// //CHECK_CUDA(cudaMemcpy(h_test, d_csrOffsets, (rows + 1) * sizeof(int), cudaMemcpyDeviceToHost));
+// ////cudaMemcpy(h_test, d_csrOffsets, (A_num_rows + 1) * sizeof(int), cudaMemcpyDeviceToHost);
+// ////CHECK_CUDA(cudaMemcpy(hCsrRowOffsets_toverify, dCsrRowOffsets_toverify, (rows + 1) * sizeof(int), cudaMemcpyDeviceToHost));
+// //for (int i = 0; i < A_num_rows+1; i++) {
+// // std::cout << h_test[i] << std::endl;
+// //}
+// //--------------------------------------------------------------------------
+// // Device memory management
+// float* dX, * dY;
+// CHECK_CUDA(cudaMalloc((void**)&dA_csrOffsets, (A_num_rows + 1) * sizeof(int)));
+// CHECK_CUDA(cudaMalloc((void**)&dA_columns, A_nnz * sizeof(int)));
+// CHECK_CUDA(cudaMalloc((void**)&dA_values, A_nnz * sizeof(float)));
+// CHECK_CUDA(cudaMalloc((void**)&dX, A_num_cols * sizeof(float)));
+// CHECK_CUDA(cudaMalloc((void**)&dY, A_num_rows * sizeof(float)));
+//
+// CHECK_CUDA(cudaMemcpy(dA_csrOffsets, d_csrOffsets, (A_num_rows + 1) * sizeof(int), cudaMemcpyDeviceToDevice));
+// CHECK_CUDA(cudaMemcpy(dA_columns, d_cols, A_nnz * sizeof(int), cudaMemcpyDeviceToDevice));
+// CHECK_CUDA(cudaMemcpy(dA_values, d_vals, A_nnz * sizeof(float), cudaMemcpyDeviceToDevice));
+// CHECK_CUDA(cudaMemcpy(dX, sol_vector, A_num_cols * sizeof(float), cudaMemcpyDeviceToDevice));
+// cusparseSpMatDescr_t matA;
+// cusparseDnVecDescr_t vecX, vecY;
+// void* dBuffer = NULL;
+// size_t bufferSize = 0;
+// CHECK_CUSPARSE(cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_nnz,
+// dA_csrOffsets, dA_columns, dA_values,
+// CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+// CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
+// // Create dense vector X
+// CHECK_CUSPARSE(cusparseCreateDnVec(&vecX, A_num_cols, dX, CUDA_R_32F));
+// // Create dense vector y
+// CHECK_CUSPARSE(cusparseCreateDnVec(&vecY, A_num_rows, dY, CUDA_R_32F));
+// // allocate an external buffer if needed
+// CHECK_CUSPARSE(cusparseSpMV_bufferSize(
+// handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+// &alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
+// CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize));
+// CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize));
+//
+// // execute SpMV
+// CHECK_CUSPARSE(cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+// &alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
+// CUSPARSE_SPMV_ALG_DEFAULT, dBuffer));
+//
+// // destroy matrix/vector descriptors
+// CHECK_CUSPARSE(cusparseDestroySpMat(matA));
+// CHECK_CUSPARSE(cusparseDestroyDnVec(vecX));
+// CHECK_CUSPARSE(cusparseDestroyDnVec(vecY));
+//
+// /*CHECK_CUDA(cudaMemcpy(hY, dY, A_num_rows * sizeof(float), cudaMemcpyDeviceToHost));
+// std::cout << "Result of Q*x: " << std::endl;
+// for (int i = 0; i < A_num_rows; i++) {
+// std::cout << hY[i] << std::endl;
+// }*/
+//
+// float result;
+// CHECK_CUBLAS(cublasSdot(cublasHandle, A_num_rows, dX, 1, dY, 1, &result));
+// //std::cout << "Energy: " << result << std::endl;
+// return result;
+//}
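+// NOTE: if this routine is ever revived, it leaks device memory: the buffers
+// cudaMalloc'd into d_csrOffsets/d_cols/d_vals are immediately overwritten by the
+// pointers cusparseCsrGet returns, and dA_csrOffsets, dA_columns, dA_values, dX,
+// dY, and dBuffer are never freed.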
+
+//float calculate_qubo_energy(cublasHandle_t cublasHandle,
+// cusparseHandle_t cusparseHandle,
+// int n,
+// const cusparseSpMatDescr_t& Q,
+//                            thrust::device_vector<float>& x) {
+// float alpha = 1.0f;
+// float beta = 0.0f;
+// size_t bufferSize = 0;
+// void* dBuffer = nullptr;
+//
+// // Create dense vector descriptors
+// cusparseDnVecDescr_t vecX, vecY;
+// float* Qx;
+// float* Qx_ptr;
+// //float* hY = (float*)calloc(n, sizeof(float));
+// float* in_x;
+//
+// /* for (int i = 0; i < n; i++) {
+// hY[i] = 0.0f;
+// }*/
+//
+// CHECK_CUDA(cudaMalloc((void**)&Qx, n * sizeof(float)));
+// CHECK_CUDA(cudaMalloc((void**)&Qx_ptr, n * sizeof(float)));
+// CHECK_CUDA(cudaMalloc((void**)&in_x, n * sizeof(float)));
+//
+// CHECK_CUDA(cudaMemcpy(in_x, thrust::raw_pointer_cast(x.data()), n * sizeof(float), cudaMemcpyDeviceToDevice));
+// //CHECK_CUDA(cudaMemcpy(Qx, hY, n * sizeof(float), cudaMemcpyHostToDevice));
+//
+// CHECK_CUSPARSE(cusparseCreateDnVec(&vecX, n, in_x, CUDA_R_32F));
+// CHECK_CUSPARSE(cusparseCreateDnVec(&vecY, n, Qx, CUDA_R_32F));
+//
+// // Allocate buffer
+// CHECK_CUSPARSE(cusparseSpMV_bufferSize(cusparseHandle,
+// CUSPARSE_OPERATION_NON_TRANSPOSE,
+// &alpha,
+// Q,
+// vecX,
+// &beta,
+// vecY,
+// CUDA_R_32F,
+// CUSPARSE_SPMV_ALG_DEFAULT,
+// &bufferSize));
+// CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize));
+//
+// // Perform Q * x
+// CHECK_CUSPARSE(cusparseSpMV(cusparseHandle,
+// CUSPARSE_OPERATION_NON_TRANSPOSE,
+// &alpha,
+// Q,
+// vecX,
+// &beta,
+// vecY,
+// CUDA_R_32F,
+// CUSPARSE_SPMV_ALG_DEFAULT,
+// dBuffer));
+//
+// // Extract the raw pointers to the data in the dense vector descriptors
+// //float* Qx_ptr;
+// //CHECK_CUSPARSE(cusparseDnVecGetValues(vecY, (void**)&Qx_ptr));
+// // Ensure the computation is complete before accessing the result
+// CHECK_CUDA(cudaDeviceSynchronize());
+//
+// /*CHECK_CUDA(cudaMemcpy(hY, Qx, n * sizeof(float), cudaMemcpyDeviceToHost));
+// for (int i = 0; i < n; i++) {
+// std::cout << hY[i] << std::endl;
+// }*/
+//
+// // Compute the dot product x^T * (Q * x) using cuBLAS
+// float result;
+// CHECK_CUBLAS(cublasSdot(cublasHandle, n, in_x, 1, Qx, 1, &result));
+//
+// // Clean up
+// CHECK_CUSPARSE(cusparseDestroyDnVec(vecX));
+// CHECK_CUSPARSE(cusparseDestroyDnVec(vecY));
+// CHECK_CUDA(cudaFree(dBuffer));
+// CHECK_CUDA(cudaFree(Qx));
+// CHECK_CUDA(cudaFree(in_x));
+//
+// return result;
+//}
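+// The energy above follows the usual two-step pattern for E = x^T Q x: one
+// cusparseSpMV for y = Q * x, then cublasSdot for the dot product x . y. Note
+// that Qx_ptr is allocated but never used or freed in this commented version.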
+//
+//void brute_force_solutions(cublasHandle_t cublasHandle,
+// cusparseHandle_t handle,
+// int n,
+// const cusparseSpMatDescr_t& Q) {
+// indicators::show_console_cursor(false);
+// int num_solutions = 1 << n; // 2^n
+//    thrust::device_vector<float> d_solutions(num_solutions * n);
+// /*
+// for (int i = 0; i < num_solutions; ++i) {
+// for (int j = 0; j < n; ++j) {
+//            float val = static_cast<float>((i & (1 << j)) != 0);
+// d_solutions[i * n + j] = val;
+// }
+// }*/
+//
+// //convert_sparse_to_dense_and_display(handle, Q, n);
+// // Allocate memory for energies
+//    thrust::device_vector<float> d_energies(num_solutions);
+//    thrust::device_vector<float> sol_x(n);
+//
+// indicators::ProgressBar bar{
+// indicators::option::BarWidth{40},
+// indicators::option::Start{"["},
+// indicators::option::Fill{"="},
+// indicators::option::Lead{">"},
+// indicators::option::Remainder{" "},
+// indicators::option::End{" ]"},
+// indicators::option::ForegroundColor{indicators::Color::white},
+// indicators::option::FontStyles{
+//            std::vector<indicators::FontStyle>{indicators::FontStyle::bold}},
+// indicators::option::MaxProgress{num_solutions}
+// };
+// int smallest_ind = -1;
+// float smallest_eng = 1000;
+//
+// for (int i = 0; i < num_solutions; ++i) {
+// for (int j = 0; j < n; ++j) {
+//            float val = static_cast<float>((i & (1 << j)) != 0);
+// sol_x[j] = val;
+// d_solutions[i * n + j] = val;
+// }
+// float eng = qubo_eng(cublasHandle, handle, Q, thrust::raw_pointer_cast(sol_x.data()));
+// //float eng = calculate_qubo_energy(cublasHandle, handle, n, Q, sol_x);
+// d_energies[i] = eng;
+// if (eng <= smallest_eng) {
+// smallest_ind = i;
+// smallest_eng = eng;
+// }
+// //std::cout << "Energy i: " << eng << std::endl;
+// bar.set_option(indicators::option::PostfixText{ std::to_string(i) + "/" + std::to_string(num_solutions) });
+// bar.tick();
+// }
+// std::cout << "Tested all solutions, now sorting." << std::endl;
+// // Find the minimum energy
+// auto min_energy_iter = thrust::min_element(d_energies.begin(), d_energies.end());
+// float min_energy = *min_energy_iter;
+// int min_index = min_energy_iter - d_energies.begin();
+//
+// // Copy the best solution to host
+//    thrust::host_vector<float> h_best_solution(n);
+// thrust::copy(d_solutions.begin() + smallest_ind * n, d_solutions.begin() + (smallest_ind + 1) * n, h_best_solution.begin());
+//
+// // Print the result
+// std::cout << "Best energy my: " << smallest_eng << " and id: " << smallest_ind << std::endl;
+// std::cout << "Best energy: " << min_energy << " and id: " << min_index << std::endl;
+// std::cout << "Best solution: ";
+// for (float bit : h_best_solution) {
+// std::cout << bit << " ";
+// }
+// std::cout << std::endl;
+// convert_sparse_to_dense_and_display(handle, Q, n);
+//}
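+// NOTE: the commented brute force seeds smallest_eng with 1000, which silently
+// fails if every energy exceeds 1000 (FLT_MAX would be a safer sentinel), and the
+// 2^n enumeration is only feasible for small n such as the n = 12 used below.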
+
+int estimate_split(float density, int n) {
+    if (density > 0.5f) {
+        std::cerr << "Error: density cannot exceed 0.5!" << std::endl;
+    }
+
+    float sparsity = 1.0f - density;
+    float inside = 2.0f * sparsity - 1.0f;   // 1 - 2*density; negative when density > 0.5
+    float smaller_set_size = 0.5f * n * (1.0f + sqrtf(inside));
+    return static_cast<int>(smaller_set_size) - 1;
+}
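+
+// Derivation sketch (assuming `density` is measured against n^2 / 2 pairs): every
+// edge of the planted instance crosses the cut, so a split of size s gives
+// nnz = s * (n - s). Setting s * (n - s) = density * n^2 / 2 and solving
+//     s^2 - n * s + density * n^2 / 2 = 0
+// yields s = (n / 2) * (1 +/- sqrt(1 - 2 * density)); the code takes the + root
+// and subtracts 1, and the discriminant 1 - 2 * density is why density > 0.5 is
+// rejected above.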
+
+
+int main() {
+ int n = 12;
+ int seed = 14;
+    float density = 0.5f;
+ int* I, * J;
+ float* V;
+ std::mt19937 rng(seed);
+
+ int* p = generate_initial_permutation(rng, n);
+
+ int split = estimate_split(density, n); // Example split, can be computed as needed
+    int nnz = split * (n - split);
+ std::cout << "Split: " << split << " nnz: " << nnz << std::endl;
+
+ CHECK_CUDA(cudaMalloc((void**)&I, nnz * sizeof(int)));
+ CHECK_CUDA(cudaMalloc((void**)&J, nnz * sizeof(int)));
+ CHECK_CUDA(cudaMalloc((void**)&V, nnz * sizeof(float)));
+
+ create_graph_sparse(n, nnz, split, p, I, J, V);
+
+ CudaSparseMatrix graph = CudaSparseMatrix(I, J, V, n, nnz, SparseType::COO, MemoryType::Device);
+ graph.display();
+
+ char* x = generate_solution(p, split, n);
+
+ char* h_x = new char[n];
+
+ CHECK_CUDA(cudaMemcpy(h_x, x, n * sizeof(char), cudaMemcpyDeviceToHost));
+
+ for (int i = 0; i < n; i++)
+ {
+        std::cout << "X_" << i << " " << static_cast<int>(h_x[i]) << std::endl;
+ }
+
+ CudaSparseMatrix Q = CudaSparseMatrix(graph);
+ graph.clear();
+
+ graph_to_qubo(Q);
+
+ Q.display();
+
+    CHECK_CUDA(cudaFree(I));
+    CHECK_CUDA(cudaFree(J));
+    CHECK_CUDA(cudaFree(V));
+    CHECK_CUDA(cudaFree(p));
+    CHECK_CUDA(cudaFree(x));
+
+ delete[] h_x;
+
+ //std::cout << "Graph created.";
+ //// Print the result (for debugging purposes)
+    //thrust::host_vector<int> h_rows = d_rows;
+    //thrust::host_vector<int> h_cols = d_cols;
+    //thrust::host_vector<float> h_vals = d_vals;
+    //thrust::host_vector<float> h_x = d_x;
+ //int print_size = 2 * n;
+ //std::cout << "Rows: ";
+ //for (int i = 0; i < print_size; ++i) {
+ // std::cout << h_rows[i] << " ";
+ //}
+ //std::cout << std::endl;
+
+ //std::cout << "Cols: ";
+ //for (int i = 0; i < print_size; ++i) {
+ // std::cout << h_cols[i] << " ";
+ //}
+ //std::cout << std::endl;
+
+ //std::cout << "Vals: ";
+ //for (int i = 0; i < print_size; ++i) {
+ // std::cout << h_vals[i] << " ";
+ //}
+ //std::cout << std::endl;
+
+ //std::cout << "Solution: ";
+ //for (int i = 0; i < n; ++i) {
+ // std::cout << h_x[i] << " ";
+ //}
+ //std::cout << std::endl;
+
+ //cusparseHandle_t handle;
+ //cublasHandle_t cublasHandle;
+ //cusparseSpMatDescr_t graph_csr, Q;
+
+ //// Initialize cuSPARSE
+ //cublasCreate(&cublasHandle);
+ //CHECK_CUSPARSE(cusparseCreate(&handle));
+ //int nnz = d_vals.size();
+    //thrust::device_vector<int> d_csrOffsets(n + 1);
+ //cusparseXcoo2csr(handle,
+ // thrust::raw_pointer_cast(d_rows.data()),
+ // nnz,
+ // n,
+ // thrust::raw_pointer_cast(d_csrOffsets.data()),
+ // CUSPARSE_INDEX_BASE_ZERO);
+
+ //CHECK_CUSPARSE(cusparseCreateCsr(&graph_csr,
+ // n,
+ // n,
+ // nnz,
+ // thrust::raw_pointer_cast(d_csrOffsets.data()),
+ // thrust::raw_pointer_cast(d_cols.data()), // column indices
+ // thrust::raw_pointer_cast(d_vals.data()), // values
+ // CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+ // CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
+
+    //thrust::device_vector<float> d_QVals(nnz);
+ //CHECK_CUSPARSE(cusparseCreateCsr(&Q,
+ // n,
+ // n,
+ // nnz,
+ // thrust::raw_pointer_cast(d_csrOffsets.data()),
+ // thrust::raw_pointer_cast(d_cols.data()), // column indices
+ // thrust::raw_pointer_cast(d_QVals.data()), // values
+ // CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+ // CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
+ //// Now matDescr can be used in further cuSPARSE operations
+
+ //int* newRowsPtr;
+ //int* newCols;
+ //float* newVals;
+ //csr_data extended_sparse_data;
+
+ //CHECK_CUDA(cudaMalloc((void**)&newRowsPtr, (n + 1) * sizeof(int)));
+ //CHECK_CUDA(cudaMalloc((void**)&newCols, (nnz + n) * sizeof(int)));
+ //CHECK_CUDA(cudaMalloc((void**)&newVals, (nnz + n) * sizeof(float)));
+
+ //extended_sparse_data.rowPointer = newRowsPtr;
+ //extended_sparse_data.cols = newCols;
+ //extended_sparse_data.vals = newVals;
+
+
+ //std::cout << "Sparse matrix created successfully!" << std::endl;
+ //graph_to_qubo(handle, graph_csr, Q, extended_sparse_data);
+ ///*convert_sparse_to_dense_and_display(handle, graph_csr, n); */
+ ////convert_sparse_to_dense_and_display(handle, Q, n);
+
+ //float planted_energy = qubo_eng(cublasHandle, handle, Q, thrust::raw_pointer_cast(d_x.data()));
+
+ //std::cout << "Qubo energy of planted solution: " << planted_energy << std::endl;
+
+ ///*float energy = calculate_qubo_energy(cublasHandle, handle, n, Q, d_x);
+ //std::cout << "Qubo energy: " << energy << std::endl;*/
+ //brute_force_solutions(cublasHandle, handle, n, Q);
+
+ //// Clean up
+ //CHECK_CUSPARSE(cusparseDestroySpMat(graph_csr));
+ //CHECK_CUSPARSE(cusparseDestroySpMat(Q));
+ //CHECK_CUSPARSE(cusparseDestroy(handle));
+
+ return 0;
+};