diff --git a/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj b/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj
index abe6491..3838150 100644
--- a/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj
+++ b/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj
@@ -92,13 +92,14 @@
-
+
+
-
+
diff --git a/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj.filters b/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj.filters
index fea601e..5034b1c 100644
--- a/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj.filters
+++ b/CudaMaxCutPlanted/CudaMaxCutPlanted.vcxproj.filters
@@ -5,9 +5,10 @@
       Objects
-
+
       Objects
+
@@ -17,7 +18,7 @@
       Objects
-
+
       Objects
diff --git a/CudaMaxCutPlanted/CudaSparseMatrix.cu b/CudaMaxCutPlanted/CudaSparseMatrix.cu
index 5c1dd41..deefa06 100644
--- a/CudaMaxCutPlanted/CudaSparseMatrix.cu
+++ b/CudaMaxCutPlanted/CudaSparseMatrix.cu
@@ -5,30 +5,9 @@
 #include <iostream>
 #include <vector>
+#include <iomanip>
+#include <thrust/sort.h>
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/tuple.h>
-#include "SparseMatrixSumKernel.cuh"
-
-#define CHECK_CUDA(call) \
-{ \
-    cudaError_t err = call; \
-    if (err != cudaSuccess) { \
-        std::cerr << "CUDA error in file " << __FILE__ \
-                  << " at line " << __LINE__ << ": " \
-                  << cudaGetErrorString(err) << std::endl; \
-        exit(EXIT_FAILURE); \
-    } \
-}
-
-#define CHECK_CUSPARSE(call) \
-{ \
-    cusparseStatus_t err = call; \
-    if (err != CUSPARSE_STATUS_SUCCESS) { \
-        std::cerr << "CUSPARSE error in file " << __FILE__ \
-                  << " at line " << __LINE__ << ": " \
-                  << cusparseGetErrorString(err) << std::endl; \
-        exit(EXIT_FAILURE); \
-    } \
-}
 
 const char* cusparseGetErrorString(cusparseStatus_t status) {
     switch (status) {
@@ -62,7 +41,7 @@ CudaSparseMatrix::CudaSparseMatrix(int* I, int* J, float* V, int n, int nnz, Spa
 }
 
 CudaSparseMatrix::CudaSparseMatrix(const CudaSparseMatrix& other)
-    : n_(other.n_), nnz_(other.nnz_) {
+    : n_(other.n_), nnz_(other.nnz_), matDescr_(nullptr) {
     cusparseHandle_t& cusparseHandle_ = CusparseHandle::getInstance();
     CHECK_CUDA(cudaMalloc((void**)&d_csrOffsets_, (n_ + 1) * sizeof(int)));
     CHECK_CUDA(cudaMalloc((void**)&d_cols_, nnz_ * sizeof(int)));
@@ -79,21 +58,156 @@ CudaSparseMatrix::CudaSparseMatrix(const CudaSparseMatrix& other)
 }
 
 CudaSparseMatrix::~CudaSparseMatrix() {
-    CHECK_CUSPARSE(cusparseDestroySpMat(matDescr_));
-    CHECK_CUDA(cudaFree(d_csrOffsets_));
-    CHECK_CUDA(cudaFree(d_cols_));
-    CHECK_CUDA(cudaFree(d_vals_));
+    clear();
 }
 
 void CudaSparseMatrix::updateData(const int* rows, const int* cols, const float* vals, int new_nnz, SparseType sparseType, MemoryType memType)
 {
     nnz_ = new_nnz;
     CHECK_CUDA(cudaFree(d_cols_));
     CHECK_CUDA(cudaFree(d_vals_));
+    CHECK_CUSPARSE(cusparseDestroySpMat(matDescr_));
 
     CHECK_CUDA(cudaMalloc((void**)&d_cols_, nnz_ * sizeof(int)));
     CHECK_CUDA(cudaMalloc((void**)&d_vals_, nnz_ * sizeof(float)));
 
     allocateAndCopy(rows, cols, vals, sparseType, memType);
+
+    CHECK_CUSPARSE(cusparseCreateCsr(&matDescr_, n_, n_, nnz_,
+        d_csrOffsets_, d_cols_, d_vals_,
+        csr_row_ind_type_, csr_col_ind_type_,
+        index_base_, valueType_));
+}
+
+bool* CudaSparseMatrix::zero_elements_in_vector(const float* input_vect, int& zero_sum, int n) {
+    bool* zero_elements_vect;
+    int* d_zero_sum;
+
+    zero_sum = 0;
+
+    // Allocate memory on the device
+    CHECK_CUDA(cudaMalloc((void**)&zero_elements_vect, n * sizeof(bool)));
+    CHECK_CUDA(cudaMalloc((void**)&d_zero_sum, sizeof(int)));
+
+    // Initialize memory
+    CHECK_CUDA(cudaMemset(zero_elements_vect, 0, n * sizeof(bool)));
+    CHECK_CUDA(cudaMemset(d_zero_sum, 0, sizeof(int)));
+
+    int gridSize = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    zero_elements<<<gridSize, BLOCK_SIZE>>>(input_vect, zero_elements_vect, d_zero_sum, n);
+    CHECK_CUDA(cudaDeviceSynchronize());
+
+    // Copy the result back to host
+    CHECK_CUDA(cudaMemcpy(&zero_sum, d_zero_sum, sizeof(int), cudaMemcpyDeviceToHost));
+
+    // Free device memory
+    CHECK_CUDA(cudaFree(d_zero_sum));
+
+    return zero_elements_vect;
+}
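Aside, not part of the patch: the flag-and-count pass above can also be expressed with Thrust's count_if, which drops the hand-rolled atomic counter (the flag vector would still need its own pass). A minimal sketch; the helper name and the 1e-6 threshold mirror the kernel above:

    #include <thrust/count.h>
    #include <thrust/device_ptr.h>
    #include <cmath>

    struct is_near_zero {
        __host__ __device__ bool operator()(float v) const {
            return fabsf(v) < 1e-6f;
        }
    };

    // Counts entries of an n-element device array that are numerically zero.
    int count_zeros(const float* d_vec, int n) {
        thrust::device_ptr<const float> p(d_vec);
        return static_cast<int>(thrust::count_if(p, p + n, is_near_zero()));
    }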
+
+void CudaSparseMatrix::fill_diagonal(const float* diagonal_vect)
+{
+    int nnz_sum = 0;
+    int zero_sum = 0;
+    int diag_nnz = n_;
+    int resize_n = diag_nnz;
+    // TODO: Flip to a non-zero vector and copy only the non-zero elements to the new vector
+    bool* zeros_in_diag = zero_elements_in_vector(diagonal_vect, zero_sum, n_);
+    bool* non_zero = non_zero_diagonal(nnz_sum);
+
+    bool* h_zeros_in_diag = new bool[n_];
+    CHECK_CUDA(cudaMemcpy(h_zeros_in_diag, zeros_in_diag, n_ * sizeof(bool), cudaMemcpyDeviceToHost));
+
+    for (int i = 0; i < n_; i++)
+    {
+        std::cout << "zero_in_diag_" << i << ": " << h_zeros_in_diag[i] << std::endl;
+    }
+    delete[] h_zeros_in_diag;
+
+    diag_nnz -= zero_sum;
+    resize_n = diag_nnz - nnz_sum;
+
+    int* original_I, * new_I, * new_J;
+    float* new_V;
+
+    CHECK_CUDA(cudaMalloc((void**)&original_I, nnz_ * sizeof(int)));
+    CHECK_CUDA(cudaMalloc((void**)&new_I, (nnz_ + resize_n) * sizeof(int)));
+    CHECK_CUDA(cudaMalloc((void**)&new_J, (nnz_ + resize_n) * sizeof(int)));
+    CHECK_CUDA(cudaMalloc((void**)&new_V, (nnz_ + resize_n) * sizeof(float)));
+
+    csrTorows(d_csrOffsets_, original_I, n_, nnz_, SparseType::CSR);
+
+    CHECK_CUDA(cudaMemcpy(new_I, original_I, nnz_ * sizeof(int), cudaMemcpyDeviceToDevice));
+    CHECK_CUDA(cudaMemcpy(new_J, d_cols_, nnz_ * sizeof(int), cudaMemcpyDeviceToDevice));
+    CHECK_CUDA(cudaMemcpy(new_V, d_vals_, nnz_ * sizeof(float), cudaMemcpyDeviceToDevice));
+
+    int gridSize = ((nnz_ + n_) + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    set_diagonal<<<gridSize, BLOCK_SIZE>>>(new_I, new_J, new_V, non_zero, diagonal_vect, nnz_, resize_n);
+    CHECK_CUDA(cudaDeviceSynchronize());
+
+    thrust::device_ptr<int> dev_I(new_I);
+    thrust::device_ptr<int> dev_J(new_J);
+    thrust::device_ptr<float> dev_V(new_V);
+
+    // First, sort by the secondary key (J) using a stable sort
+    thrust::stable_sort_by_key(dev_J, dev_J + (nnz_ + resize_n), thrust::make_zip_iterator(thrust::make_tuple(dev_I, dev_V)));
+
+    // Then sort by the primary key (I); the stable sort maintains the order of the secondary key
+    thrust::stable_sort_by_key(dev_I, dev_I + (nnz_ + resize_n), thrust::make_zip_iterator(thrust::make_tuple(dev_J, dev_V)));
+
+    float* h_new_V = new float[nnz_ + resize_n];
+    CHECK_CUDA(cudaMemcpy(h_new_V, new_V, (nnz_ + resize_n) * sizeof(float), cudaMemcpyDeviceToHost));
+
+    for (int i = 0; i < nnz_ + resize_n; i++)
+    {
+        std::cout << "Copied V_" << i << ": " << h_new_V[i] << std::endl;
+    }
+    delete[] h_new_V;
+
+    updateData(new_I, new_J, new_V, nnz_ + resize_n, SparseType::COO, MemoryType::Device);
+
+    CHECK_CUDA(cudaFree(zeros_in_diag));
+    CHECK_CUDA(cudaFree(non_zero));
+    CHECK_CUDA(cudaFree(original_I));
+    CHECK_CUDA(cudaFree(new_I));
+    CHECK_CUDA(cudaFree(new_J));
+    CHECK_CUDA(cudaFree(new_V));
+
+    std::cout << "Total non-zero elements on the diagonal: " << nnz_sum << std::endl;
+}
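The two stable sorts above implement a lexicographic (row-major) reorder of the COO triplets: sorting by the secondary key J first, then stably by the primary key I, leaves entries with equal I ordered by J. A self-contained toy example of the pattern (illustrative only, not part of the patch):

    #include <thrust/device_vector.h>
    #include <thrust/sort.h>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/tuple.h>
    #include <cstdio>

    int main() {
        // Unsorted COO triplets of a 3x3 matrix.
        int   h_I[] = { 2, 0, 1, 0 };
        int   h_J[] = { 0, 2, 1, 0 };
        float h_V[] = { 5.f, 3.f, 4.f, 1.f };
        thrust::device_vector<int>   I(h_I, h_I + 4), J(h_J, h_J + 4);
        thrust::device_vector<float> V(h_V, h_V + 4);

        // Pass 1: stable sort by the secondary key (column).
        thrust::stable_sort_by_key(J.begin(), J.end(),
            thrust::make_zip_iterator(thrust::make_tuple(I.begin(), V.begin())));
        // Pass 2: stable sort by the primary key (row); stability keeps
        // equal rows ordered by column, i.e. row-major COO order.
        thrust::stable_sort_by_key(I.begin(), I.end(),
            thrust::make_zip_iterator(thrust::make_tuple(J.begin(), V.begin())));

        for (int k = 0; k < 4; ++k)  // prints (0,0) (0,2) (1,1) (2,0)
            printf("(%d, %d) = %.1f\n", (int)I[k], (int)J[k], (float)V[k]);
        return 0;
    }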
+
+bool* CudaSparseMatrix::non_zero_diagonal(int& nnz_diag_sum)
+{
+    cusparseHandle_t& handle = CusparseHandle::getInstance();
+    bool* nnz_diag;
+    int* I;
+    int* d_nnz_diag_sum;
+    nnz_diag_sum = 0;
+
+    // Allocate memory on the device
+    CHECK_CUDA(cudaMalloc((void**)&d_nnz_diag_sum, sizeof(int)));
+
+    // Initialize memory
+    CHECK_CUDA(cudaMemset(d_nnz_diag_sum, 0, sizeof(int)));
+
+    CHECK_CUDA(cudaMalloc((void**)&I, nnz_ * sizeof(int)));
+    CHECK_CUSPARSE(cusparseXcsr2coo(handle,
+        d_csrOffsets_,
+        nnz_,
+        n_,
+        I,
+        CUSPARSE_INDEX_BASE_ZERO));
+
+    CHECK_CUDA(cudaMalloc((void**)&nnz_diag, n_ * sizeof(bool)));
+    CHECK_CUDA(cudaMemset(nnz_diag, 0, n_ * sizeof(bool)));
+
+    int gridSize = (nnz_ + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    non_zero_elements<<<gridSize, BLOCK_SIZE>>>(I, d_cols_, nnz_diag, d_nnz_diag_sum, nnz_);
+    CHECK_CUDA(cudaDeviceSynchronize());
+
+    CHECK_CUDA(cudaMemcpy(&nnz_diag_sum, d_nnz_diag_sum, sizeof(int), cudaMemcpyDeviceToHost));
+
+    // Free device memory
+    CHECK_CUDA(cudaFree(d_nnz_diag_sum));
+    CHECK_CUDA(cudaFree(I));
+
+    return nnz_diag;
+}
 
 CudaDenseVector CudaSparseMatrix::dot(const float* d_vec)
@@ -122,6 +236,63 @@ CudaDenseVector CudaSparseMatrix::dot(const float* d_vec)
     return result_vector;
 }
 
+void CudaSparseMatrix::multiply(float value)
+{
+    cusparseHandle_t& handle = CusparseHandle::getInstance();
+    // Set scaling factors
+    const float beta = 0.0f;
+    size_t bufferSize = 0;
+
+    cusparseMatDescr_t input_desc;
+    CHECK_CUSPARSE(cusparseCreateMatDescr(&input_desc));
+
+    // Create matrix descriptor for the result matrix C
+    cusparseMatDescr_t result_desc;
+    CHECK_CUSPARSE(cusparseCreateMatDescr(&result_desc));
+
+    // Get buffer size for the operation (B aliases A, since beta == 0)
+    CHECK_CUSPARSE(cusparseScsrgeam2_bufferSizeExt(handle,
+        n_, n_,
+        &value, input_desc, nnz_,
+        d_vals_,
+        d_csrOffsets_,
+        d_cols_,
+        &beta, input_desc, nnz_,
+        d_vals_,
+        d_csrOffsets_,
+        d_cols_,
+        result_desc,
+        d_vals_,
+        d_csrOffsets_,
+        d_cols_,
+        &bufferSize));
+
+    void* dBuffer;
+    CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize));
+
+    // Perform the scaling operation: A := value * A + 0 * A, written in place
+    CHECK_CUSPARSE(cusparseScsrgeam2(handle,
+        n_, n_,
+        &value, input_desc, nnz_,
+        d_vals_,
+        d_csrOffsets_,
+        d_cols_,
+        &beta, input_desc, nnz_,
+        d_vals_,
+        d_csrOffsets_,
+        d_cols_,
+        result_desc,
+        d_vals_,
+        d_csrOffsets_,
+        d_cols_,
+        dBuffer));
+
+    // Clean up
+    CHECK_CUDA(cudaFree(dBuffer));
+    CHECK_CUSPARSE(cusparseDestroyMatDescr(input_desc));
+    CHECK_CUSPARSE(cusparseDestroyMatDescr(result_desc));
+}
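Because beta is 0 and B aliases A, the csrgeam2 call above amounts to an in-place rescale of the values array. A lighter alternative (sketch, not part of the patch) transforms d_vals_ directly and needs neither descriptors nor a workspace buffer; the trade-off is that csrgeam2 generalizes to true A + B sums later:

    #include <thrust/device_ptr.h>
    #include <thrust/transform.h>

    struct scale_by {
        float s;
        __host__ __device__ float operator()(float v) const { return s * v; }
    };

    // In-place scale of the nnz CSR values; the sparsity pattern is untouched.
    void scale_csr_values(float* d_vals, int nnz, float value) {
        thrust::device_ptr<float> v(d_vals);
        thrust::transform(v, v + nnz, v, scale_by{ value });
    }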
 void CudaSparseMatrix::allocateAndCopy(const int* rows, const int* cols, const float* vals, SparseType sparseType, MemoryType memType)
 {
     cudaMemcpyKind copyType;
@@ -167,6 +338,19 @@ void CudaSparseMatrix::rowsToCsr(const int* d_rows, int* d_csr_offset, int n, in
 }
 
+void CudaSparseMatrix::csrTorows(const int* d_csr_offset, int* d_rows, int n, int nnz, SparseType sparseType)
+{
+    if (sparseType == SparseType::CSR) {
+        cusparseHandle_t& handle = CusparseHandle::getInstance();
+        CHECK_CUSPARSE(cusparseXcsr2coo(handle,
+            d_csr_offset,
+            nnz,
+            n,
+            d_rows,
+            CUSPARSE_INDEX_BASE_ZERO));
+    }
+}
+
 float* CudaSparseMatrix::sumRows()
 {
     cusparseHandle_t& handle = CusparseHandle::getInstance();
@@ -179,7 +363,8 @@ float* CudaSparseMatrix::sumRows()
     CHECK_CUDA(cudaMalloc((void**)&cscRowInd, nnz_ * sizeof(int)));
     CHECK_CUDA(cudaMalloc((void**)&cscVal, nnz_ * sizeof(float)));
 
-    CHECK_CUDA(cudaMemset((void*)diagonal, 0, nnz_ * sizeof(float)));
+    CHECK_CUDA(cudaMalloc((void**)&diagonal, n_ * sizeof(float)));
+    CHECK_CUDA(cudaMemset((void*)diagonal, 0, n_ * sizeof(float)));
 
     cusparseCsr2cscEx2_bufferSize(handle,
         n_, n_, nnz_,
@@ -201,10 +386,9 @@ float* CudaSparseMatrix::sumRows()
     // Clean up
     cudaFree(dBuffer);
 
-    int blockSize = 512;
-    int gridSize = (nnz_ + blockSize - 1) / blockSize;
+    int gridSize = (nnz_ + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
-    sum_axis<<<gridSize, blockSize>>>(nnz_, cscRowInd, cscVal, diagonal);
+    sum_axis<<<gridSize, BLOCK_SIZE>>>(nnz_, cscRowInd, cscVal, diagonal);
 
     CHECK_CUDA(cudaFree(cscColPtr));
     CHECK_CUDA(cudaFree(cscRowInd));
@@ -218,7 +402,8 @@ float* CudaSparseMatrix::sumCols()
     cusparseHandle_t& handle = CusparseHandle::getInstance();
     float* diagonal;
 
-    CHECK_CUDA(cudaMemset((void*)diagonal, 0, nnz_ * sizeof(float)));
+    CHECK_CUDA(cudaMalloc((void**)&diagonal, n_ * sizeof(float)));
+    CHECK_CUDA(cudaMemset((void*)diagonal, 0, n_ * sizeof(float)));
 
     int blockSize = 512;
     int gridSize = (nnz_ + blockSize - 1) / blockSize;
@@ -230,6 +415,14 @@ float* CudaSparseMatrix::sumCols()
 
 float* CudaSparseMatrix::sum(int axis)
 {
+    if (axis == 0) {
+        return sumRows();
+    }
+
+    if (axis == 1) {
+        return sumCols();
+    }
+
     return nullptr;
 }
 
@@ -274,7 +467,7 @@ void CudaSparseMatrix::display()
     std::cout << "Dense matrix:" << std::endl;
     for (int i = 0; i < n_; ++i) {
         for (int j = 0; j < n_; ++j) {
-            std::cout << h_denseMat[i * n_ + j] << " ";
+            std::cout << std::setw(7) << h_denseMat[i * n_ + j] << " ";
         }
         std::cout << std::endl;
     }
@@ -295,6 +488,33 @@ int CudaSparseMatrix::size() const
     return n_;
 }
 
+void CudaSparseMatrix::clear()
+{
+    if (d_csrOffsets_) {
+        CHECK_CUDA(cudaFree(d_csrOffsets_));
+        std::cout << "d_csrOffsets_ cleared" << std::endl;
+        d_csrOffsets_ = nullptr;
+    }
+    if (d_cols_) {
+        CHECK_CUDA(cudaFree(d_cols_));
+        std::cout << "d_cols_ cleared" << std::endl;
+        d_cols_ = nullptr;
+    }
+    if (d_vals_) {
+        CHECK_CUDA(cudaFree(d_vals_));
+        std::cout << "d_vals_ cleared" << std::endl;
+        d_vals_ = nullptr;
+    }
+    if (matDescr_) {
+        CHECK_CUSPARSE(cusparseDestroySpMat(matDescr_));
+        std::cout << "matDescr_ cleared" << std::endl;
+        matDescr_ = nullptr;
+    }
+
+    nnz_ = 0;
+    n_ = 0;
+}
+
 CudaDenseVector::CudaDenseVector(int size, const float* V, MemoryType memType): size_(size)
 {
     cudaMemcpyKind copyType = memType == MemoryType::Host ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
diff --git a/CudaMaxCutPlanted/CudaSparseMatrix.hpp b/CudaMaxCutPlanted/CudaSparseMatrix.hpp
index de410b1..047c64e 100644
--- a/CudaMaxCutPlanted/CudaSparseMatrix.hpp
+++ b/CudaMaxCutPlanted/CudaSparseMatrix.hpp
@@ -4,6 +4,30 @@
 #include <cuda_runtime.h>
 #include <cusparse.h>
 #include <iostream>
+#include "Kernels.cuh"
+
+
+#define CHECK_CUDA(call) \
+{ \
+    cudaError_t err = call; \
+    if (err != cudaSuccess) { \
+        std::cerr << "CUDA error in file " << __FILE__ \
+                  << " at line " << __LINE__ << ": " \
+                  << cudaGetErrorString(err) << std::endl; \
+        exit(EXIT_FAILURE); \
+    } \
+}
+
+const char* cusparseGetErrorString(cusparseStatus_t status);
+
+#define CHECK_CUSPARSE(call) \
+{ \
+    cusparseStatus_t err = call; \
+    if (err != CUSPARSE_STATUS_SUCCESS) { \
+        std::cerr << "CUSPARSE error in file " << __FILE__ \
+                  << " at line " << __LINE__ << ": " \
+                  << cusparseGetErrorString(err) << std::endl; \
+        exit(EXIT_FAILURE); \
+    } \
+}
 
 enum class MemoryType {
     Host,
@@ -59,11 +83,15 @@ class CudaSparseMatrix {
     ~CudaSparseMatrix();
 
     void updateData(const int* rows, const int* cols, const float* vals, int new_nnz, SparseType sparseType, MemoryType memType);
+    void fill_diagonal(const float* diagonal_vect);
+    bool* non_zero_diagonal(int& nnz_diag_sum);
     CudaDenseVector dot(const float* d_vec);
+    void multiply(float value);
     float* sum(int axis);
     void display();
     int getNnz() const;
     int size() const;
+    void clear();
 
     // Other useful methods can be added here
 
@@ -80,6 +108,8 @@ class CudaSparseMatrix {
     cudaDataType valueType_ = CUDA_R_32F;
     void allocateAndCopy(const int* rows, const int* cols, const float* vals, SparseType sparseType, MemoryType memType);
     void rowsToCsr(const int* d_rows, int* d_csr_offset, int n, int nnz, SparseType sparseType);
+    void csrTorows(const int* d_csr_offset, int* d_rows, int n, int nnz, SparseType sparseType);
+    bool* zero_elements_in_vector(const float* input_vect, int& zero_sum, int n);
     float* sumRows();
     float* sumCols();
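A hypothetical end-to-end driver for the API declared above (the driver itself and the caller-frees convention for sum()'s device pointer are assumptions, not code from this change-set); it mirrors the graph-to-QUBO flow in kernel.cu: scale, row-sum, then splice the sums onto the diagonal:

    #include "CudaSparseMatrix.hpp"

    int main() {
        // 3x3 matrix with one symmetric off-diagonal pair, given in COO form.
        int   I[] = { 0, 1 };
        int   J[] = { 1, 0 };
        float V[] = { 2.0f, 2.0f };

        CudaSparseMatrix A(I, J, V, /*n=*/3, /*nnz=*/2,
                           SparseType::COO, MemoryType::Host);

        A.multiply(-1.0f);            // in-place scale of the stored values
        float* d_rowSums = A.sum(0);  // device array of n row sums
        A.fill_diagonal(d_rowSums);   // write the sums onto the diagonal
        A.display();

        CHECK_CUDA(cudaFree(d_rowSums));
        return 0;
    }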
+#include "Kernels.cuh" + +#define NUM_BANKS 16 +#define LOG_NUM_BANKS 4 +#define CONFLICT_FREE_OFFSET(n) ((n) >> LOG_NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS)) + +/// +/// This function sums either rows for matrix in csc format or columns for matrix in csr format +/// +/// Number on non zero elements +/// Array with nnz elements that is NOT in compressed format +/// Array with values +/// Output array that will contain output of summed rows/cols +/// +__global__ void sum_axis(int nnz, const int* d_non_offset_axis_ind, const float* d_vals, float* d_axis_sum) { + int element_ind = blockIdx.x * blockDim.x + threadIdx.x; + if (element_ind < nnz) { + int idx = d_non_offset_axis_ind[element_ind]; + atomicAdd(&d_axis_sum[idx], d_vals[idx]); + } +} + +__global__ void create_random_matrix(int n, int nnz, int split, const int* p, int* d_rows, int* d_cols, float* d_vals, curandState* states) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < nnz) { + curand_init(1234, idx, 0, &states[idx]); + int i = static_cast(floorf(idx / (n - split))); + int j = static_cast((idx % (n - split)) + split); + d_rows[idx] = p[i]; + d_cols[idx] = p[j]; + d_vals[idx] = curand_uniform(&states[idx]) * 0.99f + 0.01f; + } +} + +__global__ void set_true_elements(int split, const int* p, char* x) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < split) { + x[p[idx]] = 1; + } +} + +__global__ void non_zero_elements(const int* I, const int* J, bool* non_zero_elements, int *nnz_sum, int nnz) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < nnz) { + if (I[idx] == J[idx]) { + atomicAdd(nnz_sum, 1); + non_zero_elements[idx] == true; + } + + } +} + +__global__ void zero_elements(const float *input_vect, bool* zero_elements_vect, int* zero_sum, int n) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + if (fabsf(input_vect[idx]) < 1e-6) { + atomicAdd(zero_sum, 1); + zero_elements_vect[idx] = true; + } + + } +} + + +__global__ void set_diagonal(int* I, int* J, float* V, bool* non_zero_elements, const float* diagonal, int initial_n, int resize_n) { + int offset = initial_n; + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < initial_n) { + if (I[idx] == J[idx] && non_zero_elements[idx]) { + + V[idx] = diagonal[idx]; + } + + } + else if (idx >= initial_n && idx < (initial_n + resize_n)) { + int index = idx - offset; + V[idx] = diagonal[index]; + I[idx] = index; + J[idx] = index; + } +} + +__global__ void prescan(float* g_odata, float* g_idata, int n) { + extern __shared__ float temp[]; + + int thid = threadIdx.x; + int offset = 1; + + // Load input into shared memory with padding to avoid bank conflicts + int ai = thid; + int bi = thid + (n / 2); + int bankOffsetA = CONFLICT_FREE_OFFSET(ai); + int bankOffsetB = CONFLICT_FREE_OFFSET(bi); + + temp[ai + bankOffsetA] = g_idata[ai]; + temp[bi + bankOffsetB] = g_idata[bi]; + + // Build sum in place up the tree + for (int d = n >> 1; d > 0; d >>= 1) { + __syncthreads(); + if (thid < d) { + int ai = offset * (2 * thid + 1) - 1; + int bi = offset * (2 * thid + 2) - 1; + ai += CONFLICT_FREE_OFFSET(ai); + bi += CONFLICT_FREE_OFFSET(bi); + temp[bi] += temp[ai]; + } + offset *= 2; + } + + // Clear the last element + if (thid == 0) { + temp[n - 1 + CONFLICT_FREE_OFFSET(n - 1)] = 0; + } + + // Traverse down tree & build scan + for (int d = 1; d < n; d *= 2) { + offset >>= 1; + __syncthreads(); + if (thid < d) { + int ai = offset * (2 * thid + 1) - 1; + int bi = offset * (2 * thid + 2) - 1; + ai += CONFLICT_FREE_OFFSET(ai); + bi 
diff --git a/CudaMaxCutPlanted/Kernels.cuh b/CudaMaxCutPlanted/Kernels.cuh
new file mode 100644
index 0000000..856e5f4
--- /dev/null
+++ b/CudaMaxCutPlanted/Kernels.cuh
@@ -0,0 +1,25 @@
+#ifndef KERNELS_CUH
+#define KERNELS_CUH
+
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+#include <curand_kernel.h>
+#ifdef __INTELLISENSE__
+#include "intellisense_cuda_intrinsics.h"
+#endif
+#define BLOCK_SIZE 512
+
+
+// Kernel to sum all elements along an axis
+__global__ void sum_axis(int nnz, const int* d_non_offset_axis_ind, const float* d_vals, float* d_axis_sum);
+// Fills a sparse graph with random values
+__global__ void create_random_matrix(int n, int nnz, int split, const int* p, int* d_rows, int* d_cols, float* d_vals, curandState* states);
+// Sets true/1 in the zeroed vector x at the positions provided by p
+__global__ void set_true_elements(int split, const int* p, char* x);
+// Counts all non-zero elements on the diagonal and fills a bool vector with true at each position where the diagonal is non-zero
+__global__ void non_zero_elements(const int* I, const int* J, bool* non_zero_elements, int* nnz_sum, int nnz);
+// Sets values on the diagonal, taking into account that the diagonal can already have non-zero values
+__global__ void set_diagonal(int* I, int* J, float* V, bool* non_zero_elements, const float* diagonal, int initial_n, int resize_n);
+// Counts and sets a bool vector for the zero elements of a float device vector
+__global__ void zero_elements(const float* input_vect, bool* zero_elements_vect, int* zero_sum, int n);
+// Work-efficient single-block exclusive prefix sum (Blelloch scan)
+__global__ void prescan(float* g_odata, float* g_idata, int n);
+#endif // KERNELS_CUH
\ No newline at end of file
diff --git a/CudaMaxCutPlanted/SparseMatrixSumKernel.cu b/CudaMaxCutPlanted/SparseMatrixSumKernel.cu
deleted file mode 100644
index 22fa399..0000000
--- a/CudaMaxCutPlanted/SparseMatrixSumKernel.cu
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "SparseMatrixSumKernel.cuh"
-
-
-/// <summary>
-/// This function sums either rows for matrix in csc format or columns for matrix in csr format
-/// </summary>
-/// <param name="nnz">Number on non zero elements</param>
-/// <param name="d_non_offset_axis_ind">Array with nnz elements that is NOT in compressed format</param>
-/// <param name="d_vals">Array with values</param>
-/// <param name="d_axis_sum">Output array that will contain output of summed rows/cols</param>
-/// <returns></returns>
-__global__ void sum_axis(int nnz, const int* d_non_offset_axis_ind, const float* d_vals, float* d_axis_sum) {
-    int element_ind = blockIdx.x * blockDim.x + threadIdx.x;
-    if (element_ind < nnz) {
-        int idx = d_non_offset_axis_ind[element_ind];
-        atomicAdd(&d_axis_sum[idx], d_vals[idx]);
-    }
-}
\ No newline at end of file
diff --git a/CudaMaxCutPlanted/SparseMatrixSumKernel.cuh b/CudaMaxCutPlanted/SparseMatrixSumKernel.cuh
deleted file mode 100644
index 42d4502..0000000
--- a/CudaMaxCutPlanted/SparseMatrixSumKernel.cuh
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef SPARSEMATRIXSUMKERNEL_CUH
-#define SPARSEMATRIXSUMKERNEL_CUH
-
-#include <cuda_runtime.h>
-#ifdef __INTELLISENSE__
-#include "intellisense_cuda_intrinsics.h"
-#endif
-
-
-// Kernel to mark zero elements
-__global__ void sum_axis(int nnz, const int* d_non_offset_axis_ind, const float* d_vals, float* d_axis_sum);
-
-#endif // SPARSEMATRIXSUMKERNEL_CUH
\ No newline at end of file
diff --git a/CudaMaxCutPlanted/kernel.cu b/CudaMaxCutPlanted/kernel.cu
index 3e93ae4..6128890 100644
--- a/CudaMaxCutPlanted/kernel.cu
+++ b/CudaMaxCutPlanted/kernel.cu
@@ -1,429 +1,583 @@
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "indicators.hpp" -#ifdef __INTELLISENSE__ -#include "intellisense_cuda_intrinsics.h" -#endif - - -#define CHECK_CUSPARSE(call) \ -{ \ - cusparseStatus_t err = call; \ - if (err != CUSPARSE_STATUS_SUCCESS) { \ - std::cerr << "CUSPARSE error in file " << __FILE__ \ - << " at line " << __LINE__ << ": " \ - << cusparseGetErrorString(err) << std::endl; \ - exit(EXIT_FAILURE); \ - } \ -} - -#define CHECK_CUDA(call) \ -{ \ - cudaError_t err = call; \ - if (err != cudaSuccess) { \ - std::cerr << "CUDA error in file " << __FILE__ \ - << " at line " << __LINE__ << ": " \ - << cudaGetErrorString(err) << std::endl; \ - exit(EXIT_FAILURE); \ - } \ -} - -#define CHECK_CUBLAS(call) \ -{ \ - cublasStatus_t err = call; \ - if (err != CUBLAS_STATUS_SUCCESS) { \ - std::cerr << "CUBLAS error in file " << __FILE__ \ - << " at line " << __LINE__ << ": " \ - << cublasGetErrorString(err) << std::endl; \ - exit(EXIT_FAILURE); \ - } \ -} - - -struct csr_data { - int* rowPointer; - int* cols; - float* vals; -}; - -const char* cusparseGetErrorString(cusparseStatus_t status) { - switch (status) { - case CUSPARSE_STATUS_SUCCESS: return "CUSPARSE_STATUS_SUCCESS"; - case CUSPARSE_STATUS_NOT_INITIALIZED: return "CUSPARSE_STATUS_NOT_INITIALIZED"; - case CUSPARSE_STATUS_ALLOC_FAILED: return "CUSPARSE_STATUS_ALLOC_FAILED"; - case CUSPARSE_STATUS_INVALID_VALUE: return "CUSPARSE_STATUS_INVALID_VALUE"; - case CUSPARSE_STATUS_ARCH_MISMATCH: return "CUSPARSE_STATUS_ARCH_MISMATCH"; - case CUSPARSE_STATUS_MAPPING_ERROR: return "CUSPARSE_STATUS_MAPPING_ERROR"; - case CUSPARSE_STATUS_EXECUTION_FAILED: return "CUSPARSE_STATUS_EXECUTION_FAILED"; - case CUSPARSE_STATUS_INTERNAL_ERROR: return "CUSPARSE_STATUS_INTERNAL_ERROR"; - case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - case CUSPARSE_STATUS_ZERO_PIVOT: return "CUSPARSE_STATUS_ZERO_PIVOT"; - default: return "UNKNOWN CUSPARSE STATUS"; - } -} - -const char* cublasGetErrorString(cublasStatus_t status) { - switch (status) { - case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; - case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; - case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR"; - default: return "UNKNOWN CUBLAS STATUS"; - } -} - -__global__ void cols_sum(int n, const int* d_csrOffsets, const float* d_vals, float* d_rowSums) { - int row = blockIdx.x * blockDim.x + threadIdx.x; - if (row < n) { - int row_start = d_csrOffsets[row]; - int row_end = d_csrOffsets[row + 1]; - for (int j = row_start; j < row_end; j++) { - atomicAdd(&d_rowSums[row], d_vals[j]); - } - } -} - -void convert_sparse_to_dense_and_display(cusparseHandle_t handle, const cusparseSpMatDescr_t& matDescr, int n) { - // Allocate memory for the dense matrix on the device - float* d_denseMat; - cudaMalloc((void**)&d_denseMat, n * n * sizeof(float)); - - // Create a dense matrix 
descriptor - cusparseDnMatDescr_t denseDescr; - CHECK_CUSPARSE(cusparseCreateDnMat(&denseDescr, - n, // number of rows - n, // number of columns - n, // leading dimension - d_denseMat, // pointer to dense matrix data - CUDA_R_32F, // data type - CUSPARSE_ORDER_ROW)); // row-major order - - // Convert sparse matrix to dense matrix - void* dBuffer = NULL; - size_t bufferSize = 0; - CHECK_CUSPARSE(cusparseSparseToDense_bufferSize(handle, - matDescr, - denseDescr, - CUSPARSE_SPARSETODENSE_ALG_DEFAULT, - &bufferSize)); - - CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize)); - - CHECK_CUSPARSE(cusparseSparseToDense(handle, - matDescr, - denseDescr, - CUSPARSE_SPARSETODENSE_ALG_DEFAULT, - dBuffer)); - - // Copy the dense matrix from device to host - std::vector h_denseMat(n * n); - CHECK_CUDA(cudaMemcpy(h_denseMat.data(), d_denseMat, n * n * sizeof(float), cudaMemcpyDeviceToHost)); - - std::cout << std::fixed << std::setprecision(4); // Set precision to 2 decimal places - std::cout << "Dense matrix:" << std::endl; - for (int i = 0; i < n; ++i) { - for (int j = 0; j < n; ++j) { - std::cout << h_denseMat[i * n + j] << " "; - } - std::cout << std::endl; - } - - // Clean up - CHECK_CUDA(cudaFree(d_denseMat)); - CHECK_CUDA(cudaFree(dBuffer)); - CHECK_CUSPARSE(cusparseDestroyDnMat(denseDescr)); -} - - -void fill_diagonal(cusparseHandle_t handle, cusparseSpMatDescr_t& input, thrust::device_vector diag, csr_data& extended_pointers) { - int64_t n, nnz; - int* d_csrOffsets; - int* d_cols; - float* d_vals; - - cusparseIndexType_t csrRowOffsetsType; - cusparseIndexType_t csrColIndType; - cusparseIndexBase_t idxBase; - cudaDataType valueTyp; - - CHECK_CUSPARSE(cusparseCsrGet(input, &n, &n, &nnz, (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp)); - - thrust::device_vector d_rows_tdv(nnz); - thrust::device_vector d_cols_tdv(d_cols, d_cols+nnz); - thrust::device_vector d_vals_tdv(d_vals, d_vals+nnz); - - CHECK_CUSPARSE(cusparseXcsr2coo(handle, - d_csrOffsets, - nnz, - n, - thrust::raw_pointer_cast(d_rows_tdv.data()), - CUSPARSE_INDEX_BASE_ZERO)); - - d_rows_tdv.resize(nnz + n); - d_cols_tdv.resize(nnz + n); - d_vals_tdv.resize(nnz + n); - - int nnz_n = nnz + n; - - thrust::device_vector d_vec(n); - - // Fill the vector with values from 0 to n-1 - thrust::sequence(d_vec.begin(), d_vec.end()); - - thrust::copy(d_vec.begin(), d_vec.end(), d_rows_tdv.begin() + nnz); - thrust::copy(d_vec.begin(), d_vec.end(), d_cols_tdv.begin() + nnz); - thrust::copy(diag.begin(), diag.end(), d_vals_tdv.begin() + nnz); - - /*thrust::copy(d_rows_tdv.begin(), d_rows_tdv.end(), ); - thrust::copy(d_cols_tdv.begin(), d_cols_tdv.end(), extended_pointers.cols); - thrust::copy(d_vals_tdv.begin(), d_vals_tdv.end(), extended_pointers.vals);*/ - - - CHECK_CUSPARSE(cusparseXcsr2coo(handle, - d_csrOffsets, - nnz, - n, - thrust::raw_pointer_cast(d_rows_tdv.data()), - CUSPARSE_INDEX_BASE_ZERO)); - thrust::device_vector d_csrOffsets_o(n + 1); - - thrust::sort_by_key(thrust::make_zip_iterator(thrust::make_tuple(d_rows_tdv.begin(), d_cols_tdv.begin())), - thrust::make_zip_iterator(thrust::make_tuple(d_rows_tdv.end(), d_cols_tdv.end())), - d_vals_tdv.begin()); - - CHECK_CUSPARSE(cusparseXcoo2csr(handle, - thrust::raw_pointer_cast(d_rows_tdv.data()), - nnz + n, - n, - thrust::raw_pointer_cast(d_csrOffsets_o.data()), - CUSPARSE_INDEX_BASE_ZERO)); - - CHECK_CUDA(cudaMemcpy(extended_pointers.rowPointer, thrust::raw_pointer_cast(d_csrOffsets_o.data()), (n + 1) * sizeof(int), 
cudaMemcpyDeviceToDevice)); - CHECK_CUDA(cudaMemcpy(extended_pointers.cols, thrust::raw_pointer_cast(d_cols_tdv.data()), nnz_n * sizeof(int), cudaMemcpyDeviceToDevice)); - CHECK_CUDA(cudaMemcpy(extended_pointers.vals, thrust::raw_pointer_cast(d_vals_tdv.data()), nnz_n * sizeof(float), cudaMemcpyDeviceToDevice)); - - CHECK_CUSPARSE(cusparseCsrSetPointers(input, - extended_pointers.rowPointer, - extended_pointers.cols, - extended_pointers.vals)); - -} - - -void scale_csr_matrix(cusparseHandle_t handle, - float alpha, - cusparseSpMatDescr_t& input, - cusparseSpMatDescr_t& result) { - // Extract the dimensions and the number of non-zero elements - int64_t n, nnz; - int* d_csrOffsets; - int* d_cols; - float* d_vals; - - int64_t n_r, nnz_r; - int* d_csrOffsets_r; - int* d_cols_r; - float* d_vals_r; - - cusparseIndexType_t csrRowOffsetsType; - cusparseIndexType_t csrColIndType; - cusparseIndexBase_t idxBase; - cudaDataType valueTyp; - - cusparseCsrGet(input, &n, &n, &nnz, (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp); - cusparseCsrGet(result, &n_r, &n_r, &nnz_r, (void**)&d_csrOffsets_r, (void**)&d_cols_r, (void**)&d_vals_r, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp); - - // Set scaling factors - const float beta = 0.0f; - size_t bufferSize = 0; - - cusparseMatDescr_t input_desc; - cusparseCreateMatDescr(&input_desc); - - // Create matrix descriptor for the result matrix C - cusparseMatDescr_t result_desc; - cusparseCreateMatDescr(&result_desc); - - // Get buffer size for the operation - cusparseScsrgeam2_bufferSizeExt(handle, - n, n, - &alpha, input_desc, nnz, - d_vals, - d_csrOffsets, - d_cols, - &beta, input_desc, nnz, - d_vals, - d_csrOffsets, - d_cols, - result_desc, - d_vals_r, - d_csrOffsets_r, - d_cols_r, - &bufferSize); - - void* dBuffer; - cudaMalloc(&dBuffer, bufferSize); - - // Perform the scaling operation - cusparseScsrgeam2(handle, - n, n, - &alpha, input_desc, nnz, - d_vals, - d_csrOffsets, - d_cols, - &beta, input_desc, nnz, - d_vals, - d_csrOffsets, - d_cols, - result_desc, - d_vals_r, - d_csrOffsets_r, - d_cols_r, - dBuffer); - // Clean up - cudaFree(dBuffer); - cusparseDestroyMatDescr(input_desc); - cusparseDestroyMatDescr(result_desc); -} - -thrust::device_vector sum_rows_csr_matrix(cusparseHandle_t handle, cusparseSpMatDescr_t input) { - // Extract CSR matrix information - int64_t rows, cols, nnz; - int* d_csrOffsets, * d_cols; - float* d_vals; - - cusparseIndexType_t csrRowOffsetsType; - cusparseIndexType_t csrColIndType; - cusparseIndexBase_t idxBase; - cudaDataType valueTyp; - - cusparseCsrGet(input, &rows, &cols, &nnz, - (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, - &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp); - - // Allocate memory for the row sums - thrust::device_vector d_rowSums(rows); - - // Launch kernel to sum elements of each row using atomicAdd - int blockSize = 256; - int gridSize = (rows + blockSize - 1) / blockSize; - cols_sum << > > (rows, d_csrOffsets, d_vals, thrust::raw_pointer_cast(d_rowSums.data())); - cudaDeviceSynchronize(); - - return d_rowSums; -} - -int estimate_split(float density, int n) { - if (density > 0.5f) { - std::cout << "Error, density can not be bigger than 0.5!" 
<< std::endl; - } - - float sparsity = 1.0f - density; - float inside = 2.0f * sparsity - 1.0f; - float smaller_set_size = 0.5f * n * (1 + sqrt(inside)); - return static_cast(smaller_set_size) - 1; -} - -// Function to generate initial permutation -std::vector generate_initial_permutation(std::mt19937 & rng, int n) { - std::vector permutation(n); - for (int i = 0; i < n; ++i) { - permutation[i] = i; - } - std::shuffle(permutation.begin(), permutation.end(), rng); - return permutation; -} - - -void create_graph_sparse(int n, int split, std::vector& p, thrust::device_vector& d_rows, thrust::device_vector& d_cols, thrust::device_vector& d_vals) { - thrust::host_vector h_rows, h_cols; - thrust::host_vector h_vals; - - thrust::default_random_engine rng; - thrust::random::uniform_real_distribution dist(0.01f, 1.0f); - - std::vector> combined; - - for (int i = 0; i <= split; ++i) { - for (int j = split + 1; j < n; ++j) { - float rnd_val = dist(rng); - combined.push_back(std::make_tuple(p[i], p[j], rnd_val)); - combined.push_back(std::make_tuple(p[j], p[i], rnd_val)); - } - } - - std::sort(combined.begin(), combined.end(), [](const auto& a, const auto& b) { - if (std::get<0>(a) == std::get<0>(b)) { - return std::get<1>(a) < std::get<1>(b); - } - return std::get<0>(a) < std::get<0>(b); - }); - - for (size_t i = 0; i < combined.size(); ++i) { - h_rows.push_back(std::get<0>(combined[i])); - h_cols.push_back(std::get<1>(combined[i])); - h_vals.push_back(std::get<2>(combined[i])); - } - - d_rows = h_rows; - d_cols = h_cols; - d_vals = h_vals; -} - -thrust::device_vector generate_solution(const std::vector& p, int split) { - int size = p.size(); - - // Initialize a device vector of bool type with false (equivalent to CUDA.zeros(Bool, size(p))) - thrust::device_vector x(size, 0); - - - // Set the first split elements to true (equivalent to x[p[1:split]] .= 1) - for (int i = 0; i <= split ; ++i) { - x[p[i]] = 1; - } - - return x; -} - - -void graph_to_qubo(cusparseHandle_t handle, cusparseSpMatDescr_t& graph, cusparseSpMatDescr_t& Q, csr_data& extended_pointers) { - scale_csr_matrix(handle, -1.0f, graph, Q); - thrust::device_vector sum = sum_rows_csr_matrix(handle, graph); - fill_diagonal(handle, Q, sum, extended_pointers); - scale_csr_matrix(handle, -0.25f, Q, Q); - - for (int i = 0; i < sum.size(); i++) { - std::cout << sum[i] << std::endl; - } -} - -//float calculate_qubo_energy(cublasHandle_t cublasHandle, +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include "indicators.hpp" +//#ifdef __INTELLISENSE__ +//#include "intellisense_cuda_intrinsics.h" +//#endif +// +// +//#define CHECK_CUSPARSE(call) \ +//{ \ +// cusparseStatus_t err = call; \ +// if (err != CUSPARSE_STATUS_SUCCESS) { \ +// std::cerr << "CUSPARSE error in file " << __FILE__ \ +// << " at line " << __LINE__ << ": " \ +// << cusparseGetErrorString(err) << std::endl; \ +// exit(EXIT_FAILURE); \ +// } \ +//} +// +//#define CHECK_CUDA(call) \ +//{ \ +// cudaError_t err = call; \ +// if (err != cudaSuccess) { \ +// std::cerr << "CUDA error in file " << __FILE__ \ +// << " at line " << __LINE__ << ": " \ +// << cudaGetErrorString(err) << std::endl; \ +// exit(EXIT_FAILURE); \ +// } \ +//} +// +//#define CHECK_CUBLAS(call) \ +//{ \ +// cublasStatus_t err = call; \ +// if (err != CUBLAS_STATUS_SUCCESS) { \ +// std::cerr << "CUBLAS error in file " << __FILE__ \ +// << " at line 
" << __LINE__ << ": " \ +// << cublasGetErrorString(err) << std::endl; \ +// exit(EXIT_FAILURE); \ +// } \ +//} +// +// +//struct csr_data { +// int* rowPointer; +// int* cols; +// float* vals; +//}; +// +//const char* cusparseGetErrorString(cusparseStatus_t status) { +// switch (status) { +// case CUSPARSE_STATUS_SUCCESS: return "CUSPARSE_STATUS_SUCCESS"; +// case CUSPARSE_STATUS_NOT_INITIALIZED: return "CUSPARSE_STATUS_NOT_INITIALIZED"; +// case CUSPARSE_STATUS_ALLOC_FAILED: return "CUSPARSE_STATUS_ALLOC_FAILED"; +// case CUSPARSE_STATUS_INVALID_VALUE: return "CUSPARSE_STATUS_INVALID_VALUE"; +// case CUSPARSE_STATUS_ARCH_MISMATCH: return "CUSPARSE_STATUS_ARCH_MISMATCH"; +// case CUSPARSE_STATUS_MAPPING_ERROR: return "CUSPARSE_STATUS_MAPPING_ERROR"; +// case CUSPARSE_STATUS_EXECUTION_FAILED: return "CUSPARSE_STATUS_EXECUTION_FAILED"; +// case CUSPARSE_STATUS_INTERNAL_ERROR: return "CUSPARSE_STATUS_INTERNAL_ERROR"; +// case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; +// case CUSPARSE_STATUS_ZERO_PIVOT: return "CUSPARSE_STATUS_ZERO_PIVOT"; +// default: return "UNKNOWN CUSPARSE STATUS"; +// } +//} +// +//const char* cublasGetErrorString(cublasStatus_t status) { +// switch (status) { +// case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; +// case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; +// case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; +// case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; +// case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; +// case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; +// case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; +// case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; +// case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; +// case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR"; +// default: return "UNKNOWN CUBLAS STATUS"; +// } +//} +// +//__global__ void cols_sum(int n, const int* d_csrOffsets, const float* d_vals, float* d_rowSums) { +// int row = blockIdx.x * blockDim.x + threadIdx.x; +// if (row < n) { +// int row_start = d_csrOffsets[row]; +// int row_end = d_csrOffsets[row + 1]; +// for (int j = row_start; j < row_end; j++) { +// atomicAdd(&d_rowSums[row], d_vals[j]); +// } +// } +//} +// +//void convert_sparse_to_dense_and_display(cusparseHandle_t handle, const cusparseSpMatDescr_t& matDescr, int n) { +// // Allocate memory for the dense matrix on the device +// float* d_denseMat; +// cudaMalloc((void**)&d_denseMat, n * n * sizeof(float)); +// +// // Create a dense matrix descriptor +// cusparseDnMatDescr_t denseDescr; +// CHECK_CUSPARSE(cusparseCreateDnMat(&denseDescr, +// n, // number of rows +// n, // number of columns +// n, // leading dimension +// d_denseMat, // pointer to dense matrix data +// CUDA_R_32F, // data type +// CUSPARSE_ORDER_ROW)); // row-major order +// +// // Convert sparse matrix to dense matrix +// void* dBuffer = NULL; +// size_t bufferSize = 0; +// CHECK_CUSPARSE(cusparseSparseToDense_bufferSize(handle, +// matDescr, +// denseDescr, +// CUSPARSE_SPARSETODENSE_ALG_DEFAULT, +// &bufferSize)); +// +// CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize)); +// +// CHECK_CUSPARSE(cusparseSparseToDense(handle, +// matDescr, +// denseDescr, +// CUSPARSE_SPARSETODENSE_ALG_DEFAULT, +// dBuffer)); +// +// // Copy the dense matrix from device to host +// std::vector h_denseMat(n * 
n); +// CHECK_CUDA(cudaMemcpy(h_denseMat.data(), d_denseMat, n * n * sizeof(float), cudaMemcpyDeviceToHost)); +// +// std::cout << std::fixed << std::setprecision(4); // Set precision to 2 decimal places +// std::cout << "Dense matrix:" << std::endl; +// for (int i = 0; i < n; ++i) { +// for (int j = 0; j < n; ++j) { +// std::cout << h_denseMat[i * n + j] << " "; +// } +// std::cout << std::endl; +// } +// +// // Clean up +// CHECK_CUDA(cudaFree(d_denseMat)); +// CHECK_CUDA(cudaFree(dBuffer)); +// CHECK_CUSPARSE(cusparseDestroyDnMat(denseDescr)); +//} +// +// +//void fill_diagonal(cusparseHandle_t handle, cusparseSpMatDescr_t& input, thrust::device_vector diag, csr_data& extended_pointers) { +// int64_t n, nnz; +// int* d_csrOffsets; +// int* d_cols; +// float* d_vals; +// +// cusparseIndexType_t csrRowOffsetsType; +// cusparseIndexType_t csrColIndType; +// cusparseIndexBase_t idxBase; +// cudaDataType valueTyp; +// +// CHECK_CUSPARSE(cusparseCsrGet(input, &n, &n, &nnz, (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp)); +// +// thrust::device_vector d_rows_tdv(nnz); +// thrust::device_vector d_cols_tdv(d_cols, d_cols+nnz); +// thrust::device_vector d_vals_tdv(d_vals, d_vals+nnz); +// +// CHECK_CUSPARSE(cusparseXcsr2coo(handle, +// d_csrOffsets, +// nnz, +// n, +// thrust::raw_pointer_cast(d_rows_tdv.data()), +// CUSPARSE_INDEX_BASE_ZERO)); +// +// d_rows_tdv.resize(nnz + n); +// d_cols_tdv.resize(nnz + n); +// d_vals_tdv.resize(nnz + n); +// +// int nnz_n = nnz + n; +// +// thrust::device_vector d_vec(n); +// +// // Fill the vector with values from 0 to n-1 +// thrust::sequence(d_vec.begin(), d_vec.end()); +// +// thrust::copy(d_vec.begin(), d_vec.end(), d_rows_tdv.begin() + nnz); +// thrust::copy(d_vec.begin(), d_vec.end(), d_cols_tdv.begin() + nnz); +// thrust::copy(diag.begin(), diag.end(), d_vals_tdv.begin() + nnz); +// +// /*thrust::copy(d_rows_tdv.begin(), d_rows_tdv.end(), ); +// thrust::copy(d_cols_tdv.begin(), d_cols_tdv.end(), extended_pointers.cols); +// thrust::copy(d_vals_tdv.begin(), d_vals_tdv.end(), extended_pointers.vals);*/ +// +// +// CHECK_CUSPARSE(cusparseXcsr2coo(handle, +// d_csrOffsets, +// nnz, +// n, +// thrust::raw_pointer_cast(d_rows_tdv.data()), +// CUSPARSE_INDEX_BASE_ZERO)); +// thrust::device_vector d_csrOffsets_o(n + 1); +// +// thrust::sort_by_key(thrust::make_zip_iterator(thrust::make_tuple(d_rows_tdv.begin(), d_cols_tdv.begin())), +// thrust::make_zip_iterator(thrust::make_tuple(d_rows_tdv.end(), d_cols_tdv.end())), +// d_vals_tdv.begin()); +// +// CHECK_CUSPARSE(cusparseXcoo2csr(handle, +// thrust::raw_pointer_cast(d_rows_tdv.data()), +// nnz + n, +// n, +// thrust::raw_pointer_cast(d_csrOffsets_o.data()), +// CUSPARSE_INDEX_BASE_ZERO)); +// +// CHECK_CUDA(cudaMemcpy(extended_pointers.rowPointer, thrust::raw_pointer_cast(d_csrOffsets_o.data()), (n + 1) * sizeof(int), cudaMemcpyDeviceToDevice)); +// CHECK_CUDA(cudaMemcpy(extended_pointers.cols, thrust::raw_pointer_cast(d_cols_tdv.data()), nnz_n * sizeof(int), cudaMemcpyDeviceToDevice)); +// CHECK_CUDA(cudaMemcpy(extended_pointers.vals, thrust::raw_pointer_cast(d_vals_tdv.data()), nnz_n * sizeof(float), cudaMemcpyDeviceToDevice)); +// +// CHECK_CUSPARSE(cusparseCsrSetPointers(input, +// extended_pointers.rowPointer, +// extended_pointers.cols, +// extended_pointers.vals)); +// +//} +// +// +//void scale_csr_matrix(cusparseHandle_t handle, +// float alpha, +// cusparseSpMatDescr_t& input, +// cusparseSpMatDescr_t& result) { +// // Extract the 
dimensions and the number of non-zero elements +// int64_t n, nnz; +// int* d_csrOffsets; +// int* d_cols; +// float* d_vals; +// +// int64_t n_r, nnz_r; +// int* d_csrOffsets_r; +// int* d_cols_r; +// float* d_vals_r; +// +// cusparseIndexType_t csrRowOffsetsType; +// cusparseIndexType_t csrColIndType; +// cusparseIndexBase_t idxBase; +// cudaDataType valueTyp; +// +// cusparseCsrGet(input, &n, &n, &nnz, (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp); +// cusparseCsrGet(result, &n_r, &n_r, &nnz_r, (void**)&d_csrOffsets_r, (void**)&d_cols_r, (void**)&d_vals_r, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp); +// +// // Set scaling factors +// const float beta = 0.0f; +// size_t bufferSize = 0; +// +// cusparseMatDescr_t input_desc; +// cusparseCreateMatDescr(&input_desc); +// +// // Create matrix descriptor for the result matrix C +// cusparseMatDescr_t result_desc; +// cusparseCreateMatDescr(&result_desc); +// +// // Get buffer size for the operation +// cusparseScsrgeam2_bufferSizeExt(handle, +// n, n, +// &alpha, input_desc, nnz, +// d_vals, +// d_csrOffsets, +// d_cols, +// &beta, input_desc, nnz, +// d_vals, +// d_csrOffsets, +// d_cols, +// result_desc, +// d_vals_r, +// d_csrOffsets_r, +// d_cols_r, +// &bufferSize); +// +// void* dBuffer; +// cudaMalloc(&dBuffer, bufferSize); +// +// // Perform the scaling operation +// cusparseScsrgeam2(handle, +// n, n, +// &alpha, input_desc, nnz, +// d_vals, +// d_csrOffsets, +// d_cols, +// &beta, input_desc, nnz, +// d_vals, +// d_csrOffsets, +// d_cols, +// result_desc, +// d_vals_r, +// d_csrOffsets_r, +// d_cols_r, +// dBuffer); +// // Clean up +// cudaFree(dBuffer); +// cusparseDestroyMatDescr(input_desc); +// cusparseDestroyMatDescr(result_desc); +//} +// +//thrust::device_vector sum_rows_csr_matrix(cusparseHandle_t handle, cusparseSpMatDescr_t input) { +// // Extract CSR matrix information +// int64_t rows, cols, nnz; +// int* d_csrOffsets, * d_cols; +// float* d_vals; +// +// cusparseIndexType_t csrRowOffsetsType; +// cusparseIndexType_t csrColIndType; +// cusparseIndexBase_t idxBase; +// cudaDataType valueTyp; +// +// cusparseCsrGet(input, &rows, &cols, &nnz, +// (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, +// &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp); +// +// // Allocate memory for the row sums +// thrust::device_vector d_rowSums(rows); +// +// // Launch kernel to sum elements of each row using atomicAdd +// int blockSize = 256; +// int gridSize = (rows + blockSize - 1) / blockSize; +// cols_sum << > > (rows, d_csrOffsets, d_vals, thrust::raw_pointer_cast(d_rowSums.data())); +// cudaDeviceSynchronize(); +// +// return d_rowSums; +//} +// +//int estimate_split(float density, int n) { +// if (density > 0.5f) { +// std::cout << "Error, density can not be bigger than 0.5!" 
<< std::endl; +// } +// +// float sparsity = 1.0f - density; +// float inside = 2.0f * sparsity - 1.0f; +// float smaller_set_size = 0.5f * n * (1 + sqrt(inside)); +// return static_cast(smaller_set_size) - 1; +//} +// +//// Function to generate initial permutation +//std::vector generate_initial_permutation(std::mt19937 & rng, int n) { +// std::vector permutation(n); +// for (int i = 0; i < n; ++i) { +// permutation[i] = i; +// } +// std::shuffle(permutation.begin(), permutation.end(), rng); +// return permutation; +//} +// +// +//void create_graph_sparse(int n, int split, std::vector& p, thrust::device_vector& d_rows, thrust::device_vector& d_cols, thrust::device_vector& d_vals) { +// thrust::host_vector h_rows, h_cols; +// thrust::host_vector h_vals; +// +// thrust::default_random_engine rng; +// thrust::random::uniform_real_distribution dist(0.01f, 1.0f); +// +// std::vector> combined; +// +// for (int i = 0; i <= split; ++i) { +// for (int j = split + 1; j < n; ++j) { +// float rnd_val = dist(rng); +// combined.push_back(std::make_tuple(p[i], p[j], rnd_val)); +// combined.push_back(std::make_tuple(p[j], p[i], rnd_val)); +// } +// } +// +// std::sort(combined.begin(), combined.end(), [](const auto& a, const auto& b) { +// if (std::get<0>(a) == std::get<0>(b)) { +// return std::get<1>(a) < std::get<1>(b); +// } +// return std::get<0>(a) < std::get<0>(b); +// }); +// +// for (size_t i = 0; i < combined.size(); ++i) { +// h_rows.push_back(std::get<0>(combined[i])); +// h_cols.push_back(std::get<1>(combined[i])); +// h_vals.push_back(std::get<2>(combined[i])); +// } +// +// d_rows = h_rows; +// d_cols = h_cols; +// d_vals = h_vals; +//} +// +//thrust::device_vector generate_solution(const std::vector& p, int split) { +// int size = p.size(); +// +// // Initialize a device vector of bool type with false (equivalent to CUDA.zeros(Bool, size(p))) +// thrust::device_vector x(size, 0); +// +// +// // Set the first split elements to true (equivalent to x[p[1:split]] .= 1) +// for (int i = 0; i <= split ; ++i) { +// x[p[i]] = 1; +// } +// +// return x; +//} +// +// +//void graph_to_qubo(cusparseHandle_t handle, cusparseSpMatDescr_t& graph, cusparseSpMatDescr_t& Q, csr_data& extended_pointers) { +// scale_csr_matrix(handle, -1.0f, graph, Q); +// thrust::device_vector sum = sum_rows_csr_matrix(handle, graph); +// fill_diagonal(handle, Q, sum, extended_pointers); +// scale_csr_matrix(handle, -0.25f, Q, Q); +// +// for (int i = 0; i < sum.size(); i++) { +// std::cout << sum[i] << std::endl; +// } +//} +// +////float calculate_qubo_energy(cublasHandle_t cublasHandle, +//// cusparseHandle_t handle, +//// int n, +//// const cusparseSpMatDescr_t& Q, +//// const thrust::device_vector& x) { +//// float alpha = 1.0f; +//// float beta = 0.0f; +//// size_t bufferSize = 0; +//// void* dBuffer = nullptr; +//// +//// // Create dense vector descriptors +//// cusparseDnVecDescr_t vecX, vecY; +//// float* Qx; +//// float* in_x; +//// +//// cudaMalloc((void**)&Qx, n * sizeof(float)); +//// cudaMalloc((void**)&in_x, n * sizeof(float)); +//// +//// cudaMemcpy(in_x, thrust::raw_pointer_cast(x.data()), n * sizeof(float), cudaMemcpyDeviceToDevice); +//// +//// cusparseCreateDnVec(&vecX, n, in_x, CUDA_R_32F); +//// cusparseCreateDnVec(&vecY, n, Qx, CUDA_R_32F); +//// // Allocate buffer +//// cusparseSpMV_bufferSize(handle, +//// CUSPARSE_OPERATION_NON_TRANSPOSE, +//// &alpha, +//// Q, +//// vecX, +//// &beta, +//// vecY, +//// CUDA_R_32F, +//// CUSPARSE_SPMV_ALG_DEFAULT, +//// &bufferSize); +//// cudaMalloc(&dBuffer, 
bufferSize); +//// // Perform Q * x +//// cusparseSpMV(handle, +//// CUSPARSE_OPERATION_NON_TRANSPOSE, +//// &alpha, +//// Q, +//// vecX, +//// &beta, +//// vecY, +//// CUDA_R_32F, +//// CUSPARSE_SPMV_ALG_DEFAULT, +//// dBuffer); +//// +//// float* Qx_ptr; +//// cusparseDnVecGetValues(vecY, (void**)&Qx_ptr); +//// +//// // Compute the dot product x^T * (Q * x) using cuBLAS +//// float result; +//// cublasSdot(cublasHandle, n, in_x, 1, Qx_ptr, 1, &result); +//// // Clean up +//// cusparseDestroyDnVec(vecX); +//// cusparseDestroyDnVec(vecY); +//// cudaFree(dBuffer); +//// cudaFree(Qx); +//// cudaFree(in_x); +//// return result; +////} +// +// +//float qubo_eng(cublasHandle_t cublasHandle, // cusparseHandle_t handle, +// const cusparseSpMatDescr_t& Q, +// float* sol_vector) { +// int64_t rows, cols, nnz; +// int* d_csrOffsets, * d_cols; +// float* d_vals; +// int* dA_csrOffsets, * dA_columns; +// float* dA_values; +// +// cusparseIndexType_t csrRowOffsetsType; +// cusparseIndexType_t csrColIndType; +// cusparseIndexBase_t idxBase; +// cudaDataType valueTyp; +// +// CHECK_CUSPARSE(cusparseSpMatGetSize(Q, &rows, &cols, &nnz)); +// +// CHECK_CUDA(cudaMalloc((void**)&d_csrOffsets, (rows + 1) * sizeof(int))); +// CHECK_CUDA(cudaMalloc((void**)&d_cols, nnz * sizeof(int))); +// CHECK_CUDA(cudaMalloc((void**)&d_vals, nnz * sizeof(float))); +// +// CHECK_CUSPARSE(cusparseCsrGet(Q, &rows, &cols, &nnz, +// (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, +// &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp)); +// // Host problem definition +// const int A_num_rows = rows; +// const int A_num_cols = cols; +// const int A_nnz = nnz; +// //int h_test[11]; +// //int* h_test = (int*)calloc(rows+1, sizeof(int)); +// float alpha = 1.0f; +// float beta = 0.0f; +// //CHECK_CUDA(cudaMemcpy(h_test, d_csrOffsets, (rows + 1) * sizeof(int), cudaMemcpyDeviceToHost)); +// ////cudaMemcpy(h_test, d_csrOffsets, (A_num_rows + 1) * sizeof(int), cudaMemcpyDeviceToHost); +// ////CHECK_CUDA(cudaMemcpy(hCsrRowOffsets_toverify, dCsrRowOffsets_toverify, (rows + 1) * sizeof(int), cudaMemcpyDeviceToHost)); +// //for (int i = 0; i < A_num_rows+1; i++) { +// // std::cout << h_test[i] << std::endl; +// //} +// //-------------------------------------------------------------------------- +// // Device memory management +// float * dX, * dY; +// CHECK_CUDA(cudaMalloc((void**)&dA_csrOffsets, (A_num_rows + 1) * sizeof(int))); +// CHECK_CUDA(cudaMalloc((void**)&dA_columns, A_nnz * sizeof(int))); +// CHECK_CUDA(cudaMalloc((void**)&dA_values, A_nnz * sizeof(float))); +// CHECK_CUDA(cudaMalloc((void**)&dX, A_num_cols * sizeof(float))); +// CHECK_CUDA(cudaMalloc((void**)&dY, A_num_rows * sizeof(float))); +// +// CHECK_CUDA(cudaMemcpy(dA_csrOffsets, d_csrOffsets, (A_num_rows + 1) * sizeof(int), cudaMemcpyDeviceToDevice)); +// CHECK_CUDA(cudaMemcpy(dA_columns, d_cols, A_nnz * sizeof(int), cudaMemcpyDeviceToDevice)); +// CHECK_CUDA(cudaMemcpy(dA_values, d_vals, A_nnz * sizeof(float), cudaMemcpyDeviceToDevice)); +// CHECK_CUDA(cudaMemcpy(dX, sol_vector, A_num_cols * sizeof(float), cudaMemcpyDeviceToDevice)); +// cusparseSpMatDescr_t matA; +// cusparseDnVecDescr_t vecX, vecY; +// void* dBuffer = NULL; +// size_t bufferSize = 0; +// CHECK_CUSPARSE(cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_nnz, +// dA_csrOffsets, dA_columns, dA_values, +// CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, +// CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); +// // Create dense vector X +// CHECK_CUSPARSE(cusparseCreateDnVec(&vecX, A_num_cols, dX, CUDA_R_32F)); 
+// // Create dense vector y +// CHECK_CUSPARSE(cusparseCreateDnVec(&vecY, A_num_rows, dY, CUDA_R_32F)); +// // allocate an external buffer if needed +// CHECK_CUSPARSE(cusparseSpMV_bufferSize( +// handle, CUSPARSE_OPERATION_NON_TRANSPOSE, +// &alpha, matA, vecX, &beta, vecY, CUDA_R_32F, +// CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)); +// CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize)); +// +// // execute SpMV +// CHECK_CUSPARSE(cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, +// &alpha, matA, vecX, &beta, vecY, CUDA_R_32F, +// CUSPARSE_SPMV_ALG_DEFAULT, dBuffer)); +// +// // destroy matrix/vector descriptors +// CHECK_CUSPARSE(cusparseDestroySpMat(matA)); +// CHECK_CUSPARSE(cusparseDestroyDnVec(vecX)); +// CHECK_CUSPARSE(cusparseDestroyDnVec(vecY)); +// +// /*CHECK_CUDA(cudaMemcpy(hY, dY, A_num_rows * sizeof(float), cudaMemcpyDeviceToHost)); +// std::cout << "Result of Q*x: " << std::endl; +// for (int i = 0; i < A_num_rows; i++) { +// std::cout << hY[i] << std::endl; +// }*/ +// +// float result; +// CHECK_CUBLAS(cublasSdot(cublasHandle, A_num_rows, dX, 1, dY, 1, &result)); +// //std::cout << "Energy: " << result << std::endl; +// return result; +//} +// +//float calculate_qubo_energy(cublasHandle_t cublasHandle, +// cusparseHandle_t cusparseHandle, // int n, // const cusparseSpMatDescr_t& Q, -// const thrust::device_vector& x) { +// thrust::device_vector& x) { // float alpha = 1.0f; // float beta = 0.0f; // size_t bufferSize = 0; @@ -432,17 +586,26 @@ void graph_to_qubo(cusparseHandle_t handle, cusparseSpMatDescr_t& graph, cuspars // // Create dense vector descriptors // cusparseDnVecDescr_t vecX, vecY; // float* Qx; +// float* Qx_ptr; +// //float* hY = (float*)calloc(n, sizeof(float)); // float* in_x; // -// cudaMalloc((void**)&Qx, n * sizeof(float)); -// cudaMalloc((void**)&in_x, n * sizeof(float)); +// /* for (int i = 0; i < n; i++) { +// hY[i] = 0.0f; +// }*/ // -// cudaMemcpy(in_x, thrust::raw_pointer_cast(x.data()), n * sizeof(float), cudaMemcpyDeviceToDevice); +// CHECK_CUDA(cudaMalloc((void**)&Qx, n * sizeof(float))); +// CHECK_CUDA(cudaMalloc((void**)&Qx_ptr, n * sizeof(float))); +// CHECK_CUDA(cudaMalloc((void**)&in_x, n * sizeof(float))); +// +// CHECK_CUDA(cudaMemcpy(in_x, thrust::raw_pointer_cast(x.data()), n * sizeof(float), cudaMemcpyDeviceToDevice)); +// //CHECK_CUDA(cudaMemcpy(Qx, hY, n * sizeof(float), cudaMemcpyHostToDevice)); +// +// CHECK_CUSPARSE(cusparseCreateDnVec(&vecX, n, in_x, CUDA_R_32F)); +// CHECK_CUSPARSE(cusparseCreateDnVec(&vecY, n, Qx, CUDA_R_32F)); // -// cusparseCreateDnVec(&vecX, n, in_x, CUDA_R_32F); -// cusparseCreateDnVec(&vecY, n, Qx, CUDA_R_32F); // // Allocate buffer -// cusparseSpMV_bufferSize(handle, +// CHECK_CUSPARSE(cusparseSpMV_bufferSize(cusparseHandle, // CUSPARSE_OPERATION_NON_TRANSPOSE, // &alpha, // Q, @@ -451,10 +614,11 @@ void graph_to_qubo(cusparseHandle_t handle, cusparseSpMatDescr_t& graph, cuspars // vecY, // CUDA_R_32F, // CUSPARSE_SPMV_ALG_DEFAULT, -// &bufferSize); -// cudaMalloc(&dBuffer, bufferSize); +// &bufferSize)); +// CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize)); +// // // Perform Q * x -// cusparseSpMV(handle, +// CHECK_CUSPARSE(cusparseSpMV(cusparseHandle, // CUSPARSE_OPERATION_NON_TRANSPOSE, // &alpha, // Q, @@ -463,223 +627,131 @@ void graph_to_qubo(cusparseHandle_t handle, cusparseSpMatDescr_t& graph, cuspars // vecY, // CUDA_R_32F, // CUSPARSE_SPMV_ALG_DEFAULT, -// dBuffer); +// dBuffer)); +// +// // Extract the raw pointers to the data in the dense vector descriptors +// //float* Qx_ptr; +// 
//CHECK_CUSPARSE(cusparseDnVecGetValues(vecY, (void**)&Qx_ptr)); +// // Ensure the computation is complete before accessing the result +// CHECK_CUDA(cudaDeviceSynchronize()); +// +// /*CHECK_CUDA(cudaMemcpy(hY, Qx, n * sizeof(float), cudaMemcpyDeviceToHost)); +// for (int i = 0; i < n; i++) { +// std::cout << hY[i] << std::endl; +// }*/ // -// float* Qx_ptr; -// cusparseDnVecGetValues(vecY, (void**)&Qx_ptr); -// // // Compute the dot product x^T * (Q * x) using cuBLAS // float result; -// cublasSdot(cublasHandle, n, in_x, 1, Qx_ptr, 1, &result); +// CHECK_CUBLAS(cublasSdot(cublasHandle, n, in_x, 1, Qx, 1, &result)); +// // // Clean up -// cusparseDestroyDnVec(vecX); -// cusparseDestroyDnVec(vecY); -// cudaFree(dBuffer); -// cudaFree(Qx); -// cudaFree(in_x); +// CHECK_CUSPARSE(cusparseDestroyDnVec(vecX)); +// CHECK_CUSPARSE(cusparseDestroyDnVec(vecY)); +// CHECK_CUDA(cudaFree(dBuffer)); +// CHECK_CUDA(cudaFree(Qx)); +// CHECK_CUDA(cudaFree(in_x)); +// // return result; //} - - -float qubo_eng(cublasHandle_t cublasHandle, - cusparseHandle_t handle, - const cusparseSpMatDescr_t& Q, - float* sol_vector) { - int64_t rows, cols, nnz; - int* d_csrOffsets, * d_cols; - float* d_vals; - int* dA_csrOffsets, * dA_columns; - float* dA_values; - - cusparseIndexType_t csrRowOffsetsType; - cusparseIndexType_t csrColIndType; - cusparseIndexBase_t idxBase; - cudaDataType valueTyp; - - CHECK_CUSPARSE(cusparseSpMatGetSize(Q, &rows, &cols, &nnz)); - - CHECK_CUDA(cudaMalloc((void**)&d_csrOffsets, (rows + 1) * sizeof(int))); - CHECK_CUDA(cudaMalloc((void**)&d_cols, nnz * sizeof(int))); - CHECK_CUDA(cudaMalloc((void**)&d_vals, nnz * sizeof(float))); - - CHECK_CUSPARSE(cusparseCsrGet(Q, &rows, &cols, &nnz, - (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, - &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp)); - // Host problem definition - const int A_num_rows = rows; - const int A_num_cols = cols; - const int A_nnz = nnz; - //int h_test[11]; - //int* h_test = (int*)calloc(rows+1, sizeof(int)); - float alpha = 1.0f; - float beta = 0.0f; - //CHECK_CUDA(cudaMemcpy(h_test, d_csrOffsets, (rows + 1) * sizeof(int), cudaMemcpyDeviceToHost)); - ////cudaMemcpy(h_test, d_csrOffsets, (A_num_rows + 1) * sizeof(int), cudaMemcpyDeviceToHost); - ////CHECK_CUDA(cudaMemcpy(hCsrRowOffsets_toverify, dCsrRowOffsets_toverify, (rows + 1) * sizeof(int), cudaMemcpyDeviceToHost)); - //for (int i = 0; i < A_num_rows+1; i++) { - // std::cout << h_test[i] << std::endl; - //} - //-------------------------------------------------------------------------- - // Device memory management - float * dX, * dY; - CHECK_CUDA(cudaMalloc((void**)&dA_csrOffsets, (A_num_rows + 1) * sizeof(int))); - CHECK_CUDA(cudaMalloc((void**)&dA_columns, A_nnz * sizeof(int))); - CHECK_CUDA(cudaMalloc((void**)&dA_values, A_nnz * sizeof(float))); - CHECK_CUDA(cudaMalloc((void**)&dX, A_num_cols * sizeof(float))); - CHECK_CUDA(cudaMalloc((void**)&dY, A_num_rows * sizeof(float))); - - CHECK_CUDA(cudaMemcpy(dA_csrOffsets, d_csrOffsets, (A_num_rows + 1) * sizeof(int), cudaMemcpyDeviceToDevice)); - CHECK_CUDA(cudaMemcpy(dA_columns, d_cols, A_nnz * sizeof(int), cudaMemcpyDeviceToDevice)); - CHECK_CUDA(cudaMemcpy(dA_values, d_vals, A_nnz * sizeof(float), cudaMemcpyDeviceToDevice)); - CHECK_CUDA(cudaMemcpy(dX, sol_vector, A_num_cols * sizeof(float), cudaMemcpyDeviceToDevice)); - cusparseSpMatDescr_t matA; - cusparseDnVecDescr_t vecX, vecY; - void* dBuffer = NULL; - size_t bufferSize = 0; - CHECK_CUSPARSE(cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_nnz, - 
dA_csrOffsets, dA_columns, dA_values, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); - // Create dense vector X - CHECK_CUSPARSE(cusparseCreateDnVec(&vecX, A_num_cols, dX, CUDA_R_32F)); - // Create dense vector y - CHECK_CUSPARSE(cusparseCreateDnVec(&vecY, A_num_rows, dY, CUDA_R_32F)); - // allocate an external buffer if needed - CHECK_CUSPARSE(cusparseSpMV_bufferSize( - handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, matA, vecX, &beta, vecY, CUDA_R_32F, - CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)); - CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize)); - - // execute SpMV - CHECK_CUSPARSE(cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, matA, vecX, &beta, vecY, CUDA_R_32F, - CUSPARSE_SPMV_ALG_DEFAULT, dBuffer)); - - // destroy matrix/vector descriptors - CHECK_CUSPARSE(cusparseDestroySpMat(matA)); - CHECK_CUSPARSE(cusparseDestroyDnVec(vecX)); - CHECK_CUSPARSE(cusparseDestroyDnVec(vecY)); - - /*CHECK_CUDA(cudaMemcpy(hY, dY, A_num_rows * sizeof(float), cudaMemcpyDeviceToHost)); - std::cout << "Result of Q*x: " << std::endl; - for (int i = 0; i < A_num_rows; i++) { - std::cout << hY[i] << std::endl; - }*/ - - float result; - CHECK_CUBLAS(cublasSdot(cublasHandle, A_num_rows, dX, 1, dY, 1, &result)); - //std::cout << "Energy: " << result << std::endl; - return result; -} - -float calculate_qubo_energy(cublasHandle_t cublasHandle, - cusparseHandle_t cusparseHandle, - int n, - const cusparseSpMatDescr_t& Q, - thrust::device_vector& x) { - float alpha = 1.0f; - float beta = 0.0f; - size_t bufferSize = 0; - void* dBuffer = nullptr; - - // Create dense vector descriptors - cusparseDnVecDescr_t vecX, vecY; - float* Qx; - float* Qx_ptr; - //float* hY = (float*)calloc(n, sizeof(float)); - float* in_x; - - /* for (int i = 0; i < n; i++) { - hY[i] = 0.0f; - }*/ - - CHECK_CUDA(cudaMalloc((void**)&Qx, n * sizeof(float))); - CHECK_CUDA(cudaMalloc((void**)&Qx_ptr, n * sizeof(float))); - CHECK_CUDA(cudaMalloc((void**)&in_x, n * sizeof(float))); - - CHECK_CUDA(cudaMemcpy(in_x, thrust::raw_pointer_cast(x.data()), n * sizeof(float), cudaMemcpyDeviceToDevice)); - //CHECK_CUDA(cudaMemcpy(Qx, hY, n * sizeof(float), cudaMemcpyHostToDevice)); - - CHECK_CUSPARSE(cusparseCreateDnVec(&vecX, n, in_x, CUDA_R_32F)); - CHECK_CUSPARSE(cusparseCreateDnVec(&vecY, n, Qx, CUDA_R_32F)); - - // Allocate buffer - CHECK_CUSPARSE(cusparseSpMV_bufferSize(cusparseHandle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, - Q, - vecX, - &beta, - vecY, - CUDA_R_32F, - CUSPARSE_SPMV_ALG_DEFAULT, - &bufferSize)); - CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize)); - - // Perform Q * x - CHECK_CUSPARSE(cusparseSpMV(cusparseHandle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, - Q, - vecX, - &beta, - vecY, - CUDA_R_32F, - CUSPARSE_SPMV_ALG_DEFAULT, - dBuffer)); - - // Extract the raw pointers to the data in the dense vector descriptors - //float* Qx_ptr; - //CHECK_CUSPARSE(cusparseDnVecGetValues(vecY, (void**)&Qx_ptr)); - // Ensure the computation is complete before accessing the result - CHECK_CUDA(cudaDeviceSynchronize()); - - /*CHECK_CUDA(cudaMemcpy(hY, Qx, n * sizeof(float), cudaMemcpyDeviceToHost)); - for (int i = 0; i < n; i++) { - std::cout << hY[i] << std::endl; - }*/ - - // Compute the dot product x^T * (Q * x) using cuBLAS - float result; - CHECK_CUBLAS(cublasSdot(cublasHandle, n, in_x, 1, Qx, 1, &result)); - - // Clean up - CHECK_CUSPARSE(cusparseDestroyDnVec(vecX)); - CHECK_CUSPARSE(cusparseDestroyDnVec(vecY)); - CHECK_CUDA(cudaFree(dBuffer)); - CHECK_CUDA(cudaFree(Qx)); - 
CHECK_CUDA(cudaFree(in_x)); - - return result; -} - +// +////void brute_force_solutions(cublasHandle_t cublasHandle, +//// cusparseHandle_t handle, +//// int n, +//// const cusparseSpMatDescr_t& Q) { +//// +//// int num_solutions = 1 << n; // 2^n +//// thrust::device_vector d_solutions(num_solutions * n); +//// for (int i = 0; i < num_solutions; ++i) { +//// for (int j = 0; j < n; ++j) { +//// float val = static_cast((i & (1 << j)) != 0); +//// d_solutions[i * n + j] = val; +//// } +//// } +//// +//// +//// // Allocate memory for energies +//// thrust::device_vector d_energies(num_solutions); +//// thrust::device_vector sol_x(n); +//// for (int i = 0; i < num_solutions; ++i) { +//// thrust::copy(d_solutions.begin() + (i * n), d_solutions.begin() + ((i + 1) * n), sol_x.begin()); +//// float eng = calculate_qubo_energy(cublasHandle, handle, n, Q, sol_x); +//// d_energies[i] = eng; +//// std::cout << "Energy i: " << eng << std::endl; +//// } +//// +//// +//// // Find the minimum energy +//// auto min_energy_iter = thrust::min_element(d_energies.begin(), d_energies.end()); +//// float min_energy = *min_energy_iter; +//// int min_index = min_energy_iter - d_energies.begin(); +//// +//// // Copy the best solution to host +//// thrust::host_vector h_best_solution(n); +//// thrust::copy(d_solutions.begin() + min_index * n, d_solutions.begin() + (min_index + 1) * n, h_best_solution.begin()); +//// +//// // Print the result +//// std::cout << "Best energy: " << min_energy << std::endl; +//// std::cout << "Best solution: "; +//// for (float bit : h_best_solution) { +//// std::cout << bit << " "; +//// } +//// std::cout << std::endl; +//// convert_sparse_to_dense_and_display(handle, Q, n); +////} +// //void brute_force_solutions(cublasHandle_t cublasHandle, // cusparseHandle_t handle, // int n, // const cusparseSpMatDescr_t& Q) { -// +// indicators::show_console_cursor(false); // int num_solutions = 1 << n; // 2^n // thrust::device_vector d_solutions(num_solutions * n); +// /* // for (int i = 0; i < num_solutions; ++i) { // for (int j = 0; j < n; ++j) { // float val = static_cast((i & (1 << j)) != 0); // d_solutions[i * n + j] = val; // } -// } -// +// }*/ // +// //convert_sparse_to_dense_and_display(handle, Q, n); // // Allocate memory for energies // thrust::device_vector d_energies(num_solutions); // thrust::device_vector sol_x(n); +// +// indicators::ProgressBar bar{ +// indicators::option::BarWidth{40}, +// indicators::option::Start{"["}, +// indicators::option::Fill{"="}, +// indicators::option::Lead{">"}, +// indicators::option::Remainder{" "}, +// indicators::option::End{" ]"}, +// indicators::option::ForegroundColor{indicators::Color::white}, +// indicators::option::FontStyles{ +// std::vector{indicators::FontStyle::bold}}, +// indicators::option::MaxProgress{num_solutions} +// }; +// int smallest_ind = -1; +// float smallest_eng = 1000; +// // for (int i = 0; i < num_solutions; ++i) { -// thrust::copy(d_solutions.begin() + (i * n), d_solutions.begin() + ((i + 1) * n), sol_x.begin()); -// float eng = calculate_qubo_energy(cublasHandle, handle, n, Q, sol_x); +// for (int j = 0; j < n; ++j) { +// float val = static_cast((i & (1 << j)) != 0); +// sol_x[j] = val; +// d_solutions[i * n + j] = val; +// } +// float eng = qubo_eng(cublasHandle, handle, Q, thrust::raw_pointer_cast(sol_x.data())); +// //float eng = calculate_qubo_energy(cublasHandle, handle, n, Q, sol_x); // d_energies[i] = eng; -// std::cout << "Energy i: " << eng << std::endl; +// if (eng <= smallest_eng) { +// smallest_ind = i; +// 
smallest_eng = eng; +// } +// //std::cout << "Energy i: " << eng << std::endl; +// bar.set_option(indicators::option::PostfixText{ std::to_string(i) + "/" + std::to_string(num_solutions) }); +// bar.tick(); // } -// -// +// std::cout << "Tested all solutions, now sorting." << std::endl; // // Find the minimum energy // auto min_energy_iter = thrust::min_element(d_energies.begin(), d_energies.end()); // float min_energy = *min_energy_iter; @@ -687,10 +759,11 @@ float calculate_qubo_energy(cublasHandle_t cublasHandle, // // // Copy the best solution to host // thrust::host_vector h_best_solution(n); -// thrust::copy(d_solutions.begin() + min_index * n, d_solutions.begin() + (min_index + 1) * n, h_best_solution.begin()); +// thrust::copy(d_solutions.begin() + smallest_ind * n, d_solutions.begin() + (smallest_ind + 1) * n, h_best_solution.begin()); // // // Print the result -// std::cout << "Best energy: " << min_energy << std::endl; +// std::cout << "Best energy my: " << smallest_eng << " and id: " << smallest_ind << std::endl; +// std::cout << "Best energy: " << min_energy << " and id: " << min_index << std::endl; // std::cout << "Best solution: "; // for (float bit : h_best_solution) { // std::cout << bit << " "; @@ -698,198 +771,125 @@ float calculate_qubo_energy(cublasHandle_t cublasHandle, // std::cout << std::endl; // convert_sparse_to_dense_and_display(handle, Q, n); //} - -void brute_force_solutions(cublasHandle_t cublasHandle, - cusparseHandle_t handle, - int n, - const cusparseSpMatDescr_t& Q) { - indicators::show_console_cursor(false); - int num_solutions = 1 << n; // 2^n - thrust::device_vector d_solutions(num_solutions * n); - /* - for (int i = 0; i < num_solutions; ++i) { - for (int j = 0; j < n; ++j) { - float val = static_cast((i & (1 << j)) != 0); - d_solutions[i * n + j] = val; - } - }*/ - - //convert_sparse_to_dense_and_display(handle, Q, n); - // Allocate memory for energies - thrust::device_vector d_energies(num_solutions); - thrust::device_vector sol_x(n); - - indicators::ProgressBar bar{ - indicators::option::BarWidth{40}, - indicators::option::Start{"["}, - indicators::option::Fill{"="}, - indicators::option::Lead{">"}, - indicators::option::Remainder{" "}, - indicators::option::End{" ]"}, - indicators::option::ForegroundColor{indicators::Color::white}, - indicators::option::FontStyles{ - std::vector{indicators::FontStyle::bold}}, - indicators::option::MaxProgress{num_solutions} - }; - int smallest_ind = -1; - float smallest_eng = 1000; - - for (int i = 0; i < num_solutions; ++i) { - for (int j = 0; j < n; ++j) { - float val = static_cast((i & (1 << j)) != 0); - sol_x[j] = val; - d_solutions[i * n + j] = val; - } - float eng = qubo_eng(cublasHandle, handle, Q, thrust::raw_pointer_cast(sol_x.data())); - //float eng = calculate_qubo_energy(cublasHandle, handle, n, Q, sol_x); - d_energies[i] = eng; - if (eng <= smallest_eng) { - smallest_ind = i; - smallest_eng = eng; - } - //std::cout << "Energy i: " << eng << std::endl; - bar.set_option(indicators::option::PostfixText{ std::to_string(i) + "/" + std::to_string(num_solutions) }); - bar.tick(); - } - std::cout << "Tested all solutions, now sorting." 
<< std::endl; - // Find the minimum energy - auto min_energy_iter = thrust::min_element(d_energies.begin(), d_energies.end()); - float min_energy = *min_energy_iter; - int min_index = min_energy_iter - d_energies.begin(); - - // Copy the best solution to host - thrust::host_vector h_best_solution(n); - thrust::copy(d_solutions.begin() + smallest_ind * n, d_solutions.begin() + (smallest_ind + 1) * n, h_best_solution.begin()); - - // Print the result - std::cout << "Best energy my: " << smallest_eng << " and id: " << smallest_ind << std::endl; - std::cout << "Best energy: " << min_energy << " and id: " << min_index << std::endl; - std::cout << "Best solution: "; - for (float bit : h_best_solution) { - std::cout << bit << " "; - } - std::cout << std::endl; - convert_sparse_to_dense_and_display(handle, Q, n); -} - - -int main() { - int n = 12; - int seed = 14; - float density = 0.5; - std::mt19937 rng(seed); - - std::vector p = generate_initial_permutation(rng, n); - - int split = estimate_split(density, n); // Example split, can be computed as needed - std::cout << "Split: " << split << std::endl; - - thrust::device_vector d_rows; - thrust::device_vector d_cols; - thrust::device_vector d_vals; - - create_graph_sparse(n, split, p, d_rows, d_cols, d_vals); - thrust::device_vector d_x = generate_solution(p, split); - std::cout << "Graph created."; - // Print the result (for debugging purposes) - thrust::host_vector h_rows = d_rows; - thrust::host_vector h_cols = d_cols; - thrust::host_vector h_vals = d_vals; - thrust::host_vector h_x = d_x; - int print_size = 2*n; - std::cout << "Rows: "; - for (int i = 0; i < print_size; ++i) { - std::cout << h_rows[i] << " "; - } - std::cout << std::endl; - - std::cout << "Cols: "; - for (int i = 0; i < print_size; ++i) { - std::cout << h_cols[i] << " "; - } - std::cout << std::endl; - - std::cout << "Vals: "; - for (int i = 0; i < print_size; ++i) { - std::cout << h_vals[i] << " "; - } - std::cout << std::endl; - - std::cout << "Solution: "; - for (int i = 0; i < n; ++i) { - std::cout << h_x[i] << " "; - } - std::cout << std::endl; - - cusparseHandle_t handle; - cublasHandle_t cublasHandle; - cusparseSpMatDescr_t graph_csr, Q; - - // Initialize cuSPARSE - cublasCreate(&cublasHandle); - CHECK_CUSPARSE(cusparseCreate(&handle)); - int nnz = d_vals.size(); - thrust::device_vector d_csrOffsets(n + 1); - cusparseXcoo2csr(handle, - thrust::raw_pointer_cast(d_rows.data()), - nnz, - n, - thrust::raw_pointer_cast(d_csrOffsets.data()), - CUSPARSE_INDEX_BASE_ZERO); - - CHECK_CUSPARSE(cusparseCreateCsr(&graph_csr, - n, - n, - nnz, - thrust::raw_pointer_cast(d_csrOffsets.data()), - thrust::raw_pointer_cast(d_cols.data()), // column indices - thrust::raw_pointer_cast(d_vals.data()), // values - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); - - thrust::device_vector d_QVals(nnz); - CHECK_CUSPARSE(cusparseCreateCsr(&Q, - n, - n, - nnz, - thrust::raw_pointer_cast(d_csrOffsets.data()), - thrust::raw_pointer_cast(d_cols.data()), // column indices - thrust::raw_pointer_cast(d_QVals.data()), // values - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); - // Now matDescr can be used in further cuSPARSE operations - - int* newRowsPtr; - int* newCols; - float* newVals; - csr_data extended_sparse_data; - - CHECK_CUDA(cudaMalloc((void**)&newRowsPtr, (n + 1) * sizeof(int))); - CHECK_CUDA(cudaMalloc((void**)&newCols, (nnz + n) * sizeof(int))); - CHECK_CUDA(cudaMalloc((void**)&newVals, (nnz + n) * sizeof(float))); - - 
extended_sparse_data.rowPointer = newRowsPtr; - extended_sparse_data.cols = newCols; - extended_sparse_data.vals = newVals; - - - std::cout << "Sparse matrix created successfully!" << std::endl; - graph_to_qubo(handle, graph_csr, Q, extended_sparse_data); - /*convert_sparse_to_dense_and_display(handle, graph_csr, n); */ - //convert_sparse_to_dense_and_display(handle, Q, n); - - float planted_energy = qubo_eng(cublasHandle, handle, Q, thrust::raw_pointer_cast(d_x.data())); - - std::cout << "Qubo energy of planted solution: " << planted_energy << std::endl; - - /*float energy = calculate_qubo_energy(cublasHandle, handle, n, Q, d_x); - std::cout << "Qubo energy: " << energy << std::endl;*/ - brute_force_solutions(cublasHandle, handle, n, Q); - - // Clean up - CHECK_CUSPARSE(cusparseDestroySpMat(graph_csr)); - CHECK_CUSPARSE(cusparseDestroySpMat(Q)); - CHECK_CUSPARSE(cusparseDestroy(handle)); - - return 0; -}; +// +// +//int main() { +// int n = 12; +// int seed = 14; +// float density = 0.5; +// std::mt19937 rng(seed); +// +// std::vector p = generate_initial_permutation(rng, n); +// +// int split = estimate_split(density, n); // Example split, can be computed as needed +// std::cout << "Split: " << split << std::endl; +// +// thrust::device_vector d_rows; +// thrust::device_vector d_cols; +// thrust::device_vector d_vals; +// +// create_graph_sparse(n, split, p, d_rows, d_cols, d_vals); +// thrust::device_vector d_x = generate_solution(p, split); +// std::cout << "Graph created."; +// // Print the result (for debugging purposes) +// thrust::host_vector h_rows = d_rows; +// thrust::host_vector h_cols = d_cols; +// thrust::host_vector h_vals = d_vals; +// thrust::host_vector h_x = d_x; +// int print_size = 2*n; +// std::cout << "Rows: "; +// for (int i = 0; i < print_size; ++i) { +// std::cout << h_rows[i] << " "; +// } +// std::cout << std::endl; +// +// std::cout << "Cols: "; +// for (int i = 0; i < print_size; ++i) { +// std::cout << h_cols[i] << " "; +// } +// std::cout << std::endl; +// +// std::cout << "Vals: "; +// for (int i = 0; i < print_size; ++i) { +// std::cout << h_vals[i] << " "; +// } +// std::cout << std::endl; +// +// std::cout << "Solution: "; +// for (int i = 0; i < n; ++i) { +// std::cout << h_x[i] << " "; +// } +// std::cout << std::endl; +// +// cusparseHandle_t handle; +// cublasHandle_t cublasHandle; +// cusparseSpMatDescr_t graph_csr, Q; +// +// // Initialize cuSPARSE +// cublasCreate(&cublasHandle); +// CHECK_CUSPARSE(cusparseCreate(&handle)); +// int nnz = d_vals.size(); +// thrust::device_vector d_csrOffsets(n + 1); +// cusparseXcoo2csr(handle, +// thrust::raw_pointer_cast(d_rows.data()), +// nnz, +// n, +// thrust::raw_pointer_cast(d_csrOffsets.data()), +// CUSPARSE_INDEX_BASE_ZERO); +// +// CHECK_CUSPARSE(cusparseCreateCsr(&graph_csr, +// n, +// n, +// nnz, +// thrust::raw_pointer_cast(d_csrOffsets.data()), +// thrust::raw_pointer_cast(d_cols.data()), // column indices +// thrust::raw_pointer_cast(d_vals.data()), // values +// CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, +// CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); +// +// thrust::device_vector d_QVals(nnz); +// CHECK_CUSPARSE(cusparseCreateCsr(&Q, +// n, +// n, +// nnz, +// thrust::raw_pointer_cast(d_csrOffsets.data()), +// thrust::raw_pointer_cast(d_cols.data()), // column indices +// thrust::raw_pointer_cast(d_QVals.data()), // values +// CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, +// CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); +// // Now matDescr can be used in further cuSPARSE operations +// +// int* newRowsPtr; +// int* 
newCols; +// float* newVals; +// csr_data extended_sparse_data; +// +// CHECK_CUDA(cudaMalloc((void**)&newRowsPtr, (n + 1) * sizeof(int))); +// CHECK_CUDA(cudaMalloc((void**)&newCols, (nnz + n) * sizeof(int))); +// CHECK_CUDA(cudaMalloc((void**)&newVals, (nnz + n) * sizeof(float))); +// +// extended_sparse_data.rowPointer = newRowsPtr; +// extended_sparse_data.cols = newCols; +// extended_sparse_data.vals = newVals; +// +// +// std::cout << "Sparse matrix created successfully!" << std::endl; +// graph_to_qubo(handle, graph_csr, Q, extended_sparse_data); +// /*convert_sparse_to_dense_and_display(handle, graph_csr, n); */ +// //convert_sparse_to_dense_and_display(handle, Q, n); +// +// float planted_energy = qubo_eng(cublasHandle, handle, Q, thrust::raw_pointer_cast(d_x.data())); +// +// std::cout << "Qubo energy of planted solution: " << planted_energy << std::endl; +// +// /*float energy = calculate_qubo_energy(cublasHandle, handle, n, Q, d_x); +// std::cout << "Qubo energy: " << energy << std::endl;*/ +// brute_force_solutions(cublasHandle, handle, n, Q); +// +// // Clean up +// CHECK_CUSPARSE(cusparseDestroySpMat(graph_csr)); +// CHECK_CUSPARSE(cusparseDestroySpMat(Q)); +// CHECK_CUSPARSE(cusparseDestroy(handle)); +// +// return 0; +//}; diff --git a/CudaMaxCutPlanted/main.cu b/CudaMaxCutPlanted/main.cu new file mode 100644 index 0000000..cef2f1c --- /dev/null +++ b/CudaMaxCutPlanted/main.cu @@ -0,0 +1,636 @@ +//#include +//#include +#include "CudaSparseMatrix.hpp" +//#include +#include +#include +//#include +//#include +#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +#include +#include +//#include +//#include +//#include +#include "indicators.hpp" +#ifdef __INTELLISENSE__ +#include "intellisense_cuda_intrinsics.h" +#endif + + + +void scale_csr_matrix(cusparseHandle_t handle, + float alpha, + cusparseSpMatDescr_t& input, + cusparseSpMatDescr_t& result) { + // Extract the dimensions and the number of non-zero elements + int64_t n, nnz; + int* d_csrOffsets; + int* d_cols; + float* d_vals; + + int64_t n_r, nnz_r; + int* d_csrOffsets_r; + int* d_cols_r; + float* d_vals_r; + + cusparseIndexType_t csrRowOffsetsType; + cusparseIndexType_t csrColIndType; + cusparseIndexBase_t idxBase; + cudaDataType valueTyp; + + cusparseCsrGet(input, &n, &n, &nnz, (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp); + cusparseCsrGet(result, &n_r, &n_r, &nnz_r, (void**)&d_csrOffsets_r, (void**)&d_cols_r, (void**)&d_vals_r, &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp); + + // Set scaling factors + const float beta = 0.0f; + size_t bufferSize = 0; + + cusparseMatDescr_t input_desc; + cusparseCreateMatDescr(&input_desc); + + // Create matrix descriptor for the result matrix C + cusparseMatDescr_t result_desc; + cusparseCreateMatDescr(&result_desc); + + // Get buffer size for the operation + cusparseScsrgeam2_bufferSizeExt(handle, + n, n, + &alpha, input_desc, nnz, + d_vals, + d_csrOffsets, + d_cols, + &beta, input_desc, nnz, + d_vals, + d_csrOffsets, + d_cols, + result_desc, + d_vals_r, + d_csrOffsets_r, + d_cols_r, + &bufferSize); + + void* dBuffer; + cudaMalloc(&dBuffer, bufferSize); + + // Perform the scaling operation + cusparseScsrgeam2(handle, + n, n, + &alpha, input_desc, nnz, + d_vals, + d_csrOffsets, + d_cols, + &beta, input_desc, nnz, + d_vals, + d_csrOffsets, + d_cols, + result_desc, + d_vals_r, + d_csrOffsets_r, + d_cols_r, + dBuffer); + // Clean up + cudaFree(dBuffer); + 
cusparseDestroyMatDescr(input_desc);
+    cusparseDestroyMatDescr(result_desc);
+}
+
+// Generate a random permutation of {0, ..., n-1} on the host and copy it to the device
+int* generate_initial_permutation(std::mt19937& rng, int n) {
+    std::vector<int> permutation(n);
+    for (int i = 0; i < n; ++i) {
+        permutation[i] = i;
+    }
+    std::shuffle(permutation.begin(), permutation.end(), rng);
+
+    int* p;
+    CHECK_CUDA(cudaMalloc((void**)&p, n * sizeof(int)));
+    CHECK_CUDA(cudaMemcpy(p, permutation.data(), n * sizeof(int), cudaMemcpyHostToDevice));
+
+    return p;
+}
+
+
+void create_graph_sparse(int n, int nnz, int split, const int* p, int* I, int* J, float* V) {
+
+    curandState* states;
+    // One curand state per nonzero; note sizeof(curandState), not sizeof(states),
+    // which would only be the size of the pointer.
+    CHECK_CUDA(cudaMalloc((void**)&states, nnz * sizeof(curandState)));
+    int gridSize = (nnz + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+    create_random_matrix<<<gridSize, BLOCK_SIZE>>>(n, nnz, split, p, I, J, V, states);
+
+    CHECK_CUDA(cudaDeviceSynchronize());
+
+    // Wrap raw device pointers in thrust device pointers for sorting
+    thrust::device_ptr<int> dev_I(I);
+    thrust::device_ptr<int> dev_J(J);
+    thrust::device_ptr<float> dev_V(V);
+
+    // First, sort by the secondary key (J) using stable sort
+    thrust::stable_sort_by_key(dev_J, dev_J + nnz, thrust::make_zip_iterator(thrust::make_tuple(dev_I, dev_V)));
+
+    // Then, sort by the primary key (I) using stable sort to maintain the order of the secondary key
+    thrust::stable_sort_by_key(dev_I, dev_I + nnz, thrust::make_zip_iterator(thrust::make_tuple(dev_J, dev_V)));
+
+    // Debug dump of the generated COO triplets
+    int* h_rows = new int[nnz];
+    int* h_cols = new int[nnz];
+    float* h_vals = new float[nnz];
+
+    CHECK_CUDA(cudaMemcpy(h_rows, I, nnz * sizeof(int), cudaMemcpyDeviceToHost));
+    CHECK_CUDA(cudaMemcpy(h_cols, J, nnz * sizeof(int), cudaMemcpyDeviceToHost));
+    CHECK_CUDA(cudaMemcpy(h_vals, V, nnz * sizeof(float), cudaMemcpyDeviceToHost));
+
+    for (int i = 0; i < nnz; ++i) {
+        std::cout << "I: " << h_rows[i] << " J: " << h_cols[i] << " V: " << h_vals[i] << std::endl;
+    }
+
+    CHECK_CUDA(cudaFree(states));
+    delete[] h_rows;
+    delete[] h_cols;
+    delete[] h_vals;
+
+    //thrust::host_vector<int> h_rows, h_cols;
+    //thrust::host_vector<float> h_vals;
+
+    //thrust::default_random_engine rng;
+    //thrust::random::uniform_real_distribution<float> dist(0.01f, 1.0f);
+
+    //std::vector<std::tuple<int, int, float>> combined;
+
+    //for (int i = 0; i < split; ++i) {
+    //    std::cout << "P_" << i << ":" << p[i] << std::endl;
+    //    for (int j = split; j < n; ++j) {
+    //        std::cout << "I" << ": " << i << " ";
+    //        std::cout << "J" << ":" << j << std::endl;
+    //        float rnd_val = dist(rng);
+    //        combined.push_back(std::make_tuple(p[i], p[j], rnd_val));
+    //        //combined.push_back(std::make_tuple(p[j], p[i], rnd_val));
+    //    }
+    //}
+
+    //std::sort(combined.begin(), combined.end(), [](const auto& a, const auto& b) {
+    //    if (std::get<0>(a) == std::get<0>(b)) {
+    //        return std::get<1>(a) < std::get<1>(b);
+    //    }
+    //    return std::get<0>(a) < std::get<0>(b);
+    //    });
+
+    //for (size_t i = 0; i < combined.size(); ++i) {
+    //    h_rows.push_back(std::get<0>(combined[i]));
+    //    h_cols.push_back(std::get<1>(combined[i]));
+    //    h_vals.push_back(std::get<2>(combined[i]));
+    //}
+
+    //d_rows = h_rows;
+    //d_cols = h_cols;
+    //d_vals = h_vals;
+}
+
+char* generate_solution(const int* p, int split, int n) {
+    char* x;
+    CHECK_CUDA(cudaMalloc((void**)&x, n * sizeof(char)));
+    // Zero-initialize: every vertex starts outside the planted set
+    CHECK_CUDA(cudaMemset(x, 0, n));
+    int gridSize = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    set_true_elements<<<gridSize, BLOCK_SIZE>>>(split, p, x);
+    CHECK_CUDA(cudaDeviceSynchronize());
+
+    return x;
+}
+
+void graph_to_qubo(CudaSparseMatrix& Q) {
+    Q.multiply(-1.0f);
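+    // Build the QUBO in place: the adjacency was negated above, and the per-row
+    // sums are written onto the diagonal below -- the usual Max-Cut-to-QUBO
+    // construction, so that the planted cut should minimize x^T Q x. This assumes
+    // CudaSparseMatrix::sum(0) returns the row sums as a device array that
+    // fill_diagonal() consumes (project API inferred from usage here; note the
+    // old path, kept commented below, also applied a -0.25 scale).
+    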
float* row_sum = Q.sum(0); + Q.fill_diagonal(row_sum); + /*scale_csr_matrix(handle, -1.0f, graph, Q); + thrust::device_vector sum = sum_rows_csr_matrix(handle, graph); + fill_diagonal(handle, Q, sum, extended_pointers); + scale_csr_matrix(handle, -0.25f, Q, Q); + + for (int i = 0; i < sum.size(); i++) { + std::cout << sum[i] << std::endl; + }*/ +} + +// +//float qubo_eng(cublasHandle_t cublasHandle, +// cusparseHandle_t handle, +// const cusparseSpMatDescr_t& Q, +// float* sol_vector) { +// int64_t rows, cols, nnz; +// int* d_csrOffsets, * d_cols; +// float* d_vals; +// int* dA_csrOffsets, * dA_columns; +// float* dA_values; +// +// cusparseIndexType_t csrRowOffsetsType; +// cusparseIndexType_t csrColIndType; +// cusparseIndexBase_t idxBase; +// cudaDataType valueTyp; +// +// CHECK_CUSPARSE(cusparseSpMatGetSize(Q, &rows, &cols, &nnz)); +// +// CHECK_CUDA(cudaMalloc((void**)&d_csrOffsets, (rows + 1) * sizeof(int))); +// CHECK_CUDA(cudaMalloc((void**)&d_cols, nnz * sizeof(int))); +// CHECK_CUDA(cudaMalloc((void**)&d_vals, nnz * sizeof(float))); +// +// CHECK_CUSPARSE(cusparseCsrGet(Q, &rows, &cols, &nnz, +// (void**)&d_csrOffsets, (void**)&d_cols, (void**)&d_vals, +// &csrRowOffsetsType, &csrColIndType, &idxBase, &valueTyp)); +// // Host problem definition +// const int A_num_rows = rows; +// const int A_num_cols = cols; +// const int A_nnz = nnz; +// //int h_test[11]; +// //int* h_test = (int*)calloc(rows+1, sizeof(int)); +// float alpha = 1.0f; +// float beta = 0.0f; +// //CHECK_CUDA(cudaMemcpy(h_test, d_csrOffsets, (rows + 1) * sizeof(int), cudaMemcpyDeviceToHost)); +// ////cudaMemcpy(h_test, d_csrOffsets, (A_num_rows + 1) * sizeof(int), cudaMemcpyDeviceToHost); +// ////CHECK_CUDA(cudaMemcpy(hCsrRowOffsets_toverify, dCsrRowOffsets_toverify, (rows + 1) * sizeof(int), cudaMemcpyDeviceToHost)); +// //for (int i = 0; i < A_num_rows+1; i++) { +// // std::cout << h_test[i] << std::endl; +// //} +// //-------------------------------------------------------------------------- +// // Device memory management +// float* dX, * dY; +// CHECK_CUDA(cudaMalloc((void**)&dA_csrOffsets, (A_num_rows + 1) * sizeof(int))); +// CHECK_CUDA(cudaMalloc((void**)&dA_columns, A_nnz * sizeof(int))); +// CHECK_CUDA(cudaMalloc((void**)&dA_values, A_nnz * sizeof(float))); +// CHECK_CUDA(cudaMalloc((void**)&dX, A_num_cols * sizeof(float))); +// CHECK_CUDA(cudaMalloc((void**)&dY, A_num_rows * sizeof(float))); +// +// CHECK_CUDA(cudaMemcpy(dA_csrOffsets, d_csrOffsets, (A_num_rows + 1) * sizeof(int), cudaMemcpyDeviceToDevice)); +// CHECK_CUDA(cudaMemcpy(dA_columns, d_cols, A_nnz * sizeof(int), cudaMemcpyDeviceToDevice)); +// CHECK_CUDA(cudaMemcpy(dA_values, d_vals, A_nnz * sizeof(float), cudaMemcpyDeviceToDevice)); +// CHECK_CUDA(cudaMemcpy(dX, sol_vector, A_num_cols * sizeof(float), cudaMemcpyDeviceToDevice)); +// cusparseSpMatDescr_t matA; +// cusparseDnVecDescr_t vecX, vecY; +// void* dBuffer = NULL; +// size_t bufferSize = 0; +// CHECK_CUSPARSE(cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_nnz, +// dA_csrOffsets, dA_columns, dA_values, +// CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, +// CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); +// // Create dense vector X +// CHECK_CUSPARSE(cusparseCreateDnVec(&vecX, A_num_cols, dX, CUDA_R_32F)); +// // Create dense vector y +// CHECK_CUSPARSE(cusparseCreateDnVec(&vecY, A_num_rows, dY, CUDA_R_32F)); +// // allocate an external buffer if needed +// CHECK_CUSPARSE(cusparseSpMV_bufferSize( +// handle, CUSPARSE_OPERATION_NON_TRANSPOSE, +// &alpha, matA, vecX, &beta, vecY, CUDA_R_32F, +// 
CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)); +// CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize)); +// +// // execute SpMV +// CHECK_CUSPARSE(cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, +// &alpha, matA, vecX, &beta, vecY, CUDA_R_32F, +// CUSPARSE_SPMV_ALG_DEFAULT, dBuffer)); +// +// // destroy matrix/vector descriptors +// CHECK_CUSPARSE(cusparseDestroySpMat(matA)); +// CHECK_CUSPARSE(cusparseDestroyDnVec(vecX)); +// CHECK_CUSPARSE(cusparseDestroyDnVec(vecY)); +// +// /*CHECK_CUDA(cudaMemcpy(hY, dY, A_num_rows * sizeof(float), cudaMemcpyDeviceToHost)); +// std::cout << "Result of Q*x: " << std::endl; +// for (int i = 0; i < A_num_rows; i++) { +// std::cout << hY[i] << std::endl; +// }*/ +// +// float result; +// CHECK_CUBLAS(cublasSdot(cublasHandle, A_num_rows, dX, 1, dY, 1, &result)); +// //std::cout << "Energy: " << result << std::endl; +// return result; +//} + +//float calculate_qubo_energy(cublasHandle_t cublasHandle, +// cusparseHandle_t cusparseHandle, +// int n, +// const cusparseSpMatDescr_t& Q, +// thrust::device_vector& x) { +// float alpha = 1.0f; +// float beta = 0.0f; +// size_t bufferSize = 0; +// void* dBuffer = nullptr; +// +// // Create dense vector descriptors +// cusparseDnVecDescr_t vecX, vecY; +// float* Qx; +// float* Qx_ptr; +// //float* hY = (float*)calloc(n, sizeof(float)); +// float* in_x; +// +// /* for (int i = 0; i < n; i++) { +// hY[i] = 0.0f; +// }*/ +// +// CHECK_CUDA(cudaMalloc((void**)&Qx, n * sizeof(float))); +// CHECK_CUDA(cudaMalloc((void**)&Qx_ptr, n * sizeof(float))); +// CHECK_CUDA(cudaMalloc((void**)&in_x, n * sizeof(float))); +// +// CHECK_CUDA(cudaMemcpy(in_x, thrust::raw_pointer_cast(x.data()), n * sizeof(float), cudaMemcpyDeviceToDevice)); +// //CHECK_CUDA(cudaMemcpy(Qx, hY, n * sizeof(float), cudaMemcpyHostToDevice)); +// +// CHECK_CUSPARSE(cusparseCreateDnVec(&vecX, n, in_x, CUDA_R_32F)); +// CHECK_CUSPARSE(cusparseCreateDnVec(&vecY, n, Qx, CUDA_R_32F)); +// +// // Allocate buffer +// CHECK_CUSPARSE(cusparseSpMV_bufferSize(cusparseHandle, +// CUSPARSE_OPERATION_NON_TRANSPOSE, +// &alpha, +// Q, +// vecX, +// &beta, +// vecY, +// CUDA_R_32F, +// CUSPARSE_SPMV_ALG_DEFAULT, +// &bufferSize)); +// CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize)); +// +// // Perform Q * x +// CHECK_CUSPARSE(cusparseSpMV(cusparseHandle, +// CUSPARSE_OPERATION_NON_TRANSPOSE, +// &alpha, +// Q, +// vecX, +// &beta, +// vecY, +// CUDA_R_32F, +// CUSPARSE_SPMV_ALG_DEFAULT, +// dBuffer)); +// +// // Extract the raw pointers to the data in the dense vector descriptors +// //float* Qx_ptr; +// //CHECK_CUSPARSE(cusparseDnVecGetValues(vecY, (void**)&Qx_ptr)); +// // Ensure the computation is complete before accessing the result +// CHECK_CUDA(cudaDeviceSynchronize()); +// +// /*CHECK_CUDA(cudaMemcpy(hY, Qx, n * sizeof(float), cudaMemcpyDeviceToHost)); +// for (int i = 0; i < n; i++) { +// std::cout << hY[i] << std::endl; +// }*/ +// +// // Compute the dot product x^T * (Q * x) using cuBLAS +// float result; +// CHECK_CUBLAS(cublasSdot(cublasHandle, n, in_x, 1, Qx, 1, &result)); +// +// // Clean up +// CHECK_CUSPARSE(cusparseDestroyDnVec(vecX)); +// CHECK_CUSPARSE(cusparseDestroyDnVec(vecY)); +// CHECK_CUDA(cudaFree(dBuffer)); +// CHECK_CUDA(cudaFree(Qx)); +// CHECK_CUDA(cudaFree(in_x)); +// +// return result; +//} +// +//void brute_force_solutions(cublasHandle_t cublasHandle, +// cusparseHandle_t handle, +// int n, +// const cusparseSpMatDescr_t& Q) { +// indicators::show_console_cursor(false); +// int num_solutions = 1 << n; // 2^n +// thrust::device_vector 
<float> d_solutions(num_solutions * n);
+//    /*
+//    for (int i = 0; i < num_solutions; ++i) {
+//        for (int j = 0; j < n; ++j) {
+//            float val = static_cast<float>((i & (1 << j)) != 0);
+//            d_solutions[i * n + j] = val;
+//        }
+//    }*/
+//
+//    //convert_sparse_to_dense_and_display(handle, Q, n);
+//    // Allocate memory for energies
+//    thrust::device_vector<float> d_energies(num_solutions);
+//    thrust::device_vector<float> sol_x(n);
+//
+//    indicators::ProgressBar bar{
+//        indicators::option::BarWidth{40},
+//        indicators::option::Start{"["},
+//        indicators::option::Fill{"="},
+//        indicators::option::Lead{">"},
+//        indicators::option::Remainder{" "},
+//        indicators::option::End{" ]"},
+//        indicators::option::ForegroundColor{indicators::Color::white},
+//        indicators::option::FontStyles{
+//            std::vector<indicators::FontStyle>{indicators::FontStyle::bold}},
+//        indicators::option::MaxProgress{num_solutions}
+//    };
+//    int smallest_ind = -1;
+//    float smallest_eng = 1000;
+//
+//    for (int i = 0; i < num_solutions; ++i) {
+//        for (int j = 0; j < n; ++j) {
+//            float val = static_cast<float>((i & (1 << j)) != 0);
+//            sol_x[j] = val;
+//            d_solutions[i * n + j] = val;
+//        }
+//        float eng = qubo_eng(cublasHandle, handle, Q, thrust::raw_pointer_cast(sol_x.data()));
+//        //float eng = calculate_qubo_energy(cublasHandle, handle, n, Q, sol_x);
+//        d_energies[i] = eng;
+//        if (eng <= smallest_eng) {
+//            smallest_ind = i;
+//            smallest_eng = eng;
+//        }
+//        //std::cout << "Energy i: " << eng << std::endl;
+//        bar.set_option(indicators::option::PostfixText{ std::to_string(i) + "/" + std::to_string(num_solutions) });
+//        bar.tick();
+//    }
+//    std::cout << "Tested all solutions, now sorting." << std::endl;
+//    // Find the minimum energy
+//    auto min_energy_iter = thrust::min_element(d_energies.begin(), d_energies.end());
+//    float min_energy = *min_energy_iter;
+//    int min_index = min_energy_iter - d_energies.begin();
+//
+//    // Copy the best solution to host
+//    thrust::host_vector<float> h_best_solution(n);
+//    thrust::copy(d_solutions.begin() + smallest_ind * n, d_solutions.begin() + (smallest_ind + 1) * n, h_best_solution.begin());
+//
+//    // Print the result
+//    std::cout << "Best energy my: " << smallest_eng << " and id: " << smallest_ind << std::endl;
+//    std::cout << "Best energy: " << min_energy << " and id: " << min_index << std::endl;
+//    std::cout << "Best solution: ";
+//    for (float bit : h_best_solution) {
+//        std::cout << bit << " ";
+//    }
+//    std::cout << std::endl;
+//    convert_sparse_to_dense_and_display(handle, Q, n);
+//}
+
+int estimate_split(float density, int n) {
+    if (density > 0.5f) {
+        std::cout << "Error: density cannot exceed 0.5!"
<< std::endl;
+        exit(EXIT_FAILURE); // fail fast: sqrt(inside) below would be NaN for density > 0.5
+    }
+
+    // The planted bipartition (s, n - s) contributes s * (n - s) edges; setting
+    // s * (n - s) = density * n^2 / 2 and solving the quadratic for s gives
+    // s = n * (1 + sqrt(1 - 2 * density)) / 2, where 1 - 2 * density == 2 * sparsity - 1.
+    float sparsity = 1.0f - density;
+    float inside = 2.0f * sparsity - 1.0f;
+    float smaller_set_size = 0.5f * n * (1.0f + sqrt(inside));
+    return static_cast<int>(smaller_set_size) - 1;
+}
+
+
+int main() {
+    int n = 12;
+    int seed = 14;
+    float density = 0.5f;
+    int* I, * J;
+    float* V;
+    std::mt19937 rng(seed);
+
+    int* p = generate_initial_permutation(rng, n);
+
+    int split = estimate_split(density, n); // split size derived from the target density
+    int nnz = split * (n - split);
+    std::cout << "Split: " << split << " nnz: " << nnz << std::endl;
+
+    CHECK_CUDA(cudaMalloc((void**)&I, nnz * sizeof(int)));
+    CHECK_CUDA(cudaMalloc((void**)&J, nnz * sizeof(int)));
+    CHECK_CUDA(cudaMalloc((void**)&V, nnz * sizeof(float)));
+
+    create_graph_sparse(n, nnz, split, p, I, J, V);
+
+    CudaSparseMatrix graph(I, J, V, n, nnz, SparseType::COO, MemoryType::Device);
+    graph.display();
+
+    char* x = generate_solution(p, split, n);
+
+    char* h_x = new char[n];
+
+    CHECK_CUDA(cudaMemcpy(h_x, x, n * sizeof(char), cudaMemcpyDeviceToHost));
+
+    for (int i = 0; i < n; i++) {
+        std::cout << "X_" << i << " " << static_cast<int>(h_x[i]) << std::endl;
+    }
+
+    CudaSparseMatrix Q(graph);
+    graph.clear();
+
+    graph_to_qubo(Q);
+
+    Q.display();
+
+    CHECK_CUDA(cudaFree(I));
+    CHECK_CUDA(cudaFree(J));
+    CHECK_CUDA(cudaFree(V));
+    CHECK_CUDA(cudaFree(p));
+    CHECK_CUDA(cudaFree(x));
+
+    delete[] h_x;
+
+    //std::cout << "Graph created.";
+    //// Print the result (for debugging purposes)
+    //thrust::host_vector<int> h_rows = d_rows;
+    //thrust::host_vector<int> h_cols = d_cols;
+    //thrust::host_vector<float> h_vals = d_vals;
+    //thrust::host_vector<float> h_x = d_x;
+    //int print_size = 2 * n;
+    //std::cout << "Rows: ";
+    //for (int i = 0; i < print_size; ++i) {
+    //    std::cout << h_rows[i] << " ";
+    //}
+    //std::cout << std::endl;
+
+    //std::cout << "Cols: ";
+    //for (int i = 0; i < print_size; ++i) {
+    //    std::cout << h_cols[i] << " ";
+    //}
+    //std::cout << std::endl;
+
+    //std::cout << "Vals: ";
+    //for (int i = 0; i < print_size; ++i) {
+    //    std::cout << h_vals[i] << " ";
+    //}
+    //std::cout << std::endl;
+
+    //std::cout << "Solution: ";
+    //for (int i = 0; i < n; ++i) {
+    //    std::cout << h_x[i] << " ";
+    //}
+    //std::cout << std::endl;
+
+    //cusparseHandle_t handle;
+    //cublasHandle_t cublasHandle;
+    //cusparseSpMatDescr_t graph_csr, Q;
+
+    //// Initialize cuSPARSE
+    //cublasCreate(&cublasHandle);
+    //CHECK_CUSPARSE(cusparseCreate(&handle));
+    //int nnz = d_vals.size();
+    //thrust::device_vector<int> d_csrOffsets(n + 1);
+    //cusparseXcoo2csr(handle,
+    //    thrust::raw_pointer_cast(d_rows.data()),
+    //    nnz,
+    //    n,
+    //    thrust::raw_pointer_cast(d_csrOffsets.data()),
+    //    CUSPARSE_INDEX_BASE_ZERO);
+
+    //CHECK_CUSPARSE(cusparseCreateCsr(&graph_csr,
+    //    n,
+    //    n,
+    //    nnz,
+    //    thrust::raw_pointer_cast(d_csrOffsets.data()),
+    //    thrust::raw_pointer_cast(d_cols.data()), // column indices
+    //    thrust::raw_pointer_cast(d_vals.data()), // values
+    //    CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+    //    CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
+
+    //thrust::device_vector<float> d_QVals(nnz);
+    //CHECK_CUSPARSE(cusparseCreateCsr(&Q,
+    //    n,
+    //    n,
+    //    nnz,
+    //    thrust::raw_pointer_cast(d_csrOffsets.data()),
+    //    thrust::raw_pointer_cast(d_cols.data()), // column indices
+    //    thrust::raw_pointer_cast(d_QVals.data()), // values
+    //    CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+    //    CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
+    //// Now matDescr can be used in further cuSPARSE operations
+
+    //int* newRowsPtr;
+    //int* newCols;
+    //float* newVals;
+    //csr_data extended_sparse_data;
+
+    //CHECK_CUDA(cudaMalloc((void**)&newRowsPtr, (n + 1) * sizeof(int)));
+    //CHECK_CUDA(cudaMalloc((void**)&newCols, (nnz + n) * sizeof(int)));
+    //CHECK_CUDA(cudaMalloc((void**)&newVals, (nnz + n) * sizeof(float)));
+
+    //extended_sparse_data.rowPointer = newRowsPtr;
+    //extended_sparse_data.cols = newCols;
+    //extended_sparse_data.vals = newVals;
+
+
+    //std::cout << "Sparse matrix created successfully!" << std::endl;
+    //graph_to_qubo(handle, graph_csr, Q, extended_sparse_data);
+    ///*convert_sparse_to_dense_and_display(handle, graph_csr, n); */
+    ////convert_sparse_to_dense_and_display(handle, Q, n);
+
+    //float planted_energy = qubo_eng(cublasHandle, handle, Q, thrust::raw_pointer_cast(d_x.data()));
+
+    //std::cout << "Qubo energy of planted solution: " << planted_energy << std::endl;
+
+    ///*float energy = calculate_qubo_energy(cublasHandle, handle, n, Q, d_x);
+    //std::cout << "Qubo energy: " << energy << std::endl;*/
+    //brute_force_solutions(cublasHandle, handle, n, Q);
+
+    //// Clean up
+    //CHECK_CUSPARSE(cusparseDestroySpMat(graph_csr));
+    //CHECK_CUSPARSE(cusparseDestroySpMat(Q));
+    //CHECK_CUSPARSE(cusparseDestroy(handle));
+
+    return 0;
+}
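
For reference: after this patch, main.cu builds Q and prints it, but the x^T Q x energy check remains commented out. Below is a minimal sketch of how that check could be reintroduced on top of raw CSR device arrays. It is a sketch under assumptions, not part of the patch: qubo_energy, its parameter names, and the caller-provided scratch vector d_y are illustrative, while the cuSPARSE generic SpMV + cuBLAS dot pattern mirrors the commented-out qubo_eng above (the planted solution, stored as char, would need to be converted to float first).

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusparse.h>

// Sketch: energy = x^T (Q x) for an n x n CSR matrix already on the device.
// d_y is caller-provided scratch of length n; error checking elided for brevity.
float qubo_energy(cublasHandle_t blas, cusparseHandle_t sparse,
                  int n, int nnz,
                  int* d_offsets, int* d_cols, float* d_vals,
                  const float* d_x, float* d_y) {
    float alpha = 1.0f, beta = 0.0f, energy = 0.0f;

    cusparseSpMatDescr_t Q;
    cusparseDnVecDescr_t x, y;
    cusparseCreateCsr(&Q, n, n, nnz, d_offsets, d_cols, d_vals,
                      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                      CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
    cusparseCreateDnVec(&x, n, (void*)d_x, CUDA_R_32F);
    cusparseCreateDnVec(&y, n, d_y, CUDA_R_32F);

    // y = Q * x via the generic SpMV API (workspace size queried first)
    size_t bufferSize = 0;
    void* dBuffer = nullptr;
    cusparseSpMV_bufferSize(sparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
                            &alpha, Q, x, &beta, y, CUDA_R_32F,
                            CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize);
    cudaMalloc(&dBuffer, bufferSize);
    cusparseSpMV(sparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
                 &alpha, Q, x, &beta, y, CUDA_R_32F,
                 CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);

    // energy = x . y; with a host result pointer, cublasSdot blocks until done
    cublasSdot(blas, n, d_x, 1, d_y, 1, &energy);

    cudaFree(dBuffer);
    cusparseDestroySpMat(Q);
    cusparseDestroyDnVec(x);
    cusparseDestroyDnVec(y);
    return energy;
}

Called with Q's CSR arrays and the planted solution cast to float, this would reproduce the "Qubo energy of planted solution" check from the commented-out main, without rebuilding descriptors and device copies on every call the way the old qubo_eng did.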