From 622988e7c03d39975a36f0eaafa9ecb3773ea992 Mon Sep 17 00:00:00 2001
From: Pranav Sivaraman <14294205+pranav-sivaraman@users.noreply.github.com>
Date: Sun, 5 Nov 2023 13:52:43 -0500
Subject: [PATCH 1/2] feat: add copy back of verification vector

---
 cuda/GridInit.cu        |  9 +++++++++
 cuda/Main.cu            | 18 ++++++++----------
 cuda/Simulation.cu      | 19 ++++++++++++++-----
 cuda/XSbench_header.cuh |  1 +
 4 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/cuda/GridInit.cu b/cuda/GridInit.cu
index 0cd5d185..13c62db1 100644
--- a/cuda/GridInit.cu
+++ b/cuda/GridInit.cu
@@ -87,6 +87,15 @@ void release_device_memory(SimulationData GSD) {
 	cudaFree(GSD.verification);
 }
 
+void release_memory(SimulationData SD) {
+	free(SD.num_nucs);
+	free(SD.concs);
+	free(SD.mats);
+	free(SD.unionized_energy_array);
+	free(SD.nuclide_grid);
+	free(SD.verification);
+}
+
 SimulationData grid_init_do_not_profile( Inputs in, int mype )
 {
 	// Structure to hold all allocated simuluation data arrays
diff --git a/cuda/Main.cu b/cuda/Main.cu
index 75ca2b0f..f5b81959 100644
--- a/cuda/Main.cu
+++ b/cuda/Main.cu
@@ -38,8 +38,6 @@ int main( int argc, char* argv[] )
 	if( in.binary_mode == WRITE && mype == 0 )
 		binary_write(in, SD);
 
-	// Move data to GPU
-	SimulationData GSD = move_simulation_data_to_device( in, mype, SD );
 
 	// =====================================================================
 	// Cross Section (XS) Parallel Lookup Simulation
@@ -62,19 +60,19 @@ int main( int argc, char* argv[] )
 	if( in.simulation_method == EVENT_BASED )
 	{
 		if( in.kernel_id == 0 )
-			verification = run_event_based_simulation_baseline(in, GSD, mype);
+			verification = run_event_based_simulation_baseline(in, SD, mype);
 		else if( in.kernel_id == 1 )
-			verification = run_event_based_simulation_optimization_1(in, GSD, mype);
+			verification = run_event_based_simulation_optimization_1(in, SD, mype);
 		else if( in.kernel_id == 2 )
-			verification = run_event_based_simulation_optimization_2(in, GSD, mype);
+			verification = run_event_based_simulation_optimization_2(in, SD, mype);
 		else if( in.kernel_id == 3 )
-			verification = run_event_based_simulation_optimization_3(in, GSD, mype);
+			verification = run_event_based_simulation_optimization_3(in, SD, mype);
 		else if( in.kernel_id == 4 )
-			verification = run_event_based_simulation_optimization_4(in, GSD, mype);
+			verification = run_event_based_simulation_optimization_4(in, SD, mype);
 		else if( in.kernel_id == 5 )
-			verification = run_event_based_simulation_optimization_5(in, GSD, mype);
+			verification = run_event_based_simulation_optimization_5(in, SD, mype);
 		else if( in.kernel_id == 6 )
-			verification = run_event_based_simulation_optimization_6(in, GSD, mype);
+			verification = run_event_based_simulation_optimization_6(in, SD, mype);
 		else
 		{
 			printf("Error: No kernel ID %d found!\n", in.kernel_id);
@@ -97,7 +95,7 @@ int main( int argc, char* argv[] )
 	omp_end = get_time();
 
 	// Release device memory
-	release_device_memory(GSD);
+	release_memory(SD);
 
 	// Final Hash Step
 	verification = verification % 999983;
diff --git a/cuda/Simulation.cu b/cuda/Simulation.cu
index a757eeaf..8be81ab1 100644
--- a/cuda/Simulation.cu
+++ b/cuda/Simulation.cu
@@ -12,8 +12,14 @@
 // line argument.
 ////////////////////////////////////////////////////////////////////////////////////
 
-unsigned long long run_event_based_simulation_baseline(Inputs in, SimulationData GSD, int mype)
+unsigned long long run_event_based_simulation_baseline(Inputs in, SimulationData SD, int mype)
 {
+    size_t sz = in.lookups * sizeof(unsigned long);
+    unsigned long * v = (unsigned long *) malloc(sz);
+    
+    // Move Data to GPU
+    SimulationData GSD = move_simulation_data_to_device(in, mype, SD);
+
 	////////////////////////////////////////////////////////////////////////////////
 	// Configure & Launch Simulation Kernel
 	////////////////////////////////////////////////////////////////////////////////
@@ -22,18 +28,21 @@ unsigned long long run_event_based_simulation_baseline(Inputs in, SimulationData
 	int nthreads = 256;
 	int nblocks = ceil( (double) in.lookups / (double) nthreads);
 
+
 	xs_lookup_kernel_baseline<<<nblocks, nthreads>>>( in, GSD );
 	gpuErrchk( cudaPeekAtLastError() );
-	gpuErrchk( cudaDeviceSynchronize() );
 	
 	////////////////////////////////////////////////////////////////////////////////
 	// Reduce Verification Results
 	////////////////////////////////////////////////////////////////////////////////
 	if( mype == 0)	printf("Reducing verification results...\n");
+    gpuErrchk(cudaMemcpy(v, GSD.verification, sz, cudaMemcpyDeviceToHost) );
 
-	unsigned long verification_scalar = thrust::reduce(thrust::device, GSD.verification, GSD.verification + in.lookups, 0);
-	gpuErrchk( cudaPeekAtLastError() );
-	gpuErrchk( cudaDeviceSynchronize() );
+    unsigned long verification_scalar = 0;
+    for( int i =0; i < in.lookups; i++ )
+        verification_scalar += v[i];
+
+    release_device_memory(GSD);
 
 	return verification_scalar;
 }
diff --git a/cuda/XSbench_header.cuh b/cuda/XSbench_header.cuh
index 53dee7bd..4d45022d 100644
--- a/cuda/XSbench_header.cuh
+++ b/cuda/XSbench_header.cuh
@@ -139,6 +139,7 @@ unsigned long long run_event_based_simulation_optimization_6(Inputs in, Simulati
 SimulationData grid_init_do_not_profile( Inputs in, int mype );
 SimulationData move_simulation_data_to_device( Inputs in, int mype, SimulationData SD );
 void release_device_memory(SimulationData GSD);
+void release_memory(SimulationData SD);
 
 // XSutils.cu
 int NGP_compare( const void * a, const void * b );

From 36a4e033482b45539a709ab55ad64952190b6b33 Mon Sep 17 00:00:00 2001
From: Pranav Sivaraman <14294205+pranav-sivaraman@users.noreply.github.com>
Date: Sun, 5 Nov 2023 13:59:27 -0500
Subject: [PATCH 2/2] fix: allocation of host verification vector

---
 cuda/GridInit.cu   | 7 +++++++
 cuda/Simulation.cu | 7 ++-----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/cuda/GridInit.cu b/cuda/GridInit.cu
index 13c62db1..6eb4d53e 100644
--- a/cuda/GridInit.cu
+++ b/cuda/GridInit.cu
@@ -100,6 +100,8 @@ SimulationData grid_init_do_not_profile( Inputs in, int mype )
 {
 	// Structure to hold all allocated simuluation data arrays
 	SimulationData SD;
+	
+
 
 	// Keep track of how much data we're allocating
 	size_t nbytes = 0;
@@ -152,6 +154,11 @@ SimulationData grid_init_do_not_profile( Inputs in, int mype )
 			printf("E%d = %lf\n", j, SD.nuclide_grid[i * in.n_gridpoints + j].energy);
 	}
 	*/
+
+    // Allocate Verification Array
+    size_t sz = in.lookups * sizeof(unsigned long);
+    SD.verification = (unsigned long *) malloc(sz);
+    nbytes += sz;
 	
 
 	////////////////////////////////////////////////////////////////////
diff --git a/cuda/Simulation.cu b/cuda/Simulation.cu
index 8be81ab1..c7b8a736 100644
--- a/cuda/Simulation.cu
+++ b/cuda/Simulation.cu
@@ -14,9 +14,6 @@
 
 unsigned long long run_event_based_simulation_baseline(Inputs in, SimulationData SD, int mype)
 {
-    size_t sz = in.lookups * sizeof(unsigned long);
-    unsigned long * v = (unsigned long *) malloc(sz);
-    
     // Move Data to GPU
     SimulationData GSD = move_simulation_data_to_device(in, mype, SD);
 
@@ -36,11 +33,11 @@ unsigned long long run_event_based_simulation_baseline(Inputs in, SimulationData
 	// Reduce Verification Results
 	////////////////////////////////////////////////////////////////////////////////
 	if( mype == 0)	printf("Reducing verification results...\n");
-    gpuErrchk(cudaMemcpy(v, GSD.verification, sz, cudaMemcpyDeviceToHost) );
+    gpuErrchk(cudaMemcpy(SD.verification, GSD.verification, in.lookups * sizeof(unsigned long), cudaMemcpyDeviceToHost) );
 
     unsigned long verification_scalar = 0;
     for( int i =0; i < in.lookups; i++ )
-        verification_scalar += v[i];
+        verification_scalar += SD.verification[i];
 
     release_device_memory(GSD);