
Commit
added comments in kirchhoff_cuda helper
Amnah2020 committed Oct 27, 2024
1 parent 0524e0e commit a7aca80
Showing 1 changed file with 110 additions and 39 deletions.
149 changes: 110 additions & 39 deletions pylops/waveeqprocessing/_kirchhoff_cuda.py
@@ -2,7 +2,28 @@
import numpy as np
from math import cos


class _kirchhoffCudaHelper:
""" A helper class for performing Kirchhoff migration or modeling using CUDA via Numba.
This class provides methods to compute the forward and adjoint operations for Kirchhoff migration,
utilizing GPU acceleration through Numba's CUDA capabilities.
Parameters
----------
ns : int
Number of sources.
nr : int
Number of receivers.
nt : int
Number of time samples.
ni : int
Number of image points.
dynamic : int, optional
Flag indicating whether to use dynamic computation (``1`` == ``True``) or not (``0`` == ``False``). Default is ``0``.
travsrcrec : int, optional
Flag indicating whether to use separate traveltime tables for sources and receivers (separate == ``1``). Default is ``0``.
"""

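# Hypothetical usage sketch (illustrative sizes only; presumably the helper is
# constructed internally by pylops' Kirchhoff operator when its CUDA backend is
# selected, rather than instantiated directly by users):
#
#   helper = _kirchhoffCudaHelper(ns=8, nr=16, nt=200, ni=1000,
#                                 dynamic=0, travsrcrec=1)
#   # forward and adjoint passes are then dispatched through
#   # helper._matvec_call(...) and helper._rmatvec_call(...)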
def __init__(self, ns, nr, nt, ni, dynamic=0, travsrcrec=0):
self.dynamic, self.travsrcrec = dynamic, travsrcrec
@@ -11,8 +32,11 @@ def __init__(self, ns, nr, nt, ni, dynamic=0, travsrcrec=0):
self._lunch_grid_setup()

def _lunch_grid_setup(self):
""" Set up the CUDA grid and block dimensions based on the current device and computation flags.
This method configures the number of blocks and threads per block for CUDA kernels,
depending on whether dynamic computation and separate source/receiver traveltime tables are used.
"""
current_device = cuda.get_current_device()
# self.num_blocks, self.num_threads_per_blocks = 32, 32
if self.dynamic:
num_sources = self.ns
num_streams = 3
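# Dynamic case (a sketch of the intent, inferred from _process_streams below):
# the sources are partitioned across num_streams CUDA streams so that kernels
# for different source groups can be launched and executed concurrently.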
@@ -35,20 +59,18 @@ def _lunch_grid_setup(self):
if not self.travsrcrec:
# version 4
self.num_threads_per_blocks = current_device.WARP_SIZE * 8
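# One thread per source-receiver pair; the ceil-division below rounds the
# grid size up so that all ns * nr pairs are covered.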
self.num_blocks = ((self.ns * self.nr) + (
self.num_threads_per_blocks - 1)) // self.num_threads_per_blocks
else:
# version 3
wrap = current_device.WARP_SIZE
multipr_count = current_device.MULTIPROCESSOR_COUNT
self.num_threads_per_blocks = (wrap, wrap)
self.num_blocks = (multipr_count, multipr_count)

def _data_prep_dynamic(self, ns, nr, nt, ni, nz, dt, aperture, angleaperture, aperturetap, vel, six,
rix, trav_recs, angle_recs, trav_srcs, angle_srcs, amp_srcs, amp_recs):
""" Prepare data for dynamic computation by transfering some variables to device memory in advance once."""
ns_d = np.int32(ns)
nr_d = np.int32(nr)
nt_d = np.int32(nt)
@@ -332,7 +354,23 @@ def _ampsrcrec_kirch_rmatvec_cuda_streams(ns, nr, nt, ni, nz, dt, aperturemin, a
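# Adjoint scatter: different threads (source-receiver pairs) may map onto the
# same output index, so accumulation into y must use Numba's cuda.atomic.add
# to avoid race conditions.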
cuda.atomic.add(y, ind1, val1)

def _process_streams(self, x, opt):
""" Data preparation and passing to different streams"""
"""Process data using CUDA streams for dynamic computation.
This method handles data preparation and execution of CUDA kernels across multiple streams
for both forward ('_matvec') and adjoint ('_rmatvec') operations.
Parameters
----------
x : ndarray
Input data (image or seismic data).
opt : str
Operation type, either '_matvec' for forward or '_rmatvec' for adjoint.
Returns
-------
y : ndarray
Output data after processing.
"""

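# Per-stream launch pattern relied on below (standard Numba CUDA API; a sketch,
# assuming self.sources_per_streams maps cuda.stream() objects to source indices):
#
#   stream = cuda.stream()                      # one stream per group of sources
#   kernel[blocks, threads, stream](...)        # asynchronous launch on that stream
#   stream.synchronize()                        # wait for the stream's kernels
#
# Each stream accumulates into its own device buffer (y_d_dict[stream]); the
# per-stream results are combined into y_total after synchronization.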
if opt == "_matvec":
x = x.ravel()
@@ -352,25 +390,27 @@ def _process_streams(self, x, opt):
for irec in range(self.nr):
for stream, isrc_list in self.sources_per_streams.items():
if opt == "_matvec":
self._ampsrcrec_kirch_matvec_cuda_streams[self.num_blocks, self.num_threads_per_blocks, stream](
*self.const_inputs,
self.trav_srcs_d_global,
self.amp_srcs_d_global,
self.angles_srcs_d_global,
self.trav_recs_d_global,
self.amp_recs_d_global,
self.angles_recs_d_global,
y_d_dict[stream],
isrc_list_d_dict[stream], irec, x_d)
elif opt == "_rmatvec":
self._ampsrcrec_kirch_rmatvec_cuda_streams[self.num_blocks, self.num_threads_per_blocks, stream](
*self.const_inputs,
self.trav_srcs_d_global,
self.amp_srcs_d_global,
self.angles_srcs_d_global,
self.trav_recs_d_global,
self.amp_recs_d_global,
self.angles_recs_d_global,
y_d_dict[stream],
isrc_list_d_dict[stream], irec, x_d)
# Synchronize the streams to ensure all operations have been completed
for stream in self.sources_per_streams.keys():
stream.synchronize()
@@ -385,9 +425,24 @@ def _process_streams(self, x, opt):
return y_total

def _matvec_call(self, *inputs):
"""Handle the forward operation call, dispatching to appropriate CUDA kernels.
This method selects the appropriate kernel to execute based on the computation flags,
and performs the forward operation (matrix-vector multiplication).
Parameters
----------
*inputs : list
List of input parameters required by the kernels.
Returns
-------
y : ndarray
Output data (seismic data) after forward operation.
"""
if self.dynamic and self.travsrcrec:
y = self._process_streams(inputs[0], "_matvec")
elif self.travsrcrec: # len(inputs) == 9
x_d = cuda.to_device(inputs[0])
y_d = cuda.to_device(inputs[1])
ns_d = np.int32(inputs[2])
@@ -397,28 +452,44 @@ def _matvec_call(self, *inputs):
dt_d = np.float32(inputs[6])
trav_srcs_d = cuda.to_device(inputs[7])
trav_recs_d = cuda.to_device(inputs[8])
self._travsrcrec_kirch_matvec_cuda[self.num_blocks, self.num_threads_per_blocks](x_d, y_d, ns_d, nr_d, nt_d,
ni_d, dt_d,
trav_srcs_d, trav_recs_d)
elif not self.travsrcrec: # len(inputs) == 7:
x_d = cuda.to_device(inputs[0])
y_d = cuda.to_device(inputs[1])
nsnr_d = np.int32(inputs[2])
nt_d = np.int32(inputs[3])
ni_d = np.int32(inputs[4])
itrav_d = cuda.to_device(inputs[5])
travd_d = cuda.to_device(inputs[6])
self._trav_kirch_matvec_cuda[self.num_blocks, self.num_threads_per_blocks](x_d, y_d, nsnr_d, nt_d, ni_d,
itrav_d, travd_d)

if not self.dynamic:
cuda.synchronize()
y = y_d.copy_to_host()
return y

def _rmatvec_call(self, *inputs):
""" Handle the adjoint operation call, dispatching to appropriate CUDA kernels.
This method selects the appropriate kernel to execute based on the computation flags,
and performs the adjoint operation (matrix-vector multiplication with the transposed operator).
Parameters
----------
*inputs : list
List of input parameters required by the kernels.
Returns
-------
y : ndarray
Output data (image) after adjoint operation.
"""
if self.dynamic and self.travsrcrec:
y = self._process_streams(inputs[0], "_rmatvec")
elif self.travsrcrec: # len(inputs) == 9
x_d = cuda.to_device(inputs[0])
y_d = cuda.to_device(inputs[1])
ns_d = np.int32(inputs[2])
@@ -428,21 +499,21 @@ def _rmatvec_call(self, *inputs):
dt_d = np.float32(inputs[6])
trav_srcs_d = cuda.to_device(inputs[7])
trav_recs_d = cuda.to_device(inputs[8])
self._travsrcrec_kirch_rmatvec_cuda[self.num_blocks, self.num_threads_per_blocks](x_d, y_d, ns_d, nr_d,
nt_d, ni_d, dt_d,
trav_srcs_d, trav_recs_d)
elif not self.travsrcrec: # len(inputs) == 7:
x_d = cuda.to_device(inputs[0])
y_d = cuda.to_device(inputs[1])
nsnr_d = np.int32(inputs[2])
nt_d = np.int32(inputs[3])
ni_d = np.int32(inputs[4])
itrav_d = cuda.to_device(inputs[5])
travd_d = cuda.to_device(inputs[6])
self._trav_kirch_rmatvec_cuda[self.num_blocks, self.num_threads_per_blocks](x_d, y_d, nsnr_d, nt_d, ni_d,
itrav_d, travd_d)

if not self.dynamic:
cuda.synchronize()
y = y_d.copy_to_host()
return y
