Skip to content

Commit

Permalink
added template functions for y16_y16_from_y12i with CUDA
Browse files Browse the repository at this point in the history
  • Loading branch information
Arun-Prasad-V committed Jun 26, 2024
1 parent 1cec5c3 commit f748b20
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 68 deletions.
67 changes: 9 additions & 58 deletions src/cuda/cuda-conversion.cu
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,8 @@ void rscuda::y8_y8_from_y8i_cuda_helper(uint8_t* const dest[], int count, const
std::cout << milliseconds << std::endl; */
}

__global__ void kernel_split_frame_y16_y16_from_y12i_cuda(uint16_t* a, uint16_t* b, int count, const rscuda::y12i_pixel * source)
template<class SOURCE>
__global__ void kernel_split_frame_y16_y16_from_y12i_cuda(uint16_t* a, uint16_t* b, int count, const SOURCE * source)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;

Expand All @@ -354,26 +355,27 @@ __global__ void kernel_split_frame_y16_y16_from_y12i_cuda(uint16_t* a, uint16_t*
}


void rscuda::y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count, const rscuda::y12i_pixel * source)
template<class SOURCE>
void rscuda::y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count, const SOURCE * source)
{
/*
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start); */

source = reinterpret_cast<const y12i_pixel*>(source);
source = reinterpret_cast<const SOURCE*>(source);

int numBlocks = count / RS2_CUDA_THREADS_PER_BLOCK;
uint16_t* a = reinterpret_cast<uint16_t*>(dest[0]);
uint16_t* b = reinterpret_cast<uint16_t*>(dest[1]);

auto d_src = alloc_dev<rscuda::y12i_pixel>(count);
auto d_src = alloc_dev<SOURCE>(count);
auto d_dst_0 = alloc_dev<uint16_t>(count);
auto d_dst_1 = alloc_dev<uint16_t>(count);


auto result = cudaMemcpy(d_src.get(), source, count * sizeof(rscuda::y12i_pixel), cudaMemcpyHostToDevice);
auto result = cudaMemcpy(d_src.get(), source, count * sizeof(SOURCE), cudaMemcpyHostToDevice);
assert(result == cudaSuccess);

kernel_split_frame_y16_y16_from_y12i_cuda <<<numBlocks, RS2_CUDA_THREADS_PER_BLOCK>>> (d_dst_0.get(), d_dst_1.get(), count, d_src.get());
Expand All @@ -396,61 +398,10 @@ void rscuda::y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count,
*/
}

template void rscuda::y16_y16_from_y12i_10_cuda_helper<rscuda::y12i_pixel>(uint8_t* const dest[], int count, const rscuda::y12i_pixel * source);

__global__ void kernel_split_frame_y16_y16_from_y12i_cuda_mipi(uint16_t* a, uint16_t* b, int count, const rscuda::y12i_pixel_mipi * source)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;

if (i >= count)
return;

a[i] = source[i].l() << 6 | source[i].l() >> 4;
b[i] = source[i].r() << 6 | source[i].r() >> 4;
}


void rscuda::y16_y16_from_y12i_10_cuda_helper_mipi(uint8_t* const dest[], int count, const rscuda::y12i_pixel_mipi * source)
{
/*
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start); */

source = reinterpret_cast<const y12i_pixel_mipi*>(source);

int numBlocks = count / RS2_CUDA_THREADS_PER_BLOCK;
uint16_t* a = reinterpret_cast<uint16_t*>(dest[0]);
uint16_t* b = reinterpret_cast<uint16_t*>(dest[1]);

auto d_src = alloc_dev<rscuda::y12i_pixel_mipi>(count);
auto d_dst_0 = alloc_dev<uint16_t>(count);
auto d_dst_1 = alloc_dev<uint16_t>(count);


auto result = cudaMemcpy(d_src.get(), source, count * sizeof(rscuda::y12i_pixel_mipi), cudaMemcpyHostToDevice);
assert(result == cudaSuccess);

kernel_split_frame_y16_y16_from_y12i_cuda_mipi <<<numBlocks, RS2_CUDA_THREADS_PER_BLOCK>>> (d_dst_0.get(), d_dst_1.get(), count, d_src.get());
cudaStreamSynchronize(0);

result = cudaGetLastError();
assert(result == cudaSuccess);

result = cudaMemcpy(a, d_dst_0.get(), count * sizeof(uint16_t), cudaMemcpyDeviceToHost);
assert(result == cudaSuccess);
result = cudaMemcpy(b, d_dst_1.get(), count * sizeof(uint16_t), cudaMemcpyDeviceToHost);
assert(result == cudaSuccess);

/*
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
std::cout << milliseconds << std::endl;
*/
}

template void rscuda::y16_y16_from_y12i_10_cuda_helper<rscuda::y12i_pixel_mipi>(uint8_t* const dest[], int count, const rscuda::y12i_pixel_mipi * source);

__global__ void kernel_z16_y8_from_sr300_inzi_cuda(const uint16_t* source, uint8_t* const dest, int count)
{
Expand Down
11 changes: 3 additions & 8 deletions src/cuda/cuda-conversion.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ namespace rscuda
struct y12i_pixel { uint8_t rl : 8, rh : 4, ll : 4, lh : 8; __host__ __device__ int l() const { return lh << 4 | ll; } __host__ __device__ int r() const { return rh << 8 | rl; } };
struct y12i_pixel_mipi { uint8_t rl : 8, rh : 4, ll : 4, lh : 8, padding : 8; __host__ __device__ int l() const { return lh << 4 | ll; } __host__ __device__ int r() const { return rh << 8 | rl; } };
void y8_y8_from_y8i_cuda_helper(uint8_t* const dest[], int count, const y8i_pixel * source);
void y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count, const rscuda::y12i_pixel * source);
void y16_y16_from_y12i_10_cuda_helper_mipi(uint8_t* const dest[], int count, const rscuda::y12i_pixel_mipi * source);
template<class SOURCE>
void y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count, const SOURCE * source);
void unpack_yuy2_cuda_helper(const uint8_t* src, uint8_t* dst, int n, rs2_format format);

template<rs2_format FORMAT> void unpack_yuy2_cuda(uint8_t * const d[], const uint8_t * s, int n)
Expand All @@ -47,12 +47,7 @@ namespace rscuda

template<class SOURCE> void split_frame_y16_y16_from_y12i_cuda(uint8_t* const dest[], int count, const SOURCE * source)
{
y16_y16_from_y12i_10_cuda_helper(dest, count, reinterpret_cast<const y12i_pixel*>(source));
}

template<class SOURCE> void split_frame_y16_y16_from_y12i_cuda_mipi(uint8_t* const dest[], int count, const SOURCE * source)
{
y16_y16_from_y12i_10_cuda_helper_mipi(dest, count, reinterpret_cast<const y12i_pixel_mipi*>(source));
y16_y16_from_y12i_10_cuda_helper(dest, count, reinterpret_cast<const SOURCE*>(source));
}

void unpack_z16_y8_from_sr300_inzi_cuda(uint8_t* const dest, const uint16_t* source, int count);
Expand Down
2 changes: 1 addition & 1 deletion src/proc/y12i-to-y16y16-mipi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ namespace librealsense
{
auto count = width * height;
#ifdef RS2_USE_CUDA
rscuda::split_frame_y16_y16_from_y12i_cuda_mipi(dest, count, reinterpret_cast<const y12i_pixel_mipi *>(source));
rscuda::split_frame_y16_y16_from_y12i_cuda(dest, count, reinterpret_cast<const rscuda::y12i_pixel_mipi *>(source));
#else
split_frame(dest, count, reinterpret_cast<const y12i_pixel_mipi*>(source),
[](const y12i_pixel_mipi& p) -> uint16_t { return p.l() << 6 | p.l() >> 4; }, // We want to convert 10-bit data to 16-bit data
Expand Down
2 changes: 1 addition & 1 deletion src/proc/y12i-to-y16y16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ namespace librealsense
{
auto count = width * height;
#ifdef RS2_USE_CUDA
rscuda::split_frame_y16_y16_from_y12i_cuda(dest, count, reinterpret_cast<const y12i_pixel *>(source));
rscuda::split_frame_y16_y16_from_y12i_cuda(dest, count, reinterpret_cast<const rscuda::y12i_pixel *>(source));
#else
split_frame(dest, count, reinterpret_cast<const y12i_pixel*>(source),
[](const y12i_pixel & p) -> uint16_t { return p.l() << 6 | p.l() >> 4; }, // We want to convert 10-bit data to 16-bit data
Expand Down

0 comments on commit f748b20

Please sign in to comment.