added template functions for y16_y16_from_y12i with CUDA

Arun-Prasad-V · Jun 26, 2024 · f748b20 · f748b20
1 parent 1cec5c3
commit f748b20
Show file tree

Hide file tree

Showing 4 changed files with 14 additions and 68 deletions.
diff --git a/src/cuda/cuda-conversion.cu b/src/cuda/cuda-conversion.cu
@@ -342,7 +342,8 @@ void rscuda::y8_y8_from_y8i_cuda_helper(uint8_t* const dest[], int count, const
         std::cout << milliseconds << std::endl; */
 }
 
-__global__ void kernel_split_frame_y16_y16_from_y12i_cuda(uint16_t* a, uint16_t* b, int count, const rscuda::y12i_pixel * source)
+template<class SOURCE>
+__global__ void kernel_split_frame_y16_y16_from_y12i_cuda(uint16_t* a, uint16_t* b, int count, const SOURCE * source)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
 
@@ -354,26 +355,27 @@ __global__ void kernel_split_frame_y16_y16_from_y12i_cuda(uint16_t* a, uint16_t*
 }
 
 
-void rscuda::y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count, const rscuda::y12i_pixel * source)
+template<class SOURCE>
+void rscuda::y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count, const SOURCE * source)
 {
     /*
         cudaEvent_t start, stop;
         cudaEventCreate(&start);
         cudaEventCreate(&stop);
         cudaEventRecord(start); */
 
-    source = reinterpret_cast<const y12i_pixel*>(source);
+    source = reinterpret_cast<const SOURCE*>(source);
 
     int numBlocks = count / RS2_CUDA_THREADS_PER_BLOCK;
     uint16_t* a = reinterpret_cast<uint16_t*>(dest[0]);
     uint16_t* b = reinterpret_cast<uint16_t*>(dest[1]);
 
-    auto d_src = alloc_dev<rscuda::y12i_pixel>(count);
+    auto d_src = alloc_dev<SOURCE>(count);
     auto d_dst_0 = alloc_dev<uint16_t>(count);
     auto d_dst_1 = alloc_dev<uint16_t>(count);
 
 
-    auto result = cudaMemcpy(d_src.get(), source, count * sizeof(rscuda::y12i_pixel), cudaMemcpyHostToDevice);
+    auto result = cudaMemcpy(d_src.get(), source, count * sizeof(SOURCE), cudaMemcpyHostToDevice);
     assert(result == cudaSuccess);
 
     kernel_split_frame_y16_y16_from_y12i_cuda <<<numBlocks, RS2_CUDA_THREADS_PER_BLOCK>>> (d_dst_0.get(), d_dst_1.get(), count, d_src.get());
@@ -396,61 +398,10 @@ void rscuda::y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count,
     */
 }
 
+template void rscuda::y16_y16_from_y12i_10_cuda_helper<rscuda::y12i_pixel>(uint8_t* const dest[], int count, const rscuda::y12i_pixel * source);
 
-__global__ void kernel_split_frame_y16_y16_from_y12i_cuda_mipi(uint16_t* a, uint16_t* b, int count, const rscuda::y12i_pixel_mipi * source)
-{
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-    if (i >= count)
-        return;
-
-    a[i] = source[i].l() << 6 | source[i].l() >> 4;
-    b[i] = source[i].r() << 6 | source[i].r() >> 4;
-}
-
-
-void rscuda::y16_y16_from_y12i_10_cuda_helper_mipi(uint8_t* const dest[], int count, const rscuda::y12i_pixel_mipi * source)
-{
-    /*
-        cudaEvent_t start, stop;
-        cudaEventCreate(&start);
-        cudaEventCreate(&stop);
-        cudaEventRecord(start); */
-
-    source = reinterpret_cast<const y12i_pixel_mipi*>(source);
-
-    int numBlocks = count / RS2_CUDA_THREADS_PER_BLOCK;
-    uint16_t* a = reinterpret_cast<uint16_t*>(dest[0]);
-    uint16_t* b = reinterpret_cast<uint16_t*>(dest[1]);
-
-    auto d_src = alloc_dev<rscuda::y12i_pixel_mipi>(count);
-    auto d_dst_0 = alloc_dev<uint16_t>(count);
-    auto d_dst_1 = alloc_dev<uint16_t>(count);
-
-
-    auto result = cudaMemcpy(d_src.get(), source, count * sizeof(rscuda::y12i_pixel_mipi), cudaMemcpyHostToDevice);
-    assert(result == cudaSuccess);
-
-    kernel_split_frame_y16_y16_from_y12i_cuda_mipi <<<numBlocks, RS2_CUDA_THREADS_PER_BLOCK>>> (d_dst_0.get(), d_dst_1.get(), count, d_src.get());
-    cudaStreamSynchronize(0);
-
-    result = cudaGetLastError();
-    assert(result == cudaSuccess);
-
-    result = cudaMemcpy(a, d_dst_0.get(), count * sizeof(uint16_t), cudaMemcpyDeviceToHost);
-    assert(result == cudaSuccess);
-    result = cudaMemcpy(b, d_dst_1.get(), count * sizeof(uint16_t), cudaMemcpyDeviceToHost);
-    assert(result == cudaSuccess);
-
-    /*
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    float milliseconds = 0;
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    std::cout << milliseconds << std::endl;
-    */
-}
 
+template void rscuda::y16_y16_from_y12i_10_cuda_helper<rscuda::y12i_pixel_mipi>(uint8_t* const dest[], int count, const rscuda::y12i_pixel_mipi * source);
 
 __global__ void kernel_z16_y8_from_sr300_inzi_cuda(const uint16_t* source, uint8_t* const dest, int count)
 {

diff --git a/src/cuda/cuda-conversion.cuh b/src/cuda/cuda-conversion.cuh
@@ -28,8 +28,8 @@ namespace rscuda
     struct y12i_pixel { uint8_t rl : 8, rh : 4, ll : 4, lh : 8; __host__ __device__ int l() const { return lh << 4 | ll; } __host__ __device__ int r() const { return rh << 8 | rl; } };
     struct y12i_pixel_mipi { uint8_t rl : 8, rh : 4, ll : 4, lh : 8, padding : 8; __host__ __device__ int l() const { return lh << 4 | ll; } __host__ __device__ int r() const { return rh << 8 | rl; } };
     void y8_y8_from_y8i_cuda_helper(uint8_t* const dest[], int count, const y8i_pixel * source);
-    void y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count, const rscuda::y12i_pixel * source);
-    void y16_y16_from_y12i_10_cuda_helper_mipi(uint8_t* const dest[], int count, const rscuda::y12i_pixel_mipi * source);
+    template<class SOURCE>
+    void y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count, const SOURCE * source);
     void unpack_yuy2_cuda_helper(const uint8_t* src, uint8_t* dst, int n, rs2_format format);
 
     template<rs2_format FORMAT> void unpack_yuy2_cuda(uint8_t * const d[], const uint8_t * s, int n)
@@ -47,12 +47,7 @@ namespace rscuda
 
     template<class SOURCE> void split_frame_y16_y16_from_y12i_cuda(uint8_t* const dest[], int count, const SOURCE * source)
     {
-        y16_y16_from_y12i_10_cuda_helper(dest, count, reinterpret_cast<const y12i_pixel*>(source));
-    }
-
-    template<class SOURCE> void split_frame_y16_y16_from_y12i_cuda_mipi(uint8_t* const dest[], int count, const SOURCE * source)
-    {
-        y16_y16_from_y12i_10_cuda_helper_mipi(dest, count, reinterpret_cast<const y12i_pixel_mipi*>(source));
+        y16_y16_from_y12i_10_cuda_helper(dest, count, reinterpret_cast<const SOURCE*>(source));
     }
 
     void unpack_z16_y8_from_sr300_inzi_cuda(uint8_t* const dest, const uint16_t* source, int count);

diff --git a/src/proc/y12i-to-y16y16-mipi.cpp b/src/proc/y12i-to-y16y16-mipi.cpp
@@ -16,7 +16,7 @@ namespace librealsense
     {
         auto count = width * height;
 #ifdef RS2_USE_CUDA
-        rscuda::split_frame_y16_y16_from_y12i_cuda_mipi(dest, count, reinterpret_cast<const y12i_pixel_mipi *>(source));
+        rscuda::split_frame_y16_y16_from_y12i_cuda(dest, count, reinterpret_cast<const rscuda::y12i_pixel_mipi *>(source));
 #else
         split_frame(dest, count, reinterpret_cast<const y12i_pixel_mipi*>(source),
             [](const y12i_pixel_mipi& p) -> uint16_t { return p.l() << 6 | p.l() >> 4; },  // We want to convert 10-bit data to 16-bit data

diff --git a/src/proc/y12i-to-y16y16.cpp b/src/proc/y12i-to-y16y16.cpp
@@ -15,7 +15,7 @@ namespace librealsense
     {
         auto count = width * height;
 #ifdef RS2_USE_CUDA
-        rscuda::split_frame_y16_y16_from_y12i_cuda(dest, count, reinterpret_cast<const y12i_pixel *>(source));
+        rscuda::split_frame_y16_y16_from_y12i_cuda(dest, count, reinterpret_cast<const rscuda::y12i_pixel *>(source));
 #else
         split_frame(dest, count, reinterpret_cast<const y12i_pixel*>(source),
             [](const y12i_pixel & p) -> uint16_t { return p.l() << 6 | p.l() >> 4; },  // We want to convert 10-bit data to 16-bit data