Fix: Y12i stream bug in MIPI with CUDA case

Arun-Prasad-V · Jun 25, 2024 · 1cec5c3 · 1cec5c3
1 parent df33071
commit 1cec5c3
Show file tree

Hide file tree

Showing 3 changed files with 63 additions and 1 deletion.
diff --git a/src/cuda/cuda-conversion.cu b/src/cuda/cuda-conversion.cu
@@ -397,6 +397,61 @@ void rscuda::y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count,
 }
 
 
+__global__ void kernel_split_frame_y16_y16_from_y12i_cuda_mipi(uint16_t* a, uint16_t* b, int count, const rscuda::y12i_pixel_mipi * source)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i >= count)
+        return;
+
+    a[i] = source[i].l() << 6 | source[i].l() >> 4;
+    b[i] = source[i].r() << 6 | source[i].r() >> 4;
+}
+
+
+void rscuda::y16_y16_from_y12i_10_cuda_helper_mipi(uint8_t* const dest[], int count, const rscuda::y12i_pixel_mipi * source)
+{
+    /*
+        cudaEvent_t start, stop;
+        cudaEventCreate(&start);
+        cudaEventCreate(&stop);
+        cudaEventRecord(start); */
+
+    source = reinterpret_cast<const y12i_pixel_mipi*>(source);
+
+    int numBlocks = count / RS2_CUDA_THREADS_PER_BLOCK;
+    uint16_t* a = reinterpret_cast<uint16_t*>(dest[0]);
+    uint16_t* b = reinterpret_cast<uint16_t*>(dest[1]);
+
+    auto d_src = alloc_dev<rscuda::y12i_pixel_mipi>(count);
+    auto d_dst_0 = alloc_dev<uint16_t>(count);
+    auto d_dst_1 = alloc_dev<uint16_t>(count);
+
+
+    auto result = cudaMemcpy(d_src.get(), source, count * sizeof(rscuda::y12i_pixel_mipi), cudaMemcpyHostToDevice);
+    assert(result == cudaSuccess);
+
+    kernel_split_frame_y16_y16_from_y12i_cuda_mipi <<<numBlocks, RS2_CUDA_THREADS_PER_BLOCK>>> (d_dst_0.get(), d_dst_1.get(), count, d_src.get());
+    cudaStreamSynchronize(0);
+
+    result = cudaGetLastError();
+    assert(result == cudaSuccess);
+
+    result = cudaMemcpy(a, d_dst_0.get(), count * sizeof(uint16_t), cudaMemcpyDeviceToHost);
+    assert(result == cudaSuccess);
+    result = cudaMemcpy(b, d_dst_1.get(), count * sizeof(uint16_t), cudaMemcpyDeviceToHost);
+    assert(result == cudaSuccess);
+
+    /*
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+    float milliseconds = 0;
+    cudaEventElapsedTime(&milliseconds, start, stop);
+    std::cout << milliseconds << std::endl;
+    */
+}
+
+
 __global__ void kernel_z16_y8_from_sr300_inzi_cuda(const uint16_t* source, uint8_t* const dest, int count)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;

diff --git a/src/cuda/cuda-conversion.cuh b/src/cuda/cuda-conversion.cuh
@@ -26,8 +26,10 @@ namespace rscuda
 {   
     struct y8i_pixel { uint8_t l; uint8_t r; };  
     struct y12i_pixel { uint8_t rl : 8, rh : 4, ll : 4, lh : 8; __host__ __device__ int l() const { return lh << 4 | ll; } __host__ __device__ int r() const { return rh << 8 | rl; } };
+    struct y12i_pixel_mipi { uint8_t rl : 8, rh : 4, ll : 4, lh : 8, padding : 8; __host__ __device__ int l() const { return lh << 4 | ll; } __host__ __device__ int r() const { return rh << 8 | rl; } };
     void y8_y8_from_y8i_cuda_helper(uint8_t* const dest[], int count, const y8i_pixel * source);
     void y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count, const rscuda::y12i_pixel * source);
+    void y16_y16_from_y12i_10_cuda_helper_mipi(uint8_t* const dest[], int count, const rscuda::y12i_pixel_mipi * source);
     void unpack_yuy2_cuda_helper(const uint8_t* src, uint8_t* dst, int n, rs2_format format);
 
     template<rs2_format FORMAT> void unpack_yuy2_cuda(uint8_t * const d[], const uint8_t * s, int n)
@@ -47,6 +49,11 @@ namespace rscuda
     {
         y16_y16_from_y12i_10_cuda_helper(dest, count, reinterpret_cast<const y12i_pixel*>(source));
     }
+
+    template<class SOURCE> void split_frame_y16_y16_from_y12i_cuda_mipi(uint8_t* const dest[], int count, const SOURCE * source)
+    {
+        y16_y16_from_y12i_10_cuda_helper_mipi(dest, count, reinterpret_cast<const y12i_pixel_mipi*>(source));
+    }
 
     void unpack_z16_y8_from_sr300_inzi_cuda(uint8_t* const dest, const uint16_t* source, int count);
 

diff --git a/src/proc/y12i-to-y16y16-mipi.cpp b/src/proc/y12i-to-y16y16-mipi.cpp
@@ -16,7 +16,7 @@ namespace librealsense
     {
         auto count = width * height;
 #ifdef RS2_USE_CUDA
-        rscuda::split_frame_y16_y16_from_y12i_cuda(dest, count, reinterpret_cast<const y12i_pixel_mipi *>(source));
+        rscuda::split_frame_y16_y16_from_y12i_cuda_mipi(dest, count, reinterpret_cast<const y12i_pixel_mipi *>(source));
 #else
         split_frame(dest, count, reinterpret_cast<const y12i_pixel_mipi*>(source),
             [](const y12i_pixel_mipi& p) -> uint16_t { return p.l() << 6 | p.l() >> 4; },  // We want to convert 10-bit data to 16-bit data