Skip to content

Commit

Permalink
Fix: Y12i stream bug in MIPI with CUDA case
Browse files Browse the repository at this point in the history
  • Loading branch information
Arun-Prasad-V committed Jun 25, 2024
1 parent df33071 commit 1cec5c3
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 1 deletion.
55 changes: 55 additions & 0 deletions src/cuda/cuda-conversion.cu
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,61 @@ void rscuda::y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count,
}


__global__ void kernel_split_frame_y16_y16_from_y12i_cuda_mipi(uint16_t* a, uint16_t* b, int count, const rscuda::y12i_pixel_mipi * source)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;

if (i >= count)
return;

a[i] = source[i].l() << 6 | source[i].l() >> 4;
b[i] = source[i].r() << 6 | source[i].r() >> 4;
}


void rscuda::y16_y16_from_y12i_10_cuda_helper_mipi(uint8_t* const dest[], int count, const rscuda::y12i_pixel_mipi * source)
{
/*
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start); */

source = reinterpret_cast<const y12i_pixel_mipi*>(source);

int numBlocks = count / RS2_CUDA_THREADS_PER_BLOCK;
uint16_t* a = reinterpret_cast<uint16_t*>(dest[0]);
uint16_t* b = reinterpret_cast<uint16_t*>(dest[1]);

auto d_src = alloc_dev<rscuda::y12i_pixel_mipi>(count);
auto d_dst_0 = alloc_dev<uint16_t>(count);
auto d_dst_1 = alloc_dev<uint16_t>(count);


auto result = cudaMemcpy(d_src.get(), source, count * sizeof(rscuda::y12i_pixel_mipi), cudaMemcpyHostToDevice);
assert(result == cudaSuccess);

kernel_split_frame_y16_y16_from_y12i_cuda_mipi <<<numBlocks, RS2_CUDA_THREADS_PER_BLOCK>>> (d_dst_0.get(), d_dst_1.get(), count, d_src.get());
cudaStreamSynchronize(0);

result = cudaGetLastError();
assert(result == cudaSuccess);

result = cudaMemcpy(a, d_dst_0.get(), count * sizeof(uint16_t), cudaMemcpyDeviceToHost);
assert(result == cudaSuccess);
result = cudaMemcpy(b, d_dst_1.get(), count * sizeof(uint16_t), cudaMemcpyDeviceToHost);
assert(result == cudaSuccess);

/*
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
std::cout << milliseconds << std::endl;
*/
}


__global__ void kernel_z16_y8_from_sr300_inzi_cuda(const uint16_t* source, uint8_t* const dest, int count)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
Expand Down
7 changes: 7 additions & 0 deletions src/cuda/cuda-conversion.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@ namespace rscuda
{
struct y8i_pixel { uint8_t l; uint8_t r; };
struct y12i_pixel { uint8_t rl : 8, rh : 4, ll : 4, lh : 8; __host__ __device__ int l() const { return lh << 4 | ll; } __host__ __device__ int r() const { return rh << 8 | rl; } };
struct y12i_pixel_mipi { uint8_t rl : 8, rh : 4, ll : 4, lh : 8, padding : 8; __host__ __device__ int l() const { return lh << 4 | ll; } __host__ __device__ int r() const { return rh << 8 | rl; } };
void y8_y8_from_y8i_cuda_helper(uint8_t* const dest[], int count, const y8i_pixel * source);
void y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count, const rscuda::y12i_pixel * source);
void y16_y16_from_y12i_10_cuda_helper_mipi(uint8_t* const dest[], int count, const rscuda::y12i_pixel_mipi * source);
void unpack_yuy2_cuda_helper(const uint8_t* src, uint8_t* dst, int n, rs2_format format);

template<rs2_format FORMAT> void unpack_yuy2_cuda(uint8_t * const d[], const uint8_t * s, int n)
Expand All @@ -47,6 +49,11 @@ namespace rscuda
{
y16_y16_from_y12i_10_cuda_helper(dest, count, reinterpret_cast<const y12i_pixel*>(source));
}

template<class SOURCE> void split_frame_y16_y16_from_y12i_cuda_mipi(uint8_t* const dest[], int count, const SOURCE * source)
{
y16_y16_from_y12i_10_cuda_helper_mipi(dest, count, reinterpret_cast<const y12i_pixel_mipi*>(source));
}

void unpack_z16_y8_from_sr300_inzi_cuda(uint8_t* const dest, const uint16_t* source, int count);

Expand Down
2 changes: 1 addition & 1 deletion src/proc/y12i-to-y16y16-mipi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ namespace librealsense
{
auto count = width * height;
#ifdef RS2_USE_CUDA
rscuda::split_frame_y16_y16_from_y12i_cuda(dest, count, reinterpret_cast<const y12i_pixel_mipi *>(source));
rscuda::split_frame_y16_y16_from_y12i_cuda_mipi(dest, count, reinterpret_cast<const y12i_pixel_mipi *>(source));
#else
split_frame(dest, count, reinterpret_cast<const y12i_pixel_mipi*>(source),
[](const y12i_pixel_mipi& p) -> uint16_t { return p.l() << 6 | p.l() >> 4; }, // We want to convert 10-bit data to 16-bit data
Expand Down

0 comments on commit 1cec5c3

Please sign in to comment.