diff --git a/src/cuda/cuda-conversion.cu b/src/cuda/cuda-conversion.cu index 874e53e9a1..edd4c678cf 100644 --- a/src/cuda/cuda-conversion.cu +++ b/src/cuda/cuda-conversion.cu @@ -342,7 +342,8 @@ void rscuda::y8_y8_from_y8i_cuda_helper(uint8_t* const dest[], int count, const std::cout << milliseconds << std::endl; */ } -__global__ void kernel_split_frame_y16_y16_from_y12i_cuda(uint16_t* a, uint16_t* b, int count, const rscuda::y12i_pixel * source) +template +__global__ void kernel_split_frame_y16_y16_from_y12i_cuda(uint16_t* a, uint16_t* b, int count, const SOURCE * source) { int i = blockDim.x * blockIdx.x + threadIdx.x; @@ -354,7 +355,8 @@ __global__ void kernel_split_frame_y16_y16_from_y12i_cuda(uint16_t* a, uint16_t* } -void rscuda::y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count, const rscuda::y12i_pixel * source) +template +void rscuda::y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count, const SOURCE * source) { /* cudaEvent_t start, stop; @@ -362,18 +364,18 @@ void rscuda::y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count, cudaEventCreate(&stop); cudaEventRecord(start); */ - source = reinterpret_cast(source); + source = reinterpret_cast(source); int numBlocks = count / RS2_CUDA_THREADS_PER_BLOCK; uint16_t* a = reinterpret_cast(dest[0]); uint16_t* b = reinterpret_cast(dest[1]); - auto d_src = alloc_dev(count); + auto d_src = alloc_dev(count); auto d_dst_0 = alloc_dev(count); auto d_dst_1 = alloc_dev(count); - auto result = cudaMemcpy(d_src.get(), source, count * sizeof(rscuda::y12i_pixel), cudaMemcpyHostToDevice); + auto result = cudaMemcpy(d_src.get(), source, count * sizeof(SOURCE), cudaMemcpyHostToDevice); assert(result == cudaSuccess); kernel_split_frame_y16_y16_from_y12i_cuda <<>> (d_dst_0.get(), d_dst_1.get(), count, d_src.get()); @@ -396,61 +398,10 @@ void rscuda::y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count, */ } +template void rscuda::y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count, const rscuda::y12i_pixel * source); -__global__ void kernel_split_frame_y16_y16_from_y12i_cuda_mipi(uint16_t* a, uint16_t* b, int count, const rscuda::y12i_pixel_mipi * source) -{ - int i = blockDim.x * blockIdx.x + threadIdx.x; - - if (i >= count) - return; - - a[i] = source[i].l() << 6 | source[i].l() >> 4; - b[i] = source[i].r() << 6 | source[i].r() >> 4; -} - - -void rscuda::y16_y16_from_y12i_10_cuda_helper_mipi(uint8_t* const dest[], int count, const rscuda::y12i_pixel_mipi * source) -{ - /* - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaEventRecord(start); */ - - source = reinterpret_cast(source); - - int numBlocks = count / RS2_CUDA_THREADS_PER_BLOCK; - uint16_t* a = reinterpret_cast(dest[0]); - uint16_t* b = reinterpret_cast(dest[1]); - - auto d_src = alloc_dev(count); - auto d_dst_0 = alloc_dev(count); - auto d_dst_1 = alloc_dev(count); - - - auto result = cudaMemcpy(d_src.get(), source, count * sizeof(rscuda::y12i_pixel_mipi), cudaMemcpyHostToDevice); - assert(result == cudaSuccess); - - kernel_split_frame_y16_y16_from_y12i_cuda_mipi <<>> (d_dst_0.get(), d_dst_1.get(), count, d_src.get()); - cudaStreamSynchronize(0); - - result = cudaGetLastError(); - assert(result == cudaSuccess); - - result = cudaMemcpy(a, d_dst_0.get(), count * sizeof(uint16_t), cudaMemcpyDeviceToHost); - assert(result == cudaSuccess); - result = cudaMemcpy(b, d_dst_1.get(), count * sizeof(uint16_t), cudaMemcpyDeviceToHost); - assert(result == cudaSuccess); - - /* - cudaEventRecord(stop); - cudaEventSynchronize(stop); - float milliseconds = 0; - cudaEventElapsedTime(&milliseconds, start, stop); - std::cout << milliseconds << std::endl; - */ -} +template void rscuda::y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count, const rscuda::y12i_pixel_mipi * source); __global__ void kernel_z16_y8_from_sr300_inzi_cuda(const uint16_t* source, uint8_t* const dest, int count) { diff --git a/src/cuda/cuda-conversion.cuh b/src/cuda/cuda-conversion.cuh index 1f9d8820fd..bf2a0747bf 100644 --- a/src/cuda/cuda-conversion.cuh +++ b/src/cuda/cuda-conversion.cuh @@ -28,8 +28,8 @@ namespace rscuda struct y12i_pixel { uint8_t rl : 8, rh : 4, ll : 4, lh : 8; __host__ __device__ int l() const { return lh << 4 | ll; } __host__ __device__ int r() const { return rh << 8 | rl; } }; struct y12i_pixel_mipi { uint8_t rl : 8, rh : 4, ll : 4, lh : 8, padding : 8; __host__ __device__ int l() const { return lh << 4 | ll; } __host__ __device__ int r() const { return rh << 8 | rl; } }; void y8_y8_from_y8i_cuda_helper(uint8_t* const dest[], int count, const y8i_pixel * source); - void y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count, const rscuda::y12i_pixel * source); - void y16_y16_from_y12i_10_cuda_helper_mipi(uint8_t* const dest[], int count, const rscuda::y12i_pixel_mipi * source); + template + void y16_y16_from_y12i_10_cuda_helper(uint8_t* const dest[], int count, const SOURCE * source); void unpack_yuy2_cuda_helper(const uint8_t* src, uint8_t* dst, int n, rs2_format format); template void unpack_yuy2_cuda(uint8_t * const d[], const uint8_t * s, int n) @@ -47,12 +47,7 @@ namespace rscuda template void split_frame_y16_y16_from_y12i_cuda(uint8_t* const dest[], int count, const SOURCE * source) { - y16_y16_from_y12i_10_cuda_helper(dest, count, reinterpret_cast(source)); - } - - template void split_frame_y16_y16_from_y12i_cuda_mipi(uint8_t* const dest[], int count, const SOURCE * source) - { - y16_y16_from_y12i_10_cuda_helper_mipi(dest, count, reinterpret_cast(source)); + y16_y16_from_y12i_10_cuda_helper(dest, count, reinterpret_cast(source)); } void unpack_z16_y8_from_sr300_inzi_cuda(uint8_t* const dest, const uint16_t* source, int count); diff --git a/src/proc/y12i-to-y16y16-mipi.cpp b/src/proc/y12i-to-y16y16-mipi.cpp index 2e1b5e0d29..145d603efa 100644 --- a/src/proc/y12i-to-y16y16-mipi.cpp +++ b/src/proc/y12i-to-y16y16-mipi.cpp @@ -16,7 +16,7 @@ namespace librealsense { auto count = width * height; #ifdef RS2_USE_CUDA - rscuda::split_frame_y16_y16_from_y12i_cuda_mipi(dest, count, reinterpret_cast(source)); + rscuda::split_frame_y16_y16_from_y12i_cuda(dest, count, reinterpret_cast(source)); #else split_frame(dest, count, reinterpret_cast(source), [](const y12i_pixel_mipi& p) -> uint16_t { return p.l() << 6 | p.l() >> 4; }, // We want to convert 10-bit data to 16-bit data diff --git a/src/proc/y12i-to-y16y16.cpp b/src/proc/y12i-to-y16y16.cpp index fda43e282d..ac19d5b478 100644 --- a/src/proc/y12i-to-y16y16.cpp +++ b/src/proc/y12i-to-y16y16.cpp @@ -15,7 +15,7 @@ namespace librealsense { auto count = width * height; #ifdef RS2_USE_CUDA - rscuda::split_frame_y16_y16_from_y12i_cuda(dest, count, reinterpret_cast(source)); + rscuda::split_frame_y16_y16_from_y12i_cuda(dest, count, reinterpret_cast(source)); #else split_frame(dest, count, reinterpret_cast(source), [](const y12i_pixel & p) -> uint16_t { return p.l() << 6 | p.l() >> 4; }, // We want to convert 10-bit data to 16-bit data