From e0668a6f3d7c0988d13b24da56ef64778084c7e3 Mon Sep 17 00:00:00 2001 From: pizzoli Date: Sat, 14 Nov 2015 02:06:10 +0100 Subject: [PATCH] small optimizations --- CMakeLists.txt | 5 +++ include/rmd/mvs_device_data.cuh | 4 ++- include/rmd/seed_matrix.cuh | 3 ++ include/rmd/texture_memory.cuh | 4 +++ src/epipolar_match.cu | 54 ++++++++++++++++----------------- src/seed_matrix.cu | 7 +++++ test/dataset_main.cpp | 3 ++ 7 files changed, 51 insertions(+), 29 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e65598c..e317ca2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,11 @@ SET(CMAKE_BUILD_TYPE Release) # Release, RelWithDebInfo find_package(CUDA REQUIRED) SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-O3) + +# Specify compute capability +#list(APPEND CUDA_NVCC_FLAGS -arch=sm_30) +#list(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_30,code=sm_30 -use_fast_math) + SET(CUDA_PROPAGATE_HOST_FLAGS OFF) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") diff --git a/include/rmd/mvs_device_data.cuh b/include/rmd/mvs_device_data.cuh index 46dd60b..fbc804e 100644 --- a/include/rmd/mvs_device_data.cuh +++ b/include/rmd/mvs_device_data.cuh @@ -41,9 +41,11 @@ struct CorrPatch #ifndef RMD_CORR_PATCH_SIDE #define RMD_CORR_PATCH_SIDE 5 #endif +#define RMD_CORR_PATCH_OFFSET -RMD_CORR_PATCH_SIDE/2 CorrPatch() : side(RMD_CORR_PATCH_SIDE) - , offset(make_int2(-side/2, -side/2)) + , offset(make_int2(RMD_CORR_PATCH_OFFSET, + RMD_CORR_PATCH_OFFSET)) { } const int side; const int2 offset; diff --git a/include/rmd/seed_matrix.cuh b/include/rmd/seed_matrix.cuh index a90be22..500e12f 100644 --- a/include/rmd/seed_matrix.cuh +++ b/include/rmd/seed_matrix.cuh @@ -99,6 +99,9 @@ private: dim3 dim_grid_; // Image reduction to compute seed statistics ImageReducer *img_reducer_; + + // Image size to be copied to constant memory + int host_img_size_xy_[2]; }; } // rmd namespace diff --git a/include/rmd/texture_memory.cuh b/include/rmd/texture_memory.cuh index 2b207b5..50f91d5 100644 --- a/include/rmd/texture_memory.cuh +++ b/include/rmd/texture_memory.cuh @@ -37,6 +37,10 @@ texture epipolar_matches_tex texture g_tex; +// Pre-computed template statistics +texture sum_templ_tex; +texture const_templ_denom_tex; + template inline void bindTexture( texture &tex, diff --git a/src/epipolar_match.cu b/src/epipolar_match.cu index 178056f..385cef6 100644 --- a/src/epipolar_match.cu +++ b/src/epipolar_match.cu @@ -28,6 +28,14 @@ namespace rmd { +__constant__ +int c_img_size_xy[2]; + +extern "C" void copyImgSzXY2Const(int *h_img_size_xy) +{ + cudaMemcpyToSymbol(c_img_size_xy, h_img_size_xy, 2*sizeof(int)); +} + __global__ void seedEpipolarMatchKernel( mvs::DeviceData *dev_ptr, @@ -36,7 +44,7 @@ void seedEpipolarMatchKernel( int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; - if(x >= dev_ptr->width || y >= dev_ptr->height) + if(x >= c_img_size_xy[0] || y >= c_img_size_xy[1]) return; const float xx = x+0.5f; @@ -59,15 +67,6 @@ void seedEpipolarMatchKernel( const float2 px_mean_curr = dev_ptr->cam.world2cam( T_curr_ref * (f_ref * mu) ); - if( (px_mean_curr.x >= dev_ptr->width) || - (px_mean_curr.y >= dev_ptr->height) || - (px_mean_curr.x < 0) || - (px_mean_curr.y < 0) ) - { - dev_ptr->convergence->atXY(x, y) = ConvergenceStates::NOT_VISIBLE; - return; - } - const float2 px_min_curr = dev_ptr->cam.world2cam( T_curr_ref * (f_ref * fmaxf( mu - 3.0f*sigma, 0.01f)) ); const float2 px_max_curr = @@ -78,13 +77,10 @@ void seedEpipolarMatchKernel( const float half_length = 0.5f * fminf(norm(epi_line), RMD_MAX_EXTENT_EPIPOLAR_SEARCH); float2 px_curr, best_px_curr; - const int &side = dev_ptr->patch.side; - const int2 &offset = dev_ptr->patch.offset; - const float n = (float)side * (float)side; + // Retrieve template statistics for NCC matching; + const float sum_templ = tex2D(sum_templ_tex, xx, yy); + const float const_templ_denom = tex2D(const_templ_denom_tex, xx, yy); - // Retrieve template statistics for NCC matching - const float sum_templ = dev_ptr->sum_templ->atXY(x, y); - const float const_templ_denom = dev_ptr->const_templ_denom->atXY(x, y); // init best match score float best_ncc = -1.0f; @@ -94,10 +90,10 @@ void seedEpipolarMatchKernel( for(float l = -half_length; l <= half_length; l += 0.7f) { px_curr = px_mean_curr + l*epi_dir; - if( (px_curr.x >= dev_ptr->width - dev_ptr->patch.side) || - (px_curr.y >= dev_ptr->height - dev_ptr->patch.side) || - (px_curr.x < dev_ptr->patch.side) || - (px_curr.y < dev_ptr->patch.side) ) + if( (px_curr.x >= c_img_size_xy[0] - RMD_CORR_PATCH_SIDE) || + (px_curr.y >= c_img_size_xy[1] - RMD_CORR_PATCH_SIDE) || + (px_curr.x < RMD_CORR_PATCH_SIDE) || + (px_curr.y < RMD_CORR_PATCH_SIDE) ) { continue; } @@ -106,25 +102,27 @@ void seedEpipolarMatchKernel( sum_img_sq = 0.0f; sum_img_templ = 0.0f; - for(int patch_y=0; patch_y best_ncc) diff --git a/src/seed_matrix.cu b/src/seed_matrix.cu index a6872a1..72c98b3 100644 --- a/src/seed_matrix.cu +++ b/src/seed_matrix.cu @@ -16,6 +16,7 @@ // along with this program. If not, see . #include + #include #include @@ -58,6 +59,8 @@ rmd::SeedMatrix::SeedMatrix( dev_data_.one_pix_angle = cam.getOnePixAngle(); dev_data_.width = width; dev_data_.height = height; + host_img_size_xy_[0] = width_; + host_img_size_xy_[1] = height_; // Kernel configuration for depth estimation dim_block_.x = 16; @@ -107,6 +110,9 @@ bool rmd::SeedMatrix::setReferenceImage( rmd::seedInitKernel<<>>(dev_data_.dev_ptr); cudaDeviceSynchronize(); + + rmd::bindTexture(sum_templ_tex, sum_templ_, cudaFilterModePoint); + rmd::bindTexture(const_templ_denom_tex, const_templ_denom_, cudaFilterModePoint); return true; } @@ -136,6 +142,7 @@ bool rmd::SeedMatrix::update( // Establish epipolar correspondences // call epipolar matching kernel + rmd::copyImgSzXY2Const(host_img_size_xy_); rmd::seedEpipolarMatchKernel<<>>(dev_data_.dev_ptr, T_curr_ref); err = cudaDeviceSynchronize(); if(cudaSuccess != err) diff --git a/test/dataset_main.cpp b/test/dataset_main.cpp index df30451..a95d632 100644 --- a/test/dataset_main.cpp +++ b/test/dataset_main.cpp @@ -92,7 +92,10 @@ int main(int argc, char **argv) } else { + double t = (double)cv::getTickCount(); depthmap.update(img, T_world_curr.inv()); + t = ((double)cv::getTickCount() - t)/cv::getTickFrequency(); + printf("\nUPDATE execution time: %f seconds.\n", t); } }