From 03973845709cb06cdf54e4f693172e5380ef79dc Mon Sep 17 00:00:00 2001 From: Cem Moluluo Date: Fri, 9 Feb 2024 12:32:23 +0100 Subject: [PATCH 1/5] added vmaf_cuda_fex_synchronize, fixed cuda fex flush functions --- libvmaf/include/libvmaf/libvmaf_cuda.h | 11 +++++++++++ libvmaf/src/feature/cuda/integer_adm_cuda.c | 4 ++-- libvmaf/src/feature/cuda/integer_motion_cuda.c | 13 +++++++------ libvmaf/src/feature/cuda/integer_vif_cuda.c | 5 +++-- libvmaf/src/libvmaf.c | 14 ++++++++++++++ 5 files changed, 37 insertions(+), 10 deletions(-) diff --git a/libvmaf/include/libvmaf/libvmaf_cuda.h b/libvmaf/include/libvmaf/libvmaf_cuda.h index ac34c24a3..c7fb5adab 100644 --- a/libvmaf/include/libvmaf/libvmaf_cuda.h +++ b/libvmaf/include/libvmaf/libvmaf_cuda.h @@ -102,6 +102,17 @@ int vmaf_cuda_preallocate_pictures(VmafContext *vmaf, */ int vmaf_cuda_fetch_preallocated_picture(VmafContext *vmaf, VmafPicture* pic); +/** + * Synchronizes all CUDA feature extractors within the VmafContext + * with the CPU using their flush function. All feature scores will + * be written when this function returns. + * + * @param vmaf VMAF context allocated with `vmaf_init()` and + * initialized with `vmaf_cuda_preallocate_pictures()`. + * @return 0 on success, or < 0 (a negative errno code) on error. + */ +int vmaf_cuda_fex_synchronize(VmafContext *vmaf); + #ifdef __cplusplus } #endif diff --git a/libvmaf/src/feature/cuda/integer_adm_cuda.c b/libvmaf/src/feature/cuda/integer_adm_cuda.c index d8b414436..caa4d25db 100644 --- a/libvmaf/src/feature/cuda/integer_adm_cuda.c +++ b/libvmaf/src/feature/cuda/integer_adm_cuda.c @@ -31,7 +31,6 @@ #include "cuda/integer_adm_cuda.h" #include "picture_cuda.h" #include - #include #define RES_BUFFER_SIZE 4 * 3 * 2 @@ -1223,7 +1222,8 @@ static int flush_fex_cuda(VmafFeatureExtractor *fex, { AdmStateCuda *s = fex->priv; CHECK_CUDA(cuStreamSynchronize(s->str)); - return 1; + CHECK_CUDA(cuStreamSynchronize(s->host_stream)); + return 0; } static const char *provided_features[] = { diff --git a/libvmaf/src/feature/cuda/integer_motion_cuda.c b/libvmaf/src/feature/cuda/integer_motion_cuda.c index 9615cde98..5796e2404 100644 --- a/libvmaf/src/feature/cuda/integer_motion_cuda.c +++ b/libvmaf/src/feature/cuda/integer_motion_cuda.c @@ -210,13 +210,14 @@ static int flush_fex_cuda(VmafFeatureExtractor *fex, CHECK_CUDA(cuStreamSynchronize(s->str)); CHECK_CUDA(cuStreamSynchronize(s->host_stream)); - if (s->index > 0) { - ret = vmaf_feature_collector_append(feature_collector, - "VMAF_integer_feature_motion2_score", - s->score, s->index); - } + // Not required, write_scores takes care of this + // if (s->index > 0) { + // ret = vmaf_feature_collector_append(feature_collector, + // "VMAF_integer_feature_motion2_score", + // s->score, s->index); + // } - return (ret < 0) ? ret : !ret; + return 0; } static inline double normalize_and_scale_sad(uint64_t sad, diff --git a/libvmaf/src/feature/cuda/integer_vif_cuda.c b/libvmaf/src/feature/cuda/integer_vif_cuda.c index fcf68f442..d992a529f 100644 --- a/libvmaf/src/feature/cuda/integer_vif_cuda.c +++ b/libvmaf/src/feature/cuda/integer_vif_cuda.c @@ -496,7 +496,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, write_score_parameters_vif *data = s->buf.cpu_param_buf; data->feature_collector = feature_collector; data->index = index; - CHECK_CUDA(cuLaunchHostFunc(s->str, write_scores, data)); + CHECK_CUDA(cuLaunchHostFunc(s->host_stream, write_scores, data)); return 0; } @@ -527,7 +527,8 @@ static int flush_fex_cuda(VmafFeatureExtractor *fex, VifStateCuda *s = fex->priv; CHECK_CUDA(cuStreamSynchronize(s->str)); - return 1; + CHECK_CUDA(cuStreamSynchronize(s->host_stream)); + return 0; } static const char *provided_features[] = { diff --git a/libvmaf/src/libvmaf.c b/libvmaf/src/libvmaf.c index 3fbb050c5..50f254463 100644 --- a/libvmaf/src/libvmaf.c +++ b/libvmaf/src/libvmaf.c @@ -362,6 +362,20 @@ int vmaf_use_features_from_model_collection(VmafContext *vmaf, return err; } + +int vmaf_cuda_fex_synchronize(VmafContext *vmaf) { + if(!vmaf) return -EINVAL; + int err = 0; + RegisteredFeatureExtractors rfe = vmaf->registered_feature_extractors; + for (unsigned i = 0; i < rfe.cnt; i++) { + if ((rfe.fex_ctx[i]->fex->flags & VMAF_FEATURE_EXTRACTOR_CUDA)) + err |= vmaf_feature_extractor_context_flush(rfe.fex_ctx[i], + vmaf->feature_collector); + } + + return err; +} + struct ThreadData { VmafFeatureExtractorContext *fex_ctx; VmafPicture ref, dist; From dcdef5f1867b709320fda8f48e699513722a29ca Mon Sep 17 00:00:00 2001 From: Cem Moluluo Date: Tue, 12 Mar 2024 18:00:42 +0100 Subject: [PATCH 2/5] fixed fex flush function --- libvmaf/src/feature/cuda/integer_adm_cuda.c | 36 +++++++--- .../src/feature/cuda/integer_motion_cuda.c | 71 +++++++++++++------ libvmaf/src/feature/cuda/integer_vif_cuda.c | 34 +++++++-- libvmaf/src/libvmaf.c | 14 ++++ 4 files changed, 116 insertions(+), 39 deletions(-) diff --git a/libvmaf/src/feature/cuda/integer_adm_cuda.c b/libvmaf/src/feature/cuda/integer_adm_cuda.c index caa4d25db..516851825 100644 --- a/libvmaf/src/feature/cuda/integer_adm_cuda.c +++ b/libvmaf/src/feature/cuda/integer_adm_cuda.c @@ -32,6 +32,7 @@ #include "picture_cuda.h" #include #include +#include "nvtx3/nvToolsExt.h" #define RES_BUFFER_SIZE 4 * 3 * 2 @@ -54,7 +55,7 @@ typedef struct AdmStateCuda { int dst_stride, CUstream c_stream); CUstream str, host_stream; void* write_score_parameters; - CUevent ref_event, dis_event, finished; + CUevent ref_event, dis_event, finished, write_scores; VmafDictionary *feature_name_dict; // adm_dwt kernels @@ -641,7 +642,7 @@ typedef struct write_score_parameters_adm { static int write_scores(write_score_parameters_adm* params) { - + nvtxRangePushA("write_scores ADM"); VmafFeatureCollector *feature_collector = params->feature_collector; AdmStateCuda *s = params->s; unsigned index = params->index; @@ -714,7 +715,12 @@ static int write_scores(write_score_parameters_adm* params) s->feature_name_dict, "integer_adm_scale3", scores[6] / scores[7], index); - if (!s->debug) return err; + if (!s->debug) { + + nvtxRangePop(); + return err; + } + err |= vmaf_feature_collector_append_with_dict(feature_collector, s->feature_name_dict, "integer_adm", score, index); @@ -748,7 +754,7 @@ static int write_scores(write_score_parameters_adm* params) err |= vmaf_feature_collector_append_with_dict(feature_collector, s->feature_name_dict, "integer_adm_den_scale3", scores[7], index); - + nvtxRangePop(); return err; } @@ -1014,9 +1020,10 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); CHECK_CUDA(cuStreamCreateWithPriority(&s->str, CU_STREAM_NON_BLOCKING, 0)); CHECK_CUDA(cuStreamCreateWithPriority(&s->host_stream, CU_STREAM_NON_BLOCKING, 0)); - CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); - CHECK_CUDA(cuEventCreate(&s->ref_event, CU_EVENT_DEFAULT)); - CHECK_CUDA(cuEventCreate(&s->dis_event, CU_EVENT_DEFAULT)); + CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->ref_event, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->dis_event, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->write_scores, CU_EVENT_DISABLE_TIMING)); CUmodule adm_cm_module, adm_csf_den_module, adm_csf_module, adm_decouple_module, adm_dwt_module; @@ -1157,7 +1164,9 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, // CHECK_CUDA(cuEventSynchronize(s->finished)); CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); CHECK_CUDA(cuEventDestroy(s->finished)); - CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); + CHECK_CUDA(cuEventDestroy(s->write_scores)); + CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->write_scores, CU_EVENT_DISABLE_TIMING)); CHECK_CUDA(cuCtxPopCurrent(NULL)); // current implementation is limited by the 16-bit data pipeline, thus @@ -1178,6 +1187,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, data->w = ref_pic->w[0]; CHECK_CUDA(cuStreamWaitEvent(s->host_stream, s->finished, CU_EVENT_WAIT_DEFAULT)); CHECK_CUDA(cuLaunchHostFunc(s->host_stream, (CUhostFn)write_scores, data)); + CHECK_CUDA(cuEventRecord(s->write_scores, s->host_stream)); return 0; } @@ -1220,10 +1230,18 @@ static int close_fex_cuda(VmafFeatureExtractor *fex) static int flush_fex_cuda(VmafFeatureExtractor *fex, VmafFeatureCollector *feature_collector) { + nvtxRangePushA("flush ADM"); AdmStateCuda *s = fex->priv; + int ret = 0; CHECK_CUDA(cuStreamSynchronize(s->str)); CHECK_CUDA(cuStreamSynchronize(s->host_stream)); - return 0; + while (cuEventQuery(s->write_scores) != CUDA_SUCCESS) + { + continue; + } + CHECK_CUDA(cuEventSynchronize(s->write_scores)); + nvtxRangePop(); + return (ret < 0) ? ret : !ret; } static const char *provided_features[] = { diff --git a/libvmaf/src/feature/cuda/integer_motion_cuda.c b/libvmaf/src/feature/cuda/integer_motion_cuda.c index 5796e2404..76fa7873c 100644 --- a/libvmaf/src/feature/cuda/integer_motion_cuda.c +++ b/libvmaf/src/feature/cuda/integer_motion_cuda.c @@ -31,9 +31,10 @@ #include "picture.h" #include "picture_cuda.h" #include "cuda_helper.cuh" +#include "nvtx3/nvToolsExt.h" typedef struct MotionStateCuda { - CUevent event, finished; + CUevent event, finished, scores_written; CUfunction funcbpc8, funcbpc16; CUstream str, host_stream; VmafCudaBuffer* blur[2]; @@ -44,6 +45,8 @@ typedef struct MotionStateCuda { double score; bool debug; bool motion_force_zero; + bool flushed; + bool closed; void (*calculate_motion_score)(const VmafPicture* src, VmafCudaBuffer* src_blurred, const VmafCudaBuffer* prev_blurred, VmafCudaBuffer* sad, unsigned width, unsigned height, @@ -136,12 +139,15 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt unsigned bpc, unsigned w, unsigned h) { MotionStateCuda *s = fex->priv; + s->flushed = true; + s->closed = false; CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); CHECK_CUDA(cuStreamCreateWithPriority(&s->str, CU_STREAM_NON_BLOCKING, 0)); CHECK_CUDA(cuStreamCreateWithPriority(&s->host_stream, CU_STREAM_NON_BLOCKING, 0)); - CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT)); - CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); + CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->scores_written, CU_EVENT_BLOCKING_SYNC)); CUmodule module; CHECK_CUDA(cuModuleLoadData(&module, src_motion_score_ptx)); @@ -202,22 +208,34 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt return -ENOMEM; } +// if called twice in a row, finalize FEX and close static int flush_fex_cuda(VmafFeatureExtractor *fex, VmafFeatureCollector *feature_collector) { + nvtxRangePushA("FLUSH MOT"); + MotionStateCuda *s = fex->priv; int ret = 0; - CHECK_CUDA(cuStreamSynchronize(s->str)); - CHECK_CUDA(cuStreamSynchronize(s->host_stream)); - - // Not required, write_scores takes care of this - // if (s->index > 0) { - // ret = vmaf_feature_collector_append(feature_collector, - // "VMAF_integer_feature_motion2_score", - // s->score, s->index); - // } - - return 0; + if(!s->flushed) { + CHECK_CUDA(cuStreamSynchronize(s->str)); + CHECK_CUDA(cuStreamSynchronize(s->host_stream)); + while (cuEventQuery(s->scores_written) != CUDA_SUCCESS) + { + continue; + } + CHECK_CUDA(cuEventSynchronize(s->scores_written)); + nvtxRangePop(); + } + else { + if (s->index > 0 && !s->closed) { + ret = vmaf_feature_collector_append(feature_collector, + "VMAF_integer_feature_motion2_score", + s->score, s->index); + } + s->closed = true; + } + s->flushed = true; + return (ret < 0) ? ret : !ret; } static inline double normalize_and_scale_sad(uint64_t sad, @@ -243,7 +261,7 @@ static int write_scores(write_score_parameters_moco* params) } if (err) return err; - if (params->index == 1) + if (params->index == 1) return 0; err = vmaf_feature_collector_append(feature_collector, @@ -258,13 +276,16 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic, VmafFeatureCollector *feature_collector) { MotionStateCuda *s = fex->priv; - + if(s->closed) { + return -ESHUTDOWN; // TODO: proper error code here + } + s->flushed = false; // this is done to ensure that the CPU does not overwrite the buffer params for 'write_scores CHECK_CUDA(cuStreamSynchronize(s->str)); - // CHECK_CUDA(cuEventSynchronize(s->finished)); + CHECK_CUDA(cuEventSynchronize(s->finished)); CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); - CHECK_CUDA(cuEventDestroy(s->finished)); - CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); + // CHECK_CUDA(cuEventDestroy(s->finished)); + // CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING)); CHECK_CUDA(cuCtxPopCurrent(NULL)); int err = 0; @@ -287,10 +308,12 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic, CHECK_CUDA(cuEventRecord(s->event, vmaf_cuda_picture_get_stream(ref_pic))); // This event ensures the input buffer is consumed CHECK_CUDA(cuStreamWaitEvent(s->str, s->event, CU_EVENT_WAIT_DEFAULT)); - CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); - CHECK_CUDA(cuEventDestroy(s->event)); - CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT)); - CHECK_CUDA(cuCtxPopCurrent(NULL)); + // CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); + // CHECK_CUDA(cuEventDestroy(s->event)); + // CHECK_CUDA(cuEventDestroy(s->scores_written)); + // CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DISABLE_TIMING)); + // CHECK_CUDA(cuEventCreate(&s->scores_written, CU_EVENT_BLOCKING_SYNC)); + // CHECK_CUDA(cuCtxPopCurrent(NULL)); if (index == 0) { err = vmaf_feature_collector_append(feature_collector, @@ -312,11 +335,13 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic, CHECK_CUDA(cuStreamWaitEvent(s->host_stream, s->finished, CU_EVENT_WAIT_DEFAULT)); write_score_parameters_moco* params = s->write_score_parameters; + cuEventSynchronize(s->scores_written); params->feature_collector = feature_collector; params->h = ref_pic->h[0]; params->w = ref_pic->w[0]; params->index = index; CHECK_CUDA(cuLaunchHostFunc(s->host_stream, write_scores, s->write_score_parameters)); + CHECK_CUDA(cuEventRecord(s->scores_written, s->host_stream)); return 0; } diff --git a/libvmaf/src/feature/cuda/integer_vif_cuda.c b/libvmaf/src/feature/cuda/integer_vif_cuda.c index d992a529f..c2a4486c1 100644 --- a/libvmaf/src/feature/cuda/integer_vif_cuda.c +++ b/libvmaf/src/feature/cuda/integer_vif_cuda.c @@ -33,6 +33,9 @@ #include "cuda/integer_vif_cuda.h" #include "picture_cuda.h" + +#include "nvtx3/nvToolsExt.h" + #if ARCH_X86 #include "x86/vif_avx2.h" #if HAVE_AVX512 @@ -42,7 +45,7 @@ typedef struct VifStateCuda { VifBufferCuda buf; - CUevent event, finished; + CUevent event, finished, write_scores; CUstream str, host_stream; bool debug; double vif_enhn_gain_limit; @@ -101,7 +104,7 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt CHECK_CUDA(cuStreamCreateWithPriority(&s->host_stream, CU_STREAM_NON_BLOCKING, 0)); CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT)); CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); - + CHECK_CUDA(cuEventCreate(&s->write_scores, CU_EVENT_DISABLE_TIMING)); // make this static CUmodule filter1d_module; CHECK_CUDA(cuModuleLoadData(&filter1d_module, src_filter1d_ptx)); @@ -346,6 +349,7 @@ typedef struct VifScore { static int write_scores(write_score_parameters_vif* data) { + nvtxRangePushA("write_scoes VIF"); VmafFeatureCollector *feature_collector = data->feature_collector; VifStateCuda *s = data->s; unsigned index = data->index; @@ -380,7 +384,11 @@ static int write_scores(write_score_parameters_vif* data) s->feature_name_dict, "VMAF_integer_feature_vif_scale3_score", vif.scale[3].num / vif.scale[3].den, index); - if (!s->debug) return err; + if (!s->debug) { + + nvtxRangePop(); + return err; + } const double score_num = (double)vif.scale[0].num + (double)vif.scale[1].num + @@ -433,7 +441,7 @@ static int write_scores(write_score_parameters_vif* data) err |= vmaf_feature_collector_append_with_dict(feature_collector, s->feature_name_dict, "integer_vif_den_scale3", vif.scale[3].den, index); - + nvtxRangePop(); return err; } @@ -454,7 +462,9 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, CHECK_CUDA(cuStreamSynchronize(s->str)); CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); CHECK_CUDA(cuEventDestroy(s->finished)); - CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); + CHECK_CUDA(cuEventDestroy(s->write_scores)); + CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->write_scores, CU_EVENT_DISABLE_TIMING)); CHECK_CUDA(cuCtxPopCurrent(NULL)); CHECK_CUDA(cuMemsetD8Async(s->buf.accum_data->data, 0, sizeof(vif_accums) * 4, s->str)); @@ -496,7 +506,8 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, write_score_parameters_vif *data = s->buf.cpu_param_buf; data->feature_collector = feature_collector; data->index = index; - CHECK_CUDA(cuLaunchHostFunc(s->host_stream, write_scores, data)); + CHECK_CUDA(cuLaunchHostFunc(s->host_stream, (CUhostFn)write_scores, data)); + CHECK_CUDA(cuEventRecord(s->write_scores, s->host_stream)); return 0; } @@ -524,11 +535,20 @@ static int close_fex_cuda(VmafFeatureExtractor *fex) static int flush_fex_cuda(VmafFeatureExtractor *fex, VmafFeatureCollector *feature_collector) { + nvtxRangePushA("flush VIF"); VifStateCuda *s = fex->priv; + int ret = 0; CHECK_CUDA(cuStreamSynchronize(s->str)); CHECK_CUDA(cuStreamSynchronize(s->host_stream)); - return 0; + while (cuEventQuery(s->write_scores) != CUDA_SUCCESS) + { + continue; + } + CHECK_CUDA(cuEventSynchronize(s->write_scores)); + nvtxRangePop(); + + return (ret < 0) ? ret : !ret; } static const char *provided_features[] = { diff --git a/libvmaf/src/libvmaf.c b/libvmaf/src/libvmaf.c index 50f254463..845103d19 100644 --- a/libvmaf/src/libvmaf.c +++ b/libvmaf/src/libvmaf.c @@ -506,6 +506,8 @@ static int flush_context(VmafContext *vmaf) } #ifdef HAVE_CUDA + vmaf_cuda_fex_synchronize(vmaf); + vmaf_cuda_fex_synchronize(vmaf); if (vmaf->cuda.state.ctx) { RegisteredFeatureExtractors rfe = vmaf->registered_feature_extractors; for (unsigned i = 0; i < rfe.cnt; i++) { @@ -761,6 +763,16 @@ int vmaf_score_at_index(VmafContext *vmaf, VmafModel *model, double *score, if (err) { err = vmaf_predict_score_at_index(model, vmaf->feature_collector, index, score, true, 0); + // if(err) { + // // Error? Sync and try again + // vmaf_cuda_fex_synchronize(vmaf); + // err = vmaf_predict_score_at_index(model, vmaf->feature_collector, index, + // score, true, 0); + // if(err == 0) { + // // No error - got score + // return 0; + // } + // } } return err; @@ -789,6 +801,8 @@ int vmaf_feature_score_pooled(VmafContext *vmaf, const char *feature_name, if (index_low > index_high) return -EINVAL; if (!pool_method) return -EINVAL; + // vmaf_cuda_fex_synchronize(vmaf); + unsigned pic_cnt = 0; double min = 0., max = 0., sum = 0., i_sum = 0.; for (unsigned i = index_low; i <= index_high; i++) { From c6a7a927e7f975238eeea89d788a2b68db1bda6d Mon Sep 17 00:00:00 2001 From: Cem Moluluo Date: Tue, 12 Mar 2024 19:17:07 +0100 Subject: [PATCH 3/5] code cleanup --- libvmaf/src/feature/cuda/integer_adm_cuda.c | 27 +++++++++---------- .../src/feature/cuda/integer_motion_cuda.c | 15 +++++------ libvmaf/src/feature/cuda/integer_vif_cuda.c | 27 +++++++++---------- 3 files changed, 31 insertions(+), 38 deletions(-) diff --git a/libvmaf/src/feature/cuda/integer_adm_cuda.c b/libvmaf/src/feature/cuda/integer_adm_cuda.c index 516851825..4ec67da68 100644 --- a/libvmaf/src/feature/cuda/integer_adm_cuda.c +++ b/libvmaf/src/feature/cuda/integer_adm_cuda.c @@ -55,7 +55,7 @@ typedef struct AdmStateCuda { int dst_stride, CUstream c_stream); CUstream str, host_stream; void* write_score_parameters; - CUevent ref_event, dis_event, finished, write_scores; + CUevent ref_event, dis_event, finished, scores_written; VmafDictionary *feature_name_dict; // adm_dwt kernels @@ -1023,7 +1023,7 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING)); CHECK_CUDA(cuEventCreate(&s->ref_event, CU_EVENT_DISABLE_TIMING)); CHECK_CUDA(cuEventCreate(&s->dis_event, CU_EVENT_DISABLE_TIMING)); - CHECK_CUDA(cuEventCreate(&s->write_scores, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->scores_written, CU_EVENT_DISABLE_TIMING)); CUmodule adm_cm_module, adm_csf_den_module, adm_csf_module, adm_decouple_module, adm_dwt_module; @@ -1161,12 +1161,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, // this is done to ensure that the CPU does not overwrite the buffer params for 'write_scores CHECK_CUDA(cuStreamSynchronize(s->str)); - // CHECK_CUDA(cuEventSynchronize(s->finished)); CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); - CHECK_CUDA(cuEventDestroy(s->finished)); - CHECK_CUDA(cuEventDestroy(s->write_scores)); - CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING)); - CHECK_CUDA(cuEventCreate(&s->write_scores, CU_EVENT_DISABLE_TIMING)); CHECK_CUDA(cuCtxPopCurrent(NULL)); // current implementation is limited by the 16-bit data pipeline, thus @@ -1187,7 +1182,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, data->w = ref_pic->w[0]; CHECK_CUDA(cuStreamWaitEvent(s->host_stream, s->finished, CU_EVENT_WAIT_DEFAULT)); CHECK_CUDA(cuLaunchHostFunc(s->host_stream, (CUhostFn)write_scores, data)); - CHECK_CUDA(cuEventRecord(s->write_scores, s->host_stream)); + CHECK_CUDA(cuEventRecord(s->scores_written, s->host_stream)); return 0; } @@ -1224,23 +1219,25 @@ static int close_fex_cuda(VmafFeatureExtractor *fex) if (s->write_score_parameters) free(s->write_score_parameters); ret |= vmaf_dictionary_free(&s->feature_name_dict); + + if(s->ref_event) CHECK_CUDA(cuEventDestroy(s->ref_event)); + if(s->dis_event) CHECK_CUDA(cuEventDestroy(s->dis_event)); + if(s->finished) CHECK_CUDA(cuEventDestroy(s->finished)); + if(s->scores_written) CHECK_CUDA(cuEventDestroy(s->scores_written)); + + //cuStreamDestroy(s->str); + return ret; } static int flush_fex_cuda(VmafFeatureExtractor *fex, VmafFeatureCollector *feature_collector) { - nvtxRangePushA("flush ADM"); AdmStateCuda *s = fex->priv; int ret = 0; CHECK_CUDA(cuStreamSynchronize(s->str)); CHECK_CUDA(cuStreamSynchronize(s->host_stream)); - while (cuEventQuery(s->write_scores) != CUDA_SUCCESS) - { - continue; - } - CHECK_CUDA(cuEventSynchronize(s->write_scores)); - nvtxRangePop(); + CHECK_CUDA(cuEventSynchronize(s->scores_written)); return (ret < 0) ? ret : !ret; } diff --git a/libvmaf/src/feature/cuda/integer_motion_cuda.c b/libvmaf/src/feature/cuda/integer_motion_cuda.c index 76fa7873c..419c4843f 100644 --- a/libvmaf/src/feature/cuda/integer_motion_cuda.c +++ b/libvmaf/src/feature/cuda/integer_motion_cuda.c @@ -284,8 +284,6 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic, CHECK_CUDA(cuStreamSynchronize(s->str)); CHECK_CUDA(cuEventSynchronize(s->finished)); CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); - // CHECK_CUDA(cuEventDestroy(s->finished)); - // CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING)); CHECK_CUDA(cuCtxPopCurrent(NULL)); int err = 0; @@ -308,12 +306,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic, CHECK_CUDA(cuEventRecord(s->event, vmaf_cuda_picture_get_stream(ref_pic))); // This event ensures the input buffer is consumed CHECK_CUDA(cuStreamWaitEvent(s->str, s->event, CU_EVENT_WAIT_DEFAULT)); - // CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); - // CHECK_CUDA(cuEventDestroy(s->event)); - // CHECK_CUDA(cuEventDestroy(s->scores_written)); - // CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DISABLE_TIMING)); - // CHECK_CUDA(cuEventCreate(&s->scores_written, CU_EVENT_BLOCKING_SYNC)); - // CHECK_CUDA(cuCtxPopCurrent(NULL)); + if (index == 0) { err = vmaf_feature_collector_append(feature_collector, @@ -368,6 +361,12 @@ static int close_fex_cuda(VmafFeatureExtractor *fex) if(s->write_score_parameters) { free(s->write_score_parameters); } + + + if(s->event) CHECK_CUDA(cuEventDestroy(s->event)); + if(s->finished) CHECK_CUDA(cuEventDestroy(s->finished)); + if(s->scores_written) CHECK_CUDA(cuEventDestroy(s->scores_written)); + return ret; } diff --git a/libvmaf/src/feature/cuda/integer_vif_cuda.c b/libvmaf/src/feature/cuda/integer_vif_cuda.c index c2a4486c1..3974923e1 100644 --- a/libvmaf/src/feature/cuda/integer_vif_cuda.c +++ b/libvmaf/src/feature/cuda/integer_vif_cuda.c @@ -45,7 +45,7 @@ typedef struct VifStateCuda { VifBufferCuda buf; - CUevent event, finished, write_scores; + CUevent event, finished, scores_written; CUstream str, host_stream; bool debug; double vif_enhn_gain_limit; @@ -102,9 +102,9 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); CHECK_CUDA(cuStreamCreateWithPriority(&s->str, CU_STREAM_NON_BLOCKING, 0)); CHECK_CUDA(cuStreamCreateWithPriority(&s->host_stream, CU_STREAM_NON_BLOCKING, 0)); - CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT)); - CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); - CHECK_CUDA(cuEventCreate(&s->write_scores, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->scores_written, CU_EVENT_DISABLE_TIMING)); // make this static CUmodule filter1d_module; CHECK_CUDA(cuModuleLoadData(&filter1d_module, src_filter1d_ptx)); @@ -461,10 +461,6 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, // before the GPU has finished writing to it. CHECK_CUDA(cuStreamSynchronize(s->str)); CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); - CHECK_CUDA(cuEventDestroy(s->finished)); - CHECK_CUDA(cuEventDestroy(s->write_scores)); - CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING)); - CHECK_CUDA(cuEventCreate(&s->write_scores, CU_EVENT_DISABLE_TIMING)); CHECK_CUDA(cuCtxPopCurrent(NULL)); CHECK_CUDA(cuMemsetD8Async(s->buf.accum_data->data, 0, sizeof(vif_accums) * 4, s->str)); @@ -488,7 +484,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, CHECK_CUDA(cuStreamWaitEvent(s->str, s->event, CU_EVENT_WAIT_DEFAULT)); CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); CHECK_CUDA(cuEventDestroy(s->event)); - CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT)); + CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DISABLE_TIMING)); CHECK_CUDA(cuCtxPopCurrent(NULL)); } } @@ -507,7 +503,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, data->feature_collector = feature_collector; data->index = index; CHECK_CUDA(cuLaunchHostFunc(s->host_stream, (CUhostFn)write_scores, data)); - CHECK_CUDA(cuEventRecord(s->write_scores, s->host_stream)); + CHECK_CUDA(cuEventRecord(s->scores_written, s->host_stream)); return 0; } @@ -528,6 +524,11 @@ static int close_fex_cuda(VmafFeatureExtractor *fex) if (s->buf.accum_host) { ret |= vmaf_cuda_buffer_host_free(fex->cu_state, s->buf.accum_host); } + + if(s->event) CHECK_CUDA(cuEventDestroy(s->event)); + if(s->finished) CHECK_CUDA(cuEventDestroy(s->finished)); + if(s->scores_written) CHECK_CUDA(cuEventDestroy(s->scores_written)); + ret |= vmaf_dictionary_free(&s->feature_name_dict); return ret; } @@ -541,11 +542,7 @@ static int flush_fex_cuda(VmafFeatureExtractor *fex, int ret = 0; CHECK_CUDA(cuStreamSynchronize(s->str)); CHECK_CUDA(cuStreamSynchronize(s->host_stream)); - while (cuEventQuery(s->write_scores) != CUDA_SUCCESS) - { - continue; - } - CHECK_CUDA(cuEventSynchronize(s->write_scores)); + CHECK_CUDA(cuEventSynchronize(s->scores_written)); nvtxRangePop(); return (ret < 0) ? ret : !ret; From a8286da398cb38a6f5c85a7811c82d0f96f2969a Mon Sep 17 00:00:00 2001 From: Cem Moluluo Date: Wed, 13 Mar 2024 00:25:06 +0100 Subject: [PATCH 4/5] removed redundant nvtx ranges --- libvmaf/src/feature/cuda/integer_adm_cuda.c | 11 ++++++++--- libvmaf/src/feature/cuda/integer_motion_cuda.c | 12 ++++++++++-- libvmaf/src/feature/cuda/integer_vif_cuda.c | 18 +++++++++--------- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/libvmaf/src/feature/cuda/integer_adm_cuda.c b/libvmaf/src/feature/cuda/integer_adm_cuda.c index 4ec67da68..52f23fc32 100644 --- a/libvmaf/src/feature/cuda/integer_adm_cuda.c +++ b/libvmaf/src/feature/cuda/integer_adm_cuda.c @@ -32,7 +32,9 @@ #include "picture_cuda.h" #include #include +#ifdef HAVE_NVTX #include "nvtx3/nvToolsExt.h" +#endif #define RES_BUFFER_SIZE 4 * 3 * 2 @@ -642,7 +644,6 @@ typedef struct write_score_parameters_adm { static int write_scores(write_score_parameters_adm* params) { - nvtxRangePushA("write_scores ADM"); VmafFeatureCollector *feature_collector = params->feature_collector; AdmStateCuda *s = params->s; unsigned index = params->index; @@ -717,7 +718,6 @@ static int write_scores(write_score_parameters_adm* params) if (!s->debug) { - nvtxRangePop(); return err; } @@ -754,7 +754,6 @@ static int write_scores(write_score_parameters_adm* params) err |= vmaf_feature_collector_append_with_dict(feature_collector, s->feature_name_dict, "integer_adm_den_scale3", scores[7], index); - nvtxRangePop(); return err; } @@ -1233,11 +1232,17 @@ static int close_fex_cuda(VmafFeatureExtractor *fex) static int flush_fex_cuda(VmafFeatureExtractor *fex, VmafFeatureCollector *feature_collector) { +#ifdef HAVE_NVTX + nvtxRangePushA("flush adm_cuda"); +#endif AdmStateCuda *s = fex->priv; int ret = 0; CHECK_CUDA(cuStreamSynchronize(s->str)); CHECK_CUDA(cuStreamSynchronize(s->host_stream)); CHECK_CUDA(cuEventSynchronize(s->scores_written)); +#ifdef HAVE_NVTX + nvtxRangePop(); +#endif return (ret < 0) ? ret : !ret; } diff --git a/libvmaf/src/feature/cuda/integer_motion_cuda.c b/libvmaf/src/feature/cuda/integer_motion_cuda.c index 419c4843f..3998bc42c 100644 --- a/libvmaf/src/feature/cuda/integer_motion_cuda.c +++ b/libvmaf/src/feature/cuda/integer_motion_cuda.c @@ -31,7 +31,9 @@ #include "picture.h" #include "picture_cuda.h" #include "cuda_helper.cuh" +#ifdef HAVE_NVTX #include "nvtx3/nvToolsExt.h" +#endif typedef struct MotionStateCuda { CUevent event, finished, scores_written; @@ -212,7 +214,9 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt static int flush_fex_cuda(VmafFeatureExtractor *fex, VmafFeatureCollector *feature_collector) { - nvtxRangePushA("FLUSH MOT"); +#ifdef HAVE_NVTX + nvtxRangePushA("flush motion_cuda"); +#endif MotionStateCuda *s = fex->priv; int ret = 0; @@ -224,7 +228,7 @@ static int flush_fex_cuda(VmafFeatureExtractor *fex, continue; } CHECK_CUDA(cuEventSynchronize(s->scores_written)); - nvtxRangePop(); + } else { if (s->index > 0 && !s->closed) { @@ -235,6 +239,10 @@ static int flush_fex_cuda(VmafFeatureExtractor *fex, s->closed = true; } s->flushed = true; + +#ifdef HAVE_NVTX + nvtxRangePop(); +#endif return (ret < 0) ? ret : !ret; } diff --git a/libvmaf/src/feature/cuda/integer_vif_cuda.c b/libvmaf/src/feature/cuda/integer_vif_cuda.c index 3974923e1..c43b81706 100644 --- a/libvmaf/src/feature/cuda/integer_vif_cuda.c +++ b/libvmaf/src/feature/cuda/integer_vif_cuda.c @@ -33,8 +33,9 @@ #include "cuda/integer_vif_cuda.h" #include "picture_cuda.h" - +#ifdef HAVE_NVTX #include "nvtx3/nvToolsExt.h" +#endif #if ARCH_X86 #include "x86/vif_avx2.h" @@ -349,7 +350,6 @@ typedef struct VifScore { static int write_scores(write_score_parameters_vif* data) { - nvtxRangePushA("write_scoes VIF"); VmafFeatureCollector *feature_collector = data->feature_collector; VifStateCuda *s = data->s; unsigned index = data->index; @@ -384,11 +384,7 @@ static int write_scores(write_score_parameters_vif* data) s->feature_name_dict, "VMAF_integer_feature_vif_scale3_score", vif.scale[3].num / vif.scale[3].den, index); - if (!s->debug) { - - nvtxRangePop(); - return err; - } + if (!s->debug) return err; const double score_num = (double)vif.scale[0].num + (double)vif.scale[1].num + @@ -441,7 +437,7 @@ static int write_scores(write_score_parameters_vif* data) err |= vmaf_feature_collector_append_with_dict(feature_collector, s->feature_name_dict, "integer_vif_den_scale3", vif.scale[3].den, index); - nvtxRangePop(); + return err; } @@ -536,14 +532,18 @@ static int close_fex_cuda(VmafFeatureExtractor *fex) static int flush_fex_cuda(VmafFeatureExtractor *fex, VmafFeatureCollector *feature_collector) { - nvtxRangePushA("flush VIF"); +#ifdef HAVE_NVTX + nvtxRangePushA("flush vif_cuda"); +#endif VifStateCuda *s = fex->priv; int ret = 0; CHECK_CUDA(cuStreamSynchronize(s->str)); CHECK_CUDA(cuStreamSynchronize(s->host_stream)); CHECK_CUDA(cuEventSynchronize(s->scores_written)); +#ifdef HAVE_NVTX nvtxRangePop(); +#endif return (ret < 0) ? ret : !ret; } From 7293db2c039548d39ec57ac118dcdf491a4dc17d Mon Sep 17 00:00:00 2001 From: Cem Moluluo Date: Wed, 13 Mar 2024 00:25:32 +0100 Subject: [PATCH 5/5] removed redundant sync --- libvmaf/src/libvmaf.c | 1 - 1 file changed, 1 deletion(-) diff --git a/libvmaf/src/libvmaf.c b/libvmaf/src/libvmaf.c index 845103d19..560735666 100644 --- a/libvmaf/src/libvmaf.c +++ b/libvmaf/src/libvmaf.c @@ -506,7 +506,6 @@ static int flush_context(VmafContext *vmaf) } #ifdef HAVE_CUDA - vmaf_cuda_fex_synchronize(vmaf); vmaf_cuda_fex_synchronize(vmaf); if (vmaf->cuda.state.ctx) { RegisteredFeatureExtractors rfe = vmaf->registered_feature_extractors;