From dcdef5f1867b709320fda8f48e699513722a29ca Mon Sep 17 00:00:00 2001 From: Cem Moluluo Date: Tue, 12 Mar 2024 18:00:42 +0100 Subject: [PATCH] fixed fex flush function --- libvmaf/src/feature/cuda/integer_adm_cuda.c | 36 +++++++--- .../src/feature/cuda/integer_motion_cuda.c | 71 +++++++++++++------ libvmaf/src/feature/cuda/integer_vif_cuda.c | 34 +++++++-- libvmaf/src/libvmaf.c | 14 ++++ 4 files changed, 116 insertions(+), 39 deletions(-) diff --git a/libvmaf/src/feature/cuda/integer_adm_cuda.c b/libvmaf/src/feature/cuda/integer_adm_cuda.c index caa4d25db..516851825 100644 --- a/libvmaf/src/feature/cuda/integer_adm_cuda.c +++ b/libvmaf/src/feature/cuda/integer_adm_cuda.c @@ -32,6 +32,7 @@ #include "picture_cuda.h" #include #include +#include "nvtx3/nvToolsExt.h" #define RES_BUFFER_SIZE 4 * 3 * 2 @@ -54,7 +55,7 @@ typedef struct AdmStateCuda { int dst_stride, CUstream c_stream); CUstream str, host_stream; void* write_score_parameters; - CUevent ref_event, dis_event, finished; + CUevent ref_event, dis_event, finished, write_scores; VmafDictionary *feature_name_dict; // adm_dwt kernels @@ -641,7 +642,7 @@ typedef struct write_score_parameters_adm { static int write_scores(write_score_parameters_adm* params) { - + nvtxRangePushA("write_scores ADM"); VmafFeatureCollector *feature_collector = params->feature_collector; AdmStateCuda *s = params->s; unsigned index = params->index; @@ -714,7 +715,12 @@ static int write_scores(write_score_parameters_adm* params) s->feature_name_dict, "integer_adm_scale3", scores[6] / scores[7], index); - if (!s->debug) return err; + if (!s->debug) { + + nvtxRangePop(); + return err; + } + err |= vmaf_feature_collector_append_with_dict(feature_collector, s->feature_name_dict, "integer_adm", score, index); @@ -748,7 +754,7 @@ static int write_scores(write_score_parameters_adm* params) err |= vmaf_feature_collector_append_with_dict(feature_collector, s->feature_name_dict, "integer_adm_den_scale3", scores[7], index); - + nvtxRangePop(); return err; } @@ -1014,9 +1020,10 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); CHECK_CUDA(cuStreamCreateWithPriority(&s->str, CU_STREAM_NON_BLOCKING, 0)); CHECK_CUDA(cuStreamCreateWithPriority(&s->host_stream, CU_STREAM_NON_BLOCKING, 0)); - CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); - CHECK_CUDA(cuEventCreate(&s->ref_event, CU_EVENT_DEFAULT)); - CHECK_CUDA(cuEventCreate(&s->dis_event, CU_EVENT_DEFAULT)); + CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->ref_event, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->dis_event, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->write_scores, CU_EVENT_DISABLE_TIMING)); CUmodule adm_cm_module, adm_csf_den_module, adm_csf_module, adm_decouple_module, adm_dwt_module; @@ -1157,7 +1164,9 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, // CHECK_CUDA(cuEventSynchronize(s->finished)); CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); CHECK_CUDA(cuEventDestroy(s->finished)); - CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); + CHECK_CUDA(cuEventDestroy(s->write_scores)); + CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->write_scores, CU_EVENT_DISABLE_TIMING)); CHECK_CUDA(cuCtxPopCurrent(NULL)); // current implementation is limited by the 16-bit data pipeline, thus @@ -1178,6 +1187,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, data->w = ref_pic->w[0]; CHECK_CUDA(cuStreamWaitEvent(s->host_stream, s->finished, CU_EVENT_WAIT_DEFAULT)); CHECK_CUDA(cuLaunchHostFunc(s->host_stream, (CUhostFn)write_scores, data)); + CHECK_CUDA(cuEventRecord(s->write_scores, s->host_stream)); return 0; } @@ -1220,10 +1230,18 @@ static int close_fex_cuda(VmafFeatureExtractor *fex) static int flush_fex_cuda(VmafFeatureExtractor *fex, VmafFeatureCollector *feature_collector) { + nvtxRangePushA("flush ADM"); AdmStateCuda *s = fex->priv; + int ret = 0; CHECK_CUDA(cuStreamSynchronize(s->str)); CHECK_CUDA(cuStreamSynchronize(s->host_stream)); - return 0; + while (cuEventQuery(s->write_scores) != CUDA_SUCCESS) + { + continue; + } + CHECK_CUDA(cuEventSynchronize(s->write_scores)); + nvtxRangePop(); + return (ret < 0) ? ret : !ret; } static const char *provided_features[] = { diff --git a/libvmaf/src/feature/cuda/integer_motion_cuda.c b/libvmaf/src/feature/cuda/integer_motion_cuda.c index 5796e2404..76fa7873c 100644 --- a/libvmaf/src/feature/cuda/integer_motion_cuda.c +++ b/libvmaf/src/feature/cuda/integer_motion_cuda.c @@ -31,9 +31,10 @@ #include "picture.h" #include "picture_cuda.h" #include "cuda_helper.cuh" +#include "nvtx3/nvToolsExt.h" typedef struct MotionStateCuda { - CUevent event, finished; + CUevent event, finished, scores_written; CUfunction funcbpc8, funcbpc16; CUstream str, host_stream; VmafCudaBuffer* blur[2]; @@ -44,6 +45,8 @@ typedef struct MotionStateCuda { double score; bool debug; bool motion_force_zero; + bool flushed; + bool closed; void (*calculate_motion_score)(const VmafPicture* src, VmafCudaBuffer* src_blurred, const VmafCudaBuffer* prev_blurred, VmafCudaBuffer* sad, unsigned width, unsigned height, @@ -136,12 +139,15 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt unsigned bpc, unsigned w, unsigned h) { MotionStateCuda *s = fex->priv; + s->flushed = true; + s->closed = false; CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); CHECK_CUDA(cuStreamCreateWithPriority(&s->str, CU_STREAM_NON_BLOCKING, 0)); CHECK_CUDA(cuStreamCreateWithPriority(&s->host_stream, CU_STREAM_NON_BLOCKING, 0)); - CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT)); - CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); + CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->scores_written, CU_EVENT_BLOCKING_SYNC)); CUmodule module; CHECK_CUDA(cuModuleLoadData(&module, src_motion_score_ptx)); @@ -202,22 +208,34 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt return -ENOMEM; } +// if called twice in a row, finalize FEX and close static int flush_fex_cuda(VmafFeatureExtractor *fex, VmafFeatureCollector *feature_collector) { + nvtxRangePushA("FLUSH MOT"); + MotionStateCuda *s = fex->priv; int ret = 0; - CHECK_CUDA(cuStreamSynchronize(s->str)); - CHECK_CUDA(cuStreamSynchronize(s->host_stream)); - - // Not required, write_scores takes care of this - // if (s->index > 0) { - // ret = vmaf_feature_collector_append(feature_collector, - // "VMAF_integer_feature_motion2_score", - // s->score, s->index); - // } - - return 0; + if(!s->flushed) { + CHECK_CUDA(cuStreamSynchronize(s->str)); + CHECK_CUDA(cuStreamSynchronize(s->host_stream)); + while (cuEventQuery(s->scores_written) != CUDA_SUCCESS) + { + continue; + } + CHECK_CUDA(cuEventSynchronize(s->scores_written)); + nvtxRangePop(); + } + else { + if (s->index > 0 && !s->closed) { + ret = vmaf_feature_collector_append(feature_collector, + "VMAF_integer_feature_motion2_score", + s->score, s->index); + } + s->closed = true; + } + s->flushed = true; + return (ret < 0) ? ret : !ret; } static inline double normalize_and_scale_sad(uint64_t sad, @@ -243,7 +261,7 @@ static int write_scores(write_score_parameters_moco* params) } if (err) return err; - if (params->index == 1) + if (params->index == 1) return 0; err = vmaf_feature_collector_append(feature_collector, @@ -258,13 +276,16 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic, VmafFeatureCollector *feature_collector) { MotionStateCuda *s = fex->priv; - + if(s->closed) { + return -ESHUTDOWN; // TODO: proper error code here + } + s->flushed = false; // this is done to ensure that the CPU does not overwrite the buffer params for 'write_scores CHECK_CUDA(cuStreamSynchronize(s->str)); - // CHECK_CUDA(cuEventSynchronize(s->finished)); + CHECK_CUDA(cuEventSynchronize(s->finished)); CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); - CHECK_CUDA(cuEventDestroy(s->finished)); - CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); + // CHECK_CUDA(cuEventDestroy(s->finished)); + // CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING)); CHECK_CUDA(cuCtxPopCurrent(NULL)); int err = 0; @@ -287,10 +308,12 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic, CHECK_CUDA(cuEventRecord(s->event, vmaf_cuda_picture_get_stream(ref_pic))); // This event ensures the input buffer is consumed CHECK_CUDA(cuStreamWaitEvent(s->str, s->event, CU_EVENT_WAIT_DEFAULT)); - CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); - CHECK_CUDA(cuEventDestroy(s->event)); - CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT)); - CHECK_CUDA(cuCtxPopCurrent(NULL)); + // CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); + // CHECK_CUDA(cuEventDestroy(s->event)); + // CHECK_CUDA(cuEventDestroy(s->scores_written)); + // CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DISABLE_TIMING)); + // CHECK_CUDA(cuEventCreate(&s->scores_written, CU_EVENT_BLOCKING_SYNC)); + // CHECK_CUDA(cuCtxPopCurrent(NULL)); if (index == 0) { err = vmaf_feature_collector_append(feature_collector, @@ -312,11 +335,13 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic, CHECK_CUDA(cuStreamWaitEvent(s->host_stream, s->finished, CU_EVENT_WAIT_DEFAULT)); write_score_parameters_moco* params = s->write_score_parameters; + cuEventSynchronize(s->scores_written); params->feature_collector = feature_collector; params->h = ref_pic->h[0]; params->w = ref_pic->w[0]; params->index = index; CHECK_CUDA(cuLaunchHostFunc(s->host_stream, write_scores, s->write_score_parameters)); + CHECK_CUDA(cuEventRecord(s->scores_written, s->host_stream)); return 0; } diff --git a/libvmaf/src/feature/cuda/integer_vif_cuda.c b/libvmaf/src/feature/cuda/integer_vif_cuda.c index d992a529f..c2a4486c1 100644 --- a/libvmaf/src/feature/cuda/integer_vif_cuda.c +++ b/libvmaf/src/feature/cuda/integer_vif_cuda.c @@ -33,6 +33,9 @@ #include "cuda/integer_vif_cuda.h" #include "picture_cuda.h" + +#include "nvtx3/nvToolsExt.h" + #if ARCH_X86 #include "x86/vif_avx2.h" #if HAVE_AVX512 @@ -42,7 +45,7 @@ typedef struct VifStateCuda { VifBufferCuda buf; - CUevent event, finished; + CUevent event, finished, write_scores; CUstream str, host_stream; bool debug; double vif_enhn_gain_limit; @@ -101,7 +104,7 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt CHECK_CUDA(cuStreamCreateWithPriority(&s->host_stream, CU_STREAM_NON_BLOCKING, 0)); CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT)); CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); - + CHECK_CUDA(cuEventCreate(&s->write_scores, CU_EVENT_DISABLE_TIMING)); // make this static CUmodule filter1d_module; CHECK_CUDA(cuModuleLoadData(&filter1d_module, src_filter1d_ptx)); @@ -346,6 +349,7 @@ typedef struct VifScore { static int write_scores(write_score_parameters_vif* data) { + nvtxRangePushA("write_scoes VIF"); VmafFeatureCollector *feature_collector = data->feature_collector; VifStateCuda *s = data->s; unsigned index = data->index; @@ -380,7 +384,11 @@ static int write_scores(write_score_parameters_vif* data) s->feature_name_dict, "VMAF_integer_feature_vif_scale3_score", vif.scale[3].num / vif.scale[3].den, index); - if (!s->debug) return err; + if (!s->debug) { + + nvtxRangePop(); + return err; + } const double score_num = (double)vif.scale[0].num + (double)vif.scale[1].num + @@ -433,7 +441,7 @@ static int write_scores(write_score_parameters_vif* data) err |= vmaf_feature_collector_append_with_dict(feature_collector, s->feature_name_dict, "integer_vif_den_scale3", vif.scale[3].den, index); - + nvtxRangePop(); return err; } @@ -454,7 +462,9 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, CHECK_CUDA(cuStreamSynchronize(s->str)); CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx)); CHECK_CUDA(cuEventDestroy(s->finished)); - CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT)); + CHECK_CUDA(cuEventDestroy(s->write_scores)); + CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING)); + CHECK_CUDA(cuEventCreate(&s->write_scores, CU_EVENT_DISABLE_TIMING)); CHECK_CUDA(cuCtxPopCurrent(NULL)); CHECK_CUDA(cuMemsetD8Async(s->buf.accum_data->data, 0, sizeof(vif_accums) * 4, s->str)); @@ -496,7 +506,8 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, write_score_parameters_vif *data = s->buf.cpu_param_buf; data->feature_collector = feature_collector; data->index = index; - CHECK_CUDA(cuLaunchHostFunc(s->host_stream, write_scores, data)); + CHECK_CUDA(cuLaunchHostFunc(s->host_stream, (CUhostFn)write_scores, data)); + CHECK_CUDA(cuEventRecord(s->write_scores, s->host_stream)); return 0; } @@ -524,11 +535,20 @@ static int close_fex_cuda(VmafFeatureExtractor *fex) static int flush_fex_cuda(VmafFeatureExtractor *fex, VmafFeatureCollector *feature_collector) { + nvtxRangePushA("flush VIF"); VifStateCuda *s = fex->priv; + int ret = 0; CHECK_CUDA(cuStreamSynchronize(s->str)); CHECK_CUDA(cuStreamSynchronize(s->host_stream)); - return 0; + while (cuEventQuery(s->write_scores) != CUDA_SUCCESS) + { + continue; + } + CHECK_CUDA(cuEventSynchronize(s->write_scores)); + nvtxRangePop(); + + return (ret < 0) ? ret : !ret; } static const char *provided_features[] = { diff --git a/libvmaf/src/libvmaf.c b/libvmaf/src/libvmaf.c index 50f254463..845103d19 100644 --- a/libvmaf/src/libvmaf.c +++ b/libvmaf/src/libvmaf.c @@ -506,6 +506,8 @@ static int flush_context(VmafContext *vmaf) } #ifdef HAVE_CUDA + vmaf_cuda_fex_synchronize(vmaf); + vmaf_cuda_fex_synchronize(vmaf); if (vmaf->cuda.state.ctx) { RegisteredFeatureExtractors rfe = vmaf->registered_feature_extractors; for (unsigned i = 0; i < rfe.cnt; i++) { @@ -761,6 +763,16 @@ int vmaf_score_at_index(VmafContext *vmaf, VmafModel *model, double *score, if (err) { err = vmaf_predict_score_at_index(model, vmaf->feature_collector, index, score, true, 0); + // if(err) { + // // Error? Sync and try again + // vmaf_cuda_fex_synchronize(vmaf); + // err = vmaf_predict_score_at_index(model, vmaf->feature_collector, index, + // score, true, 0); + // if(err == 0) { + // // No error - got score + // return 0; + // } + // } } return err; @@ -789,6 +801,8 @@ int vmaf_feature_score_pooled(VmafContext *vmaf, const char *feature_name, if (index_low > index_high) return -EINVAL; if (!pool_method) return -EINVAL; + // vmaf_cuda_fex_synchronize(vmaf); + unsigned pic_cnt = 0; double min = 0., max = 0., sum = 0., i_sum = 0.; for (unsigned i = index_low; i <= index_high; i++) {