Skip to content

Commit

Permalink
fixed fex flush function
Browse files Browse the repository at this point in the history
  • Loading branch information
MorkTheOrk committed Mar 12, 2024
1 parent 0397384 commit dcdef5f
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 39 deletions.
36 changes: 27 additions & 9 deletions libvmaf/src/feature/cuda/integer_adm_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include "picture_cuda.h"
#include <unistd.h>
#include <assert.h>
#include "nvtx3/nvToolsExt.h"

#define RES_BUFFER_SIZE 4 * 3 * 2

Expand All @@ -54,7 +55,7 @@ typedef struct AdmStateCuda {
int dst_stride, CUstream c_stream);
CUstream str, host_stream;
void* write_score_parameters;
CUevent ref_event, dis_event, finished;
CUevent ref_event, dis_event, finished, write_scores;
VmafDictionary *feature_name_dict;

// adm_dwt kernels
Expand Down Expand Up @@ -641,7 +642,7 @@ typedef struct write_score_parameters_adm {

static int write_scores(write_score_parameters_adm* params)
{

nvtxRangePushA("write_scores ADM");
VmafFeatureCollector *feature_collector = params->feature_collector;
AdmStateCuda *s = params->s;
unsigned index = params->index;
Expand Down Expand Up @@ -714,7 +715,12 @@ static int write_scores(write_score_parameters_adm* params)
s->feature_name_dict, "integer_adm_scale3", scores[6] / scores[7],
index);

if (!s->debug) return err;
if (!s->debug) {

nvtxRangePop();
return err;
}


err |= vmaf_feature_collector_append_with_dict(feature_collector,
s->feature_name_dict, "integer_adm", score, index);
Expand Down Expand Up @@ -748,7 +754,7 @@ static int write_scores(write_score_parameters_adm* params)

err |= vmaf_feature_collector_append_with_dict(feature_collector,
s->feature_name_dict, "integer_adm_den_scale3", scores[7], index);

nvtxRangePop();
return err;
}

Expand Down Expand Up @@ -1014,9 +1020,10 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt
CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuStreamCreateWithPriority(&s->str, CU_STREAM_NON_BLOCKING, 0));
CHECK_CUDA(cuStreamCreateWithPriority(&s->host_stream, CU_STREAM_NON_BLOCKING, 0));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
CHECK_CUDA(cuEventCreate(&s->ref_event, CU_EVENT_DEFAULT));
CHECK_CUDA(cuEventCreate(&s->dis_event, CU_EVENT_DEFAULT));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuEventCreate(&s->ref_event, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuEventCreate(&s->dis_event, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuEventCreate(&s->write_scores, CU_EVENT_DISABLE_TIMING));


CUmodule adm_cm_module, adm_csf_den_module, adm_csf_module, adm_decouple_module, adm_dwt_module;
Expand Down Expand Up @@ -1157,7 +1164,9 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex,
// CHECK_CUDA(cuEventSynchronize(s->finished));
CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuEventDestroy(s->finished));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
CHECK_CUDA(cuEventDestroy(s->write_scores));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuEventCreate(&s->write_scores, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuCtxPopCurrent(NULL));

// current implementation is limited by the 16-bit data pipeline, thus
Expand All @@ -1178,6 +1187,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex,
data->w = ref_pic->w[0];
CHECK_CUDA(cuStreamWaitEvent(s->host_stream, s->finished, CU_EVENT_WAIT_DEFAULT));
CHECK_CUDA(cuLaunchHostFunc(s->host_stream, (CUhostFn)write_scores, data));
CHECK_CUDA(cuEventRecord(s->write_scores, s->host_stream));
return 0;
}

Expand Down Expand Up @@ -1220,10 +1230,18 @@ static int close_fex_cuda(VmafFeatureExtractor *fex)
static int flush_fex_cuda(VmafFeatureExtractor *fex,
VmafFeatureCollector *feature_collector)
{
nvtxRangePushA("flush ADM");
AdmStateCuda *s = fex->priv;
int ret = 0;
CHECK_CUDA(cuStreamSynchronize(s->str));
CHECK_CUDA(cuStreamSynchronize(s->host_stream));
return 0;
while (cuEventQuery(s->write_scores) != CUDA_SUCCESS)
{
continue;
}
CHECK_CUDA(cuEventSynchronize(s->write_scores));
nvtxRangePop();
return (ret < 0) ? ret : !ret;
}

static const char *provided_features[] = {
Expand Down
71 changes: 48 additions & 23 deletions libvmaf/src/feature/cuda/integer_motion_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,10 @@
#include "picture.h"
#include "picture_cuda.h"
#include "cuda_helper.cuh"
#include "nvtx3/nvToolsExt.h"

typedef struct MotionStateCuda {
CUevent event, finished;
CUevent event, finished, scores_written;
CUfunction funcbpc8, funcbpc16;
CUstream str, host_stream;
VmafCudaBuffer* blur[2];
Expand All @@ -44,6 +45,8 @@ typedef struct MotionStateCuda {
double score;
bool debug;
bool motion_force_zero;
bool flushed;
bool closed;
void (*calculate_motion_score)(const VmafPicture* src, VmafCudaBuffer* src_blurred,
const VmafCudaBuffer* prev_blurred, VmafCudaBuffer* sad,
unsigned width, unsigned height,
Expand Down Expand Up @@ -136,12 +139,15 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt
unsigned bpc, unsigned w, unsigned h)
{
MotionStateCuda *s = fex->priv;
s->flushed = true;
s->closed = false;

CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuStreamCreateWithPriority(&s->str, CU_STREAM_NON_BLOCKING, 0));
CHECK_CUDA(cuStreamCreateWithPriority(&s->host_stream, CU_STREAM_NON_BLOCKING, 0));
CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuEventCreate(&s->scores_written, CU_EVENT_BLOCKING_SYNC));

CUmodule module;
CHECK_CUDA(cuModuleLoadData(&module, src_motion_score_ptx));
Expand Down Expand Up @@ -202,22 +208,34 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt
return -ENOMEM;
}

// if called twice in a row, finalize FEX and close
static int flush_fex_cuda(VmafFeatureExtractor *fex,
VmafFeatureCollector *feature_collector)
{
nvtxRangePushA("FLUSH MOT");

MotionStateCuda *s = fex->priv;
int ret = 0;
CHECK_CUDA(cuStreamSynchronize(s->str));
CHECK_CUDA(cuStreamSynchronize(s->host_stream));

// Not required, write_scores takes care of this
// if (s->index > 0) {
// ret = vmaf_feature_collector_append(feature_collector,
// "VMAF_integer_feature_motion2_score",
// s->score, s->index);
// }

return 0;
if(!s->flushed) {
CHECK_CUDA(cuStreamSynchronize(s->str));
CHECK_CUDA(cuStreamSynchronize(s->host_stream));
while (cuEventQuery(s->scores_written) != CUDA_SUCCESS)
{
continue;
}
CHECK_CUDA(cuEventSynchronize(s->scores_written));
nvtxRangePop();
}
else {
if (s->index > 0 && !s->closed) {
ret = vmaf_feature_collector_append(feature_collector,
"VMAF_integer_feature_motion2_score",
s->score, s->index);
}
s->closed = true;
}
s->flushed = true;
return (ret < 0) ? ret : !ret;
}

static inline double normalize_and_scale_sad(uint64_t sad,
Expand All @@ -243,7 +261,7 @@ static int write_scores(write_score_parameters_moco* params)
}
if (err) return err;

if (params->index == 1)
if (params->index == 1)
return 0;

err = vmaf_feature_collector_append(feature_collector,
Expand All @@ -258,13 +276,16 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic,
VmafFeatureCollector *feature_collector)
{
MotionStateCuda *s = fex->priv;

if(s->closed) {
return -ESHUTDOWN; // TODO: proper error code here
}
s->flushed = false;
// this is done to ensure that the CPU does not overwrite the buffer params for 'write_scores
CHECK_CUDA(cuStreamSynchronize(s->str));
// CHECK_CUDA(cuEventSynchronize(s->finished));
CHECK_CUDA(cuEventSynchronize(s->finished));
CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuEventDestroy(s->finished));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
// CHECK_CUDA(cuEventDestroy(s->finished));
// CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuCtxPopCurrent(NULL));

int err = 0;
Expand All @@ -287,10 +308,12 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic,
CHECK_CUDA(cuEventRecord(s->event, vmaf_cuda_picture_get_stream(ref_pic)));
// This event ensures the input buffer is consumed
CHECK_CUDA(cuStreamWaitEvent(s->str, s->event, CU_EVENT_WAIT_DEFAULT));
CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuEventDestroy(s->event));
CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT));
CHECK_CUDA(cuCtxPopCurrent(NULL));
// CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
// CHECK_CUDA(cuEventDestroy(s->event));
// CHECK_CUDA(cuEventDestroy(s->scores_written));
// CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DISABLE_TIMING));
// CHECK_CUDA(cuEventCreate(&s->scores_written, CU_EVENT_BLOCKING_SYNC));
// CHECK_CUDA(cuCtxPopCurrent(NULL));

if (index == 0) {
err = vmaf_feature_collector_append(feature_collector,
Expand All @@ -312,11 +335,13 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic,
CHECK_CUDA(cuStreamWaitEvent(s->host_stream, s->finished, CU_EVENT_WAIT_DEFAULT));

write_score_parameters_moco* params = s->write_score_parameters;
cuEventSynchronize(s->scores_written);
params->feature_collector = feature_collector;
params->h = ref_pic->h[0];
params->w = ref_pic->w[0];
params->index = index;
CHECK_CUDA(cuLaunchHostFunc(s->host_stream, write_scores, s->write_score_parameters));
CHECK_CUDA(cuEventRecord(s->scores_written, s->host_stream));
return 0;
}

Expand Down
34 changes: 27 additions & 7 deletions libvmaf/src/feature/cuda/integer_vif_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@
#include "cuda/integer_vif_cuda.h"
#include "picture_cuda.h"


#include "nvtx3/nvToolsExt.h"

#if ARCH_X86
#include "x86/vif_avx2.h"
#if HAVE_AVX512
Expand All @@ -42,7 +45,7 @@

typedef struct VifStateCuda {
VifBufferCuda buf;
CUevent event, finished;
CUevent event, finished, write_scores;
CUstream str, host_stream;
bool debug;
double vif_enhn_gain_limit;
Expand Down Expand Up @@ -101,7 +104,7 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt
CHECK_CUDA(cuStreamCreateWithPriority(&s->host_stream, CU_STREAM_NON_BLOCKING, 0));
CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));

CHECK_CUDA(cuEventCreate(&s->write_scores, CU_EVENT_DISABLE_TIMING));
// make this static
CUmodule filter1d_module;
CHECK_CUDA(cuModuleLoadData(&filter1d_module, src_filter1d_ptx));
Expand Down Expand Up @@ -346,6 +349,7 @@ typedef struct VifScore {

static int write_scores(write_score_parameters_vif* data)
{
nvtxRangePushA("write_scoes VIF");
VmafFeatureCollector *feature_collector = data->feature_collector;
VifStateCuda *s = data->s;
unsigned index = data->index;
Expand Down Expand Up @@ -380,7 +384,11 @@ static int write_scores(write_score_parameters_vif* data)
s->feature_name_dict, "VMAF_integer_feature_vif_scale3_score",
vif.scale[3].num / vif.scale[3].den, index);

if (!s->debug) return err;
if (!s->debug) {

nvtxRangePop();
return err;
}

const double score_num =
(double)vif.scale[0].num + (double)vif.scale[1].num +
Expand Down Expand Up @@ -433,7 +441,7 @@ static int write_scores(write_score_parameters_vif* data)
err |= vmaf_feature_collector_append_with_dict(feature_collector,
s->feature_name_dict, "integer_vif_den_scale3", vif.scale[3].den,
index);

nvtxRangePop();
return err;
}

Expand All @@ -454,7 +462,9 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex,
CHECK_CUDA(cuStreamSynchronize(s->str));
CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuEventDestroy(s->finished));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
CHECK_CUDA(cuEventDestroy(s->write_scores));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuEventCreate(&s->write_scores, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuCtxPopCurrent(NULL));

CHECK_CUDA(cuMemsetD8Async(s->buf.accum_data->data, 0, sizeof(vif_accums) * 4, s->str));
Expand Down Expand Up @@ -496,7 +506,8 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex,
write_score_parameters_vif *data = s->buf.cpu_param_buf;
data->feature_collector = feature_collector;
data->index = index;
CHECK_CUDA(cuLaunchHostFunc(s->host_stream, write_scores, data));
CHECK_CUDA(cuLaunchHostFunc(s->host_stream, (CUhostFn)write_scores, data));
CHECK_CUDA(cuEventRecord(s->write_scores, s->host_stream));
return 0;
}

Expand Down Expand Up @@ -524,11 +535,20 @@ static int close_fex_cuda(VmafFeatureExtractor *fex)
static int flush_fex_cuda(VmafFeatureExtractor *fex,
VmafFeatureCollector *feature_collector)
{
nvtxRangePushA("flush VIF");
VifStateCuda *s = fex->priv;

int ret = 0;
CHECK_CUDA(cuStreamSynchronize(s->str));
CHECK_CUDA(cuStreamSynchronize(s->host_stream));
return 0;
while (cuEventQuery(s->write_scores) != CUDA_SUCCESS)
{
continue;
}
CHECK_CUDA(cuEventSynchronize(s->write_scores));
nvtxRangePop();

return (ret < 0) ? ret : !ret;
}

static const char *provided_features[] = {
Expand Down
14 changes: 14 additions & 0 deletions libvmaf/src/libvmaf.c
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,8 @@ static int flush_context(VmafContext *vmaf)
}

#ifdef HAVE_CUDA
vmaf_cuda_fex_synchronize(vmaf);
vmaf_cuda_fex_synchronize(vmaf);
if (vmaf->cuda.state.ctx) {
RegisteredFeatureExtractors rfe = vmaf->registered_feature_extractors;
for (unsigned i = 0; i < rfe.cnt; i++) {
Expand Down Expand Up @@ -761,6 +763,16 @@ int vmaf_score_at_index(VmafContext *vmaf, VmafModel *model, double *score,
if (err) {
err = vmaf_predict_score_at_index(model, vmaf->feature_collector, index,
score, true, 0);
// if(err) {
// // Error? Sync and try again
// vmaf_cuda_fex_synchronize(vmaf);
// err = vmaf_predict_score_at_index(model, vmaf->feature_collector, index,
// score, true, 0);
// if(err == 0) {
// // No error - got score
// return 0;
// }
// }
}

return err;
Expand Down Expand Up @@ -789,6 +801,8 @@ int vmaf_feature_score_pooled(VmafContext *vmaf, const char *feature_name,
if (index_low > index_high) return -EINVAL;
if (!pool_method) return -EINVAL;

// vmaf_cuda_fex_synchronize(vmaf);

unsigned pic_cnt = 0;
double min = 0., max = 0., sum = 0., i_sum = 0.;
for (unsigned i = index_low; i <= index_high; i++) {
Expand Down

0 comments on commit dcdef5f

Please sign in to comment.