evaluate_one_video.py
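"""Evaluate the perceptual quality of a single video with DOVER.

The script decomposes the input video into aesthetic and technical views,
scores each with the pretrained evaluator, and either reports disentangled
scores (relative to several public VQA datasets) or, with -f, a single fused
overall score in [0, 1].

Usage (defaults shown in the argument parser below):
    python evaluate_one_video.py -o ./dover.yml -v ./demo/17734.mp4 [-f]
"""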
import argparse
import pickle as pkl

import decord
import numpy as np
import torch
import yaml

from dover.datasets import UnifiedFrameSampler, spatial_temporal_view_decomposition
from dover.models import DOVER

# ImageNet per-channel mean/std on the 0-255 pixel scale
mean, std = (
    torch.FloatTensor([123.675, 116.28, 103.53]),
    torch.FloatTensor([58.395, 57.12, 57.375]),
)
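
# Score-level fusion (fuse_results below): each branch score is standardized
# with fixed constants (presumably statistics precomputed on a reference set;
# not stated here), mixed with weights 0.6104 + 0.3896 = 1.0, then squashed
# into [0, 1] by a sigmoid.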
def fuse_results(results: list):
    x = (results[0] - 0.1107) / 0.07355 * 0.6104 + (
        results[1] + 0.08285
    ) / 0.03774 * 0.3896
    print(x)
    return 1 / (1 + np.exp(-x))


def gaussian_rescale(pr):
    # The results should follow N(0,1)
    pr = (pr - np.mean(pr)) / np.std(pr)
    return pr


def uniform_rescale(pr):
    # The result scores should follow U(0,1)
    return np.arange(len(pr))[np.argsort(pr).argsort()] / len(pr)
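
# A quick illustration of uniform_rescale (hypothetical input): scores
# [0.3, 0.9, 0.1] map to their empirical quantiles [1/3, 2/3, 0.0], since the
# expression assigns each score its rank divided by the array length.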

def rescale_results(results: list, vname="undefined"):
    dbs = {
        "livevqc": "LIVE_VQC",
        "kv1k": "KoNViD-1k",
        "ltest": "LSVQ_Test",
        "l1080p": "LSVQ_1080P",
        "ytugc": "YouTube_UGC",
    }
    for abbr, full_name in dbs.items():
        with open(f"dover_predictions/val-{abbr}.pkl", "rb") as f:
            pr_labels = pkl.load(f)
        aqe_score_set = pr_labels["resize"]
        tqe_score_set = pr_labels["fragments"]
        # Prepend the new video's scores so that index 0 of each rescaled
        # array is this video's standing within the dataset.
        tqe_score_set_p = np.concatenate((np.array([results[0]]), tqe_score_set), 0)
        aqe_score_set_p = np.concatenate((np.array([results[1]]), aqe_score_set), 0)
        tqe_nscore = gaussian_rescale(tqe_score_set_p)[0]
        tqe_uscore = uniform_rescale(tqe_score_set_p)[0]
        print(f"Compared with all videos in the {full_name} dataset:")
        print(
            f"-- the technical quality of video [{vname}] is better than "
            f"{int(tqe_uscore * 100)}% of videos, with normalized score {tqe_nscore:.2f}."
        )
        aqe_nscore = gaussian_rescale(aqe_score_set_p)[0]
        aqe_uscore = uniform_rescale(aqe_score_set_p)[0]
        print(
            f"-- the aesthetic quality of video [{vname}] is better than "
            f"{int(aqe_uscore * 100)}% of videos, with normalized score {aqe_nscore:.2f}."
        )


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-o", "--opt", type=str, default="./dover.yml", help="the option file"
    )
    ## can be your own
    parser.add_argument(
        "-v",
        "--video_path",
        type=str,
        default="./demo/17734.mp4",
        help="the input video path",
    )
    parser.add_argument(
        "-d", "--device", type=str, default="cuda", help="the running device"
    )
    parser.add_argument(
        "-f",
        "--fusion",
        action="store_true",
        help="report one fused overall score instead of disentangled scores",
    )
    args = parser.parse_args()

    with open(args.opt, "r") as f:
        opt = yaml.safe_load(f)

    ### Load DOVER
    evaluator = DOVER(**opt["model"]["args"]).to(args.device)
    evaluator.load_state_dict(
        torch.load(opt["test_load_path"], map_location=args.device)
    )

    dopt = opt["data"]["val-l1080p"]["args"]
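
    # dopt["sample_types"] maps each view name to a sampling config; the loop
    # below expects keys clip_len / num_clips / frame_interval, plus t_frag
    # for fragment-in-time sampling (keys inferred from this script; the
    # actual values live in the -o YAML file).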
    temporal_samplers = {}
    for stype, sopt in dopt["sample_types"].items():
        if "t_frag" not in sopt:
            # resized temporal sampling for TQE in DOVER
            temporal_samplers[stype] = UnifiedFrameSampler(
                sopt["clip_len"], sopt["num_clips"], sopt["frame_interval"]
            )
        else:
            # temporal sampling for AQE in DOVER
            temporal_samplers[stype] = UnifiedFrameSampler(
                sopt["clip_len"] // sopt["t_frag"],
                sopt["t_frag"],
                sopt["frame_interval"],
                sopt["num_clips"],
            )

    ### View Decomposition
    views, _ = spatial_temporal_view_decomposition(
        args.video_path, dopt["sample_types"], temporal_samplers
    )
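
    # Normalize each view with the per-channel mean/std defined above, then
    # regroup frames so clips form the batch dimension:
    # (C, T, H, W) -> (num_clips, C, T // num_clips, H, W).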
    for k, v in views.items():
        num_clips = dopt["sample_types"][k].get("num_clips", 1)
        views[k] = (
            ((v.permute(1, 2, 3, 0) - mean) / std)
            .permute(3, 0, 1, 2)
            .reshape(v.shape[0], num_clips, -1, *v.shape[2:])
            .transpose(0, 1)
            .to(args.device)
        )

    print(views.keys())
    results = [r.mean().item() for r in evaluator(views)]

    if args.fusion:
        # predict fused overall score, with default score-level fusion parameters
        print(
            "Normalized fused overall score (scale in [0,1]):", fuse_results(results)
        )
    else:
        # predict disentangled scores
        rescale_results(results, vname=args.video_path)