-
Notifications
You must be signed in to change notification settings - Fork 1
/
dataset_splitter.py
203 lines (158 loc) · 7.36 KB
/
dataset_splitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split
from data_processing.data_augmentation import *
import config
import random
import glob
root_path = config.root_dir()
SPLIT_DATA = False
AUGMENTATION = True
def copy_files(path, ROI, df, data_type: str):
    """
    Copies video clips from a dataset folder (LC clips and NLC clips) to another folder.

    The class folders ``NLC``, ``LLC`` and ``RLC`` are created under ``{path}/ROI {ROI}``
    when they are missing, so the function can safely be called again for a destination
    that already exists (e.g. after an interrupted run).

    :param path: path to copy the files into
    :param ROI: Region of interest size
    :param df: dataframe containing the clips to be handled; the columns ``class`` and
        ``clip`` are read from every row
    :param data_type: Recognition or Prediction
    """
    roi_dir = f'{path}/ROI {ROI}'
    # exist_ok=True keeps the call idempotent: plain makedirs raises FileExistsError
    # as soon as one of the class folders is already present.
    for class_dir in ('NLC', 'LLC', 'RLC'):
        os.makedirs(f'{roi_dir}/{class_dir}', exist_ok=True)

    for _, row in df.iterrows():
        clip = row['clip']
        if row['class'] == 'NLC':
            source = f"{root_path}/datasets/NLC clips/ROI {ROI}/processed/{clip}"
            target = f"{roi_dir}/NLC/{clip}"
        else:
            # NOTE(review): lane-change clips are copied relative to the ROI folder itself,
            # not into LLC/RLC. This only lands in the class folders if the 'clip' value
            # already carries the class sub-folder - confirm against clip_store.csv.
            source = f"{root_path}/datasets/LC clips/{data_type}/ROI {ROI}/processed/{clip}"
            target = f"{roi_dir}/{clip}"
        shutil.copy(source, target)
def get_full_df(ROI, path_LC, path_NLC):
    """
    Builds one dataframe describing the clips of a given ROI from both the LC and the
    NLC clip folders.

    The ``clip_store.csv`` of each folder is loaded; the NLC rows are then randomly
    sampled down to the size of the most frequent class found in the LC data, and the
    two tables are stacked with the LC rows first.

    :param ROI: size of the region of interest
    :param path_LC: path to LC clips folder
    :param path_NLC: path NLC clips folder
    :return: a dataframe with the LC rows followed by the sampled NLC rows
    """
    lane_change, no_lane_change = (
        pd.read_csv(f'{base}/ROI {ROI}/clip_store.csv') for base in (path_LC, path_NLC)
    )
    largest_lc_class = lane_change['class'].value_counts().max()
    # fixed random_state: we want same split for all ROIs
    sampled_nlc = no_lane_change.sample(n=largest_lc_class, random_state=0)
    return pd.concat([lane_change, sampled_nlc])
def augmentation(path, LLC_clips, NLC_clips, RLC_clips, max_amount, aug_function, suffix):
    """
    Applies one augmentation function to a random selection of clips from every class
    and stores the results next to the originals.

    ``max_amount`` clips are drawn without replacement from each of the three lists.
    Every drawn clip is passed to ``aug_function`` and the result is handed to
    ``save_video`` under ``{path}/<class>`` with the name ``<clip stem>_<suffix>``.

    :param path: folder that contains the LLC, NLC and RLC output folders
    :param LLC_clips: list of LLC clips
    :param NLC_clips: list of NLC clips
    :param RLC_clips: list of RLC clips
    :param max_amount: number of clips to be generated per class
    :param aug_function: the function used to perform augmentation
    :param suffix: the suffix used for renaming the videos
    """
    # The draws happen in the order NLC, RLC, LLC.
    picked_nlc = random.sample(NLC_clips, max_amount)
    picked_rlc = random.sample(RLC_clips, max_amount)
    picked_llc = random.sample(LLC_clips, max_amount)

    class_order = ('LLC', 'NLC', 'RLC')
    for triple in zip(picked_llc, picked_nlc, picked_rlc):
        # one clip of every class per round, processed as LLC, NLC, RLC
        for class_dir, clip in zip(class_order, triple):
            stem = os.path.basename(clip).split('.')[0]
            augmented = aug_function(clip)
            save_video(f'{path}/{class_dir}', f'{stem}_{suffix}', augmented)
if __name__ == '__main__':
    # split the dataset to test-train?
    if SPLIT_DATA:
        data_type = 'Recognition'
        # data_type = 'Prediction'
        main_lcs = f'{root_path}/datasets/LC clips/{data_type}'
        nlcs = f'{root_path}/datasets/NLC clips'
        test = f'{root_path}/datasets/test'
        train = f'{root_path}/datasets/train'

        # The split indices are computed once, on the first ROI, and reused for the other
        # ROIs so that every ROI gets the same train/test assignment.
        # NOTE(review): reusing positional indices assumes every ROI yields a dataframe
        # with the same number of rows in the same order - confirm.
        itrain, itest = None, None
        for ROI in (2, 3, 4):
            full_data = get_full_df(ROI=ROI, path_LC=main_lcs, path_NLC=nlcs)
            is_reference_roi = itrain is None
            if is_reference_roi:
                # 75, 25 split, random state = 35 produces a fairly equal split
                itrain, itest = train_test_split(range(full_data.shape[0]), random_state=35, test_size=0.25)
            X_train = full_data.iloc[itrain, :]
            X_test = full_data.iloc[itest, :]
            if is_reference_roi:
                print('Training data size:', len(itrain))
                print('Test data size:', len(itest))
                print('Training data split:')
                print(X_train['class'].value_counts())
                print('Test data split:')
                print(X_test['class'].value_counts())
            copy_files(train + '/' + data_type, ROI, X_train, data_type)
            copy_files(test + '/' + data_type, ROI, X_test, data_type)

    # perform data augmentation on training set with horizontal flip, jitter, gaussian noise, random rotation
    # and adjusting brightness? the data will be balanced. Augmentation done for one ROI at a time
    if AUGMENTATION:
        ROI = 3
        # train = f'{root_path}/datasets/train/Recognition/ROI {ROI}'  # -- RECOGNITION --
        train = f'{root_path}/datasets/train/Prediction/ROI {ROI}'  # -- PREDICTION --
        LLC_clips = glob.glob(f"{train}/LLC/*.mp4")
        RLC_clips = glob.glob(f"{train}/RLC/*.mp4")
        NLC_clips = glob.glob(f"{train}/NLC/*.mp4")
        # size of the smallest class among the clips found on disk
        max_amount = min(len(LLC_clips), len(RLC_clips), len(NLC_clips))

        # horizontal flips: a flipped LLC clip is stored as RLC and vice versa
        for clips, flipped_class in ((LLC_clips, 'RLC'), (RLC_clips, 'LLC')):
            for clip in clips:
                name = os.path.basename(clip).split('.')[0]
                new_clip = horizontal_flip(clip)
                save_video(f'{train}/{flipped_class}', f'{name}_flipped', new_clip)
        # a flipped NLC clip stays NLC; only a random selection of max_amount clips is flipped
        for clip in random.sample(NLC_clips, max_amount):
            name = os.path.basename(clip).split('.')[0]
            new_clip = horizontal_flip(clip)
            save_video(f'{train}/NLC', f'{name}_flipped', new_clip)

        # gaussian noise, jitter, adjusted brightness, random rotate - in that order.
        # The clip lists were globbed before any file was written, so only original
        # clips are augmented.
        for aug_function, suffix in (
            (random_noise, 'noise'),
            (apply_jitter, 'jitter'),
            (random_brightness, 'brightness'),
            (random_rotate, 'rotate'),
        ):
            augmentation(train, LLC_clips, NLC_clips, RLC_clips, max_amount, aug_function, suffix)

        # random augment
        augmentation(train, LLC_clips, NLC_clips, RLC_clips, 95, random_augment, 'random')  # 95 to get to 1700 clips per class
        print('Data augmentation done for ROI', ROI)