# scienceqa_data_preprocess.py
# Builds per-split (train / val / test) annotation JSON files for ScienceQA.
import json
from tqdm import tqdm
# Load the ScienceQA problem table and the per-split problem-id lists.
# JSON text is UTF-8 by specification, so the encoding is given explicitly
# instead of being inherited from the platform's locale default (which would
# mis-decode or reject non-ASCII text on non-UTF-8 systems).
with open("scienceqa_problems_path.json", 'r', encoding="utf-8") as file:
    data = json.load(file)
with open("scienceqa_pid_splits.json", encoding="utf-8") as file:
    pid_splits = json.load(file)

# Problem ids belonging to each split; used below to index into `data`.
train_ids = pid_splits['train']
val_ids = pid_splits['val']
test_ids = pid_splits['test']
# Build the train annotations; problems that have no image are left out.
train_annotation = []
for pid in tqdm(train_ids):
    problem = data[str(pid)]
    if problem['image'] is not None:
        picked = problem['answer']
        # Answer indices 0-3 are tagged (a)-(d); any other value is tagged (e).
        tag = {0: "(a) ", 1: "(b) ", 2: "(c) ", 3: "(d) "}.get(picked, "(e) ")
        answer_text = tag + problem['choices'][picked]
        train_annotation.append({
            "image": f"scienceqa/images/train/{pid}/image.png",
            "question": problem['question'],
            "answer": answer_text,
            "choices": problem['choices'],
            # Context is the hint followed by the lecture, space-separated.
            "context": problem['hint'] + " " + problem['lecture'],
            "question_id": pid,
        })
# Build the val annotations; problems that have no image are left out.
val_annotation = []
for pid in tqdm(val_ids):
    problem = data[str(pid)]
    if problem['image'] is not None:
        picked = problem['answer']
        # Answer indices 0-3 are tagged (a)-(d); any other value is tagged (e).
        tag = {0: "(a) ", 1: "(b) ", 2: "(c) ", 3: "(d) "}.get(picked, "(e) ")
        answer_text = tag + problem['choices'][picked]
        val_annotation.append({
            "image": f"scienceqa/images/val/{pid}/image.png",
            "question": problem['question'],
            "answer": answer_text,
            "choices": problem['choices'],
            # Context is the hint followed by the lecture, space-separated.
            "context": problem['hint'] + " " + problem['lecture'],
            "question_id": pid,
        })
# Build the test annotations; problems that have no image are left out.
test_annotation = []
for pid in tqdm(test_ids):
    problem = data[str(pid)]
    if problem['image'] is not None:
        picked = problem['answer']
        # Answer indices 0-3 are tagged (a)-(d); any other value is tagged (e).
        tag = {0: "(a) ", 1: "(b) ", 2: "(c) ", 3: "(d) "}.get(picked, "(e) ")
        answer_text = tag + problem['choices'][picked]
        test_annotation.append({
            "image": f"scienceqa/images/test/{pid}/image.png",
            "question": problem['question'],
            "answer": answer_text,
            "choices": problem['choices'],
            # Context is the hint followed by the lecture, space-separated.
            "context": problem['hint'] + " " + problem['lecture'],
            "question_id": pid,
        })
# Write each split's annotation list to its JSON file (train, then test,
# then val — the same order as before).
for out_path, records in (
    ("/input/scienceqa/scienceqa_train.json", train_annotation),
    ("/input/scienceqa/scienceqa_test.json", test_annotation),
    ("/input/scienceqa/scienceqa_val.json", val_annotation),
):
    with open(out_path, 'w') as out_file:
        json.dump(records, out_file)