-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathdataset.py
76 lines (67 loc) · 2.49 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
import json
# Answer segments stop being consumed once more than this many images
# (question + answer combined) have been collected for a sample.
_ANSWER_IMAGE_CUTOFF = 2
# Samples that end up with more images than this are dropped entirely.
_MAX_IMAGES_PER_SAMPLE = 4


def _image_placeholder(input_dir, relative_path, images):
    """Register an image that exists on disk and return its placeholder line.

    The resolved path is appended to *images* (mutated in place). The
    placeholder index is the image's position in that list. If the file is
    missing, a notice is printed, *images* is left untouched and an empty
    string is returned.
    """
    image_path = os.path.join(input_dir, relative_path)
    if not os.path.exists(image_path):
        print(f"{image_path} not found!")
        return ""
    placeholder = f"Image-{len(images)}: <image>\n"
    images.append(image_path)
    return placeholder


def get_dataset(input_dir, file_name, temp_output_path=None):
    """Load a JSON dataset and flatten each sample into text plus image paths.

    Each record must provide ``id``, ``model``, ``question``, ``answer`` and
    ``gt_answer``. ``question`` is a list of ``{"text", "image"}`` segments;
    ``answer`` is either such a list or a plain string. Only the text of the
    first ``gt_answer`` entry is used. Texts are concatenated one per line,
    and every image found on disk contributes an ``Image-<n>: <image>`` line
    and an entry in ``images``.

    Args:
        input_dir: Directory containing ``file_name``; image paths in the
            records are resolved relative to it.
        file_name: Name of the JSON file, which must hold a list of records.
        temp_output_path: Optional path to a JSON list of previously scored
            entries. If the file exists, the ``gpt_feedback`` of its leading
            entries is copied onto the samples at the same positions, up to
            the first entry that has no ``gpt_feedback``.

    Returns:
        A list of dicts with keys ``id``, ``model``, ``question``, ``answer``,
        ``gt_answer`` and ``images`` (plus ``gpt_feedback`` where restored).
        Samples with more than four images are omitted.

    Raises:
        ValueError: If the temp file holds more scored entries than there are
            samples, or an entry's ``id`` differs from the sample at the same
            position.
    """
    data_path = os.path.join(input_dir, file_name)
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(data_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    processed_data = []
    for d in data:
        sample_id = d['id']
        model = d['model']
        question = d['question']
        answer = d['answer']
        gt_answer = d['gt_answer'][0]['text']

        images = []

        question_parts = []
        for q in question:
            if q['text'] is not None:
                question_parts.append(q['text'] + '\n')
            if q['image'] is not None:
                question_parts.append(
                    _image_placeholder(input_dir, q['image'], images)
                )

        # A bare string answer is treated as a single text-only segment.
        if isinstance(answer, str):
            answer = [{"text": answer, "image": None}]

        answer_parts = []
        for a in answer:
            if a['text'] is not None:
                answer_parts.append(a['text'] + '\n')
            # The question may already have exceeded the cutoff: keep this
            # segment's text but take no further images.
            if len(images) > _ANSWER_IMAGE_CUTOFF:
                break
            if a['image'] is not None:
                answer_parts.append(
                    _image_placeholder(input_dir, a['image'], images)
                )
            # Stop before the next segment's text once the cutoff is passed.
            if len(images) > _ANSWER_IMAGE_CUTOFF:
                break

        if len(images) <= _MAX_IMAGES_PER_SAMPLE:
            processed_data.append({
                'id': sample_id,
                'model': model,
                'question': "".join(question_parts),
                'answer': "".join(answer_parts),
                'gt_answer': gt_answer,
                'images': images,
            })

    if temp_output_path and os.path.exists(temp_output_path):
        print("Find temp data!")
        with open(temp_output_path, 'r', encoding='utf-8') as f:
            temp_data = json.load(f)
        for i, td in enumerate(temp_data):
            # Only the leading run of scored entries is restored.
            if 'gpt_feedback' not in td:
                break
            # Explicit checks instead of `assert`, which is removed under -O.
            if i >= len(processed_data):
                raise ValueError(
                    f"{temp_output_path} has more scored entries than the "
                    f"{len(processed_data)} samples loaded from {data_path}"
                )
            if processed_data[i]['id'] != td['id']:
                raise ValueError(
                    f"{temp_output_path} entry {i} has id {td['id']!r}, "
                    f"expected {processed_data[i]['id']!r}"
                )
            processed_data[i]['gpt_feedback'] = td['gpt_feedback']

    return processed_data