This repository has been archived by the owner on Jun 21, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 38
/
merge_datasets.py
39 lines (37 loc) · 2.05 KB
/
merge_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import json
import os

import numpy as np
# Characters whose presence in a calculator call (output[2]) marks it as a
# real arithmetic expression rather than a bare number.
_ARITHMETIC_OPERATORS = ("*", "/", "+", "-")


def _has_arithmetic_operator(output):
    """Return True if the call string ``output[2]`` contains ``*``, ``/``, ``+`` or ``-``."""
    return any(op in output[2] for op in _ARITHMETIC_OPERATORS)


# One row per tool dataset:
#   (file-name prefix, key holding the outputs, threshold output[0] must
#    exceed or None for "no threshold", extra predicate or None).
# NOTE(review): output[0] is presumably a filtering score — confirm against
# the scripts that produce the *_data_{i}.json files.
_SOURCES = (
    ("calc", "calculator_outputs", 0.07, _has_arithmetic_operator),
    ("calendar", "calendar_outputs", 0.25, None),
    ("retrieval", "retrieval_outputs", None, None),
)


def _merge_items(combined, items, outputs_key, min_score=None, keep=None):
    """Fold the accepted outputs of *items* into *combined*, grouped by ``file_index``.

    Args:
        combined: dict mutated in place; maps ``file_index`` to
            ``{"text": ..., "outputs": [...]}``.
        items: iterable of dicts, each with ``"file_index"``, ``"text"`` and
            *outputs_key* entries.
        outputs_key: key under which each item stores its list of outputs.
        min_score: when not None, an output is kept only if
            ``output[0] > min_score``.
        keep: optional predicate; an output is kept only if ``keep(output)``
            is true. It is evaluated before the threshold check.
    """
    for item in items:
        for output in item[outputs_key]:
            if keep is not None and not keep(output):
                continue
            if min_score is not None and not (output[0] > min_score):
                continue
            # An entry is created only when the first accepted output for a
            # file_index is seen, so items with no accepted outputs never
            # appear in the result. The text of the first item wins.
            entry = combined.setdefault(
                item["file_index"], {"text": item["text"], "outputs": []}
            )
            entry["outputs"].append([output[1], output[2], output[3]])


def merge_datasets(num_shards=8, input_dir=".", output_path="../combined_data.json"):
    """Merge the per-shard calculator, calendar and retrieval JSON files into one.

    For every shard ``i`` in ``range(num_shards)`` this reads
    ``calc_data_{i}.json``, ``calendar_data_{i}.json`` and
    ``retrieval_data_{i}.json`` from *input_dir* (in that order), filters
    their outputs, groups what remains by ``file_index`` and writes the
    result to *output_path* as indented JSON.

    Filtering rules:
        * calculator: the call string must contain an arithmetic operator
          and ``output[0]`` must exceed 0.07;
        * calendar: ``output[0]`` must exceed 0.25;
        * retrieval: every output is kept.

    Args:
        num_shards: number of shard files to read per tool.
        input_dir: directory containing the shard files.
        output_path: destination of the merged JSON file.

    Returns:
        The merged dict, keyed by ``file_index`` exactly as loaded (the JSON
        file on disk has these keys converted to strings by ``json.dump``).

    Raises:
        FileNotFoundError: if any expected shard file is missing.
        KeyError: if an item lacks one of the expected keys.
    """
    combined_data = {}
    for i in range(num_shards):
        for prefix, outputs_key, min_score, keep in _SOURCES:
            shard_path = os.path.join(input_dir, f"{prefix}_data_{i}.json")
            # JSON is UTF-8 by specification; do not rely on the locale default.
            with open(shard_path, encoding="utf-8") as f:
                data = json.load(f)
            _merge_items(combined_data, data, outputs_key, min_score, keep)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(combined_data, f, indent=2)
    return combined_data


if __name__ == '__main__':
    merge_datasets()