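"""
prepare_dataset.py

Download the ICAIF-24 Finance RAG Challenge data from Kaggle, then build,
for every subset, a merge.json that pairs each query id with every corpus
id (all scores initialised to 0).

Usage: python prepare_dataset.py
"""
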
import os
import shutil
import json
from dotenv import load_dotenv


def download_datasets_from_kaggle(dst_dir):
    """
    Download the competition datasets via the Kaggle API
    (icaif-24-finance-rag-challenge).
    """
    print("Downloading Kaggle dataset...")
    # Load environment variables for the Kaggle API
    load_dotenv()
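    # A minimal .env for this step might look like the sketch below
    # (assuming username/key auth rather than an existing ~/.kaggle/kaggle.json):
    #   KAGGLE_USERNAME=<your_kaggle_username>
    #   KAGGLE_KEY=<your_kaggle_api_key>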
    # Download dataset from Kaggle
    kaggle_competition = "icaif-24-finance-rag-challenge"
    download_command = f"kaggle competitions download -c {kaggle_competition}"
    os.system(download_command)
    # Create temporary directory for extraction
    temp_dir = "temp"
    os.makedirs(temp_dir, exist_ok=True)
    # Unzip dataset
    zip_file = f"{kaggle_competition}.zip"
    os.system(f"unzip -q {zip_file} -d {temp_dir}")
    os.remove(zip_file)  # Remove zip file after extraction
    # Define subsets and their (queries, corpus, qrels) source files
    subsets = {
        "FinDER": (
            "finder_queries.jsonl/queries.jsonl",
            "finder_corpus.jsonl/corpus.jsonl",
            "FinDER_qrels.tsv",
        ),
        "FinQABench": (
            "finqabench_queries.jsonl/queries.jsonl",
            "finqabench_corpus.jsonl/corpus.jsonl",
            "FinQABench_qrels.tsv",
        ),
        "MultiHiertt": (
            "multiheirtt_queries.jsonl/queries.jsonl",
            "multiheirtt_corpus.jsonl/corpus.jsonl",
            "MultiHeirtt_qrels.tsv",
        ),
        "ConvFinQA": (
            "convfinqa_queries.jsonl/queries.jsonl",
            "convfinqa_corpus.jsonl/corpus.jsonl",
            "ConvFinQA_qrels.tsv",
        ),
        "TATQA": (
            "tatqa_queries.jsonl/queries.jsonl",
            "tatqa_corpus.jsonl/corpus.jsonl",
            "TATQA_qrels.tsv",
        ),
        "FinanceBench": (
            "financebench_queries.jsonl/queries.jsonl",
            "financebench_corpus.jsonl/corpus.jsonl",
            "FinanceBench_qrels.tsv",
        ),
        "FinQA": (
            "finqa_queries.jsonl/queries.jsonl",
            "finqa_corpus.jsonl/corpus.jsonl",
            "FinQA_qrels.tsv",
        ),
    }
    # Create destination directory
    os.makedirs(dst_dir, exist_ok=True)
    # Move files into per-subset directories under canonical names
    for subset, (query_file, corpus_file, qrel_file) in subsets.items():
        subset_dir = os.path.join(dst_dir, subset)
        os.makedirs(subset_dir, exist_ok=True)
        for src_file, dest_file in zip(
            (query_file, corpus_file, qrel_file),
            ("queries.jsonl", "corpus.jsonl", "qrels.tsv"),
        ):
            src_path = os.path.join(temp_dir, src_file)
            dest_path = os.path.join(subset_dir, dest_file)
            if os.path.exists(src_path):
                shutil.move(src_path, dest_path)
            else:
                raise FileNotFoundError(
                    f"Dataset file ({src_file}) not found in {temp_dir}."
                )
    # Clean up temporary directory
    shutil.rmtree(temp_dir)
    print("Kaggle dataset download completed.")


def prepare_datasets(dataset_dir):
    """
    For every subset, pair each query id with every corpus id
    (initial score 0) and save the result as merge.json.
    """
    print("Pre-processing datasets...")
    subsets = [
        "FinanceBench",
        "FinDER",
        "FinQABench",
        "MultiHiertt",
        "ConvFinQA",
        "TATQA",
        "FinQA",
    ]
    for subset in subsets:
        # Collect the query and corpus ids for this subset
        query_path = os.path.join(dataset_dir, subset, "queries.jsonl")
        with open(query_path, "r", encoding="utf-8") as f:
            query_id_list = [json.loads(line)["_id"] for line in f]
        corpus_path = os.path.join(dataset_dir, subset, "corpus.jsonl")
        with open(corpus_path, "r", encoding="utf-8") as f:
            corpus_id_list = [json.loads(line)["_id"] for line in f]
        # Pair every query id with every corpus id, all scores starting at 0
        prep_datasets = {}
        for query_id in query_id_list:
            prep_datasets[query_id] = {corpus_id: 0 for corpus_id in corpus_id_list}
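        # Illustrative merge.json shape (ids below are made-up examples):
        #   {"q1": {"d1": 0, "d2": 0, ...}, "q2": {"d1": 0, ...}, ...}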
        save_path = os.path.join(dataset_dir, subset, "merge.json")
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(prep_datasets, f)
    print("Dataset pre-processing completed.")


if __name__ == "__main__":
    dataset_dir = "./dataset"
    download_datasets_from_kaggle(dataset_dir)
    prepare_datasets(dataset_dir)
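    # Expected layout afterwards (one directory per subset), e.g.:
    #   dataset/FinDER/{queries.jsonl, corpus.jsonl, qrels.tsv, merge.json}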