forked from jingtaozhan/DRhard
-
Notifications
You must be signed in to change notification settings - Fork 0
/
cvt_back.py
33 lines (29 loc) · 1.48 KB
/
cvt_back.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import os
import pickle
import argparse
from tqdm import tqdm
def load_offset2id(path):
    """Load a pickled ``id -> offset`` dict from *path* and return its inverse.

    Args:
        path: Path of a pickle file that holds a dict.

    Returns:
        A dict mapping every value of the pickled dict back to its key.
    """
    # NOTE: unpickling can execute arbitrary code. Only point this script at
    # preprocess directories that you produced yourself or otherwise trust.
    with open(path, "rb") as handle:
        id2offset = pickle.load(handle)
    return {offset: original_id for original_id, offset in id2offset.items()}


def convert_rank_line(line, offset2qid, offset2pid, dataset):
    """Translate one ``qid pid rank`` line from offsets back to original ids.

    Args:
        line: Whitespace-separated text holding exactly three integers:
            query offset, passage/document offset and rank.
        offset2qid: Dict mapping query offsets to original query ids.
        offset2pid: Dict mapping passage/document offsets to original ids.
        dataset: When equal to ``"doc"`` the converted id in the second
            column is prefixed with ``D``; any other value leaves it bare.

    Returns:
        The converted line: three tab-separated fields followed by a newline.

    Raises:
        ValueError: If *line* does not consist of exactly three integers.
        KeyError: If an offset is missing from its lookup table.
    """
    qid_field, pid_field, rank_field = line.split()
    qid_offset, pid_offset, rank = int(qid_field), int(pid_field), int(rank_field)
    qid, pid = offset2qid[qid_offset], offset2pid[pid_offset]
    if dataset == "doc":
        return f"{qid}\tD{pid}\t{rank}\n"
    return f"{qid}\t{pid}\t{rank}\n"


def main():
    """Convert ``<input_dir>/<mode>.rank.tsv`` from offsets to original ids.

    The lookup tables are read from ``pid2offset.pickle`` and
    ``<mode>-qid2offset.pickle`` inside ``--preprocess_dir``; the result is
    written to ``<output_dir>/<mode>.rank.tsv``.

    Raises:
        FileExistsError: If the output file already exists.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_dir", type=str, required=True)
    parser.add_argument("--preprocess_dir", type=str, required=True)
    parser.add_argument("--mode", type=str, choices=["train", "dev", "test", "lead"], required=True)
    parser.add_argument("--output_dir", type=str, required=True)
    parser.add_argument("--dataset", type=str, choices=["passage", "doc"], required=True)
    args = parser.parse_args()

    input_path = os.path.join(args.input_dir, f"{args.mode}.rank.tsv")
    output_path = os.path.join(args.output_dir, f"{args.mode}.rank.tsv")

    # Explicit check instead of `assert`: assertions are removed under
    # `python -O`, which would let the script silently overwrite results.
    if os.path.exists(output_path):
        raise FileExistsError(f"refusing to overwrite existing file: {output_path}")
    os.makedirs(args.output_dir, exist_ok=True)

    offset2pid = load_offset2id(os.path.join(args.preprocess_dir, "pid2offset.pickle"))
    offset2qid = load_offset2id(
        os.path.join(args.preprocess_dir, f"{args.mode}-qid2offset.pickle")
    )

    # Open the input first: if it is missing, no empty output file is left
    # behind to trip the exists-check on the next run.
    with open(input_path) as source, open(output_path, "w") as output:
        for line in tqdm(source):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing one) instead of
                # crashing on the three-way unpack.
                continue
            output.write(convert_rank_line(line, offset2qid, offset2pid, args.dataset))


if __name__ == "__main__":
    main()