forked from google-research-datasets/natural-questions
-
Notifications
You must be signed in to change notification settings - Fork 0
/
simplify_nq_data.py
75 lines (61 loc) · 2.51 KB
/
simplify_nq_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Script to apply `text_utils.simplify_nq_example` to all examples in a split.

We have provided the processed training set at the link below.
https://storage.cloud.google.com/natural_questions/v1.0-simplified/simplified-nq-train.jsonl.gz

The test set, used by NQ's competition website, is only provided in the original
NQ format. If you wish to use the simplified format, then you should call
`text_utils.simplify_nq_example` in your submitted system.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import glob
import gzip
import json
import os
import time
from absl import app
from absl import flags
import text_utils as text_utils
FLAGS = flags.FLAGS

# Directory holding the raw NQ shards; main() also writes its output there.
# The trailing space after "NQ" matters: the two literals are concatenated.
flags.DEFINE_string(
    "data_dir", None, "Path to directory containing original NQ "
    "files, matching the pattern `nq-<split>-??.jsonl.gz`.")
def main(_):
  """Runs `text_utils.simplify_nq_example` over all shards of a split.

  Every line of every file in `FLAGS.data_dir` matching `nq-*-??.jsonl.gz` is
  decoded as UTF-8 JSON, simplified, and written as one JSON line to a single
  gzipped file, `simplified-nq-<split>.jsonl.gz`, in the same directory.
  `<split>` is the final path component of `FLAGS.data_dir`.

  Args:
    _: Positional command-line arguments passed by `app.run`; unused.

  Raises:
    app.UsageError: If `--data_dir` was not provided.
  """
  if not FLAGS.data_dir:
    # Fail with a readable message instead of a TypeError from os.path.
    raise app.UsageError("--data_dir is required.")

  # normpath drops a trailing separator, so `/path/train/` still yields the
  # split name `train` rather than an empty string.
  split = os.path.basename(os.path.normpath(FLAGS.data_dir))
  outpath = os.path.join(FLAGS.data_dir,
                         "simplified-nq-{}.jsonl.gz".format(split))

  # glob returns matches in arbitrary order; sort so that the order of
  # examples in the output file is reproducible across runs and machines.
  inpaths = sorted(
      glob.glob(os.path.join(FLAGS.data_dir, "nq-*-??.jsonl.gz")))

  with gzip.open(outpath, "wb") as fout:
    num_processed = 0
    start = time.time()
    for inpath in inpaths:
      print("Processing {}".format(inpath))
      with gzip.open(inpath, "rb") as fin:
        for line in fin:
          example = json.loads(line.decode("utf8", "strict"))
          simplified = text_utils.simplify_nq_example(example)
          utf8_out = json.dumps(simplified) + u"\n"
          fout.write(utf8_out.encode("utf8"))
          num_processed += 1
          # Report progress every 100 examples.
          if not num_processed % 100:
            print("Processed {} examples in {}.".format(num_processed,
                                                        time.time() - start))
if __name__ == "__main__":
  # Run main() through absl's app runner when executed as a script.
  app.run(main)