This repository has been archived by the owner on Dec 11, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tasks.py
executable file
·87 lines (62 loc) · 2.71 KB
/
tasks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python
import argparse
import csv
import re
import sys
import random
import os
import json
from collections import namedtuple, defaultdict
# Delimiter between consecutive numbered senses inside one summary cell:
# optional spaces, a comma or semicolon, then the next "<number> - " prefix.
# Raw strings keep "\d" from being parsed as an (invalid) string escape.
SEPARATOR = re.compile(r' *[,;] +\d+ +- +')
# Optional leading "<number> -" prefix and optional opening apostrophe that
# are stripped from the start of every sense definition.
MEANING = re.compile(r"^(\d+ +-){0,1} *('){0,1}")

# One annotation task: a word occurrence with its left/right context, the
# annotated sense number, a textual hint and the lemma's sense inventory.
Task = namedtuple('Task', 'id lemma left word right sense_id hint senses')

parser = argparse.ArgumentParser()
parser.add_argument('--summary', required=True, type=argparse.FileType('r', encoding='UTF-8'),
                    help="CSV file with the columns 'word' and 'meaning BTS'")
parser.add_argument('--shuffle', type=int,
                    help='seed for the random number generator')
parser.add_argument('--train', type=int,
                    help='maximum number of tasks per lemma sense; also adds the golden answer and hint columns')
parser.add_argument('word', type=argparse.FileType('r', encoding='UTF-8'), nargs='+',
                    help='CSV files with annotated contexts, one file per lemma')
args = parser.parse_args()

# Seed only when a seed was explicitly given; 0 is a valid seed.
if args.shuffle is not None:
    random.seed(args.shuffle)

# lemma -> {sense number (1-based, in order of appearance): definition}
senses = {}

reader = csv.DictReader(args.summary, delimiter=',')

for row in reader:
    definitions = SEPARATOR.split(row['meaning BTS'].strip())
    senses[row['word']] = {number: MEANING.sub('', definition)
                           for number, definition in enumerate(definitions, start=1)}
def senses_array(lemma_senses):
    """Turn a sense-number -> definition mapping into a list of records.

    Every item of ``lemma_senses`` becomes a dict with the keys ``'sense'``
    (the mapping key) and ``'definition'`` (the mapping value); the records
    follow the iteration order of the mapping.
    """
    records = []
    for number, text in lemma_senses.items():
        records.append({'sense': number, 'definition': text})
    return records
# Number of rows seen so far for every (lemma, sense_id) pair; compared with
# --train to cap how many examples of each sense become tasks.
count = defaultdict(lambda: defaultdict(int))
# Identifier given to the next task, starting at 1. Named task_id rather than
# id so that the builtin is not shadowed.
task_id = 1

tasks = []

for f in args.word:
    # The lemma is the file name without its extension. Unlike
    # rpartition('.'), splitext keeps the whole name when there is no
    # extension instead of producing an empty lemma.
    lemma = os.path.splitext(os.path.basename(f.name))[0]

    reader = csv.reader(f, delimiter=',')

    for row in reader:
        # Skip blank lines (csv yields an empty list for them) and rows whose
        # first column, the sense number, is empty.
        if not row or not row[0]:
            continue

        sense_id = int(row[0])

        if sense_id not in senses[lemma]:
            print('%s: sense_id is %d, but we have only %d' % (f.name, sense_id, max(senses[lemma].keys())), file=sys.stderr)
            continue

        # Columns 2-4 hold the left context, the word form and the right context.
        left, word, right = row[1:4]

        hint = 'В данном случае, слово «%s» имеет значение «%s».' % (lemma, senses[lemma][sense_id])

        if args.train is None or count[lemma][sense_id] < args.train:
            tasks.append(Task(task_id, lemma, left, word, right, sense_id, hint, senses[lemma]))
            # NOTE(review): the source lost its indentation; the increment is
            # assumed to belong to this branch so that ids stay consecutive.
            task_id += 1

        count[lemma][sense_id] += 1

# Shuffle whenever a seed was supplied; a plain truthiness test would skip
# shuffling for the valid seed 0 even though the generator was seeded with it.
if args.shuffle is not None:
    random.shuffle(tasks)

writer = csv.writer(sys.stdout, delimiter='\t', quoting=csv.QUOTE_MINIMAL)

# With --train the output additionally carries the golden answer and the hint.
with_golden = args.train is not None

header = ['INPUT:id', 'INPUT:lemma', 'INPUT:left', 'INPUT:word', 'INPUT:right']
if with_golden:
    header += ['GOLDEN:sense_id', 'HINT:text']
header.append('INPUT:senses')
writer.writerow(header)

for task in tasks:
    record = [task.id, task.lemma, task.left, task.word, task.right]
    if with_golden:
        record += [task.sense_id, task.hint]
    # The sense inventory is embedded as a JSON array of {sense, definition}.
    record.append(json.dumps(senses_array(task.senses)))
    writer.writerow(record)