simple_stats.py · 96 lines (78 loc) · 4 KB

"""Provide a few simple stats for the dataset"""
import argparse
import numpy as np
import pandas as pd
import pickle
import re
import sys

# tokens with no letters at all (numbers, punctuation, empty strings)
no_letter_pattern = re.compile("^[^a-zA-Z]*$")
# seat-number-like tokens such as "23A" or "A23"
seat_number_pattern1 = re.compile("^[1-9][0-9]?[a-zA-Z]$")
seat_number_pattern2 = re.compile("^[a-zA-Z][1-9][0-9]?$")
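# Sanity-check examples for the filters above (verifiable in a REPL):
#   no_letter_pattern.match("123")    -> match  (token skipped entirely)
#   seat_number_pattern1.match("23A") -> match  (digits then letter)
#   seat_number_pattern2.match("A23") -> match  (letter then digits)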


def text_stats(series, fout, exclude_word_list, precomputed_spell_check):
    """Print word-frequency stats for a text Series to fout."""
    exclude_list = set()
    if exclude_word_list is not None:
        with open(exclude_word_list) as fin:
            exclude_list = set(fin.read().split('\n'))
    spell_check = dict()
    if precomputed_spell_check is not None:
        with open(precomputed_spell_check) as fin:
            for line in fin.read().split('\n'):  # fixed: fin.read was missing its call parentheses
                if not line:
                    continue  # skip blank lines (e.g. a trailing newline)
                k, v = line.split('\t')
                spell_check[k] = v
    vocabulary = dict()
    total = 0
    subtotal = 0
    rows_empty = 0
    for txt in series:
        if not isinstance(txt, str):  # NaN / None rows
            rows_empty += 1
            continue
        # strip punctuation, then tokenize on single spaces
        txt = re.sub(r"[,.;()\[\]]", "", txt)
        for token in txt.split(' '):
            total += 1
            if not no_letter_pattern.match(token):
                if token in spell_check:
                    token = spell_check[token]
                if (token.lower() not in exclude_list
                        and not seat_number_pattern1.match(token)
                        and not seat_number_pattern2.match(token)):
                    subtotal += 1
                    vocabulary[token] = vocabulary.get(token, 0) + 1
    # "average" is vocabulary size over total tokens; "median" is the median
    # word count normalized by the total token count
    avg = len(vocabulary) / total
    print(total, subtotal, len(vocabulary), file=sys.stderr)
    med = sorted(vocabulary.values())[round(len(vocabulary) / 2)] / total
    print(f"{rows_empty} empty rows", file=sys.stderr)
    print(f"average: {avg}\nmedian: {med}\n", file=fout)
    # dump normalized frequencies, most frequent word first
    for word, count in sorted(vocabulary.items(), key=lambda item: item[1], reverse=True):
        print(f"{count / total}\t{word}", file=fout)


def mel_corrections(defect_df_full, mel_df):
    """Fraction of MEL-linked defects whose chapter/section match the MEL table."""
    defect_with_mel = defect_df_full[defect_df_full['mel_number'].notnull()]
    # join on mel_df's index (the MEL number); rsuffix disambiguates shared columns
    join = defect_with_mel.join(mel_df, on='mel_number', rsuffix='_mel_df')
    identical = join[join.chapter == join.chapter_mel_df]
    identical = identical[identical.section == identical.section_mel_df]
    return len(identical) / len(defect_with_mel)
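
# The join above implies mel_df is indexed by MEL number and carries 'chapter'
# and 'section' columns; a minimal sketch with made-up frames:
#   mel_df = pd.DataFrame({'chapter': [21], 'section': [52]}, index=['MEL-001'])
#   defects = pd.DataFrame({'mel_number': ['MEL-001'], 'chapter': [21], 'section': [52]})
#   mel_corrections(defects, mel_df)  # -> 1.0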


# parse args
parser = argparse.ArgumentParser(description="Provide a few simple stats for the dataset.")
parser.add_argument("input_file", help="A pickle input file, e.g. aircan-data-split-clean.pkl.")
parser.add_argument("--description_stats_output_file",
                    help="Output file for simple text stats on the defect description field.")
parser.add_argument("--word_exclude_file",
                    help="Use with description_stats to ignore words from a given dictionary file.")  # e.g. en_dict.txt
parser.add_argument("--precomputed_spell_checks",
                    help="Tab-separated corrections file with one 'typo<TAB>fix' pair per line.")
args = parser.parse_args()
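
# Example invocation (output file name here is hypothetical):
#   python simple_stats.py aircan-data-split-clean.pkl \
#       --description_stats_output_file desc_stats.txt \
#       --word_exclude_file en_dict.txt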

with open(args.input_file, 'rb') as fin:
    [defect_df_train, defect_df_dev, defect_df_test, ata_df, mel_df, trax_df] = pickle.load(fin)
print(f"Read # samples: {len(defect_df_train)} train, {len(defect_df_dev)} dev, {len(defect_df_test)} test.")
defect_df_full = pd.concat([defect_df_train, defect_df_test, defect_df_dev], sort=False)
print(f"Nb of defects: {len(defect_df_full)}")
text_len = defect_df_full.defect_description.apply(lambda s: len(s) if not pd.isnull(s) else np.nan)
nb_toks = defect_df_full.defect_description.apply(lambda s: len(re.split(r'[\s,\.:;]', s)) if not pd.isnull(s) else np.nan)
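# Note: re.split keeps empty strings between adjacent delimiters, e.g.
#   re.split(r'[\s,\.:;]', "ENG 1, OIL LEAK.") -> ['ENG', '1', '', 'OIL', 'LEAK', '']
# so nb_toks slightly overcounts for punctuated texts.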
print(f"Avg text len in chars: {text_len.mean():.1f}")
print(f"Avg text len in tokens: {nb_toks.mean():.1f} +- {nb_toks.std():.1f}")
print(f"Proportion of chapter-section corrections in MEL table: {mel_corrections(defect_df_full, mel_df)*100:.1f}%")

if args.description_stats_output_file is not None:
    with open(args.description_stats_output_file, 'w') as fout:
        text_stats(defect_df_full.defect_description, fout,
                   args.word_exclude_file, args.precomputed_spell_checks)