forked from SupervisedStylometry/SuperStyl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
main_to_open-set.py
59 lines (47 loc) · 2.28 KB
/
main_to_open-set.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import sys
import os
import jagen_will.preproc.tuyau as tuy
import jagen_will.preproc.features_extract as fex
from jagen_will.preproc.text_count import count_process
import fasttext
import pandas
import json
# from multiprocessing import Pool
from multiprocessing.pool import ThreadPool as Pool
import tqdm
# from importlib import reload
# tuy = reload(tuy)
# NOTE: json.dump/json.load operate on file objects; json.dumps/json.loads operate on strings
#import json
# TODO: eliminate features that occur only n times ?
# Do the Moisl Selection ?
# Z-scores, etc. ?
# Vector-length normalisation ?
# TODO: free up memory as the script goes by deleting unnecessary objects
if __name__ == '__main__':
    import argparse

    # CLI for extracting (author, text) pairs into a CSV used downstream
    # for open-set authorship attribution.
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', action="store", help="optional list of features in json", default=False)
    parser.add_argument('-t', action='store', help="types of features (words or chars)", type=str)
    parser.add_argument('-n', action='store', help="n grams lengths (default 1)", default=1, type=int)
    parser.add_argument('-p', action='store', help="Processes to use (default 1)", default=1, type=int)
    parser.add_argument('-c', action='store', help="Path to file with metadata corrections", default=None, type=str)
    parser.add_argument('-k', action='store', help="How many most frequent?", default=5000, type=int)
    parser.add_argument('--z_scores', action='store_true', help="Use z-scores?", default=False)
    parser.add_argument('-s', nargs='+', help="paths to files")
    # Backward-compatible additions: these two paths used to be hard-coded.
    parser.add_argument('-m', action='store', type=str,
                        default="jagen_will/preproc/models/lid.176.bin",
                        help="path to the fasttext language-identification model")
    parser.add_argument('-o', action='store', type=str,
                        default="openset_feats.csv",
                        help="output CSV path (default: openset_feats.csv)")
    args = parser.parse_args()

    # Fail early with a usage message instead of crashing inside load_texts.
    if not args.s:
        parser.error("at least one input file must be given with -s")

    # Language-identification model passed to the text loader.
    model = fasttext.load_model(args.m)

    print(".......loading texts.......")
    if args.c:
        # Optional author-metadata corrections (e.g. "debug_authors.csv").
        # Index the table by the 'Original' author name so the loader can
        # look corrections up directly; the column itself is kept as well.
        correct_aut = pandas.read_csv(args.c)
        correct_aut.index = list(correct_aut.loc[:, "Original"])
        myTexts = tuy.load_texts(args.s, model, correct_aut=correct_aut)
    else:
        myTexts = tuy.load_texts(args.s, model)

    print(".......Saving to csv with text.......")
    # One row per text: the index is the text name, with its author and
    # raw text as the two columns.
    pandas.DataFrame(
        {"authors": [t["aut"] for t in myTexts],
         "texts": [t["text"] for t in myTexts]},
        index=[t["name"] for t in myTexts],
    ).to_csv(args.o)