-
Notifications
You must be signed in to change notification settings - Fork 0
/
build_model.py
66 lines (56 loc) · 2.09 KB
/
build_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Copyright (c) 2019 Cisco and/or its affiliates.
This software is licensed to you under the terms of the Cisco Sample
Code License, Version 1.1 (the "License"). You may obtain a copy of the
License at
https://developer.cisco.com/docs/licenses
All use of the material herein must be in accordance with the terms of
the License. All rights not expressly granted by the License are
reserved. Unless required by applicable law or agreed to separately in
writing, software distributed under the License is distributed on an "AS
IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
or implied.
"""
from random import shuffle
import pickle
import string
import yaml
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
def get_commands():
    """Load the training phrases from ``colabot_commands.yaml``.

    The parsed YAML document is walked as a mapping: each top-level key is
    paired with every entry found under it, and each entry is normalized
    with ``preprocess`` before being stored.

    Returns:
        A list of ``(key, preprocessed_entry)`` tuples, in file order.
    """
    with open("colabot_commands.yaml", "r", encoding="utf8") as yaml_file:
        commands = yaml.safe_load(yaml_file)
    # One tuple per (key, entry) pair; the key comes first so the tuple
    # layout is (label, text).
    return [
        (label, preprocess(phrase))
        for label in commands
        for phrase in commands[label]
    ]
def generate_df(tuple_commands):
    """Build a randomly ordered two-column DataFrame from (y, x) pairs.

    Args:
        tuple_commands: Iterable of 2-tuples; the first element of each
            tuple goes to column ``"y"`` and the second to column ``"x"``.

    Returns:
        A ``pandas.DataFrame`` with columns ``["y", "x"]`` whose rows are
        the input pairs in shuffled order.
    """
    # Shuffle a private copy: the caller's sequence keeps its order, and
    # immutable inputs (e.g. a tuple of pairs) no longer raise TypeError.
    rows = list(tuple_commands)
    shuffle(rows)
    return pd.DataFrame(rows, columns=["y", "x"])
def create_pipeline():
    """Assemble the untrained text-classification pipeline.

    Returns:
        A scikit-learn ``Pipeline`` with three steps, in order:
        ``"bow"`` (``CountVectorizer``), ``"tfidf"`` (``TfidfTransformer``)
        and ``"classifier"`` (``SGDClassifier`` with
        ``loss="modified_huber"`` and ``alpha=0.01``).
    """
    steps = [
        ("bow", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("classifier", SGDClassifier(loss="modified_huber", alpha=0.01)),
    ]
    return Pipeline(steps)
def preprocess(text):
    """Normalize a phrase into lowercase, punctuation-free tokens.

    Each whitespace-separated word is lowercased, has every trailing
    ``"s"`` removed, and then loses all ``string.punctuation`` characters.
    Words that end up empty are dropped and the rest are joined with a
    single space.

    Note that the trailing-"s" strip runs before punctuation removal, so
    ``"labs?"`` stays ``"labs"`` while ``"labs"`` becomes ``"lab"``; it
    also removes every trailing "s", so ``"class"`` becomes ``"cla"``.

    Args:
        text: The phrase to normalize.

    Returns:
        The normalized phrase (an empty string if nothing survives).
    """
    drop_punctuation = str.maketrans("", "", string.punctuation)
    tokens = []
    for raw_word in text.split():
        token = raw_word.lower().strip().rstrip("s")
        token = token.translate(drop_punctuation)
        if token:
            tokens.append(token)
    return " ".join(tokens)
if __name__ == "__main__":
    # Load the labelled phrases and arrange them as a shuffled DataFrame
    # with the text in column "x" and the label in column "y".
    training_frame = generate_df(get_commands())

    # Fit the pipeline on the text column against the label column.
    classifier_pipeline = create_pipeline()
    classifier_pipeline.fit(training_frame["x"], training_frame["y"])

    # Serialize the fitted pipeline to disk with the newest pickle protocol.
    with open("model.pickle", "wb") as model_file:
        pickle.dump(
            classifier_pipeline,
            model_file,
            protocol=pickle.HIGHEST_PROTOCOL,
        )