-
Notifications
You must be signed in to change notification settings - Fork 1
/
aim_log_scraper.py
250 lines (228 loc) · 9.99 KB
/
aim_log_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
import os
import re
import sys
import json
import pandas
from html2text import HTML2Text
from datasets import Dataset
from jinja2 import Template
from config import Config
from split_dataset import SplitDataset
class AIMSplitDataset(SplitDataset):
    """Training and evaluation datasets built from scraped AIM chat logs.

    Construction scrapes the logs through ``AimLogScraper``, optionally applies
    a JSON substitution dictionary, groups the text into conversations, halves
    conversations that exceed the configured length, renders each one through
    the training template and stores the results as ``Dataset`` objects.
    """

    def __init__(self, config: Config) -> None:
        """Read the settings from *config* and build both datasets immediately.

        Args:
            config: Project configuration wrapper; the settings object is taken
                from its ``config`` attribute.
        """
        # NOTE(review): SplitDataset.__init__ is not invoked here; confirm the
        # base class needs no initialization of its own.
        self.aim_log_scraper = AimLogScraper(config)
        self.config = config.config
        self.max_data_length = self.config.max_data_length
        self.prompt = self.config.instruct_prompt
        # Character budget reserved for the instruction prompt in length checks.
        self.prompt_padding = len(self.prompt)
        self.train_dataset = []
        self.eval_dataset = []
        self.build_dataset()

    def _is_too_long(self, conversation) -> bool:
        """Return True when the conversation text plus prompt exceeds the limit."""
        text_length = len(conversation['assistant']) + len(conversation['user'])
        return text_length + self.prompt_padding > self.max_data_length

    def split_convo(self, conversation):
        """Recursively halve a conversation until every piece fits the limit.

        Both the assistant text and the user text are cut at their midpoint;
        the two halves get the parent id suffixed with ``_0`` and ``_1``.

        Args:
            conversation: Dict with ``'assistant'``, ``'user'`` and ``'id'``
                string entries.

        Returns:
            A list of conversation dicts in original order.
        """
        asst = conversation['assistant']
        user = conversation['user']
        # Texts shorter than two characters cannot shrink by halving; stop here
        # instead of recursing forever when the limit is still exceeded.
        if len(asst) < 2 and len(user) < 2:
            return [conversation]

        asst_mid = len(asst) // 2
        user_mid = len(user) // 2
        halves = (
            {
                'assistant': asst[:asst_mid],
                # The user text is sliced from ``user`` (the original sliced
                # the assistant text here by mistake).
                'user': user[:user_mid],
                'id': conversation['id'] + '_0',
            },
            {
                'assistant': asst[asst_mid:],
                'user': user[user_mid:],
                'id': conversation['id'] + '_1',
            },
        )

        convos = []
        for half in halves:
            if self._is_too_long(half):
                convos.extend(self.split_convo(half))
            else:
                convos.append(half)
        return convos

    def build_dataset(self):
        """Scrape, sanitize, split and format the logs into both datasets.

        Over-long conversations are split and always go to the training set.
        Of the remaining ones, those whose position is a multiple of
        ``config.eval_fraction`` go to the evaluation set (none when it is 0).
        """
        output = self.aim_log_scraper.convert_htmls()
        if self.config.replacement_dict_json != '':
            with open(self.config.replacement_dict_json, "r") as f:
                replacement_dict = json.load(f)
            output = self.aim_log_scraper.replace_symbols(output, replacement_dict, False)
        conversations = self.aim_log_scraper.to_convo_format(output)

        eval_fraction = self.config.eval_fraction
        train_dataset_raw = []
        eval_dataset_raw = []
        for index, conversation in enumerate(conversations):
            if self._is_too_long(conversation):
                train_dataset_raw.extend(self.split_convo(conversation))
            elif eval_fraction != 0 and index % eval_fraction == 0:
                eval_dataset_raw.append(conversation)
            else:
                train_dataset_raw.append(conversation)

        for item in train_dataset_raw + eval_dataset_raw:
            item['train_data'] = self.format_training_text(item)

        self.eval_dataset = Dataset.from_pandas(pandas.DataFrame(data=eval_dataset_raw))
        self.train_dataset = Dataset.from_pandas(pandas.DataFrame(data=train_dataset_raw))

    def format_training_text(self, convo):
        """Render one conversation through ``config.training_template``.

        The template receives ``prompt`` (the instruction prompt followed by a
        space, or an empty string when no prompt is configured), ``user`` and
        ``assistant``.
        """
        prompt = self.config.instruct_prompt + ' ' if self.config.instruct_prompt != '' else ''
        template = Template(self.config.training_template)
        return template.render({
            'prompt': prompt,
            'user': convo['user'],
            'assistant': convo['assistant']
        })

    def get_training(self) -> Dataset:
        """Return the training dataset."""
        return self.train_dataset

    def get_eval(self) -> Dataset:
        """Return the evaluation dataset."""
        return self.eval_dataset
# Role names: used as keys of each conversation dict and, followed by " :",
# as the speaker tag at the start of a transcript line.
USER='user'
ASSISTANT='assistant'
class AimLogScraper():
    """Turn AIM HTML chat logs into role-tagged text and conversation dicts."""

    # Substrings identifying status/system lines that carry no chat content.
    _NOISE_MARKERS = (
        "is idle at ",
        "is no longer idle at ",
        " returned at ",
        "is away at ",
        "Session concluded at ",
        "Auto response from ",
        "* * *",
        "wants to directly connect ",
        "is now directly connected ",
        "Your screen name ",
        "was not successful. Make sure that your Buddy is using the latest version of AIM",
        "For best results, you and your buddy should use the latest version of AIM",
        "direct connection is closed ",
        " signed off at ",
        " signed on at ",
    )
    # Parenthesized clock time such as "(1:02:03 PM)".
    _TIMESTAMP_RE = re.compile(r'\((\d{1,2}:\d{2}:\d{2}\s*(AM|PM))\)')
    # Everything from the start of the line up to and including the first colon.
    _SPEAKER_RE = re.compile(r"^([^:]*):")
    # Temporary tag that shields assistant lines from the generic speaker rewrite.
    _ASSISTANT_PLACEHOLDER = "assistant___"

    def __init__(self, config: "Config | None" = None):
        """Create the HTML-to-text converter and remember the settings, if any.

        Args:
            config: Optional project configuration wrapper; its ``config``
                attribute supplies defaults for ``convert_htmls``.
        """
        self.html_converter = HTML2Text()
        # html2text options. NOTE(review): per the html2text docs these keep
        # unicode characters, drop emphasis markup, images and links, and
        # disable line wrapping (body_width = 0) -- confirm.
        self.html_converter.unicode_snob = 1
        self.html_converter.ignore_emphasis = 1
        self.html_converter.ignore_images = 1
        self.html_converter.ignore_links = 1
        self.html_converter.body_width = 0
        self.config = config.config if config is not None else None

    def convert_htmls(self, path = None, screennames = None):
        """Convert every ``.htm``/``.html`` file under *path* to tagged text.

        Args:
            path: Directory to walk recursively; defaults to
                ``config.chats_location``.
            screennames: Comma-separated screen names treated as the assistant;
                defaults to ``config.screen_names``.

        Returns:
            The filtered transcripts of all files, concatenated.
        """
        if path is None:
            path = self.config.chats_location
        if screennames is None:
            screennames = self.config.screen_names
        chunks = []
        for root, _dirs, files in os.walk(path):
            for file in files:
                if file.endswith((".htm", ".html")):
                    with open(os.path.join(root, file), "r") as f:
                        chunks.append(self.filter(self.html_converter.handle(f.read()), screennames))
        return "".join(chunks)

    def to_convo_format(self, input):
        """Group tagged transcript lines into conversation dicts.

        A conversation holds one run of lines from the first speaker followed
        by one run from the other speaker; when the speaker changes a second
        time a new conversation is started. Lines without a role tag belong to
        the previous speaker (``user`` when there is none yet). Blank lines are
        skipped so they do not leave stray newlines in the collected text.

        Args:
            input: Text whose lines start with ``"user :"`` or ``"assistant :"``.

        Returns:
            A list of dicts with ``'user'``, ``'assistant'`` and ``'id'`` (a
            running counter as a string). The list always contains at least
            one, possibly empty, conversation.
        """
        def new_convo(number):
            return {'user': '', 'assistant': '', 'id': str(number)}

        turns = 0
        convo_id = 0
        last_role = ""
        convo_struct = new_convo(convo_id)
        list_of_convos = [convo_struct]
        for line in input.split("\n"):
            if not line.strip():
                continue
            if line.startswith('assistant :'):
                role = ASSISTANT
            elif line.startswith('user :'):
                role = USER
            else:
                role = last_role or USER
            if last_role == "":  # first line: no speaker change yet
                last_role = role
            if last_role != role:
                if turns == 1:
                    # Second speaker change: this exchange is complete.
                    convo_id += 1
                    convo_struct = new_convo(convo_id)
                    list_of_convos.append(convo_struct)
                    turns = 0
                else:
                    turns += 1
            # Strip only the leading role tag so message text is left intact.
            tag = f"{role} :"
            message = line[len(tag):] if line.startswith(tag) else line
            if convo_struct[role] != '':
                convo_struct[role] += '\n'
            convo_struct[role] += message.strip()
            last_role = role
        return list_of_convos

    def replace_symbols(self, text: str, replacement_dict: dict, case_sensitive: bool) -> str:
        """Apply every substitution in *replacement_dict* to *text*.

        A key starting with ``^`` is matched as-is (without the caret). Any
        other key, and its value, gets a leading space so that matches in the
        middle of a word are less likely. Empty keys are ignored.

        Args:
            text: Text to rewrite.
            replacement_dict: Mapping of search string to replacement string.
            case_sensitive: When False, keys are matched ignoring case.

        Returns:
            The rewritten text.
        """
        for key, val in replacement_dict.items():
            if key.startswith('^'):
                key = key[1:]
            elif key:
                key = " " + key
                val = " " + val
            if not key:
                # An empty search string would match between every character.
                continue
            if case_sensitive:
                text = text.replace(key, val)
            else:
                pattern = re.compile(re.escape(key), re.IGNORECASE)
                # A callable replacement inserts val literally; a plain string
                # would be parsed as a template and choke on backslashes.
                text = pattern.sub(lambda _match, literal=val: literal, text)
        return text

    def filter(self, text, screennames):
        """Drop noise lines and tag each remaining line with its speaker role.

        Status lines, timestamps and blank lines are removed. Lines spoken by
        one of *screennames* are tagged ``"assistant :"``; on every other line
        the text up to the first colon is replaced by ``"user :"``.

        Args:
            text: Plain-text transcript.
            screennames: Comma-separated screen names of the assistant;
                surrounding whitespace and empty entries are ignored.

        Returns:
            The kept lines joined by newlines, with a trailing newline.
        """
        names = [name.strip() for name in screennames.split(",") if name.strip()]
        placeholder = self._ASSISTANT_PLACEHOLDER
        filtered = []
        for line in text.splitlines():
            if any(marker in line for marker in self._NOISE_MARKERS):
                continue
            line = self._TIMESTAMP_RE.sub('', line)
            for screenname in names:
                line = line.replace(f"{screenname} :", placeholder)
            # Assistant lines are already tagged; rewriting them as well would
            # relabel (and truncate) any assistant message containing a colon.
            if not line.startswith(placeholder):
                line = self._SPEAKER_RE.sub('user :', line)
            line = line.replace(placeholder, "assistant :")
            if not line.strip():
                continue
            filtered.append(line)
        return "\n".join(filtered) + "\n"

    def find_names(self, text):
        """Print every PERSON entity that NLTK recognizes in *text*.

        The required NLTK resources are downloaded on each call.
        """
        import nltk
        from nltk import ne_chunk, pos_tag, word_tokenize
        from nltk.tree import Tree
        for resource in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
            nltk.download(resource)
        for chunk in ne_chunk(pos_tag(word_tokenize(text))):
            if isinstance(chunk, Tree) and chunk.label() == "PERSON":
                print("".join(leaf[0] + ' ' for leaf in chunk.leaves()))
# arg 1 is the path to scan for AIM chat logs.
# arg 2 is a comma-separated list of AIM screen names to use as the assistant role.
# arg 3 is optional. Set it to '--print-names' to print personal names found in the
# training data to the console. Alternatively, pass the path to a JSON dictionary of
# substitutions to be made in the source data (e.g. change all instances of "Rich" to
# "Steve"). Keys will be searched case-insensitively. The file name must end with
# `.json`, and the file must be structured like `{"rich": "Steve", "rob": "Bill"}`.
# arg 4 is optional; if set to '--print-names', names found in the sanitized output
# are printed.
def main(argv=None):
    """Run the scraper from the command line and print one JSON object per conversation.

    Args:
        argv: Full argument vector including the program name; defaults to
            ``sys.argv``. ``argv[1]`` is the chat-log directory, ``argv[2]`` the
            comma-separated assistant screen names, ``argv[3]`` optionally
            ``'--print-names'`` or the path of a ``.json`` substitution file,
            and ``argv[4]`` optionally ``'--print-names'``.

    Returns:
        Process exit status: 0 on success, 1 when required arguments are missing.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        print("Usage: python aim_log_scraper /path/to/aim/chats MyFirstScreenname,MySecondScreenname")
        return 1

    aim_log_scraper = AimLogScraper()
    output = aim_log_scraper.convert_htmls(argv[1], argv[2])

    # Optional arguments are read only when present; indexing them
    # unconditionally raised IndexError when exactly two arguments were given.
    option = argv[3] if len(argv) > 3 else None
    if option == '--print-names':
        aim_log_scraper.find_names(output)
    elif option is not None and option.endswith('.json'):
        with open(option, "r") as f:
            replacement_dict = json.load(f)
        output = aim_log_scraper.replace_symbols(output, replacement_dict, False)
    if len(argv) > 4 and argv[4] == '--print-names':
        aim_log_scraper.find_names(output)

    for convo in aim_log_scraper.to_convo_format(output):
        print(json.dumps(convo))
    return 0


if __name__ == "__main__":
    sys.exit(main())