From 44e136e3dc3c8b11233706ccbcff603c52b4a7cd Mon Sep 17 00:00:00 2001
From: KondratiukYuliia
Date: Tue, 11 Dec 2018 21:35:44 +0200
Subject: [PATCH 1/3] add some to print string

---
 main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.py b/main.py
index 99f91a7..77e01fc 100644
--- a/main.py
+++ b/main.py
@@ -5,7 +5,7 @@
 class Tool:
 
     def my_method(self):
-        print("my_method called!")
+        print("my_method called!!!!!")
 
 
 if __name__ == "__main__":

From bfcbc8dcab56d6bc504a0d47c342b90b30a77332 Mon Sep 17 00:00:00 2001
From: KondratiukYuliia
Date: Wed, 12 Dec 2018 20:28:14 +0200
Subject: [PATCH 2/3] no message

---
 main.py | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 76 insertions(+), 7 deletions(-)

diff --git a/main.py b/main.py
index 77e01fc..f88a008 100644
--- a/main.py
+++ b/main.py
@@ -1,13 +1,82 @@
-#!/usr/bin/env python
+import json
+import pandas as pd
+from nltk import word_tokenize
+from nltk.corpus import stopwords
+from stop_words import get_stop_words
+from nltk.wsd import lesk
 
-# Your classes or methods go here:
+class Writer:
 
-class Tool:
-    def my_method(self):
-        print("my_method called!!!!!")
+    """Writer data to json file. Read txt file"""
+
+    @staticmethod
+    def write_to_json(data_text):
+        with open('task.json', 'w') as json_w:
+            json.dump(data_text, json_w, indent=4)
+
+
+class Cleaner:
+
+    @staticmethod
+    def clear_data(table, col_name):
+        table[col_name] = table['original_text'].replace('(\?*[$].+?[ ])', '', regex=True)
+        table[col_name] = table['cleared_text'].replace('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+|[@]\w+', '', regex=True)
+        table[col_name] = table['cleared_text'].replace('[#]\w+', '', regex=True)
+        table[col_name] = table['cleared_text'].replace('[^\w\s]', '', regex=True)
+        return table[col_name]
+
+
+class Tokenizer:
+
+    @staticmethod
+    def stop_words(table, col_name):
+        nltk_words = stopwords.words('english')
+        stop_words = get_stop_words('en')
+        stop_words.extend(nltk_words)
+        table[col_name] = table[col_name].apply(lambda without_stopwords: ' '.join(
+            [word for word in without_stopwords.split() if word not in (stop_words)]))
+        return table[col_name]
+
+    @staticmethod
+    def token(table, col_name):
+        data['tokens'] = table[col_name].str.split()
+        return data['tokens']
 
 
 if __name__ == "__main__":
-    t = Tool()
-    t.my_method()
+    # open txt
+    filename = 'input.txt'
+f = open(filename)
+text = f.readlines()
+f.close()
+# data to DataFrame
+data = pd.DataFrame(text)
+data.columns = ['original_text']
+data.original_text = data.original_text.str.lower()
+
+cleaner = Cleaner()
+data['cleared_text'] = cleaner.clear_data(data, 'cleared_text')
+
+# find tags and metadata
+data['tags'] = data['original_text'].str.findall(r'[#]\w+')
+data['metadata'] = data['original_text'].str.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+|[@]\w+')
+
+# tokenize
+tokens = Tokenizer()
+tokens.stop_words(data, 'cleared_text')
+tokens.token(data, 'cleared_text')
+
+# find orphan_tokens
+orphan_tokens = []
+for row in range(len(data.cleared_text)):
+    text = str(data.cleared_text[row])
+    for word in word_tokenize(text):
+        if lesk(text, word) is None:
+            orphan_tokens.append(word)
+
+# data to json
+for_json = data.to_dict()
+write = Writer()
+write.write_to_json(for_json)
+# unfortunately I did not manage to implement everything
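
A note on the script introduced in PATCH 2/3: the cleaning and the tag/metadata extraction are driven entirely by pandas string operations, and Series.replace(..., regex=True) returns a new Series rather than modifying the frame in place, which is why each intermediate result has to be assigned back. A minimal standalone sketch of the same steps, using an illustrative one-row DataFrame that is not part of the patch:

import pandas as pd

# Illustrative input row, not from the patch.
data = pd.DataFrame({'original_text': ['Check $AAPL now #stocks https://example.com @user!']})
data['original_text'] = data['original_text'].str.lower()

# Series.replace(..., regex=True) returns a new Series, so each result is assigned back.
cleared = data['original_text'].replace(r'(\?*[$].+?[ ])', '', regex=True)
cleared = cleared.replace(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+|[@]\w+', '', regex=True)
cleared = cleared.replace(r'[#]\w+', '', regex=True)
cleared = cleared.replace(r'[^\w\s]', '', regex=True)
data['cleared_text'] = cleared

# str.findall keeps the matched tags and URLs/mentions instead of stripping them.
data['tags'] = data['original_text'].str.findall(r'[#]\w+')
data['metadata'] = data['original_text'].str.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+|[@]\w+')
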
From a802071b13ea18c110f92925b9fa5632e969abfb Mon Sep 17 00:00:00 2001
From: KondratiukYuliia
Date: Wed, 19 Dec 2018 15:19:33 +0200
Subject: [PATCH 3/3] new

---
 main.py | 63 +++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 43 insertions(+), 20 deletions(-)

diff --git a/main.py b/main.py
index f88a008..4455c6d 100644
--- a/main.py
+++ b/main.py
@@ -4,6 +4,7 @@
 from nltk.corpus import stopwords
 from stop_words import get_stop_words
 from nltk.wsd import lesk
+import re
 
 class Writer:
 
@@ -19,29 +20,48 @@ def write_to_json(data_text):
 class Cleaner:
 
     @staticmethod
-    def clear_data(table, col_name):
-        table[col_name] = table['original_text'].replace('(\?*[$].+?[ ])', '', regex=True)
-        table[col_name] = table['cleared_text'].replace('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+|[@]\w+', '', regex=True)
-        table[col_name] = table['cleared_text'].replace('[#]\w+', '', regex=True)
-        table[col_name] = table['cleared_text'].replace('[^\w\s]', '', regex=True)
+    def clear_data(table, col_name, pattern):
+        table[col_name].replace(pattern, ' ', regex=True)
         return table[col_name]
 
 
 class Tokenizer:
 
     @staticmethod
-    def stop_words(table, col_name):
+    def delete_stop_words(table, col_name):
         nltk_words = stopwords.words('english')
         stop_words = get_stop_words('en')
         stop_words.extend(nltk_words)
-        table[col_name] = table[col_name].apply(lambda without_stopwords: ' '.join(
+        table[col_name].apply(lambda without_stopwords: ' '.join(
             [word for word in without_stopwords.split() if word not in (stop_words)]))
         return table[col_name]
 
+    @staticmethod
+    def delete_stop_words(table, col_name):
+        nltk_words = stopwords.words('english')
+        stop_words = get_stop_words('en')
+        stop_words.extend(nltk_words)
+        ex_stopwords = lambda ex_stopwords:''.join([word for word in ex_stopwords.split() if word not in (stop_words)])
+        table[col_name].apply(ex_stopwords)
+        return table[col_name]
+
     @staticmethod
     def token(table, col_name):
-        data['tokens'] = table[col_name].str.split()
-        return data['tokens']
+        table['tokens'] = table[col_name].str.split()
+        return table['tokens']
+
+
+class FinderOrphan:
+
+    @staticmethod
+    def find_orphan(table):
+        orphan_tokens = []
+        for row in range(len(table.cleared_text)):
+            text = str(table.cleared_text[row])
+            for word in word_tokenize(text):
+                if lesk(text, word) is None:
+                    orphan_tokens.append(word)
+        return orphan_tokens
 
 
 if __name__ == "__main__":
@@ -56,27 +76,30 @@ def token(table, col_name):
 data.original_text = data.original_text.str.lower()
 
 cleaner = Cleaner()
-data['cleared_text'] = cleaner.clear_data(data, 'cleared_text')
+dollar_symbol = re.compile('(\?*[$].+?[ ])')
+URL_symbol = re.compile('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+|[@]\w+')
+lattice_symbol = re.compile('[#]\w+')
+other_symbol = re.compile('[^\w\s]')
+
+data['cleared_text'] = cleaner.clear_data(data, 'original_text', dollar_symbol)
+data['cleared_text'] = cleaner.clear_data(data, 'cleared_text', URL_symbol)
+data['cleared_text'] = cleaner.clear_data(data, 'cleared_text', lattice_symbol)
+data['cleared_text'] = cleaner.clear_data(data, 'cleared_text', other_symbol)
 
 # find tags and metadata
-data['tags'] = data['original_text'].str.findall(r'[#]\w+')
-data['metadata'] = data['original_text'].str.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+|[@]\w+')
+data['tags'] = data['original_text'].str.findall(lattice_symbol)
+data['metadata'] = data['original_text'].str.findall(URL_symbol)
 
 # tokenize
 tokens = Tokenizer()
-tokens.stop_words(data, 'cleared_text')
+tokens.delete_stop_words(data, 'cleared_text')
 tokens.token(data, 'cleared_text')
 
 # find orphan_tokens
-orphan_tokens = []
-for row in range(len(data.cleared_text)):
-    text = str(data.cleared_text[row])
-    for word in word_tokenize(text):
-        if lesk(text, word) is None:
-            orphan_tokens.append(word)
+orphan_tokens = FinderOrphan()
+orphan = orphan_tokens.find_orphan(data)
 
 # data to json
 for_json = data.to_dict()
 write = Writer()
 write.write_to_json(for_json)
-# unfortunately I did not manage to implement everything
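
A note on PATCH 3/3: Cleaner.clear_data and both Tokenizer.delete_stop_words variants discard the result of replace()/apply(). pandas returns a new object from these calls, so without assigning it back the column never actually changes; the duplicated delete_stop_words definition also silently replaces the first one, and its ''.join would glue the surviving words together. A sketch of how the two helpers could look with those issues fixed, keeping the patch's names and signatures (one possible correction, not the committed code):

from nltk.corpus import stopwords
from stop_words import get_stop_words


class Cleaner:

    @staticmethod
    def clear_data(table, col_name, pattern):
        # replace() returns a new Series; assign it back so the cleaning sticks.
        table[col_name] = table[col_name].replace(pattern, ' ', regex=True)
        return table[col_name]


class Tokenizer:

    @staticmethod
    def delete_stop_words(table, col_name):
        # Single definition; a second method with the same name would shadow this one.
        stop_words = set(get_stop_words('en')) | set(stopwords.words('english'))
        table[col_name] = table[col_name].apply(
            lambda text: ' '.join(w for w in text.split() if w not in stop_words))
        return table[col_name]
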
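FinderOrphan.find_orphan passes the raw string text as the context argument of lesk. nltk's lesk treats its first argument as an iterable of context words and builds a set from it, so a plain string ends up as a set of single characters; if word-level context is the intent, tokenizing first is the likely fix. A small sketch of the same loop as a plain function, under that assumption (it also needs the nltk wordnet and punkt data to be available):

from nltk import word_tokenize
from nltk.wsd import lesk


def find_orphan(table):
    # Collect tokens for which lesk finds no WordNet sense in their row's context.
    orphan_tokens = []
    for text in table['cleared_text'].astype(str):
        context = word_tokenize(text)  # word-level context rather than characters
        for word in context:
            if lesk(context, word) is None:
                orphan_tokens.append(word)
    return orphan_tokens
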
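The final step dumps data.to_dict() with json.dump. With the default orient that produces a {column: {row index: value}} mapping, and json turns the integer row indices into strings; if one record per row reads better downstream, to_dict(orient='records') (a standard pandas option, not something the patch uses) maps more naturally onto a JSON array. A brief illustration with a made-up frame:

import json

import pandas as pd

# Made-up frame standing in for the patch's data.
data = pd.DataFrame({'cleared_text': ['first row', 'second row'],
                     'tokens': [['first', 'row'], ['second', 'row']]})

# Shape used by the patch: {column: {row_index: value}}.
column_oriented = data.to_dict()

# Alternative: one dict per row, serialized as a JSON array of records.
records = data.to_dict(orient='records')
with open('task.json', 'w') as json_w:
    json.dump(records, json_w, indent=4)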