diff --git a/main.py b/main.py
index 99f91a7..4455c6d 100644
--- a/main.py
+++ b/main.py
@@ -1,13 +1,98 @@
-#!/usr/bin/env python
+import json
+import pandas as pd
+from nltk import word_tokenize
+from nltk.corpus import stopwords
+from stop_words import get_stop_words
+from nltk.wsd import lesk
+import re
 
-# Your classes or methods go here:
+class Writer:
 
-class Tool:
-    def my_method(self):
-        print("my_method called!")
+    """Write data to a JSON file."""
+
+    @staticmethod
+    def write_to_json(data_text):
+        with open('task.json', 'w') as json_w:
+            json.dump(data_text, json_w, indent=4)
+
+
+class Cleaner:
+
+    @staticmethod
+    def clear_data(table, col_name, pattern):
+        # Replace every match of `pattern` with a space and return the cleaned column.
+        return table[col_name].replace(pattern, ' ', regex=True)
+
+
+class Tokenizer:
+
+    @staticmethod
+    def delete_stop_words(table, col_name):
+        # Merge the NLTK and stop_words lists, then drop those words from every row.
+        stop_words = set(get_stop_words('en')) | set(stopwords.words('english'))
+        table[col_name] = table[col_name].apply(
+            lambda text: ' '.join(word for word in text.split() if word not in stop_words))
+        return table[col_name]
+
+    @staticmethod
+    def token(table, col_name):
+        # Whitespace tokenization of the cleaned text.
+        table['tokens'] = table[col_name].str.split()
+        return table['tokens']
+
+
+class FinderOrphan:
+
+    @staticmethod
+    def find_orphan(table):
+        # Collect tokens that Lesk cannot map to any WordNet synset.
+        orphan_tokens = []
+        for text in table.cleared_text.astype(str):
+            context = word_tokenize(text)
+            for word in context:
+                if lesk(context, word) is None:
+                    orphan_tokens.append(word)
+        return orphan_tokens
 
 
 if __name__ == "__main__":
-    t = Tool()
-    t.my_method()
+    # read the input text
+    filename = 'input.txt'
+    with open(filename) as f:
+        text = f.readlines()
+
+    # load the lines into a DataFrame and normalise case
+    data = pd.DataFrame(text)
+    data.columns = ['original_text']
+    data.original_text = data.original_text.str.lower()
+
+    cleaner = Cleaner()
+    dollar_symbol = re.compile(r'(\?*[$].+?[ ])')
+    URL_symbol = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+|[@]\w+')
+    lattice_symbol = re.compile(r'[#]\w+')
+    other_symbol = re.compile(r'[^\w\s]')
+
+    # strip prices, URLs/mentions, hashtags and remaining punctuation
+    data['cleared_text'] = cleaner.clear_data(data, 'original_text', dollar_symbol)
+    data['cleared_text'] = cleaner.clear_data(data, 'cleared_text', URL_symbol)
+    data['cleared_text'] = cleaner.clear_data(data, 'cleared_text', lattice_symbol)
+    data['cleared_text'] = cleaner.clear_data(data, 'cleared_text', other_symbol)
+
+    # find tags and metadata
+    data['tags'] = data['original_text'].str.findall(lattice_symbol)
+    data['metadata'] = data['original_text'].str.findall(URL_symbol)
+
+    # tokenize and remove stop words
+    tokens = Tokenizer()
+    data['cleared_text'] = tokens.delete_stop_words(data, 'cleared_text')
+    data['tokens'] = tokens.token(data, 'cleared_text')
+
+    # find orphan tokens
+    finder = FinderOrphan()
+    orphan = finder.find_orphan(data)
+
+    # data to json
+    for_json = data.to_dict()
+    writer = Writer()
+    writer.write_to_json(for_json)
 
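
Setup note: this is a minimal sketch of the one-time environment preparation the script appears to assume (the resource names below are standard NLTK corpora, not something stated in the diff itself): word_tokenize relies on the punkt tokenizer model, stopwords.words('english') on the stopwords corpus, and lesk on WordNet.

# One-time NLTK setup sketch for main.py (assumed, adjust to your environment).
import nltk

nltk.download('punkt')      # tokenizer model used by word_tokenize
nltk.download('stopwords')  # English stop word list
nltk.download('wordnet')    # WordNet synsets queried by lesk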