task #3

Open

wants to merge 3 commits into base: master
Changes from all commits
main.py: 106 changes (99 additions, 7 deletions)
@@ -1,13 +1,105 @@
#!/usr/bin/env python
import json
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from stop_words import get_stop_words
from nltk.wsd import lesk
import re

# Your classes or methods go here:

-class Tool:
-    def my_method(self):
-        print("my_method called!")

class Writer:
    """Write data to a json file. Read a txt file."""

    @staticmethod
    def write_to_json(data_text):
Contributor:

A good example of the Single Responsibility Principle. Good job!

        with open('task.json', 'w') as json_w:
Contributor:

Remember, you shouldn't hardcode things like a file name - this should be configurable. You may provide it as a default value, though.

            json.dump(data_text, json_w, indent=4)
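
A minimal sketch of the configurable version the reviewer suggests (the "filename" parameter name is illustrative, not from the PR):

    @staticmethod
    def write_to_json(data_text, filename='task.json'):
        # 'task.json' stays as the default, but callers can override it
        with open(filename, 'w') as json_w:
            json.dump(data_text, json_w, indent=4)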


class Cleaner:

    @staticmethod
    def clear_data(table, col_name, pattern):
        table[col_name].replace(pattern, ' ', regex=True)
Contributor:

Hm, why replace with " " and not just with an empty string?

        return table[col_name]
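
For reference, clearing with an empty string might look like this (a sketch; note that pandas' Series.replace returns a new Series, so the result has to be kept rather than discarded):

    @staticmethod
    def clear_data(table, col_name, pattern):
        # replace() is not in-place: keep and return the new Series
        return table[col_name].replace(pattern, '', regex=True)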


class Tokenizer:

    @staticmethod
    def delete_stop_words(table, col_name):
Contributor:

An important thing: your method doesn't even need to know that any kind of "table" or "column" exists. It takes a string of text and returns a new string of text, without the stopwords. So, let's get rid of these table and column parameters in all of our processing methods.

        nltk_words = stopwords.words('english')
        stop_words = get_stop_words('en')
        stop_words.extend(nltk_words)
        table[col_name].apply(lambda without_stopwords: ' '.join(
vittorius (Contributor) commented on Dec 19, 2018:

This approach is used in all of the processing methods: modify the table[col_name] contents in-place and then return it. This should be avoided. You should try as much as possible to make your method a pure function. That is, it's just a pipe: data goes in, gets transformed or augmented, and then goes out.

It could be:

def delete_stop_words(text):
    nltk_words = stopwords.words('english')
    stop_words = get_stop_words('en')
    stop_words.extend(nltk_words)
    return ' '.join(word for word in text.split() if word not in stop_words)

# calling
for token in tokens:
    token_without_stop_words = delete_stop_words(token)
    # any other transformations

# merging the resulting data into a single object before writing it to the output file
data["metadata"] = ...
data["orphan_tokens"] = ...

Use this comment for a better understanding of what I mean.

Again, don't modify your inputs in-place (like in table[col_name].apply(...)).

            [word for word in without_stopwords.split() if word not in (stop_words)]))
        return table[col_name]

    @staticmethod
    def delete_stop_words(table, col_name):
        nltk_words = stopwords.words('english')
        stop_words = get_stop_words('en')
        stop_words.extend(nltk_words)
        ex_stopwords = lambda ex_stopwords:''.join([word for word in ex_stopwords.split() if word not in (stop_words)])
Contributor:

This lambda definition is still pretty hard to read.

        table[col_name].apply(ex_stopwords)
        return table[col_name]
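
A more readable shape for this (a sketch with illustrative names; a named function working on plain text, as the earlier comment suggests, instead of a one-line lambda):

def remove_stop_words(text, stop_words):
    # a named multi-line function is easier to read and test than a lambda
    kept_words = [word for word in text.split() if word not in stop_words]
    return ' '.join(kept_words)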

    @staticmethod
    def token(table, col_name):
Contributor:

Remember, the method name should be a verb (tokenize) and not a noun.

        table['tokens'] = table[col_name].str.split()
        return table['tokens']


class FinderOrphan:

    @staticmethod
    def find_orphan(table):
Contributor:

This name is much better, but it should have a plural suffix (find_orphans).

        orphan_tokens = []
        for row in range(len(table.cleared_text)):
            text = str(table.cleared_text[row])
            for word in word_tokenize(text):
                if lesk(text, word) is None:
                    orphan_tokens.append(word)
        return orphan_tokens


if __name__ == "__main__":
-    t = Tool()
-    t.my_method()
    # open txt
    filename = 'input.txt'
    f = open(filename)
Contributor:

Reading the file could also be extracted into a method or class (as it's done for the output).

    text = f.readlines()
    f.close()
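
A minimal sketch of such an extraction, mirroring the existing Writer (the class and method names are illustrative):

class Reader:
    """Read text lines from a txt file."""

    @staticmethod
    def read_lines(filename='input.txt'):
        # the with-block also closes the file automatically
        with open(filename) as txt_r:
            return txt_r.readlines()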
    # data to DataFrame
    data = pd.DataFrame(text)
    data.columns = ['original_text']
vittorius (Contributor) commented on Dec 19, 2018:

The "original_text" column should have been named "body" (see the task description).

    data.original_text = data.original_text.str.lower()

    cleaner = Cleaner()
    dollar_symbol = re.compile('(\?*[$].+?[ ])')
    URL_symbol = re.compile('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+|[@]\w+')
    lattice_symbol = re.compile('[#]\w+')
Contributor:

The correct naming here would be not "lattice" (that's a specific type of grid) but "hash".

    other_symbol = re.compile('[^\w\s]')

    data['cleared_text'] = cleaner.clear_data(data, 'original_text', dollar_symbol)
    data['cleared_text'] = cleaner.clear_data(data, 'cleared_text', URL_symbol)
    data['cleared_text'] = cleaner.clear_data(data, 'cleared_text', lattice_symbol)
    data['cleared_text'] = cleaner.clear_data(data, 'cleared_text', other_symbol)

    # find tags and metadata
    data['tags'] = data['original_text'].str.findall(lattice_symbol)
    data['metadata'] = data['original_text'].str.findall(URL_symbol)

    # tokenize
    tokens = Tokenizer()
Contributor:

In general, your helper classes (Tokenizer, FinderOrphan and others) modify the data in your data frame, data. Please avoid this. Modify the data in a single place - the top level of your script. Your classes or methods should know nothing about the data frame.

    tokens.delete_stop_words(data, 'cleared_text')
    tokens.token(data, 'cleared_text')
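
A sketch of the shape the reviewer describes, assuming the text-level delete_stop_words from the earlier suggestion (strings in, strings out):

# helpers stay pure: they receive and return plain text
data['cleared_text'] = data['cleared_text'].apply(delete_stop_words)

# the data frame is only touched here, at the top level of the script
data['tokens'] = data['cleared_text'].str.split()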

    # find orphan_tokens
    orphan_tokens = FinderOrphan()
Contributor:

The names of these local variables should be defined almost vice versa:

orphan_finder = FinderOrphan()
orphan_tokens = orphan_finder.find_orphan(data)

# or just

orphan_tokens = FinderOrphan().find_orphan(data)

    orphan = orphan_tokens.find_orphan(data)

    # data to json
    for_json = data.to_dict()
    write = Writer()
    write.write_to_json(for_json)