tweet_cleaner.py
from alphabet_detector import AlphabetDetector
import preprocessor as tweet_processor
import re
from itertools import groupby
rx = re.compile(r'(.)\1{1,}')  # matches any character repeated two or more times in a row
emoticons_str = r"""
(?:
[:=;] # Eyes
[oO\-]? # Nose (optional)
[D\)\]\(\]/\\OpP] # Mouth
)"""
regex_str = [
    emoticons_str,
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)'  # anything else
]
tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^' + emoticons_str + '$', re.VERBOSE | re.IGNORECASE)
arabic_diacritics = re.compile(""" ّ    | # Tashdid
                                   َ    | # Fatha
                                   ً    | # Tanwin Fath
                                   ُ    | # Damma
                                   ٌ    | # Tanwin Damm
                                   ِ    | # Kasra
                                   ٍ    | # Tanwin Kasr
                                   ْ    | # Sukun
                                   ـ      # Tatwil/Kashida
                               """, re.VERBOSE)

def tokenize(s):
    return tokens_re.findall(s)
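# For illustration (assumed example input, not from the original file):
# tokenize("hello :) #nlp") returns ['hello', ':)', '#nlp'] -- the emoticon and the
# hashtag are kept as single tokens instead of being split on punctuation.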

def normalize_arabic(text):
    text = remove_diacritics(text)
    # text = re.sub("[إأآا]", "ا", text)
    # text = re.sub("ى", "ي", text)
    # text = re.sub("ؤ", "ء", text)
    # text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)
    return text
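# For illustration (assumed example input): normalize_arabic("مدرسةٌ") returns "مدرسه" --
# the tanwin is stripped by remove_diacritics and the final ة is mapped to ه.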

def remove_diacritics(text):
    text = re.sub(arabic_diacritics, '', text)
    return text

def remove_repeating_char(text):
    # return re.sub(r'(.)\1+', r'\1', text)  # keep only 1 occurrence
    return re.sub(r'(.)\1+', r'\1\1', text)  # keep at most 2 occurrences
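# For illustration (assumed example input): remove_repeating_char("شكرااااا") returns "شكراا",
# since any run of three or more identical characters is collapsed to exactly two.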

##################################################
# Implemented by Kathrien Abu Kwaik
# Original implementation:
# https://github.com/kathrein/Arabic-processing--repeated-characters/
def special_match(char_to_check):  # helper function
    repeated_characters = ['ب', 'ت', 'ل', 'ه', 'ر', 'م', 'ن', 'ص', 'ط', 'د', 'ف', 'ي', 'ه', 'خ']
    return char_to_check in repeated_characters

def modify_str(modified_str, index, repeated_char):  # helper function
    if index == 0 and repeated_char == 'و':
        # a repeated waw at the start of a word is split into "و" plus a new word starting with "و"
        modified_str = modified_str + 'و' + ' ' + 'و'
    else:
        if special_match(repeated_char):
            modified_str = modified_str + (repeated_char * 2)
        else:
            modified_str = modified_str + repeated_char
    return modified_str

def remove_repeated_letters(word):
    modified_str = ""
    groups = groupby(word)
    result = [(label, sum(1 for _ in group)) for label, group in groups]  # count consecutive occurrences of each character
    rxx = rx.search(word)
    if rxx:  # the word contains consecutively repeated characters
        index = 0  # locates the repeated character within the word
        modified_str = modified_str + ' '
        for x, y in result:
            if y > 1:
                modified_str = modify_str(modified_str, index, x)
            else:
                modified_str = modified_str + x  # the character appears only once
            index = index + y
    else:  # there are no repeated characters in the word
        modified_str = modified_str + ' ' + word
    return modified_str.strip()
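# Behaviour sketch (assumed example inputs):
#   remove_repeated_letters("هههههه") -> "هه"      (letters in the special list are kept doubled)
#   remove_repeated_letters("وواضح")  -> "و واضح"  (a doubled waw at word start is split off as a conjunction)
#   remove_repeated_letters("سلام")   -> "سلام"    (no consecutive repeats, the word is returned unchanged)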
##################################################

def get_repeated_letters(text):
    repeated_letters = list()
    # collect the letters that occur two or more times in a row
    for match in rx.finditer(text):
        repeated_letters.append(match.group(1))
    repeated_letters = set(repeated_letters)
    return repeated_letters
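# For illustration (assumed example input): get_repeated_letters("شكرااااا جمييييل")
# returns {'ا', 'ي'} -- the set of letters that appear in consecutive runs.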

def get_words(letter, words):
    selected_words = list()
    for word in words:
        if letter in word:
            selected_words.append(word)
    return set(selected_words)

def keep_only_arabic(words):
    ad = AlphabetDetector()
    tokens = [token for token in words if ad.is_arabic(token)]
    tweet = ' '.join(tokens)
    return tweet
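# For illustration (assumed example input):
# keep_only_arabic(["RT", "مرحبا", "world", "بالعالم"]) returns "مرحبا بالعالم".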

def clean_tweet(tweet):
    tweet_processor.set_options(tweet_processor.OPT.URL,
                                tweet_processor.OPT.MENTION,
                                tweet_processor.OPT.HASHTAG,
                                tweet_processor.OPT.RESERVED,
                                tweet_processor.OPT.NUMBER
                                )
    tweet = tweet.lower()
    tweet = tweet_processor.clean(tweet)
    tweet = tweet.replace(" : ", " ")
    tweet = tweet.replace("\n", " ").strip()
    tokens = tokenize(tweet)
    # lowercase every token except emoticons (the tweet is already lowercased above,
    # so this mainly keeps emoticon tokens untouched)
    tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    tweet = ' '.join(tokens)
    tweet = tweet.replace("/ ", " ")
    return tweet
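
if __name__ == "__main__":
    # Minimal usage sketch; the sample tweet below is an assumed example, not part of the original repo.
    raw = "@user شكرااااا جزيلا على المقال http://example.com"
    cleaned = clean_tweet(raw)                   # drop the mention and the URL
    cleaned = keep_only_arabic(cleaned.split())  # keep only Arabic tokens
    cleaned = normalize_arabic(cleaned)          # strip diacritics, map ة -> ه and گ -> ك
    cleaned = remove_repeating_char(cleaned)     # e.g. "شكرااااا" -> "شكراا"
    print(cleaned)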