-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlanguageIdentification.py
307 lines (265 loc) · 10.6 KB
/
languageIdentification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
#!/usr/bin/python
# coding:utf-8
import random
from urllib import urlopen
class NGram(object):
    """Character n-gram frequency profile of a text.

    ``table`` maps every window of ``n`` consecutive characters seen in the
    input to the number of times it occurred.  ``length`` is the Euclidean
    magnitude of that table treated as a vector; it is ``None`` until some
    input has been parsed (or ``calculate_length`` has been called).

    Two profiles built with the same ``n`` can be compared with ``a - b``,
    which yields a cosine-based distance (0 means identical proportions).
    """

    def __init__(self, fn=None, ttype='file', n=3):
        """Create a profile, optionally filling it from ``fn``.

        fn    -- a path, a URL or the text itself, depending on ``ttype``;
                 ``None`` creates an empty profile.
        ttype -- 'file', 'link' or 'text'; selects how ``fn`` is read.
        n     -- window size, in characters.
        """
        self.length = None
        self.n = n
        self.table = {}
        if fn is not None:
            self.parseFile(fn, ttype)
            # Also covers the unknown-``ttype`` case, where parseFile returns
            # early without touching ``length``.
            self.calculate_length()

    def parseFile(self, fn, ttype):
        """Count the n-grams of ``fn`` into ``self.table`` and refresh ``length``.

        ``ttype`` selects the source: 'link' fetches ``fn`` with ``urlopen``,
        'file' opens it as a local path, 'text' treats ``fn`` as the text
        itself.  Any other value prints a diagnostic and leaves the profile
        unchanged.  Counts are added to whatever the table already holds.
        """
        if ttype == 'link':
            print("trying to fetch url, may take time...")
            source = urlopen(fn)
            must_close = True
        elif ttype == 'file':
            source = open(fn)
            must_close = True
        elif ttype == 'text':
            source = fn.split('\n')
            must_close = False
        else:
            print(ttype)
            print("Forwarded type unknown")
            return

        # NOTE(review): characters are whatever iterating a line yields; for
        # byte strings that means multi-byte characters are counted per byte
        # -- confirm this is acceptable for the non-Latin corpora.
        window = ' ' * self.n  # sliding window, seeded with n spaces
        table = self.table
        try:
            for line in source:
                # Collapse whitespace runs to single spaces and end each line
                # with one space so a line break acts as a word separator.
                for letter in " ".join(line.strip().split()) + " ":
                    window = window[1:] + letter
                    table[window] = table.get(window, 0) + 1
        finally:
            # Release the file/URL handle even if reading fails part-way.
            if must_close:
                source.close()
        # Keep the cached magnitude in step with the table so a profile
        # filled by calling parseFile directly is immediately comparable.
        self.calculate_length()

    def calculate_length(self):
        """ Treat the N-Gram table as a vector and return its scalar magnitude
        to be used for performing a vector-based search.
        """
        self.length = sum(count * count for count in self.table.values()) ** 0.5
        return self.length

    def __sub__(self, other):
        """ Find the difference between two NGram objects by finding the cosine
        of the angle between the two vector representations of the table of
        N-Grams. Return a float value between 0 and 1 where 0 indicates that
        the two NGrams are exactly the same.

        An empty profile has no direction, so comparing against one returns
        1.0 (nothing in common) instead of dividing by zero.

        Raises TypeError if ``other`` is not an NGram or uses a different n.
        """
        if not isinstance(other, NGram):
            raise TypeError("Can't compare NGram with non-NGram object.")
        if self.n != other.n:
            raise TypeError("Can't compare NGram objects of different size.")
        # A profile that was never parsed still has length None.
        if self.length is None:
            self.calculate_length()
        if other.length is None:
            other.calculate_length()
        norm = float(self.length) * float(other.length)
        if norm == 0:
            return 1.0
        total = sum(count * other.table.get(gram, 0)
                    for gram, count in self.table.items())
        return 1.0 - float(total) / norm
class Trigram(object):
    """Character trigram model of a text.

    ``lut`` is a two-level table: ``lut[pair][letter]`` is the number of
    times ``letter`` followed the two-character string ``pair``.  ``length``
    is the Euclidean magnitude of all those counts and is refreshed by
    ``measure``.  Profiles can be compared (``similarity`` / ``a - b``) and
    used to generate made-up words (``makeWords``).
    """

    length = 0

    def __init__(self, fn=None, ttype='file'):
        """Create a model, optionally filling it from ``fn``.

        fn    -- a path, a URL or the text itself, depending on ``ttype``;
                 ``None`` creates an empty model.
        ttype -- 'file', 'link' or 'text'; selects how ``fn`` is read.
        """
        self.lut = {}
        if fn is not None:
            self.parseFile(fn, ttype)

    def parseFile(self, fn, ttype):
        """Count the trigrams of ``fn`` into ``self.lut`` and re-measure.

        ``ttype`` selects the source: 'link' fetches ``fn`` with ``urlopen``,
        'file' opens it as a local path, 'text' treats ``fn`` as the text
        itself.  Any other value prints a diagnostic and leaves the model
        unchanged.
        """
        if ttype == 'link':
            print("trying to fetch url, may take time...")
            source = urlopen(fn)
            must_close = True
        elif ttype == 'file':
            source = open(fn)
            must_close = True
        elif ttype == 'text':
            source = fn.split('\n')
            must_close = False
        else:
            print("Forwarded type unknown")
            # Nothing to read; stop here instead of iterating an unbound name.
            return

        # The context must be exactly two characters: pair[1] is read on the
        # very first letter.
        pair = ' ' * 2
        try:
            for line in source:
                # \n's are spurious in a prose context: collapse whitespace
                # runs and end each line with a single space.
                for letter in " ".join(line.strip().split()) + ' ':
                    followers = self.lut.setdefault(pair, {})
                    followers[letter] = followers.get(letter, 0) + 1
                    pair = pair[1] + letter
        finally:
            # Release the file/URL handle even if reading fails part-way.
            if must_close:
                source.close()
        self.measure()

    def measure(self):
        """calculates the scalar length of the trigram vector and
        stores it in self.length."""
        total = 0
        for followers in self.lut.values():
            total += sum(count * count for count in followers.values())
        self.length = total ** 0.5

    def similarity(self, other):
        """returns a number between 0 and 1 indicating similarity.
        1 means an identical ratio of trigrams;
        0 means no trigrams in common.

        An empty model shares nothing with any other, so the result is 0.0
        rather than a division by zero.

        Raises TypeError if ``other`` is not a Trigram.
        """
        if not isinstance(other, Trigram):
            raise TypeError("can't compare Trigram with non-Trigram")
        norm = self.length * other.length
        if not norm:
            return 0.0
        total = 0
        for pair, mine in self.lut.items():
            theirs = other.lut.get(pair)
            if theirs is None:
                continue
            for letter, count in mine.items():
                total += count * theirs.get(letter, 0)
        return float(total) / norm

    def __sub__(self, other):
        """indicates difference between trigram sets; 1 is entirely
        different, 0 is entirely the same."""
        return 1 - self.similarity(other)

    def makeWords(self, count):
        """returns a string of made-up words based on the known text.

        ``count`` is the number of words (whitespace-terminated runs) to
        produce; zero or a negative value yields an empty string.
        """
        text = []
        k = ' ' * 2  # two-character context, as in parseFile
        while count > 0:
            n = self.likely(k)
            text.append(n)
            k = k[1] + n
            if n in ' \t':
                count -= 1
        return ''.join(text)

    def likely(self, k):
        """Returns a character likely to follow the given string
        two character string, or a space if nothing is found."""
        followers = self.lut.get(k)
        if not followers:
            return ' '
        # if you were using this a lot, caching would a good idea.
        # Repeat each letter by its count so random.choice is weighted by
        # observed frequency.
        population = ''.join(letter * seen for letter, seen in followers.items())
        return random.choice(population)
class languageDetectionModel:
    """Holds one NGram reference profile per supported language and matches
    unknown input against them.

    ``languages`` and ``languageName`` are parallel lists: the profile at
    index i belongs to the language named at index i.  Each profile is also
    reachable through a two-letter attribute (``en``, ``fr``, ...).
    """

    def __init__(self,ngram):
        """Load every reference corpus with window size ``ngram``."""
        self.ngram = ngram
        # (attribute, display name, corpus path), in load order.
        # NB fr and some others have English license text.
        # no has english excerpts.
        corpora = (
            ("en", "English", './english2.txt'),
            ("fr", "French", './french.txt'),
            ("fi", "Finnish", './Finnish.txt'),
            ("no", "Norwegian", './Norwegian.txt'),
            ("se", "Swedish", './Swedish.txt'),
            ("de", "German", "./German2.txt"),
            ("zh", "Chinese", "./Chinese2.txt"),
            ("ja", "Japanese", "./Japanese.txt"),
        )
        profiles = []
        labels = []
        for attr, label, path in corpora:
            profile = NGram(path, n=ngram)
            setattr(self, attr, profile)
            profiles.append(profile)
            labels.append(label)
        self.languages = profiles
        self.languageName = labels

    def find_match(self, input,ttype):
        """Report and return the reference language closest to ``input``.

        ``input`` and ``ttype`` are forwarded to NGram ('file', 'link' or
        'text').  Returns ``(language name, distance)`` where the distance is
        the smallest ``profile - unknown`` value; the printed confidence is
        ``1 - distance``.
        """
        unknown = NGram(input, ttype=ttype,n=self.ngram)
        distances = [profile - unknown for profile in self.languages]
        best = min(distances)
        lan = self.languageName[distances.index(best)]
        # Below 0.2 the match is stated outright, otherwise as "most likely".
        if best < 0.2:
            template = "The language of input : %s \n is %s with confidence:%f"
        else:
            template = "The language of input : %s \n most likely to be %s with confidence:%f"
        print(template % (input, lan, 1 - best))
        return lan, best
def testNgram(ngram=3,testSentences="",infile=""):
    """Demo for the NGram class.

    Loads the sample corpora with window size ``ngram``, prints the pairwise
    distance for a fixed list of language pairs, then builds a
    languageDetectionModel and classifies ``infile`` (a path) and
    ``testSentences`` (raw text) when they are non-empty.
    """
    print("Test NGram class!")
    # (label, corpus path), in load order.
    # NB fr and some others have English license text.
    # no has english excerpts.
    # NOTE(review): "Norwegian2" is loaded from the same file as "Norwegian"
    # -- confirm whether a second Norwegian corpus was intended.
    corpora = (
        ("English", './english.txt'),
        ("French", './french.txt'),
        ("Finnish", './Finnish.txt'),
        ("Norwegian", './Norwegian.txt'),
        ("Swedish", './Swedish.txt'),
        ("Norwegian2", './Norwegian.txt'),
        ("English2", './english2.txt'),
        ("French2", './French2.txt'),
        ("German", "./German.txt"),
        ("German2", "./German2.txt"),
        ("Chinese", "./Chinese.txt"),
        ("Chinese2", "./Chinese2.txt"),
        ("Japanese", "./Japanese.txt"),
    )
    profiles = {}
    for label, path in corpora:
        profiles[label] = NGram(path, n=ngram)

    comparisons = (
        ("English", "German"),
        ("German", "English"),
        ("German", "German2"),
        ("German", "Chinese"),
        ("English", "Chinese"),
        ("Chinese", "Chinese2"),
        ("Chinese", "Japanese"),
        ("English", "Japanese"),
        ("English", "French"),
        ("English", "English2"),
        ("English", "French2"),
        ("French", "English2"),
        ("French", "French2"),
        ("French2", "English2"),
        ("Finnish", "French"),
        ("Finnish", "English"),
        ("Finnish", "Swedish"),
        ("Norwegian", "Swedish"),
        ("English", "Norwegian"),
        ("Norwegian", "Norwegian2"),
        ("Swedish", "Norwegian2"),
        ("English", "Norwegian2"),
        ("French", "Norwegian2"),
    )
    print("calculating difference:")
    for left, right in comparisons:
        print("%s - %s is %s" % (left, right, profiles[left] - profiles[right]))

    lanModel = languageDetectionModel(ngram)
    if infile:
        lanModel.find_match(infile, "file")
    if testSentences:
        lanModel.find_match(testSentences, "text")
def testTrigram():
    """Demo for the Trigram class.

    Loads the sample corpora, prints the pairwise difference for a fixed
    list of language pairs, then prints 30 made-up words from the English
    and the second German model.
    """
    print("Test Trigram class")
    # (label, corpus path), in load order.
    # NB fr and some others have English license text.
    # no has english excerpts.
    # NOTE(review): "no2" is loaded from the same file as "no" -- confirm
    # whether a second Norwegian corpus was intended.
    corpora = (
        ("en", './english.txt'),
        ("fr", './french.txt'),
        ("fi", './Finnish.txt'),
        ("no", './Norwegian.txt'),
        ("se", './Swedish.txt'),
        ("no2", './Norwegian.txt'),
        ("en2", './english2.txt'),
        ("fr2", './French2.txt'),
        ("de", "./German.txt"),
        ("de2", "./German2.txt"),
        ("zh", "./Chinese.txt"),
        ("zh2", "./Chinese2.txt"),
        ("ja", "./Japanese.txt"),
    )
    models = {}
    for label, path in corpora:
        models[label] = Trigram(path)

    comparisons = (
        ("en", "de"),
        ("de", "en"),
        ("de", "de2"),
        ("de", "zh"),
        ("en", "zh"),
        ("zh", "zh2"),
        ("zh", "ja"),
        ("en", "ja"),
        ("en", "fr"),
        ("en", "en2"),
        ("en", "fr2"),
        ("fr", "en2"),
        ("fr", "fr2"),
        ("fr2", "en2"),
        ("fi", "fr"),
        ("fi", "en"),
        ("fi", "se"),
        ("no", "se"),
        ("en", "no"),
        ("no", "no2"),
        ("se", "no2"),
        ("en", "no2"),
        ("fr", "no2"),
    )
    print("calculating difference:")
    for left, right in comparisons:
        print("%s - %s is %s" % (left, right, models[left] - models[right]))

    print("\nmaking up English")
    print(models["en"].makeWords(30))
    print("\nmaking up German")
    print(models["de2"].makeWords(30))
if __name__ == '__main__':
    # Script entry point: run the NGram demo on a sample sentence and a
    # sample file.  The Trigram demo (testTrigram) is left disabled.
    infile = "./testFile.txt"
    testSentences = (
        "How did you get on? Great. "
        "If you want to hear more about this topic, please visit our website "
        "bbclearningenglish.com. That's about it from the pronunciation "
        "workshop this week. Bye bye."
    )
    ngram = 3
    testNgram(ngram=ngram, testSentences=testSentences, infile=infile)