Skip to content

Commit

Permalink
Merge pull request #13 from mbanon/newlangs
Browse files Browse the repository at this point in the history
Newlangs
  • Loading branch information
mbanon authored Aug 24, 2023
2 parents 3216cfc + 21aaa7d commit 4309321
Show file tree
Hide file tree
Showing 6 changed files with 147 additions and 49 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,15 @@
# CHANGELOG

## FastSpell 1.10
- Added more languages: af, ar, az, be, bn, cy, et, fa, fi, ga, gu, he, hi, hu, id, kk, kn, ky, lt, lv, mn, ml, mr, ms, ne, pl, pt, ru, so, sv, ta, te, th, tr, tt, uk, ur, uz
- Fixed bug with character encoding that resulted in some sentences not being evaluated for certain languages.
- Fix issue that was preventing non-latin words from being evaluated.
- Improved removal of punctuation of evaluated tokens
- Conservative mode is now less conservative:
- Raised error threshold
- Tag targeted language in case of tie, if error rate is 0


## FastSpell 0.9:
- Now using CyHunspell.
- Added automatic tests.
Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
[project]
name = "fastspell"
version = "0.9.1"
version = "0.10"
license = {file = "LICENSE"}
readme = "README.md"
description = "Targeted language identifier, based on FastText and Hunspell."
requires-python = ">=3.8"
dependencies = [
"cyhunspell>=2.0.2, <=2.0.3",
"fastspell-dictionaries==3.0",
"fastspell-dictionaries==3.1",
"fasttext-wheel==0.9.2",
"urllib3",
"PyYAML",
"regex",
]
classifiers = [
"Environment :: Console",
Expand Down
76 changes: 59 additions & 17 deletions src/fastspell/config/hunspell.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,30 +4,72 @@ dictpath: ""

#This is how hunspell files (.dic and .aff) are named in dictpath
hunspell_codes:
af: af_ZA
an: an_ES
ar: ar
az: az_AZ
be: be_BY
bg: bg_BG
bn: bn_BD
bs: bs_BA
ca: ca_ES
gl: gl_ES
nb: nb_NO
nn: nn_NO
sv: sv_SE
cs: cs_CZ
cy: cy_GB
da: da_DK
is: is_IS
de: de_DE
hu: hu_HU
en: en_GB
es: es_ES
et: et_ET
bs: bs_BA
cs: cs_CZ
sk: sk_SK
fa: fa_IR
fi: fi_FI
fo: fo
ga: ga
gl: gl_ES
gu: gu_IN
hbs_lat: hbs_Lat_HBS
hbs_cyr: hbs_Cyr_HBS
he: he_IL
hi: hi_IN
hr: hr_HR
hu: hu_HU
id: id_ID
is: is_IS
iw: he_IL
kk: kk_KZ
kn: kn_IN
ky: ky_KG
lo: lo_LA
lt: lt
lv: lv_LV
me: sr_ME
mk: mk_MK
sq: sq_AL
sr: sr_RS
sl: sl_SI
es: es_ES
pt: pt_PT
ml: ml_IN
mn: mn_MN
mr: mr_IN
ms: ms_MY
nb: nb_NO
ne: ne_NP
nl: nl_NL
af: af_ZA
hbs_lat: hr_HR
hbs_cyr: sr_RS
nn: nn_NO
"no": nb_NO
oc: oc
pl: pl
pt: pt_PT
ru: ru_RU
sk: sk_SK
sl: sl_SI
so: so_SO
sq: sq_AL
sr: sr_RS
sv: sv_SE
ta: ta_IN
te: te_IN
tg: tg_TG
th: th_TH
tk: tk
tr: tr
tt: tt_RU
uk: uk_UA
ur: ur_PK
uz: uz_UZ
yi: yi
61 changes: 44 additions & 17 deletions src/fastspell/config/similar.yaml
Original file line number Diff line number Diff line change
@@ -1,21 +1,48 @@
#Target langs (keys) dict for easily-mistaken languages (values)
similar:
ca: [es, ca]
bg: [mk, bg]
bs: [bs, hr, me, sr, sl]
af: [nl, de, af]
az: [tr, az]
be: [ru, uk, be]
bg: [mk, ru, bg]
bs: [hr, sr, sl, bs] #should add me (not adding because dict is the same as bs)
ca: [es, oc, ca]
cs: [sk, cs]
da: [da, nb]
is: [is, da, nb, nn, de, sv, hu, et]
es: [es, gl, ca]
cy: [ga, en, cy]
da: [nb, sv, da]
es: [gl, ca, es]
fa: [ar, az, fa]
ga: [cy, en, ga]
gl: [es, pt, gl]
hr: [bs, hr, me, sr, sl]
me: [bs, hr, me, sr, sl]
mk: [bg, mk]
nb: [nn, da, nb]
nl: [nl, af]
nn: [nb, da, nn]
sk: [cs, sk]
sr: [bs, hr, me, sr, sl]
hbs_lat: [hbs_lat, sl]
hbs_cyr: [hbs_cyr, ru, mk, bg]
sl: [hbs_lat, sl]
hbs_lat: [sl, hbs_lat]
hbs_cyr: [ru, mk, bg, hbs_cyr]
hi: [mr, ne, hi] #add SD
hr: [bs, sr, sl, hr] #should add me (not adding because dict is the same as bs)
id: [ms, id]
is: [da, nb, nn, sv, is]
iw: [yi, iw]
kk: [ky, tt, ru, kk]
ky: [ru, kk, tt, mn, ky]
lv: [lt, lv]
me: [hr, sr, sl, me] #should add bs (not adding because dict is the same as me)
mk: [bg, sr, ru, mk]
mn: [ru, ky, bg, mn]
mr: [hi, mr]
ms: [id, ms]
nb: [da, sv, nn, nb]
ne: [mr, hi, ne]
nl: [af, nl]
nn: [nb, da, sv, nn]
"no": [da, sv, nn, "no"] #same as NB, since dict is the same
#ps: [ar, fa, ps]
pt: [es, gl, pt]
ru: [uk, bg, ru]
sk: [cs, pl, sr, sk]
sl: [hbs_lat, sr, hr, bs, sl]
so: [en, fi, cy, kn, so]
sr: [bs, hr, sl, sr]#should add me (not adding because dict is the same as bs)
sv: [da, nb,sv]
tt: [kk, ky, ru, tt] #maybe add BA
uk: [be, ru, uk, mk, bg]
ur: [fa, ar, ur]
uz: [tr, uz]
yi: [he, yi]
28 changes: 19 additions & 9 deletions src/fastspell/fastspell.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def initialization():

class FastSpell:

threshold = 0.25 #Hunspell max error rate allowed in a sentence
threshold = 0.5 #Hunspell max error rate allowed in a sentence
prefix = "__label__" #FastText returns langs labeled as __label__LANGCODE
ft_model_hash = "01810bc59c6a3d2b79c79e6336612f65"
ft_download_url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
Expand Down Expand Up @@ -181,10 +181,12 @@ def getlang(self, sent):

# If the prediction does not specify the variant,
# replace it by one of the variants to trigger hunspell refinement
if prediction == "no":
if prediction == "no" and self.lang != "no":
prediction = "nb"
if prediction == "sh":
prediction = "sr"
if prediction == "he" and self.lang == "iw": #trick for deprecated iw language code for hebrew
prediction = "iw"

# Always detect script if supported (will be printed only if requested)
script = ''
Expand Down Expand Up @@ -212,10 +214,15 @@ def getlang(self, sent):
dec_sent = sent.encode(encoding='UTF-8',errors='strict').decode('UTF-8') #Not 100% sure about this...
raw_toks = sent.strip().split(" ")
toks = remove_unwanted_words(raw_toks, self.lang)
try:
correct_list = list(map(self.hunspell_objs[l].spell, toks))
except UnicodeEncodeError: #...because it sometimes fails here for certain characters
correct_list = []
#spellcheck_map = map_except(self.hunspell_objs[l].spell, toks)
correct_list = []
for token in toks:
try:
correct_list.append(self.hunspell_objs[l].spell(token))
#correct_list = list(map(self.hunspell_objs[l].spell, toks))
except UnicodeEncodeError as ex: #...because it sometimes fails here for certain characters
logging.debug(ex)
correct_list.append(False)
corrects = sum(correct_list*1)
logging.debug("Tokens: " +str(toks))
logging.debug("Corrects: " + str(correct_list))
Expand All @@ -225,7 +232,7 @@ def getlang(self, sent):
else:
error_rate = 1
logging.debug("error_rate: " + str(error_rate))
if error_rate < self.threshold: #we don't keep it if the error rate is above the threshold
if error_rate <= self.threshold: #we don't keep it if the error rate is above the threshold
spellchecked[l] = error_rate
logging.debug("----------------")

Expand All @@ -251,8 +258,11 @@ def getlang(self, sent):
#Just take one
refined_prediction = best_keys[0]
if self.mode == "cons":
#Conservative: just keep it as unknown
refined_prediction = "unk"
#Conservative: just keep it as unknown, unless the error_rate is 0.0 for the targetted language
if self.lang in best_keys and best_value == 0:
refined_prediction = self.lang
else:
refined_prediction = "unk"
else:
#Nothing in the spellchecking list
if self.mode == "aggr":
Expand Down
15 changes: 11 additions & 4 deletions src/fastspell/util.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,21 @@
from tempfile import TemporaryDirectory
from argparse import ArgumentTypeError
from string import punctuation
#from string import punctuation
import logging
import hashlib
import sys
import os
#import unicodedata
import regex

import fastspell_dictionaries
import yaml



#punct = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P')) #punctuation
PUNCT_REGEX = regex.compile("(\p{P}+$|^\p{P}+)")

def logging_setup(args = None):
logger = logging.getLogger()
logger.handlers = [] # Removing default handler to avoid duplication of log messages
Expand All @@ -36,12 +42,13 @@ def remove_unwanted_words(tokens, lang):
newtokens = []
isfirsttoken=True
for token in tokens:
token=token.strip(punctuation+" ")
token=PUNCT_REGEX.sub("", token.strip()).strip() #Regex to remove punctuation
if lang=="de":
if token.upper() != token.lower():
if any(c.isalpha() for c in token): #token.upper() != token.lower():
newtokens.append(token)
else:
if token.upper() != token.lower() and (isfirsttoken or token[0]!=token[0].upper()):
#if token.upper() != token.lower() and (isfirsttoken or (token[0]!=token[0].upper()):
if any(c.isalpha() for c in token) and ( isfirsttoken or token[0]==token[0].lower()):
newtokens.append(token.lower())
isfirsttoken=False
return newtokens
Expand Down

0 comments on commit 4309321

Please sign in to comment.