Skip to content

Commit

Permalink
Merge pull request #13 from mbanon/newlangs
Browse files Browse the repository at this point in the history
Newlangs
  • Loading branch information
mbanon authored Aug 24, 2023
2 parents 3216cfc + 21aaa7d commit 4309321
Show file tree
Hide file tree
Showing 6 changed files with 147 additions and 49 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,15 @@
# CHANGELOG

## FastSpell 1.10
- Added more languages: af, ar, az, be, bn, cy, et, fa, fi, ga, gu, he, hi, hu, id, kk, kn, ky, lt, lv, mn, ml, mr, ms, ne, pl, pt, ru, so, sv, ta, te, th, tr, tt, uk, ur, uz
- Fixed bug with character encoding that resulted in some sentences not being evaluated for certain languages.
- Fix issue that was preventing non-latin words from being evaluated.
- Improved removal of punctuation of evaluated tokens
- Conservative mode is now less conservative:
- Raised error threshold
- Tag targeted language in case of tie, if error rate is 0


## FastSpell 0.9:
- Now using CyHunspell.
- Added automatic tests.
Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
[project]
name = "fastspell"
version = "0.9.1"
version = "0.10"
license = {file = "LICENSE"}
readme = "README.md"
description = "Targeted language identifier, based on FastText and Hunspell."
requires-python = ">=3.8"
dependencies = [
"cyhunspell>=2.0.2, <=2.0.3",
"fastspell-dictionaries==3.0",
"fastspell-dictionaries==3.1",
"fasttext-wheel==0.9.2",
"urllib3",
"PyYAML",
"regex",
]
classifiers = [
"Environment :: Console",
Expand Down
76 changes: 59 additions & 17 deletions src/fastspell/config/hunspell.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,30 +4,72 @@ dictpath: ""

#This is how hunspell files (.dic and .aff) are named in dictpath
hunspell_codes:
af: af_ZA
an: an_ES
ar: ar
az: az_AZ
be: be_BY
bg: bg_BG
bn: bn_BD
bs: bs_BA
ca: ca_ES
gl: gl_ES
nb: nb_NO
nn: nn_NO
sv: sv_SE
cs: cs_CZ
cy: cy_GB
da: da_DK
is: is_IS
de: de_DE
hu: hu_HU
en: en_GB
es: es_ES
et: et_ET
bs: bs_BA
cs: cs_CZ
sk: sk_SK
fa: fa_IR
fi: fi_FI
fo: fo
ga: ga
gl: gl_ES
gu: gu_IN
hbs_lat: hbs_Lat_HBS
hbs_cyr: hbs_Cyr_HBS
he: he_IL
hi: hi_IN
hr: hr_HR
hu: hu_HU
id: id_ID
is: is_IS
iw: he_IL
kk: kk_KZ
kn: kn_IN
ky: ky_KG
lo: lo_LA
lt: lt
lv: lv_LV
me: sr_ME
mk: mk_MK
sq: sq_AL
sr: sr_RS
sl: sl_SI
es: es_ES
pt: pt_PT
ml: ml_IN
mn: mn_MN
mr: mr_IN
ms: ms_MY
nb: nb_NO
ne: ne_NP
nl: nl_NL
af: af_ZA
hbs_lat: hr_HR
hbs_cyr: sr_RS
nn: nn_NO
"no": nb_NO
oc: oc
pl: pl
pt: pt_PT
ru: ru_RU
sk: sk_SK
sl: sl_SI
so: so_SO
sq: sq_AL
sr: sr_RS
sv: sv_SE
ta: ta_IN
te: te_IN
tg: tg_TG
th: th_TH
tk: tk
tr: tr
tt: tt_RU
uk: uk_UA
ur: ur_PK
uz: uz_UZ
yi: yi
61 changes: 44 additions & 17 deletions src/fastspell/config/similar.yaml
Original file line number Diff line number Diff line change
@@ -1,21 +1,48 @@
#Target langs (keys) dict for easily-mistaken languages (values)
similar:
ca: [es, ca]
bg: [mk, bg]
bs: [bs, hr, me, sr, sl]
af: [nl, de, af]
az: [tr, az]
be: [ru, uk, be]
bg: [mk, ru, bg]
bs: [hr, sr, sl, bs] #should add me (not adding because dict is the same as bs)
ca: [es, oc, ca]
cs: [sk, cs]
da: [da, nb]
is: [is, da, nb, nn, de, sv, hu, et]
es: [es, gl, ca]
cy: [ga, en, cy]
da: [nb, sv, da]
es: [gl, ca, es]
fa: [ar, az, fa]
ga: [cy, en, ga]
gl: [es, pt, gl]
hr: [bs, hr, me, sr, sl]
me: [bs, hr, me, sr, sl]
mk: [bg, mk]
nb: [nn, da, nb]
nl: [nl, af]
nn: [nb, da, nn]
sk: [cs, sk]
sr: [bs, hr, me, sr, sl]
hbs_lat: [hbs_lat, sl]
hbs_cyr: [hbs_cyr, ru, mk, bg]
sl: [hbs_lat, sl]
hbs_lat: [sl, hbs_lat]
hbs_cyr: [ru, mk, bg, hbs_cyr]
hi: [mr, ne, hi] #add SD
hr: [bs, sr, sl, hr] #should add me (not adding because dict is the same as bs)
id: [ms, id]
is: [da, nb, nn, sv, is]
iw: [yi, iw]
kk: [ky, tt, ru, kk]
ky: [ru, kk, tt, mn, ky]
lv: [lt, lv]
me: [hr, sr, sl, me] #should add bs (not adding because dict is the same as me)
mk: [bg, sr, ru, mk]
mn: [ru, ky, bg, mn]
mr: [hi, mr]
ms: [id, ms]
nb: [da, sv, nn, nb]
ne: [mr, hi, ne]
nl: [af, nl]
nn: [nb, da, sv, nn]
"no": [da, sv, nn, "no"] #same as NB, since dict is the same
#ps: [ar, fa, ps]
pt: [es, gl, pt]
ru: [uk, bg, ru]
sk: [cs, pl, sr, sk]
sl: [hbs_lat, sr, hr, bs, sl]
so: [en, fi, cy, kn, so]
sr: [bs, hr, sl, sr]#should add me (not adding because dict is the same as bs)
sv: [da, nb,sv]
tt: [kk, ky, ru, tt] #maybe add BA
uk: [be, ru, uk, mk, bg]
ur: [fa, ar, ur]
uz: [tr, uz]
yi: [he, yi]
28 changes: 19 additions & 9 deletions src/fastspell/fastspell.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def initialization():

class FastSpell:

threshold = 0.25 #Hunspell max error rate allowed in a sentence
threshold = 0.5 #Hunspell max error rate allowed in a sentence
prefix = "__label__" #FastText returns langs labeled as __label__LANGCODE
ft_model_hash = "01810bc59c6a3d2b79c79e6336612f65"
ft_download_url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
Expand Down Expand Up @@ -181,10 +181,12 @@ def getlang(self, sent):

# If the prediction does not specify the variant,
# replace it by one of the variants to trigger hunspell refinement
if prediction == "no":
if prediction == "no" and self.lang != "no":
prediction = "nb"
if prediction == "sh":
prediction = "sr"
if prediction == "he" and self.lang == "iw": #trick for deprecated iw language code for hebrew
prediction = "iw"

# Always detect script if supported (will be printed only if requested)
script = ''
Expand Down Expand Up @@ -212,10 +214,15 @@ def getlang(self, sent):
dec_sent = sent.encode(encoding='UTF-8',errors='strict').decode('UTF-8') #Not 100% sure about this...
raw_toks = sent.strip().split(" ")
toks = remove_unwanted_words(raw_toks, self.lang)
try:
correct_list = list(map(self.hunspell_objs[l].spell, toks))
except UnicodeEncodeError: #...because it sometimes fails here for certain characters
correct_list = []
#spellcheck_map = map_except(self.hunspell_objs[l].spell, toks)
correct_list = []
for token in toks:
try:
correct_list.append(self.hunspell_objs[l].spell(token))
#correct_list = list(map(self.hunspell_objs[l].spell, toks))
except UnicodeEncodeError as ex: #...because it sometimes fails here for certain characters
logging.debug(ex)
correct_list.append(False)
corrects = sum(correct_list*1)
logging.debug("Tokens: " +str(toks))
logging.debug("Corrects: " + str(correct_list))
Expand All @@ -225,7 +232,7 @@ def getlang(self, sent):
else:
error_rate = 1
logging.debug("error_rate: " + str(error_rate))
if error_rate < self.threshold: #we don't keep it if the error rate is above the threshold
if error_rate <= self.threshold: #we don't keep it if the error rate is above the threshold
spellchecked[l] = error_rate
logging.debug("----------------")

Expand All @@ -251,8 +258,11 @@ def getlang(self, sent):
#Just take one
refined_prediction = best_keys[0]
if self.mode == "cons":
#Conservative: just keep it as unknown
refined_prediction = "unk"
#Conservative: just keep it as unknown, unless the error_rate is 0.0 for the targetted language
if self.lang in best_keys and best_value == 0:
refined_prediction = self.lang
else:
refined_prediction = "unk"
else:
#Nothing in the spellchecking list
if self.mode == "aggr":
Expand Down
15 changes: 11 additions & 4 deletions src/fastspell/util.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,21 @@
from tempfile import TemporaryDirectory
from argparse import ArgumentTypeError
from string import punctuation
#from string import punctuation
import logging
import hashlib
import sys
import os
#import unicodedata
import regex

import fastspell_dictionaries
import yaml



#punct = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P')) #punctuation
PUNCT_REGEX = regex.compile("(\p{P}+$|^\p{P}+)")

def logging_setup(args = None):
logger = logging.getLogger()
logger.handlers = [] # Removing default handler to avoid duplication of log messages
Expand All @@ -36,12 +42,13 @@ def remove_unwanted_words(tokens, lang):
newtokens = []
isfirsttoken=True
for token in tokens:
token=token.strip(punctuation+" ")
token=PUNCT_REGEX.sub("", token.strip()).strip() #Regex to remove punctuation
if lang=="de":
if token.upper() != token.lower():
if any(c.isalpha() for c in token): #token.upper() != token.lower():
newtokens.append(token)
else:
if token.upper() != token.lower() and (isfirsttoken or token[0]!=token[0].upper()):
#if token.upper() != token.lower() and (isfirsttoken or (token[0]!=token[0].upper()):
if any(c.isalpha() for c in token) and ( isfirsttoken or token[0]==token[0].lower()):
newtokens.append(token.lower())
isfirsttoken=False
return newtokens
Expand Down

0 comments on commit 4309321

Please sign in to comment.