diff --git a/AUTHORS.md b/AUTHORS.md index b680ba8e83..bf772b2bb7 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -219,6 +219,7 @@ - Deng Wang - Ali Abdullah - Stoytcho Stoytchev +- Lakhdar Benzahia ## Others whose work we've taken and included in NLTK, but who didn't directly contribute it: ### Contributors to the Porter Stemmer @@ -232,3 +233,7 @@ - Matt Chaput, [Whoosh](https://bitbucket.org/mchaput/whoosh/wiki/Home) - Strip prefix - Custom rule. +### Authors of snowball arabic stemmer algorithm +- Assem Chelli +- Abdelkrim Aries +- Lakhdar Benzahia \ No newline at end of file diff --git a/nltk/stem/snowball.py b/nltk/stem/snowball.py index 3ed2dbb931..00b511c183 100644 --- a/nltk/stem/snowball.py +++ b/nltk/stem/snowball.py @@ -5,7 +5,12 @@ # Copyright (C) 2001-2017 NLTK Project # Author: Peter Michael Stahl # Peter Ljunglof (revisions) +# Lakhdar Benzahia (co-writer) +# Assem Chelli (reviewer arabicstemmer) +# Abdelkrim Aries (reviewer arabicstemmer) # Algorithms: Dr Martin Porter +# Assem Chelli arabic stemming algorithm +# Benzahia Lakhdar # URL: # For license information, see LICENSE.TXT @@ -21,11 +26,12 @@ from __future__ import unicode_literals, print_function from six.moves import input +import re from nltk import compat from nltk.corpus import stopwords from nltk.stem import porter -from nltk.stem.util import suffix_replace +from nltk.stem.util import suffix_replace, prefix_replace from nltk.stem.api import StemmerI @@ -36,7 +42,7 @@ class SnowballStemmer(StemmerI): Snowball Stemmer The following languages are supported: - Danish, Dutch, English, Finnish, French, German, + Arabic, Danish, Dutch, English, Finnish, French, German, Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian, Spanish and Swedish. 
class ArabicStemmer(_LanguageSpecificStemmer):
    """
    The Snowball Arabic light stemmer.

    Original algorithm:
    https://github.com/snowballstem/snowball/blob/master/algorithms/arabic/stem_Unicode.sbl

    Algorithm: Assem Chelli, Abdelkrim Aries, Lakhdar Benzahia
    NLTK version author: Lakhdar Benzahia

    The stemmer first guesses whether the word is a noun and/or a verb,
    strips diacritics, tatweel and Arabic punctuation, removes suffixes,
    removes prefixes, and finally normalizes the hamza forms.

    The per-word flags (``is_verb``, ``*_success``, ...) are stored on the
    instance and reset at the start of every ``stem()`` call, so a single
    instance should not be used to stem words concurrently.
    """
    # Normalize_pre steps
    # Diacritics U+064B..U+0652: ً، ٌ، ٍ، َ، ُ، ِ، ّ، ْ
    __vocalization = re.compile(r'[\u064b-\u0652]')

    __kasheeda = re.compile(r'[\u0640]')  # ـ tatweel/kasheeda

    # Exactly the three marks ، ؛ ؟ (no ranges, no literal hyphen).
    __arabic_punctuation_marks = re.compile(r'[\u060C\u061B\u061F]')

    # Normalize_post
    __last_hamzat = ('\u0623', '\u0625', '\u0622', '\u0624', '\u0626')  # أ، إ، آ، ؤ، ئ

    # normalize other hamza's
    __initial_hamzat = re.compile(r'^[\u0622\u0623\u0625]')  # أ، إ، آ

    __waw_hamza = re.compile(r'[\u0624]')  # ؤ

    __yeh_hamza = re.compile(r'[\u0626]')  # ئ

    __alefat = re.compile(r'[\u0623\u0622\u0625]')  # أ، إ، آ

    # Checks
    __checks1 = ('\u0643\u0627\u0644', '\u0628\u0627\u0644',  # كال، بال
                 '\u0627\u0644', '\u0644\u0644',  # ال، لل
                 )

    __checks2 = ('\u0629',  # ة
                 '\u0627\u062a',  # female plural ات
                 )

    # Suffixes
    __suffix_noun_step1a = ('\u064a', '\u0643', '\u0647',  # ي، ك، ه
                            '\u0646\u0627', '\u0643\u0645', '\u0647\u0627',  # نا، كم، ها
                            '\u0647\u0646', '\u0647\u0645',  # هن، هم
                            '\u0643\u0645\u0627', '\u0647\u0645\u0627',  # كما، هما
                            )

    # NOTE: one-element tuples need the trailing comma; without it the value
    # is a plain string and the step loops would iterate over its characters.
    __suffix_noun_step1b = ('\u0646',)  # ن

    __suffix_noun_step2a = ('\u0627', '\u064a', '\u0648')  # ا، ي، و

    __suffix_noun_step2b = ('\u0627\u062a',)  # ات

    __suffix_noun_step2c1 = ('\u062a',)  # ت

    __suffix_noun_step2c2 = ('\u0629',)  # ة

    __suffix_noun_step3 = ('\u064a',)  # ي

    __suffix_verb_step1 = ('\u0647', '\u0643',  # ه، ك
                           '\u0646\u064a', '\u0646\u0627', '\u0647\u0627', '\u0647\u0645',  # ني، نا، ها، هم
                           '\u0647\u0646', '\u0643\u0645', '\u0643\u0646',  # هن، كم، كن
                           '\u0647\u0645\u0627', '\u0643\u0645\u0627', '\u0643\u0645\u0648',  # هما، كما، كمو
                           )

    __suffix_verb_step2a = ('\u062a', '\u0627', '\u0646', '\u064a',  # ت، ا، ن، ي
                            '\u0646\u0627', '\u062a\u0627', '\u062a\u0646',  # نا، تا، تن (past)
                            '\u0627\u0646', '\u0648\u0646', '\u064a\u0646',  # ان، ون، ين (present)
                            '\u062a\u0645\u0627',  # تما
                            )

    __suffix_verb_step2b = ('\u0648\u0627', '\u062a\u0645')  # وا، تم

    __suffix_verb_step2c = ('\u0648',  # و
                            '\u062a\u0645\u0648',  # تمو
                            )

    __suffix_all_alef_maqsura = ('\u0649',)  # ى

    # Prefixes
    __prefix_step1 = ('\u0623',  # أ
                      '\u0623\u0623', '\u0623\u0622', '\u0623\u0624',  # أأ، أآ، أؤ
                      '\u0623\u0627', '\u0623\u0625',  # أا، أإ
                      )

    __prefix_step2a = ('\u0641\u0627\u0644', '\u0648\u0627\u0644')  # فال، وال

    __prefix_step2b = ('\u0641', '\u0648')  # ف، و

    __prefix_step3a_noun = ('\u0627\u0644', '\u0644\u0644',  # ال، لل
                            '\u0643\u0627\u0644', '\u0628\u0627\u0644',  # كال، بال
                            )

    __prefix_step3b_noun = ('\u0628', '\u0643', '\u0644',  # ب، ك، ل
                            '\u0628\u0628', '\u0643\u0643',  # بب، كك
                            )

    __prefix_step3_verb = ('\u0633\u064a', '\u0633\u062a',  # سي، ست
                           '\u0633\u0646', '\u0633\u0623',  # سن، سأ
                           )

    __prefix_step4_verb = ('\u064a\u0633\u062a', '\u0646\u0633\u062a', '\u062a\u0633\u062a')  # يست، نست، تست

    # Suffixes added due to Conjugation Verbs
    __conjugation_suffix_verb_1 = ('\u0647', '\u0643')  # ه، ك

    __conjugation_suffix_verb_2 = ('\u0646\u064a', '\u0646\u0627', '\u0647\u0627',  # ني، نا، ها
                                   '\u0647\u0645', '\u0647\u0646', '\u0643\u0645',  # هم، هن، كم
                                   '\u0643\u0646',  # كن
                                   )

    __conjugation_suffix_verb_3 = ('\u0647\u0645\u0627', '\u0643\u0645\u0627', '\u0643\u0645\u0648')  # هما، كما، كمو

    __conjugation_suffix_verb_4 = ('\u0627', '\u0646', '\u064a')  # ا، ن، ي

    __conjugation_suffix_verb_past = ('\u0646\u0627', '\u062a\u0627', '\u062a\u0646')  # نا، تا، تن

    # The name must match the lookup in __Suffix_Verb_Step2a.
    __conjugation_suffix_verb_present = ('\u0627\u0646', '\u0648\u0646', '\u064a\u0646')  # ان، ون، ين

    # Suffixes added due to derivation Names
    __conjugation_suffix_noun_1 = ('\u064a', '\u0643', '\u0647')  # ي، ك، ه

    __conjugation_suffix_noun_2 = ('\u0646\u0627', '\u0643\u0645',  # نا، كم
                                   '\u0647\u0627', '\u0647\u0646', '\u0647\u0645',  # ها، هن، هم
                                   )

    __conjugation_suffix_noun_3 = ('\u0643\u0645\u0627', '\u0647\u0645\u0627')  # كما، هما

    # Prefixes added due to derivation Names
    __prefixes1 = ('\u0648\u0627', '\u0641\u0627')  # وا، فا

    __articles_3len = ('\u0643\u0627\u0644', '\u0628\u0627\u0644')  # كال، بال

    __articles_2len = ('\u0627\u0644', '\u0644\u0644')  # ال، لل

    # Prepositions letters
    __prepositions1 = ('\u0643', '\u0644')  # ك، ل
    __prepositions2 = ('\u0628\u0628', '\u0643\u0643')  # بب، كك

    # Word-type guesses, updated by the check steps and by __Prefix_Step4_Verb.
    is_verb = True
    is_noun = True
    is_defined = False

    # Per-step outcome flags consulted by stem() to choose the next step.
    suffixes_verb_step1_success = False
    suffix_verb_step2a_success = False
    suffix_verb_step2b_success = False
    suffix_noun_step2c2_success = False
    suffix_noun_step1a_success = False
    suffix_noun_step2a_success = False
    suffix_noun_step2b_success = False
    suffixe_noun_step1b_success = False
    prefix_step2a_success = False
    prefix_step3a_noun_success = False
    prefix_step3b_noun_success = False

    def __normalize_pre(self, token):
        """
        Strip diacritics, tatweel and Arabic punctuation marks.

        :param token: string
        :return: normalized token type string
        """
        # strip diacritics
        token = self.__vocalization.sub('', token)
        # strip kasheeda
        token = self.__kasheeda.sub('', token)
        # strip punctuation marks
        token = self.__arabic_punctuation_marks.sub('', token)
        return token

    def __normalize_post(self, token):
        """
        Normalize hamza forms once affixes have been removed.

        :param token: string
        :return: string
        """
        # normalize last hamza
        for hamza in self.__last_hamzat:
            if token.endswith(hamza):
                token = suffix_replace(token, hamza, '\u0621')
                break
        # normalize other hamzat
        token = self.__initial_hamzat.sub('\u0627', token)
        token = self.__waw_hamza.sub('\u0648', token)
        token = self.__yeh_hamza.sub('\u064a', token)
        token = self.__alefat.sub('\u0627', token)
        return token

    def __checks_1(self, token):
        """Mark the word as a defined noun when it starts with an article."""
        for prefix in self.__checks1:
            if token.startswith(prefix):
                if prefix in self.__articles_3len and len(token) > 4:
                    self.is_noun = True
                    self.is_verb = False
                    self.is_defined = True
                    break

                if prefix in self.__articles_2len and len(token) > 3:
                    self.is_noun = True
                    self.is_verb = False
                    self.is_defined = True
                    break

    def __checks_2(self, token):
        """Mark the word as a noun when it ends with a feminine marker."""
        for suffix in self.__checks2:
            if token.endswith(suffix):
                if suffix == '\u0629' and len(token) > 2:
                    self.is_noun = True
                    self.is_verb = False
                    break

                if suffix == '\u0627\u062a' and len(token) > 3:
                    self.is_noun = True
                    self.is_verb = False
                    break

    def __Suffix_Verb_Step1(self, token):
        """Remove an attached object pronoun from a verb."""
        for suffix in self.__suffix_verb_step1:
            if token.endswith(suffix):
                if suffix in self.__conjugation_suffix_verb_1 and len(token) >= 4:
                    token = token[:-1]
                    self.suffixes_verb_step1_success = True
                    break

                if suffix in self.__conjugation_suffix_verb_2 and len(token) >= 5:
                    token = token[:-2]
                    self.suffixes_verb_step1_success = True
                    break

                if suffix in self.__conjugation_suffix_verb_3 and len(token) >= 6:
                    token = token[:-3]
                    self.suffixes_verb_step1_success = True
                    break
        return token

    def __Suffix_Verb_Step2a(self, token):
        """Remove a verb conjugation suffix (step 2a)."""
        for suffix in self.__suffix_verb_step2a:
            if token.endswith(suffix):
                if suffix == '\u062a' and len(token) >= 4:
                    token = token[:-1]
                    self.suffix_verb_step2a_success = True
                    break

                if suffix in self.__conjugation_suffix_verb_4 and len(token) >= 4:
                    token = token[:-1]
                    self.suffix_verb_step2a_success = True
                    break

                if suffix in self.__conjugation_suffix_verb_past and len(token) >= 5:
                    token = token[:-2]  # past
                    self.suffix_verb_step2a_success = True
                    break

                if suffix in self.__conjugation_suffix_verb_present and len(token) > 5:
                    token = token[:-2]  # present
                    self.suffix_verb_step2a_success = True
                    break

                if suffix == '\u062a\u0645\u0627' and len(token) >= 6:
                    token = token[:-3]
                    self.suffix_verb_step2a_success = True
                    break
        return token

    def __Suffix_Verb_Step2c(self, token):
        """Remove a verb conjugation suffix (step 2c)."""
        for suffix in self.__suffix_verb_step2c:
            if token.endswith(suffix):
                if suffix == '\u062a\u0645\u0648' and len(token) >= 6:
                    token = token[:-3]
                    break

                if suffix == '\u0648' and len(token) >= 4:
                    token = token[:-1]
                    break
        return token

    def __Suffix_Verb_Step2b(self, token):
        """Remove a verb conjugation suffix (step 2b)."""
        for suffix in self.__suffix_verb_step2b:
            if token.endswith(suffix) and len(token) >= 5:
                token = token[:-2]
                self.suffix_verb_step2b_success = True
                break
        return token

    def __Suffix_Noun_Step2c2(self, token):
        """Remove a final teh marbuta from a noun."""
        for suffix in self.__suffix_noun_step2c2:
            if token.endswith(suffix) and len(token) >= 3:
                token = token[:-1]
                self.suffix_noun_step2c2_success = True
                break
        return token

    def __Suffix_Noun_Step1a(self, token):
        """Remove an attached possessive pronoun from a noun."""
        for suffix in self.__suffix_noun_step1a:
            if token.endswith(suffix):
                if suffix in self.__conjugation_suffix_noun_1 and len(token) >= 4:
                    token = token[:-1]
                    self.suffix_noun_step1a_success = True
                    break

                if suffix in self.__conjugation_suffix_noun_2 and len(token) >= 5:
                    token = token[:-2]
                    self.suffix_noun_step1a_success = True
                    break

                if suffix in self.__conjugation_suffix_noun_3 and len(token) >= 6:
                    token = token[:-3]
                    self.suffix_noun_step1a_success = True
                    break
        return token

    def __Suffix_Noun_Step2a(self, token):
        """Remove a final alef, yeh or waw from a noun."""
        for suffix in self.__suffix_noun_step2a:
            if token.endswith(suffix) and len(token) > 4:
                token = token[:-1]
                self.suffix_noun_step2a_success = True
                break
        return token

    def __Suffix_Noun_Step2b(self, token):
        """Remove the feminine plural suffix from a noun."""
        for suffix in self.__suffix_noun_step2b:
            if token.endswith(suffix) and len(token) >= 5:
                token = token[:-2]
                self.suffix_noun_step2b_success = True
                break
        return token

    def __Suffix_Noun_Step2c1(self, token):
        """Remove a final teh from a noun."""
        for suffix in self.__suffix_noun_step2c1:
            if token.endswith(suffix) and len(token) >= 4:
                token = token[:-1]
                break
        return token

    def __Suffix_Noun_Step1b(self, token):
        """Remove a final noon from a noun."""
        for suffix in self.__suffix_noun_step1b:
            if token.endswith(suffix) and len(token) > 5:
                token = token[:-1]
                self.suffixe_noun_step1b_success = True
                break
        return token

    def __Suffix_Noun_Step3(self, token):
        """Remove a final yeh from a noun."""
        for suffix in self.__suffix_noun_step3:
            if token.endswith(suffix) and len(token) >= 3:
                token = token[:-1]  # ya' nisbiya
                break
        return token

    def __Suffix_All_alef_maqsura(self, token):
        """Replace a final alef maqsura with yeh."""
        for suffix in self.__suffix_all_alef_maqsura:
            if token.endswith(suffix):
                token = suffix_replace(token, suffix, '\u064a')
        return token

    def __Prefix_Step1(self, token):
        """Collapse a leading hamza that is followed by another alef/hamza form."""
        for prefix in self.__prefix_step1:
            if token.startswith(prefix) and len(token) > 3:
                # The bare hamza entry is never rewritten; each two-letter
                # entry is replaced by its second letter.
                if len(prefix) == 2:
                    token = prefix_replace(token, prefix, prefix[1])
                    break
        return token

    def __Prefix_Step2a(self, token):
        """Remove a conjunction fused with the definite article."""
        for prefix in self.__prefix_step2a:
            if token.startswith(prefix) and len(token) > 5:
                token = token[len(prefix):]
                self.prefix_step2a_success = True
                break
        return token

    def __Prefix_Step2b(self, token):
        """Remove a leading conjunction letter unless it is followed by alef."""
        for prefix in self.__prefix_step2b:
            if token.startswith(prefix) and len(token) > 3:
                if token[:2] not in self.__prefixes1:
                    token = token[len(prefix):]
                    break
        return token

    def __Prefix_Step3a_Noun(self, token):
        """Remove a definite article, optionally fused with a preposition."""
        for prefix in self.__prefix_step3a_noun:
            if token.startswith(prefix):
                if prefix in self.__articles_2len and len(token) > 4:
                    token = token[len(prefix):]
                    self.prefix_step3a_noun_success = True
                    break
                if prefix in self.__articles_3len and len(token) > 5:
                    token = token[len(prefix):]
                    break
        return token

    def __Prefix_Step3b_Noun(self, token):
        """Remove a leading preposition letter from a noun."""
        for prefix in self.__prefix_step3b_noun:
            if token.startswith(prefix):
                if len(token) > 3:
                    if prefix == '\u0628':
                        token = token[len(prefix):]
                        self.prefix_step3b_noun_success = True
                        break

                    if prefix in self.__prepositions2:
                        token = prefix_replace(token, prefix, prefix[1])
                        self.prefix_step3b_noun_success = True
                        break

                if prefix in self.__prepositions1 and len(token) > 4:
                    token = token[len(prefix):]  # BUG: cause confusion
                    self.prefix_step3b_noun_success = True
                    break
        return token

    def __Prefix_Step3_Verb(self, token):
        """Drop the leading seen of a verb prefix, keeping its second letter."""
        for prefix in self.__prefix_step3_verb:
            if token.startswith(prefix) and len(token) > 4:
                token = prefix_replace(token, prefix, prefix[1])
                break
        return token

    def __Prefix_Step4_Verb(self, token):
        """Rewrite a leading يست/نست/تست to است and mark the word as a verb."""
        for prefix in self.__prefix_step4_verb:
            if token.startswith(prefix) and len(token) > 4:
                token = prefix_replace(token, prefix, '\u0627\u0633\u062a')
                self.is_verb = True
                self.is_noun = False
                break
        return token

    def stem(self, word):
        """
        Stem an Arabic word and return the stemmed form.

        :param word: string
        :return: string
        """
        # set initial values
        self.is_verb = True
        self.is_noun = True
        self.is_defined = False

        # Every outcome flag is reset so that no state leaks from the
        # previously stemmed word into this one.
        self.suffixes_verb_step1_success = False
        self.suffix_verb_step2a_success = False
        self.suffix_verb_step2b_success = False
        self.suffix_noun_step2c2_success = False
        self.suffix_noun_step1a_success = False
        self.suffix_noun_step2a_success = False
        self.suffix_noun_step2b_success = False
        self.suffixe_noun_step1b_success = False
        self.prefix_step2a_success = False
        self.prefix_step3a_noun_success = False
        self.prefix_step3b_noun_success = False

        modified_word = word
        # guess type and properties
        # checks1
        self.__checks_1(modified_word)
        # checks2
        self.__checks_2(modified_word)
        modified_word = self.__normalize_pre(modified_word)
        if self.is_verb:
            modified_word = self.__Suffix_Verb_Step1(modified_word)
            if self.suffixes_verb_step1_success:
                modified_word = self.__Suffix_Verb_Step2a(modified_word)
                if not self.suffix_verb_step2a_success:
                    modified_word = self.__Suffix_Verb_Step2c(modified_word)
                # or next
            else:
                modified_word = self.__Suffix_Verb_Step2b(modified_word)
                if not self.suffix_verb_step2b_success:
                    modified_word = self.__Suffix_Verb_Step2a(modified_word)
        if self.is_noun:
            modified_word = self.__Suffix_Noun_Step2c2(modified_word)
            if not self.suffix_noun_step2c2_success:
                if not self.is_defined:
                    modified_word = self.__Suffix_Noun_Step1a(modified_word)
                    # if self.suffix_noun_step1a_success:
                    modified_word = self.__Suffix_Noun_Step2a(modified_word)
                    if not self.suffix_noun_step2a_success:
                        modified_word = self.__Suffix_Noun_Step2b(modified_word)
                    if not self.suffix_noun_step2b_success and not self.suffix_noun_step2a_success:
                        modified_word = self.__Suffix_Noun_Step2c1(modified_word)
                    # or next ? todo : how to deal with or next
                else:
                    modified_word = self.__Suffix_Noun_Step1b(modified_word)
                    if self.suffixe_noun_step1b_success:
                        modified_word = self.__Suffix_Noun_Step2a(modified_word)
                        if not self.suffix_noun_step2a_success:
                            modified_word = self.__Suffix_Noun_Step2b(modified_word)
                        if not self.suffix_noun_step2b_success and not self.suffix_noun_step2a_success:
                            modified_word = self.__Suffix_Noun_Step2c1(modified_word)
                    else:
                        if not self.is_defined:
                            modified_word = self.__Suffix_Noun_Step2a(modified_word)
                        modified_word = self.__Suffix_Noun_Step2b(modified_word)
            modified_word = self.__Suffix_Noun_Step3(modified_word)
        if not self.is_noun and self.is_verb:
            modified_word = self.__Suffix_All_alef_maqsura(modified_word)

        # prefixes
        modified_word = self.__Prefix_Step1(modified_word)
        modified_word = self.__Prefix_Step2a(modified_word)
        if not self.prefix_step2a_success:
            modified_word = self.__Prefix_Step2b(modified_word)
        modified_word = self.__Prefix_Step3a_Noun(modified_word)
        if not self.prefix_step3a_noun_success and self.is_noun:
            modified_word = self.__Prefix_Step3b_Noun(modified_word)
        else:
            if not self.prefix_step3b_noun_success and self.is_verb:
                modified_word = self.__Prefix_Step3_Verb(modified_word)
                modified_word = self.__Prefix_Step4_Verb(modified_word)

        # post normalization stemming
        modified_word = self.__normalize_post(modified_word)
        stemmed_word = modified_word
        return stemmed_word
def prefix_replace(original, old, new):
    """
    Replace the old prefix of the original string with a new prefix.

    The first ``len(old)`` characters of ``original`` are dropped and
    ``new`` is put in their place; the function does not check that
    ``original`` really starts with ``old``.

    :param original: string
    :param old: string
    :param new: string
    :return: string
    """
    remainder = original[len(old):]
    return new + remainder