Commit 30ec53f: Merge branch 'release/0.20'

husnusensoy committed Apr 16, 2021
2 parents 5be2a60 + cfc1f44
Showing 58 changed files with 2,144 additions and 176 deletions.
README.md (8 changes: 5 additions & 3 deletions)
@@ -9,7 +9,7 @@ Development of the library started as a part of [Açık Kaynak Hackathon Program
We keep adding features to become a general purpose open source NLP library for the Turkish language.


-💫 **Version 0.19 out now!**
+💫 **Version 0.20 out now!**
[Check out the release notes here.](https://github.com/GlobalMaksimum/sadedegel/releases)


@@ -40,8 +40,10 @@ We keep adding lots to become a general purpose open source NLP library for Turk
The SadedeGel project is initialized by [@globalmaksimum](https://github.com/GlobalMaksimum) AI team members
[@dafajon](https://github.com/dafajon),
[@askarbozcan](https://github.com/askarbozcan),
-[@mccakir](https://github.com/mccakir) and
-[@husnusensoy](https://github.com/husnusensoy).
+[@mccakir](https://github.com/mccakir),
+[@husnusensoy](https://github.com/husnusensoy) and
+[@ertugruldemir](https://github.com/ertugrul-dmr).


Other community maintainers

prod.requirements.txt (2 changes: 1 addition & 1 deletion)
@@ -1,7 +1,7 @@
loguru>=0.5.1
click>=7.1.2

-smart-open>=2.1.0
+smart-open==2.0.0

uvicorn>=0.11.8
fastapi>=0.61.0
sadedegel/about.py (2 changes: 1 addition & 1 deletion)
@@ -1,5 +1,5 @@
__title__ = "sadedegel" # pragma: no cover
__version__ = "0.19.2" # pragma: no cover
__version__ = "0.20" # pragma: no cover
__release__ = True # pragma: no cover
__download_url__ = "https://github.com/globalmaksimum/sadedegel/releases" # pragma: no cover
__herokuapp_url__ = "https://sadedegel.herokuapp.com" # pragma: no cover
sadedegel/bblock/doc.py (53 changes: 24 additions & 29 deletions)
@@ -173,9 +173,9 @@ def raw_tf(self, drop_stopwords=False, lowercase=False, drop_suffix=False, drop_
v = np.zeros(self.vocabulary.size_cs)

if lowercase:
-tokens = [tr_lower(t) for t in self.tokens]
+tokens = [t.lower_ for t in self.tokens]
else:
-tokens = self.tokens
+tokens = [t.word for t in self.tokens]

counter = Counter(tokens)
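The raw_tf hunk above swaps string-level tr_lower calls for the Token object's precomputed lower_ attribute, so casing is resolved once per token rather than on every count. A minimal self-contained sketch of the counting logic; the Token stand-in below is illustrative, not sadedegel's class:

```python
from collections import Counter


class Token:
    """Illustrative stand-in: caches both casings once, up front."""

    def __init__(self, word: str):
        self.word = word
        self.lower_ = word.lower()  # sadedegel computes this with tr_lower


def raw_tf(tokens, lowercase: bool = False) -> Counter:
    # Mirrors the hunk above: pick the precomputed casing, then count.
    return Counter(t.lower_ if lowercase else t.word for t in tokens)


print(raw_tf([Token("Gol"), Token("gol")], lowercase=True))  # Counter({'gol': 2})
```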

@@ -304,7 +304,6 @@ def __init__(self, id_: int, text: str, doc, config: dict = {}):
self.id = id_
self.text = text

-self._tokens = None
self.document = doc
self.config = doc.builder.config
self._bert = None
@@ -332,7 +331,7 @@ def __init__(self, id_: int, text: str, doc, config: dict = {}):
f"Unknown term frequency method {self.tf_method}. Choose on of {','.join(TF_METHOD_VALUES)}")

@property
-def avgdl(self) -> int:
+def avgdl(self) -> float:
"""Average number of tokens per sentence"""
return self.config['default'].getfloat('avg_sentence_length')

@@ -360,21 +359,18 @@ def bert(self, bert):
def input_ids(self):
return self.tokenizer.convert_tokens_to_ids(self.tokens_with_special_symbols)

-@property
-def tokens(self):
-if self._tokens is None:
-self._tokens = self.tokenizer(self.text)
-
-return self._tokens
+@cached_property
+def tokens(self) -> List[Token]:
+return [t for t in self.tokenizer(self.text)]

@property
def tokens_with_special_symbols(self):
-return ['[CLS]'] + self.tokens + ['[SEP]']
+return [Token('[CLS]')] + self.tokens + [Token('[SEP]')]

-def rouge1(self, metric):
+def rouge1(self, metric) -> float:
return rouge1_score(
-flatten([[tr_lower(token) for token in sent.tokens] for sent in self.document if sent.id != self.id]),
-[tr_lower(t) for t in self.tokens], metric)
+flatten([[t.lower_ for t in sent] for sent in self.document if sent.id != self.id]),
+[t.lower_ for t in self], metric)

@property
def bm25(self) -> np.float32:
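Sentence.tokens moves from a hand-rolled `_tokens is None` guard to cached_property: the token list is built on first access, memoized on the instance, and iteration simply reuses it. A sketch of the pattern using a stdlib stand-in (this commit imports the cached_property package; functools has an equivalent since Python 3.8); the toy tokenizer is illustrative only:

```python
from functools import cached_property  # the repo uses the cached_property package


class Sentence:
    def __init__(self, text: str, tokenizer=str.split):
        self.text = text
        self.tokenizer = tokenizer

    @cached_property
    def tokens(self):
        print("tokenizing...")            # runs only on first access
        return self.tokenizer(self.text)

    def __iter__(self):
        yield from self.tokens            # reuses the cached list


s = Sentence("bu bir deneme")
_ = s.tokens   # prints "tokenizing..."
_ = s.tokens   # cache hit, no print
```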
@@ -421,8 +417,7 @@ def tf(self):
def idf(self):
v = np.zeros(len(self.vocabulary))

-for token in self.tokens:
-t = self.vocabulary[token]
+for t in self.tokens:
if not t.is_oov:
v[t.id] = t.idf

@@ -441,11 +436,10 @@ def __eq__(self, s: str):
return self.text == s # no need for type checking, will return false for non-strings

def __getitem__(self, token_ix):
-return Token(self.tokens[token_ix])
+return self.tokens[token_ix]

def __iter__(self):
-for t in self.tokens:
-yield Token(t)
+yield from self.tokens


class Document(TFImpl, IDFImpl, BM25Impl):
@@ -462,19 +456,18 @@ def __init__(self, raw, builder):
self.config = self.builder.config

@property
-def avgdl(self) -> int:
+def avgdl(self) -> float:
"""Average number of tokens per document"""
return self.config['default'].getfloat('avg_document_length')

-@property
-def tokens(self):
-if self._tokens is None:
-self._tokens = []
-for s in self:
-for t in s.tokens:
-self._tokens.append(t)
+@cached_property
+def tokens(self) -> List[Token]:
+tokens = []
+for s in self:
+for t in s.tokens:
+tokens.append(t)

-return self._tokens
+return tokens

@property
def vocabulary(self):
@@ -609,7 +602,9 @@ def __init__(self, **kwargs):

tokenizer_str = normalize_tokenizer_name(self.config['default']['tokenizer'])

-self.tokenizer = WordTokenizer.factory(tokenizer_str)
+self.tokenizer = WordTokenizer.factory(tokenizer_str, emoji=self.config['tokenizer'].getboolean('emoji'),
+hashtag=self.config['tokenizer'].getboolean('hashtag'),
+mention=self.config['tokenizer'].getboolean('mention'))

Token.set_vocabulary(self.tokenizer.vocabulary)
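WordTokenizer.factory is now driven by three boolean switches read from the [tokenizer] section of the configuration. A hedged sketch of the getboolean wiring; make_tokenizer below is an illustrative stand-in, not sadedegel's factory, and the "bert" name is only an example:

```python
from configparser import ConfigParser

config = ConfigParser()
config.read_string("""
[tokenizer]
emoji = true
hashtag = true
mention = false
""")


def make_tokenizer(name: str, emoji: bool, hashtag: bool, mention: bool) -> dict:
    # Stand-in for WordTokenizer.factory: just echoes the resolved options.
    return dict(name=name, emoji=emoji, hashtag=hashtag, mention=mention)


tokenizer = make_tokenizer(
    "bert",
    emoji=config["tokenizer"].getboolean("emoji"),
    hashtag=config["tokenizer"].getboolean("hashtag"),
    mention=config["tokenizer"].getboolean("mention"),
)
print(tokenizer)  # {'name': 'bert', 'emoji': True, 'hashtag': True, 'mention': False}
```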

File renamed without changes.
sadedegel/bblock/token.py (57 changes: 45 additions & 12 deletions)
@@ -1,12 +1,14 @@
import unicodedata

-from .vocabulary import Vocabulary
from math import log

import numpy as np
+from cached_property import cached_property

from .util import tr_lower, load_stopwords, deprecate, ConfigNotSet, VocabularyIsNotSet, WordVectorNotFound
+from .vocabulary import Vocabulary

-IDF_SMOOTH, IDF_PROBABILISTIC = "smooth", "probabilistic"
-IDF_METHOD_VALUES = [IDF_SMOOTH, IDF_PROBABILISTIC]
+IDF_SMOOTH, IDF_PROBABILISTIC, IDF_UNARY = "smooth", "probabilistic", "unary"
+IDF_METHOD_VALUES = [IDF_SMOOTH, IDF_PROBABILISTIC, IDF_UNARY]


class IDFImpl:
@@ -24,25 +26,23 @@ def get_idf(self, method=IDF_SMOOTH, drop_stopwords=False, lowercase=False, drop
else:
v = np.zeros(self.vocabulary.size_cs)

-if lowercase:
-tokens = [tr_lower(t) for t in self.tokens]
-else:
-tokens = self.tokens
-
-for token in tokens:
-t = Token(token)
+for t in self.tokens:
if t.is_oov or (drop_stopwords and t.is_stopword) or (drop_suffix and t.is_suffix) or (
drop_punct and t.is_punct):
continue

if lowercase:
if method == IDF_SMOOTH:
v[t.id] = t.smooth_idf
+elif method == IDF_UNARY:
+v[t.id] = t.unary_idf
else:
v[t.id] = t.prob_idf
else:
if method == IDF_SMOOTH:
v[t.id_cs] = t.smooth_idf_cs
+elif method == IDF_UNARY:
+v[t.id_cs] = t.unary_idf_cs
else:
v[t.id_cs] = t.prob_idf_cs
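The get_idf rewrite iterates Token objects directly and adds a third weighting scheme, unary, next to smooth and probabilistic: a term scores 1 if it occurs in at least one document and 0 otherwise. A self-contained comparison using only the formulas visible in this diff; the document count N is an arbitrary example value:

```python
from math import log

N = 1000  # example document count


def smooth_idf(df: int) -> float:
    # same form as smooth_idf_cs further down: log(N / (1 + df)) + 1
    return log(N / (1 + df)) + 1


def unary_idf(df: int) -> int:
    # new in 0.20: presence/absence only
    return int(df > 0)


for df in (0, 1, 10, 100):
    print(f"df={df:>3}  smooth={smooth_idf(df):6.3f}  unary={unary_idf(df)}")
```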

@@ -101,7 +101,9 @@ def _create_token(cls, word: str):
token.is_punct = all(unicodedata.category(c).startswith("P") for c in token.word)
token.is_digit = token.word.isdigit()
token.is_suffix = token.word.startswith('##')
-token.shape = word_shape(token.word)
+token.is_emoji = False
+token.is_hashtag = False
+token.is_mention = False

return token

@@ -112,6 +114,17 @@ def __new__(cls, word: str):

return cls.cache[word]

+def __len__(self):
+return len(self.word)
+
+def __eq__(self, other):
+if type(other) == str:
+return self.word == other
+elif type(other) == Token:
+return self.word == other.word
+else:
+raise TypeError(f"Unknown comparison type with Token {type(other)}")

@classmethod
def set_vocabulary(cls, vocab: Vocabulary):
Token.vocabulary = vocab
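Token gains value semantics here: len() delegates to the underlying word, and equality works against both plain strings and other Tokens, raising TypeError for anything else so a comparison against an unrelated type fails loudly instead of silently returning False. A minimal sketch of the same dunder pattern:

```python
class Token:
    def __init__(self, word: str):
        self.word = word

    def __len__(self):
        return len(self.word)

    def __eq__(self, other):
        if type(other) == str:
            return self.word == other
        elif type(other) == Token:
            return self.word == other.word
        raise TypeError(f"Unknown comparison type with Token {type(other)}")


assert Token("merhaba") == "merhaba"
assert Token("merhaba") == Token("merhaba")
assert len(Token("merhaba")) == 7
```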
@@ -134,6 +147,8 @@ def idf(self):
else:
if Token.config['idf']['method'] == IDF_SMOOTH:
return self.smooth_idf
+elif Token.config['idf']['method'] == IDF_UNARY:
+return self.unary_idf
else:
return self.prob_idf

@@ -151,6 +166,20 @@ def smooth_idf_cs(self):
else:
return log(self.vocabulary.document_count / (1 + self.df_cs)) + 1

+@property
+def unary_idf(self):
+if Token.vocabulary is None:
+raise VocabularyIsNotSet("First run set_vocabulary")
+else:
+return int(self.df > 0)
+
+@property
+def unary_idf_cs(self):
+if Token.vocabulary is None:
+raise VocabularyIsNotSet("First run set_vocabulary")
+else:
+return int(self.df_cs > 0)

@property
def prob_idf(self) -> float:
if Token.vocabulary is None:
@@ -215,6 +244,10 @@ def vector(self) -> np.ndarray:
else:
raise WordVectorNotFound(self.word)

+@cached_property
+def shape(self) -> str:
+return word_shape(self.word)

def __str__(self):
return self.word

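The shape feature moves from an eager assignment in _create_token to a lazy cached_property, so it is computed only for tokens whose shape is actually requested, then memoized. sadedegel's word_shape is defined elsewhere in bblock; the version below is an illustrative approximation only:

```python
import re
from functools import cached_property


def word_shape(text: str) -> str:
    # Illustrative approximation; the library's own word_shape may differ.
    shape = re.sub(r"[A-ZÇĞİÖŞÜ]", "X", text)
    shape = re.sub(r"[a-zçğıöşü]", "x", shape)
    return re.sub(r"[0-9]", "d", shape)


class Token:
    def __init__(self, word: str):
        self.word = word

    @cached_property
    def shape(self) -> str:
        return word_shape(self.word)  # computed once, on first access


print(Token("Beşiktaş'03").shape)  # Xxxxxxxx'dd
```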
sadedegel/bblock/util.py (12 changes: 8 additions & 4 deletions)
@@ -20,7 +20,10 @@


def tr_lower(s: str) -> str:
-return s.replace("I", "ı").replace("İ", "i").lower()
+if "I" in s or "İ" in s:
+return s.replace("I", "ı").replace("İ", "i").lower()
+else:
+return s.lower()


def tr_upper(s: str) -> str:
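Plain str.lower maps "I" to "i", which is wrong for Turkish: dotted İ lowercases to i while dotless I lowercases to ı. The change keeps the two replace calls but adds a fast path that skips them when neither capital occurs in the string. For example:

```python
def tr_lower(s: str) -> str:
    if "I" in s or "İ" in s:
        return s.replace("I", "ı").replace("İ", "i").lower()
    else:
        return s.lower()


print("ISPARTA".lower())     # isparta  (wrong: Turkish I should become dotless ı)
print(tr_lower("ISPARTA"))   # ısparta
print(tr_lower("İstanbul"))  # istanbul
```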
@@ -161,14 +164,15 @@ def load_stopwords(base_path=None):
return stopwords


-def deprecate(message: str, eol_version: tuple):
+def deprecate(message: str, eol_version: tuple, post_message: str = None):
current = tuple([int(v) for v in __version__.split('.')])

if current >= eol_version:
console.print(f"[red]{message}[/red]")
console.print(f"[red]{message}[/red]. {post_message}")
sys.exit(1)
else:
console.print(f"[magenta]{message}[/magenta], will be dropped by {'.'.join(map(str, eol_version))}")
console.print(
f"{message}, will be [magenta]dropped[/magenta] by {'.'.join(map(str, eol_version))}. {post_message}")


class ConfigNotSet(Exception):
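deprecate gains an optional post_message that is appended both to the warning and to the hard-failure message printed once the running version reaches eol_version, at which point the process exits with status 1. A minimal sketch of the control flow with plain print in place of the rich console, guarding the suffix when no post_message is given; the deprecation text is illustrative:

```python
import sys

__version__ = "0.20"


def deprecate(message: str, eol_version: tuple, post_message: str = None):
    current = tuple(int(v) for v in __version__.split("."))
    suffix = f" {post_message}" if post_message else ""

    if current >= eol_version:
        print(f"{message}.{suffix}")
        sys.exit(1)  # feature is past end-of-life: refuse to continue

    print(f"{message}, will be dropped by "
          f"{'.'.join(map(str, eol_version))}.{suffix}")


deprecate("Keyword 'config' is deprecated", (0, 21),
          post_message="Use builder-level configuration instead.")
```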