Skip to content

Commit

Permalink
Merge pull request #13 from brucewlee/option_stanza
Browse files Browse the repository at this point in the history
Adding Stanza option to TTR, Entity Grid
  • Loading branch information
dpalmasan authored Apr 3, 2021
2 parents b978e16 + a911578 commit cf655a6
Show file tree
Hide file tree
Showing 6 changed files with 192 additions and 33 deletions.
41 changes: 29 additions & 12 deletions src/TRUNAJOD/entity_grid.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@
sequence and the API currently does not provide any hyper-parameter tuning to
change this.
"""
from TRUNAJOD.utils import SupportedModels

SPACY_UNIVERSAL_NOUN_TAGS = set([u'NOUN', u'PRON', u'PROPN'])
UNIVERSAL_NOUN_TAGS = set([u'NOUN', u'PRON', u'PROPN'])

ordered_transitions = [
u'SS', u'SO', u'SX', u'S-', u'OS', u'OO', u'OX', u'O-', u'XS', u'XO',
Expand Down Expand Up @@ -63,7 +64,7 @@ class EntityGrid(object):
module. It only supports 2-transitions entity grid.
"""

def __init__(self, doc):
def __init__(self, doc, model_name="spacy"):
"""Construct EntityGrid object."""
# Initialization
entity_map = dict()
Expand All @@ -88,9 +89,14 @@ def __init__(self, doc):
u'-X': 0,
u'--': 0
}
# check model
model = SupportedModels(model_name)

# Get number of sentences in the text
n_sent = len(list(doc.sents))
if model == SupportedModels.SPACY:
n_sent = len(list(doc.sents))
elif model == SupportedModels.STANZA:
n_sent = len(list(doc.sentences))

# To get coherence measurements we need at least 2 sentences
if n_sent < 2:
Expand All @@ -99,15 +105,26 @@ def __init__(self, doc):
.format(n_sent))

# For each sentence, get dependencies and its grammatical role
for sent in doc.sents:
for token in sent:
if token.pos_ in SPACY_UNIVERSAL_NOUN_TAGS:
entity_map['s%d' % i].append((token.text.upper(),
token.dep_))
if token.text.upper() not in entity_grid:
entity_grid[token.text.upper()] = [u'-'] * n_sent
i += 1
entity_map['s%d' % i] = []
if model == SupportedModels.SPACY:
for sent in doc.sents:
for token in sent:
if token.pos_ in UNIVERSAL_NOUN_TAGS:
entity_map['s%d' % i].append((token.text.upper(),
token.dep_))
if token.text.upper() not in entity_grid:
entity_grid[token.text.upper()] = [u'-'] * n_sent
i += 1
entity_map['s%d' % i] = []
elif model == SupportedModels.STANZA:
for sent in doc.sentences:
for word in sent.words:
if word.upos in UNIVERSAL_NOUN_TAGS:
entity_map['s%d' % i].append((word.text.upper(),
word.deprel))
if word.text.upper() not in entity_grid:
entity_grid[word.text.upper()] = ['-'] * n_sent
i += 1
entity_map['s%d' % i] = []

# Last iteration will create an extra element, so I remove it.
entity_map.pop('s%d' % i)
Expand Down
62 changes: 44 additions & 18 deletions src/TRUNAJOD/ttr.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@
tend to 0. This measurement is not recommended if analyzing texts of different
lengths, as when the number of tokens increases, the TTR tends to flatten.
"""
from TRUNAJOD.utils import is_word
from TRUNAJOD.utils import SupportedModels,is_word

# dev import
# from src.TRUNAJOD.utils import is_word


def type_token_ratio(word_list):
Expand All @@ -22,27 +25,36 @@ def type_token_ratio(word_list):
return len(set(word_list)) / len(word_list)


def lexical_diversity_mtld(doc, ttr_segment=0.72):
def lexical_diversity_mtld(doc, model_name="spacy", ttr_segment=0.72):
"""Compute MTLD lexical diversity in a bi-directional fashion.
:param doc: Processed text
:type doc: Spacy Doc
:type doc: NLP Doc
:return: Bi-directional lexical diversity MTLD
:rtype: float
"""
# check model
model = SupportedModels(model_name)

word_list = []
for token in doc:
if is_word(token.pos_):
word_list.append(token.lemma_)
return (one_side_lexical_diversity_mtld(word_list, ttr_segment) +
one_side_lexical_diversity_mtld(word_list[::-1], ttr_segment)) / 2
if model == SupportedModels.SPACY:
for token in doc:
if is_word(token.pos_):
word_list.append(token.lemma_)
elif model == SupportedModels.STANZA:
for sent in doc.sentences:
for word in sent.words:
if is_word(word.upos):
word_list.append(word.lemma)
return (one_side_lexical_diversity_mtld(word_list, model, ttr_segment) +
one_side_lexical_diversity_mtld(word_list[::-1], model, ttr_segment)) / 2


def one_side_lexical_diversity_mtld(doc, ttr_segment=0.72):
def one_side_lexical_diversity_mtld(doc, model_name="spacy", ttr_segment=0.72):
"""Lexical diversity per MTLD.
:param doc: Tokenized text
:type doc: Spacy Doc
:type doc: NLP Doc
:param ttr_segment: Threshold for TTR mean computation
:type ttr_segment: float
:return: MLTD lexical diversity
Expand All @@ -52,17 +64,31 @@ def one_side_lexical_diversity_mtld(doc, ttr_segment=0.72):
total_words = 0
non_ttr_segment = 1 - ttr_segment
word_list = []
for token in doc:
word_list.append(token.lower())
total_words += 1
ttr = type_token_ratio(word_list)
if ttr < ttr_segment:
word_list = []
factor += 1

# check model
model = SupportedModels(model_name)

if model == SupportedModels.SPACY or type(doc) == list:
for token in doc:
word_list.append(token.lower())
total_words += 1
ttr = type_token_ratio(word_list)
if ttr < ttr_segment:
word_list = []
factor += 1
elif model == SupportedModels.STANZA:
if type(doc) != list:
for sent in doc.sentences:
for word in sent.words:
word_list.append(word.text.lower())
total_words += 1
ttr = type_token_ratio(word_list)
if ttr < ttr_segment:
word_list = []
factor += 1

if word_list:
factor += 1 - (
type_token_ratio(word_list) - ttr_segment) / non_ttr_segment
total_words += 1

return total_words / factor
5 changes: 5 additions & 0 deletions src/TRUNAJOD/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
#!/usr/bin/env python
"""Utility functions for TRUNAJOD library."""
from enum import Enum


class SupportedModels(str, Enum):
    """Supported NLP backends, keyed by the ``model_name`` string the APIs accept."""

    SPACY = "spacy"  # spaCy ``Doc`` inputs (doc.sents, token.pos_, token.dep_)
    STANZA = "stanza"  # Stanza ``Document`` inputs (doc.sentences, word.upos, word.deprel)

def flatten(list_of_lists):
"""Flatten a list of list.
Expand Down
43 changes: 43 additions & 0 deletions stanza_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""Example: run TRUNAJOD TTR and Entity Grid metrics with spaCy vs. Stanza.

NOTE: the TTR import inside TRUNAJOD.ttr must resolve (see the "dev import"
comment in src/TRUNAJOD/ttr.py) for this script to work, and the
``es_core_news_sm`` spaCy model plus the Spanish Stanza model must be
downloaded beforehand.
"""
import spacy
import stanza

from TRUNAJOD.entity_grid import EntityGrid
from TRUNAJOD.ttr import lexical_diversity_mtld
from TRUNAJOD.ttr import one_side_lexical_diversity_mtld  # kept for experimentation

# Load spaCy model (python -m spacy download es_core_news_sm).
nlp = spacy.load("es_core_news_sm")

# Load Stanza pipeline for Spanish; CPU only so the example runs anywhere.
nlp_s = stanza.Pipeline('es', use_gpu=False)

# Example text. Adjacent string literals are concatenated verbatim, so each
# part must end with a trailing space where words would otherwise fuse.
example_text = (
    "El espectáculo del cielo nocturno cautiva la mirada y suscita preguntas "
    "sobre el universo, su origen y su funcionamiento. No es sorprendente que "
    "todas las civilizaciones y culturas hayan formado sus propias "
    "cosmologías. Unas relatan, por ejemplo, que el universo ha "
    "sido siempre tal como es, con ciclos que inmutablemente se repiten; "
    "otras explican que este universo ha tenido un principio, "
    "que ha aparecido por obra creadora de una divinidad."
)

# Process the same text with both backends.
doc = nlp(example_text)
doc_s = nlp_s(example_text)

# TTR check: model_name defaults to "spacy".
print("spacy result: ", lexical_diversity_mtld(doc))
print("stanza result: ", lexical_diversity_mtld(doc_s, model_name="stanza"))

# Entity Grid check.
egrid = EntityGrid(doc)
egrid_s = EntityGrid(doc_s, model_name="stanza")

print("spacy Entity grid:")
print(egrid.get_egrid())

print("stanza Entity grid:")
print(egrid_s.get_egrid())
68 changes: 68 additions & 0 deletions tester.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!/usr/bin/env python
"""Type Token Ratios module.
Type token ratios (TTR) are a measurement of lexical diversity. They are
defined as the ratio of unique tokens divided by the total number of tokens.
This measurement is bounded between 0 and 1. If there is no repetition in
the text this measurement is 1, and if there is infinite repetition, it will
tend to 0. This measurement is not recommended if analyzing texts of different
lengths, as when the number of tokens increases, the TTR tends to flatten.
"""
from TRUNAJOD.utils import is_word


def type_token_ratio(word_list):
    """Return Type Token Ratio of a word list.

    TTR is the number of unique words divided by the total word count,
    bounded in ``(0, 1]`` for non-empty input.

    :param word_list: List of words
    :type word_list: List of strings
    :return: TTR of the word list, or ``0.0`` for an empty list
    :rtype: float
    """
    # Guard: the original raised ZeroDivisionError on an empty list.
    if not word_list:
        return 0.0
    return len(set(word_list)) / len(word_list)


def lexical_diversity_mtld(doc, ttr_segment=0.72):
    """Compute MTLD lexical diversity in a bi-directional fashion.

    The lemmas of word tokens are collected once, then the one-sided MTLD
    of the forward and reversed sequences are averaged so the measure does
    not depend on reading direction.

    :param doc: Processed text
    :type doc: Spacy Doc
    :param ttr_segment: TTR threshold forwarded to the one-sided MTLD
    :type ttr_segment: float
    :return: Bi-directional lexical diversity MTLD
    :rtype: float
    """
    lemmas = [token.lemma_ for token in doc if is_word(token.pos_)]
    forward = one_side_lexical_diversity_mtld(lemmas, ttr_segment)
    backward = one_side_lexical_diversity_mtld(lemmas[::-1], ttr_segment)
    return (forward + backward) / 2


def one_side_lexical_diversity_mtld(doc, ttr_segment=0.72):
    """Lexical diversity per MTLD.

    Walks the tokens, closing a "factor" each time the running TTR of the
    current segment drops below ``ttr_segment``; leftover words at the end
    contribute a fractional factor. MTLD is total words over factor count.

    :param doc: Tokenized text
    :type doc: Spacy Doc
    :param ttr_segment: Threshold for TTR mean computation
    :type ttr_segment: float
    :return: MLTD lexical diversity, ``0.0`` for empty input
    :rtype: float
    """
    # Guard: empty input would otherwise compute 0 / 0 below.
    if not doc:
        return 0.0
    factor = 0
    total_words = 0
    non_ttr_segment = 1 - ttr_segment
    word_list = []
    for token in doc:
        word_list.append(token.lower())
        total_words += 1
        ttr = type_token_ratio(word_list)
        if ttr < ttr_segment:
            # Segment's diversity exhausted: close this factor, restart.
            word_list = []
            factor += 1

    if word_list:
        # Partial trailing segment counts proportionally to how far its
        # TTR has fallen toward the threshold.
        # NOTE(review): a fully-unique text never closes a factor, so
        # this still yields factor == 0 and ZeroDivisionError at the
        # return (original behavior) — TODO confirm intended.
        factor += 1 - (
            type_token_ratio(word_list) - ttr_segment) / non_ttr_segment
        total_words += 1

    return total_words / factor
6 changes: 3 additions & 3 deletions tests/ttr_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@
def test_type_token_ratio():
"""Test type_token_ratio func."""
assert ttr.type_token_ratio(
['hola', 'hola', 'chao', 'hola', 'perro', 'hola'], ) == 0.5
['hola', 'hola', 'chao', 'hola', 'perro', 'hola']) == 0.5


def test_one_side_lexical_diversity_mtld():
"""Test one_side_lexical_diversity_mtld."""
assert ttr.one_side_lexical_diversity_mtld(
['hola', 'hola', 'chao', 'hola', 'perro', 'hola'], 1) == 3
['hola', 'hola', 'chao', 'hola', 'perro', 'hola'], ttr_segment=1) == 3


def test_lexical_diversity_mtld():
Expand All @@ -27,4 +27,4 @@ def test_lexical_diversity_mtld():
Token('perro', 'perro'),
Token('hola', 'hola'),
]
assert ttr.lexical_diversity_mtld(doc, 1) == 3
assert ttr.lexical_diversity_mtld(doc, ttr_segment=1) == 3

0 comments on commit cf655a6

Please sign in to comment.