Skip to content

Commit

Permalink
Merge pull request #13 from brucewlee/option_stanza
Browse files Browse the repository at this point in the history
Adding Stanza option to TTR, Entity Grid
  • Loading branch information
dpalmasan authored Apr 3, 2021
2 parents b978e16 + a911578 commit cf655a6
Show file tree
Hide file tree
Showing 6 changed files with 192 additions and 33 deletions.
41 changes: 29 additions & 12 deletions src/TRUNAJOD/entity_grid.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@
sequence and the API currently does not provide any hyper-parameter tuning to
change this.
"""
from TRUNAJOD.utils import SupportedModels

SPACY_UNIVERSAL_NOUN_TAGS = set([u'NOUN', u'PRON', u'PROPN'])
UNIVERSAL_NOUN_TAGS = set([u'NOUN', u'PRON', u'PROPN'])

ordered_transitions = [
u'SS', u'SO', u'SX', u'S-', u'OS', u'OO', u'OX', u'O-', u'XS', u'XO',
Expand Down Expand Up @@ -63,7 +64,7 @@ class EntityGrid(object):
module. It only supports 2-transitions entity grid.
"""

def __init__(self, doc):
def __init__(self, doc, model_name="spacy"):
"""Construct EntityGrid object."""
# Initialization
entity_map = dict()
Expand All @@ -88,9 +89,14 @@ def __init__(self, doc):
u'-X': 0,
u'--': 0
}
# check model
model = SupportedModels(model_name)

# Get number of sentences in the text
n_sent = len(list(doc.sents))
if model == SupportedModels.SPACY:
n_sent = len(list(doc.sents))
elif model == SupportedModels.STANZA:
n_sent = len(list(doc.sentences))

# To get coherence measurements we need at least 2 sentences
if n_sent < 2:
Expand All @@ -99,15 +105,26 @@ def __init__(self, doc):
.format(n_sent))

# For each sentence, get dependencies and its grammatical role
for sent in doc.sents:
for token in sent:
if token.pos_ in SPACY_UNIVERSAL_NOUN_TAGS:
entity_map['s%d' % i].append((token.text.upper(),
token.dep_))
if token.text.upper() not in entity_grid:
entity_grid[token.text.upper()] = [u'-'] * n_sent
i += 1
entity_map['s%d' % i] = []
if model == SupportedModels.SPACY:
for sent in doc.sents:
for token in sent:
if token.pos_ in UNIVERSAL_NOUN_TAGS:
entity_map['s%d' % i].append((token.text.upper(),
token.dep_))
if token.text.upper() not in entity_grid:
entity_grid[token.text.upper()] = [u'-'] * n_sent
i += 1
entity_map['s%d' % i] = []
elif model == SupportedModels.STANZA:
for sent in doc.sentences:
for word in sent.words:
if word.upos in UNIVERSAL_NOUN_TAGS:
entity_map['s%d' % i].append((word.text.upper(),
word.deprel))
if word.text.upper() not in entity_grid:
entity_grid[word.text.upper()] = ['-'] * n_sent
i += 1
entity_map['s%d' % i] = []

# Last iteration will create an extra element, so I remove it.
entity_map.pop('s%d' % i)
Expand Down
62 changes: 44 additions & 18 deletions src/TRUNAJOD/ttr.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@
tend to 0. This measurement is not recommended if analyzing texts of different
lengths, as when the number of tokens increases, the TTR tends to flatten.
"""
from TRUNAJOD.utils import is_word
from TRUNAJOD.utils import SupportedModels,is_word

# dev import
# from src.TRUNAJOD.utils import is_word


def type_token_ratio(word_list):
Expand All @@ -22,27 +25,36 @@ def type_token_ratio(word_list):
return len(set(word_list)) / len(word_list)


def lexical_diversity_mtld(doc, ttr_segment=0.72):
def lexical_diversity_mtld(doc, model_name="spacy", ttr_segment=0.72):
"""Compute MTLD lexical diversity in a bi-directional fashion.
:param doc: Processed text
:type doc: Spacy Doc
:type doc: NLP Doc
:return: Bi-directional lexical diversity MTLD
:rtype: float
"""
# check model
model = SupportedModels(model_name)

word_list = []
for token in doc:
if is_word(token.pos_):
word_list.append(token.lemma_)
return (one_side_lexical_diversity_mtld(word_list, ttr_segment) +
one_side_lexical_diversity_mtld(word_list[::-1], ttr_segment)) / 2
if model == SupportedModels.SPACY:
for token in doc:
if is_word(token.pos_):
word_list.append(token.lemma_)
elif model == SupportedModels.STANZA:
for sent in doc.sentences:
for word in sent.words:
if is_word(word.upos):
word_list.append(word.lemma)
return (one_side_lexical_diversity_mtld(word_list, model, ttr_segment) +
one_side_lexical_diversity_mtld(word_list[::-1], model, ttr_segment)) / 2


def one_side_lexical_diversity_mtld(doc, ttr_segment=0.72):
def one_side_lexical_diversity_mtld(doc, model_name="spacy", ttr_segment=0.72):
"""Lexical diversity per MTLD.
:param doc: Tokenized text
:type doc: Spacy Doc
:type doc: NLP Doc
:param ttr_segment: Threshold for TTR mean computation
:type ttr_segment: float
:return: MLTD lexical diversity
Expand All @@ -52,17 +64,31 @@ def one_side_lexical_diversity_mtld(doc, ttr_segment=0.72):
total_words = 0
non_ttr_segment = 1 - ttr_segment
word_list = []
for token in doc:
word_list.append(token.lower())
total_words += 1
ttr = type_token_ratio(word_list)
if ttr < ttr_segment:
word_list = []
factor += 1

# check model
model = SupportedModels(model_name)

if model == SupportedModels.SPACY or type(doc) == list:
for token in doc:
word_list.append(token.lower())
total_words += 1
ttr = type_token_ratio(word_list)
if ttr < ttr_segment:
word_list = []
factor += 1
elif model == SupportedModels.STANZA:
if type(doc) != list:
for sent in doc.sentences:
for word in sent.words:
word_list.append(word.text.lower())
total_words += 1
ttr = type_token_ratio(word_list)
if ttr < ttr_segment:
word_list = []
factor += 1

if word_list:
factor += 1 - (
type_token_ratio(word_list) - ttr_segment) / non_ttr_segment
total_words += 1

return total_words / factor
5 changes: 5 additions & 0 deletions src/TRUNAJOD/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
#!/usr/bin/env python
"""Utility functions for TRUNAJOD library."""
from enum import Enum


class SupportedModels(str, Enum):
    """Supported NLP backends, keyed by the ``model_name`` string the APIs accept."""

    SPACY = "spacy"  # spaCy ``Doc`` inputs (doc.sents, token.pos_, token.dep_)
    STANZA = "stanza"  # Stanza ``Document`` inputs (doc.sentences, word.upos, word.deprel)

def flatten(list_of_lists):
"""Flatten a list of list.
Expand Down
43 changes: 43 additions & 0 deletions stanza_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""Example: run TRUNAJOD TTR and Entity Grid metrics with spaCy vs. Stanza.

NOTE: the TTR import inside TRUNAJOD.ttr must resolve (see the "dev import"
comment in src/TRUNAJOD/ttr.py) for this script to work, and the
``es_core_news_sm`` spaCy model plus the Spanish Stanza model must be
downloaded beforehand.
"""
import spacy
import stanza

from TRUNAJOD.entity_grid import EntityGrid
from TRUNAJOD.ttr import lexical_diversity_mtld
from TRUNAJOD.ttr import one_side_lexical_diversity_mtld  # kept for experimentation

# Load spaCy model (python -m spacy download es_core_news_sm).
nlp = spacy.load("es_core_news_sm")

# Load Stanza pipeline for Spanish; CPU only so the example runs anywhere.
nlp_s = stanza.Pipeline('es', use_gpu=False)

# Example text. Adjacent string literals are concatenated verbatim, so each
# part must end with a trailing space where words would otherwise fuse.
example_text = (
    "El espectáculo del cielo nocturno cautiva la mirada y suscita preguntas "
    "sobre el universo, su origen y su funcionamiento. No es sorprendente que "
    "todas las civilizaciones y culturas hayan formado sus propias "
    "cosmologías. Unas relatan, por ejemplo, que el universo ha "
    "sido siempre tal como es, con ciclos que inmutablemente se repiten; "
    "otras explican que este universo ha tenido un principio, "
    "que ha aparecido por obra creadora de una divinidad."
)

# Process the same text with both backends.
doc = nlp(example_text)
doc_s = nlp_s(example_text)

# TTR check: model_name defaults to "spacy".
print("spacy result: ", lexical_diversity_mtld(doc))
print("stanza result: ", lexical_diversity_mtld(doc_s, model_name="stanza"))

# Entity Grid check.
egrid = EntityGrid(doc)
egrid_s = EntityGrid(doc_s, model_name="stanza")

print("spacy Entity grid:")
print(egrid.get_egrid())

print("stanza Entity grid:")
print(egrid_s.get_egrid())
68 changes: 68 additions & 0 deletions tester.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!/usr/bin/env python
"""Type Token Ratios module.
Type token ratios (TTR) are a measurement of lexical diversity. They are
defined as the ratio of unique tokens divided by the total number of tokens.
This measurement is bounded between 0 and 1. If there is no repetition in
the text this measurement is 1, and if there is infinite repetition, it will
tend to 0. This measurement is not recommended if analyzing texts of different
lengths, as when the number of tokens increases, the TTR tends to flatten.
"""
from TRUNAJOD.utils import is_word


def type_token_ratio(word_list):
    """Return Type Token Ratio of a word list.

    TTR is the number of unique words divided by the total word count,
    bounded in ``(0, 1]`` for non-empty input.

    :param word_list: List of words
    :type word_list: List of strings
    :return: TTR of the word list, or ``0.0`` for an empty list
    :rtype: float
    """
    # Guard: the original raised ZeroDivisionError on an empty list.
    if not word_list:
        return 0.0
    return len(set(word_list)) / len(word_list)


def lexical_diversity_mtld(doc, ttr_segment=0.72):
    """Compute MTLD lexical diversity in a bi-directional fashion.

    The lemmas of word tokens are collected once, then the one-sided MTLD
    of the forward and reversed sequences are averaged so the measure does
    not depend on reading direction.

    :param doc: Processed text
    :type doc: Spacy Doc
    :param ttr_segment: TTR threshold forwarded to the one-sided MTLD
    :type ttr_segment: float
    :return: Bi-directional lexical diversity MTLD
    :rtype: float
    """
    lemmas = [token.lemma_ for token in doc if is_word(token.pos_)]
    forward = one_side_lexical_diversity_mtld(lemmas, ttr_segment)
    backward = one_side_lexical_diversity_mtld(lemmas[::-1], ttr_segment)
    return (forward + backward) / 2


def one_side_lexical_diversity_mtld(doc, ttr_segment=0.72):
    """Lexical diversity per MTLD.

    Walks the tokens, closing a "factor" each time the running TTR of the
    current segment drops below ``ttr_segment``; leftover words at the end
    contribute a fractional factor. MTLD is total words over factor count.

    :param doc: Tokenized text
    :type doc: Spacy Doc
    :param ttr_segment: Threshold for TTR mean computation
    :type ttr_segment: float
    :return: MLTD lexical diversity, ``0.0`` for empty input
    :rtype: float
    """
    # Guard: empty input would otherwise compute 0 / 0 below.
    if not doc:
        return 0.0
    factor = 0
    total_words = 0
    non_ttr_segment = 1 - ttr_segment
    word_list = []
    for token in doc:
        word_list.append(token.lower())
        total_words += 1
        ttr = type_token_ratio(word_list)
        if ttr < ttr_segment:
            # Segment's diversity exhausted: close this factor, restart.
            word_list = []
            factor += 1

    if word_list:
        # Partial trailing segment counts proportionally to how far its
        # TTR has fallen toward the threshold.
        # NOTE(review): a fully-unique text never closes a factor, so
        # this still yields factor == 0 and ZeroDivisionError at the
        # return (original behavior) — TODO confirm intended.
        factor += 1 - (
            type_token_ratio(word_list) - ttr_segment) / non_ttr_segment
        total_words += 1

    return total_words / factor
6 changes: 3 additions & 3 deletions tests/ttr_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@
def test_type_token_ratio():
"""Test type_token_ratio func."""
assert ttr.type_token_ratio(
['hola', 'hola', 'chao', 'hola', 'perro', 'hola'], ) == 0.5
['hola', 'hola', 'chao', 'hola', 'perro', 'hola']) == 0.5


def test_one_side_lexical_diversity_mtld():
"""Test one_side_lexical_diversity_mtld."""
assert ttr.one_side_lexical_diversity_mtld(
['hola', 'hola', 'chao', 'hola', 'perro', 'hola'], 1) == 3
['hola', 'hola', 'chao', 'hola', 'perro', 'hola'], ttr_segment=1) == 3


def test_lexical_diversity_mtld():
Expand All @@ -27,4 +27,4 @@ def test_lexical_diversity_mtld():
Token('perro', 'perro'),
Token('hola', 'hola'),
]
assert ttr.lexical_diversity_mtld(doc, 1) == 3
assert ttr.lexical_diversity_mtld(doc, ttr_segment=1) == 3

0 comments on commit cf655a6

Please sign in to comment.