-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #13 from brucewlee/option_stanza
Adding Stanza option to TTR, Entity Grid
- Loading branch information
Showing
6 changed files
with
192 additions
and
33 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
from TRUNAJOD.entity_grid import EntityGrid | ||
from TRUNAJOD.ttr import lexical_diversity_mtld, one_side_lexical_diversity_mtld | ||
import spacy | ||
import stanza | ||
""" | ||
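NOTE: change the TTR import above to a version of lexical_diversity_mtld that accepts the `model_name` argument; otherwise the stanza call below fails. | ||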
""" | ||
# Load the spaCy Spanish model.
nlp = spacy.load("es_core_news_sm")

# Load the stanza Spanish pipeline, with GPU use disabled.
nlp_s = stanza.Pipeline("es", use_gpu=False)

# Example text. Adjacent string literals are concatenated with no separator,
# so every fragment that is followed by another word must end with a space;
# otherwise two words fuse into one token and skew the metrics below.
example_text = (
    "El espectáculo del cielo nocturno cautiva la mirada y suscita preguntas "
    "sobre el universo, su origen y su funcionamiento. No es sorprendente que "
    "todas las civilizaciones y culturas hayan formado sus propias "
    "cosmologías. Unas relatan, por ejemplo, que el universo ha "
    "sido siempre tal como es, con ciclos que inmutablemente se repiten; "
    "otras explican que este universo ha tenido un principio, "
    "que ha aparecido por obra creadora de una divinidad."
)

# Process the same text with both pipelines.
doc = nlp(example_text)
doc_s = nlp_s(example_text)

# TTR check - change the TTR import to test.
print("spacy result: ", lexical_diversity_mtld(doc))
# or, naming the backend explicitly:
# print("spacy result: ", lexical_diversity_mtld(doc, model_name="spacy"))
print("stanza result: ", lexical_diversity_mtld(doc_s, model_name="stanza"))

# Entity grid check: build one grid per backend and print both.
egrid = EntityGrid(doc)
egrid_s = EntityGrid(doc_s, model_name="stanza")

print("spacy Entity grid:")
print(egrid.get_egrid())

print("stanza Entity grid:")
print(egrid_s.get_egrid())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
#!/usr/bin/env python | ||
"""Type Token Ratios module. | ||
Type token ratios (TTR) are a measurement of lexical diversity. They are | ||
defined as the ratio of unique tokens divided by the total number of tokens. | ||
This measurement is bounded between 0 and 1. If there is no repetition in | ||
the text this measurement is 1, and if there is infinite repetition, it will | ||
tend to 0. This measurement is not recommended if analyzing texts of different | ||
lengths, as when the number of tokens increases, the TTR tends to flatten. | ||
""" | ||
from TRUNAJOD.utils import is_word | ||
|
||
|
||
def type_token_ratio(word_list):
    """Return the Type Token Ratio (TTR) of a word list.

    The TTR is the number of distinct words divided by the total number of
    words. An empty list has no defined ratio; ``0.0`` is returned in that
    case instead of raising ``ZeroDivisionError``.

    :param word_list: List of words
    :type word_list: List of strings
    :return: TTR of the word list, or 0.0 if the list is empty
    :rtype: float
    """
    total = len(word_list)
    if total == 0:
        # Avoid dividing by zero on empty input.
        return 0.0
    return len(set(word_list)) / total
|
||
|
||
def lexical_diversity_mtld(doc, ttr_segment=0.72):
    """Compute bi-directional MTLD lexical diversity.

    The lemmas of the tokens whose part-of-speech tag is accepted by
    ``is_word`` are collected, the one-sided MTLD is computed over that list
    read forwards and read backwards, and the two values are averaged.

    :param doc: Processed text
    :type doc: Spacy Doc
    :param ttr_segment: TTR threshold passed to the one-sided computation
    :type ttr_segment: float
    :return: Bi-directional lexical diversity MTLD
    :rtype: float
    """
    lemmas = [token.lemma_ for token in doc if is_word(token.pos_)]
    forward = one_side_lexical_diversity_mtld(lemmas, ttr_segment)
    backward = one_side_lexical_diversity_mtld(lemmas[::-1], ttr_segment)
    return (forward + backward) / 2
|
||
|
||
def one_side_lexical_diversity_mtld(doc, ttr_segment=0.72):
    """Compute lexical diversity per MTLD in a single direction.

    Tokens are consumed in order and lower-cased. A running segment is kept;
    each time its type-token ratio drops below ``ttr_segment`` the segment is
    closed, one full factor is counted and a new segment starts. A trailing
    segment that never dropped below the threshold contributes a partial
    factor proportional to how far its TTR fell from 1 towards the threshold.

    The result is undefined when no factor at all is accumulated (empty
    input, or a text whose only segment has a TTR of exactly 1); ``0.0`` is
    returned in that case instead of raising ``ZeroDivisionError``.

    :param doc: Tokenized text, in the order it should be read
    :type doc: Iterable of strings
    :param ttr_segment: Threshold for TTR mean computation
    :type ttr_segment: float
    :return: MTLD lexical diversity, or 0.0 when it is undefined
    :rtype: float
    """
    factor = 0
    total_words = 0
    non_ttr_segment = 1 - ttr_segment

    # Track the current segment incrementally (distinct words + length) so the
    # TTR does not have to be recomputed from scratch for every token.
    segment_types = set()
    segment_length = 0
    for token in doc:
        segment_types.add(token.lower())
        segment_length += 1
        total_words += 1
        if len(segment_types) / segment_length < ttr_segment:
            # Segment saturated: count a full factor and start a new one.
            segment_types = set()
            segment_length = 0
            factor += 1

    if segment_length:
        # Partial factor for the unfinished trailing segment.
        remainder_ttr = len(segment_types) / segment_length
        factor += 1 - (remainder_ttr - ttr_segment) / non_ttr_segment
        # NOTE(review): one extra word is counted whenever a trailing segment
        # exists; kept as-is to preserve existing results - confirm intent.
        total_words += 1

    if factor == 0:
        # No factor accumulated: the ratio below would divide by zero.
        return 0.0
    return total_words / factor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters