From 172ee434ea7b0947f9117ed494df9fa7f3b454aa Mon Sep 17 00:00:00 2001
From: David Berenstein
Date: Thu, 5 May 2022 21:05:56 +0200
Subject: [PATCH] added faster minilm model to overcome slow inference

---
 README.md                                              | 7 ++++---
 crosslingual_coreference/CrossLingualPredictor.py      | 9 ++++++++-
 crosslingual_coreference/CrossLingualPredictorSpacy.py | 2 +-
 crosslingual_coreference/__init__.py                   | 2 +-
 crosslingual_coreference/examples/test_spacy.py        | 2 +-
 pyproject.toml                                         | 2 +-
 6 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index e310da1..763cfa5 100644
--- a/README.md
+++ b/README.md
@@ -21,8 +21,9 @@ text = (
     " noodles, but they don't even know him."
 )
 
+# choose minilm for speed/memory and info_xlm for accuracy
 predictor = Predictor(
-    language="en_core_web_sm", device=-1, model_name="info_xlm"
+    language="en_core_web_sm", device=-1, model_name="minilm"
 )
 
 print(predictor.predict(text)["resolved_text"])
@@ -44,7 +45,7 @@ from crosslingual_coreference import Predictor
 predictor = Predictor(
     language="en_core_web_sm",
     device=0,
-    model_name="info_xlm",
+    model_name="minilm",
     chunk_size=2500,
     chunk_overlap=2,
 )
@@ -86,6 +87,6 @@ print(doc._.resolved_text)
 # but Many students don't even know Momofuku Ando.
 ```
 ## Available models
-As of now, there are two models available "info_xlm", "xlm_roberta", which scored 77 and 74 on OntoNotes Release 5.0 English data, respectively.
+As of now, there are three models available: "info_xlm", "xlm_roberta" and "minilm", which scored 77, 74 and 74 on OntoNotes Release 5.0 English data, respectively.
 ## More Examples
 ![](https://raw.githubusercontent.com/Pandora-Intelligence/crosslingual-coreference/master/img/example_total.png)
diff --git a/crosslingual_coreference/CrossLingualPredictor.py b/crosslingual_coreference/CrossLingualPredictor.py
index aa165a9..1c21ebf 100644
--- a/crosslingual_coreference/CrossLingualPredictor.py
+++ b/crosslingual_coreference/CrossLingualPredictor.py
@@ -20,6 +20,13 @@
         "f1_score_ontonotes": 77,
         "file_extension": ".tar.gz",
     },
+    "minilm": {
+        "url": (
+            "https://storage.googleapis.com/pandora-intelligence/models/crosslingual-coreference/minilm/model.tar.gz"
+        ),
+        "f1_score_ontonotes": 74,
+        "file_extension": ".tar.gz",
+    },
 }
 
 
@@ -28,7 +35,7 @@ def __init__(
         self,
         language: str,
         device: int = -1,
-        model_name: str = "info_xlm",
+        model_name: str = "minilm",
         chunk_size: Union[int, None] = None,  # determines the # sentences per batch
         chunk_overlap: int = 2,  # determines the # of overlapping sentences per chunk
     ) -> None:
diff --git a/crosslingual_coreference/CrossLingualPredictorSpacy.py b/crosslingual_coreference/CrossLingualPredictorSpacy.py
index 48fc795..5030247 100644
--- a/crosslingual_coreference/CrossLingualPredictorSpacy.py
+++ b/crosslingual_coreference/CrossLingualPredictorSpacy.py
@@ -11,7 +11,7 @@ def __init__(
         self,
         language: str,
         device: int = -1,
-        model_name: str = "info_xlm",
+        model_name: str = "minilm",
         chunk_size: Union[int, None] = None,
         chunk_overlap: int = 2,
     ) -> None:
diff --git a/crosslingual_coreference/__init__.py b/crosslingual_coreference/__init__.py
index e2d3306..4979c82 100644
--- a/crosslingual_coreference/__init__.py
+++ b/crosslingual_coreference/__init__.py
@@ -17,7 +17,7 @@
     "xx_coref",
     default_config={
         "device": -1,
-        "model_name": "info_xlm",
+        "model_name": "minilm",
         "chunk_size": None,
         "chunk_overlap": 2,
     },
diff --git a/crosslingual_coreference/examples/test_spacy.py b/crosslingual_coreference/examples/test_spacy.py
index a2ce557..143b9b2 100644
--- a/crosslingual_coreference/examples/test_spacy.py
+++ b/crosslingual_coreference/examples/test_spacy.py
@@ -6,7 +6,7 @@
 
 nlp = spacy.load("nl_core_news_sm")
 
-nlp.add_pipe("xx_coref", config={"model_name": "xlm_roberta"})
+nlp.add_pipe("xx_coref", config={"model_name": "minilm"})
 
 for doc in nlp.pipe(texts):
     print(doc._.coref_clusters)
diff --git a/pyproject.toml b/pyproject.toml
index 166bfec..3e6047f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "crosslingual-coreference"
-version = "0.2.1"
+version = "0.2.2"
 description = "A multi-lingual approach to AllenNLP CoReference Resolution, along with a wrapper for spaCy."
 authors = ["David Berenstein "]
 license = "MIT"
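For reference, below is a minimal usage sketch of the "minilm" model introduced by this patch. The `Predictor` arguments and the `xx_coref` pipe configuration are taken from the README and `examples/test_spacy.py` changes above; the English spaCy model and the example sentence are illustrative assumptions, not part of the patch.

```python
import spacy

from crosslingual_coreference import Predictor  # importing the package also registers the "xx_coref" factory

# illustrative example text, not from the patch
text = "Anna told her brother that she would call him after work."

# Standalone predictor: "minilm" trades a little accuracy (74 vs 77 F1 on
# OntoNotes 5.0) for lower memory use and faster inference than "info_xlm".
predictor = Predictor(language="en_core_web_sm", device=-1, model_name="minilm")
print(predictor.predict(text)["resolved_text"])

# spaCy pipeline component, mirroring examples/test_spacy.py but with an English model.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("xx_coref", config={"model_name": "minilm"})
doc = nlp(text)
print(doc._.coref_clusters)
print(doc._.resolved_text)
```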