added faster minilm model to overcome slow

davidberenstein1957 · May 5, 2022 · 172ee43 · 172ee43
1 parent 7230d5f
commit 172ee43
Show file tree

Hide file tree

Showing 6 changed files with 16 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -21,8 +21,9 @@ text = (
     " noodles, but they don't even know him."
 )
 
+# choose minilm for speed/memory and info_xlm for accuracy
 predictor = Predictor(
-    language="en_core_web_sm", device=-1, model_name="info_xlm"
+    language="en_core_web_sm", device=-1, model_name="minilm"
 )
 
 print(predictor.predict(text)["resolved_text"])
@@ -44,7 +45,7 @@ from crosslingual_coreference import Predictor
 predictor = Predictor(
     language="en_core_web_sm",
     device=0,
-    model_name="info_xlm",
+    model_name="minilm",
     chunk_size=2500,
     chunk_overlap=2,
 )
@@ -86,6 +87,6 @@ print(doc._.resolved_text)
 # but Many students don't even know Momofuku Ando.
 ```
 ## Available models
-As of now, there are two models available "info_xlm", "xlm_roberta", which scored 77 and 74 on OntoNotes Release 5.0 English data, respectively.
+As of now, there are three models available: "info_xlm", "xlm_roberta", and "minilm", which scored 77, 74 and 74 on OntoNotes Release 5.0 English data, respectively.
 ## More Examples
 ![](https://raw.githubusercontent.com/Pandora-Intelligence/crosslingual-coreference/master/img/example_total.png)
diff --git a/crosslingual_coreference/CrossLingualPredictor.py b/crosslingual_coreference/CrossLingualPredictor.py
@@ -20,6 +20,13 @@
         "f1_score_ontonotes": 77,
         "file_extension": ".tar.gz",
     },
+    "minilm": {
+        "url": (
+            "https://storage.googleapis.com/pandora-intelligence/models/crosslingual-coreference/minilm/model.tar.gz"
+        ),
+        "f1_score_ontonotes": 74,
+        "file_extension": ".tar.gz",
+    },
 }
 
 
@@ -28,7 +35,7 @@ def __init__(
         self,
         language: str,
         device: int = -1,
-        model_name: str = "info_xlm",
+        model_name: str = "minilm",
         chunk_size: Union[int, None] = None,  # determines the # sentences per batch
         chunk_overlap: int = 2,  # determines the # of overlapping sentences per chunk
     ) -> None:

diff --git a/crosslingual_coreference/CrossLingualPredictorSpacy.py b/crosslingual_coreference/CrossLingualPredictorSpacy.py
@@ -11,7 +11,7 @@ def __init__(
         self,
         language: str,
         device: int = -1,
-        model_name: str = "info_xlm",
+        model_name: str = "minilm",
         chunk_size: Union[int, None] = None,
         chunk_overlap: int = 2,
     ) -> None:

diff --git a/crosslingual_coreference/__init__.py b/crosslingual_coreference/__init__.py
@@ -17,7 +17,7 @@
     "xx_coref",
     default_config={
         "device": -1,
-        "model_name": "info_xlm",
+        "model_name": "minilm",
         "chunk_size": None,
         "chunk_overlap": 2,
     },

diff --git a/crosslingual_coreference/examples/test_spacy.py b/crosslingual_coreference/examples/test_spacy.py
@@ -6,7 +6,7 @@
 
 nlp = spacy.load("nl_core_news_sm")
 
-nlp.add_pipe("xx_coref", config={"model_name": "xlm_roberta"})
+nlp.add_pipe("xx_coref", config={"model_name": "minilm"})
 
 for doc in nlp.pipe(texts):
     print(doc._.coref_clusters)

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "crosslingual-coreference"
-version = "0.2.1"
+version = "0.2.2"
 description = "A multi-lingual approach to AllenNLP CoReference Resolution, along with a wrapper for spaCy."
 authors = ["David Berenstein <[email protected]>"]
 license = "MIT"