diff --git a/.gitignore b/.gitignore index 263199c..acea84a 100644 --- a/.gitignore +++ b/.gitignore @@ -132,4 +132,4 @@ dmypy.json /crosslingual_coreference/models test.py /batching.ipynb -/test.py \ No newline at end of file +/test.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..2b11c7b --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,22 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.0.1 + hooks: + - id: check-added-large-files + - id: end-of-file-fixer + - repo: https://github.com/psf/black + rev: 22.3.0 + hooks: + - id: black + # Execute isort on all changed files (make sure the version is the same as in pyproject) + - repo: https://github.com/pycqa/isort + rev: 5.10.1 + hooks: + - id: isort + # Execute flake8 on all changed files (make sure the version is the same as in pyproject) + - repo: https://github.com/pycqa/flake8 + rev: 4.0.1 + hooks: + - id: flake8 + additional_dependencies: + ["flake8-docstrings", "flake8-bugbear", "pep8-naming"] diff --git a/README.md b/README.md index c8de721..e310da1 100644 --- a/README.md +++ b/README.md @@ -89,4 +89,3 @@ print(doc._.resolved_text) As of now, there are two models available "info_xlm", "xlm_roberta", which scored 77 and 74 on OntoNotes Release 5.0 English data, respectively. ## More Examples ![](https://raw.githubusercontent.com/Pandora-Intelligence/crosslingual-coreference/master/img/example_total.png) - diff --git a/crosslingual_coreference/CrossLingualPredictor.py b/crosslingual_coreference/CrossLingualPredictor.py index cc8ccb6..aa165a9 100644 --- a/crosslingual_coreference/CrossLingualPredictor.py +++ b/crosslingual_coreference/CrossLingualPredictor.py @@ -11,12 +11,12 @@ MODELS = { "xlm_roberta": { - "url": "https://storage.googleapis.com/pandora-intelligence/models/crosslingual-coreference/xlm-roberta-base/model.tar.gz", + "url": "https://storage.googleapis.com/pandora-intelligence/models/crosslingual-coreference/xlm-roberta-base/model.tar.gz", # noqa: B950 "f1_score_ontonotes": 74, "file_extension": ".tar.gz", }, "info_xlm": { - "url": "https://storage.googleapis.com/pandora-intelligence/models/crosslingual-coreference/infoxlm-base/model.tar.gz", + "url": "https://storage.googleapis.com/pandora-intelligence/models/crosslingual-coreference/infoxlm-base/model.tar.gz", # noqa: B950 "f1_score_ontonotes": 77, "file_extension": ".tar.gz", }, @@ -78,7 +78,8 @@ def predict(self, text: str, advanced_resolve: bool = True) -> dict: Args: text (str): an input text advanced_resolve (bool, optional): use more advanced resoled from - https://towardsdatascience.com/how-to-make-an-effective-coreference-resolution-model-55875d2b5f19. Defaults to True. + https://towardsdatascience.com/how-to-make-an-effective-coreference-resolution-model-55875d2b5f19. + Defaults to True. Returns: dict: a prediciton diff --git a/crosslingual_coreference/examples/data.py b/crosslingual_coreference/examples/data.py index 8d80aef..9456e3d 100644 --- a/crosslingual_coreference/examples/data.py +++ b/crosslingual_coreference/examples/data.py @@ -1,9 +1,30 @@ -text_en = """Do not forget about Momofuku Ando! He created instant noodles in Osaka. At that location, Nissin was founded. Many students survived by eating his noodles, but they don't even know him.""" -text_nl = """Vergeet Momofuku Ando niet! Hij maakte instantnoedels in Osaka. Op die locatie werd Nissin opgericht. Veel studenten overleefden door zijn noedels te eten, maar ze kennen zijn naam niet eens.""" -text_de = """Vergiss Momofuku Ando nicht! Er kreierte Instantnudeln in Osaka. An diesem Standort wurde Nissin gegründet. Viele Studenten überlebten, indem sie seine Nudeln aßen, aber sie kennen ihn nicht einmal.""" -text_dk = """Glem ikke Momofuku Ando! Han skabte instant nudler i Osaka. På det sted blev Nissin grundlagt. Mange elever overlevede ved at spise hans nudler, men de kender ham ikke engang.""" -text_fr = """N'oubliez pas Momofuku Ando ! Il a créé des nouilles instantanées à Osaka. À cet endroit, Nissin a été fondée. De nombreux étudiants ont survécu en mangeant ses nouilles, mais ils ne le connaissent même pas.""" -text_es = """¡No te olvides de Momofuku Ando! Creó fideos instantáneos en Osaka. En ese lugar, se fundó Nissin. Muchos estudiantes sobrevivieron comiendo sus fideos, pero ni siquiera lo conocen.""" -text_it = """Non dimenticare Momofuku Ando! Ha creato spaghetti istantanei a Osaka. In quel luogo è stata fondata Nissin. Molti studenti sono sopravvissuti mangiando i suoi noodles, ma non lo conoscono nemmeno.""" +text_en = ( + "Do not forget about Momofuku Ando! He created instant noodles in Osaka. At that location, Nissin was founded." + " Many students survived by eating his noodles, but they don't even know him." +) +text_nl = ( + "Vergeet Momofuku Ando niet! Hij maakte instantnoedels in Osaka. Op die locatie werd Nissin opgericht. Veel" + " studenten overleefden door zijn noedels te eten, maar ze kennen zijn naam niet eens." +) +text_de = ( + "Vergiss Momofuku Ando nicht! Er kreierte Instantnudeln in Osaka. An diesem Standort wurde Nissin gegründet. Viele" + " Studenten überlebten, indem sie seine Nudeln aßen, aber sie kennen ihn nicht einmal." +) +text_dk = ( + "Glem ikke Momofuku Ando! Han skabte instant nudler i Osaka. På det sted blev Nissin grundlagt. Mange elever" + " overlevede ved at spise hans nudler, men de kender ham ikke engang." +) +text_fr = ( + "N'oubliez pas Momofuku Ando ! Il a créé des nouilles instantanées à Osaka. À cet endroit, Nissin a été fondée. De" + " nombreux étudiants ont survécu en mangeant ses nouilles, mais ils ne le connaissent même pas." +) +text_es = ( + "¡No te olvides de Momofuku Ando! Creó fideos instantáneos en Osaka. En ese lugar, se fundó Nissin. Muchos" + " estudiantes sobrevivieron comiendo sus fideos, pero ni siquiera lo conocen." +) +text_it = ( + "Non dimenticare Momofuku Ando! Ha creato spaghetti istantanei a Osaka. In quel luogo è stata fondata Nissin." + " Molti studenti sono sopravvissuti mangiando i suoi noodles, ma non lo conoscono nemmeno." +) texts = [text_en, text_nl, text_de, text_dk, text_fr, text_es, text_it] diff --git a/pyproject.toml b/pyproject.toml index a39b8a3..166bfec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,4 +55,4 @@ experimental-string-processing = true [tool.isort] profile = "black" -src_paths = ["crosslingual_coreference"] \ No newline at end of file +src_paths = ["crosslingual_coreference"]