diff --git a/README.md b/README.md
index 829c69d3..6f37e90a 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 Code for the paper [Segment Any Text: A Universal Approach for Robust, Efficient and Adaptable Sentence Segmentation](TODO) by Markus Frohmann, Igor Sterner, Benjamin Minixhofer, Ivan Vulić and Markus Schedl.
 
-This repository contains `segment-any-text`, a package for robust, efficient and adaptable sentence segmentation across 85 languages, as well as the code and configs to reproduce the **state-of-the-art** results in 8 distinct corpora and 85 languages demonstrated in our paper.
+This repository contains `wtpsplit`, a package for robust, efficient and adaptable sentence segmentation across 85 languages, as well as the code and configs to reproduce the **state-of-the-art** results in 8 distinct corpora and 85 languages demonstrated in our Segment Any Text [paper](TODO).
 
 ![System Figure](./system-fig.png)
 
@@ -10,13 +10,13 @@ This repository contains `segment-any-text`, a package for robust, efficient and
 ## Installation
 
 ```bash
-pip install segment-any-text
+pip install wtpsplit
 ```
 
 ## Usage
 
 ```python
-from sat import SaT
+from wtpsplit import SaT
 
 sat = SaT("sat-3l")
 # optionally run on GPU for better performance
@@ -43,49 +43,48 @@ sat.split("This is a test This is another test.", lang_code="en", style="ud")
 
 ## Available Models
 
 If you need a general sentence segmentation model, use `-sm` models (e.g., `sat-3l-sm`)
-For speed-sensitive applications, we recommend 3-layer models (`sat-3l` and `sat-3l-sm`). They provide a good tradeoff between speen and performance.
-The best (and largest) models are our 12-layer models: `sat-12l` and `sat-12l-sm`.
-## TODO TODO TODO
-
+| PySBD | 69.6 |
+| SpaCy (sentencizer; monolingual) | 92.9 |
+| SpaCy (sentencizer; multilingual) | 91.5 |
+| Ersatz | 91.4 |
+| Punkt (`nltk.sent_tokenize`) | 92.2 |
+| [WtP (3l)](https://huggingface.co/benjamin/wtp-canine-s-3l) | 93.9 |
 
 Note that this library also supports previous [`WtP`](https://arxiv.org/abs/2305.18893) models. You can use them in essentially the same way as `SaT` models:
 
 ```python
-from sat import WtP
+from wtpsplit import WtP
 
 wtp = WtP("wtp-bert-mini")
 # similar functionality as for SaT models
 wtp.split("This is a test This is another test.")
 ```
 
-For more details on WtP and reproduction details, see the `wtpsplit` branch.
+For more details on WtP and reproduction details, see the `wtp` branch.
 
 ## Paragraph Segmentation
@@ -179,7 +178,7 @@ Our models can be efficiently adapted via LoRA in a powerful way. Only 10-100 tr
 Clone the repository and install requirements:
 
 ```
-git clone https://github.com/segment-any-text/segment-any-text
-cd segment-any-text
+git clone https://github.com/segment-any-text/wtpsplit
+cd wtpsplit
 pip install -e .
 pip install -r requirements.txt
@@ -349,7 +348,7 @@ For details, we refer to our [paper](TODO).
 
 ## Citation
 
-If you find our `segment-any-text` useful, please kindly cite our paper:
+If you find `wtpsplit` and our `SaT` models useful, please kindly cite our paper:
 ```
 @inproceedings{TODO,}
 ```
diff --git a/scripts/export_to_onnx_sat-sm.py b/scripts/export_to_onnx_sat.py
similarity index 77%
rename from scripts/export_to_onnx_sat-sm.py
rename to scripts/export_to_onnx_sat.py
index 47f9fee1..f7674dd9 100644
--- a/scripts/export_to_onnx_sat-sm.py
+++ b/scripts/export_to_onnx_sat.py
@@ -3,7 +3,7 @@
 
 import onnx
 import torch
-from onnxruntime.transformers.optimizer import optimize_model
+from onnxruntime.transformers.optimizer import optimize_model  # noqa
 from transformers import AutoModelForTokenClassification, HfArgumentParser
 
 import wtpsplit  # noqa
@@ -12,8 +12,8 @@
 
 @dataclass
 class Args:
-    model_name_or_path: str = "segment-any-text/sat-12l-sm"
-    output_dir: str = "sat-12l-sm"
+    model_name_or_path: str = "segment-any-text/sat-12l-no-limited-lookahead"
+    output_dir: str = "sat-12l-no-limited-lookahead"
     device: str = "cpu"
 
     # TODO: lora merging here
@@ -24,7 +24,7 @@ class Args:
     output_dir = Path(args.output_dir)
     output_dir.mkdir(exist_ok=True, parents=True)
 
-    model = AutoModelForTokenClassification.from_pretrained(args.model_name_or_path)
+    model = AutoModelForTokenClassification.from_pretrained(args.model_name_or_path, force_download=True)
     # model = model.half()  # CUDA ONLY!
     model = model.to(args.device)
 
@@ -41,8 +41,9 @@ class Args:
         dynamic_axes={
             "input_ids": {0: "batch", 1: "sequence"},
             "attention_mask": {0: "batch", 1: "sequence"},
-            "logits": {0: "batch", 1: "sequence"},
+            "logits": {0: "batch", 1: "sequence"}
         },
+        # opset_version=11
     )
 
     # m = optimize_model(
@@ -55,3 +56,6 @@ class Args:
 
     # optimized_model_path = output_dir / "model_optimized.onnx"
     # onnx.save_model(m.model, optimized_model_path)
+
+    onnx_model = onnx.load(output_dir / "model.onnx")
+    onnx.checker.check_model(onnx_model, full_check=True)
\ No newline at end of file
diff --git a/setup.py b/setup.py
index e6a1794b..c44a8b1a 100644
--- a/setup.py
+++ b/setup.py
@@ -1,8 +1,8 @@
 from setuptools import setup
 
 setup(
-    name="segment-any-text",
-    version="1.0.0",
+    name="wtpsplit",
+    version="2.0.0",
     packages=["wtpsplit"],
     description="Universal Robust, Efficient and Adaptable Sentence Segmentation",
     author="Markus Frohmann, Igor Sterner, Benjamin Minixhofer",
@@ -20,7 +20,7 @@
         "mosestokenizer",
         "adapters==0.2.1"
     ],
-    url="https://github.com/bminixhofer/wtpsplit",
+    url="https://github.com/segment-any-text/wtpsplit",
     package_data={"wtpsplit": ["data/*"]},
     include_package_data=True,
     license="MIT",
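For reference, a minimal sketch (not part of the diff) of how the model exported by `scripts/export_to_onnx_sat.py` could be sanity-checked with `onnxruntime`. It assumes the `input_ids`/`attention_mask` inputs, the `logits` output, and the dynamic batch/sequence axes declared in the export call above, and that the script was run with `output_dir="sat-12l-no-limited-lookahead"`; the dummy shapes and the label dimension in the comment are illustrative only.

```python
# Sketch only: assumes the export script above has already written
# sat-12l-no-limited-lookahead/model.onnx and that onnxruntime is installed.
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession(
    "sat-12l-no-limited-lookahead/model.onnx",
    providers=["CPUExecutionProvider"],
)

# The export declared dynamic "batch" and "sequence" axes, so any shape works here.
dummy_input_ids = np.ones((1, 8), dtype=np.int64)
dummy_attention_mask = np.ones((1, 8), dtype=np.int64)

(logits,) = session.run(
    ["logits"],
    {"input_ids": dummy_input_ids, "attention_mask": dummy_attention_mask},
)
print(logits.shape)  # e.g. (1, 8, num_labels)
```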