Commit
Merge pull request #506 from CogStack/master
v1.14.0 release PR
mart-r authored Nov 19, 2024
2 parents 34e5cde + 37a8a63 commit ceb74b1
Showing 29 changed files with 1,011 additions and 933 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/main.yml
@@ -12,7 +12,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ '3.8', '3.9', '3.10', '3.11' ]
python-version: [ '3.9', '3.10', '3.11' ]
max-parallel: 4

steps:
@@ -42,6 +42,8 @@ jobs:
timeout 25m python -m unittest ${second_half_nl[@]}
- name: Regression
run: source tests/resources/regression/run_regression.sh
- name: Model backwards compatibility
run: source tests/resources/model_compatibility/check_backwards_compatibility.sh
- name: Get the latest release version
id: get_latest_release
uses: actions/github-script@v6
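The new backwards-compatibility step runs tests/resources/model_compatibility/check_backwards_compatibility.sh, whose contents are not part of this diff. As a purely hypothetical sketch of what such a check could do (assuming it loads a previously released model pack and verifies inference still produces output):

```python
# Hypothetical sketch only -- not the actual contents of check_backwards_compatibility.sh.
from medcat.cat import CAT

OLD_MODEL_PACK = "models/medmen_wstatus_2021_oct.zip"  # assumed example path


def check_backwards_compatibility() -> None:
    # Load a model pack produced by an earlier MedCAT release.
    cat = CAT.load_model_pack(OLD_MODEL_PACK)
    # Run inference on a trivial document and make sure the output structure is intact.
    out = cat.get_entities("The patient was diagnosed with type 2 diabetes mellitus.")
    assert "entities" in out, "Old model pack failed to produce output"


if __name__ == "__main__":
    check_backwards_compatibility()
    print("Backwards compatibility check passed.")
```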
8 changes: 4 additions & 4 deletions docs/main.md
@@ -122,12 +122,12 @@ If you have access to UMLS or SNOMED-CT, you can download the pre-built CDB and
A basic trained model is made public. It contains ~35K concepts available in `MedMentions`. This was compiled from MedMentions and does not have any data from [NLM](https://www.nlm.nih.gov/research/umls/) as that data is not publicly available.

Model packs:
- MedMentions with Status (Is Concept Affirmed or Negated/Hypothetical) [Download](https://medcat.rosalind.kcl.ac.uk/media/medmen_wstatus_2021_oct.zip)
- MedMentions with Status (Is Concept Affirmed or Negated/Hypothetical) [Download](https://cogstack-medcat-example-models.s3.eu-west-2.amazonaws.com/medcat-example-models/medmen_wstatus_2021_oct.zip)

Separate models:
- Vocabulary [Download](https://medcat.rosalind.kcl.ac.uk/media/vocab.dat) - Built from MedMentions
- CDB [Download](https://medcat.rosalind.kcl.ac.uk/media/cdb-medmen-v1_2.dat) - Built from MedMentions
- MetaCAT Status [Download](https://medcat.rosalind.kcl.ac.uk/media/mc_status.zip) - Built from a sample from MIMIC-III, detects whether an annotation is Affirmed (Positive) or Other (Negated or Hypothetical)
- Vocabulary [Download](https://cogstack-medcat-example-models.s3.eu-west-2.amazonaws.com/medcat-example-models/vocab.dat) - Built from MedMentions
- CDB [Download](https://cogstack-medcat-example-models.s3.eu-west-2.amazonaws.com/medcat-example-models/cdb-medmen-v1.dat) - Built from MedMentions
- MetaCAT Status [Download](https://cogstack-medcat-example-models.s3.eu-west-2.amazonaws.com/medcat-example-models/mc_status.zip) - Built from a sample from MIMIC-III, detects whether an annotation is Affirmed (Positive) or Other (Negated or Hypothetical)

## Acknowledgements
Entity extraction was trained on [MedMentions](https://github.com/chanzuckerberg/MedMentions). In total it has ~35K entities from UMLS.
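For orientation, a minimal usage sketch for the model pack listed above (the local path below is an assumption; it points at wherever the zip was saved):

```python
# Minimal sketch, assuming the MedMentions model pack above was downloaded locally.
from medcat.cat import CAT

cat = CAT.load_model_pack("medmen_wstatus_2021_oct.zip")  # assumed local path

text = "Patient denies chest pain but reports shortness of breath."
entities = cat.get_entities(text)
for ent in entities["entities"].values():
    # Each entity carries the detected name, its CUI and any MetaCAT (Status) output.
    print(ent["pretty_name"], ent["cui"], ent.get("meta_anns", {}))
```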
148 changes: 75 additions & 73 deletions docs/requirements.txt
@@ -2,103 +2,105 @@ sphinx==6.2.1
sphinx-rtd-theme~=1.0
myst-parser~=0.17
sphinx-autoapi~=3.0.0
MarkupSafe==2.1.3
accelerate==0.23.0
aiofiles==23.2.1
aiohttp==3.8.5
MarkupSafe==2.1.5
accelerate==0.34.2
aiofiles==24.1.0
aiohttp==3.10.5
aiosignal==1.3.1
asttokens==2.4.0
asttokens==2.4.1
async-timeout==4.0.3
attrs==23.1.0
attrs==24.2.0
backcall==0.2.0
blis==0.7.11
catalogue==2.0.10
certifi==2023.7.22
charset-normalizer==3.3.0
certifi==2024.8.30
charset-normalizer==3.3.2
click==8.1.7
comm==0.1.4
confection==0.1.3
comm==0.2.2
confection==0.1.5
cymem==2.0.8
datasets==2.14.5
darglint==1.8.1
datasets==2.21.0
decorator==5.1.1
dill==0.3.7
exceptiongroup==1.1.3
executing==2.0.0
filelock==3.12.4
flake8==4.0.1
frozenlist==1.4.0
fsspec==2023.6.0
gensim==4.3.2
huggingface-hub==0.17.3
idna==3.4
ipython==8.16.1
ipywidgets==8.1.1
dill==0.3.8
exceptiongroup==1.2.2
executing==2.1.0
filelock==3.16.0
flake8==7.0.0
frozenlist==1.4.1
fsspec==2024.6.1
gensim==4.3.3
huggingface-hub==0.24.7
idna==3.10
ipython==8.27.0
ipywidgets==8.1.5
jedi==0.19.1
jinja2==3.1.2
joblib==1.3.2
jsonpickle==3.0.2
jupyterlab-widgets==3.0.9
langcodes==3.3.0
matplotlib-inline==0.1.6
mccabe==0.6.1
jinja2==3.1.4
joblib==1.4.2
jsonpickle==3.3.0
jupyterlab-widgets==3.0.13
langcodes==3.4.0
matplotlib-inline==0.1.7
mccabe==0.7.0
mpmath==1.3.0
multidict==6.0.4
multiprocess==0.70.15
multidict==6.1.0
multiprocess==0.70.16
murmurhash==1.0.10
mypy==1.0.0
mypy-extensions==0.4.3
networkx==3.1
mypy==1.11.2
mypy-extensions==1.0.0
networkx==3.3
numpy==1.25.2
packaging==23.2
pandas==2.1.1
parso==0.8.3
pathy==0.10.2
pexpect==4.8.0
packaging==24.1
pandas==2.2.2
parso==0.8.4
pathy==0.11.0
peft==0.12.0
pexpect==4.9.0
pickleshare==0.7.5
preshed==3.0.9
prompt-toolkit==3.0.39
psutil==5.9.5
prompt-toolkit==3.0.47
psutil==6.0.0
ptyprocess==0.7.0
pure-eval==0.2.2
pyarrow==13.0.0
pycodestyle==2.8.0
pydantic==1.10.13
pyflakes==2.4.0
pygments==2.16.1
python-dateutil==2.8.2
pytz==2023.3.post1
pyyaml==6.0.1
regex==2023.10.3
requests==2.31.0
safetensors==0.4.0
scikit-learn==1.3.1
pure-eval==0.2.3
pyarrow==17.0.0
pycodestyle==2.11.1
pydantic==1.10.18
pyflakes==3.2.0
pygments==2.18.0
python-dateutil==2.9.0
pytz==2024.2
pyyaml==6.0.2
regex==2024.9.11
requests==2.32.3
safetensors==0.4.5
scikit-learn==1.5.2
scipy==1.9.3
six==1.16.0
smart-open==6.4.0
spacy==3.4.4
spacy==3.6.1
spacy-legacy==3.0.12
spacy-loggers==1.0.5
srsly==2.4.8
stack-data==0.6.3
sympy==1.12
sympy==1.13.2
thinc==8.1.12
threadpoolctl==3.2.0
tokenizers==0.14.1
threadpoolctl==3.5.0
tokenizers==0.19.1
tomli==2.0.1
torch==2.1.0
tqdm==4.66.1
traitlets==5.11.2
transformers==4.34.0
triton==2.1.0
typer==0.7.0
torch==2.4.1
tqdm==4.66.5
traitlets==5.14.3
transformers==4.44.2
triton==3.0.0
typer==0.9.4
types-PyYAML==6.0.3
types-aiofiles==0.8.3
types-setuptools==57.4.10
typing-extensions==4.8.0
tzdata==2023.3
urllib3==2.0.6
wasabi==0.10.1
wcwidth==0.2.8
widgetsnbextension==4.0.9
xxhash==3.4.1
yarl==1.9.2
typing-extensions==4.12.2
tzdata==2024.1
urllib3==2.2.3
wasabi==1.1.3
wcwidth==0.2.13
widgetsnbextension==4.0.13
xxhash==3.5.0
yarl==1.11.1
Binary file removed examples/cdb_new.dat
Binary file not shown.
4 changes: 2 additions & 2 deletions install_requires.txt
@@ -1,7 +1,7 @@
'numpy>=1.22.0,<1.26.0' # 1.22.0 is first to support python 3.11; post 1.26.0 there's issues with scipy
'pandas>=1.4.2' # first to support 3.11
'gensim>=4.3.0,<5.0.0' # 4.3.0 is first to support 3.11; avoid major version bump
'spacy>=3.6.0,<4.0.0' # Some later model packs (e.g HPO) are made with 3.6.0 spacy model; avoid major version bump
'spacy>=3.6.0,<3.8.0' # 3.8 only supports numpy2 which we can't use due to other dependencies
'scipy~=1.9.2' # 1.9.2 is first to support 3.11
'transformers>=4.34.0,<5.0.0' # avoid major version bump
'accelerate>=0.23.0' # required by Trainer class in de-id
@@ -21,4 +21,4 @@
'click>=8.0.4' # allow later versions, tested with 8.1.3
'pydantic>=1.10.0,<2.0' # for spacy compatibility; avoid 2.0 due to breaking changes
"humanfriendly~=10.0" # for human readable file / RAM sizes
"peft>=0.8.2"
"peft>=0.8.2"
23 changes: 22 additions & 1 deletion medcat/cat.py
Expand Up @@ -1127,11 +1127,29 @@ def get_entities_multi_texts(self,
self.pipe.set_error_handler(self._pipe_error_handler)
try:
texts_ = self._get_trimmed_texts(texts)
if self.config.general.usage_monitor.enabled:
input_lengths: List[Tuple[int, int]] = []
for orig_text, trimmed_text in zip(texts, texts_):
if orig_text is None or trimmed_text is None:
l1, l2 = 0, 0
else:
l1 = len(orig_text)
l2 = len(trimmed_text)
input_lengths.append((l1, l2))
docs = self.pipe.batch_multi_process(texts_, n_process, batch_size)

for doc in tqdm(docs, total=len(texts_)):
for doc_nr, doc in tqdm(enumerate(docs), total=len(texts_)):
doc = None if doc.text.strip() == '' else doc
out.append(self._doc_to_out(doc, only_cui, addl_info, out_with_text=True))
if self.config.general.usage_monitor.enabled:
l1, l2 = input_lengths[doc_nr]
if doc is None:
nents = 0
elif self.config.general.show_nested_entities:
nents = len(doc._.ents) # type: ignore
else:
nents = len(doc.ents) # type: ignore
self.usage_monitor.log_inference(l1, l2, nents)

# Currently spaCy cannot mark which pieces of text failed within the pipe, so this workaround is used,
# which also assumes texts are different from each other.
@@ -1637,6 +1655,9 @@ def _mp_cons(self, in_q: Queue, out_list: List, min_free_memory: float,
logger.warning("PID: %s failed one document in _mp_cons, running will continue normally. \n" +
"Document length in chars: %s, and ID: %s", pid, len(str(text)), i_text)
logger.warning(str(e))
if self.config.general.usage_monitor.enabled:
# NOTE: This is in another process, so need to explicitly flush
self.usage_monitor._flush_logs()
sleep(2)

def _add_nested_ent(self, doc: Doc, _ents: List[Span], _ent: Union[Dict, Span]) -> None:
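The first hunk above wires per-document usage logging into `get_entities_multi_texts` when `config.general.usage_monitor.enabled` is set. A hedged sketch of how a caller might exercise that path (the flag name comes from the diff; the model pack path is an assumption):

```python
# Sketch only: exercising the multi-text entry point with usage monitoring enabled.
from medcat.cat import CAT

cat = CAT.load_model_pack("medmen_wstatus_2021_oct.zip")  # assumed local path
cat.config.general.usage_monitor.enabled = True  # flag referenced in the hunk above

texts = [
    "Patient has a history of hypertension.",
    "",  # empty input: logged with zero entities by the new code
    "No evidence of pneumonia on chest x-ray.",
]
results = cat.get_entities_multi_texts(texts, n_process=1, batch_size=2)
print(len(results), "documents processed")
```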
7 changes: 3 additions & 4 deletions medcat/meta_cat.py
Expand Up @@ -257,20 +257,19 @@ def train_raw(self, data_loaded: Dict, save_dir_path: Optional[str] = None, data
category_value2id = g_config['category_value2id']
if not category_value2id:
# Encode the category values
data_undersampled, full_data, category_value2id = encode_category_values(data,
full_data, data_undersampled, category_value2id = encode_category_values(data,
category_undersample=self.config.model.category_undersample)
g_config['category_value2id'] = category_value2id
else:
# We already have everything, just get the data
data_undersampled, full_data, category_value2id = encode_category_values(data,
full_data, data_undersampled, category_value2id = encode_category_values(data,
existing_category_value2id=category_value2id,
category_undersample=self.config.model.category_undersample)
g_config['category_value2id'] = category_value2id
# Make sure the config number of classes is the same as the one found in the data
if len(category_value2id) != self.config.model['nclasses']:
logger.warning(
"The number of classes set in the config is not the same as the one found in the data: {} vs {}".format(
self.config.model['nclasses'], len(category_value2id)))
"The number of classes set in the config is not the same as the one found in the data: %d vs %d",self.config.model['nclasses'], len(category_value2id))
logger.warning("Auto-setting the nclasses value in config and rebuilding the model.")
self.config.model['nclasses'] = len(category_value2id)

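Besides swapping the order in which `encode_category_values` returns the full and undersampled data, the hunk above moves the class-count warning from eager `str.format` to lazy %-style logger arguments, which defers interpolation until the record is actually emitted. A small generic illustration of that difference:

```python
import logging

logger = logging.getLogger(__name__)
nclasses_config, nclasses_data = 2, 3  # illustrative values only

# Eager: the message is formatted even if WARNING records end up filtered out.
logger.warning(
    "Class count mismatch: {} vs {}".format(nclasses_config, nclasses_data))

# Lazy (as in the hunk above): arguments are interpolated only when the record is handled.
logger.warning("Class count mismatch: %d vs %d", nclasses_config, nclasses_data)
```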
63 changes: 34 additions & 29 deletions medcat/ner/transformers_ner.py
Expand Up @@ -4,7 +4,7 @@
import datasets
from spacy.tokens import Doc
from datetime import datetime
from typing import Iterable, Iterator, Optional, Dict, List, cast, Union, Tuple, Callable
from typing import Iterable, Iterator, Optional, Dict, List, cast, Union, Tuple, Callable, Type
from spacy.tokens import Span
import inspect
from functools import partial
@@ -87,7 +87,13 @@ def create_eval_pipeline(self):
# NOTE: this will fix the DeID model(s) created before medcat 1.9.3
# though this fix may very well be unstable
self.ner_pipe.tokenizer._in_target_context_manager = False
if not hasattr(self.ner_pipe.tokenizer, 'split_special_tokens'):
# NOTE: this will fix the DeID model(s) created with transformers before 4.42
# and allow them to run with later transformers
self.ner_pipe.tokenizer.split_special_tokens = False
self.ner_pipe.device = self.model.device
self._consecutive_identical_failures = 0
self._last_exception: Optional[Tuple[str, Type[Exception]]] = None

def get_hash(self) -> str:
"""A partial hash trying to catch differences between models.
@@ -390,34 +396,33 @@ def _process(self,
#all_text_processed = self.tokenizer.encode_eval(all_text)
# For now we will process the documents one by one, should be improved in the future to use batching
for doc in docs:
try:
res = self.ner_pipe(doc.text, aggregation_strategy=self.config.general['ner_aggregation_strategy'])
doc.ents = [] # type: ignore
for r in res:
inds = []
for ind, word in enumerate(doc):
end_char = word.idx + len(word.text)
if end_char <= r['end'] and end_char > r['start']:
inds.append(ind)
# To not loop through everything
if end_char > r['end']:
break
if inds:
entity = Span(doc, min(inds), max(inds) + 1, label=r['entity_group'])
entity._.cui = r['entity_group']
entity._.context_similarity = r['score']
entity._.detected_name = r['word']
entity._.id = len(doc._.ents)
entity._.confidence = r['score']

doc._.ents.append(entity)
create_main_ann(self.cdb, doc)
if self.cdb.config.general['make_pretty_labels'] is not None:
make_pretty_labels(self.cdb, doc, LabelStyle[self.cdb.config.general['make_pretty_labels']])
if self.cdb.config.general['map_cui_to_group'] is not None and self.cdb.addl_info.get('cui2group', {}):
map_ents_to_groups(self.cdb, doc)
except Exception as e:
logger.warning(e, exc_info=True)
res = self.ner_pipe(doc.text, aggregation_strategy=self.config.general['ner_aggregation_strategy'])
doc.ents = [] # type: ignore
for r in res:
inds = []
for ind, word in enumerate(doc):
end_char = word.idx + len(word.text)
if end_char <= r['end'] and end_char > r['start']:
inds.append(ind)
# To not loop through everything
if end_char > r['end']:
break
if inds:
entity = Span(doc, min(inds), max(inds) + 1, label=r['entity_group'])
entity._.cui = r['entity_group']
entity._.context_similarity = r['score']
entity._.detected_name = r['word']
entity._.id = len(doc._.ents)
entity._.confidence = r['score']

doc._.ents.append(entity)
create_main_ann(self.cdb, doc)
if self.cdb.config.general['make_pretty_labels'] is not None:
make_pretty_labels(self.cdb, doc, LabelStyle[self.cdb.config.general['make_pretty_labels']])
if self.cdb.config.general['map_cui_to_group'] is not None and self.cdb.addl_info.get('cui2group', {}):
map_ents_to_groups(self.cdb, doc)
self._consecutive_identical_failures = 0 # success
self._last_exception = None
yield from docs

# Override
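The `create_eval_pipeline` change backfills a tokenizer attribute (`split_special_tokens`) that models serialized with transformers older than 4.42 do not carry. The same guard pattern, shown generically below (the helper name is illustrative, not MedCAT API):

```python
from typing import Any


def backfill_attribute(obj: Any, name: str, default: Any) -> None:
    """Give `obj` a default for an attribute that newer library code expects
    but that objects pickled under an older library version may lack."""
    if not hasattr(obj, name):
        setattr(obj, name, default)


# Illustrative use mirroring the hunk above (names assumed from the diff):
# backfill_attribute(ner_pipe.tokenizer, "split_special_tokens", False)
# backfill_attribute(ner_pipe.tokenizer, "_in_target_context_manager", False)
```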