Allow QuickUMLS to be used as component in spacy pipeline (#57)

* Implementation to allow QuickUMLS to be used as a component within a spacy pipeline. This can be used as an entity matcher for UMLS concepts in other modular spacy pipelines. This is already being used in an operational capacity for syndromic surveillance. * Factoring QuickUMLS class slightly to minimize duplicated code between QuickUMLS and SpacyQuickUMLS. Fixed keyword arguments to QuickUMLS when creating a spacy component. Added documentation as well. Removing previous standalone example Python file and instead adding an example of a QuickUMLS spacy pipeline to the README.
Georgetown-IR-Lab · Sep 3, 2020 · c0b5db0 · c0b5db0
1 parent 8420573
commit c0b5db0
Show file tree

Hide file tree

Showing 3 changed files with 135 additions and 17 deletions.
diff --git a/README.md b/README.md
@@ -52,6 +52,29 @@ Set `best_match` to `False` if you want to return overlapping candidates, `ignor
 
 If the matcher throws a warning during initialization, read [this page](https://github.com/Georgetown-IR-Lab/QuickUMLS/wiki/Migration-QuickUMLS-1.3-to-1.4) to learn why and how to stop it from doing so.
 
+## spaCy pipeline component
+
+QuickUMLS can be used for standalone processing, but it can also be used as a component in a modular spaCy pipeline.  This follows the traditional spaCy handling of concepts as entity objects added to the Document object.  These entity objects contain the CUI, similarity score and Semantic Types in the spaCy "underscore" object.
+
+Adding QuickUMLS as a component in a pipeline can be done as follows:
+
+```python
+import spacy
+
+from quickumls.spacy_component import SpacyQuickUMLS
+
+# common English pipeline
+nlp = spacy.load('en_core_web_sm')
+
+quickumls_component = SpacyQuickUMLS(nlp, 'PATH_TO_QUICKUMLS_DATA')
+nlp.add_pipe(quickumls_component)
+
+doc = nlp('Pt c/o shortness of breath, chest pain, nausea, vomiting, diarrhea')
+
+for ent in doc.ents:
+    print('Entity text : {}'.format(ent.text))
+    print('Label (UMLS CUI) : {}'.format(ent.label_))
+    print('Similarity : {}'.format(ent._.similarity))
+    print('Semtypes : {}'.format(ent._.semtypes))
+```
 
 ## Server / Client Support
 

diff --git a/quickumls/core.py b/quickumls/core.py
@@ -26,7 +26,8 @@ def __init__(
             overlapping_criteria='score', threshold=0.7, window=5,
             similarity_name='jaccard', min_match_length=3,
             accepted_semtypes=constants.ACCEPTED_SEMTYPES,
-            verbose=False, keep_uppercase=False):
+            verbose=False, keep_uppercase=False,
+            spacy_component = False):
         """Instantiate QuickUMLS object
 
             This is the main interface through which text can be processed.
@@ -146,18 +147,23 @@ def __init__(
 
         self.accepted_semtypes = accepted_semtypes
 
-        try:
-            self.nlp = spacy.load(spacy_lang)
-        except OSError:
-            msg = (
-                'Model for language "{}" is not downloaded. Please '
-                'run "python -m spacy download {}" before launching '
-                'QuickUMLS'
-            ).format(
-                self.language_flag,
-                constants.SPACY_LANGUAGE_MAP.get(self.language_flag, 'xx')
-            )
-            raise OSError(msg)
+        # if this is not being executed as a spaCy component, then it must be standalone
+        if spacy_component:
+            # In this case, the pipeline is external to this current class
+            self.nlp = None
+        else:
+            try:
+                self.nlp = spacy.load(spacy_lang)
+            except OSError:
+                msg = (
+                    'Model for language "{}" is not downloaded. Please '
+                    'run "python -m spacy download {}" before launching '
+                    'QuickUMLS'
+                ).format(
+                    self.language_flag,
+                    constants.SPACY_LANGUAGE_MAP.get(self.language_flag, 'xx')
+                )
+                raise OSError(msg)
 
         self.ss_db = toolbox.SimstringDBReader(
             simstring_fp, similarity_name, threshold
@@ -437,17 +443,39 @@ def match(self, text, best_match=True, ignore_syntax=False):
         """
 
         parsed = self.nlp(u'{}'.format(text))
+
+        # Forward the caller's options to _match: it has its own defaults, so
+        # omitting them would silently ignore the best_match / ignore_syntax
+        # values passed to match().
+        matches = self._match(
+            parsed, best_match=best_match, ignore_syntax=ignore_syntax
+        )
 
+        return matches
+
+    def _match(self, doc, best_match=True, ignore_syntax=False):
+        """Gather ngram matches from an already-parsed spaCy document.
+
+        Used by `match` (after it parses raw text) and by the spaCy pipeline
+        component, which passes in the document produced by the pipeline.
+
+        Args:
+            doc (Document): spaCy Document object to be used for extracting ngrams
+            best_match (bool, optional): If True, the candidate matches are filtered
+                through `_select_terms` before being returned. Defaults to True.
+            ignore_syntax (bool, optional): If True, candidates are built with
+                `_make_token_sequences` instead of `_make_ngrams`. Defaults to False.
+
+        Returns:
+            List: List of all matches in the text, as produced by `_get_all_matches`.
+            NOTE(review): the spaCy component iterates this as a list of lists of
+            dicts with keys 'start', 'end', 'cui', 'similarity', 'semtypes' -- confirm.
+        """
+
+        ngrams = None
         if ignore_syntax:
-            ngrams = self._make_token_sequences(parsed)
+            ngrams = self._make_token_sequences(doc)
         else:
-            ngrams = self._make_ngrams(parsed)
+            ngrams = self._make_ngrams(doc)
 
         matches = self._get_all_matches(ngrams)
 
         if best_match:
             matches = self._select_terms(matches)
 
-        self._print_verbose_status(parsed, matches)
-
+        self._print_verbose_status(doc, matches)
+        
         return matches
diff --git a/quickumls/spacy_component.py b/quickumls/spacy_component.py
@@ -0,0 +1,67 @@
+import spacy
+from spacy.tokens import Span
+from spacy.strings import StringStore
+
+from .core import QuickUMLS
+from . import constants
+
+class SpacyQuickUMLS(object):
+    """spaCy pipeline component that adds QuickUMLS concept matches as entities.
+
+    Each accepted match becomes an entity Span whose label is the UMLS CUI; the
+    Span's "underscore" object carries "similarity" and "semtypes" for the match.
+    """
+
+    name = 'QuickUMLS matcher'
+
+    def __init__(self, nlp, quickumls_fp, best_match=True, ignore_syntax=False, **kwargs):
+        """Instantiate SpacyQuickUMLS object
+
+            This creates a QuickUMLS spaCy component which can be used in modular pipelines.
+
+        Args:
+            nlp: Existing spaCy pipeline.  This is needed to update the vocabulary with UMLS CUI values
+            quickumls_fp (str): Path to QuickUMLS data
+            best_match (bool, optional): Passed to QuickUMLS._match on every call. Defaults to True.
+            ignore_syntax (bool, optional): Passed to QuickUMLS._match on every call. Defaults to False.
+            **kwargs: QuickUMLS keyword arguments (see QuickUMLS in core.py)
+        """
+
+        # By default, the QuickUMLS object creates its own internal spaCy pipeline, but
+        # that is not needed when it runs as a component inside an existing pipeline.
+        self.quickumls = QuickUMLS(quickumls_fp, spacy_component=True, **kwargs)
+
+        # save this off so that we can get vocab values of labels later
+        self.nlp = nlp
+
+        # keep these for matching
+        self.best_match = best_match
+        self.ignore_syntax = ignore_syntax
+
+        # Span extensions are registered globally and registering an existing name
+        # raises, so only add them when an earlier instance has not already done so.
+        for extension_name in ('similarity', 'semtypes'):
+            if not Span.has_extension(extension_name):
+                Span.set_extension(extension_name, default=-1.0)
+
+    def __call__(self, doc):
+        """Annotate `doc` with QuickUMLS matches and return it.
+
+        Candidates that cannot be turned into a Span (character offsets that do
+        not fall on token boundaries) or that overlap an entity already on the
+        document are skipped instead of raising.
+
+        Args:
+            doc: spaCy Doc as parsed by the preceding pipeline components.
+
+        Returns:
+            The same Doc, with the new entity Spans added to `doc.ents`.
+        """
+        # pass in the document which has been parsed to this point in the pipeline for ngrams and matches
+        matches = self.quickumls._match(doc, best_match=self.best_match, ignore_syntax=self.ignore_syntax)
+
+        # spaCy rejects overlapping entities, so remember which tokens are already
+        # claimed, either by earlier components or by an earlier candidate here.
+        claimed_tokens = set()
+        for ent in doc.ents:
+            claimed_tokens.update(range(ent.start, ent.end))
+
+        new_ents = []
+        for match in matches:
+            # each match may hold several candidate concepts for the same ngram
+            # NOTE(review): candidates are assumed to be ordered best-first -- confirm
+            for ngram_match_dict in match:
+                start_char_idx = int(ngram_match_dict['start'])
+                end_char_idx = int(ngram_match_dict['end'])
+
+                # add the CUI to the spaCy vocab; add() returns the stored hash value
+                cui_label_value = self.nlp.vocab.strings.add(ngram_match_dict['cui'])
+
+                # char_span() creates a Span from these character indices, or returns
+                # None when they do not line up with token boundaries
+                span = doc.char_span(start_char_idx, end_char_idx, label=cui_label_value)
+                if span is None:
+                    continue
+
+                span_tokens = range(span.start, span.end)
+                if not claimed_tokens.isdisjoint(span_tokens):
+                    continue
+                claimed_tokens.update(span_tokens)
+
+                # add some custom metadata to the spans
+                span._.similarity = ngram_match_dict['similarity']
+                span._.semtypes = ngram_match_dict['semtypes']
+                new_ents.append(span)
+
+        # assign once instead of rebuilding doc.ents for every candidate
+        if new_ents:
+            doc.ents = list(doc.ents) + new_ents
+
+        return doc