Skip to content

Commit

Permalink
Allow QuickUMLS to be used as component in spacy pipeline (#57)
Browse files Browse the repository at this point in the history
* Implementation to allow QuickUMLS to be used as a component within a spacy pipeline.  This can be used as an entity matcher for UMLS concepts in other modular spacy pipelines.  This is already being used in an operational capacity for syndromic surveillance.

* Factoring QuickUMLS class slightly to minimize duplicated code between QuickUMLS and SpacyQuickUMLS.  Fixed keyword arguments to QuickUMLS when creating a spacy component.  Added documentation as well.  Removing previous standalone example Python file and instead adding an example of a QuickUMLS spacy pipeline to the README.
  • Loading branch information
burgersmoke authored Sep 3, 2020
1 parent 8420573 commit c0b5db0
Show file tree
Hide file tree
Showing 3 changed files with 135 additions and 17 deletions.
23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,29 @@ Set `best_match` to `False` if you want to return overlapping candidates, `ignor

If the matcher throws a warning during initialization, read [this page](https://github.com/Georgetown-IR-Lab/QuickUMLS/wiki/Migration-QuickUMLS-1.3-to-1.4) to learn why and how to stop it from doing so.

## spaCy pipeline component

QuickUMLS can be used for standalone processing, but it can also be used as a component in a modular spaCy pipeline. Following the usual spaCy convention, matched concepts are added to the Document object as entity spans. Each entity uses the UMLS CUI as its label, and its similarity score and semantic types are available through the spaCy "underscore" object (`ent._`).

Adding QuickUMLS as a component in a pipeline can be done as follows:

```python
import spacy

from quickumls.spacy_component import SpacyQuickUMLS

# common English pipeline
nlp = spacy.load('en_core_web_sm')

# QuickUMLS matches are appended to the entities produced by the pipeline
quickumls_component = SpacyQuickUMLS(nlp, 'PATH_TO_QUICKUMLS_DATA')
nlp.add_pipe(quickumls_component)

doc = nlp('Pt c/o shortness of breath, chest pain, nausea, vomiting, diarrrhea')

for ent in doc.ents:
    print('Entity text : {}'.format(ent.text))
    # the entity label is the UMLS CUI of the matched concept
    print('Label (UMLS CUI) : {}'.format(ent.label_))
    # similarity and semtypes are custom extension attributes set by the component
    print('Similarity : {}'.format(ent._.similarity))
    print('Semtypes : {}'.format(ent._.semtypes))
```

## Server / Client Support

Expand Down
62 changes: 45 additions & 17 deletions quickumls/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ def __init__(
overlapping_criteria='score', threshold=0.7, window=5,
similarity_name='jaccard', min_match_length=3,
accepted_semtypes=constants.ACCEPTED_SEMTYPES,
verbose=False, keep_uppercase=False):
verbose=False, keep_uppercase=False,
spacy_component = False):
"""Instantiate QuickUMLS object
This is the main interface through which text can be processed.
Expand Down Expand Up @@ -146,18 +147,23 @@ def __init__(

self.accepted_semtypes = accepted_semtypes

try:
self.nlp = spacy.load(spacy_lang)
except OSError:
msg = (
'Model for language "{}" is not downloaded. Please '
'run "python -m spacy download {}" before launching '
'QuickUMLS'
).format(
self.language_flag,
constants.SPACY_LANGUAGE_MAP.get(self.language_flag, 'xx')
)
raise OSError(msg)
# a spaCy component receives documents parsed by an external pipeline; only standalone use loads its own
if spacy_component:
# In this case, the pipeline is external to this current class
self.nlp = None
else:
try:
self.nlp = spacy.load(spacy_lang)
except OSError:
msg = (
'Model for language "{}" is not downloaded. Please '
'run "python -m spacy download {}" before launching '
'QuickUMLS'
).format(
self.language_flag,
constants.SPACY_LANGUAGE_MAP.get(self.language_flag, 'xx')
)
raise OSError(msg)

self.ss_db = toolbox.SimstringDBReader(
simstring_fp, similarity_name, threshold
Expand Down Expand Up @@ -437,17 +443,39 @@ def match(self, text, best_match=True, ignore_syntax=False):
"""

parsed = self.nlp(u'{}'.format(text))

# pass in parsed spacy doc to get concept matches
matches = self._match(parsed)

return matches

def _match(self, doc, best_match=True, ignore_syntax=False):
    """Gather ngram matches given a spaCy document object.

    Unlike ``match``, this method does not parse raw text: it expects a
    document that has already been processed by a spaCy pipeline, which
    lets it be shared by standalone use and by the spaCy component.

    Args:
        doc (Document): spaCy Document object to be used for extracting ngrams
        best_match (bool, optional): Whether to return only the top match or
            all overlapping candidates. Defaults to True.
        ignore_syntax (bool, optional): When True, candidate ngrams are built
            with ``_make_token_sequences``; otherwise ``_make_ngrams`` is
            used. Defaults to False.
            NOTE(review): presumably ``_make_ngrams`` applies the syntactic
            heuristics from the paper (Soldaini and Goharian, 2016) - confirm.

    Returns:
        List: List of all matches in the document, as produced by
        ``_get_all_matches`` and, when ``best_match`` is set, reduced by
        ``_select_terms``.
    """
    # choose the candidate generator; both consume the parsed document
    if ignore_syntax:
        ngrams = self._make_token_sequences(doc)
    else:
        ngrams = self._make_ngrams(doc)

    matches = self._get_all_matches(ngrams)

    if best_match:
        matches = self._select_terms(matches)

    self._print_verbose_status(doc, matches)
    return matches
67 changes: 67 additions & 0 deletions quickumls/spacy_component.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import spacy
from spacy.tokens import Span
from spacy.strings import StringStore

from .core import QuickUMLS
from . import constants

class SpacyQuickUMLS(object):
    """spaCy pipeline component that adds QuickUMLS concept matches as entities.

    Each match becomes an entity Span whose label is the UMLS CUI; the
    similarity score and semantic types reported by QuickUMLS are stored on
    the Span's "underscore" object as ``similarity`` and ``semtypes``.
    """

    name = 'QuickUMLS matcher'

    def __init__(self, nlp, quickumls_fp, best_match=True, ignore_syntax=False, **kwargs):
        """Instantiate SpacyQuickUMLS object

        This creates a QuickUMLS spaCy component which can be used in modular pipelines.
        This module adds entity Spans to the document where the entity label is the UMLS CUI
        and the Span's "underscore" object is extended to contain "similarity" and "semtypes"
        for matched concepts.

        Args:
            nlp: Existing spaCy pipeline. This is needed to update the vocabulary with UMLS CUI values
            quickumls_fp (str): Path to QuickUMLS data
            best_match (bool, optional): Whether to return only the top match or all overlapping
                candidates. Forwarded to ``QuickUMLS._match``. Defaults to True.
            ignore_syntax (bool, optional): Forwarded to ``QuickUMLS._match``, where it selects
                how candidate ngrams are generated. Defaults to False.
            **kwargs: QuickUMLS keyword arguments (see QuickUMLS in core.py)
        """
        # By default QuickUMLS creates its own internal spaCy pipeline, which is
        # not needed when it runs as a component inside an existing pipeline.
        self.quickumls = QuickUMLS(
            quickumls_fp,
            spacy_component=True,
            **kwargs)

        # kept so that CUI labels can be registered in the pipeline's vocab later
        self.nlp = nlp

        # matching options applied on every call
        self.best_match = best_match
        self.ignore_syntax = ignore_syntax

        # Extensions are registered globally on Span and set_extension raises if
        # the name already exists, so guard to allow more than one instance.
        # NOTE(review): the -1.0 default for 'semtypes' is kept unchanged for
        # backward compatibility.
        if not Span.has_extension('similarity'):
            Span.set_extension('similarity', default=-1.0)
        if not Span.has_extension('semtypes'):
            Span.set_extension('semtypes', default=-1.0)

    def __call__(self, doc):
        """Annotate ``doc`` with QuickUMLS matches and return it.

        Candidates are processed in the order QuickUMLS returns them. A
        candidate is skipped when its character offsets do not map onto token
        boundaries, or when it shares a token with an entity that is already
        present (from an earlier component or an earlier candidate), because
        ``doc.ents`` does not accept overlapping spans.

        Args:
            doc: spaCy document parsed by the preceding pipeline components.

        Returns:
            The same document, with the accepted matches appended to ``doc.ents``.
        """
        # the document has been parsed up to this point in the pipeline
        matches = self.quickumls._match(
            doc, best_match=self.best_match, ignore_syntax=self.ignore_syntax)

        strings = self.nlp.vocab.strings

        # token indices already covered by an entity
        occupied = set()
        for ent in doc.ents:
            occupied.update(range(ent.start, ent.end))

        new_ents = []
        for match in matches:
            # each match may hold several candidate concepts
            for ngram_match_dict in match:
                start_char_idx = int(ngram_match_dict['start'])
                end_char_idx = int(ngram_match_dict['end'])

                # register the CUI in the vocab; add() returns its hash value
                cui_label_value = strings.add(ngram_match_dict['cui'])

                # char_span() creates a Span from these character indices and
                # returns None when they do not align with token boundaries
                span = doc.char_span(
                    start_char_idx, end_char_idx, label=cui_label_value)
                if span is None:
                    continue

                token_idxs = range(span.start, span.end)
                if any(idx in occupied for idx in token_idxs):
                    continue

                # add some custom metadata to the span
                span._.similarity = ngram_match_dict['similarity']
                span._.semtypes = ngram_match_dict['semtypes']

                occupied.update(token_idxs)
                new_ents.append(span)

        # assign once instead of rebuilding doc.ents for every span
        if new_ents:
            doc.ents = list(doc.ents) + new_ents

        return doc

0 comments on commit c0b5db0

Please sign in to comment.