diff --git a/README.md b/README.md index 3e42585..8ed022e 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,29 @@ Set `best_match` to `False` if you want to return overlapping candidates, `ignor If the matcher throws a warning during initialization, read [this page](https://github.com/Georgetown-IR-Lab/QuickUMLS/wiki/Migration-QuickUMLS-1.3-to-1.4) to learn why and how to stop it from doing so. +## spaCy pipeline component + +QuickUMLS can be used for standalone processing but it can also be used as a component in a modular spaCy pipeline. This follows traditional spaCy handling of concepts to be entity objects added to the Document object. These entity objects contain the CUI, similarity score and Semantic Types in the spacy "underscore" object. + +Adding QuickUMLS as a component in a pipeline can be done as follows: + +```python +from quickumls.spacy_component import SpacyQuickUMLS + +# common English pipeline +nlp = spacy.load('en_core_web_sm') + +quickumls_component = SpacyQuickUMLS(nlp, 'PATH_TO_QUICKUMLS_DATA') +nlp.add_pipe(quickumls_component) + +doc = nlp('Pt c/o shortness of breath, chest pain, nausea, vomiting, diarrrhea') + +for ent in doc.ents: + print('Entity text : {}'.format(ent.text)) + print('Label (UMLS CUI) : {}'.format(ent.label_)) + print('Similarity : {}'.format(ent._.similarity)) + print('Semtypes : {}'.format(ent._.semtypes)) +``` ## Server / Client Support diff --git a/quickumls/core.py b/quickumls/core.py index fa9955b..7164d18 100644 --- a/quickumls/core.py +++ b/quickumls/core.py @@ -26,7 +26,8 @@ def __init__( overlapping_criteria='score', threshold=0.7, window=5, similarity_name='jaccard', min_match_length=3, accepted_semtypes=constants.ACCEPTED_SEMTYPES, - verbose=False, keep_uppercase=False): + verbose=False, keep_uppercase=False, + spacy_component = False): """Instantiate QuickUMLS object This is the main interface through which text can be processed.
@@ -146,18 +147,23 @@ def __init__( self.accepted_semtypes = accepted_semtypes - try: - self.nlp = spacy.load(spacy_lang) - except OSError: - msg = ( - 'Model for language "{}" is not downloaded. Please ' - 'run "python -m spacy download {}" before launching ' - 'QuickUMLS' - ).format( - self.language_flag, - constants.SPACY_LANGUAGE_MAP.get(self.language_flag, 'xx') - ) - raise OSError(msg) + # if this is not being executed as a spaCy component, then it must be standalone + if spacy_component: + # In this case, the pipeline is external to this current class + self.nlp = None + else: + try: + self.nlp = spacy.load(spacy_lang) + except OSError: + msg = ( + 'Model for language "{}" is not downloaded. Please ' + 'run "python -m spacy download {}" before launching ' + 'QuickUMLS' + ).format( + self.language_flag, + constants.SPACY_LANGUAGE_MAP.get(self.language_flag, 'xx') + ) + raise OSError(msg) self.ss_db = toolbox.SimstringDBReader( simstring_fp, similarity_name, threshold @@ -437,17 +443,39 @@ def match(self, text, best_match=True, ignore_syntax=False): """ parsed = self.nlp(u'{}'.format(text)) + + # pass in parsed spacy doc to get concept matches + matches = self._match(parsed) + return matches + + def _match(self, doc, best_match=True, ignore_syntax=False): + """Gathers ngram matches given a spaCy document object. + + Shared implementation behind match(): extracts candidate ngrams from an already-parsed document and returns the concept matches found for them. + + Args: + doc (Document): spaCy Document object to be used for extracting ngrams + + best_match (bool, optional): Whether to return only the top match or all overlapping candidates. Defaults to True. + ignore_syntax (bool, optional): Whether to use the heuristics introduced in the paper (Soldaini and Goharian, 2016). TODO: clarify.
Defaults to False + + Returns: + List: List of all matches in the text + TODO: Describe format + """ + + ngrams = None if ignore_syntax: - ngrams = self._make_token_sequences(parsed) + ngrams = self._make_token_sequences(doc) else: - ngrams = self._make_ngrams(parsed) + ngrams = self._make_ngrams(doc) matches = self._get_all_matches(ngrams) if best_match: matches = self._select_terms(matches) - self._print_verbose_status(parsed, matches) - + self._print_verbose_status(doc, matches) + return matches diff --git a/quickumls/spacy_component.py b/quickumls/spacy_component.py new file mode 100644 index 0000000..f64982f --- /dev/null +++ b/quickumls/spacy_component.py @@ -0,0 +1,67 @@ +import spacy +from spacy.tokens import Span +from spacy.strings import StringStore + +from .core import QuickUMLS +from . import constants + +class SpacyQuickUMLS(object): + name = 'QuickUMLS matcher' + + def __init__(self, nlp, quickumls_fp, best_match=True, ignore_syntax=False, **kwargs): + """Instantiate SpacyQuickUMLS object + + This creates a QuickUMLS spaCy component which can be used in modular pipelines. + This module adds entity Spans to the document where the entity label is the UMLS CUI and the Span's "underscore" object is extended to contain "similarity" and "semtypes" for matched concepts. + + Args: + nlp: Existing spaCy pipeline. This is needed to update the vocabulary with UMLS CUI values + quickumls_fp (str): Path to QuickUMLS data + best_match (bool, optional): Whether to return only the top match or all overlapping candidates. Defaults to True. + ignore_syntax (bool, optional): Whether to use the heuristics introduced in the paper (Soldaini and Goharian, 2016). TODO: clarify.
Defaults to False + **kwargs: QuickUMLS keyword arguments (see QuickUMLS in core.py) + """ + + self.quickumls = QuickUMLS(quickumls_fp, + # By default, the QuickUMLS object creates its own internal spacy pipeline but this is not needed + # when we're using it as a component in a pipeline + spacy_component = True, + **kwargs) + + # save this off so that we can get vocab values of labels later + self.nlp = nlp + + # keep these for matching + self.best_match = best_match + self.ignore_syntax = ignore_syntax + + # let's extend this with some properties that we want; force=True avoids a ValueError if another SpacyQuickUMLS instance already registered these extensions + Span.set_extension('similarity', default = -1.0, force=True) + Span.set_extension('semtypes', default = None, force=True) + + def __call__(self, doc): + # pass in the document which has been parsed to this point in the pipeline for ngrams and matches + matches = self.quickumls._match(doc, best_match=self.best_match, ignore_syntax=self.ignore_syntax) + + # Convert QuickUMLS match objects into Spans + for match in matches: + # each match may match multiple ngrams + for ngram_match_dict in match: + start_char_idx = int(ngram_match_dict['start']) + end_char_idx = int(ngram_match_dict['end']) + + cui = ngram_match_dict['cui'] + # add the string to the spacy vocab + self.nlp.vocab.strings.add(cui) + # pull out the value + cui_label_value = self.nlp.vocab.strings[cui] + + # char_span() creates a Span from these character indices + # UMLS CUI should work well as the label here + span = doc.char_span(start_char_idx, end_char_idx, label = cui_label_value) + if span is not None:  # char_span() returns None when the indices don't align with token boundaries + span._.similarity = ngram_match_dict['similarity'] + span._.semtypes = ngram_match_dict['semtypes'] + doc.ents = list(doc.ents) + [span] + + return doc