diff --git a/stanza/models/constituency/parse_tree.py b/stanza/models/constituency/parse_tree.py
index 9dbaf6dc39..bbdc51f063 100644
--- a/stanza/models/constituency/parse_tree.py
+++ b/stanza/models/constituency/parse_tree.py
@@ -18,7 +18,11 @@
 
 EMPTY_CHILDREN = ()
 
-CONSTITUENT_SPLIT = re.compile("[-=#]")
+# Used to split off the functional tags from various treebanks.
+# For example, the Icelandic treebank (which we don't currently
+# incorporate) uses * to distinguish 'ADJP' from 'ADJP*OC', but we
+# treat those as the same constituent.
+CONSTITUENT_SPLIT = re.compile("[-=#*]")
 
 # These words occur in the VLSP dataset.
 # The documentation claims there might be *O*, although those don't