
Commit

fixed loading error for customized pipelines + add conllu conversion for trankit outputs
minhhdvn committed Jun 19, 2021
1 parent cbbbd30 commit 1c19b9b
Showing 8 changed files with 211 additions and 38 deletions.
29 changes: 28 additions & 1 deletion docs/source/training.md
@@ -118,7 +118,8 @@ import trankit

trankit.verify_customized_pipeline(
category='customized-mwt-ner', # pipeline category
save_dir='./save_dir' # directory used for saving models in previous steps
save_dir='./save_dir', # directory used for saving models in previous steps
embedding_name='xlm-roberta-base' # embedding version used for training the customized pipeline; defaults to `xlm-roberta-base`
)
```
If the verification is successful, this would print out the following:
@@ -130,3 +131,29 @@ from trankit import Pipeline
p = Pipeline(lang='customized-mwt-ner', cache_dir='./save_dir')
```
From now on, the customized pipeline can be used as a normal pretrained pipeline.
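For instance, here is a minimal usage sketch, assuming the verification above has succeeded (the input sentence is only illustrative):
```
import trankit
from trankit import Pipeline

# initialize the customized pipeline from the directory used during training
p = Pipeline(lang='customized-mwt-ner', cache_dir='./save_dir')

# annotate raw text; the output is a Python dictionary
doc = p('I love you more than I can say. Do you know that?')

# convert the output dictionary to a CoNLL-U string
print(trankit.trankit2conllu(doc))
```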

The verification fails if some of the expected model files of the pipeline are missing. This can be fixed with the helper function `download_missing_files`, which borrows model files from the pretrained pipelines provided by Trankit. Suppose the language of your customized pipeline is English; the function can then be used as below:
```
import trankit

trankit.download_missing_files(
category='customized-ner',
save_dir='./save_dir',
embedding_name='xlm-roberta-base',
language='english'
)
```
where `category` is the category specified for the customized pipeline, `save_dir` is the directory in which the customized models were saved, `embedding_name` is the embedding used for the customized pipeline (`xlm-roberta-base` by default if it was not specified during training), and `language` is the pretrained language whose models we want to borrow. For example, if we only trained a NER model for the customized pipeline, the snippet above would borrow the pretrained English models for all the other pipeline components and print out the following message:
```
Missing ./save_dir/xlm-roberta-base/customized-ner/customized-ner.tokenizer.mdl
Missing ./save_dir/xlm-roberta-base/customized-ner/customized-ner.tagger.mdl
Missing ./save_dir/xlm-roberta-base/customized-ner/customized-ner.vocabs.json
Missing ./save_dir/xlm-roberta-base/customized-ner/customized-ner_lemmatizer.pt
http://nlp.uoregon.edu/download/trankit/v1.0.0/xlm-roberta-base/english.zip
Downloading: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 47.9M/47.9M [00:00<00:00, 114MiB/s]
Copying ./save_dir/xlm-roberta-base/english/english.tokenizer.mdl to ./save_dir/xlm-roberta-base/customized-ner/customized-ner.tokenizer.mdl
Copying ./save_dir/xlm-roberta-base/english/english.tagger.mdl to ./save_dir/xlm-roberta-base/customized-ner/customized-ner.tagger.mdl
Copying ./save_dir/xlm-roberta-base/english/english.vocabs.json to ./save_dir/xlm-roberta-base/customized-ner/customized-ner.vocabs.json
Copying ./save_dir/xlm-roberta-base/english/english_lemmatizer.pt to ./save_dir/xlm-roberta-base/customized-ner/customized-ner_lemmatizer.pt
```
After this, we can go back and run the verification step again, as sketched below.
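For this NER example, the verification call would look like the following (assuming the same `save_dir` and embedding as above):
```
import trankit

trankit.verify_customized_pipeline(
    category='customized-ner', # pipeline category
    save_dir='./save_dir', # directory used for saving models in previous steps
    embedding_name='xlm-roberta-base' # embedding used for training the customized pipeline
)
```
If all expected model files are now in place, this prints the ready-to-use message shown earlier.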
132 changes: 101 additions & 31 deletions trankit/__init__.py
@@ -1,47 +1,119 @@
from .pipeline import Pipeline
from .tpipeline import TPipeline
from .pipeline import supported_langs, langwithner, remove_with_path
from .utils.base_utils import download, trankit2conllu
from .utils.tbinfo import supported_embeddings, supported_langs, saved_model_version
import os
from shutil import copyfile

__version__ = "1.0.1"
__version__ = "1.1.0"


def verify_customized_pipeline(category, save_dir):
def download_missing_files(category, save_dir, embedding_name, language):
assert language in supported_langs, '{} is not a pretrained language. Current pretrained languages: {}'.format(language, supported_langs)
assert embedding_name in supported_embeddings, '{} has not been supported. Current supported embeddings: {}'.format(embedding_name, supported_embeddings)

import os
assert category in {'customized', 'customized-ner', 'customized-mwt',
'customized-mwt-ner'}, "Pipeline category must be one of the following: 'customized', 'customized-ner', 'customized-mwt', 'customized-mwt-ner'"
'customized-mwt-ner'}, "Pipeline category must be one of the following: 'customized', 'customized-ner', 'customized-mwt', 'customized-mwt-ner'"
if category == 'customized':
file_list = [
('{}.tokenizer.mdl', os.path.join(save_dir, embedding_name, category, '{}.tokenizer.mdl'.format(category))),
('{}.tagger.mdl', os.path.join(save_dir, embedding_name, category, '{}.tagger.mdl'.format(category))),
('{}.vocabs.json', os.path.join(save_dir, embedding_name, category, '{}.vocabs.json'.format(category))),
('{}_lemmatizer.pt', os.path.join(save_dir, embedding_name, category, '{}_lemmatizer.pt'.format(category)))
]
elif category == 'customized-ner':
file_list = [
('{}.tokenizer.mdl', os.path.join(save_dir, embedding_name, category, '{}.tokenizer.mdl'.format(category))),
('{}.tagger.mdl', os.path.join(save_dir, embedding_name, category, '{}.tagger.mdl'.format(category))),
('{}.vocabs.json', os.path.join(save_dir, embedding_name, category, '{}.vocabs.json'.format(category))),
('{}_lemmatizer.pt', os.path.join(save_dir, embedding_name, category, '{}_lemmatizer.pt'.format(category))),
('{}.ner.mdl', os.path.join(save_dir, embedding_name, category, '{}.ner.mdl'.format(category))),
('{}.ner-vocab.json', os.path.join(save_dir, embedding_name, category, '{}.ner-vocab.json'.format(category)))
]
elif category == 'customized-mwt':
file_list = [
('{}.tokenizer.mdl', os.path.join(save_dir, embedding_name, category, '{}.tokenizer.mdl'.format(category))),
('{}_mwt_expander.pt', os.path.join(save_dir, embedding_name, category, '{}_mwt_expander.pt'.format(category))),
('{}.tagger.mdl', os.path.join(save_dir, embedding_name, category, '{}.tagger.mdl'.format(category))),
('{}.vocabs.json', os.path.join(save_dir, embedding_name, category, '{}.vocabs.json'.format(category))),
('{}_lemmatizer.pt', os.path.join(save_dir, embedding_name, category, '{}_lemmatizer.pt'.format(category)))
]
elif category == 'customized-mwt-ner':
file_list = [
('{}.tokenizer.mdl', os.path.join(save_dir, embedding_name, category, '{}.tokenizer.mdl'.format(category))),
('{}_mwt_expander.pt', os.path.join(save_dir, embedding_name, category, '{}_mwt_expander.pt'.format(category))),
('{}.tagger.mdl', os.path.join(save_dir, embedding_name, category, '{}.tagger.mdl'.format(category))),
('{}.vocabs.json', os.path.join(save_dir, embedding_name, category, '{}.vocabs.json'.format(category))),
('{}_lemmatizer.pt', os.path.join(save_dir, embedding_name, category, '{}_lemmatizer.pt'.format(category))),
('{}.ner.mdl', os.path.join(save_dir, embedding_name, category, '{}.ner.mdl'.format(category))),
('{}.ner-vocab.json', os.path.join(save_dir, embedding_name, category, '{}.ner-vocab.json'.format(category)))
]
else:
assert 'Unknown customized lang!'
missing_filenamess = []
for filename, filepath in file_list:
if not os.path.exists(filepath):
print('Missing {}'.format(filepath))
missing_filenamess.append(filename)

download(
cache_dir=save_dir,
language=language,
saved_model_version=saved_model_version, # manually set this to avoid duplicated storage
embedding_name=embedding_name
)
# borrow pretrained files
src_dir = os.path.join(save_dir, embedding_name, language)
tgt_dir = os.path.join(save_dir, embedding_name, category)
for fname in missing_filenamess:
copyfile(os.path.join(src_dir, fname.format(language)), os.path.join(tgt_dir, fname.format(category)))
print('Copying {} to {}'.format(
os.path.join(src_dir, fname.format(language)),
os.path.join(tgt_dir, fname.format(category))
))
remove_with_path(src_dir)


def verify_customized_pipeline(category, save_dir, embedding_name):
assert embedding_name in supported_embeddings, '{} has not been supported. Current supported embeddings: {}'.format(
embedding_name, supported_embeddings)
assert category in {'customized', 'customized-ner', 'customized-mwt',
'customized-mwt-ner'}, "Pipeline category must be one of the following: 'customized', 'customized-ner', 'customized-mwt', 'customized-mwt-ner'"
if category == 'customized':
file_list = [
os.path.join(save_dir, category, '{}.tokenizer.mdl'.format(category)),
os.path.join(save_dir, category, '{}.tagger.mdl'.format(category)),
os.path.join(save_dir, category, '{}.vocabs.json'.format(category)),
os.path.join(save_dir, category, '{}_lemmatizer.pt'.format(category))
os.path.join(save_dir, embedding_name, category, '{}.tokenizer.mdl'.format(category)),
os.path.join(save_dir, embedding_name, category, '{}.tagger.mdl'.format(category)),
os.path.join(save_dir, embedding_name, category, '{}.vocabs.json'.format(category)),
os.path.join(save_dir, embedding_name, category, '{}_lemmatizer.pt'.format(category))
]
elif category == 'customized-ner':
file_list = [
os.path.join(save_dir, category, '{}.tokenizer.mdl'.format(category)),
os.path.join(save_dir, category, '{}.tagger.mdl'.format(category)),
os.path.join(save_dir, category, '{}.vocabs.json'.format(category)),
os.path.join(save_dir, category, '{}_lemmatizer.pt'.format(category)),
os.path.join(save_dir, category, '{}.ner.mdl'.format(category)),
os.path.join(save_dir, category, '{}.ner-vocab.json'.format(category))
os.path.join(save_dir, embedding_name, category, '{}.tokenizer.mdl'.format(category)),
os.path.join(save_dir, embedding_name, category, '{}.tagger.mdl'.format(category)),
os.path.join(save_dir, embedding_name, category, '{}.vocabs.json'.format(category)),
os.path.join(save_dir, embedding_name, category, '{}_lemmatizer.pt'.format(category)),
os.path.join(save_dir, embedding_name, category, '{}.ner.mdl'.format(category)),
os.path.join(save_dir, embedding_name, category, '{}.ner-vocab.json'.format(category))
]
elif category == 'customized-mwt':
file_list = [
os.path.join(save_dir, category, '{}.tokenizer.mdl'.format(category)),
os.path.join(save_dir, category, '{}_mwt_expander.pt'.format(category)),
os.path.join(save_dir, category, '{}.tagger.mdl'.format(category)),
os.path.join(save_dir, category, '{}.vocabs.json'.format(category)),
os.path.join(save_dir, category, '{}_lemmatizer.pt'.format(category))
os.path.join(save_dir, embedding_name, category, '{}.tokenizer.mdl'.format(category)),
os.path.join(save_dir, embedding_name, category, '{}_mwt_expander.pt'.format(category)),
os.path.join(save_dir, embedding_name, category, '{}.tagger.mdl'.format(category)),
os.path.join(save_dir, embedding_name, category, '{}.vocabs.json'.format(category)),
os.path.join(save_dir, embedding_name, category, '{}_lemmatizer.pt'.format(category))
]
elif category == 'customized-mwt-ner':
file_list = [
os.path.join(save_dir, category, '{}.tokenizer.mdl'.format(category)),
os.path.join(save_dir, category, '{}_mwt_expander.pt'.format(category)),
os.path.join(save_dir, category, '{}.tagger.mdl'.format(category)),
os.path.join(save_dir, category, '{}.vocabs.json'.format(category)),
os.path.join(save_dir, category, '{}_lemmatizer.pt'.format(category)),
os.path.join(save_dir, category, '{}.ner.mdl'.format(category)),
os.path.join(save_dir, category, '{}.ner-vocab.json'.format(category))
os.path.join(save_dir, embedding_name, category, '{}.tokenizer.mdl'.format(category)),
os.path.join(save_dir, embedding_name, category, '{}_mwt_expander.pt'.format(category)),
os.path.join(save_dir, embedding_name, category, '{}.tagger.mdl'.format(category)),
os.path.join(save_dir, embedding_name, category, '{}.vocabs.json'.format(category)),
os.path.join(save_dir, embedding_name, category, '{}_lemmatizer.pt'.format(category)),
os.path.join(save_dir, embedding_name, category, '{}.ner.mdl'.format(category)),
os.path.join(save_dir, embedding_name, category, '{}.ner-vocab.json'.format(category))
]
else:
assert 'Unknown customized lang!'
@@ -52,13 +124,11 @@ def verify_customized_pipeline(category, save_dir):
verified = False
print('Missing {}'.format(filepath))
if verified:
with open(os.path.join(save_dir, category, '{}.downloaded'.format(category)), 'w') as f:
with open(os.path.join(save_dir, embedding_name, category, '{}.downloaded'.format(category)), 'w') as f:
f.write('')
remove_with_path(os.path.join(save_dir, category, 'train.txt.character'))
remove_with_path(os.path.join(save_dir, category, 'logs'))
remove_with_path(os.path.join(save_dir, category, 'preds'))
remove_with_path(os.path.join(save_dir, category, 'xlm-roberta-large'))
remove_with_path(os.path.join(save_dir, category, 'xlm-roberta-base'))
remove_with_path(os.path.join(save_dir, embedding_name, category, 'train.txt.character'))
remove_with_path(os.path.join(save_dir, embedding_name, category, 'logs'))
remove_with_path(os.path.join(save_dir, embedding_name, category, 'preds'))
print(
"Customized pipeline is ready to use!\nIt can be initialized as follows:\n-----------------------------------\nfrom trankit import Pipeline\np = Pipeline(lang='{}', cache_dir='{}')".format(
category, save_dir))
2 changes: 1 addition & 1 deletion trankit/pipeline.py
@@ -70,7 +70,7 @@ def __init__(self, lang, cache_dir=None, gpu=True, embedding='xlm-roberta-base')
download(
cache_dir=self._config._cache_dir,
language=lang,
saved_model_version='v1.0.0', # manually set this to avoid duplicated storage
saved_model_version=saved_model_version, # manually set this to avoid duplicated storage
embedding_name=master_config.embedding_name
)

33 changes: 33 additions & 0 deletions trankit/tests/test_training.py
@@ -0,0 +1,33 @@
import trankit

# initialize a trainer for the task
trainer = trankit.TPipeline(
training_config={
'category': 'customized-ner', # pipeline category
'task': 'ner', # task name
'save_dir': './save_dir', # directory to save the trained model
'train_bio_fpath': './train.bio', # training data in BIO format
'dev_bio_fpath': './dev.bio', # training data in BIO format
'max_epoch': 1
}
)

# start training
trainer.train()

trankit.download_missing_files(
category='customized-ner',
save_dir='./save_dir',
embedding_name='xlm-roberta-base',
language='english'
)

trankit.verify_customized_pipeline(
category='customized-ner', # pipeline category
save_dir='./save_dir', # directory used for saving models in previous steps
embedding_name='xlm-roberta-base'
)

p = trankit.Pipeline(lang='customized-ner', cache_dir='./save_dir')

print(trankit.trankit2conllu(p('I love you more than I can say. Do you know that?')))
6 changes: 2 additions & 4 deletions trankit/tpipeline.py
@@ -160,7 +160,7 @@ def _set_up_config(self, training_config):

# device and save dir
self._save_dir = training_config['save_dir'] if 'save_dir' in training_config else './cache/'
self._save_dir = os.path.join(self._save_dir, self._lang)
self._save_dir = os.path.join(self._save_dir, master_config.embedding_name, self._lang)
self._cache_dir = self._save_dir
self._gpu = training_config['gpu'] if 'gpu' in training_config else True
self._use_gpu = training_config['gpu'] if 'gpu' in training_config else True
@@ -211,9 +211,7 @@ def _set_up_config(self, training_config):
# wordpiece splitter
if self._task not in ['mwt', 'lemmatize']:
master_config.wordpiece_splitter = XLMRobertaTokenizer.from_pretrained(master_config.embedding_name,
cache_dir=os.path.join(
master_config._save_dir,
master_config.embedding_name))
cache_dir=master_config._save_dir)

def _prepare_tokenize(self):
self.train_set = TokenizeDataset(
37 changes: 36 additions & 1 deletion trankit/utils/base_utils.py
@@ -18,6 +18,40 @@
SPACE_RE = re.compile(r'\s')


def trankit2conllu(trankit_output):
assert type(trankit_output) == dict, "`trankit_output` must be a Python dictionary!"
if SENTENCES in trankit_output and len(trankit_output[SENTENCES]) > 0 and TOKENS in trankit_output[SENTENCES][0]:
output_type = 'document'
elif TOKENS in trankit_output:
output_type = 'sentence'
else:
print("Unknown format of `trankit_output`!")
return None
try:
if output_type == 'document':
json_doc = trankit_output[SENTENCES]
else:
assert output_type == 'sentence'
json_doc = [trankit_output]

conllu_doc = []
for sentence in json_doc:
conllu_sentence = []
for token in sentence[TOKENS]:
if type(token[ID]) == int or len(token[ID]) == 1:
conllu_sentence.append(token)
else:
conllu_sentence.append(token)
for word in token[EXPANDED]:
conllu_sentence.append(word)
conllu_doc.append(conllu_sentence)

return CoNLL.dict2conllstring(conllu_doc)
except:
print('Unsuccessful conversion! Please check the format of `trankit_output`')
return None


def remove_with_path(path):
if os.path.exists(path):
if os.path.isdir(path):
@@ -62,7 +96,8 @@ def download(cache_dir, language, saved_model_version, embedding_name): # put a
save_fpath = os.path.join(lang_dir, '{}.zip'.format(language))

if not os.path.exists(os.path.join(lang_dir, '{}.downloaded'.format(language))):
url = "http://nlp.uoregon.edu/download/trankit/{}/{}/{}.zip".format(saved_model_version, embedding_name, language)
url = "http://nlp.uoregon.edu/download/trankit/{}/{}/{}.zip".format(saved_model_version, embedding_name,
language)
print(url)

response = requests.get(url, stream=True)
8 changes: 8 additions & 0 deletions trankit/utils/conll.py
@@ -166,3 +166,11 @@ def dict2conll(doc_dict, filename):
conll_string = CoNLL.conll_as_string(doc_conll)
with open(filename, 'w') as outfile:
outfile.write(conll_string)

@staticmethod
def dict2conllstring(doc_dict):
""" Convert the dictionary format input data to the CoNLL-U format output data and write to a file.
"""
doc_conll = CoNLL.convert_dict(doc_dict)
conll_string = CoNLL.conll_as_string(doc_conll)
return conll_string