diff --git a/docs/source/training.md b/docs/source/training.md
index 3a6e1e9..549fa31 100644
--- a/docs/source/training.md
+++ b/docs/source/training.md
@@ -118,7 +118,8 @@ import trankit
 
 trankit.verify_customized_pipeline(
     category='customized-mwt-ner', # pipeline category
-    save_dir='./save_dir' # directory used for saving models in previous steps
+    save_dir='./save_dir', # directory used for saving models in previous steps
+    embedding_name='xlm-roberta-base' # embedding used for training the customized pipeline; by default, it is `xlm-roberta-base`
 )
 ```
 If the verification is success, this would printout the following:
@@ -130,3 +131,29 @@ from trankit import Pipeline
 p = Pipeline(lang='customized-mwt-ner', cache_dir='./save_dir')
 ```
 From now on, the customized pipeline can be used as a normal pretrained pipeline.
+
+The verification will fail if some of the expected model files of the pipeline are missing. This can be solved with the helper function `download_missing_files`, which borrows model files from the pretrained pipelines provided by Trankit. Suppose that the language of your customized pipeline is English; the function can be used as below:
+```
+import trankit
+
+trankit.download_missing_files(
+    category='customized-ner',
+    save_dir='./save_dir',
+    embedding_name='xlm-roberta-base',
+    language='english'
+)
+```
+where `category` is the category that we specified for the customized pipeline, `save_dir` is the path to the directory where we saved the customized models, `embedding_name` is the embedding that we used for the customized pipeline (`xlm-roberta-base` by default if we did not specify it during training), and `language` is the language whose pretrained models we want to borrow. For example, if we only trained a NER model for the customized pipeline, the snippet above would borrow the pretrained models for all the other pipeline components and print out the following message:
+```
+Missing ./save_dir/xlm-roberta-base/customized-ner/customized-ner.tokenizer.mdl
+Missing ./save_dir/xlm-roberta-base/customized-ner/customized-ner.tagger.mdl
+Missing ./save_dir/xlm-roberta-base/customized-ner/customized-ner.vocabs.json
+Missing ./save_dir/xlm-roberta-base/customized-ner/customized-ner_lemmatizer.pt
+http://nlp.uoregon.edu/download/trankit/v1.0.0/xlm-roberta-base/english.zip
+Downloading: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 47.9M/47.9M [00:00<00:00, 114MiB/s]
+Copying ./save_dir/xlm-roberta-base/english/english.tokenizer.mdl to ./save_dir/xlm-roberta-base/customized-ner/customized-ner.tokenizer.mdl
+Copying ./save_dir/xlm-roberta-base/english/english.tagger.mdl to ./save_dir/xlm-roberta-base/customized-ner/customized-ner.tagger.mdl
+Copying ./save_dir/xlm-roberta-base/english/english.vocabs.json to ./save_dir/xlm-roberta-base/customized-ner/customized-ner.vocabs.json
+Copying ./save_dir/xlm-roberta-base/english/english_lemmatizer.pt to ./save_dir/xlm-roberta-base/customized-ner/customized-ner_lemmatizer.pt
+```
+After this, we can go back and run the verification step again.
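Read together with the `trankit/__init__.py` changes below, the documented call implies that all model files now live under `save_dir/<embedding_name>/<category>/`. The following sketch is not part of the patch: it shows how one could pre-check which of the expected `customized-ner` files are still missing before calling `download_missing_files`. The helper name `list_missing_files` is hypothetical; the file names and layout are taken from the `verify_customized_pipeline` lists in this patch.

```python
# Sketch only (not part of this patch): pre-check which expected model files
# are missing for a 'customized-ner' pipeline, mirroring the file list and the
# save_dir/<embedding_name>/<category>/ layout introduced by this change.
import os

def list_missing_files(save_dir='./save_dir',
                       embedding_name='xlm-roberta-base',
                       category='customized-ner'):
    expected = [
        '{}.tokenizer.mdl'.format(category),
        '{}.tagger.mdl'.format(category),
        '{}.vocabs.json'.format(category),
        '{}_lemmatizer.pt'.format(category),
        '{}.ner.mdl'.format(category),
        '{}.ner-vocab.json'.format(category),
    ]
    base = os.path.join(save_dir, embedding_name, category)
    return [f for f in expected if not os.path.exists(os.path.join(base, f))]

# Example: on a fresh save_dir this prints all six expected file names.
print(list_missing_files())
```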
diff --git a/trankit/__init__.py b/trankit/__init__.py
index 8148f29..c75cac7 100644
--- a/trankit/__init__.py
+++ b/trankit/__init__.py
@@ -1,47 +1,119 @@
 from .pipeline import Pipeline
 from .tpipeline import TPipeline
 from .pipeline import supported_langs, langwithner, remove_with_path
+from .utils.base_utils import download, trankit2conllu
+from .utils.tbinfo import supported_embeddings, supported_langs, saved_model_version
+import os
+from shutil import copyfile
 
-__version__ = "1.0.1"
+__version__ = "1.1.0"
 
 
-def verify_customized_pipeline(category, save_dir):
+def download_missing_files(category, save_dir, embedding_name, language):
+    assert language in supported_langs, '{} is not a pretrained language. Current pretrained languages: {}'.format(language, supported_langs)
+    assert embedding_name in supported_embeddings, '{} has not been supported. Current supported embeddings: {}'.format(embedding_name, supported_embeddings)
+
     import os
     assert category in {'customized', 'customized-ner', 'customized-mwt',
-                       'customized-mwt-ner'}, "Pipeline category must be one of the following: 'customized', 'customized-ner', 'customized-mwt', 'customized-mwt-ner'"
+                        'customized-mwt-ner'}, "Pipeline category must be one of the following: 'customized', 'customized-ner', 'customized-mwt', 'customized-mwt-ner'"
+    if category == 'customized':
+        file_list = [
+            ('{}.tokenizer.mdl', os.path.join(save_dir, embedding_name, category, '{}.tokenizer.mdl'.format(category))),
+            ('{}.tagger.mdl', os.path.join(save_dir, embedding_name, category, '{}.tagger.mdl'.format(category))),
+            ('{}.vocabs.json', os.path.join(save_dir, embedding_name, category, '{}.vocabs.json'.format(category))),
+            ('{}_lemmatizer.pt', os.path.join(save_dir, embedding_name, category, '{}_lemmatizer.pt'.format(category)))
+        ]
+    elif category == 'customized-ner':
+        file_list = [
+            ('{}.tokenizer.mdl', os.path.join(save_dir, embedding_name, category, '{}.tokenizer.mdl'.format(category))),
+            ('{}.tagger.mdl', os.path.join(save_dir, embedding_name, category, '{}.tagger.mdl'.format(category))),
+            ('{}.vocabs.json', os.path.join(save_dir, embedding_name, category, '{}.vocabs.json'.format(category))),
+            ('{}_lemmatizer.pt', os.path.join(save_dir, embedding_name, category, '{}_lemmatizer.pt'.format(category))),
+            ('{}.ner.mdl', os.path.join(save_dir, embedding_name, category, '{}.ner.mdl'.format(category))),
+            ('{}.ner-vocab.json', os.path.join(save_dir, embedding_name, category, '{}.ner-vocab.json'.format(category)))
+        ]
+    elif category == 'customized-mwt':
+        file_list = [
+            ('{}.tokenizer.mdl', os.path.join(save_dir, embedding_name, category, '{}.tokenizer.mdl'.format(category))),
+            ('{}_mwt_expander.pt', os.path.join(save_dir, embedding_name, category, '{}_mwt_expander.pt'.format(category))),
+            ('{}.tagger.mdl', os.path.join(save_dir, embedding_name, category, '{}.tagger.mdl'.format(category))),
+            ('{}.vocabs.json', os.path.join(save_dir, embedding_name, category, '{}.vocabs.json'.format(category))),
+            ('{}_lemmatizer.pt', os.path.join(save_dir, embedding_name, category, '{}_lemmatizer.pt'.format(category)))
+        ]
+    elif category == 'customized-mwt-ner':
+        file_list = [
+            ('{}.tokenizer.mdl', os.path.join(save_dir, embedding_name, category, '{}.tokenizer.mdl'.format(category))),
+            ('{}_mwt_expander.pt', os.path.join(save_dir, embedding_name, category, '{}_mwt_expander.pt'.format(category))),
+            ('{}.tagger.mdl', os.path.join(save_dir, embedding_name, category, '{}.tagger.mdl'.format(category))),
+            ('{}.vocabs.json', os.path.join(save_dir, embedding_name, category, '{}.vocabs.json'.format(category))),
+            ('{}_lemmatizer.pt', os.path.join(save_dir, embedding_name, category, '{}_lemmatizer.pt'.format(category))),
+            ('{}.ner.mdl', os.path.join(save_dir, embedding_name, category, '{}.ner.mdl'.format(category))),
+            ('{}.ner-vocab.json', os.path.join(save_dir, embedding_name, category, '{}.ner-vocab.json'.format(category)))
+        ]
+    else:
+        assert 'Unknown customized lang!'
+    missing_filenames = []
+    for filename, filepath in file_list:
+        if not os.path.exists(filepath):
+            print('Missing {}'.format(filepath))
+            missing_filenames.append(filename)
+
+    download(
+        cache_dir=save_dir,
+        language=language,
+        saved_model_version=saved_model_version, # manually set this to avoid duplicated storage
+        embedding_name=embedding_name
+    )
+    # borrow pretrained files
+    src_dir = os.path.join(save_dir, embedding_name, language)
+    tgt_dir = os.path.join(save_dir, embedding_name, category)
+    for fname in missing_filenames:
+        copyfile(os.path.join(src_dir, fname.format(language)), os.path.join(tgt_dir, fname.format(category)))
+        print('Copying {} to {}'.format(
+            os.path.join(src_dir, fname.format(language)),
+            os.path.join(tgt_dir, fname.format(category))
+        ))
+    remove_with_path(src_dir)
+
+
+def verify_customized_pipeline(category, save_dir, embedding_name):
+    assert embedding_name in supported_embeddings, '{} has not been supported. Current supported embeddings: {}'.format(
+        embedding_name, supported_embeddings)
+    assert category in {'customized', 'customized-ner', 'customized-mwt',
+                        'customized-mwt-ner'}, "Pipeline category must be one of the following: 'customized', 'customized-ner', 'customized-mwt', 'customized-mwt-ner'"
     if category == 'customized':
         file_list = [
-            os.path.join(save_dir, category, '{}.tokenizer.mdl'.format(category)),
-            os.path.join(save_dir, category, '{}.tagger.mdl'.format(category)),
-            os.path.join(save_dir, category, '{}.vocabs.json'.format(category)),
-            os.path.join(save_dir, category, '{}_lemmatizer.pt'.format(category))
+            os.path.join(save_dir, embedding_name, category, '{}.tokenizer.mdl'.format(category)),
+            os.path.join(save_dir, embedding_name, category, '{}.tagger.mdl'.format(category)),
+            os.path.join(save_dir, embedding_name, category, '{}.vocabs.json'.format(category)),
+            os.path.join(save_dir, embedding_name, category, '{}_lemmatizer.pt'.format(category))
         ]
     elif category == 'customized-ner':
         file_list = [
-            os.path.join(save_dir, category, '{}.tokenizer.mdl'.format(category)),
-            os.path.join(save_dir, category, '{}.tagger.mdl'.format(category)),
-            os.path.join(save_dir, category, '{}.vocabs.json'.format(category)),
-            os.path.join(save_dir, category, '{}_lemmatizer.pt'.format(category)),
-            os.path.join(save_dir, category, '{}.ner.mdl'.format(category)),
-            os.path.join(save_dir, category, '{}.ner-vocab.json'.format(category))
+            os.path.join(save_dir, embedding_name, category, '{}.tokenizer.mdl'.format(category)),
+            os.path.join(save_dir, embedding_name, category, '{}.tagger.mdl'.format(category)),
+            os.path.join(save_dir, embedding_name, category, '{}.vocabs.json'.format(category)),
+            os.path.join(save_dir, embedding_name, category, '{}_lemmatizer.pt'.format(category)),
+            os.path.join(save_dir, embedding_name, category, '{}.ner.mdl'.format(category)),
+            os.path.join(save_dir, embedding_name, category, '{}.ner-vocab.json'.format(category))
         ]
     elif category == 'customized-mwt':
         file_list = [
-            os.path.join(save_dir, category, '{}.tokenizer.mdl'.format(category)),
-            os.path.join(save_dir, category, '{}_mwt_expander.pt'.format(category)),
-            os.path.join(save_dir, category, '{}.tagger.mdl'.format(category)),
-            os.path.join(save_dir, category, '{}.vocabs.json'.format(category)),
-            os.path.join(save_dir, category, '{}_lemmatizer.pt'.format(category))
+            os.path.join(save_dir, embedding_name, category, '{}.tokenizer.mdl'.format(category)),
+            os.path.join(save_dir, embedding_name, category, '{}_mwt_expander.pt'.format(category)),
+            os.path.join(save_dir, embedding_name, category, '{}.tagger.mdl'.format(category)),
+            os.path.join(save_dir, embedding_name, category, '{}.vocabs.json'.format(category)),
+            os.path.join(save_dir, embedding_name, category, '{}_lemmatizer.pt'.format(category))
         ]
     elif category == 'customized-mwt-ner':
         file_list = [
-            os.path.join(save_dir, category, '{}.tokenizer.mdl'.format(category)),
-            os.path.join(save_dir, category, '{}_mwt_expander.pt'.format(category)),
-            os.path.join(save_dir, category, '{}.tagger.mdl'.format(category)),
-            os.path.join(save_dir, category, '{}.vocabs.json'.format(category)),
-            os.path.join(save_dir, category, '{}_lemmatizer.pt'.format(category)),
-            os.path.join(save_dir, category, '{}.ner.mdl'.format(category)),
-            os.path.join(save_dir, category, '{}.ner-vocab.json'.format(category))
+            os.path.join(save_dir, embedding_name, category, '{}.tokenizer.mdl'.format(category)),
+            os.path.join(save_dir, embedding_name, category, '{}_mwt_expander.pt'.format(category)),
+            os.path.join(save_dir, embedding_name, category, '{}.tagger.mdl'.format(category)),
+            os.path.join(save_dir, embedding_name, category, '{}.vocabs.json'.format(category)),
+            os.path.join(save_dir, embedding_name, category, '{}_lemmatizer.pt'.format(category)),
+            os.path.join(save_dir, embedding_name, category, '{}.ner.mdl'.format(category)),
+            os.path.join(save_dir, embedding_name, category, '{}.ner-vocab.json'.format(category))
         ]
     else:
         assert 'Unknown customized lang!'
@@ -52,13 +124,11 @@ def verify_customized_pipeline(category, save_dir):
             verified = False
             print('Missing {}'.format(filepath))
     if verified:
-        with open(os.path.join(save_dir, category, '{}.downloaded'.format(category)), 'w') as f:
+        with open(os.path.join(save_dir, embedding_name, category, '{}.downloaded'.format(category)), 'w') as f:
             f.write('')
-        remove_with_path(os.path.join(save_dir, category, 'train.txt.character'))
-        remove_with_path(os.path.join(save_dir, category, 'logs'))
-        remove_with_path(os.path.join(save_dir, category, 'preds'))
-        remove_with_path(os.path.join(save_dir, category, 'xlm-roberta-large'))
-        remove_with_path(os.path.join(save_dir, category, 'xlm-roberta-base'))
+        remove_with_path(os.path.join(save_dir, embedding_name, category, 'train.txt.character'))
+        remove_with_path(os.path.join(save_dir, embedding_name, category, 'logs'))
+        remove_with_path(os.path.join(save_dir, embedding_name, category, 'preds'))
         print(
             "Customized pipeline is ready to use!\nIt can be initialized as follows:\n-----------------------------------\nfrom trankit import Pipeline\np = Pipeline(lang='{}', cache_dir='{}')".format(
                 category, save_dir))
diff --git a/trankit/pipeline.py b/trankit/pipeline.py
index bce0ed0..9a30f8c 100644
--- a/trankit/pipeline.py
+++ b/trankit/pipeline.py
@@ -70,7 +70,7 @@ def __init__(self, lang, cache_dir=None, gpu=True, embedding='xlm-roberta-base')
         download(
             cache_dir=self._config._cache_dir,
             language=lang,
-            saved_model_version='v1.0.0', # manually set this to avoid duplicated storage
+            saved_model_version=saved_model_version, # manually set this to avoid duplicated storage
             embedding_name=master_config.embedding_name
         )
 
diff --git a/trankit/tests/test_training.py b/trankit/tests/test_training.py
new file mode 100644
index 0000000..28a8bb6
--- /dev/null
+++ b/trankit/tests/test_training.py
@@ -0,0 +1,33 @@
+import trankit
+
+# initialize a trainer for the task
+trainer = trankit.TPipeline(
+    training_config={
+        'category': 'customized-ner', # pipeline category
+        'task': 'ner', # task name
+        'save_dir': './save_dir', # directory to save the trained model
+        'train_bio_fpath': './train.bio', # training data in BIO format
+        'dev_bio_fpath': './dev.bio', # development data in BIO format
+        'max_epoch': 1
+    }
+)
+
+# start training
+trainer.train()
+
+trankit.download_missing_files(
+    category='customized-ner',
+    save_dir='./save_dir',
+    embedding_name='xlm-roberta-base',
+    language='english'
+)
+
+trankit.verify_customized_pipeline(
+    category='customized-ner', # pipeline category
+    save_dir='./save_dir', # directory used for saving models in previous steps
+    embedding_name='xlm-roberta-base'
+)
+
+p = trankit.Pipeline(lang='customized-ner', cache_dir='./save_dir')
+
+print(trankit.trankit2conllu(p('I love you more than I can say. Do you know that?')))
diff --git a/trankit/tpipeline.py b/trankit/tpipeline.py
index 9da5b99..63fee30 100644
--- a/trankit/tpipeline.py
+++ b/trankit/tpipeline.py
@@ -160,7 +160,7 @@ def _set_up_config(self, training_config):
 
         # device and save dir
         self._save_dir = training_config['save_dir'] if 'save_dir' in training_config else './cache/'
-        self._save_dir = os.path.join(self._save_dir, self._lang)
+        self._save_dir = os.path.join(self._save_dir, master_config.embedding_name, self._lang)
         self._cache_dir = self._save_dir
         self._gpu = training_config['gpu'] if 'gpu' in training_config else True
         self._use_gpu = training_config['gpu'] if 'gpu' in training_config else True
@@ -211,9 +211,7 @@ def _set_up_config(self, training_config):
         # wordpiece splitter
         if self._task not in ['mwt', 'lemmatize']:
             master_config.wordpiece_splitter = XLMRobertaTokenizer.from_pretrained(master_config.embedding_name,
-                                                                                   cache_dir=os.path.join(
-                                                                                       master_config._save_dir,
-                                                                                       master_config.embedding_name))
+                                                                                   cache_dir=master_config._save_dir)
 
     def _prepare_tokenize(self):
         self.train_set = TokenizeDataset(
diff --git a/trankit/utils/base_utils.py b/trankit/utils/base_utils.py
index 0c1267d..25fec28 100644
--- a/trankit/utils/base_utils.py
+++ b/trankit/utils/base_utils.py
@@ -18,6 +18,40 @@
 SPACE_RE = re.compile(r'\s')
 
 
+def trankit2conllu(trankit_output):
+    assert type(trankit_output) == dict, "`trankit_output` must be a Python dictionary!"
+    if SENTENCES in trankit_output and len(trankit_output[SENTENCES]) > 0 and TOKENS in trankit_output[SENTENCES][0]:
+        output_type = 'document'
+    elif TOKENS in trankit_output:
+        output_type = 'sentence'
+    else:
+        print("Unknown format of `trankit_output`!")
+        return None
+    try:
+        if output_type == 'document':
+            json_doc = trankit_output[SENTENCES]
+        else:
+            assert output_type == 'sentence'
+            json_doc = [trankit_output]
+
+        conllu_doc = []
+        for sentence in json_doc:
+            conllu_sentence = []
+            for token in sentence[TOKENS]:
+                if type(token[ID]) == int or len(token[ID]) == 1:
+                    conllu_sentence.append(token)
+                else:
+                    conllu_sentence.append(token)
+                    for word in token[EXPANDED]:
+                        conllu_sentence.append(word)
+            conllu_doc.append(conllu_sentence)
+
+        return CoNLL.dict2conllstring(conllu_doc)
+    except:
+        print('Unsuccessful conversion! Please check the format of `trankit_output`')
+        return None
+
+
 def remove_with_path(path):
     if os.path.exists(path):
         if os.path.isdir(path):
@@ -62,7 +96,8 @@ def download(cache_dir, language, saved_model_version, embedding_name): # put a
     save_fpath = os.path.join(lang_dir, '{}.zip'.format(language))
 
     if not os.path.exists(os.path.join(lang_dir, '{}.downloaded'.format(language))):
-        url = "http://nlp.uoregon.edu/download/trankit/{}/{}/{}.zip".format(saved_model_version, embedding_name, language)
+        url = "http://nlp.uoregon.edu/download/trankit/{}/{}/{}.zip".format(saved_model_version, embedding_name,
+                                                                            language)
         print(url)
         response = requests.get(url, stream=True)
diff --git a/trankit/utils/conll.py b/trankit/utils/conll.py
index 84ce016..28862b9 100644
--- a/trankit/utils/conll.py
+++ b/trankit/utils/conll.py
@@ -166,3 +166,11 @@ def dict2conll(doc_dict, filename):
         conll_string = CoNLL.conll_as_string(doc_conll)
         with open(filename, 'w') as outfile:
             outfile.write(conll_string)
+
+    @staticmethod
+    def dict2conllstring(doc_dict):
+        """ Convert the dictionary format input data to the CoNLL-U format output data and return it as a string.
+        """
+        doc_conll = CoNLL.convert_dict(doc_dict)
+        conll_string = CoNLL.conll_as_string(doc_conll)
+        return conll_string
\ No newline at end of file
diff --git a/trankit/utils/tbinfo.py b/trankit/utils/tbinfo.py
index f85fef4..06ff8f0 100644
--- a/trankit/utils/tbinfo.py
+++ b/trankit/utils/tbinfo.py
@@ -1,5 +1,7 @@
 supported_embeddings = ['xlm-roberta-base', 'xlm-roberta-large']
 
+saved_model_version = 'v1.0.0'
+
 langwithner = {
     'arabic',
     'chinese',