From db6923e1798020b082d66f4d3cb0809bb2cba232 Mon Sep 17 00:00:00 2001 From: mehrad Date: Fri, 25 Feb 2022 15:02:36 -0800 Subject: [PATCH 01/19] Remove almond_dialogue_multilingual_{nlu|nlg} tasks --- genienlp/tasks/almond_task.py | 63 ----------------------------------- 1 file changed, 63 deletions(-) diff --git a/genienlp/tasks/almond_task.py b/genienlp/tasks/almond_task.py index e1de6b81..a889c2a3 100644 --- a/genienlp/tasks/almond_task.py +++ b/genienlp/tasks/almond_task.py @@ -812,66 +812,3 @@ def _make_example(self, parts, dir_name, **kwargs): preprocess=self.preprocess_field, lower=False, ) - - -@register_task('almond_dialogue_multilingual_nlu') -class AlmondDialogMultiLingualNLU(BaseAlmondMultiLingualTask): - """Multi-turn NLU task (user and agent) for MultiLingual Almond dialogues""" - - def __init__(self, name, args): - super().__init__(name, args) - self._metrics = ['em', 'sm', 'bleu'] - - def _is_program_field(self, field_name): - return field_name in ('answer', 'context') - - @property - def utterance_field(self): - return 'question' - - def _make_example(self, parts, dir_name=None, **kwargs): - if self._almond_has_multiple_programs: - example_id, context, sentence, target_code = parts - else: - example_id, context, sentence, target_code = parts[:4] - answer = target_code - question = sentence - return Example.from_raw( - self.name + '/' + dir_name + '/' + example_id, - context, - question, - answer, - preprocess=self.preprocess_field, - lower=False, - ) - - -@register_task('almond_dialogue_multilingual_nlg') -class AlmondDialogMultiLingualNLG(BaseAlmondTask): - """Multi-turn NLG task (agent) for MultiLingual Almond dialogues""" - - def __init__(self, name, args): - super().__init__(name, args) - self._metrics = ['bleu'] - - def _is_program_field(self, field_name): - return field_name == 'context' - - @property - def utterance_field(self): - return 'question' - - def _make_example(self, parts, dir_name=None, **kwargs): - # the question is irrelevant for this task - example_id, context, sentence, target_code = parts - question = 'what should the agent say ?' 
- context = context + ' ' + target_code - answer = sentence - return Example.from_raw( - self.name + '/' + dir_name + '/' + example_id, - context, - question, - answer, - preprocess=self.preprocess_field, - lower=False, - ) From bc77aaa0d99e2539de8165d2d483e9ae148e14ee Mon Sep 17 00:00:00 2001 From: mehrad Date: Fri, 25 Feb 2022 16:14:56 -0800 Subject: [PATCH 02/19] predict: remove code added for task with multiple subfolders, one per language --- genienlp/metrics.py | 2 +- genienlp/predict.py | 388 ++++++++++++++++++------------------------- genienlp/validate.py | 12 +- 3 files changed, 173 insertions(+), 229 deletions(-) diff --git a/genienlp/metrics.py b/genienlp/metrics.py index aec0463e..4dda32b2 100644 --- a/genienlp/metrics.py +++ b/genienlp/metrics.py @@ -747,7 +747,7 @@ def convert_IOB2_to_IOB1(labels): return metric_dict -def calculate_and_reduce_metrics(generation_output, metrics_to_compute, args, lang): +def calculate_and_reduce_metrics(args, generation_output, metrics_to_compute, lang): metrics = OrderedDict() example_ids = generation_output.example_ids predictions = generation_output.predictions diff --git a/genienlp/predict.py b/genienlp/predict.py index c92a8213..ae7f826e 100644 --- a/genienlp/predict.py +++ b/genienlp/predict.py @@ -110,19 +110,15 @@ def parse_argv(parser): type=str, nargs='+', dest='pred_src_languages', - help='Specify dataset source languages used during prediction for multilingual tasks' - 'multiple languages for each task should be concatenated with +', + help='Specify dataset source languages used during prediction for multilingual tasks', ) parser.add_argument( '--pred_tgt_languages', type=str, nargs='+', - help='Specify dataset target languages used during prediction for multilingual tasks' - 'multiple languages for each task should be concatenated with +', + help='Specify dataset target languages used during prediction for multilingual tasks', ) - parser.add_argument('--separate_eval', action='store_true', help='evaluate on each language eval set separately') - parser.add_argument( '--main_metric_only', action='store_true', help='If True, we only calculate the deca score metric for each task.' ) @@ -287,22 +283,18 @@ def set_default_values(args): args.confidence_feature_path = os.path.join(args.path, 'confidence_features.pkl') if args.e2e_dialogue_evaluation and args.val_batch_size[0] != 1: - logger.warning('When evaluating bitod end-to-end, val_batch_size should be 1 so we load the data turn by turn') + logger.warning('When evaluating dialogues end-to-end, val_batch_size should be 1 so we load the data turn by turn') args.val_batch_size = [1] def check_args(args): - if not args.pred_src_languages: setattr(args, 'pred_src_languages', [args.eval_src_languages]) if not args.pred_tgt_languages: setattr(args, 'pred_tgt_languages', [args.eval_tgt_languages]) if len(args.task_names) != len(args.pred_src_languages): - raise ValueError( - 'You have to define prediction languages for each task.' - ' Use None for single language tasks. Also provide languages in the same order you provided the tasks.' 
- ) + raise ValueError('You have to define prediction languages for each task in the same order you provided the tasks.') if getattr(args, 'do_ned', False) and getattr(args, 'ned_retrieve_method', None) == 'bootleg': with open(os.path.join(args.path, 'config.json')) as config_file: @@ -316,7 +308,9 @@ def check_args(args): ) -def prepare_data(args, src_lang): +def prepare_data(args): + # TODO handle multiple languages + src_lang = args.pred_src_languages[0] datasets = [] paths = [] @@ -348,40 +342,32 @@ def prepare_data(args, src_lang): } ) - task_splits, task_paths = task.get_splits(root=args.data, lower=args.lower, **kwargs) - if not isinstance(task_splits, list): - task_splits = [task_splits] - task_paths = [task_paths] - task_data_processed = [] - task_path_processed = [] - for split, path in zip(task_splits, task_paths): - assert (split.eval or split.test or split.train) and not split.aux - if split.train: - data = split.train - path = path.train - elif split.eval: - data = split.eval - path = path.eval - else: - data = split.test - path = path.test - - file_name = os.path.basename(path.rsplit('.', 1)[0]) - if ( - args.ned_retrieve_method == 'bootleg' - and os.path.exists(f'{args.bootleg_output_dir}/{file_name}_bootleg/bootleg_wiki/bootleg_labels.jsonl') - ) or (args.ned_retrieve_method != 'bootleg'): - ned_model = init_ned_model(args) - else: - ned_model = init_ned_model(args, 'bootleg-annotator') - if ned_model: - ned_model.process_examples(data.examples, path, task.utterance_field) - - task_data_processed.append(data) - task_path_processed.append(path) - logger.info(f'{task.name} has {len(data.examples)} prediction examples') - datasets.append(task_data_processed) - paths.append(task_path_processed) + split, path = task.get_splits(root=args.data, lower=args.lower, **kwargs) + assert (split.eval or split.test or split.train) and not split.aux + if split.train: + data = split.train + path = path.train + elif split.eval: + data = split.eval + path = path.eval + else: + data = split.test + path = path.test + + file_name = os.path.basename(path.rsplit('.', 1)[0]) + if ( + args.ned_retrieve_method == 'bootleg' + and os.path.exists(f'{args.bootleg_output_dir}/{file_name}_bootleg/bootleg_wiki/bootleg_labels.jsonl') + ) or (args.ned_retrieve_method != 'bootleg'): + ned_model = init_ned_model(args) + else: + ned_model = init_ned_model(args, 'bootleg-annotator') + if ned_model: + ned_model.process_examples(data.examples, path, task.utterance_field) + + logger.info(f'{task.name} has {len(data.examples)} prediction examples') + datasets.append(data) + paths.append(path) return datasets @@ -391,36 +377,56 @@ def prepare_data_iterators(args, val_sets, numericalizer, device): if len(args.val_batch_size) == 1 and len(val_sets) > 1: args.val_batch_size *= len(val_sets) iters = [] - task_index = 0 for task, bs, val_set in zip(args.tasks, args.val_batch_size, val_sets): task_iter = [] - task_languages = args.pred_src_languages[task_index] - if task_languages is not None and args.separate_eval: - task_languages = task_languages.split('+') - assert len(task_languages) == len(val_set) - for index, set_ in enumerate(val_set): - loader, original_order = make_data_loader( - set_, numericalizer, bs, device, train=False, return_original_order=True - ) - task_iter.append((task, task_languages[index], loader, original_order)) - # single language task or no separate eval - else: - loader, original_order = make_data_loader( - val_set[0], numericalizer, bs, device, train=False, return_original_order=True - ) - 
task_iter.append((task, task_languages, loader, original_order)) + loader, original_order = make_data_loader(val_set, numericalizer, bs, device, train=False, return_original_order=True) + task_iter.append((task, loader, original_order)) iters.extend(task_iter) - task_index += 1 return iters +def create_output_line(args, generation_output): + lines = [] + for i in range(len(generation_output.example_ids)): + predictions = generation_output.raw_predictions if args.translate_return_raw_outputs else generation_output.predictions + if args.one_output_per_line: + lines = [ + '\t'.join( + [generation_output.example_ids[i], prediction, generation_output.answers[i], generation_output.contexts[i]] + ) + for prediction in predictions[i] + ] # one line per generation output + else: + lines = [ + '\t'.join( + [ + generation_output.example_ids[i], + *predictions[i], + generation_output.answers[i], + generation_output.contexts[i], + ] + ) + ] # one line with all generation outputs separated by '\t' + if args.calibrator_paths is not None: + for score in generation_output.confidence_scores: + lines = [line + '\t' + str(score[i]) for line in lines] # append score to all lines + return lines + + +def get_metrics_to_compute(args, task): + metrics_to_compute = task.metrics + metrics_to_compute += args.extra_metrics + metrics_to_compute = [metric for metric in task.metrics if metric not in ['loss']] + if args.main_metric_only: + metrics_to_compute = [metrics_to_compute[0]] + return metrics_to_compute + + def run(args, device): - # TODO handle multiple languages - src_lang = args.pred_src_languages[0] - tgt_lang = args.pred_tgt_languages[0] + # TODO handle multiple languages Model = getattr(models, args.model) model, _ = Model.load( args.path, @@ -428,11 +434,11 @@ def run(args, device): args=args, device=device, tasks=args.tasks, - src_lang=src_lang, - tgt_lang=tgt_lang, + src_lang=args.pred_src_languages[0], + tgt_lang=args.pred_tgt_languages[0], ) - val_sets = prepare_data(args, src_lang) + val_sets = prepare_data(args) model.add_new_vocab_from_data(args.tasks) iters = prepare_data_iterators(args, val_sets, model.numericalizer, device) @@ -446,149 +452,81 @@ def run(args, device): eval_dir = os.path.join(args.eval_dir, args.evaluate) os.makedirs(eval_dir, exist_ok=True) - with torch.no_grad(): - for task, language, it, original_order in iters: - logger.info(task.name) - # single language task - if language is None or 'multilingual' not in task.name: - prediction_file_name = os.path.join(eval_dir, task.name + '.tsv') - raw_prediction_file_name = os.path.join(eval_dir, task.name + '.raw.tsv') - results_file_name = os.path.join(eval_dir, task.name + '.results.json') - # multi language task - else: - prediction_file_name = os.path.join(eval_dir, task.name + '_{}.tsv'.format(language)) - raw_prediction_file_name = os.path.join(eval_dir, task.name + '_{}.raw.tsv'.format(language)) - results_file_name = os.path.join(eval_dir, task.name + '_{}.results.json'.format(language)) - - for fname in [prediction_file_name, raw_prediction_file_name, results_file_name]: - if os.path.exists(fname): - if args.overwrite: - logger.warning(f'{fname} already exists -- overwriting **') - else: - raise OSError(f'{fname} already exists') - - if args.calibrator_paths is not None: - confidence_estimators = [] - for path in args.calibrator_paths: - estimator = ConfidenceEstimator.load(path) - confidence_estimators.append(estimator) - logger.info('Loading confidence estimator "%s" from %s', estimator.name, path) - else: - 
confidence_estimators = None - with torch.cuda.amp.autocast(enabled=args.mixed_precision): - generation_output = generate_with_model( - model, - it, - model.numericalizer, - task, - args, - original_order=original_order, - output_confidence_features=args.save_confidence_features, - confidence_estimators=confidence_estimators, - disable_progbar=False, - eval_dir=eval_dir, - ) + for index, (task, it, original_order) in enumerate(iters): + logger.info(task.name) + tgt_lang = args.pred_tgt_languages[index] + prediction_file_name = os.path.join(eval_dir, task.name + '.tsv') + raw_prediction_file_name = os.path.join(eval_dir, task.name + '.raw.tsv') + results_file_name = os.path.join(eval_dir, task.name + '.results.json') + + for fname in [prediction_file_name, raw_prediction_file_name, results_file_name]: + if os.path.exists(fname): + if args.overwrite: + logger.warning(f'{fname} already exists -- overwriting **') + else: + raise OSError(f'{fname} already exists') + + if args.calibrator_paths is not None: + confidence_estimators = [] + for path in args.calibrator_paths: + estimator = ConfidenceEstimator.load(path) + confidence_estimators.append(estimator) + logger.info('Loading confidence estimator "%s" from %s', estimator.name, path) + else: + confidence_estimators = None + + with torch.no_grad(), torch.cuda.amp.autocast(enabled=args.mixed_precision): + generation_output = generate_with_model( + model, + it, + model.numericalizer, + task, + args, + original_order=original_order, + output_confidence_features=args.save_confidence_features, + confidence_estimators=confidence_estimators, + disable_progbar=False, + eval_dir=eval_dir, + ) - if args.save_confidence_features: - torch.save(generation_output.confidence_features, args.confidence_feature_path) - - # write into file - # TODO change to jsonl format - with open(prediction_file_name, 'w' + ('' if args.overwrite else '+')) as prediction_file: - for i in range(len(generation_output.example_ids)): - if args.one_output_per_line: - lines = [ - ( - generation_output.example_ids[i] - + '\t' - + prediction - + '\t' - + generation_output.answers[i] - + '\t' - + generation_output.contexts[i] - ) - for prediction in generation_output.predictions[i] - ] # one line per generation output - else: - lines = [ - ( - generation_output.example_ids[i] - + '\t' - + '\t'.join(generation_output.predictions[i]) - + '\t' - + generation_output.answers[i] - + '\t' - + generation_output.contexts[i] - ) - ] # one line with all generation outputs separated by '\t' + if args.save_confidence_features: + torch.save(generation_output.confidence_features, args.confidence_feature_path) + + # write into file + # TODO change to jsonl format + with open(prediction_file_name, 'w' + ('' if args.overwrite else '+')) as prediction_file: + lines = create_output_line(args, generation_output) + prediction_file.write('\n'.join(lines) + '\n') + + if args.translate_return_raw_outputs: + with open(raw_prediction_file_name, 'w' + ('' if args.overwrite else '+')) as prediction_file: + lines = create_output_line(args, generation_output) + prediction_file.write('\n'.join(lines) + '\n') + + if len(generation_output.answers) > 0: + metrics_to_compute = get_metrics_to_compute(args, task) + metrics = calculate_and_reduce_metrics(args, generation_output, metrics_to_compute, tgt_lang) + + with open(results_file_name, 'w' + ('' if args.overwrite else '+')) as results_file: + results_file.write(json.dumps(metrics) + '\n') + + if not args.silent: + for i, (c, p, a) in enumerate( + 
zip(generation_output.contexts, generation_output.predictions, generation_output.answers) + ): + log_string = '\n'.join( + [f'Context {i + 1}: {c}', f'Prediction {i + 1} ({len(p)} outputs): {p}', f'Answer {i + 1}: {a}'] + ) if args.calibrator_paths is not None: + log_string += f'Confidence {i + 1} : ' for score in generation_output.confidence_scores: - lines = [line + '\t' + str(score[i]) for line in lines] # append score to all lines - prediction_file.write('\n'.join(lines) + '\n') - - if args.translate_return_raw_outputs: - with open(raw_prediction_file_name, 'w' + ('' if args.overwrite else '+')) as prediction_file: - for i in range(len(generation_output.example_ids)): - if args.one_output_per_line: - lines = [ - ( - generation_output.example_ids[i] - + '\t' - + raw_prediction - + '\t' - + generation_output.answers[i] - + '\t' - + generation_output.contexts[i] - ) - for raw_prediction in generation_output.raw_predictions[i] - ] # one line per generation output - else: - lines = [ - ( - generation_output.example_ids[i] - + '\t' - + '\t'.join(generation_output.raw_predictions[i]) - + '\t' - + generation_output.answers[i] - + '\t' - + generation_output.contexts[i] - ) - ] # one line with all outputs separated by '\t' - prediction_file.write('\n'.join(lines) + '\n') - - if len(generation_output.answers) > 0: - metrics_to_compute = task.metrics - metrics_to_compute += args.extra_metrics - metrics_to_compute = [metric for metric in task.metrics if metric not in ['loss']] - if args.main_metric_only: - metrics_to_compute = [metrics_to_compute[0]] - metrics = calculate_and_reduce_metrics( - generation_output, - metrics_to_compute, - args, - tgt_lang, - ) - - with open(results_file_name, 'w' + ('' if args.overwrite else '+')) as results_file: - results_file.write(json.dumps(metrics) + '\n') + log_string += f'{score[i]:.3f}, ' + log_string += '\n' + logger.info(log_string) - if not args.silent: - for i, (c, p, a) in enumerate( - zip(generation_output.contexts, generation_output.predictions, generation_output.answers) - ): - log_string = ( - f'\nContext {i + 1}: {c}\nPrediction {i + 1} ({len(p)} outputs): {p}\nAnswer {i + 1}: {a}\n' - ) - if args.calibrator_paths is not None: - log_string += f'Confidence {i + 1} : ' - for score in generation_output.confidence_scores: - log_string += f'{score[i]:.3f}, ' - log_string += '\n' - logger.info(log_string) + logger.info(metrics) - logger.info(metrics) - - task_scores[task].append((len(generation_output.answers), metrics[task.metrics[0]])) + task_scores[task].append((len(generation_output.answers), metrics[task.metrics[0]])) decaScore = [] for task in task_scores.keys(): @@ -604,6 +542,22 @@ def run(args, device): logger.info(f'\nSummary: | {sum(decaScore)} | {" | ".join([str(x) for x in decaScore])} |\n') +def update_metrics(args): + assert len(args.override_valid_metrics) == len(args.tasks) + new_metrics = [] + for task, metrics in zip(args.tasks, args.override_valid_metrics): + for m in metrics: + # remove loss from validation metrics + if m == 'loss': + continue + # backward compatibility for models validated on sacrebleu (now casedbleu) + if m == 'sacrebleu': + m = 'casedblue' + new_metrics.append(m) + + task.metrics = new_metrics + + def main(args): load_config_json(args) check_and_update_generation_args(args) @@ -612,24 +566,10 @@ def main(args): set_seed(args) args.tasks = list(get_tasks(args.task_names, args).values()) - - logger.info(f'Arguments:\n{pformat(vars(args))}') - if args.override_valid_metrics: - assert len(args.override_valid_metrics) 
== len(args.tasks) - new_metrics = [] - for task, metrics in zip(args.tasks, args.override_valid_metrics): - for m in metrics: - # remove loss from validation metrics - if m == 'loss': - continue - # backward compatibility for models validated on sacrebleu (now casedbleu) - if m == 'sacrebleu': - m = 'casedblue' - new_metrics.append(m) - - task.metrics = new_metrics + update_metrics(args) + logger.info(f'Arguments:\n{pformat(vars(args))}') logger.info(f'Loading from {args.best_checkpoint}') devices = get_devices(args.devices) diff --git a/genienlp/validate.py b/genienlp/validate.py index 934a9f7d..e9bb35e0 100644 --- a/genienlp/validate.py +++ b/genienlp/validate.py @@ -677,15 +677,19 @@ def validate(task, val_iter, model, numericalizer, args, num_print=10): # get rid of the DataParallel wrapper model = model.module - output = generate_with_model(model, val_iter, numericalizer, task, args) + generation_output = generate_with_model(model, val_iter, numericalizer, task, args) # loss is already calculated metrics_to_return = [metric for metric in task.metrics if metric != 'loss'] - metrics = calculate_and_reduce_metrics(output, metrics_to_return, args, model.tgt_lang) + metrics = calculate_and_reduce_metrics(args, generation_output, metrics_to_return, model.tgt_lang) - results = {'model prediction': output.predictions, 'gold answer': output.answers, 'context': output.contexts} + results = { + 'model prediction': generation_output.predictions, + 'gold answer': generation_output.answers, + 'context': generation_output.contexts, + } print_results(results, num_print) - return output, metrics + return generation_output, metrics From aecd306b7d804ad35652dc62b2d17d42fe10826f Mon Sep 17 00:00:00 2001 From: mehrad Date: Fri, 25 Feb 2022 16:21:55 -0800 Subject: [PATCH 03/19] metrics: remove obsolete metric implementations these were inherited from decaNLP but the implementations are old and no longer used. We can import similar metrics from HF/datasets library. --- genienlp/metrics.py | 444 +++------------------------------ genienlp/tasks/generic_task.py | 16 +- 2 files changed, 39 insertions(+), 421 deletions(-) diff --git a/genienlp/metrics.py b/genienlp/metrics.py index 4dda32b2..63636651 100644 --- a/genienlp/metrics.py +++ b/genienlp/metrics.py @@ -28,25 +28,16 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
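The commit message for this patch notes that the removed decaNLP-era ROUGE/F1 helpers can be replaced by metrics from libraries genienlp/metrics.py already imports (sacrebleu and the HF datasets metric loader). The following is a minimal standalone sketch of that replacement, separate from the patch itself: the helper names and sample strings are illustrative only, and the HF 'rouge' metric assumes the optional rouge_score package is installed.

import sacrebleu
from datasets import load_metric


def example_corpus_bleu(predictions, references):
    # sacrebleu expects a list of hypothesis strings and a list of reference streams
    return sacrebleu.corpus_bleu(predictions, [references]).score


def example_rouge(predictions, references):
    # the HF 'rouge' metric reports rouge1/rouge2/rougeL aggregate scores
    rouge = load_metric('rouge')
    return rouge.compute(predictions=predictions, references=references)


if __name__ == '__main__':
    preds = ['the cat sat on the mat']
    refs = ['the cat sat on the mat']
    print(example_corpus_bleu(preds, refs))      # ~100.0 for identical strings
    print(example_rouge(preds, refs)['rougeL'])  # aggregate rougeL score
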
import logging -import os -import re -import string from collections import Counter, OrderedDict, defaultdict -from contextlib import closing -from multiprocessing import Pool, cpu_count -from subprocess import PIPE, Popen from typing import Iterable, Union -import numpy as np import sacrebleu from datasets import load_metric from dialogues import Bitod from dialogues.bitod.src.evaluate import convert_lists_to_set -from pyrouge import Rouge155 from seqeval import metrics as seq_metrics from seqeval import scheme as seq_scheme -from .tasks.generic_dataset import Query from .util import requote_program logger = logging.getLogger(__name__) @@ -56,168 +47,6 @@ corpus_level_metrics = {'bleu', 'casedbleu', 'ter', 't5_bleu', 'nmt_bleu', 'corpus_f1', 'jga'} -def to_lf(s, table): - aggs = [y.lower() for y in Query.agg_ops] - agg_to_idx = {x: i for i, x in enumerate(aggs)} - conditionals = [y.lower() for y in Query.cond_ops] - headers_unsorted = [(y.lower(), i) for i, y in enumerate(table['header'])] - headers = [(y.lower(), i) for i, y in enumerate(table['header'])] - headers.sort(reverse=True, key=lambda x: len(x[0])) - condition_s, conds = None, [] - if 'where' in s: - s, condition_s = s.split('where', 1) - - s = ' '.join(s.split()[1:-2]) - sel, agg = None, 0 - for col, idx in headers: - if col == s: - sel = idx - if sel is None: - s = s.split() - agg = agg_to_idx[s[0]] - s = ' '.join(s[1:]) - for col, idx in headers: - if col == s: - sel = idx - - full_conditions = [] - if condition_s is not None: - - condition_s = ' ' + condition_s + ' ' - for idx, col in enumerate(headers): - condition_s = condition_s.replace(' ' + col[0] + ' ', ' Col{} '.format(col[1])) - condition_s = condition_s.strip() - - for idx, col in enumerate(conditionals): - new_s = [] - for t in condition_s.split(): - if t == col: - new_s.append('Cond{}'.format(idx)) - else: - new_s.append(t) - condition_s = ' '.join(new_s) - s = condition_s - conds = re.split('(Col\d+ Cond\d+)', s) - if len(conds) == 0: - conds = [s] - conds = [x for x in conds if len(x.strip()) > 0] - full_conditions = [] - for i, x in enumerate(conds): - if i % 2 == 0: - x = x.split() - col_num = int(x[0].replace('Col', '')) - opp_num = int(x[1].replace('Cond', '')) - full_conditions.append([col_num, opp_num]) - else: - x = x.split() - if x[-1] == 'and': - x = x[:-1] - x = ' '.join(x) - if 'Col' in x: - new_x = [] - for t in x.split(): - if 'Col' in t: - idx = int(t.replace('Col', '')) - t = headers_unsorted[idx][0] - new_x.append(t) - x = new_x - x = ' '.join(x) - if 'Cond' in x: - new_x = [] - for t in x.split(): - if 'Cond' in t: - idx = int(t.replace('Cond', '')) - t = conditionals[idx] - new_x.append(t) - x = new_x - x = ' '.join(x) - full_conditions[-1].append(x) - logical_form = {'sel': sel, 'conds': full_conditions, 'agg': agg} - return logical_form - - -def computeLFEM(greedy, answer): - answer = [x[0] for x in answer] - count = 0 - correct = 0 - text_answers = [] - for idx, (g, ex) in enumerate(zip(greedy, answer)): - count += 1 - text_answers.append([ex['answer'].lower()]) - try: - lf = to_lf(g, ex['table']) - gt = ex['sql'] - conds = gt['conds'] - lower_conds = [] - for c in conds: - lc = c - lc[2] = str(lc[2]).lower() - lower_conds.append(lc) - gt['conds'] = lower_conds - correct += lf == gt - except BaseException: - continue - return correct / count * 100, text_answers - - -def score(answer, gold): - if len(gold) > 0: - gold = set.union(*[simplify(g) for g in gold]) - answer = simplify(answer) - tp, tn, sys_pos, real_pos = 0, 0, 0, 0 - if answer == 
gold: - if not ('unanswerable' in gold and len(gold) == 1): - tp += 1 - else: - tn += 1 - if not ('unanswerable' in answer and len(answer) == 1): - sys_pos += 1 - if not ('unanswerable' in gold and len(gold) == 1): - real_pos += 1 - return np.array([tp, tn, sys_pos, real_pos]) - - -def simplify(answer): - simplified = answer.strip().lower().split() - simplified = (''.join(c for c in t if c not in string.punctuation) for t in simplified) - return set(simplified) - {'the', 'a', 'an', 'and', ''} - - -# http://nlp.cs.washington.edu/zeroshot/evaluate.py -def computeCF1(greedy, answer): - scores = np.zeros(4) - for g, a in zip(greedy, answer): - scores += score(g, a) - tp, tn, sys_pos, real_pos = scores.tolist() - if tp == 0: - p = r = f = 0.0 - else: - p = tp / float(sys_pos) - r = tp / float(real_pos) - f = 2 * p * r / (p + r) - - return f * 100, p * 100, r * 100 - - -def normalize_text(s): - """Lower text and remove punctuation, articles and extra whitespace.""" - - def remove_articles(text): - return re.sub(r'\b(a|an|the)\b', ' ', text) - - def white_space_fix(text): - return ' '.join(text.split()) - - def remove_punc(text): - exclude = set(string.punctuation) - return ''.join(ch for ch in text if ch not in exclude) - - def lower(text): - return text.lower() - - return white_space_fix(remove_articles(remove_punc(lower(s)))) - - def f1_score(prediction, ground_truth): prediction_tokens = prediction.split() ground_truth_tokens = ground_truth.split() @@ -344,184 +173,6 @@ def computeNMTBLEU(outputs, targets): return bleu_metric.compute(predictions=outputs, references=targets)['bleu'] * 100 -class Rouge(Rouge155): - """Rouge calculator class with custom command-line options.""" - - # See full list of options here: - # https://github.com/andersjo/pyrouge/blob/master/tools/ROUGE-1.5.5/README.txt#L82 - DEFAULT_OPTIONS = [ - '-a', # evaluate all systems - '-n', - 4, # max-ngram - '-x', # do not calculate ROUGE-L - '-2', - 4, # max-gap-length - '-u', # include unigram in skip-bigram - '-c', - 95, # confidence interval - '-r', - 1000, # number-of-samples (for resampling) - '-f', - 'A', # scoring formula - '-p', - 0.5, # 0 <= alpha <=1 - '-t', - 0, # count by token instead of sentence - '-d', # print per evaluation scores - ] - - def __init__(self, n_words=None, keep_files=False, options=None): - - if options is None: - self.options = self.DEFAULT_OPTIONS.copy() - else: - self.options = options - - if n_words: - options.extend(["-l", n_words]) - - stem = "-m" in self.options - - super(Rouge, self).__init__(n_words=n_words, stem=stem, keep_files=keep_files) - - def _run_rouge(self): - # Get full options - options = ['-e', self._rouge_data] + list(map(str, self.options)) + [os.path.join(self._config_dir, "settings.xml")] - - # logging.info("Running ROUGE with options {}".format(" ".join(options))) - # print([self._rouge_bin] + list(options)) - pipes = Popen([self._rouge_bin] + options, stdout=PIPE, stderr=PIPE) - std_out, std_err = pipes.communicate() - - div_by_zero_error = std_err.decode("utf-8").startswith("Illegal division by zero") - if pipes.returncode == 0 or div_by_zero_error: - # Still returns the correct output even with div by zero - return std_out - else: - raise ValueError(std_out.decode("utf-8") + "\n" + std_err.decode("utf-8")) - - -def computeROUGE(greedy, answer): - rouges = compute_rouge_scores(greedy, answer) - if len(rouges) > 0: - avg_rouges = {} - for key in rouges[0].keys(): - avg_rouges[key] = sum([r.get(key, 0.0) for r in rouges]) / len(rouges) * 100 - else: - avg_rouges = None - 
return avg_rouges - - -def split_sentences(txt, splitchar=".", include_splitchar=False): - """Split sentences of a text based on a given EOS char.""" - out = [s.split() for s in txt.strip().split(splitchar) if len(s) > 0] - return out - - -def compute_rouge_scores(summs, refs, splitchar='.', options=None, parallel=True): - assert len(summs) == len(refs) - options = [ - '-a', # evaluate all systems - '-c', - 95, # confidence interval - '-m', # use Porter stemmer - '-n', - 2, # max-ngram - '-w', - 1.3, # weight (weighting factor for WLCS) - ] - rr = Rouge(options=options) - rouge_args = [] - for summ, ref in zip(summs, refs): - letter = "A" - ref_dict = {} - for r in ref: - ref_dict[letter] = [x for x in split_sentences(r, splitchar) if len(x) > 0] - letter = chr(ord(letter) + 1) - s = [x for x in split_sentences(summ, splitchar) if len(x) > 0] - rouge_args.append((s, ref_dict)) - if parallel: - with closing(Pool(cpu_count() // 2)) as pool: - rouge_scores = pool.starmap(rr.score_summary, rouge_args) - else: - rouge_scores = [] - for s, a in rouge_args: - rouge_scores.append(rr.score_summary(s, ref_dict)) - return rouge_scores - - -def to_delta_state(line): - delta_state = {'inform': {}, 'request': {}} - try: - if line == 'None' or line.strip() == '' or line.strip() == ';': - return delta_state - inform, request = [[y.strip() for y in x.strip().split(',')] for x in line.split(';')] - inform_pairs = {} - for i in inform: - try: - k, v = i.split(':') - inform_pairs[k.strip()] = v.strip() - except BaseException: - pass - delta_state = {'inform': inform_pairs, 'request': request} - except BaseException: - pass - finally: - return delta_state - - -def update_state(state, delta): - for act, slot in delta.items(): - state[act] = slot - return state - - -def dict_cmp(d1, d2): - def cmp(a, b): - for k1, v1 in a.items(): - if k1 not in b: - return False - else: - if v1 != b[k1]: - return False - return True - - return cmp(d1, d2) and cmp(d2, d1) - - -def computeDialogue(greedy, answer): - examples = [] - for idx, (g, a) in enumerate(zip(greedy, answer)): - examples.append((a[0][0], g, a[0][1], idx)) - examples.sort() - turn_request_positives = 0 - turn_goal_positives = 0 - joint_goal_positives = 0 - ldt = None - for ex in examples: - if ldt is None or ldt.split('_')[:-1] != ex[0].split('_')[:-1]: - state, answer_state = {}, {} - ldt = ex[0] - delta_state = to_delta_state(ex[1]) - answer_delta_state = to_delta_state(ex[2]) - state = update_state(state, delta_state['inform']) - answer_state = update_state(answer_state, answer_delta_state['inform']) - if dict_cmp(state, answer_state): - joint_goal_positives += 1 - if delta_state['request'] == answer_delta_state['request']: - turn_request_positives += 1 - if dict_cmp(delta_state['inform'], answer_delta_state['inform']): - turn_goal_positives += 1 - - joint_goal_em = joint_goal_positives / len(examples) * 100 - turn_request_em = turn_request_positives / len(examples) * 100 - turn_goal_em = turn_goal_positives / len(examples) * 100 - answer = [(x[-1], x[-2]) for x in examples] - answer.sort() - answer = [[x[1]] for x in answer] - return joint_goal_em, turn_request_em, turn_goal_em, answer - - def compute_e2e_dialogue_score(greedy, answer, tgt_lang, args, example_ids): num_examples = len(answer) subtask_metrics_dict = OrderedDict() @@ -589,6 +240,32 @@ def computeJGA(greedy, answer, example_ids): return hit / len(greedy) * 100 +def convert_IOB2_to_IOB1(labels): + cur_category = None + for n, label in enumerate(labels): + if label[0] == "B" and label[2:] != 
cur_category: + labels[n] = "I" + label[1:] + cur_category = label[2:] + + +def compute_ner_f1(predictions, answers, schema='IOB2'): + predictions_processed = [pred.split() for pred in predictions] + answers_processed = [ans[0].split() for ans in answers] + f1 = 0.0 + + if schema == 'IOB1': + convert_IOB2_to_IOB1(predictions_processed) + convert_IOB2_to_IOB1(answers_processed) + f1 = ( + seq_metrics.f1_score(y_pred=predictions_processed, y_true=answers_processed, mode='strict', scheme=seq_scheme.IOB1) + * 100 + ) + elif schema == 'IOB2': + f1 = seq_metrics.f1_score(y_pred=predictions_processed, y_true=answers_processed) * 100 + + return f1 + + def compute_metrics( predictions: Iterable[str], answers: Union[Iterable[str], Iterable[Iterable[str]]], @@ -604,14 +281,7 @@ def compute_metrics( requested_metrics: contains a subset of the following metrics em (exact match) sm (structure match): valid if the output is ThingTalk code. Whether the gold answer and prediction are identical if we ignore parameter values of ThingTalk programs - bleu - rouge1, rouge2, rougeL, avg_rouge - f1: token-level F1 score, tokenizes with whitespace - nf1: normalize outputs then calculate token-level F1 score - nem: normalize outputs then calculate exact match - corpus_f1, precision, recall: corpus-level precision, recall and F1 score - lfem - joint_goal_em, turn_request_em, turn_goal_em, avg_dialogue + #TODO add all lang: the language of the predictions and answers. Used for BERTScore. args: arguments example_ids: used to calculate some of e2e dialogue metrics that need to know span of each dialogue such as JGA @@ -629,15 +299,6 @@ def compute_metrics( jga = computeJGA(predictions, answers, example_ids) metric_keys += ['jga'] metric_values += [jga] - if 'lfem' in requested_metrics: - lfem, answers = computeLFEM(predictions, answers) - metric_keys += ['lfem'] - metric_values += [lfem] - if 'joint_goal_em' in requested_metrics: - joint_goal_em, request_em, turn_goal_em, answers = computeDialogue(predictions, answers) - avg_dialogue = (joint_goal_em + request_em) / 2 - metric_keys += ['joint_goal_em', 'turn_request_em', 'turn_goal_em', 'avg_dialogue'] - metric_values += [joint_goal_em, request_em, turn_goal_em, avg_dialogue] if 'em' in requested_metrics: em = computeEM(predictions, answers) metric_keys += ['em'] @@ -674,11 +335,6 @@ def compute_metrics( nmt_bleu = computeNMTBLEU(predictions, answers) metric_keys.append('nmt_bleu') metric_values.append(nmt_bleu) - if 'avg_rouge' in requested_metrics: - rouge = computeROUGE(predictions, answers) - metric_keys += ['rouge1', 'rouge2', 'rougeL', 'avg_rouge'] - avg_rouge = (rouge['rouge_1_f_score'] + rouge['rouge_2_f_score'] + rouge['rouge_l_f_score']) / 3 - metric_values += [rouge['rouge_1_f_score'], rouge['rouge_2_f_score'], rouge['rouge_l_f_score'], avg_rouge] if 'sc_precision' in requested_metrics: precision = computeSequenceClassificationPrecision(predictions, answers) metric_keys.append('sc_precision') @@ -695,52 +351,14 @@ def compute_metrics( f1 = computeF1(predictions, answers) metric_keys.append('f1') metric_values.append(f1) - if 'ner_f1_IOB1' in requested_metrics: - predictions_processed = [pred.split() for pred in predictions] - answers_processed = [ans[0].split() for ans in answers] - - def convert_IOB2_to_IOB1(labels): - cur_category = None - for n, label in enumerate(labels): - if label[0] == "B" and label[2:] != cur_category: - labels[n] = "I" + label[1:] - cur_category = label[2:] - - convert_IOB2_to_IOB1(predictions_processed) - 
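The new compute_ner_f1 helper above delegates to seqeval for entity-level scoring. As a self-contained sketch, separate from the patch and using made-up IOB2 label sequences, the same call looks like this:

from seqeval.metrics import f1_score

# entity-level F1: whole spans must match, not individual tokens
y_true = [['B-PER', 'I-PER', 'O', 'B-LOC']]
y_pred = [['B-PER', 'I-PER', 'O', 'O']]
print(f1_score(y_true, y_pred) * 100)  # one of the two gold entities is recovered -> 66.7
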
convert_IOB2_to_IOB1(answers_processed) - f1 = ( - seq_metrics.f1_score(y_pred=predictions_processed, y_true=answers_processed, mode='strict', scheme=seq_scheme.IOB1) - * 100 - ) - + ner_f1_iob1 = compute_ner_f1(predictions, answers, schema='IOB1') metric_keys.append('ner_f1_IOB1') - metric_values.append(f1) - + metric_values.append(ner_f1_iob1) if 'ner_f1' in requested_metrics: - predictions_processed = [pred.split() for pred in predictions] - answers_processed = [ans[0].split() for ans in answers] - - f1 = seq_metrics.f1_score(y_pred=predictions_processed, y_true=answers_processed) * 100 - + ner_f1 = compute_ner_f1(predictions, answers) metric_keys.append('ner_f1') - metric_values.append(f1) - - norm_predictions = [normalize_text(g) for g in predictions] - norm_answers = [[normalize_text(a) for a in al] for al in answers] - if 'nf1' in requested_metrics: - nf1 = computeF1(norm_predictions, norm_answers) - metric_keys.append('nf1') - metric_values.append(nf1) - if 'nem' in requested_metrics: - nem = computeEM(norm_predictions, norm_answers) - metric_keys.append('nem') - metric_values.append(nem) - - if 'corpus_f1' in requested_metrics: - corpus_f1, precision, recall = computeCF1(norm_predictions, norm_answers) - metric_keys += ['corpus_f1', 'precision', 'recall'] - metric_values += [corpus_f1, precision, recall] + metric_values.append(ner_f1) metric_dict = dict(zip(metric_keys, metric_values)) metric_dict = OrderedDict((key, metric_dict[key]) for key in requested_metrics) diff --git a/genienlp/tasks/generic_task.py b/genienlp/tasks/generic_task.py index 3418d6a0..6ae4b85a 100644 --- a/genienlp/tasks/generic_task.py +++ b/genienlp/tasks/generic_task.py @@ -41,7 +41,7 @@ class Multi30K(BaseTask): @property def metrics(self): - return ['bleu', 'em', 'nem', 'nf1'] + return ['casedbleu', 'em'] def get_splits(self, root, **kwargs): src, trg = ['.' + x for x in self.name.split('.')[1:]] @@ -52,7 +52,7 @@ def get_splits(self, root, **kwargs): class IWSLT(BaseTask): @property def metrics(self): - return ['bleu', 'em', 'nem', 'nf1'] + return ['casedbleu', 'em'] def get_splits(self, root, **kwargs): src, trg = ['.' 
+ x for x in self.name.split('.')[1:]] @@ -63,7 +63,7 @@ def get_splits(self, root, **kwargs): class SQuAD(BaseTask): @property def metrics(self): - return ['nf1', 'em', 'nem'] + return ['em'] def get_splits(self, root, **kwargs): return generic_dataset.SQuAD.splits(root=root, description=self.name, **kwargs) @@ -73,7 +73,7 @@ def get_splits(self, root, **kwargs): class WikiSQL(BaseTask): @property def metrics(self): - return ['lfem', 'em', 'nem', 'nf1'] + return ['em'] def get_splits(self, root, **kwargs): return generic_dataset.WikiSQL.splits(root=root, query_as_question='query_as_question' in self.name, **kwargs) @@ -93,7 +93,7 @@ def get_splits(self, root, **kwargs): class WoZ(BaseTask): @property def metrics(self): - return ['joint_goal_em', 'turn_request_em', 'turn_goal_em', 'avg_dialogue', 'em', 'nem', 'nf1'] + return ['em'] def get_splits(self, root, **kwargs): return generic_dataset.WOZ.splits(description=self.name, root=root, **kwargs) @@ -109,7 +109,7 @@ def get_splits(self, root, **kwargs): class SRL(BaseTask): @property def metrics(self): - return ['nf1', 'em', 'nem'] + return ['em'] def get_splits(self, root, **kwargs): return generic_dataset.SRL.splits(root=root, **kwargs) @@ -130,7 +130,7 @@ def get_splits(self, root, **kwargs): class BaseSummarizationTask(BaseTask): @property def metrics(self): - return ['avg_rouge', 'rouge1', 'rouge2', 'rougeL', 'em', 'nem', 'nf1'] + return ['em'] @register_task('cnn') @@ -172,7 +172,7 @@ def get_splits(self, root, **kwargs): class ZRE(BaseTask): @property def metrics(self): - return ['corpus_f1', 'precision', 'recall', 'em', 'nem', 'nf1'] + return ['em'] def get_splits(self, root, **kwargs): return generic_dataset.ZeroShotRE.splits(root=root, **kwargs) From c05d8844b4e4a53f74bdb3454d4bd653505a5dda Mon Sep 17 00:00:00 2001 From: mehrad Date: Fri, 25 Feb 2022 17:08:30 -0800 Subject: [PATCH 04/19] Remove almond multilingual tasks --- genienlp/tasks/almond_task.py | 119 +--------------------------------- 1 file changed, 1 insertion(+), 118 deletions(-) diff --git a/genienlp/tasks/almond_task.py b/genienlp/tasks/almond_task.py index a889c2a3..1c05bef8 100644 --- a/genienlp/tasks/almond_task.py +++ b/genienlp/tasks/almond_task.py @@ -28,28 +28,17 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import logging import os -from collections import defaultdict import torch from genienlp.data_utils.almond_utils import split_text_into_sentences -from ..data_utils.almond_utils import ( - ISO_to_LANG, - detokenize_cjk_chars, - is_device, - is_entity, - is_entity_marker, - process_id, - tokenize_cjk_chars, -) +from ..data_utils.almond_utils import detokenize_cjk_chars, is_device, is_entity, is_entity_marker, tokenize_cjk_chars from ..data_utils.example import Example from ..model_utils.translation import align_and_replace, compute_attention from ..paraphrase.data_utils import input_heuristics, output_heuristics from .almond_dataset import AlmondDataset -from .base_dataset import Split from .base_task import BaseTask -from .generic_dataset import CQA, all_tokens_fn, default_batch_fn, input_then_output_len from .registry import register_task logger = logging.getLogger(__name__) @@ -706,109 +695,3 @@ def _make_example(self, parts, dir_name=None, **kwargs): def get_splits(self, root, **kwargs): return AlmondDataset.return_splits(path=os.path.join(root, 'almond/agent'), make_example=self._make_example, **kwargs) - - -class BaseAlmondMultiLingualTask(BaseAlmondTask): - """Base task for MultiLingual Almond""" - - def get_train_processed_ids(self, split): - all_ids = [] - for ex in split.examples: - all_ids.append(process_id(ex)) - return all_ids - - def combine_datasets(self, datasets, all_paths, sort_key_fn, batch_size_fn, used_fields, groups): - splits = defaultdict() - # paths = defaultdict() - - for field in used_fields: - # choose one path and replace dir name with 'combined' - # paths[field] = '/'.join([getattr(all_paths[0], field).rsplit('/', 2)[0], 'combined', getattr(all_paths[0], field).rsplit('/', 2)[2]]) - - all_examples = [] - for dataset in datasets: - all_examples.extend(getattr(dataset, field).examples) - - splits[field] = CQA(all_examples, sort_key_fn=sort_key_fn, batch_size_fn=batch_size_fn, groups=groups) - - return Split(train=splits.get('train'), eval=splits.get('eval'), test=splits.get('test'), aux=splits.get('aux')) - # Split(train=paths.get('train'), eval=paths.get('eval'), test=paths.get('test'), aux=paths.get('aux')) - - def get_splits(self, root, **kwargs): - all_datasets = [] - all_paths = [] - # number of directories to read data from - all_dirs = kwargs['all_dirs'].split('+') - - for dir in all_dirs: - splits, paths = AlmondDataset.return_splits( - path=os.path.join(root, 'almond/{}'.format(dir)), make_example=self._make_example, **kwargs - ) - all_datasets.append(splits) - all_paths.append(paths) - - used_fields = [field for field in all_datasets[0]._fields if getattr(all_datasets[0], field) is not None] - - assert len(all_datasets) >= 1 - if getattr(self.args, 'sentence_batching', False): - for field in used_fields: - lengths = list(map(lambda dataset: len(getattr(dataset, field)), all_datasets)) - assert len(set(lengths)) == 1, 'When using sentence batching your datasets should have the same size.' 
- if 'train' in used_fields: - ids_sets = list(map(lambda dataset: set(self.get_train_processed_ids(dataset.train)), all_datasets)) - id_set_base = set(ids_sets[0]) - for id_set in ids_sets: - assert set(id_set) == id_set_base, 'When using sentence batching your datasets should have matching ids' - - sort_key_fn = process_id - batch_size_fn = default_batch_fn - else: - # use default values for `sort_key_fn` and `batch_size_fn` - sort_key_fn = input_then_output_len - batch_size_fn = all_tokens_fn - - groups = len(all_datasets) if getattr(self.args, 'sentence_batching', False) else None - - if getattr(self.args, 'separate_eval', False) and (all_datasets[0].eval or all_datasets[0].test): - return all_datasets, all_paths - # TODO fix handling paths for multilingual - else: - return ( - self.combine_datasets(all_datasets, all_paths, sort_key_fn, batch_size_fn, used_fields, groups), - all_paths[0], - ) - - -@register_task('almond_multilingual') -class AlmondMultiLingual(BaseAlmondMultiLingualTask): - def __init__(self, name, args): - super().__init__(name, args) - self._metrics = ['em', 'sm', 'bleu'] - - def _is_program_field(self, field_name): - return field_name == 'answer' - - @property - def utterance_field(self): - return 'context' - - def _make_example(self, parts, dir_name, **kwargs): - if self._almond_has_multiple_programs: - example_id, sentence, target_code = parts[:3] - else: - example_id, sentence, target_code = parts - language = ISO_to_LANG.get(dir_name, 'English').lower() - if self.args.almond_lang_as_question: - question = 'translate from {} to thingtalk'.format(language) - else: - question = 'translate from english to thingtalk' - context = sentence - answer = target_code - return Example.from_raw( - self.name + '/' + dir_name + '/' + example_id, - context, - question, - answer, - preprocess=self.preprocess_field, - lower=False, - ) From c49ff4b2d317a3fa59f99fc56a579660756775fb Mon Sep 17 00:00:00 2001 From: mehrad Date: Fri, 25 Feb 2022 17:08:40 -0800 Subject: [PATCH 05/19] Remove obsolete generic tasks inherited from decanlp code --- genienlp/tasks/generic_dataset.py | 1702 ----------------------------- genienlp/tasks/generic_task.py | 142 --- 2 files changed, 1844 deletions(-) diff --git a/genienlp/tasks/generic_dataset.py b/genienlp/tasks/generic_dataset.py index de40eeda..161360e5 100644 --- a/genienlp/tasks/generic_dataset.py +++ b/genienlp/tasks/generic_dataset.py @@ -28,16 +28,9 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import csv -import glob -import hashlib -import io import json import logging import os -import re -import unicodedata -import xml.etree.ElementTree as ET from typing import Iterable import torch @@ -101,1701 +94,6 @@ def __init__(self, examples, sort_key_fn=input_then_output_len, batch_size_fn=al super().__init__(examples, **kwargs) -class IMDb(CQA): - urls = ['http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'] - name = 'imdb' - dirname = 'aclImdb' - - def __init__(self, path, subsample=None, lower=False, cached_path=None, skip_cache=False, **kwargs): - examples = [] - labels = {'neg': 'negative', 'pos': 'positive'} - question = 'Is this review negative or positive?' 
- - cache_name = os.path.join(cached_path, os.path.basename(path), str(subsample)) - if os.path.exists(cache_name) and not skip_cache: - logger.info(f'Loading cached data from {cache_name}') - examples = torch.load(cache_name) - else: - for label in ['pos', 'neg']: - for fname in glob.iglob(os.path.join(path, label, '*.txt')): - with open(fname, 'r') as f: - context = f.readline() - answer = labels[label] - examples.append( - Example.from_raw(make_example_id(self, len(examples)), context, question, answer, lower=lower) - ) - if subsample is not None and len(examples) > subsample: - break - os.makedirs(os.path.dirname(cache_name), exist_ok=True) - logger.info(f'Caching data to {cache_name}') - torch.save(examples, cache_name) - super().__init__(examples, **kwargs) - - @classmethod - def splits(cls, root='.data', train='train', validation=None, test='test', **kwargs): - assert validation is None - path = cls.download(root) - - train_data = None if train is None else cls(os.path.join(path, f'{train}'), **kwargs) - test_data = None if test is None else cls(os.path.join(path, f'{test}'), **kwargs) - - aux_data = None - do_curriculum = kwargs.get('curriculum', False) - if do_curriculum: - kwargs.pop('curriculum') - aux_data = cls(os.path.join(path, 'aux' + '.tsv'), **kwargs) - - return Split( - train=None if train is None else train_data, - eval=None, - test=None if test is None else test_data, - aux=None if do_curriculum is None else aux_data, - ) - - -class SST(CQA): - urls = [ - 'https://raw.githubusercontent.com/openai/generating-reviews-discovering-sentiment/master/data/train_binary_sent.csv', - 'https://raw.githubusercontent.com/openai/generating-reviews-discovering-sentiment/master/data/dev_binary_sent.csv', - 'https://raw.githubusercontent.com/openai/generating-reviews-discovering-sentiment/master/data/test_binary_sent.csv', - ] - name = 'sst' - dirname = '' - - def __init__(self, path, subsample=None, lower=False, cached_path=None, skip_cache=False, **kwargs): - cache_name = os.path.join(cached_path, os.path.basename(path), str(subsample)) - - examples = [] - if os.path.exists(cache_name) and not skip_cache: - logger.info(f'Loading cached data from {cache_name}') - examples = torch.load(cache_name) - else: - labels = ['negative', 'positive'] - question = 'Is this review ' + labels[0] + ' or ' + labels[1] + '?' 
- - with io.open(os.path.expanduser(path), encoding='utf8') as f: - next(f) - for line in f: - parsed = list(csv.reader([line.rstrip('\n')]))[0] - context = parsed[-1] - answer = labels[int(parsed[0])] - examples.append( - Example.from_raw(make_example_id(self, len(examples)), context, question, answer, lower=lower) - ) - - if subsample is not None and len(examples) > subsample: - break - - os.makedirs(os.path.dirname(cache_name), exist_ok=True) - logger.info(f'Caching data to {cache_name}') - torch.save(examples, cache_name) - - self.examples = examples - super().__init__(examples, **kwargs) - - @classmethod - def splits(cls, root='.data', train='train', validation='dev', test='test', **kwargs): - path = cls.download(root) - postfix = '_binary_sent.csv' - - train_data = None if train is None else cls(os.path.join(path, f'{train}{postfix}'), **kwargs) - validation_data = None if validation is None else cls(os.path.join(path, f'{validation}{postfix}'), **kwargs) - test_data = None if test is None else cls(os.path.join(path, f'{test}{postfix}'), **kwargs) - - aux_data = None - do_curriculum = kwargs.get('curriculum', False) - if do_curriculum: - kwargs.pop('curriculum') - aux_data = cls(os.path.join(path, f'aux{postfix}'), **kwargs) - - return Split( - train=None if train is None else train_data, - eval=None if validation is None else validation_data, - test=None if test is None else test_data, - aux=None if do_curriculum is None else aux_data, - ) - - -class TranslationDataset(CQA): - def __init__(self, path, exts, subsample=None, lower=False, cached_path=None, skip_cache=False, **kwargs): - """Create a TranslationDataset given paths and fields. - - Arguments: - path: Common prefix of paths to the data files for both languages. - exts: A tuple containing the extension to path for each language. - fields$: fields for handling all columns - Remaining keyword arguments: Passed to the constructor of Dataset. - """ - cache_name = os.path.join(cached_path, os.path.basename(path), str(subsample)) - - if os.path.exists(cache_name) and not skip_cache: - logger.info(f'Loading cached data from {cache_name}') - examples = torch.load(cache_name) - else: - langs = { - '.de': 'German', - '.en': 'English', - '.fr': 'French', - '.ar': 'Arabic', - '.cs': 'Czech', - '.tt': 'ThingTalk', - '.fa': 'Farsi', - } - source, target = langs[exts[0]], langs[exts[1]] - src_path, trg_path = tuple(os.path.expanduser(path + x) for x in exts) - question = f'Translate from {source} to {target}' - - examples = [] - with open(src_path) as src_file, open(trg_path) as trg_file: - for src_line, trg_line in zip(src_file, trg_file): - src_line, trg_line = src_line.strip(), trg_line.strip() - if src_line != '' and trg_line != '': - context = src_line - answer = trg_line - examples.append( - Example.from_raw(make_example_id(self, len(examples)), context, question, answer, lower=lower) - ) - if subsample is not None and len(examples) >= subsample: - break - - os.makedirs(os.path.dirname(cache_name), exist_ok=True) - logger.info(f'Caching data to {cache_name}') - torch.save(examples, cache_name) - super().__init__(examples, **kwargs) - - @classmethod - def splits(cls, exts, root='.data', train='train', validation='val', test='test', **kwargs): - """Create dataset objects for splits of a TranslationDataset. - - Arguments: - - root: Root dataset storage directory. Default is '.data'. - exts: A tuple containing the extension to path for each language. - fields: A tuple containing the fields that will be used for data - in each language. 
- train: The prefix of the train data. Default: 'train'. - validation: The prefix of the validation data. Default: 'val'. - test: The prefix of the test data. Default: 'test'. - Remaining keyword arguments: Passed to the splits method of - Dataset. - """ - path = cls.download(root) - - train_data = None if train is None else cls(os.path.join(path, train), exts, **kwargs) - val_data = None if validation is None else cls(os.path.join(path, validation), exts, **kwargs) - test_data = None if test is None else cls(os.path.join(path, test), exts, **kwargs) - - aux_data = None - do_curriculum = kwargs.get('curriculum', False) - if do_curriculum: - kwargs.pop('curriculum') - aux_data = cls(os.path.join(path, 'aux'), exts, **kwargs) - - return Split( - train=None if train is None else train_data, - eval=None if validation is None else val_data, - test=None if test is None else test_data, - aux=None if do_curriculum is None else aux_data, - ) - - -class Multi30k(TranslationDataset, CQA): - urls = [ - 'http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz', - 'http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz', - 'http://www.quest.dcs.shef.ac.uk/wmt17_files_mmt/mmt_task1_test2016.tar.gz', - ] - name = 'multi30k' - dirname = '' - - -class IWSLT(TranslationDataset, CQA): - base_url = 'https://wit3.fbk.eu/archive/2016-01//texts/{}/{}/{}.tgz' - name = 'iwslt' - base_dirname = '{}-{}' - - @classmethod - def splits(cls, exts, root='.data', train='train', validation='IWSLT16.TED.tst2013', test='IWSLT16.TED.tst2014', **kwargs): - """Create dataset objects for splits of the IWSLT dataset. - - Arguments: - - root: Root dataset storage directory. Default is '.data'. - exts: A tuple containing the extension to path for each language. - fields: A tuple containing the fields that will be used for data - in each language. - train: The prefix of the train data. Default: 'train'. - validation: The prefix of the validation data. Default: 'val'. - test: The prefix of the test data. Default: 'test'. - Remaining keyword arguments: Passed to the splits method of - Dataset. 
- """ - cls.dirname = cls.base_dirname.format(exts[0][1:], exts[1][1:]) - cls.urls = [cls.base_url.format(exts[0][1:], exts[1][1:], cls.dirname)] - check = os.path.join(root, cls.name, cls.dirname) - path = cls.download(root, check=check) - - if train is not None: - train = '.'.join([train, cls.dirname]) - if validation is not None: - validation = '.'.join([validation, cls.dirname]) - if test is not None: - test = '.'.join([test, cls.dirname]) - - if not os.path.exists(os.path.join(path, '.'.join(['train', cls.dirname])) + exts[0]): - cls.clean(path) - - train_data = None if train is None else cls(os.path.join(path, train), exts, **kwargs) - val_data = None if validation is None else cls(os.path.join(path, validation), exts, **kwargs) - test_data = None if test is None else cls(os.path.join(path, test), exts, **kwargs) - - aux_data = None - do_curriculum = kwargs.get('curriculum', False) - if do_curriculum: - kwargs.pop('curriculum') - aux = '.'.join(['aux', cls.dirname]) - aux_data = cls(os.path.join(path, aux), exts, **kwargs) - - return Split( - train=None if train is None else train_data, - eval=None if validation is None else val_data, - test=None if test is None else test_data, - aux=None if do_curriculum is None else aux_data, - ) - - @staticmethod - def clean(path): - for f_xml in glob.iglob(os.path.join(path, '*.xml')): - print(f_xml) - f_txt = os.path.splitext(f_xml)[0] - with io.open(f_txt, mode='w', encoding='utf-8') as fd_txt: - root = ET.parse(f_xml).getroot()[0] - for doc in root.findall('doc'): - for e in doc.findall('seg'): - fd_txt.write(e.text.strip() + '\n') - - xml_tags = [' 0] - if len(tagged_context[answer_start:answer_end]) != len(tokenized_answer): - import pdb - - pdb.set_trace() - context_spans = list(range(answer_start, answer_end)) - indexed_answer = tagged_context[context_spans[0] : context_spans[-1] + 1] - if len(indexed_answer) != len(tokenized_answer): - import pdb - - pdb.set_trace() - context_spans += [len(tagged_context)] - for context_idx, answer_word in zip(context_spans, ex.answer): - if context_idx == len(tagged_context): - continue - if tagged_context[context_idx] != answer_word: - import pdb - - pdb.set_trace() - - ex = Example.from_raw( - make_example_id(self, qa['id']), - ' '.join(tagged_context), - question, - ' '.join(tokenized_answer), - lower=lower, - ) - - examples.append(ex) - if subsample is not None and len(examples) > subsample: - break - if subsample is not None and len(examples) > subsample: - break - if subsample is not None and len(examples) > subsample: - break - - os.makedirs(os.path.dirname(cache_name), exist_ok=True) - logger.info(f'Caching data to {cache_name}') - torch.save((examples, all_answers, q_ids), cache_name) - - super(SQuAD, self).__init__(examples, **kwargs) - self.all_answers = all_answers - self.q_ids = q_ids - - @classmethod - def splits(cls, root='.data', description='squad1.1', train='train', validation='dev', test=None, **kwargs): - """Create dataset objects for splits of the SQuAD dataset. - Arguments: - root: directory containing SQuAD data - field: field for handling all columns - train: The prefix of the train data. Default: 'train'. - validation: The prefix of the validation data. Default: 'val'. - Remaining keyword arguments: Passed to the splits method of - Dataset. 
- """ - assert test is None - path = cls.download(root) - - extension = 'v2.0.json' if '2.0' in description else 'v1.1.json' - - train = '-'.join([train, extension]) if train is not None else None - validation = '-'.join([validation, extension]) if validation is not None else None - - train_data = None if train is None else cls(os.path.join(path, train), **kwargs) - validation_data = None if validation is None else cls(os.path.join(path, validation), **kwargs) - - aux_data = None - do_curriculum = kwargs.get('curriculum', False) - if do_curriculum: - kwargs.pop('curriculum') - aux = '-'.join(['aux', extension]) - aux_data = cls(os.path.join(path, aux), **kwargs) - - return Split( - train=None if train is None else train_data, - eval=None if validation is None else validation_data, - test=None, - aux=None if do_curriculum is None else aux_data, - ) - - -# https://github.com/abisee/cnn-dailymail/blob/8eace60f306dcbab30d1f1d715e379f07a3782db/make_datafiles.py -dm_single_close_quote = u'\u2019' -dm_double_close_quote = u'\u201d' -# acceptable ways to end a sentence -END_TOKENS = ['.', '!', '?', '...', "'", "`", '"', dm_single_close_quote, dm_double_close_quote, ")"] - - -def fix_missing_period(line): - """Adds a period to a line that is missing a period""" - if "@highlight" in line: - return line - if line == "": - return line - if line[-1] in END_TOKENS: - return line - return line + "." - - -class Summarization(CQA): - def __init__(self, path, one_answer=True, subsample=None, lower=False, cached_path=None, skip_cache=False, **kwargs): - cache_name = os.path.join(cached_path, os.path.basename(path), str(subsample)) - - examples = [] - if os.path.exists(cache_name) and not skip_cache: - logger.info(f'Loading cached data from {cache_name}') - examples = torch.load(cache_name) - else: - with open(os.path.expanduser(path)) as f: - lines = f.readlines() - for line in lines: - ex = json.loads(line) - context, question, answer = ex['context'], ex['question'], ex['answer'] - examples.append( - Example.from_raw(make_example_id(self, len(examples)), context, question, answer, lower=lower) - ) - if subsample is not None and len(examples) >= subsample: - break - os.makedirs(os.path.dirname(cache_name), exist_ok=True) - logger.info(f'Caching data to {cache_name}') - torch.save(examples, cache_name) - - super(Summarization, self).__init__(examples, **kwargs) - - @classmethod - def cache_splits(cls, path): - - splits = ['training', 'validation', 'test'] - for split in splits: - missing_stories, collected_stories = 0, 0 - split_file_name = os.path.join(path, f'{split}.jsonl') - if os.path.exists(split_file_name): - continue - with open(split_file_name, 'w') as split_file: - url_file_name = os.path.join(path, f'{cls.name}_wayback_{split}_urls.txt') - with open(url_file_name) as url_file: - for url in url_file: - story_file_name = os.path.join( - path, 'stories', f"{hashlib.sha1(url.strip().encode('utf-8')).hexdigest()}.story" - ) - try: - story_file = open(story_file_name) - except EnvironmentError as e: - missing_stories += 1 - logger.warning(e) - if os.path.exists(split_file_name): - os.remove(split_file_name) - else: - with story_file: - article, highlight = [], [] - is_highlight = False - for line in story_file: - line = line.strip() - if line == "": - continue - line = fix_missing_period(line) - if line.startswith("@highlight"): - is_highlight = True - elif "@highlight" in line: - raise ValueError() - elif is_highlight: - highlight.append(line) - else: - article.append(line) - example = { - 'context': 
unicodedata.normalize('NFKC', ' '.join(article)), - 'answer': unicodedata.normalize('NFKC', ' '.join(highlight)), - 'question': 'What is the summary?', - } - split_file.write(json.dumps(example) + '\n') - collected_stories += 1 - if collected_stories % 1000 == 0: - logger.debug(example) - logger.warning(f'Missing {missing_stories} stories') - logger.info(f'Collected {collected_stories} stories') - - @classmethod - def splits(cls, root='.data', train='training', validation='validation', test='test', **kwargs): - path = cls.download(root) - cls.cache_splits(path) - - train_data = None if train is None else cls(os.path.join(path, 'training.jsonl'), **kwargs) - validation_data = ( - None if validation is None else cls(os.path.join(path, 'validation.jsonl'), one_answer=False, **kwargs) - ) - test_data = None if test is None else cls(os.path.join(path, 'test.jsonl'), one_answer=False, **kwargs) - - aux_data = None - do_curriculum = kwargs.get('curriculum', False) - if do_curriculum: - kwargs.pop('curriculum') - aux_data = cls(os.path.join(path, 'auxiliary.jsonl'), **kwargs) - - return Split( - train=None if train is None else train_data, - eval=None if validation is None else validation_data, - test=None if test is None else test_data, - aux=None if do_curriculum is None else aux_data, - ) - - -class DailyMail(Summarization): - name = 'dailymail' - dirname = 'dailymail' - urls = [ - ('https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfM1BxdkxVaTY2bWs', 'dailymail_stories.tgz'), - ( - 'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/dailymail_wayback_training_urls.txt', - 'dailymail/dailymail_wayback_training_urls.txt', - ), - ( - 'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/dailymail_wayback_validation_urls.txt', - 'dailymail/dailymail_wayback_validation_urls.txt', - ), - ( - 'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/dailymail_wayback_test_urls.txt', - 'dailymail/dailymail_wayback_test_urls.txt', - ), - ] - - -class CNN(Summarization): - name = 'cnn' - dirname = 'cnn' - urls = [ - ('https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfTHk4NFg2SndKcjQ', 'cnn_stories.tgz'), - ( - 'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/cnn_wayback_training_urls.txt', - 'cnn/cnn_wayback_training_urls.txt', - ), - ( - 'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/cnn_wayback_validation_urls.txt', - 'cnn/cnn_wayback_validation_urls.txt', - ), - ( - 'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/cnn_wayback_test_urls.txt', - 'cnn/cnn_wayback_test_urls.txt', - ), - ] - - -class Query: - # https://github.com/salesforce/WikiSQL/blob/c2ed4f9b22db1cc2721805d53e6e76e07e2ccbdc/lib/query.py#L10 - - agg_ops = ['', 'MAX', 'MIN', 'COUNT', 'SUM', 'AVG'] - cond_ops = ['=', '>', '<', 'OP'] - syms = [ - 'SELECT', - 'WHERE', - 'AND', - 'COL', - 'TABLE', - 'CAPTION', - 'PAGE', - 'SECTION', - 'OP', - 'COND', - 'QUESTION', - 'AGG', - 'AGGOPS', - 'CONDOPS', - ] - - def __init__(self, sel_index, agg_index, columns, conditions=tuple()): - self.sel_index = sel_index - self.agg_index = agg_index - self.columns = columns - self.conditions = list(conditions) - - def __repr__(self): - rep = 'SELECT {agg} {sel} FROM table'.format( - agg=self.agg_ops[self.agg_index], - sel=self.columns[self.sel_index] if self.columns is not None else 'col{}'.format(self.sel_index), - ) - if self.conditions: - rep += ' WHERE ' + ' AND '.join( - ['{} {} {}'.format(self.columns[i], 
self.cond_ops[o], v) for i, o, v in self.conditions] - ) - return ' '.join(rep.split()) - - @classmethod - def from_dict(cls, d, t): - return cls(sel_index=d['sel'], agg_index=d['agg'], columns=t, conditions=d['conds']) - - -class WikiSQL(CQA): - urls = ['https://github.com/salesforce/WikiSQL/raw/master/data.tar.bz2'] - name = 'wikisql' - dirname = 'data' - - def __init__( - self, path, query_as_question=False, subsample=None, lower=False, cached_path=None, skip_cache=False, **kwargs - ): - cache_name = os.path.join( - cached_path, - 'query_as_question' if query_as_question else 'query_as_context', - os.path.basename(path), - str(subsample), - ) - if os.path.exists(cache_name) and not skip_cache: - logger.info(f'Loading cached data from {cache_name}') - examples, all_answers = torch.load(cache_name) - else: - - expanded_path = os.path.expanduser(path) - table_path = os.path.splitext(expanded_path) - table_path = table_path[0] + '.tables' + table_path[1] - - with open(table_path) as tables_file: - tables = [json.loads(line) for line in tables_file] - id_to_tables = {x['id']: x for x in tables} - - all_answers = [] - examples = [] - with open(expanded_path) as example_file: - for idx, line in enumerate(example_file): - entry = json.loads(line) - human_query = entry['question'] - table = id_to_tables[entry['table_id']] - sql = entry['sql'] - header = table['header'] - answer = repr(Query.from_dict(sql, header)) - context = ( - f'The table has columns {", ".join(table["header"])} ' - + f'and key words {", ".join(Query.agg_ops[1:] + Query.cond_ops + Query.syms)}' - ) - if query_as_question: - question = human_query - else: - question = 'What is the translation from English to SQL?' - context += f'-- {human_query}' - examples.append(Example.from_raw(make_example_id(self, idx), context, question, answer, lower=lower)) - all_answers.append({'sql': sql, 'header': header, 'answer': answer, 'table': table}) - if subsample is not None and len(examples) > subsample: - break - - os.makedirs(os.path.dirname(cache_name), exist_ok=True) - logger.info(f'Caching data to {cache_name}') - torch.save((examples, all_answers), cache_name) - - super(WikiSQL, self).__init__(examples, **kwargs) - self.all_answers = all_answers - - @classmethod - def splits(cls, root='.data', train='train.jsonl', validation='dev.jsonl', test='test.jsonl', **kwargs): - """Create dataset objects for splits of the SQuAD dataset. - Arguments: - root: directory containing SQuAD data - field: field for handling all columns - train: The prefix of the train data. Default: 'train'. - validation: The prefix of the validation data. Default: 'val'. - Remaining keyword arguments: Passed to the splits method of - Dataset. 
- """ - path = cls.download(root) - - train_data = None if train is None else cls(os.path.join(path, train), **kwargs) - validation_data = None if validation is None else cls(os.path.join(path, validation), **kwargs) - test_data = None if test is None else cls(os.path.join(path, test), **kwargs) - - aux_data = None - do_curriculum = kwargs.get('curriculum', False) - if do_curriculum: - kwargs.pop('curriculum') - aux_data = cls(os.path.join(path, 'aux'), **kwargs) - - return Split( - train=None if train is None else train_data, - eval=None if validation is None else validation_data, - test=None if test is None else test_data, - aux=None if do_curriculum is None else aux_data, - ) - - -class SRL(CQA): - urls = [ - 'https://dada.cs.washington.edu/qasrl/data/wiki1.train.qa', - 'https://dada.cs.washington.edu/qasrl/data/wiki1.dev.qa', - 'https://dada.cs.washington.edu/qasrl/data/wiki1.test.qa', - ] - - name = 'srl' - dirname = '' - - @classmethod - def clean(cls, s): - closing_punctuation = { - ' .', - ' ,', - ' ;', - ' !', - ' ?', - ' :', - ' )', - " 'll", - " n't ", - " %", - " 't", - " 's", - " 'm", - " 'd", - " 're", - } - opening_punctuation = {'( ', '$ '} - both_sides = {' - '} - s = ' '.join(s.split()).strip() - s = s.replace('-LRB-', '(') - s = s.replace('-RRB-', ')') - s = s.replace('-LAB-', '<') - s = s.replace('-RAB-', '>') - s = s.replace('-AMP-', '&') - s = s.replace('%pw', ' ') - - for p in closing_punctuation: - s = s.replace(p, p.lstrip()) - for p in opening_punctuation: - s = s.replace(p, p.rstrip()) - for p in both_sides: - s = s.replace(p, p.strip()) - s = s.replace('``', '') - s = s.replace('`', '') - s = s.replace("''", '') - s = s.replace('“', '') - s = s.replace('”', '') - s = s.replace(" '", '') - return ' '.join(s.split()).strip() - - def __init__(self, path, one_answer=True, subsample=None, lower=False, cached_path=None, skip_cache=False, **kwargs): - cache_name = os.path.join(cached_path, os.path.basename(path), str(subsample)) - - examples, all_answers = [], [] - if os.path.exists(cache_name) and not skip_cache: - logger.info(f'Loading cached data from {cache_name}') - examples, all_answers = torch.load(cache_name) - else: - with open(os.path.expanduser(path)) as f: - for line in f: - ex = json.loads(line) - aa = ex['all_answers'] - context, question, answer = ex['context'], ex['question'], ex['answer'] - examples.append( - Example.from_raw(make_example_id(self, len(all_answers)), context, question, answer, lower=lower) - ) - all_answers.append(aa) - if subsample is not None and len(examples) >= subsample: - break - os.makedirs(os.path.dirname(cache_name), exist_ok=True) - logger.info(f'Caching data to {cache_name}') - torch.save((examples, all_answers), cache_name) - - super(SRL, self).__init__(examples, **kwargs) - self.all_answers = all_answers - - @classmethod - def cache_splits(cls, path, train='train', validation='dev', test='test'): - - splits = [train, validation, test] - for split in splits: - split_file_name = os.path.join(path, f'{split}.jsonl') - if os.path.exists(split_file_name): - continue - wiki_file = os.path.join(path, f'wiki1.{split}.qa') - - with open(split_file_name, 'w') as split_file: - with open(os.path.expanduser(wiki_file)) as f: - - def is_int(x): - try: - int(x) - return True - except ValueError: - return False - - lines = [] - for line in f.readlines(): - line = ' '.join(line.split()).strip() - if len(line) == 0: - lines.append(line) - continue - if 'WIKI1' not in line.split('_')[0]: - if not is_int(line.split()[0]) or len(line.split()) > 
3: - lines.append(line) - - new_example = True - for line in lines: - line = line.strip() - if new_example: - context = cls.clean(line) - new_example = False - continue - if len(line) == 0: - new_example = True - continue - question, answers = line.split('?') - question = cls.clean(line.split('?')[0].replace(' _', '') + '?') - answer = cls.clean(answers.split('###')[0]) - all_answers = [cls.clean(x) for x in answers.split('###')] - if answer not in context: - low_answer = answer[0].lower() + answer[1:] - up_answer = answer[0].upper() + answer[1:] - if low_answer in context or up_answer in context: - answer = low_answer if low_answer in context else up_answer - else: - if 'Darcy Burner' in answer: - answer = 'Darcy Burner and other 2008 Democratic congressional candidates, in cooperation with some retired national security officials' - elif 'E Street Band' in answer: - answer = 'plan to work with the E Street Band again in the future' - elif 'an electric sender' in answer: - answer = 'an electronic sender' - elif 'the US army' in answer: - answer = 'the US Army' - elif 'Rather than name the' in answer: - answer = 'rather die than name the cause of his disease to his father' - elif answer.lower() in context: - answer = answer.lower() - else: - import pdb - - pdb.set_trace() - assert answer in context - modified_all_answers = [] - for a in all_answers: - if a not in context: - low_answer = a[0].lower() + a[1:] - up_answer = a[0].upper() + a[1:] - if low_answer in context or up_answer in context: - a = low_answer if low_answer in context else up_answer - else: - if 'Darcy Burner' in a: - a = 'Darcy Burner and other 2008 Democratic congressional candidates, in cooperation with some retired national security officials' - elif 'E Street Band' in a: - a = 'plan to work with the E Street Band again in the future' - elif 'an electric sender' in a: - a = 'an electronic sender' - elif 'the US army' in a: - a = 'the US Army' - elif 'Rather than name the' in a: - a = 'rather die than name the cause of his disease to his father' - elif a.lower() in context: - a = a.lower() - else: - import pdb - - pdb.set_trace() - assert a in context - modified_all_answers.append(a) - split_file.write( - json.dumps( - { - 'context': context, - 'question': question, - 'answer': answer, - 'type': 'wiki', - 'all_answers': modified_all_answers, - } - ) - + '\n' - ) - - @classmethod - def splits(cls, root='.data', train='train', validation='dev', test='test', **kwargs): - path = cls.download(root) - cls.cache_splits(path) - - train_data = None if train is None else cls(os.path.join(path, f'{train}.jsonl'), **kwargs) - validation_data = ( - None if validation is None else cls(os.path.join(path, f'{validation}.jsonl'), one_answer=False, **kwargs) - ) - test_data = None if test is None else cls(os.path.join(path, f'{test}.jsonl'), one_answer=False, **kwargs) - aux_data = None - do_curriculum = kwargs.get('curriculum', False) - if do_curriculum: - kwargs.pop('curriculum') - aux_data = cls(os.path.join(path, 'aux.jsonl'), **kwargs) - - return Split( - train=None if train is None else train_data, - eval=None if validation is None else validation_data, - test=None if test is None else test_data, - aux=None if do_curriculum is None else aux_data, - ) - - -class WinogradSchema(CQA): - urls = ['https://s3.amazonaws.com/research.metamind.io/decaNLP/data/schema.txt'] - - name = 'schema' - dirname = '' - - def __init__(self, path, subsample=None, lower=False, cached_path=None, skip_cache=False, **kwargs): - cache_name = 
os.path.join(cached_path, os.path.basename(path), str(subsample)) - if os.path.exists(cache_name) and not skip_cache: - logger.info(f'Loading cached data from {cache_name}') - examples = torch.load(cache_name) - else: - examples = [] - with open(os.path.expanduser(path)) as f: - for line in f: - ex = json.loads(line) - context, question, answer = ex['context'], ex['question'], ex['answer'] - examples.append( - Example.from_raw(make_example_id(self, len(examples)), context, question, answer, lower=lower) - ) - if subsample is not None and len(examples) >= subsample: - break - os.makedirs(os.path.dirname(cache_name), exist_ok=True) - logger.info(f'Caching data to {cache_name}') - torch.save(examples, cache_name) - - super(WinogradSchema, self).__init__(examples, **kwargs) - - @classmethod - def cache_splits(cls, path): - pattern = '\[.*\]' - train_jsonl = os.path.expanduser(os.path.join(path, 'train.jsonl')) - if os.path.exists(train_jsonl): - return - - def get_both_schema(context): - variations = [x[1:-1].split('/') for x in re.findall(pattern, context)] - splits = re.split(pattern, context) - results = [] - for which_schema in range(2): - vs = [v[which_schema] for v in variations] - context = '' - for idx in range(len(splits)): - context += splits[idx] - if idx < len(vs): - context += vs[idx] - results.append(context) - return results - - schemas = [] - with open(os.path.expanduser(os.path.join(path, 'schema.txt'))) as schema_file: - schema = [] - for line in schema_file: - if len(line.split()) == 0: - schemas.append(schema) - schema = [] - continue - else: - schema.append(line.strip()) - - examples = [] - for schema in schemas: - context, question, answer = schema - contexts = get_both_schema(context) - questions = get_both_schema(question) - answers = answer.split('/') - for idx in range(2): - answer = answers[idx] - question = questions[idx] + f' {answers[0]} or {answers[1]}?' 
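# Aside: a minimal, standalone sketch of the bracket-expansion idea used in
# get_both_schema above. It assumes each "[A/B]" group in a Winograd schema line
# carries exactly two alternatives; the names here are illustrative only, not the
# genienlp API, and the pattern is a non-greedy variant of the one used in the hunk.
import re

SCHEMA_PATTERN = r'\[.*?\]'

def expand_schema(text):
    # collect the two alternatives of every bracketed group, then re-assemble
    # the surrounding text once per alternative
    variations = [group[1:-1].split('/') for group in re.findall(SCHEMA_PATTERN, text)]
    pieces = re.split(SCHEMA_PATTERN, text)
    results = []
    for which in range(2):
        chosen = [v[which] for v in variations]
        out = ''
        for idx, piece in enumerate(pieces):
            out += piece
            if idx < len(chosen):
                out += chosen[idx]
        results.append(out)
    return results

# expand_schema('The trophy does not fit into the [suitcase/bag] because it is too small.')
# -> ['The trophy does not fit into the suitcase because it is too small.',
#     'The trophy does not fit into the bag because it is too small.']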
- examples.append({'context': contexts[idx], 'question': question, 'answer': answer}) - - traindev = examples[:-100] - test = examples[-100:] - train = traindev[:80] - dev = traindev[80:] - - splits = ['train', 'validation', 'test'] - for split, examples in zip(splits, [train, dev, test]): - with open(os.path.expanduser(os.path.join(path, f'{split}.jsonl')), 'a') as split_file: - for ex in examples: - split_file.write(json.dumps(ex) + '\n') - - @classmethod - def splits(cls, root='.data', train='train', validation='validation', test='test', **kwargs): - path = cls.download(root) - cls.cache_splits(path) - - train_data = None if train is None else cls(os.path.join(path, f'{train}.jsonl'), **kwargs) - validation_data = None if validation is None else cls(os.path.join(path, f'{validation}.jsonl'), **kwargs) - test_data = None if test is None else cls(os.path.join(path, f'{test}.jsonl'), **kwargs) - - aux_data = None - do_curriculum = kwargs.get('curriculum', False) - if do_curriculum: - kwargs.pop('curriculum') - aux_data = cls(os.path.join(path, 'aux.jsonl'), **kwargs) - - return Split( - train=None if train is None else train_data, - eval=None if validation is None else validation_data, - test=None if test is None else test_data, - aux=None if do_curriculum is None else aux_data, - ) - - -class WOZ(CQA): - urls = [ - 'https://raw.githubusercontent.com/nmrksic/neural-belief-tracker/master/data/woz/woz_train_en.json', - 'https://raw.githubusercontent.com/nmrksic/neural-belief-tracker/master/data/woz/woz_test_de.json', - 'https://raw.githubusercontent.com/nmrksic/neural-belief-tracker/master/data/woz/woz_test_en.json', - 'https://raw.githubusercontent.com/nmrksic/neural-belief-tracker/master/data/woz/woz_train_de.json', - 'https://raw.githubusercontent.com/nmrksic/neural-belief-tracker/master/data/woz/woz_train_en.json', - 'https://raw.githubusercontent.com/nmrksic/neural-belief-tracker/master/data/woz/woz_validate_de.json', - 'https://raw.githubusercontent.com/nmrksic/neural-belief-tracker/master/data/woz/woz_validate_en.json', - ] - - name = 'woz' - dirname = '' - - def __init__(self, path, subsample=None, lower=False, description='woz.en', cached_path=None, skip_cache=False, **kwargs): - examples, all_answers = [], [] - cache_name = os.path.join(cached_path, os.path.basename(path), str(subsample), description) - if os.path.exists(cache_name) and not skip_cache: - logger.info(f'Loading cached data from {cache_name}') - examples, all_answers = torch.load(cache_name) - else: - with open(os.path.expanduser(path)) as f: - for woz_id, line in enumerate(f): - ex = example_dict = json.loads(line) - if example_dict['lang'] in description: - context, question, answer = ex['context'], ex['question'], ex['answer'] - all_answers.append((ex['lang_dialogue_turn'], answer)) - examples.append( - Example.from_raw(make_example_id(self, woz_id), context, question, answer, lower=lower) - ) - - if subsample is not None and len(examples) >= subsample: - break - os.makedirs(os.path.dirname(cache_name), exist_ok=True) - logger.info(f'Caching data to {cache_name}') - torch.save((examples, all_answers), cache_name) - - super(WOZ, self).__init__(examples, **kwargs) - self.all_answers = all_answers - - @classmethod - def cache_splits(cls, path, train='train', validation='validate', test='test'): - train_jsonl = os.path.expanduser(os.path.join(path, 'train.jsonl')) - if os.path.exists(train_jsonl): - return - - file_name_base = 'woz_{}_{}.json' - for split in [train, validation, test]: - with 
open(os.path.expanduser(os.path.join(path, f'{split}.jsonl')), 'a') as split_file: - for lang in ['en', 'de']: - file_path = file_name_base.format(split, lang) - with open(os.path.expanduser(os.path.join(path, file_path))) as src_file: - dialogues = json.loads(src_file.read()) - for di, d in enumerate(dialogues): - previous_state = {'inform': [], 'request': []} - turns = d['dialogue'] - for ti, t in enumerate(turns): - question = 'What is the change in state?' - actions = [] - for act in t['system_acts']: - if isinstance(act, list): - act = ': '.join(act) - actions.append(act) - actions = ', '.join(actions) - if len(actions) > 0: - actions += ' -- ' - context = actions + t['transcript'] - belief_state = t['belief_state'] - delta_state = {'inform': [], 'request': []} - current_state = {'inform': [], 'request': []} - for item in belief_state: - if 'slots' in item: - slots = item['slots'] - for slot in slots: - act = item['act'] - if act == 'inform': - current_state['inform'].append(slot) - if slot not in previous_state['inform']: - delta_state['inform'].append(slot) - else: - prev_slot = previous_state['inform'][previous_state['inform'].index(slot)] - if prev_slot[1] != slot[1]: - delta_state['inform'].append(slot) - else: - delta_state['request'].append(slot[1]) - current_state['request'].append(slot[1]) - previous_state = current_state - answer = '' - if len(delta_state['inform']) > 0: - answer = ', '.join([f'{x[0]}: {x[1]}' for x in delta_state['inform']]) - answer += ';' - if len(delta_state['request']) > 0: - answer += ' ' - answer += ', '.join(delta_state['request']) - ex = { - 'context': ' '.join(context.split()), - 'question': ' '.join(question.split()), - 'lang': lang, - 'answer': answer if len(answer) > 1 else 'None', - 'lang_dialogue_turn': f'{lang}_{di}_{ti}', - } - split_file.write(json.dumps(ex) + '\n') - - @classmethod - def splits(cls, root='.data', train='train', validation='validate', test='test', **kwargs): - path = cls.download(root) - cls.cache_splits(path) - - train_data = None if train is None else cls(os.path.join(path, f'{train}.jsonl'), **kwargs) - validation_data = None if validation is None else cls(os.path.join(path, f'{validation}.jsonl'), **kwargs) - test_data = None if test is None else cls(os.path.join(path, f'{test}.jsonl'), **kwargs) - - aux_data = None - do_curriculum = kwargs.get('curriculum', False) - if do_curriculum: - kwargs.pop('curriculum') - aux_data = cls(os.path.join(path, 'aux.jsonl'), **kwargs) - - return Split( - train=None if train is None else train_data, - eval=None if validation is None else validation_data, - test=None if test is None else test_data, - aux=None if do_curriculum is None else aux_data, - ) - - -class MultiNLI(CQA): - urls = ['http://www.nyu.edu/projects/bowman/multinli/multinli_1.0.zip'] - - name = 'multinli' - dirname = 'multinli_1.0' - - def __init__( - self, path, subsample=None, lower=False, description='multinli.in.out', cached_path=None, skip_cache=False, **kwargs - ): - cache_name = os.path.join(cached_path, os.path.basename(path), str(subsample), description) - if os.path.exists(cache_name) and not skip_cache: - logger.info(f'Loading cached data from {cache_name}') - examples = torch.load(cache_name) - else: - examples = [] - with open(os.path.expanduser(path)) as f: - for line in f: - ex = example_dict = json.loads(line) - if example_dict['subtask'] in description: - context, question, answer = ex['context'], ex['question'], ex['answer'] - examples.append( - Example.from_raw(make_example_id(self, len(examples)), 
context, question, answer, lower=lower) - ) - if subsample is not None and len(examples) >= subsample: - break - os.makedirs(os.path.dirname(cache_name), exist_ok=True) - logger.info(f'Caching data to {cache_name}') - torch.save(examples, cache_name) - - super(MultiNLI, self).__init__(examples, **kwargs) - - @classmethod - def cache_splits(cls, path, train='multinli_1.0_train', validation='mulinli_1.0_dev_{}', test='test'): - train_jsonl = os.path.expanduser(os.path.join(path, 'train.jsonl')) - if os.path.exists(train_jsonl): - return - - with open(os.path.expanduser(os.path.join(path, 'train.jsonl')), 'a') as split_file: - with open(os.path.expanduser(os.path.join(path, 'multinli_1.0_train.jsonl'))) as src_file: - for line in src_file: - ex = json.loads(line) - ex = { - 'context': f'Premise: "{ex["sentence1"]}"', - 'question': f'Hypothesis: "{ex["sentence2"]}" -- entailment, neutral, or contradiction?', - 'answer': ex['gold_label'], - 'subtask': 'multinli', - } - split_file.write(json.dumps(ex) + '\n') - - with open(os.path.expanduser(os.path.join(path, 'validation.jsonl')), 'a') as split_file: - for subtask in ['matched', 'mismatched']: - with open(os.path.expanduser(os.path.join(path, 'multinli_1.0_dev_{}.jsonl'.format(subtask)))) as src_file: - for line in src_file: - ex = json.loads(line) - ex = { - 'context': f'Premise: "{ex["sentence1"]}"', - 'question': f'Hypothesis: "{ex["sentence2"]}" -- entailment, neutral, or contradiction?', - 'answer': ex['gold_label'], - 'subtask': 'in' if subtask == 'matched' else 'out', - } - split_file.write(json.dumps(ex) + '\n') - - @classmethod - def splits(cls, root='.data', train='train', validation='validation', test='test', **kwargs): - path = cls.download(root) - cls.cache_splits(path) - - train_data = None if train is None else cls(os.path.join(path, f'{train}.jsonl'), **kwargs) - validation_data = None if validation is None else cls(os.path.join(path, f'{validation}.jsonl'), **kwargs) - test_data = None if test is None else cls(os.path.join(path, f'{test}.jsonl'), **kwargs) - - aux_data = None - do_curriculum = kwargs.get('curriculum', False) - if do_curriculum: - kwargs.pop('curriculum') - aux_data = cls(os.path.join(path, 'aux.jsonl'), **kwargs) - - return Split( - train=None if train is None else train_data, - eval=None if validation is None else validation_data, - test=None if test is None else test_data, - aux=None if do_curriculum is None else aux_data, - ) - - -class ZeroShotRE(CQA): - urls = ['http://nlp.cs.washington.edu/zeroshot/relation_splits.tar.bz2'] - dirname = 'relation_splits' - name = 'zre' - - def __init__(self, path, subsample=None, lower=False, cached_path=None, skip_cache=False, **kwargs): - cache_name = os.path.join(cached_path, os.path.basename(path), str(subsample)) - if os.path.exists(cache_name) and not skip_cache: - logger.info(f'Loading cached data from {cache_name}') - examples = torch.load(cache_name) - else: - examples = [] - with open(os.path.expanduser(path)) as f: - for line in f: - ex = json.loads(line) - context, question, answer = ex['context'], ex['question'], ex['answer'] - examples.append( - Example.from_raw(make_example_id(self, len(examples)), context, question, answer, lower=lower) - ) - - if subsample is not None and len(examples) >= subsample: - break - os.makedirs(os.path.dirname(cache_name), exist_ok=True) - logger.info(f'Caching data to {cache_name}') - torch.save(examples, cache_name) - - super().__init__(examples, **kwargs) - - @classmethod - def cache_splits(cls, path, train='train', 
validation='dev', test='test'): - train_jsonl = os.path.expanduser(os.path.join(path, f'{train}.jsonl')) - if os.path.exists(train_jsonl): - return - - base_file_name = '{}.0' - for split in [train, validation, test]: - src_file_name = base_file_name.format(split) - with open(os.path.expanduser(os.path.join(path, f'{split}.jsonl')), 'a') as split_file: - with open(os.path.expanduser(os.path.join(path, src_file_name))) as src_file: - for line in src_file: - split_line = line.split('\t') - if len(split_line) == 4: - answer = '' - relation, question, subject, context = split_line - else: - relation, question, subject, context = split_line[:4] - answer = ', '.join(split_line[4:]) - question = question.replace('XXX', subject) - ex = { - 'context': context, - 'question': question, - 'answer': answer if len(answer) > 0 else 'unanswerable', - } - split_file.write(json.dumps(ex) + '\n') - - @classmethod - def splits(cls, root='.data', train='train', validation='dev', test='test', **kwargs): - path = cls.download(root) - cls.cache_splits(path) - - train_data = None if train is None else cls(os.path.join(path, f'{train}.jsonl'), **kwargs) - validation_data = None if validation is None else cls(os.path.join(path, f'{validation}.jsonl'), **kwargs) - test_data = None if test is None else cls(os.path.join(path, f'{test}.jsonl'), **kwargs) - - aux_data = None - do_curriculum = kwargs.get('curriculum', False) - if do_curriculum: - kwargs.pop('curriculum') - aux_data = cls(os.path.join(path, 'aux.jsonl'), **kwargs) - - return Split( - train=None if train is None else train_data, - eval=None if validation is None else validation_data, - test=None if test is None else test_data, - aux=None if do_curriculum is None else aux_data, - ) - - -class OntoNotesNER(CQA): - urls = [ - 'http://conll.cemantix.org/2012/download/ids/english/all/train.id', - 'http://conll.cemantix.org/2012/download/ids/english/all/development.id', - 'http://conll.cemantix.org/2012/download/ids/english/all/test.id', - ] - - name = 'ontonotes.ner' - dirname = '' - - @classmethod - def clean(cls, s): - closing_punctuation = {' .', ' ,', ' ;', ' !', ' ?', ' :', ' )', " '", " n't ", " %"} - opening_punctuation = {'( ', '$ '} - both_sides = {' - '} - s = ' '.join(s.split()).strip() - s = s.replace(' /.', ' .') - s = s.replace(' /?', ' ?') - s = s.replace('-LRB-', '(') - s = s.replace('-RRB-', ')') - s = s.replace('-LAB-', '<') - s = s.replace('-RAB-', '>') - s = s.replace('-AMP-', '&') - s = s.replace('%pw', ' ') - - for p in closing_punctuation: - s = s.replace(p, p.lstrip()) - for p in opening_punctuation: - s = s.replace(p, p.rstrip()) - for p in both_sides: - s = s.replace(p, p.strip()) - s = s.replace('``', '"') - s = s.replace("''", '"') - quote_is_open = True - quote_idx = s.find('"') - raw = '' - while quote_idx >= 0: - start_enamex_open_idx = s.find(' -1: - end_enamex_open_idx = s.find('">') + 2 - if start_enamex_open_idx <= quote_idx <= end_enamex_open_idx: - raw += s[:end_enamex_open_idx] - s = s[end_enamex_open_idx:] - quote_idx = s.find('"') - continue - if quote_is_open: - raw += s[: quote_idx + 1] - s = s[quote_idx + 1 :].strip() - quote_is_open = False - else: - raw += s[:quote_idx].strip() + '"' - s = s[quote_idx + 1 :] - quote_is_open = True - quote_idx = s.find('"') - raw += s - - return ' '.join(raw.split()).strip() - - def __init__( - self, - path, - one_answer=True, - subsample=None, - lower=False, - path_to_files='.data/ontonotes-release-5.0/data/files', - subtask='all', - nones=True, - cached_path=None, - skip_cache=False, 
- **kwargs, - ): - cache_name = os.path.join(cached_path, os.path.basename(path), str(subsample), subtask, str(nones)) - if os.path.exists(cache_name) and not skip_cache: - logger.info(f'Loading cached data from {cache_name}') - examples = torch.load(cache_name) - else: - examples = [] - with open(os.path.expanduser(path)) as f: - for line in f: - example_dict = json.loads(line) - t = example_dict['type'] - a = example_dict['answer'] - if subtask == 'both' or t == subtask: - if a != 'None' or nones: - ex = example_dict - context, question, answer = ex['context'], ex['question'], ex['answer'] - examples.append( - Example.from_raw(make_example_id(self, len(examples)), context, question, answer, lower=lower) - ) - - if subsample is not None and len(examples) >= subsample: - break - os.makedirs(os.path.dirname(cache_name), exist_ok=True) - logger.info(f'Caching data to {cache_name}') - torch.save(examples, cache_name) - - super(OntoNotesNER, self).__init__(examples, **kwargs) - - @classmethod - def cache_splits(cls, path, path_to_files, train='train', validation='development', test='test'): - - label_to_answer = { - 'PERSON': 'person', - 'NORP': 'political', - 'FAC': 'facility', - 'ORG': 'organization', - 'GPE': 'geopolitical', - 'LOC': 'location', - 'PRODUCT': 'product', - 'EVENT': 'event', - 'WORK_OF_ART': 'artwork', - 'LAW': 'legal', - 'LANGUAGE': 'language', - 'DATE': 'date', - 'TIME': 'time', - 'PERCENT': 'percentage', - 'MONEY': 'monetary', - 'QUANTITY': 'quantitative', - 'ORDINAL': 'ordinal', - 'CARDINAL': 'cardinal', - } - - pluralize = { - 'person': 'persons', - 'political': 'political', - 'facility': 'facilities', - 'organization': 'organizations', - 'geopolitical': 'geopolitical', - 'location': 'locations', - 'product': 'products', - 'event': 'events', - 'artwork': 'artworks', - 'legal': 'legal', - 'language': 'languages', - 'date': 'dates', - 'time': 'times', - 'percentage': 'percentages', - 'monetary': 'monetary', - 'quantitative': 'quantitative', - 'ordinal': 'ordinal', - 'cardinal': 'cardinal', - } - - for split in [train, validation, test]: - split_file_name = os.path.join(path, f'{split}.jsonl') - if os.path.exists(split_file_name): - continue - id_file = os.path.join(path, f'{split}.id') - - num_file_ids = 0 - with open(split_file_name, 'w') as split_file: - with open(os.path.expanduser(id_file)) as f: - for file_id in f: - example_file_name = os.path.join(os.path.expanduser(path_to_files), file_id.strip()) + '.name' - if not os.path.exists(example_file_name) or 'annotations/tc/ch' in example_file_name: - continue - num_file_ids += 1 - with open(example_file_name) as example_file: - lines = [x.strip() for x in example_file.readlines() if 'DOC' not in x] - for line in lines: - original = line - line = cls.clean(line) - entities = [] - while True: - start_enamex_open_idx = line.find('') + 2 - start_enamex_close_idx = line.find('') - end_enamex_close_idx = start_enamex_close_idx + len('') - - enamex_open_tag = line[start_enamex_open_idx:end_enamex_open_idx] - before_entity = line[:start_enamex_open_idx] - entity = line[end_enamex_open_idx:start_enamex_close_idx] - after_entity = line[end_enamex_close_idx:] - - if 'S_OFF' in enamex_open_tag: - s_off_start = enamex_open_tag.find('S_OFF="') - s_off_end = ( - enamex_open_tag.find('">') - if 'E_OFF' not in enamex_open_tag - else enamex_open_tag.find('" E_OFF') - ) - s_off = int(enamex_open_tag[s_off_start + len('S_OFF="') : s_off_end]) - enamex_open_tag = enamex_open_tag[: s_off_start - 2] + '">' - before_entity += entity[:s_off] - 
entity = entity[s_off:] - - if 'E_OFF' in enamex_open_tag: - s_off_start = enamex_open_tag.find('E_OFF="') - s_off_end = enamex_open_tag.find('">') - s_off = int(enamex_open_tag[s_off_start + len('E_OFF="') : s_off_end]) - enamex_open_tag = enamex_open_tag[: s_off_start - 2] + '">' - after_entity = entity[-s_off:] + after_entity - entity = entity[:-s_off] - - label_start = enamex_open_tag.find('TYPE="') + len('TYPE="') - label_end = enamex_open_tag.find('">') - label = enamex_open_tag[label_start:label_end] - assert label in label_to_answer - - offsets = (len(before_entity), len(before_entity) + len(entity)) - entities.append({'entity': entity, 'char_offsets': offsets, 'label': label}) - line = before_entity + entity + after_entity - - context = line.strip() - is_no_good = False - for entity_tuple in entities: - entity = entity_tuple['entity'] - start, end = entity_tuple['char_offsets'] - if not context[start:end] == entity: - is_no_good = True - break - if is_no_good: - logger.warning( - 'Throwing out example that looks poorly labeled: ', - original.strip(), - ' (', - file_id.strip(), - ')', - ) - continue - question = 'What are the tags for all entities?' - answer = '; '.join([f'{x["entity"]} -- {label_to_answer[x["label"]]}' for x in entities]) - if len(answer) == 0: - answer = 'None' - split_file.write( - json.dumps( - { - 'context': context, - 'question': question, - 'answer': answer, - 'file_id': file_id.strip(), - 'original': original.strip(), - 'entity_list': entities, - 'type': 'all', - } - ) - + '\n' - ) - partial_question = 'Which entities are {}?' - - for lab, ans in label_to_answer.items(): - question = partial_question.format(pluralize[ans]) - entity_of_type_lab = [x['entity'] for x in entities if x['label'] == lab] - answer = ', '.join(entity_of_type_lab) - if len(answer) == 0: - answer = 'None' - split_file.write( - json.dumps( - { - 'context': context, - 'question': question, - 'answer': answer, - 'file_id': file_id.strip(), - 'original': original.strip(), - 'entity_list': entities, - 'type': 'one', - } - ) - + '\n' - ) - - @classmethod - def splits(cls, root='.data', train='train', validation='development', test='test', **kwargs): - path_to_files = os.path.join(root, 'ontonotes-release-5.0', 'data', 'files') - assert os.path.exists(path_to_files) - path = cls.download(root) - cls.cache_splits(path, path_to_files) - - train_data = None if train is None else cls(os.path.join(path, f'{train}.jsonl'), **kwargs) - validation_data = ( - None if validation is None else cls(os.path.join(path, f'{validation}.jsonl'), one_answer=False, **kwargs) - ) - test_data = None if test is None else cls(os.path.join(path, f'{test}.jsonl'), one_answer=False, **kwargs) - - aux_data = None - do_curriculum = kwargs.get('curriculum', False) - if do_curriculum: - kwargs.pop('curriculum') - aux_data = cls(os.path.join(path, 'aux.jsonl'), **kwargs) - - return Split( - train=None if train is None else train_data, - eval=None if validation is None else validation_data, - test=None if test is None else test_data, - aux=None if do_curriculum is None else aux_data, - ) - - -class SNLI(CQA): - urls = ['http://nlp.stanford.edu/projects/snli/snli_1.0.zip'] - dirname = 'snli_1.0' - name = 'snli' - - def __init__(self, path, subsample=None, lower=False, cached_path=None, skip_cache=False, **kwargs): - cache_name = os.path.join(cached_path, os.path.basename(path), str(subsample)) - if os.path.exists(cache_name) and not skip_cache: - logger.info(f'Loading cached data from {cache_name}') - examples = 
torch.load(cache_name) - else: - examples = [] - with open(os.path.expanduser(path)) as f: - for line in f: - example_dict = json.loads(line) - ex = example_dict - context, question, answer = ex['context'], ex['question'], ex['answer'] - examples.append( - Example.from_raw(make_example_id(self, len(examples)), context, question, answer, lower=lower) - ) - - if subsample is not None and len(examples) >= subsample: - break - os.makedirs(os.path.dirname(cache_name), exist_ok=True) - logger.info(f'Caching data to {cache_name}') - torch.save(examples, cache_name) - - super().__init__(examples, **kwargs) - - @classmethod - def cache_splits(cls, path, train='train', validation='dev', test='test'): - train_jsonl = os.path.expanduser(os.path.join(path, f'{train}.jsonl')) - if os.path.exists(train_jsonl): - return - - base_file_name = 'snli_1.0_{}.jsonl' - for split in [train, validation, test]: - src_file_name = base_file_name.format(split) - with open(os.path.expanduser(os.path.join(path, f'{split}.jsonl')), 'a') as split_file: - with open(os.path.expanduser(os.path.join(path, src_file_name))) as src_file: - for line in src_file: - ex = json.loads(line) - ex = { - 'context': f'Premise: "{ex["sentence1"]}"', - 'question': f'Hypothesis: "{ex["sentence2"]}" -- entailment, neutral, or contradiction?', - 'answer': ex['gold_label'], - } - split_file.write(json.dumps(ex) + '\n') - - @classmethod - def splits(cls, root='.data', train='train', validation='dev', test='test', **kwargs): - path = cls.download(root) - cls.cache_splits(path) - - train_data = None if train is None else cls(os.path.join(path, f'{train}.jsonl'), **kwargs) - validation_data = None if validation is None else cls(os.path.join(path, f'{validation}.jsonl'), **kwargs) - test_data = None if test is None else cls(os.path.join(path, f'{test}.jsonl'), **kwargs) - - aux_data = None - do_curriculum = kwargs.get('curriculum', False) - if do_curriculum: - kwargs.pop('curriculum') - aux_data = cls(os.path.join(path, 'aux.jsonl'), **kwargs) - - return Split( - train=None if train is None else train_data, - eval=None if validation is None else validation_data, - test=None if test is None else test_data, - aux=None if do_curriculum is None else aux_data, - ) - - class JSON(CQA): name = 'json' diff --git a/genienlp/tasks/generic_task.py b/genienlp/tasks/generic_task.py index 6ae4b85a..309bfbf9 100644 --- a/genienlp/tasks/generic_task.py +++ b/genienlp/tasks/generic_task.py @@ -30,154 +30,12 @@ from collections import OrderedDict from ..data_utils.example import Example -from . import generic_dataset from .almond_task import BaseAlmondTask from .base_task import BaseTask from .generic_dataset import BiTODDataset, CrossNERDataset, OODDataset from .registry import register_task -@register_task('multi30k') -class Multi30K(BaseTask): - @property - def metrics(self): - return ['casedbleu', 'em'] - - def get_splits(self, root, **kwargs): - src, trg = ['.' + x for x in self.name.split('.')[1:]] - return generic_dataset.Multi30k.splits(exts=(src, trg), root=root, **kwargs) - - -@register_task('iwslt') -class IWSLT(BaseTask): - @property - def metrics(self): - return ['casedbleu', 'em'] - - def get_splits(self, root, **kwargs): - src, trg = ['.' 
+ x for x in self.name.split('.')[1:]] - return generic_dataset.IWSLT.splits(exts=(src, trg), root=root, **kwargs) - - -@register_task('squad') -class SQuAD(BaseTask): - @property - def metrics(self): - return ['em'] - - def get_splits(self, root, **kwargs): - return generic_dataset.SQuAD.splits(root=root, description=self.name, **kwargs) - - -@register_task('wikisql') -class WikiSQL(BaseTask): - @property - def metrics(self): - return ['em'] - - def get_splits(self, root, **kwargs): - return generic_dataset.WikiSQL.splits(root=root, query_as_question='query_as_question' in self.name, **kwargs) - - -@register_task('ontonotes') -class OntoNotesNER(BaseTask): - def get_splits(self, root, **kwargs): - split_task = self.name.split('.') - _, _, subtask, nones, counting = split_task - return generic_dataset.OntoNotesNER.splits( - subtask=subtask, nones=True if nones == 'nones' else False, root=root, **kwargs - ) - - -@register_task('woz') -class WoZ(BaseTask): - @property - def metrics(self): - return ['em'] - - def get_splits(self, root, **kwargs): - return generic_dataset.WOZ.splits(description=self.name, root=root, **kwargs) - - -@register_task('multinli') -class MultiNLI(BaseTask): - def get_splits(self, root, **kwargs): - return generic_dataset.MultiNLI.splits(description=self.name, root=root, **kwargs) - - -@register_task('srl') -class SRL(BaseTask): - @property - def metrics(self): - return ['em'] - - def get_splits(self, root, **kwargs): - return generic_dataset.SRL.splits(root=root, **kwargs) - - -@register_task('snli') -class SNLI(BaseTask): - def get_splits(self, root, **kwargs): - return generic_dataset.SNLI.splits(root=root, **kwargs) - - -@register_task('schema') -class WinogradSchema(BaseTask): - def get_splits(self, root, **kwargs): - return generic_dataset.WinogradSchema.splits(root=root, **kwargs) - - -class BaseSummarizationTask(BaseTask): - @property - def metrics(self): - return ['em'] - - -@register_task('cnn') -class CNN(BaseSummarizationTask): - def get_splits(self, root, **kwargs): - return generic_dataset.CNN.splits(root=root, **kwargs) - - -@register_task('dailymail') -class DailyMail(BaseSummarizationTask): - def get_splits(self, root, **kwargs): - return generic_dataset.DailyMail.splits(root=root, **kwargs) - - -@register_task('cnn_dailymail') -class CNNDailyMail(BaseSummarizationTask): - def get_splits(self, root, **kwargs): - split_cnn = generic_dataset.CNN.splits(root=root, **kwargs) - split_dm = generic_dataset.DailyMail.splits(root=root, **kwargs) - for scnn, sdm in zip(split_cnn, split_dm): - scnn.examples.extend(sdm) - return split_cnn - - -@register_task('sst') -class SST(BaseTask): - def get_splits(self, root, **kwargs): - return generic_dataset.SST.splits(root=root, **kwargs) - - -@register_task('imdb') -class IMDB(BaseTask): - def get_splits(self, root, **kwargs): - kwargs['validation'] = None - return generic_dataset.IMDb.splits(root=root, **kwargs) - - -@register_task('zre') -class ZRE(BaseTask): - @property - def metrics(self): - return ['em'] - - def get_splits(self, root, **kwargs): - return generic_dataset.ZeroShotRE.splits(root=root, **kwargs) - - @register_task('cross_ner') class CrossNERTask(BaseAlmondTask): politics_labels = [ From 7485141a5b46aee6aed48fdd0ce8a94715408e61 Mon Sep 17 00:00:00 2001 From: mehrad Date: Fri, 25 Feb 2022 17:33:24 -0800 Subject: [PATCH 06/19] Move validation code to appropriate model class --- genienlp/kfserver.py | 6 +- genienlp/models/transformer_seq2seq.py | 483 ++++++++++++++- .../transformer_token_classification.py 
| 76 ++- genienlp/predict.py | 1 - genienlp/server.py | 9 +- genienlp/train.py | 7 +- genienlp/util.py | 12 + genienlp/validate.py | 576 +----------------- 8 files changed, 581 insertions(+), 589 deletions(-) diff --git a/genienlp/kfserver.py b/genienlp/kfserver.py index 3b3e5eb2..c32a13e3 100644 --- a/genienlp/kfserver.py +++ b/genienlp/kfserver.py @@ -39,9 +39,9 @@ class KFModelServer(kfserving.KFModel): - def __init__(self, name, args, numericalizer, model, device, confidence_estimators, estimator_filenames, ned_model): + def __init__(self, name, args, model, device, confidence_estimators, estimator_filenames, ned_model): super().__init__(name) - self.server = Server(args, numericalizer, model, device, confidence_estimators, estimator_filenames, ned_model) + self.server = Server(args, model, device, confidence_estimators, estimator_filenames, ned_model) def load(self): log_model_size(logger, self.server.model, self.server.args.model) @@ -57,7 +57,7 @@ def predict(self, request): def main(args): model, device, confidence_estimators, estimator_filenames, ned_model = init(args) model_server = KFModelServer( - args.inference_name, args, model.numericalizer, model, device, confidence_estimators, estimator_filenames, ned_model + args.inference_name, args, model, device, confidence_estimators, estimator_filenames, ned_model ) model_server.load() kfserving.KFServer(workers=1).start([model_server]) diff --git a/genienlp/models/transformer_seq2seq.py b/genienlp/models/transformer_seq2seq.py index 6bcf8586..5d1b79f6 100644 --- a/genienlp/models/transformer_seq2seq.py +++ b/genienlp/models/transformer_seq2seq.py @@ -26,16 +26,29 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - +import copy import logging +import os +from collections import defaultdict from typing import List import torch -from transformers import AutoConfig, AutoModelForSeq2SeqLM, MBartTokenizer, MBartTokenizerFast +import ujson +from dateparser.languages import default_loader +from dialogues import Bitod +from transformers import AutoConfig, AutoModelForSeq2SeqLM, MarianTokenizer, MBartTokenizer, MBartTokenizerFast +from ..data_utils.example import NumericalizedExamples, SequentialField from ..data_utils.numericalizer import TransformerNumericalizer +from ..data_utils.progbar import progress_bar from ..model_utils.transformers_utils import MULTILINGUAL_TOKENIZERS -from ..util import ConfidenceFeatures, adjust_language_code +from ..util import ( + ConfidenceFeatures, + GenerationOutput, + adjust_language_code, + merge_translated_sentences, + replace_capturing_group, +) from .base import GenieModel from .common import LabelSmoothingCrossEntropy @@ -211,6 +224,470 @@ def generate( return generated + def validate( + self, + data_iterator, + task, + output_predictions_only=False, + output_confidence_features=False, + original_order=None, + confidence_estimators=None, + disable_progbar=True, + ): + """ + Inputs: + original_order: List of indices. 
If provided, we will sort the results according to this order + confidence_estimator: if provided, will use it to calculate and output confidence scores + Outputs: predictions if `output_predictions_only` == True, (loss, predictions, answers, contexts) otherwise + loss + predictions: a List of Lists of strings + answers + contexts + """ + total_loss = 0.0 if 'loss' in task.metrics else None + output_confidence_scores = confidence_estimators is not None + predictions = [] + raw_predictions = [] + confidence_features = [] + example_ids = [] + answers = [] + contexts = [] + + if self.numericalizer._tokenizer.tgt_lang: + tgt_lang = self.numericalizer._tokenizer.tgt_lang + else: + tgt_lang = self.model.orig_tgt_lang + + if self.numericalizer._tokenizer.src_lang: + src_lang = self.numericalizer._tokenizer.src_lang + else: + src_lang = self.model.orig_src_lang + + date_parser = default_loader.get_locale(src_lang[:2]) + + translate_return_raw_outputs = getattr(self.args, 'translate_return_raw_outputs', False) + + for batch in progress_bar(data_iterator, desc='Generating', disable=disable_progbar): + batch_size = len(batch.example_id) + batch_prediction = [[] for _ in range(batch_size)] + batch_raw_prediction = [[] for _ in range(batch_size)] + batch_confidence_features = [[] for _ in range(batch_size)] + batch_example_ids = batch.example_id + + example_ids += batch_example_ids + if not output_predictions_only: + batch_answer = self.numericalizer.reverse(batch.answer.value.data, 'answer') + batch_answer = [ + task.postprocess_prediction(batch_example_ids[i], batch_answer[i]) for i in range(len(batch_answer)) + ] + answers += batch_answer + batch_context = self.numericalizer.reverse(batch.context.value.data, 'context') + contexts += batch_context + elif output_confidence_features: + # need gold answer for confidence estimation + batch_answer = self.numericalizer.reverse(batch.answer.value.data, 'answer') + answers += batch_answer + + if total_loss is not None: + loss = self.forward(batch, train=True).loss.item() + total_loss += loss + + for hyperparameter_idx in range(len(self.args.temperature)): + generated = self.generate( + batch, + max_output_length=self.args.max_output_length, + min_output_length=self.args.min_output_length, + num_outputs=self.args.num_outputs[hyperparameter_idx], + temperature=self.args.temperature[hyperparameter_idx] + if self.args.temperature[hyperparameter_idx] > 0 + else 1.0, + repetition_penalty=self.args.repetition_penalty[hyperparameter_idx], + top_k=self.args.top_k[hyperparameter_idx], + top_p=self.args.top_p[hyperparameter_idx], + num_beams=self.args.num_beams[hyperparameter_idx], + num_beam_groups=self.args.num_beam_groups[hyperparameter_idx], + diversity_penalty=self.args.diversity_penalty[hyperparameter_idx], + no_repeat_ngram_size=self.args.no_repeat_ngram_size[hyperparameter_idx], + do_sample=self.args.temperature[hyperparameter_idx] != 0, # if temperature==0, we do not sample + ) + partial_batch_prediction_ids = generated.sequences + partial_batch_words = None + + if getattr(task, 'need_attention_scores', False): + cross_attentions = generated.cross_attentions + + # stack tensors to shape (max_output_length, num_layers, batch_size, num_heads, 1, max_input_length) + cross_attentions = torch.stack(([torch.stack(tuple) for tuple in cross_attentions])).cpu() + + # reshape to (num_layers, batch_size, num_heads, max_output_length, max_input_length) + cross_attentions = cross_attentions.squeeze(4) + cross_attentions = cross_attentions.permute(1, 2, 3, 0, 
4).contiguous() + + # choose only last layer attentions + # cross_attentions = torch.mean(cross_attentions[-3:, ...], dim=0) + cross_attentions = cross_attentions[-1, ...] + + # postprocess prediction ids + kwargs = { + 'self.numericalizer': self.numericalizer, + 'cross_attentions': cross_attentions, + 'tgt_lang': tgt_lang, + 'date_parser': date_parser, + } + + if translate_return_raw_outputs: + partial_batch_raw_prediction_ids = partial_batch_prediction_ids + + partial_batch_prediction_ids, partial_batch_words = task.batch_postprocess_prediction_ids( + batch_example_ids, batch.context.value.data, partial_batch_prediction_ids, **kwargs + ) + + # MarianTokenizer uses two different spm models for encoding source and target languages. + # in almond_translate we postprocess text with alignment which produces code-switched sentences. + # encoding a code-switched sentence with either spm will omit tokens from the other language + # so we have to return both the processed and encoded text. + # we need to return encoded text too since confidence_features requires ids + if isinstance(self.numericalizer._tokenizer, MarianTokenizer) and partial_batch_words: + partial_batch_prediction = partial_batch_words + else: + if output_confidence_features or output_confidence_scores: + partial_batch_confidence_features = self.model.confidence_features( + batch=batch, predictions=partial_batch_prediction_ids, mc_dropout_num=self.args.mc_dropout_num + ) + partial_batch_prediction = self.numericalizer.reverse(partial_batch_prediction_ids, 'answer') + + def get_example_index(i): + return (i // self.args.num_outputs[hyperparameter_idx]) % batch_size + + if translate_return_raw_outputs: + partial_batch_raw_prediction = self.numericalizer.reverse(partial_batch_raw_prediction_ids, 'answer') + for i in range(len(partial_batch_prediction)): + partial_batch_raw_prediction[i] = task.postprocess_prediction( + batch_example_ids[get_example_index(i)], partial_batch_raw_prediction[i] + ) + for i in range(len(partial_batch_prediction)): + batch_raw_prediction[get_example_index(i)].append(partial_batch_raw_prediction[i]) + + # post-process predictions + for i in range(len(partial_batch_prediction)): + partial_batch_prediction[i] = task.postprocess_prediction( + batch_example_ids[get_example_index(i)], partial_batch_prediction[i] + ) + + # put them into the right array + for i in range(len(partial_batch_prediction)): + batch_prediction[get_example_index(i)].append(partial_batch_prediction[i]) + if output_confidence_features or output_confidence_scores: + batch_confidence_features[get_example_index(i)].append(partial_batch_confidence_features[i]) + + predictions += batch_prediction + confidence_features += batch_confidence_features + raw_predictions += batch_raw_prediction + + if total_loss is not None: + total_loss /= len(example_ids) + + if original_order is not None: + # sort back to the original order + original_order, example_ids, predictions, raw_predictions, answers, contexts, confidence_features = [ + list(a) + for a in tuple( + zip( + *sorted( + list( + zip( + original_order, + example_ids, + predictions, + raw_predictions, + answers, + contexts, + confidence_features, + ) + ) + ) + ) + ) + ] + + if getattr(self.args, 'translate_example_split', False): + # stitch sentences back together + example_ids, predictions, raw_predictions, answers, contexts, confidence_features = merge_translated_sentences( + example_ids, + predictions, + raw_predictions, + answers, + contexts, + confidence_features, + 
self.numericalizer._tokenizer.src_lang, + self.numericalizer._tokenizer.tgt_lang, + ) + + output = GenerationOutput(loss=total_loss) + + if output_predictions_only: + output.predictions = predictions + else: + output.example_ids, output.predictions, output.answers, output.contexts = ( + example_ids, + predictions, + answers, + contexts, + ) + if output_confidence_features: + output.confidence_features = confidence_features + if self.args.override_confidence_labels: + for i, example in enumerate(confidence_features): + for confidence in example: + confidence.label = answers[i] == self.args.override_confidence_labels + if output_confidence_scores: + output.confidence_scores = [] + for estimator in confidence_estimators: + confidence_scores = estimator.estimate(confidence_features) + output.confidence_scores.append(confidence_scores) + if translate_return_raw_outputs: + output.raw_predictions = raw_predictions + + return output + + def validate_e2e_dialogues( + self, data_iterator, task, eval_dir, output_predictions_only=False, original_order=None, disable_progbar=True + ): + """ + Inputs: + original_order: List of indices. If provided, we will sort the results according to this order + confidence_estimator: if provided, will use it to calculate and output confidence scores + Outputs: predictions if `output_predictions_only` == True, (loss, predictions, answers, contexts) otherwise + loss + predictions: a List of Lists of strings + answers + contexts + """ + + dataset = Bitod() + e2e_dialogue_preds = dict() + + predictions = [] + example_ids = [] + answers = [] + contexts = [] + + # TODO: handle multiple responses + hyperparameter_idx = 0 + + cur_dial_id = '' + knowledge = None + + device = self.device + args = self.args + + special_tokens = self.numericalizer._tokenizer.all_special_tokens + + for k, turn in enumerate(progress_bar(data_iterator, desc='Generating', disable=disable_progbar)): + batch_size = len(turn.example_id) + assert batch_size == 1 + batch_prediction = [] + batch_example_ids = turn.example_id + + example_ids += batch_example_ids + + task_name, dial_id, turn_id, train_target = example_ids[-1].split('/') + turn_id = int(turn_id) + + if cur_dial_id != dial_id: + # new dialogue + cur_dial_id = dial_id + dialogue_state = {} + # new_state_text = 'null' + knowledge = defaultdict(dict) + new_knowledge_text = 'null' + new_actions_text = 'null' + active_api = None + e2e_dialogue_preds[dial_id] = {"turns": defaultdict(dict), "API": defaultdict(dict)} + + batch_context = [] + batch_tokens = self.numericalizer.convert_ids_to_tokens(turn.context.value.data, skip_special_tokens=False) + + # remove only beginning and trailing special tokens + # otherwise the sep_token added between context and question will be lost + for text in batch_tokens: + i = 0 + while text[i] in special_tokens: + i += 1 + j = len(text) - 1 + while text[j] in special_tokens: + j -= 1 + text = text[i : j + 1] + + batch_context.append(self.numericalizer._tokenizer.convert_tokens_to_string(text)) + + contexts += batch_context + + if not output_predictions_only: + batch_answer = self.numericalizer.reverse(turn.answer.value.data, 'answer') + batch_answer = [ + task.postprocess_prediction(batch_example_ids[i], batch_answer[i]) for i in range(len(batch_answer)) + ] + answers += batch_answer + + new_state_text = dataset.state2span(dialogue_state) + + if train_target == 'dst': + input_text = replace_capturing_group(contexts[-1], dataset.state_re, new_state_text) + + ## we always use gold history following common practice + ## 
if you want to use predicted response instead of gold uncomment the following + # last_sys_pred = predictions[-1][0].strip() + # input_text = replace_match(input_text, last_system_re, last_sys_pred) + + elif train_target == 'api': + + # replace state + input_text = replace_capturing_group(contexts[-1], dataset.state_re, new_state_text) + + elif train_target == 'da': + # replace state + input_text = replace_capturing_group(contexts[-1], dataset.state_re, new_state_text) + + # replace knowledge + input_text = replace_capturing_group(input_text, dataset.knowledge_re, new_knowledge_text) + + elif train_target == 'rg': + + # replace actions + input_text = replace_capturing_group(contexts[-1], dataset.actions_re, new_actions_text) + + else: + raise ValueError(f'Invalid train_target: {train_target}') + + # replace old context with updated + contexts[-1] = input_text + + tokenized_contexts = self.numericalizer.encode_batch([input_text], field_name='context', features=None)[0] + + numericalized_turn = NumericalizedExamples( + example_id=[turn.example_id[0]], + context=SequentialField( + value=torch.tensor([tokenized_contexts.value], device=device), + length=torch.tensor([tokenized_contexts.length], device=device), + limited=torch.tensor([tokenized_contexts.limited], device=device), + feature=None, + ), + answer=SequentialField(value=None, length=None, limited=None, feature=None), + ) + + generated = self.generate( + numericalized_turn, + max_output_length=args.max_output_length, + min_output_length=args.min_output_length, + num_outputs=args.num_outputs[hyperparameter_idx], + temperature=args.temperature[hyperparameter_idx] if args.temperature[hyperparameter_idx] > 0 else 1.0, + repetition_penalty=args.repetition_penalty[hyperparameter_idx], + top_k=args.top_k[hyperparameter_idx], + top_p=args.top_p[hyperparameter_idx], + num_beams=args.num_beams[hyperparameter_idx], + num_beam_groups=args.num_beam_groups[hyperparameter_idx], + diversity_penalty=args.diversity_penalty[hyperparameter_idx], + no_repeat_ngram_size=args.no_repeat_ngram_size[hyperparameter_idx], + do_sample=args.temperature[hyperparameter_idx] != 0, + ) + + partial_batch_prediction_ids = generated.sequences + + partial_batch_prediction = self.numericalizer.reverse(partial_batch_prediction_ids, 'answer')[0] + + if train_target == 'da': + partial_batch_prediction = dataset.postprocess_prediction( + partial_batch_prediction, knowledge, lang=self.numericalizer._tokenizer.src_lang[:2] + ) + + partial_batch_prediction = task.postprocess_prediction(batch_example_ids[0], partial_batch_prediction) + + # put them into the right array + batch_prediction.append([partial_batch_prediction]) + + predictions += batch_prediction + + if train_target == 'dst': + # update dialogue_state + lev = predictions[-1][0].strip() + state_update = dataset.span2state(lev) + if state_update: + active_api = list(state_update.keys())[-1] + dataset.update_state(state_update, dialogue_state) + + #### save latest state + state_to_record = copy.deepcopy(dialogue_state) + state_to_record = {dataset.domain2api_name(k): v for k, v in state_to_record.items()} + e2e_dialogue_preds[dial_id]["turns"][str(turn_id)]["state"] = state_to_record + #### + + elif train_target == 'api': + if dataset.do_knowledge_reset(active_api): + new_knowledge_text = "null" + knowledge = defaultdict(dict) + + do_api_call = predictions[-1][0].strip() + + if do_api_call == 'yes': + # make api call + api_name = active_api + if api_name in dialogue_state: + constraints, new_knowledge_text = 
dataset.make_api_call( + dialogue_state, knowledge, api_name, self.numericalizer._tokenizer.src_lang, dial_id, turn_id + ) + #### save latest api constraints + e2e_dialogue_preds[dial_id]["API"][dataset.domain2api_name(api_name)] = copy.deepcopy(constraints) + #### + + elif do_api_call == 'no': + # do nothing + pass + else: + logger.error( + f'API call should be either yes or no but got {do_api_call}. Seems model is not trained for enough steps. For now we assume it\'s a no' + ) + + #### save latest api results + e2e_dialogue_preds[dial_id]["turns"][str(turn_id)]["api"] = new_knowledge_text + #### + + elif train_target == 'da': + new_actions_text = predictions[-1][0] + #### save latest actions + e2e_dialogue_preds[dial_id]["turns"][str(turn_id)]["actions"] = predictions[-1][0] + #### + + elif train_target == 'rg': + #### save latest response + e2e_dialogue_preds[dial_id]["turns"][str(turn_id)]["response"] = predictions[-1] + #### + + with open(os.path.join(eval_dir, 'e2e_dialogue_preds.json'), 'w') as fout: + ujson.dump(e2e_dialogue_preds, fout, indent=2, ensure_ascii=False) + + if original_order is not None: + # sort back to the original order + original_order, example_ids, predictions, answers, contexts = [ + list(a) for a in tuple(zip(*sorted(list(zip(original_order, example_ids, predictions, answers, contexts))))) + ] + + # TODO calculate and return loss + loss = None + output = GenerationOutput(loss=loss) + + if output_predictions_only: + output.predictions = predictions + else: + output.example_ids, output.predictions, output.answers, output.contexts = ( + example_ids, + predictions, + answers, + contexts, + ) + + return output + def confidence_features(self, batch, predictions, mc_dropout_num=0) -> List[ConfidenceFeatures]: """ predictions: Tensor of shape (batch_size, output_length) diff --git a/genienlp/models/transformer_token_classification.py b/genienlp/models/transformer_token_classification.py index e6198f70..f40bbd2b 100644 --- a/genienlp/models/transformer_token_classification.py +++ b/genienlp/models/transformer_token_classification.py @@ -29,11 +29,13 @@ import logging +import torch from transformers import AutoConfig, AutoModelForTokenClassification from ..data_utils.numericalizer import TransformerNumericalizer +from ..data_utils.progbar import progress_bar from ..models.base import GenieModel -from ..util import adjust_language_code +from ..util import GenerationOutput, adjust_language_code logger = logging.getLogger(__name__) @@ -102,3 +104,75 @@ def forward(self, *input, **kwargs): return outputs else: return self.model(**kwargs) + + def validate(self, data_iterator, task, original_order=None, disable_progbar=True): + total_loss = 0.0 + all_example_ids = [] + all_answers = [] + all_contexts = [] + all_predictions = [] + + for batch in progress_bar(data_iterator, desc='Generating', disable=disable_progbar): + batch_example_ids = batch.example_id + + batch_context = self.numericalizer.reverse(batch.context.value.data, 'context') + + all_example_ids += batch_example_ids + + # pass labels to get loss + output = self.forward( + input_ids=batch.context.value, + attention_mask=(batch.context.value != self.numericalizer.pad_id), + labels=batch.answer.value, + ) + + labels = batch.answer.value.tolist() + + logits = output.logits + predictions = torch.argmax(logits, dim=-1).tolist() + + # logits for sequence classification is 2 dimensional + if logits.dim() == 2: + predictions = [[p] for p in predictions] + + # Remove ignored index (special tokens) + processed_preds = [] + 
processed_labels = [] + for pred, label in zip(predictions, labels): + preds_list = [] + labels_list = [] + for p_, l_ in zip(pred, label): + if l_ == self.numericalizer.answer_pad_id: + continue + preds_list.append(task.id2label[p_]) + labels_list.append(task.id2label[l_]) + + processed_preds.append([" ".join(preds_list)]) + processed_labels.append(" ".join(labels_list)) + + all_contexts += batch_context + all_answers += processed_labels + all_predictions += processed_preds + + total_loss += output.loss + + total_loss /= len(all_example_ids) + + if original_order is not None: + # sort back to the original order + original_order, all_example_ids, all_predictions, all_answers, all_contexts = [ + list(a) + for a in tuple( + zip(*sorted(list(zip(original_order, all_example_ids, all_predictions, all_answers, all_contexts)))) + ) + ] + + output = GenerationOutput( + loss=total_loss, + example_ids=all_example_ids, + contexts=all_contexts, + answers=all_answers, + predictions=all_predictions, + ) + + return output diff --git a/genienlp/predict.py b/genienlp/predict.py index ae7f826e..4a48023d 100644 --- a/genienlp/predict.py +++ b/genienlp/predict.py @@ -479,7 +479,6 @@ def run(args, device): generation_output = generate_with_model( model, it, - model.numericalizer, task, args, original_order=original_order, diff --git a/genienlp/server.py b/genienlp/server.py index 9f700ba0..a348a0e7 100644 --- a/genienlp/server.py +++ b/genienlp/server.py @@ -130,10 +130,10 @@ def parse_argv(parser): class Server(object): - def __init__(self, args, numericalizer, model, device, confidence_estimators, estimator_filenames, ned_model): + def __init__(self, args, model, device, confidence_estimators, estimator_filenames, ned_model): self.args = args self.device = device - self.numericalizer = numericalizer + self.numericalizer = model.numericalizer self.model = model self.confidence_estimators = confidence_estimators self.estimator_filenames = estimator_filenames @@ -216,7 +216,6 @@ def _predict_batch(self, batch, task, args): output = generate_with_model( self.model, [batch], - self.numericalizer, task, args, output_predictions_only=True, @@ -239,7 +238,7 @@ def _predict_batch(self, batch, task, args): instance['score'][self.estimator_filenames[e_idx]] = float(estimator_scores[idx]) response.append(instance) else: - output = generate_with_model(self.model, [batch], self.numericalizer, task, args, output_predictions_only=True) + output = generate_with_model(self.model, [batch], task, args, output_predictions_only=True) if sum(args.num_outputs) > 1: response = [] for idx, predictions in enumerate(output.predictions): @@ -384,5 +383,5 @@ def init(args): def main(args): model, device, confidence_estimators, estimator_filenames, ned_model = init(args) - server = Server(args, model.numericalizer, model, device, confidence_estimators, estimator_filenames, ned_model) + server = Server(args, model, device, confidence_estimators, estimator_filenames, ned_model) server.run() diff --git a/genienlp/train.py b/genienlp/train.py index 3f23b767..830b5fb3 100644 --- a/genienlp/train.py +++ b/genienlp/train.py @@ -228,12 +228,10 @@ def should_log(iteration, log_every): return iteration % log_every == 0 -def do_validate( - iteration, args, model, numericalizer, val_iters, *, train_task, round_progress, task_progress, writer, logger -): +def do_validate(iteration, args, model, val_iters, *, train_task, round_progress, task_progress, writer, logger): deca_score = 0 for val_task_idx, (val_task, val_iter) in enumerate(val_iters): - 
output, metric_dict = validate(val_task, val_iter, model, numericalizer, args, num_print=args.num_print) + output, metric_dict = validate(val_task, val_iter, model, args, num_print=args.num_print) val_loss = output.loss if val_loss is not None: log_entry = f'{args.timestamp}:{elapsed_time(logger)}:iteration_{iteration}:{round_progress}train_{train_task.name}:{task_progress}val_{val_task.name}:val_loss_{val_loss:.4f}:' @@ -572,7 +570,6 @@ def train( iteration, args, model, - numericalizer, val_iters, train_task=task, round_progress=round_progress, diff --git a/genienlp/util.py b/genienlp/util.py index bb789738..5b174ec1 100644 --- a/genienlp/util.py +++ b/genienlp/util.py @@ -1002,3 +1002,15 @@ def load_config_json(args): args.verbose = False args.best_checkpoint = os.path.join(args.path, args.checkpoint_name) + + +def replace_capturing_group(input, re_pattern, replacement): + # replace first captured group in the input with replacement using regex re_pattern + if re_pattern.search(input): + whole_match = re_pattern.search(input).group(0).strip() + captured_match = re_pattern.search(input).group(1).strip() + new_whole_match = whole_match.replace(captured_match, replacement) + new_input = re.sub(re_pattern, new_whole_match, input) + else: + new_input = input + return new_input diff --git a/genienlp/validate.py b/genienlp/validate.py index e9bb35e0..cbb3ad07 100644 --- a/genienlp/validate.py +++ b/genienlp/validate.py @@ -27,24 +27,13 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import copy import logging -import os -import re import sys -from collections import defaultdict import torch -import ujson -from dateparser.languages import default_loader -from dialogues import Bitod -from transformers import MarianTokenizer -from .data_utils.example import NumericalizedExamples, SequentialField -from .data_utils.progbar import progress_bar from .metrics import calculate_and_reduce_metrics from .models import TransformerForSequenceClassification, TransformerForTokenClassification -from .util import GenerationOutput, merge_translated_sentences logger = logging.getLogger(__name__) @@ -52,7 +41,6 @@ def generate_with_model( model, data_iterator, - numericalizer, task, args, output_predictions_only=False, @@ -63,12 +51,9 @@ def generate_with_model( eval_dir=None, ): if args.e2e_dialogue_evaluation: - return generate_with_seq2seq_model_for_dialogue( - model, + return model.validate_e2e_dialogues( data_iterator, - numericalizer, task, - args, eval_dir, output_predictions_only=output_predictions_only, original_order=original_order, @@ -76,16 +61,11 @@ def generate_with_model( ) elif isinstance(model, (TransformerForTokenClassification, TransformerForSequenceClassification)): - return generate_with_classification_model( - model, data_iterator, numericalizer, task, original_order=original_order, disable_progbar=disable_progbar - ) + return model.validate(data_iterator, task, original_order=original_order, disable_progbar=disable_progbar) else: - return generate_with_seq2seq_model( - model, + return model.validate( data_iterator, - numericalizer, task, - args, output_predictions_only=output_predictions_only, output_confidence_features=output_confidence_features, original_order=original_order, @@ -94,552 +74,6 @@ def generate_with_model( ) -def replace_capturing_group(input, re_pattern, replacement): - # replace first 
captured group in the input with replacement using regex re_pattern - if re_pattern.search(input): - whole_match = re_pattern.search(input).group(0).strip() - captured_match = re_pattern.search(input).group(1).strip() - new_whole_match = whole_match.replace(captured_match, replacement) - new_input = re.sub(re_pattern, new_whole_match, input) - else: - new_input = input - return new_input - - -def generate_with_seq2seq_model_for_dialogue( - model, - data_iterator, - numericalizer, - task, - args, - eval_dir, - output_predictions_only=False, - original_order=None, - disable_progbar=True, -) -> GenerationOutput: - """ - Inputs: - original_order: List of indices. If provided, we will sort the results according to this order - confidence_estimator: if provided, will use it to calculate and output confidence scores - Outputs: predictions if `output_predictions_only` == True, (loss, predictions, answers, contexts) otherwise - loss - predictions: a List of Lists of strings - answers - contexts - """ - - dataset = Bitod() - e2e_dialogue_preds = dict() - - predictions = [] - example_ids = [] - answers = [] - contexts = [] - - # TODO: handle multiple responses - hyperparameter_idx = 0 - - cur_dial_id = '' - knowledge = None - - device = model.device - - special_tokens = numericalizer._tokenizer.all_special_tokens - - for k, turn in enumerate(progress_bar(data_iterator, desc='Generating', disable=disable_progbar)): - batch_size = len(turn.example_id) - assert batch_size == 1 - batch_prediction = [] - batch_example_ids = turn.example_id - - example_ids += batch_example_ids - - task_name, dial_id, turn_id, train_target = example_ids[-1].split('/') - turn_id = int(turn_id) - - if cur_dial_id != dial_id: - # new dialogue - cur_dial_id = dial_id - dialogue_state = {} - # new_state_text = 'null' - knowledge = defaultdict(dict) - new_knowledge_text = 'null' - new_actions_text = 'null' - active_api = None - e2e_dialogue_preds[dial_id] = {"turns": defaultdict(dict), "API": defaultdict(dict)} - - batch_context = [] - batch_tokens = numericalizer.convert_ids_to_tokens(turn.context.value.data, skip_special_tokens=False) - - # remove only beginning and trailing special tokens - # otherwise the sep_token added between context and question will be lost - for text in batch_tokens: - i = 0 - while text[i] in special_tokens: - i += 1 - j = len(text) - 1 - while text[j] in special_tokens: - j -= 1 - text = text[i : j + 1] - - batch_context.append(numericalizer._tokenizer.convert_tokens_to_string(text)) - - contexts += batch_context - - if not output_predictions_only: - batch_answer = numericalizer.reverse(turn.answer.value.data, 'answer') - batch_answer = [ - task.postprocess_prediction(batch_example_ids[i], batch_answer[i]) for i in range(len(batch_answer)) - ] - answers += batch_answer - - new_state_text = dataset.state2span(dialogue_state) - - if train_target == 'dst': - input_text = replace_capturing_group(contexts[-1], dataset.state_re, new_state_text) - - ## we always use gold history following common practice - ## if you want to use predicted response instead of gold uncomment the following - # last_sys_pred = predictions[-1][0].strip() - # input_text = replace_match(input_text, last_system_re, last_sys_pred) - - elif train_target == 'api': - - # replace state - input_text = replace_capturing_group(contexts[-1], dataset.state_re, new_state_text) - - elif train_target == 'da': - # replace state - input_text = replace_capturing_group(contexts[-1], dataset.state_re, new_state_text) - - # replace knowledge - 
input_text = replace_capturing_group(input_text, dataset.knowledge_re, new_knowledge_text) - - elif train_target == 'rg': - - # replace actions - input_text = replace_capturing_group(contexts[-1], dataset.actions_re, new_actions_text) - - else: - raise ValueError(f'Invalid train_target: {train_target}') - - # replace old context with updated - contexts[-1] = input_text - - tokenized_contexts = numericalizer.encode_batch([input_text], field_name='context', features=None)[0] - - numericalized_turn = NumericalizedExamples( - example_id=[turn.example_id[0]], - context=SequentialField( - value=torch.tensor([tokenized_contexts.value], device=device), - length=torch.tensor([tokenized_contexts.length], device=device), - limited=torch.tensor([tokenized_contexts.limited], device=device), - feature=None, - ), - answer=SequentialField(value=None, length=None, limited=None, feature=None), - ) - - generated = model.generate( - numericalized_turn, - max_output_length=args.max_output_length, - min_output_length=args.min_output_length, - num_outputs=args.num_outputs[hyperparameter_idx], - temperature=args.temperature[hyperparameter_idx] if args.temperature[hyperparameter_idx] > 0 else 1.0, - repetition_penalty=args.repetition_penalty[hyperparameter_idx], - top_k=args.top_k[hyperparameter_idx], - top_p=args.top_p[hyperparameter_idx], - num_beams=args.num_beams[hyperparameter_idx], - num_beam_groups=args.num_beam_groups[hyperparameter_idx], - diversity_penalty=args.diversity_penalty[hyperparameter_idx], - no_repeat_ngram_size=args.no_repeat_ngram_size[hyperparameter_idx], - do_sample=args.temperature[hyperparameter_idx] != 0, - ) - - partial_batch_prediction_ids = generated.sequences - - partial_batch_prediction = numericalizer.reverse(partial_batch_prediction_ids, 'answer')[0] - - if train_target == 'da': - partial_batch_prediction = dataset.postprocess_prediction( - partial_batch_prediction, knowledge, lang=numericalizer._tokenizer.src_lang[:2] - ) - - partial_batch_prediction = task.postprocess_prediction(batch_example_ids[0], partial_batch_prediction) - - # put them into the right array - batch_prediction.append([partial_batch_prediction]) - - predictions += batch_prediction - - if train_target == 'dst': - # update dialogue_state - lev = predictions[-1][0].strip() - state_update = dataset.span2state(lev) - if state_update: - active_api = list(state_update.keys())[-1] - dataset.update_state(state_update, dialogue_state) - - #### save latest state - state_to_record = copy.deepcopy(dialogue_state) - state_to_record = {dataset.domain2api_name(k): v for k, v in state_to_record.items()} - e2e_dialogue_preds[dial_id]["turns"][str(turn_id)]["state"] = state_to_record - #### - - elif train_target == 'api': - if dataset.do_knowledge_reset(active_api): - new_knowledge_text = "null" - knowledge = defaultdict(dict) - - do_api_call = predictions[-1][0].strip() - - if do_api_call == 'yes': - # make api call - api_name = active_api - if api_name in dialogue_state: - constraints, new_knowledge_text = dataset.make_api_call( - dialogue_state, knowledge, api_name, numericalizer._tokenizer.src_lang, dial_id, turn_id - ) - #### save latest api constraints - e2e_dialogue_preds[dial_id]["API"][dataset.domain2api_name(api_name)] = copy.deepcopy(constraints) - #### - - elif do_api_call == 'no': - # do nothing - pass - else: - logger.error( - f'API call should be either yes or no but got {do_api_call}. Seems model is not trained for enough steps. 
For now we assume it\'s a no' - ) - - #### save latest api results - e2e_dialogue_preds[dial_id]["turns"][str(turn_id)]["api"] = new_knowledge_text - #### - - elif train_target == 'da': - new_actions_text = predictions[-1][0] - #### save latest actions - e2e_dialogue_preds[dial_id]["turns"][str(turn_id)]["actions"] = predictions[-1][0] - #### - - elif train_target == 'rg': - #### save latest response - e2e_dialogue_preds[dial_id]["turns"][str(turn_id)]["response"] = predictions[-1] - #### - - with open(os.path.join(eval_dir, 'e2e_dialogue_preds.json'), 'w') as fout: - ujson.dump(e2e_dialogue_preds, fout, indent=2, ensure_ascii=False) - - if original_order is not None: - # sort back to the original order - original_order, example_ids, predictions, answers, contexts = [ - list(a) for a in tuple(zip(*sorted(list(zip(original_order, example_ids, predictions, answers, contexts))))) - ] - - # TODO calculate and return loss - loss = None - output = GenerationOutput(loss=loss) - - if output_predictions_only: - output.predictions = predictions - else: - output.example_ids, output.predictions, output.answers, output.contexts = example_ids, predictions, answers, contexts - - return output - - -def generate_with_seq2seq_model( - model, - data_iterator, - numericalizer, - task, - args, - output_predictions_only=False, - output_confidence_features=False, - original_order=None, - confidence_estimators=None, - disable_progbar=True, -) -> GenerationOutput: - """ - Inputs: - original_order: List of indices. If provided, we will sort the results according to this order - confidence_estimator: if provided, will use it to calculate and output confidence scores - Outputs: predictions if `output_predictions_only` == True, (loss, predictions, answers, contexts) otherwise - loss - predictions: a List of Lists of strings - answers - contexts - """ - total_loss = 0.0 if 'loss' in task.metrics else None - output_confidence_scores = confidence_estimators is not None - predictions = [] - raw_predictions = [] - confidence_features = [] - example_ids = [] - answers = [] - contexts = [] - - if numericalizer._tokenizer.tgt_lang: - tgt_lang = numericalizer._tokenizer.tgt_lang - else: - tgt_lang = model.orig_tgt_lang - - if numericalizer._tokenizer.src_lang: - src_lang = numericalizer._tokenizer.src_lang - else: - src_lang = model.orig_src_lang - - date_parser = default_loader.get_locale(src_lang[:2]) - - translate_return_raw_outputs = getattr(args, 'translate_return_raw_outputs', False) - - for batch in progress_bar(data_iterator, desc='Generating', disable=disable_progbar): - batch_size = len(batch.example_id) - batch_prediction = [[] for _ in range(batch_size)] - batch_raw_prediction = [[] for _ in range(batch_size)] - batch_confidence_features = [[] for _ in range(batch_size)] - batch_example_ids = batch.example_id - - example_ids += batch_example_ids - if not output_predictions_only: - batch_answer = numericalizer.reverse(batch.answer.value.data, 'answer') - batch_answer = [ - task.postprocess_prediction(batch_example_ids[i], batch_answer[i]) for i in range(len(batch_answer)) - ] - answers += batch_answer - batch_context = numericalizer.reverse(batch.context.value.data, 'context') - contexts += batch_context - elif output_confidence_features: - # need gold answer for confidence estimation - batch_answer = numericalizer.reverse(batch.answer.value.data, 'answer') - answers += batch_answer - - if total_loss is not None: - loss = model(batch, train=True).loss.item() - total_loss += loss - - for hyperparameter_idx in 
range(len(args.temperature)): - generated = model.generate( - batch, - max_output_length=args.max_output_length, - min_output_length=args.min_output_length, - num_outputs=args.num_outputs[hyperparameter_idx], - temperature=args.temperature[hyperparameter_idx] if args.temperature[hyperparameter_idx] > 0 else 1.0, - repetition_penalty=args.repetition_penalty[hyperparameter_idx], - top_k=args.top_k[hyperparameter_idx], - top_p=args.top_p[hyperparameter_idx], - num_beams=args.num_beams[hyperparameter_idx], - num_beam_groups=args.num_beam_groups[hyperparameter_idx], - diversity_penalty=args.diversity_penalty[hyperparameter_idx], - no_repeat_ngram_size=args.no_repeat_ngram_size[hyperparameter_idx], - do_sample=args.temperature[hyperparameter_idx] != 0, # if temperature==0, we do not sample - ) - partial_batch_prediction_ids = generated.sequences - partial_batch_words = None - - if getattr(task, 'need_attention_scores', False): - cross_attentions = generated.cross_attentions - - # stack tensors to shape (max_output_length, num_layers, batch_size, num_heads, 1, max_input_length) - cross_attentions = torch.stack(([torch.stack(tuple) for tuple in cross_attentions])).cpu() - - # reshape to (num_layers, batch_size, num_heads, max_output_length, max_input_length) - cross_attentions = cross_attentions.squeeze(4) - cross_attentions = cross_attentions.permute(1, 2, 3, 0, 4).contiguous() - - # choose only last layer attentions - # cross_attentions = torch.mean(cross_attentions[-3:, ...], dim=0) - cross_attentions = cross_attentions[-1, ...] - - # postprocess prediction ids - kwargs = { - 'numericalizer': numericalizer, - 'cross_attentions': cross_attentions, - 'tgt_lang': tgt_lang, - 'date_parser': date_parser, - } - - if translate_return_raw_outputs: - partial_batch_raw_prediction_ids = partial_batch_prediction_ids - - partial_batch_prediction_ids, partial_batch_words = task.batch_postprocess_prediction_ids( - batch_example_ids, batch.context.value.data, partial_batch_prediction_ids, **kwargs - ) - - # MarianTokenizer uses two different spm models for encoding source and target languages. - # in almond_translate we postprocess text with alignment which produces code-switched sentences. - # encoding a code-switched sentence with either spm will omit tokens from the other language - # so we have to return both the processed and encoded text. 
- # we need to return encoded text too since confidence_features requires ids - if isinstance(numericalizer._tokenizer, MarianTokenizer) and partial_batch_words: - partial_batch_prediction = partial_batch_words - else: - if output_confidence_features or output_confidence_scores: - partial_batch_confidence_features = model.confidence_features( - batch=batch, predictions=partial_batch_prediction_ids, mc_dropout_num=args.mc_dropout_num - ) - partial_batch_prediction = numericalizer.reverse(partial_batch_prediction_ids, 'answer') - - def get_example_index(i): - return (i // args.num_outputs[hyperparameter_idx]) % batch_size - - if translate_return_raw_outputs: - partial_batch_raw_prediction = numericalizer.reverse(partial_batch_raw_prediction_ids, 'answer') - for i in range(len(partial_batch_prediction)): - partial_batch_raw_prediction[i] = task.postprocess_prediction( - batch_example_ids[get_example_index(i)], partial_batch_raw_prediction[i] - ) - for i in range(len(partial_batch_prediction)): - batch_raw_prediction[get_example_index(i)].append(partial_batch_raw_prediction[i]) - - # post-process predictions - for i in range(len(partial_batch_prediction)): - partial_batch_prediction[i] = task.postprocess_prediction( - batch_example_ids[get_example_index(i)], partial_batch_prediction[i] - ) - - # put them into the right array - for i in range(len(partial_batch_prediction)): - batch_prediction[get_example_index(i)].append(partial_batch_prediction[i]) - if output_confidence_features or output_confidence_scores: - batch_confidence_features[get_example_index(i)].append(partial_batch_confidence_features[i]) - - predictions += batch_prediction - confidence_features += batch_confidence_features - raw_predictions += batch_raw_prediction - - if total_loss is not None: - total_loss /= len(example_ids) - - if original_order is not None: - # sort back to the original order - original_order, example_ids, predictions, raw_predictions, answers, contexts, confidence_features = [ - list(a) - for a in tuple( - zip( - *sorted( - list( - zip( - original_order, - example_ids, - predictions, - raw_predictions, - answers, - contexts, - confidence_features, - ) - ) - ) - ) - ) - ] - - if getattr(args, 'translate_example_split', False): - # stitch sentences back together - example_ids, predictions, raw_predictions, answers, contexts, confidence_features = merge_translated_sentences( - example_ids, - predictions, - raw_predictions, - answers, - contexts, - confidence_features, - numericalizer._tokenizer.src_lang, - numericalizer._tokenizer.tgt_lang, - ) - - output = GenerationOutput(loss=total_loss) - - if output_predictions_only: - output.predictions = predictions - else: - output.example_ids, output.predictions, output.answers, output.contexts = example_ids, predictions, answers, contexts - if output_confidence_features: - output.confidence_features = confidence_features - if args.override_confidence_labels: - for i, example in enumerate(confidence_features): - for confidence in example: - confidence.label = answers[i] == args.override_confidence_labels - if output_confidence_scores: - output.confidence_scores = [] - for estimator in confidence_estimators: - confidence_scores = estimator.estimate(confidence_features) - output.confidence_scores.append(confidence_scores) - if translate_return_raw_outputs: - output.raw_predictions = raw_predictions - - return output - - -def generate_with_classification_model( - model, data_iterator, numericalizer, task, original_order=None, disable_progbar=True -) -> GenerationOutput: - 
total_loss = 0.0 - all_example_ids = [] - all_answers = [] - all_contexts = [] - all_predictions = [] - - for batch in progress_bar(data_iterator, desc='Generating', disable=disable_progbar): - batch_example_ids = batch.example_id - - batch_context = numericalizer.reverse(batch.context.value.data, 'context') - - all_example_ids += batch_example_ids - - # pass labels to get loss - output = model( - input_ids=batch.context.value, - attention_mask=(batch.context.value != numericalizer.pad_id), - labels=batch.answer.value, - ) - - labels = batch.answer.value.tolist() - - logits = output.logits - predictions = torch.argmax(logits, dim=-1).tolist() - - # logits for sequence classification is 2 dimensional - if logits.dim() == 2: - predictions = [[p] for p in predictions] - - # Remove ignored index (special tokens) - processed_preds = [] - processed_labels = [] - for pred, label in zip(predictions, labels): - preds_list = [] - labels_list = [] - for p_, l_ in zip(pred, label): - if l_ == numericalizer.answer_pad_id: - continue - preds_list.append(task.id2label[p_]) - labels_list.append(task.id2label[l_]) - - processed_preds.append([" ".join(preds_list)]) - processed_labels.append(" ".join(labels_list)) - - all_contexts += batch_context - all_answers += processed_labels - all_predictions += processed_preds - - total_loss += output.loss - - total_loss /= len(all_example_ids) - - if original_order is not None: - # sort back to the original order - original_order, all_example_ids, all_predictions, all_answers, all_contexts = [ - list(a) - for a in tuple( - zip(*sorted(list(zip(original_order, all_example_ids, all_predictions, all_answers, all_contexts)))) - ) - ] - - output = GenerationOutput( - loss=total_loss, example_ids=all_example_ids, contexts=all_contexts, answers=all_answers, predictions=all_predictions - ) - - return output - - def print_results(results, num_print): print() @@ -670,14 +104,14 @@ def print_results(results, num_print): sys.stdout.flush() -def validate(task, val_iter, model, numericalizer, args, num_print=10): +def validate(task, val_iter, model, args, num_print=10): with torch.no_grad(): model.eval() if isinstance(model, torch.nn.DataParallel): # get rid of the DataParallel wrapper model = model.module - generation_output = generate_with_model(model, val_iter, numericalizer, task, args) + generation_output = generate_with_model(model, val_iter, task, args) # loss is already calculated metrics_to_return = [metric for metric in task.metrics if metric != 'loss'] From 0a8bfa1dda2cd61dd7f6439764a12e32a4a50352 Mon Sep 17 00:00:00 2001 From: mehrad Date: Mon, 28 Feb 2022 10:24:59 -0800 Subject: [PATCH 07/19] Remove duplicate code in transformer_sequence_classification --- genienlp/models/transformer_lstm.py | 25 +++++---- genienlp/models/transformer_seq2seq.py | 9 ++-- .../transformer_sequence_classification.py | 50 +++--------------- .../transformer_token_classification.py | 51 ++++++++++--------- 4 files changed, 51 insertions(+), 84 deletions(-) diff --git a/genienlp/models/transformer_lstm.py b/genienlp/models/transformer_lstm.py index 62008a8a..8792b087 100644 --- a/genienlp/models/transformer_lstm.py +++ b/genienlp/models/transformer_lstm.py @@ -58,6 +58,7 @@ def __init__(self, config=None, *inputs, args, vocab_sets, tasks, save_directory encoder_embeddings = args.pretrained_model config = AutoConfig.from_pretrained(encoder_embeddings, cache_dir=args.embeddings) + self.config = config args.dimension = config.hidden_size # tasks is not passed during initialization only in server 
mode @@ -66,7 +67,7 @@ def __init__(self, config=None, *inputs, args, vocab_sets, tasks, save_directory self.set_generation_output_options(tasks) self.src_lang, self.tgt_lang = adjust_language_code( - config, args.pretrained_model, kwargs.get('src_lang', 'en'), kwargs.get('tgt_lang', 'en') + self.config, args.pretrained_model, kwargs.get('src_lang', 'en'), kwargs.get('tgt_lang', 'en') ) self.numericalizer = TransformerNumericalizer( @@ -74,7 +75,7 @@ def __init__(self, config=None, *inputs, args, vocab_sets, tasks, save_directory args, max_generative_vocab=args.max_generative_vocab, save_dir=save_directory, - config=config, + config=self.config, src_lang=self.src_lang, tgt_lang=self.tgt_lang, vocab_sets=vocab_sets, @@ -84,35 +85,37 @@ def __init__(self, config=None, *inputs, args, vocab_sets, tasks, save_directory logger.info('Initializing encoder and decoder embeddings') if args.do_ned: - if type(config) == BertConfig: + if type(self.config) == BertConfig: if save_directory is not None: - self.encoder_embeddings = BertModelForNER(config, args.num_db_types, args.db_unk_id) + self.encoder_embeddings = BertModelForNER(self.config, args.num_db_types, args.db_unk_id) else: - self.encoder_embeddings = BertModelForNER(config, args.num_db_types, args.db_unk_id).from_pretrained( + self.encoder_embeddings = BertModelForNER(self.config, args.num_db_types, args.db_unk_id).from_pretrained( encoder_embeddings, num_db_types=args.num_db_types, db_unk_id=args.db_unk_id, cache_dir=args.embeddings ) - elif type(config) == XLMRobertaConfig: + elif type(self.config) == XLMRobertaConfig: if save_directory is not None: - self.encoder_embeddings = XLMRobertaModelForNER(config, args.num_db_types, args.db_unk_id) + self.encoder_embeddings = XLMRobertaModelForNER(self.config, args.num_db_types, args.db_unk_id) else: - self.encoder_embeddings = XLMRobertaModelForNER(config, args.num_db_types, args.db_unk_id).from_pretrained( + self.encoder_embeddings = XLMRobertaModelForNER( + self.config, args.num_db_types, args.db_unk_id + ).from_pretrained( encoder_embeddings, num_db_types=args.num_db_types, db_unk_id=args.db_unk_id, cache_dir=args.embeddings ) else: raise ValueError('Model is not supported for using entity embeddings for NER') else: if save_directory is not None: - self.encoder_embeddings = AutoModel.from_config(config) + self.encoder_embeddings = AutoModel.from_config(self.config) else: self.encoder_embeddings = AutoModel.from_pretrained( - encoder_embeddings, config=config, cache_dir=args.embeddings + encoder_embeddings, config=self.config, cache_dir=args.embeddings ) self.encoder_embeddings.resize_token_embeddings(self.numericalizer.num_tokens) logger.info(f'Vocabulary has {self.numericalizer.num_tokens} tokens') - self.encoder = IdentityEncoder(self.numericalizer, args, config, self.encoder_embeddings) + self.encoder = IdentityEncoder(self.numericalizer, args, self.config, self.encoder_embeddings) self.decoder = MQANDecoder(self.numericalizer, args) def add_new_vocab_from_data(self, tasks, resize_decoder=False): diff --git a/genienlp/models/transformer_seq2seq.py b/genienlp/models/transformer_seq2seq.py index 5d1b79f6..29a2af57 100644 --- a/genienlp/models/transformer_seq2seq.py +++ b/genienlp/models/transformer_seq2seq.py @@ -61,10 +61,9 @@ def __init__(self, config=None, *inputs, args, tasks, vocab_sets, save_directory If `save_directory` is None, will initialize a new model and numericalizer, otherwise, will load them from `save_directory` """ config = AutoConfig.from_pretrained(args.pretrained_model, 
cache_dir=args.embeddings) - self.config = config super().__init__(config) self.args = args - args.dimension = config.d_model + args.dimension = self.config.d_model self._is_bart_large = self.args.pretrained_model == 'facebook/bart-large' # tasks is not passed during initialization only in server mode @@ -75,11 +74,11 @@ def __init__(self, config=None, *inputs, args, tasks, vocab_sets, save_directory # only used for Marian models. adjusted language codes passed to numericalizer will be None for models trained on single langauge pairs self.orig_src_lang, self.orig_tgt_lang = kwargs.get('src_lang', 'en'), kwargs.get('tgt_lang', 'en') self.src_lang, self.tgt_lang = adjust_language_code( - config, args.pretrained_model, kwargs.get('src_lang', 'en'), kwargs.get('tgt_lang', 'en') + self.config, args.pretrained_model, kwargs.get('src_lang', 'en'), kwargs.get('tgt_lang', 'en') ) if save_directory is not None: - self.model = AutoModelForSeq2SeqLM.from_config(config) + self.model = AutoModelForSeq2SeqLM.from_config(self.config) else: self.model = AutoModelForSeq2SeqLM.from_pretrained(self.args.pretrained_model, cache_dir=self.args.embeddings) @@ -88,7 +87,7 @@ def __init__(self, config=None, *inputs, args, tasks, vocab_sets, save_directory args, max_generative_vocab=None, save_dir=save_directory, - config=config, + config=self.config, src_lang=self.src_lang, tgt_lang=self.tgt_lang, vocab_sets=vocab_sets, diff --git a/genienlp/models/transformer_sequence_classification.py b/genienlp/models/transformer_sequence_classification.py index b34cb89b..1ddca860 100644 --- a/genienlp/models/transformer_sequence_classification.py +++ b/genienlp/models/transformer_sequence_classification.py @@ -29,44 +29,24 @@ import logging -from transformers import AutoConfig, AutoModelForSequenceClassification +from transformers import AutoModelForSequenceClassification from ..data_utils.numericalizer import TransformerNumericalizer -from ..models.base import GenieModel -from ..util import adjust_language_code +from .transformer_token_classification import TransformerForTokenClassification logger = logging.getLogger(__name__) -class TransformerForSequenceClassification(GenieModel): +class TransformerForSequenceClassification(TransformerForTokenClassification): def __init__(self, config=None, *inputs, args, tasks, vocab_sets, save_directory=None, **kwargs): - num_labels = 0 - if args.num_labels is not None: - num_labels = args.num_labels - else: - for task in tasks: - # if having multiple tasks choose max num_labels - if hasattr(task, 'num_labels'): - num_labels = max(num_labels, task.num_labels) - - config = AutoConfig.from_pretrained(args.pretrained_model, cache_dir=args.embeddings, num_labels=num_labels) - GenieModel.__init__(self, config) - self.args = args - if hasattr(config, 'd_model'): - args.dimension = config.d_model - else: - args.dimension = config.hidden_size - - self.src_lang, self.tgt_lang = adjust_language_code( - config, args.pretrained_model, kwargs.get('src_lang', 'en'), kwargs.get('tgt_lang', 'en') - ) + self._init_common(args, tasks, **kwargs) if save_directory is not None: - self.model = AutoModelForSequenceClassification.from_config(config) + self.model = AutoModelForSequenceClassification.from_config(self.config) else: self.model = AutoModelForSequenceClassification.from_pretrained( - self.args.pretrained_model, cache_dir=self.args.embeddings, config=config + self.args.pretrained_model, cache_dir=self.args.embeddings, config=self.config ) self.numericalizer = TransformerNumericalizer( @@ -74,7 +54,7 @@ 
def __init__(self, config=None, *inputs, args, tasks, vocab_sets, save_directory args, max_generative_vocab=None, save_dir=save_directory, - config=config, + config=self.config, src_lang=self.src_lang, tgt_lang=self.tgt_lang, vocab_sets=vocab_sets, @@ -84,19 +64,3 @@ def __init__(self, config=None, *inputs, args, tasks, vocab_sets, save_directory self.model.resize_token_embeddings(self.numericalizer.num_tokens) self.numericalizer.answer_pad_id = -100 - - def add_new_vocab_from_data(self, tasks, resize_decoder=False): - super().add_new_vocab_from_data(tasks, resize_decoder) - self.model.resize_token_embeddings(self.numericalizer.num_tokens) - - def forward(self, *input, **kwargs): - if self.training: - batch = input[0] - outputs = self.model( - batch.context.value, - labels=batch.answer.value, - attention_mask=(batch.context.value != self.numericalizer.pad_id), - ) - return outputs - else: - return self.model(**kwargs) diff --git a/genienlp/models/transformer_token_classification.py b/genienlp/models/transformer_token_classification.py index f40bbd2b..d23b8517 100644 --- a/genienlp/models/transformer_token_classification.py +++ b/genienlp/models/transformer_token_classification.py @@ -42,7 +42,31 @@ class TransformerForTokenClassification(GenieModel): def __init__(self, config=None, *inputs, args, tasks, vocab_sets, save_directory=None, **kwargs): + self._init_common(args, tasks, **kwargs) + if save_directory is not None: + self.model = AutoModelForTokenClassification.from_config(self.config) + else: + self.model = AutoModelForTokenClassification.from_pretrained( + self.args.pretrained_model, cache_dir=self.args.embeddings, config=self.config + ) + self.numericalizer = TransformerNumericalizer( + self.args.pretrained_model, + args, + max_generative_vocab=None, + save_dir=save_directory, + config=self.config, + src_lang=self.src_lang, + tgt_lang=self.tgt_lang, + vocab_sets=vocab_sets, + tasks=tasks, + ) + + self.model.resize_token_embeddings(self.numericalizer.num_tokens) + self.numericalizer.answer_pad_id = -100 + + def _init_common(self, args, tasks, **kwargs): + self.args = args num_labels = 0 if args.num_labels is not None: num_labels = args.num_labels @@ -55,8 +79,8 @@ def __init__(self, config=None, *inputs, args, tasks, vocab_sets, save_directory config = AutoConfig.from_pretrained( args.pretrained_model, cache_dir=args.embeddings, num_labels=num_labels, finetuning_task='ned' ) - GenieModel.__init__(self, config) - self.args = args + super().__init__(config) + if hasattr(config, 'd_model'): args.dimension = config.d_model else: @@ -66,29 +90,6 @@ def __init__(self, config=None, *inputs, args, tasks, vocab_sets, save_directory config, args.pretrained_model, kwargs.get('src_lang', 'en'), kwargs.get('tgt_lang', 'en') ) - if save_directory is not None: - self.model = AutoModelForTokenClassification.from_config(config) - else: - self.model = AutoModelForTokenClassification.from_pretrained( - self.args.pretrained_model, cache_dir=self.args.embeddings, config=config - ) - - self.numericalizer = TransformerNumericalizer( - self.args.pretrained_model, - args, - max_generative_vocab=None, - save_dir=save_directory, - config=config, - src_lang=self.src_lang, - tgt_lang=self.tgt_lang, - vocab_sets=vocab_sets, - tasks=tasks, - ) - - self.model.resize_token_embeddings(self.numericalizer.num_tokens) - - self.numericalizer.answer_pad_id = -100 - def add_new_vocab_from_data(self, tasks, resize_decoder=False): super().add_new_vocab_from_data(tasks, resize_decoder) 
self.model.resize_token_embeddings(self.numericalizer.num_tokens) From 96f6db7e3ff8e3cb5d6ec93ec7c1603df606fef5 Mon Sep 17 00:00:00 2001 From: mehrad Date: Mon, 28 Feb 2022 11:45:23 -0800 Subject: [PATCH 08/19] Remove data caching it's almost never been used --- README.md | 4 +- genienlp/arguments.py | 5 -- genienlp/predict.py | 4 - genienlp/run_bootleg.py | 9 -- genienlp/tasks/almond_dataset.py | 109 +++++++++++-------------- genienlp/tasks/generic_dataset.py | 31 +++---- genienlp/tasks/hf_dataset.py | 13 --- genienlp/train.py | 4 - tests/test_NED.sh | 4 +- tests/test_calibration.sh | 4 +- tests/test_cuda.sh | 2 +- tests/test_e2e_dialogues.sh | 4 +- tests/test_kfserver.sh | 2 +- tests/test_main_almond.sh | 4 +- tests/test_main_almond_multilingual.sh | 6 +- tests/test_paraphrasing.sh | 4 +- tests/test_sequence_classification.sh | 4 +- tests/test_token_classification.sh | 8 +- tests/test_translation.sh | 4 +- 19 files changed, 81 insertions(+), 144 deletions(-) diff --git a/README.md b/README.md index 3faf4d46..cb325aa2 100644 --- a/README.md +++ b/README.md @@ -122,7 +122,7 @@ genienlp run-paraphrase --model_name_or_path --temperature 0.3 --rep Use the following command for training/ finetuning an NMT model: ```bash -genienlp train --train_tasks almond_translate --data --train_languages --eval_languages --no_commit --train_iterations --preserve_case --save --exist_ok --skip_cache --model TransformerSeq2Seq --pretrained_model +genienlp train --train_tasks almond_translate --data --train_languages --eval_languages --no_commit --train_iterations --preserve_case --save --exist_ok --model TransformerSeq2Seq --pretrained_model ``` We currently support MarianMT, MBART, MT5, and M2M100 models.
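For a concrete sense of the updated command, a MarianMT fine-tuning run could look like the sketch below. The flags mirror the snippet above; the data path, language codes, iteration count, save directory, and pretrained model name are illustrative placeholders, not values taken from this patch.

```bash
# Illustrative only: fine-tune an English->German MarianMT model on an
# almond_translate dataset. Paths and hyperparameter values are placeholders.
genienlp train \
  --train_tasks almond_translate \
  --data data/translate/en-de/ \
  --train_languages en \
  --eval_languages de \
  --no_commit \
  --preserve_case \
  --train_iterations 50000 \
  --save models/marian-en-de \
  --exist_ok \
  --model TransformerSeq2Seq \
  --pretrained_model Helsinki-NLP/opus-mt-en-de
```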
@@ -131,7 +131,7 @@ To save a pretrained model in genienlp format without any finetuning, set train_ To produce translations for an eval/ test set run the following command: ```bash -genienlp predict --tasks almond_translate --data --pred_languages --pred_tgt_languages --path --eval_dir --skip_cache --val_batch_size 4000 --evaluate --overwrite --silent +genienlp predict --tasks almond_translate --data --pred_languages --pred_tgt_languages --path --eval_dir --val_batch_size 4000 --evaluate --overwrite --silent ``` If your dataset is a document or contains long examples, pass `--translate_example_split` to break the examples down into individual sentences before translation for better results.
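Likewise, a prediction run on the validation split might look like the following sketch. The flags come from the snippet above (with `--evaluate valid` as used by the test scripts later in this patch); the data path, language codes, model path, and eval directory are placeholders.

```bash
# Illustrative only: translate the validation split with a previously
# fine-tuned model. Paths and language codes are placeholders.
genienlp predict \
  --tasks almond_translate \
  --data data/translate/en-de/ \
  --pred_languages en \
  --pred_tgt_languages de \
  --path models/marian-en-de \
  --eval_dir models/marian-en-de/eval \
  --val_batch_size 4000 \
  --evaluate valid \
  --overwrite \
  --silent
```

Add `--translate_example_split` to this invocation when the inputs are documents or otherwise contain long examples.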
diff --git a/genienlp/arguments.py b/genienlp/arguments.py index 9ba7c9c1..ca9f97a2 100644 --- a/genienlp/arguments.py +++ b/genienlp/arguments.py @@ -67,7 +67,6 @@ def parse_argv(parser): parser.add_argument('--data', default='.data/', type=str, help='where to load data from.') parser.add_argument('--save', required=True, type=str, help='where to save results.') parser.add_argument('--embeddings', default='.embeddings/', type=str, help='where to save embeddings.') - parser.add_argument('--cache', default='.cache/', type=str, help='where to save cached files') parser.add_argument( '--train_languages', @@ -409,10 +408,6 @@ def parse_argv(parser): help='Ignore all conditions and use fast version of huggingface tokenizer', ) - parser.add_argument('--skip_cache', action='store_true', help='whether to use existing cached splits or generate new ones') - parser.add_argument( - '--cache_input_data', action='store_true', help='Cache examples from input data for faster subsequent trainings' - ) parser.add_argument('--use_curriculum', action='store_true', help='Use curriculum learning') parser.add_argument( '--aux_dataset', default='', type=str, help='path to auxiliary dataset (ignored if curriculum is not used)' diff --git a/genienlp/predict.py b/genienlp/predict.py index 4a48023d..faeaf919 100644 --- a/genienlp/predict.py +++ b/genienlp/predict.py @@ -100,9 +100,7 @@ def parse_argv(parser): parser.add_argument('--overwrite', action='store_true', help='whether to overwrite previously written predictions') parser.add_argument('--silent', action='store_true', help='whether to print predictions to stdout') - parser.add_argument('--skip_cache', action='store_true', help='whether use exisiting cached splits or generate new ones') parser.add_argument('--eval_dir', type=str, required=True, help='use this directory to store eval results') - parser.add_argument('--cache', default='.cache', type=str, help='where to save cached files') parser.add_argument('--subsample', default=20000000, type=int, help='subsample the eval/test datasets') parser.add_argument( @@ -331,9 +329,7 @@ def prepare_data(args): kwargs.update( { - 'skip_cache': args.skip_cache, 'subsample': args.subsample, - 'cached_path': os.path.join(args.cache, task.name), 'all_dirs': task_languages, 'num_workers': args.num_workers, 'src_lang': src_lang, diff --git a/genienlp/run_bootleg.py b/genienlp/run_bootleg.py index da179da0..556a3e5e 100644 --- a/genienlp/run_bootleg.py +++ b/genienlp/run_bootleg.py @@ -48,7 +48,6 @@ def parse_argv(parser): parser.add_argument('--save', required=True, type=str, help='where to save results.') parser.add_argument('--embeddings', default='.embeddings/', type=str, help='where to save embeddings.') parser.add_argument('--data', default='.data/', type=str, help='where to load data from.') - parser.add_argument('--cache', default='.cache/', type=str, help='where to save cached files') parser.add_argument( '--train_languages', @@ -177,11 +176,6 @@ def parse_argv(parser): '--exist_ok', action='store_true', help='Ok if the save directory already exists, i.e. 
overwrite is ok' ) - parser.add_argument('--skip_cache', action='store_true', help='whether to use existing cached splits or generate new ones') - parser.add_argument( - '--cache_input_data', action='store_true', help='Cache examples from input data for faster subsequent trainings' - ) - # token classification task args parser.add_argument('--num_labels', type=int, help='num_labels for classification tasks') parser.add_argument('--crossner_domains', nargs='+', type=str, help='domains to use for CrossNER task') @@ -197,8 +191,6 @@ def bootleg_dump_entities(args, logger): bootleg_shared_kwargs = { 'subsample': args.subsample, - 'skip_cache': args.skip_cache, - 'cache_input_data': args.cache_input_data, 'num_workers': args.num_workers, 'all_dirs': args.train_src_languages, 'crossner_domains': args.crossner_domains, @@ -212,7 +204,6 @@ def bootleg_dump_entities(args, logger): kwargs = {'train': None, 'validation': None, 'test': None} kwargs.update(bootleg_shared_kwargs) - kwargs['cached_path'] = os.path.join(args.cache, task.name) for split in args.bootleg_data_splits: if split == 'train': del kwargs['train'] # deleting keys means use the default file name diff --git a/genienlp/tasks/almond_dataset.py b/genienlp/tasks/almond_dataset.py index ea5ff16e..f0cb00e1 100644 --- a/genienlp/tasks/almond_dataset.py +++ b/genienlp/tasks/almond_dataset.py @@ -32,8 +32,6 @@ import multiprocessing as mp import os -import torch - from ..data_utils.almond_utils import chunk_file, create_examples_from_file from .base_dataset import Split from .generic_dataset import CQA @@ -48,73 +46,58 @@ class AlmondDataset(CQA): def __init__(self, path, *, make_example, **kwargs): - # TODO fix cache_path for multilingual task subsample = kwargs.get('subsample') - cached_path = kwargs.get('cached_path') - - skip_cache = kwargs.get('skip_cache', True) - cache_input_data = kwargs.get('cache_input_data', False) num_workers = kwargs.get('num_workers', 0) - cache_name = os.path.join(cached_path, os.path.basename(path), str(subsample)) dir_name = os.path.basename(os.path.dirname(path)) - if os.path.exists(cache_name) and not skip_cache: - logger.info(f'Loading cached data from {cache_name}') - examples = torch.load(cache_name) + n = 0 + with open(path, 'r', encoding='utf-8') as fp: + for line in fp: + n += 1 + + max_examples = min(n, subsample) if subsample is not None else n + if num_workers > 0: + num_processes = min(num_workers, int(mp.cpu_count())) + logger.info(f'Using {num_processes} workers...') + chunk_size = int(math.ceil(max_examples / num_processes)) + num_chunks = int(math.ceil(max_examples / chunk_size)) + + base_path, extension = path.rsplit('.', 1) + + chunk_file_paths = [f'{base_path}_{chunk_id}.tsv' for chunk_id in range(num_chunks)] + chunk_file(path, chunk_file_paths, chunk_size, num_chunks) + num_processes = min(num_processes, num_chunks) + + with mp.Pool(processes=num_processes) as pool: + process_args = [ + { + 'in_file': chunk_file_paths[i], + 'chunk_size': chunk_size, + 'dir_name': dir_name, + 'example_batch_size': 1, + 'make_process_example': make_example, + 'kwargs': kwargs, + } + for i in range(num_chunks) + ] + results = pool.map(create_examples_from_file, process_args) + + # merge all results + examples = [item for sublist in results for item in sublist] + + for file in chunk_file_paths: + os.remove(file) else: - n = 0 - with open(path, 'r', encoding='utf-8') as fp: - for line in fp: - n += 1 - - max_examples = min(n, subsample) if subsample is not None else n - if num_workers > 0: - num_processes = 
min(num_workers, int(mp.cpu_count())) - logger.info(f'Using {num_processes} workers...') - chunk_size = int(math.ceil(max_examples / num_processes)) - num_chunks = int(math.ceil(max_examples / chunk_size)) - - base_path, extension = path.rsplit('.', 1) - - chunk_file_paths = [f'{base_path}_{chunk_id}.tsv' for chunk_id in range(num_chunks)] - chunk_file(path, chunk_file_paths, chunk_size, num_chunks) - num_processes = min(num_processes, num_chunks) - - with mp.Pool(processes=num_processes) as pool: - process_args = [ - { - 'in_file': chunk_file_paths[i], - 'chunk_size': chunk_size, - 'dir_name': dir_name, - 'example_batch_size': 1, - 'make_process_example': make_example, - 'kwargs': kwargs, - } - for i in range(num_chunks) - ] - results = pool.map(create_examples_from_file, process_args) - - # merge all results - examples = [item for sublist in results for item in sublist] - - for file in chunk_file_paths: - os.remove(file) - else: - process_args = { - 'in_file': path, - 'chunk_size': max_examples, - 'dir_name': dir_name, - 'example_batch_size': 1, - 'make_process_example': make_example, - 'kwargs': kwargs, - } - examples = create_examples_from_file(process_args) - - if cache_input_data: - os.makedirs(os.path.dirname(cache_name), exist_ok=True) - logger.info(f'Caching data to {cache_name}') - torch.save(examples, cache_name) + process_args = { + 'in_file': path, + 'chunk_size': max_examples, + 'dir_name': dir_name, + 'example_batch_size': 1, + 'make_process_example': make_example, + 'kwargs': kwargs, + } + examples = create_examples_from_file(process_args) super().__init__(examples, **kwargs) diff --git a/genienlp/tasks/generic_dataset.py b/genienlp/tasks/generic_dataset.py index 161360e5..33a0dfaa 100644 --- a/genienlp/tasks/generic_dataset.py +++ b/genienlp/tasks/generic_dataset.py @@ -33,7 +33,6 @@ import os from typing import Iterable -import torch import ujson from datasets import load_dataset @@ -97,27 +96,17 @@ def __init__(self, examples, sort_key_fn=input_then_output_len, batch_size_fn=al class JSON(CQA): name = 'json' - def __init__(self, path, subsample=None, lower=False, cached_path=None, skip_cache=False, **kwargs): - cache_name = os.path.join(cached_path, os.path.basename(path), str(subsample)) + def __init__(self, path, subsample=None, lower=False, **kwargs): examples = [] - if os.path.exists(cache_name) and not skip_cache: - logger.info(f'Loading cached data from {cache_name}') - examples = torch.load(cache_name) - else: - with open(os.path.expanduser(path)) as f: - lines = f.readlines() - for line in lines: - ex = json.loads(line) - context, question, answer = ex['context'], ex['question'], ex['answer'] - examples.append( - Example.from_raw(make_example_id(self, len(examples)), context, question, answer, lower=lower) - ) - if subsample is not None and len(examples) >= subsample: - break - os.makedirs(os.path.dirname(cache_name), exist_ok=True) - logger.info(f'Caching data to {cache_name}') - torch.save(examples, cache_name) + with open(os.path.expanduser(path)) as f: + lines = f.readlines() + for line in lines: + ex = json.loads(line) + context, question, answer = ex['context'], ex['question'], ex['answer'] + examples.append(Example.from_raw(make_example_id(self, len(examples)), context, question, answer, lower=lower)) + if subsample is not None and len(examples) >= subsample: + break super(JSON, self).__init__(examples, **kwargs) @@ -235,7 +224,7 @@ class OODDataset(CQA): name = 'ood' is_sequence_classification = True - def __init__(self, path, lower=False, 
cached_path=None, skip_cache=False, **kwargs): + def __init__(self, path, lower=False, **kwargs): examples = [] question = 'Is this sentence in-domain or out-domain?' diff --git a/genienlp/tasks/hf_dataset.py b/genienlp/tasks/hf_dataset.py index ca262687..578fce22 100644 --- a/genienlp/tasks/hf_dataset.py +++ b/genienlp/tasks/hf_dataset.py @@ -28,10 +28,8 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import logging -import os import datasets -import torch from datasets import load_dataset from ..tasks.generic_dataset import CQA @@ -46,24 +44,13 @@ class HFDataset(CQA): def __init__(self, data, make_example, **kwargs): subsample = kwargs.get('subsample') - skip_cache = kwargs.pop('kwargs', True) - - cache_name = os.path.join(os.path.dirname(data.cache_files[0]['filename']), data.split._name, str(subsample)) examples = [] - if os.path.exists(cache_name) and not skip_cache: - logger.info(f'Loading cached data from {cache_name}') - examples = torch.load(cache_name) for ex in data: examples.append(make_example(ex, **kwargs)) - if subsample is not None and len(examples) >= subsample: break - os.makedirs(os.path.dirname(cache_name), exist_ok=True) - logger.info(f'Caching data to {cache_name}') - torch.save(examples, cache_name) - super().__init__(examples, **kwargs) @classmethod diff --git a/genienlp/train.py b/genienlp/train.py index 830b5fb3..957a5ff0 100644 --- a/genienlp/train.py +++ b/genienlp/train.py @@ -87,8 +87,6 @@ def prepare_data(args, logger): train_eval_shared_kwargs = { 'subsample': args.subsample, - 'skip_cache': args.skip_cache, - 'cache_input_data': args.cache_input_data, 'num_workers': args.num_workers, } @@ -99,7 +97,6 @@ def prepare_data(args, logger): kwargs['train'] = args.train_set_name kwargs.update(train_eval_shared_kwargs) kwargs['all_dirs'] = args.train_src_languages - kwargs['cached_path'] = os.path.join(args.cache, task.name) kwargs['crossner_domains'] = args.crossner_domains if args.use_curriculum: kwargs['curriculum'] = True @@ -144,7 +141,6 @@ def prepare_data(args, logger): kwargs['validation'] = args.eval_set_name kwargs.update(train_eval_shared_kwargs) kwargs['all_dirs'] = args.eval_src_languages - kwargs['cached_path'] = os.path.join(args.cache, task.name) kwargs['crossner_domains'] = args.crossner_domains kwargs['hf_test_overfit'] = args.hf_test_overfit diff --git a/tests/test_NED.sh b/tests/test_NED.sh index a056d718..38723a37 100755 --- a/tests/test_NED.sh +++ b/tests/test_NED.sh @@ -15,10 +15,10 @@ for hparams in \ do # train - genienlp train --train_tasks almond_dialogue_nlu --train_batch_tokens 100 --val_batch_size 100 --train_iterations 6 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --database_dir $SRCDIR/database/ --data $SRCDIR/dataset/thingpedia_99/ --bootleg_output_dir $SRCDIR/dataset/thingpedia_99/bootleg/ --exist_ok --skip_cache --embeddings $EMBEDDING_DIR --no_commit --do_ned --min_entity_len 2 --max_entity_len 4 $hparams + genienlp train --train_tasks almond_dialogue_nlu --train_batch_tokens 100 --val_batch_size 100 --train_iterations 6 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --database_dir $SRCDIR/database/ --data $SRCDIR/dataset/thingpedia_99/ --bootleg_output_dir $SRCDIR/dataset/thingpedia_99/bootleg/ --exist_ok --embeddings $EMBEDDING_DIR --no_commit --do_ned --min_entity_len 2 --max_entity_len 4 $hparams # greedy prediction - genienlp predict --tasks almond_dialogue_nlu --evaluate valid --path $workdir/model_$i --overwrite 
--eval_dir $workdir/model_$i/eval_results/ --database_dir $SRCDIR/database/ --data $SRCDIR/dataset/thingpedia_99/ --embeddings $EMBEDDING_DIR --skip_cache + genienlp predict --tasks almond_dialogue_nlu --evaluate valid --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --database_dir $SRCDIR/database/ --data $SRCDIR/dataset/thingpedia_99/ --embeddings $EMBEDDING_DIR # check if result file exists if test ! -f $workdir/model_$i/eval_results/valid/almond_dialogue_nlu.tsv ; then diff --git a/tests/test_calibration.sh b/tests/test_calibration.sh index b2a97c01..d1063eb6 100644 --- a/tests/test_calibration.sh +++ b/tests/test_calibration.sh @@ -9,10 +9,10 @@ for hparams in \ do # train - genienlp train --train_tasks almond --train_batch_tokens 100 --val_batch_size 100 --train_iterations 6 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --skip_cache --embeddings $EMBEDDING_DIR --no_commit + genienlp train --train_tasks almond --train_batch_tokens 100 --val_batch_size 100 --train_iterations 6 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --embeddings $EMBEDDING_DIR --no_commit # greedy prediction - genienlp predict --tasks almond --evaluate test --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $SRCDIR/dataset/ --embeddings $EMBEDDING_DIR --skip_cache --save_confidence_features --confidence_feature_path $workdir/model_$i/confidences.pkl --mc_dropout_num 10 + genienlp predict --tasks almond --evaluate test --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $SRCDIR/dataset/ --embeddings $EMBEDDING_DIR --save_confidence_features --confidence_feature_path $workdir/model_$i/confidences.pkl --mc_dropout_num 10 # check if confidence file exists if test ! 
-f $workdir/model_$i/confidences.pkl ; then diff --git a/tests/test_cuda.sh b/tests/test_cuda.sh index 5f2766a1..009b6e58 100755 --- a/tests/test_cuda.sh +++ b/tests/test_cuda.sh @@ -9,7 +9,7 @@ for hparams in \ do # train - genienlp train --train_tasks almond --train_batch_tokens 100 --val_batch_size 100 --train_iterations 2 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --skip_cache --embeddings $EMBEDDING_DIR --no_commit + genienlp train --train_tasks almond --train_batch_tokens 100 --val_batch_size 100 --train_iterations 2 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --embeddings $EMBEDDING_DIR --no_commit # generate a long sequence long_sequence='' diff --git a/tests/test_e2e_dialogues.sh b/tests/test_e2e_dialogues.sh index 13ad3e90..7c88c017 100755 --- a/tests/test_e2e_dialogues.sh +++ b/tests/test_e2e_dialogues.sh @@ -16,10 +16,10 @@ tasks=( for i in ${!hparams[*]}; do # train - genienlp train --train_tasks ${tasks[i]} --train_batch_tokens 100 --val_batch_size 300 --train_iterations 4 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/bitod --exist_ok --skip_cache --embeddings $EMBEDDING_DIR --no_commit ${hparams[i]} + genienlp train --train_tasks ${tasks[i]} --train_batch_tokens 100 --val_batch_size 300 --train_iterations 4 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/bitod --exist_ok --embeddings $EMBEDDING_DIR --no_commit ${hparams[i]} # greedy prediction - genienlp predict --tasks ${tasks[i]} --evaluate test --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $SRCDIR/dataset/bitod --embeddings $EMBEDDING_DIR --skip_cache --extra_metrics e2e_dialogue_score + genienlp predict --tasks ${tasks[i]} --evaluate test --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $SRCDIR/dataset/bitod --embeddings $EMBEDDING_DIR --extra_metrics e2e_dialogue_score # check if result file exists if test ! 
-f $workdir/model_$i/eval_results/test/${tasks[i]}.tsv ; then diff --git a/tests/test_kfserver.sh b/tests/test_kfserver.sh index 21714825..eae368b5 100644 --- a/tests/test_kfserver.sh +++ b/tests/test_kfserver.sh @@ -9,7 +9,7 @@ for hparams in \ do # train - genienlp train --train_tasks almond --train_batch_tokens 100 --val_batch_size 100 --train_iterations 6 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --skip_cache --embeddings $EMBEDDING_DIR --no_commit + genienlp train --train_tasks almond --train_batch_tokens 100 --val_batch_size 100 --train_iterations 6 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --embeddings $EMBEDDING_DIR --no_commit # run kfserver in background (genienlp kfserver --path $workdir/model_$i)& diff --git a/tests/test_main_almond.sh b/tests/test_main_almond.sh index 6b4cf31d..df552586 100755 --- a/tests/test_main_almond.sh +++ b/tests/test_main_almond.sh @@ -14,10 +14,10 @@ for hparams in \ do # train - genienlp train --train_tasks almond --train_batch_tokens 100 --val_batch_size 100 --train_iterations 4 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --skip_cache --embeddings $EMBEDDING_DIR --no_commit + genienlp train --train_tasks almond --train_batch_tokens 100 --val_batch_size 100 --train_iterations 4 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --embeddings $EMBEDDING_DIR --no_commit # greedy prediction - genienlp predict --tasks almond --evaluate test --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $SRCDIR/dataset/ --embeddings $EMBEDDING_DIR --skip_cache + genienlp predict --tasks almond --evaluate test --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $SRCDIR/dataset/ --embeddings $EMBEDDING_DIR # check if result file exists if test ! 
-f $workdir/model_$i/eval_results/test/almond.tsv ; then diff --git a/tests/test_main_almond_multilingual.sh b/tests/test_main_almond_multilingual.sh index 290fd661..bd77ca79 100755 --- a/tests/test_main_almond_multilingual.sh +++ b/tests/test_main_almond_multilingual.sh @@ -11,13 +11,13 @@ for hparams in \ do # train - genienlp train --train_tasks almond_multilingual --train_languages fa+en --eval_languages fa+en --train_batch_tokens 100 --val_batch_size 200 --train_iterations 4 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --skip_cache --embeddings $EMBEDDING_DIR --no_commit + genienlp train --train_tasks almond_multilingual --train_languages fa+en --eval_languages fa+en --train_batch_tokens 100 --val_batch_size 200 --train_iterations 4 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --embeddings $EMBEDDING_DIR --no_commit # greedy decode # combined evaluation - genienlp predict --tasks almond_multilingual --pred_languages fa+en --pred_tgt_languages en --evaluate test --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $SRCDIR/dataset/ --embeddings $EMBEDDING_DIR --skip_cache + genienlp predict --tasks almond_multilingual --pred_languages fa+en --pred_tgt_languages en --evaluate test --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $SRCDIR/dataset/ --embeddings $EMBEDDING_DIR # separate evaluation - genienlp predict --tasks almond_multilingual --separate_eval --pred_languages fa+en --pred_tgt_languages en --evaluate test --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $SRCDIR/dataset/ --embeddings $EMBEDDING_DIR --skip_cache + genienlp predict --tasks almond_multilingual --separate_eval --pred_languages fa+en --pred_tgt_languages en --evaluate test --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $SRCDIR/dataset/ --embeddings $EMBEDDING_DIR # check if result file exists if test ! -f $workdir/model_$i/eval_results/test/almond_multilingual_en.tsv || test ! -f $workdir/model_$i/eval_results/test/almond_multilingual_fa.tsv || test ! 
-f $workdir/model_$i/eval_results/test/almond_multilingual_fa+en.tsv; then diff --git a/tests/test_paraphrasing.sh b/tests/test_paraphrasing.sh index e6ec687a..17116b77 100644 --- a/tests/test_paraphrasing.sh +++ b/tests/test_paraphrasing.sh @@ -9,10 +9,10 @@ for hparams in \ "--model TransformerSeq2Seq --pretrained_model sshleifer/bart-tiny-random"; do # train - genienlp train --train_tasks almond_natural_seq2seq --train_batch_tokens 100 --val_batch_size 100 --train_iterations 6 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --skip_cache --embeddings $EMBEDDING_DIR --no_commit + genienlp train --train_tasks almond_natural_seq2seq --train_batch_tokens 100 --val_batch_size 100 --train_iterations 6 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --embeddings $EMBEDDING_DIR --no_commit # greedy prediction - genienlp predict --tasks almond_paraphrase --evaluate test --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $SRCDIR/dataset/ --embeddings $EMBEDDING_DIR --skip_cache + genienlp predict --tasks almond_paraphrase --evaluate test --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $SRCDIR/dataset/ --embeddings $EMBEDDING_DIR # check if result file exists if test ! -f $workdir/model_$i/eval_results/test/almond_paraphrase.tsv || test ! -f $workdir/model_$i/eval_results/test/almond_paraphrase.results.json; then diff --git a/tests/test_sequence_classification.sh b/tests/test_sequence_classification.sh index 955b6762..1db24719 100755 --- a/tests/test_sequence_classification.sh +++ b/tests/test_sequence_classification.sh @@ -3,10 +3,10 @@ . ./tests/lib.sh # train -genienlp train --train_tasks ood_task --model TransformerForSequenceClassification --pretrained_model distilbert-base-uncased --min_output_length 1 --save $workdir/model --train_iterations 20 --save_every 10 --log_every 10 --val_every 10 --data $SRCDIR/dataset/ood/ --skip_cache --force_fast_tokenizer --train_batch_tokens 200 --num_print 0 +genienlp train --train_tasks ood_task --model TransformerForSequenceClassification --pretrained_model distilbert-base-uncased --min_output_length 1 --save $workdir/model --train_iterations 20 --save_every 10 --log_every 10 --val_every 10 --data $SRCDIR/dataset/ood/ --force_fast_tokenizer --train_batch_tokens 200 --num_print 0 # greedy prediction -genienlp predict --tasks ood_task --evaluate valid --pred_set_name eval --path $workdir/model --overwrite --eval_dir $workdir/model/eval_results/ --data $SRCDIR/dataset/ood/ --embeddings $EMBEDDING_DIR --skip_cache --val_batch_size 200 +genienlp predict --tasks ood_task --evaluate valid --pred_set_name eval --path $workdir/model --overwrite --eval_dir $workdir/model/eval_results/ --data $SRCDIR/dataset/ood/ --embeddings $EMBEDDING_DIR --val_batch_size 200 # check if result file exists if test ! 
-f $workdir/model/eval_results/valid/ood_task.tsv ; then diff --git a/tests/test_token_classification.sh b/tests/test_token_classification.sh index 49ca1f44..60aa3dab 100755 --- a/tests/test_token_classification.sh +++ b/tests/test_token_classification.sh @@ -10,10 +10,10 @@ for hparams in \ do # train - genienlp train --train_tasks cross_ner --model TransformerForTokenClassification --pretrained_model bert-base-cased --force_fast_tokenizer --train_batch_tokens 200 --val_batch_size 200 --train_iterations 4 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/cross_ner/ --embeddings $EMBEDDING_DIR $hparams --exist_ok --skip_cache --no_commit + genienlp train --train_tasks cross_ner --model TransformerForTokenClassification --pretrained_model bert-base-cased --force_fast_tokenizer --train_batch_tokens 200 --val_batch_size 200 --train_iterations 4 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/cross_ner/ --embeddings $EMBEDDING_DIR $hparams --exist_ok --no_commit # greedy prediction - genienlp predict --tasks cross_ner --evaluate valid --pred_set_name dev --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $SRCDIR/dataset/cross_ner/ --embeddings $EMBEDDING_DIR --skip_cache --val_batch_size 2000 + genienlp predict --tasks cross_ner --evaluate valid --pred_set_name dev --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $SRCDIR/dataset/cross_ner/ --embeddings $EMBEDDING_DIR --val_batch_size 2000 # check if result file exists if test ! -f $workdir/model_$i/eval_results/valid/cross_ner.tsv ; then @@ -35,10 +35,10 @@ for hparams in \ do # train - genienlp train --train_tasks conll2003 --crossner_domains music --model TransformerForTokenClassification --pretrained_model bert-base-cased --force_fast_tokenizer --subsample 5 --train_batch_tokens 100 --val_batch_size 100 --train_iterations 4 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/cross_ner/ --embeddings $EMBEDDING_DIR $hparams --exist_ok --skip_cache --no_commit + genienlp train --train_tasks conll2003 --crossner_domains music --model TransformerForTokenClassification --pretrained_model bert-base-cased --force_fast_tokenizer --subsample 5 --train_batch_tokens 100 --val_batch_size 100 --train_iterations 4 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/cross_ner/ --embeddings $EMBEDDING_DIR $hparams --exist_ok --no_commit # greedy prediction - genienlp predict --tasks conll2003 --evaluate valid --pred_set_name validation --subsample 5 --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $SRCDIR/dataset/cross_ner/ --embeddings $EMBEDDING_DIR --skip_cache --val_batch_size 2000 + genienlp predict --tasks conll2003 --evaluate valid --pred_set_name validation --subsample 5 --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $SRCDIR/dataset/cross_ner/ --embeddings $EMBEDDING_DIR --val_batch_size 2000 # check if result file exists if test ! 
-f $workdir/model_$i/eval_results/valid/conll2003.tsv ; then diff --git a/tests/test_translation.sh b/tests/test_translation.sh index 6b5cdd23..1220e4a3 100755 --- a/tests/test_translation.sh +++ b/tests/test_translation.sh @@ -21,10 +21,10 @@ for model in "Helsinki-NLP/opus-mt-en-de" "sshleifer/tiny-mbart" ; do cp $workdir/translation/almond/train.tsv $workdir/translation/almond/eval.tsv # train - genienlp train --train_tasks almond_translate --do_alignment --train_languages en --train_tgt_languages de --eval_languages en --eval_tgt_languages de --model TransformerSeq2Seq --pretrained_model $model --train_batch_tokens 100 --val_batch_size 100 --train_iterations 6 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $workdir/translation/ --exist_ok --skip_cache --embeddings $EMBEDDING_DIR --no_commit + genienlp train --train_tasks almond_translate --do_alignment --train_languages en --train_tgt_languages de --eval_languages en --eval_tgt_languages de --model TransformerSeq2Seq --pretrained_model $model --train_batch_tokens 100 --val_batch_size 100 --train_iterations 6 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $workdir/translation/ --exist_ok --embeddings $EMBEDDING_DIR --no_commit # greedy prediction - genienlp predict --tasks almond_translate --evaluate valid --pred_languages en --pred_tgt_languages de --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $workdir/translation/ --embeddings $EMBEDDING_DIR --skip_cache + genienlp predict --tasks almond_translate --evaluate valid --pred_languages en --pred_tgt_languages de --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $workdir/translation/ --embeddings $EMBEDDING_DIR # check if result file exists and matches expected_result echo $expected_result | diff -u - $workdir/model_$i/eval_results/valid/almond_translate.results.json From 1c301f2288db55ba848e49718eff75067c0b2821 Mon Sep 17 00:00:00 2001 From: mehrad Date: Mon, 28 Feb 2022 11:59:39 -0800 Subject: [PATCH 09/19] predict: move loop outside of create_output_lines --- genienlp/predict.py | 63 ++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 29 deletions(-) diff --git a/genienlp/predict.py b/genienlp/predict.py index faeaf919..4325c6f2 100644 --- a/genienlp/predict.py +++ b/genienlp/predict.py @@ -383,31 +383,34 @@ def prepare_data_iterators(args, val_sets, numericalizer, device): return iters -def create_output_line(args, generation_output): - lines = [] - for i in range(len(generation_output.example_ids)): - predictions = generation_output.raw_predictions if args.translate_return_raw_outputs else generation_output.predictions - if args.one_output_per_line: - lines = [ - '\t'.join( - [generation_output.example_ids[i], prediction, generation_output.answers[i], generation_output.contexts[i]] - ) - for prediction in predictions[i] - ] # one line per generation output - else: - lines = [ - '\t'.join( - [ - generation_output.example_ids[i], - *predictions[i], - generation_output.answers[i], - generation_output.contexts[i], - ] - ) - ] # one line with all generation outputs separated by '\t' - if args.calibrator_paths is not None: - for score in generation_output.confidence_scores: - lines = [line + '\t' + str(score[i]) for line in lines] # append score to all lines +def create_output_lines(args, index, generation_output): + predictions = generation_output.raw_predictions if args.translate_return_raw_outputs 
else generation_output.predictions + if args.one_output_per_line: + lines = [ + '\t'.join( + [ + generation_output.example_ids[index], + prediction, + generation_output.answers[index], + generation_output.contexts[index], + ] + ) + for prediction in predictions[index] + ] # one line per generation output + else: + lines = [ + '\t'.join( + [ + generation_output.example_ids[index], + *predictions[index], + generation_output.answers[index], + generation_output.contexts[index], + ] + ) + ] # one line with all generation outputs separated by '\t' + if args.calibrator_paths is not None: + for score in generation_output.confidence_scores: + lines = [line + '\t' + str(score[index]) for line in lines] # append score to all lines return lines @@ -490,13 +493,15 @@ def run(args, device): # write into file # TODO change to jsonl format with open(prediction_file_name, 'w' + ('' if args.overwrite else '+')) as prediction_file: - lines = create_output_line(args, generation_output) - prediction_file.write('\n'.join(lines) + '\n') + for i in range(len(generation_output.example_ids)): + lines = create_output_lines(args, i, generation_output) + prediction_file.write('\n'.join(lines) + '\n') if args.translate_return_raw_outputs: with open(raw_prediction_file_name, 'w' + ('' if args.overwrite else '+')) as prediction_file: - lines = create_output_line(args, generation_output) - prediction_file.write('\n'.join(lines) + '\n') + for i in range(len(generation_output.example_ids)): + lines = create_output_lines(args, i, generation_output) + prediction_file.write('\n'.join(lines) + '\n') if len(generation_output.answers) > 0: metrics_to_compute = get_metrics_to_compute(args, task) From 5ccaa1a1e35d74608a92e42398239c192bcb98d8 Mon Sep 17 00:00:00 2001 From: mehrad Date: Mon, 28 Feb 2022 12:07:07 -0800 Subject: [PATCH 10/19] Remove no longer needed arguments --- genienlp/arguments.py | 5 ----- genienlp/predict.py | 3 --- genienlp/run_bootleg.py | 6 ------ genienlp/tasks/generic_dataset.py | 20 ++++++++++---------- genienlp/tasks/hf_dataset.py | 17 +++++++++-------- genienlp/train.py | 3 --- genienlp/util.py | 1 - 7 files changed, 19 insertions(+), 36 deletions(-) diff --git a/genienlp/arguments.py b/genienlp/arguments.py index ca9f97a2..991c6ac4 100644 --- a/genienlp/arguments.py +++ b/genienlp/arguments.py @@ -534,11 +534,6 @@ def parse_argv(parser): # token classification task args parser.add_argument('--num_labels', type=int, help='num_labels for classification tasks') parser.add_argument('--crossner_domains', nargs='+', type=str, help='domains to use for CrossNER task') - parser.add_argument( - '--hf_test_overfit', - action='store_true', - help='Debugging flag for hf datasets where validation will be performed on train set', - ) parser.add_argument( '--e2e_dialogue_evaluation', diff --git a/genienlp/predict.py b/genienlp/predict.py index 4325c6f2..c372770b 100644 --- a/genienlp/predict.py +++ b/genienlp/predict.py @@ -315,7 +315,6 @@ def prepare_data(args): if len(args.pred_src_languages) == 1 and len(args.tasks) > 1: args.pred_src_languages *= len(args.tasks) for i, task in enumerate(args.tasks): - task_languages = args.pred_src_languages[i] logger.info(f'Loading {task}') kwargs = {'train': None, 'validation': None, 'test': None} if args.evaluate == 'train': @@ -330,11 +329,9 @@ def prepare_data(args): kwargs.update( { 'subsample': args.subsample, - 'all_dirs': task_languages, 'num_workers': args.num_workers, 'src_lang': src_lang, 'crossner_domains': args.crossner_domains, - 'hf_test_overfit': 
args.hf_test_overfit, } ) diff --git a/genienlp/run_bootleg.py b/genienlp/run_bootleg.py index 556a3e5e..175008d3 100644 --- a/genienlp/run_bootleg.py +++ b/genienlp/run_bootleg.py @@ -179,11 +179,6 @@ def parse_argv(parser): # token classification task args parser.add_argument('--num_labels', type=int, help='num_labels for classification tasks') parser.add_argument('--crossner_domains', nargs='+', type=str, help='domains to use for CrossNER task') - parser.add_argument( - '--hf_test_overfit', - action='store_true', - help='Debugging flag for hf datasets where validation will be performed on train set', - ) def bootleg_dump_entities(args, logger): @@ -192,7 +187,6 @@ def bootleg_dump_entities(args, logger): bootleg_shared_kwargs = { 'subsample': args.subsample, 'num_workers': args.num_workers, - 'all_dirs': args.train_src_languages, 'crossner_domains': args.crossner_domains, } diff --git a/genienlp/tasks/generic_dataset.py b/genienlp/tasks/generic_dataset.py index 33a0dfaa..2721412c 100644 --- a/genienlp/tasks/generic_dataset.py +++ b/genienlp/tasks/generic_dataset.py @@ -185,16 +185,16 @@ def return_splits(cls, path='.data', train='train', validation='dev', test='test with open(test_path, "r") as fin: test_data = fin.readlines() - # Uncomment for testing - if kwargs.pop("hf_test_overfit", False): - if validation: - validation_path = os.path.join(path, domain, 'train.txt') - with open(validation_path, "r") as fin: - validation_data = fin.readlines() - if test: - test_path = os.path.join(path, domain, 'train.txt') - with open(test_path, "r") as fin: - test_data = fin.readlines() + # Uncomment for debugging + # if True: + # if validation: + # validation_path = os.path.join(path, domain, 'train.txt') + # with open(validation_path, "r") as fin: + # validation_data = fin.readlines() + # if test: + # test_path = os.path.join(path, domain, 'train.txt') + # with open(test_path, "r") as fin: + # test_data = fin.readlines() kwargs['domain'] = domain diff --git a/genienlp/tasks/hf_dataset.py b/genienlp/tasks/hf_dataset.py index 578fce22..fdc6ad35 100644 --- a/genienlp/tasks/hf_dataset.py +++ b/genienlp/tasks/hf_dataset.py @@ -69,14 +69,15 @@ def return_splits(cls, name, root='.data', train='train', validation='validation test_data = load_dataset(name, split='test', cache_dir=root) test_path = test_data.cache_files[0]['filename'] - if kwargs.pop('hf_test_overfit', False): - # override validation/ test data with train data - if validation: - validation_data = load_dataset(name, split='train', cache_dir=root) - validation_path = validation_data.cache_files[0]['filename'] - if test: - test_data = load_dataset(name, split='train', cache_dir=root) - test_path = test_data.cache_files[0]['filename'] + # Uncomment for debugging + # if True: + # # override validation/ test data with train data + # if validation: + # validation_data = load_dataset(name, split='train', cache_dir=root) + # validation_path = validation_data.cache_files[0]['filename'] + # if test: + # test_data = load_dataset(name, split='train', cache_dir=root) + # test_path = test_data.cache_files[0]['filename'] train_data = None if train is None else cls(train_data, **kwargs) validation_data = None if validation is None else cls(validation_data, **kwargs) diff --git a/genienlp/train.py b/genienlp/train.py index 957a5ff0..c71ac23b 100644 --- a/genienlp/train.py +++ b/genienlp/train.py @@ -96,7 +96,6 @@ def prepare_data(args, logger): kwargs = {'test': None, 'validation': None} kwargs['train'] = args.train_set_name 
kwargs.update(train_eval_shared_kwargs) - kwargs['all_dirs'] = args.train_src_languages kwargs['crossner_domains'] = args.crossner_domains if args.use_curriculum: kwargs['curriculum'] = True @@ -140,9 +139,7 @@ def prepare_data(args, logger): if args.eval_set_name is not None: kwargs['validation'] = args.eval_set_name kwargs.update(train_eval_shared_kwargs) - kwargs['all_dirs'] = args.eval_src_languages kwargs['crossner_domains'] = args.crossner_domains - kwargs['hf_test_overfit'] = args.hf_test_overfit logger.info(f'Adding {task.name} to validation datasets') splits, paths = task.get_splits(args.data, lower=args.lower, **kwargs) diff --git a/genienlp/util.py b/genienlp/util.py index 5b174ec1..bce36453 100644 --- a/genienlp/util.py +++ b/genienlp/util.py @@ -862,7 +862,6 @@ def load_config_json(args): 'no_separator', 'num_labels', 'crossner_domains', - 'hf_test_overfit', 'override_valid_metrics', 'eval_src_languages', 'eval_tgt_languages', From 1b790f66586615b81d320bde41c6a170bf6ef6f0 Mon Sep 17 00:00:00 2001 From: mehrad Date: Mon, 28 Feb 2022 13:22:45 -0800 Subject: [PATCH 11/19] Remove test_main_almond_multilingual --- .travis.yml | 5 ---- tests/test_main_almond_multilingual.sh | 37 -------------------------- 2 files changed, 42 deletions(-) delete mode 100755 tests/test_main_almond_multilingual.sh diff --git a/.travis.yml b/.travis.yml index 9a6d55f7..27762b54 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,11 +25,6 @@ jobs: stage: test script: - bash ./tests/test_main_almond.sh - - - name: "Main tests for almond_multilingual task" - stage: test - script: - - bash ./tests/test_main_almond_multilingual.sh - name: "Paraphrasing tests" stage: test diff --git a/tests/test_main_almond_multilingual.sh b/tests/test_main_almond_multilingual.sh deleted file mode 100755 index bd77ca79..00000000 --- a/tests/test_main_almond_multilingual.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env bash - -. ./tests/lib.sh - -i=0 -# test almond_multilingual task -for hparams in \ - "--model TransformerLSTM --pretrained_model bert-base-multilingual-cased --trainable_decoder_embeddings=50 --rnn_zero_state cls --almond_lang_as_question" \ - "--model TransformerLSTM --pretrained_model bert-base-multilingual-cased --trainable_decoder_embeddings=50" \ - "--model TransformerLSTM --pretrained_model bert-base-multilingual-cased --trainable_decoder_embeddings=50 --sentence_batching --use_encoder_loss" ; -do - - # train - genienlp train --train_tasks almond_multilingual --train_languages fa+en --eval_languages fa+en --train_batch_tokens 100 --val_batch_size 200 --train_iterations 4 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --embeddings $EMBEDDING_DIR --no_commit - - # greedy decode - # combined evaluation - genienlp predict --tasks almond_multilingual --pred_languages fa+en --pred_tgt_languages en --evaluate test --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $SRCDIR/dataset/ --embeddings $EMBEDDING_DIR - # separate evaluation - genienlp predict --tasks almond_multilingual --separate_eval --pred_languages fa+en --pred_tgt_languages en --evaluate test --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $SRCDIR/dataset/ --embeddings $EMBEDDING_DIR - - # check if result file exists - if test ! -f $workdir/model_$i/eval_results/test/almond_multilingual_en.tsv || test ! -f $workdir/model_$i/eval_results/test/almond_multilingual_fa.tsv || test ! 
-f $workdir/model_$i/eval_results/test/almond_multilingual_fa+en.tsv; then - echo "File not found!" - exit 1 - fi - - if [ $i == 0 ] ; then - # check if predictions matches expected_results - diff -u $SRCDIR/expected_results/almond_multilingual/bert_base_multilingual_cased_en.results.json $workdir/model_$i/eval_results/test/almond_multilingual_en.results.json - diff -u $SRCDIR/expected_results/almond_multilingual/bert_base_multilingual_cased_fa.results.json $workdir/model_$i/eval_results/test/almond_multilingual_fa.results.json - diff -u $SRCDIR/expected_results/almond_multilingual/bert_base_multilingual_cased_fa+en.results.json $workdir/model_$i/eval_results/test/almond_multilingual_fa+en.results.json - fi - - rm -rf $workdir/model_$i - i=$((i+1)) -done From 5e27d18b8d3c7b6b3d8aba8cc7c6a4c5c54ea012 Mon Sep 17 00:00:00 2001 From: mehrad Date: Mon, 28 Feb 2022 13:33:35 -0800 Subject: [PATCH 12/19] Minor fixes --- genienlp/models/transformer_seq2seq.py | 7 ++++--- genienlp/models/transformer_token_classification.py | 2 +- genienlp/validate.py | 6 +----- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/genienlp/models/transformer_seq2seq.py b/genienlp/models/transformer_seq2seq.py index 29a2af57..174a03ae 100644 --- a/genienlp/models/transformer_seq2seq.py +++ b/genienlp/models/transformer_seq2seq.py @@ -450,7 +450,7 @@ def get_example_index(i): return output def validate_e2e_dialogues( - self, data_iterator, task, eval_dir, output_predictions_only=False, original_order=None, disable_progbar=True + self, data_iterator, task, eval_dir=None, output_predictions_only=False, original_order=None, disable_progbar=True ): """ Inputs: @@ -662,8 +662,9 @@ def validate_e2e_dialogues( e2e_dialogue_preds[dial_id]["turns"][str(turn_id)]["response"] = predictions[-1] #### - with open(os.path.join(eval_dir, 'e2e_dialogue_preds.json'), 'w') as fout: - ujson.dump(e2e_dialogue_preds, fout, indent=2, ensure_ascii=False) + if eval_dir: + with open(os.path.join(eval_dir, 'e2e_dialogue_preds.json'), 'w') as fout: + ujson.dump(e2e_dialogue_preds, fout, indent=2, ensure_ascii=False) if original_order is not None: # sort back to the original order diff --git a/genienlp/models/transformer_token_classification.py b/genienlp/models/transformer_token_classification.py index d23b8517..491392ab 100644 --- a/genienlp/models/transformer_token_classification.py +++ b/genienlp/models/transformer_token_classification.py @@ -106,7 +106,7 @@ def forward(self, *input, **kwargs): else: return self.model(**kwargs) - def validate(self, data_iterator, task, original_order=None, disable_progbar=True): + def validate(self, data_iterator, task, original_order=None, disable_progbar=True, **kwargs): total_loss = 0.0 all_example_ids = [] all_answers = [] diff --git a/genienlp/validate.py b/genienlp/validate.py index cbb3ad07..1ced3b73 100644 --- a/genienlp/validate.py +++ b/genienlp/validate.py @@ -33,7 +33,6 @@ import torch from .metrics import calculate_and_reduce_metrics -from .models import TransformerForSequenceClassification, TransformerForTokenClassification logger = logging.getLogger(__name__) @@ -54,14 +53,11 @@ def generate_with_model( return model.validate_e2e_dialogues( data_iterator, task, - eval_dir, + eval_dir=eval_dir, output_predictions_only=output_predictions_only, original_order=original_order, disable_progbar=disable_progbar, ) - - elif isinstance(model, (TransformerForTokenClassification, TransformerForSequenceClassification)): - return model.validate(data_iterator, task, original_order=original_order, 
disable_progbar=disable_progbar) else: return model.validate( data_iterator, From f9b19859613989edc2ed637e25628092f1c07792 Mon Sep 17 00:00:00 2001 From: mehrad Date: Mon, 28 Feb 2022 13:47:55 -0800 Subject: [PATCH 13/19] Define wrapper model classes to avoid duplicate code in models --- genienlp/models/base.py | 594 +++++++++++++++++- genienlp/models/transformer_lstm.py | 4 +- genienlp/models/transformer_seq2seq.py | 487 +------------- .../transformer_sequence_classification.py | 5 +- .../transformer_token_classification.py | 124 +--- 5 files changed, 606 insertions(+), 608 deletions(-) diff --git a/genienlp/models/base.py b/genienlp/models/base.py index e9d1eafe..be2301c1 100644 --- a/genienlp/models/base.py +++ b/genienlp/models/base.py @@ -27,13 +27,21 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import copy import logging import os +from collections import defaultdict import torch -from transformers import PreTrainedModel +import ujson +from dateparser.languages import default_loader +from dialogues import Bitod +from transformers import AutoConfig, MarianTokenizer, PreTrainedModel +from ..data_utils.example import NumericalizedExamples, SequentialField from ..data_utils.numericalizer import TransformerNumericalizer +from ..data_utils.progbar import progress_bar +from ..util import GenerationOutput, adjust_language_code, merge_translated_sentences, replace_capturing_group logger = logging.getLogger(__name__) @@ -86,3 +94,587 @@ def set_generation_output_options(self, tasks): self._output_attentions = any(getattr(task, 'need_attention_scores', False) for task in tasks) self._output_scores = False self._output_hidden_states = False + + +# TransformerSeq2Seq and TransformerLSTM will inherit from this model +class GenieModelForGeneration(GenieModel): + def validate( + self, + data_iterator, + task, + output_predictions_only=False, + output_confidence_features=False, + original_order=None, + confidence_estimators=None, + disable_progbar=True, + ): + """ + Inputs: + original_order: List of indices. 
If provided, we will sort the results according to this order + confidence_estimator: if provided, will use it to calculate and output confidence scores + Outputs: predictions if `output_predictions_only` == True, (loss, predictions, answers, contexts) otherwise + loss + predictions: a List of Lists of strings + answers + contexts + """ + total_loss = 0.0 if 'loss' in task.metrics else None + output_confidence_scores = confidence_estimators is not None + predictions = [] + raw_predictions = [] + confidence_features = [] + example_ids = [] + answers = [] + contexts = [] + + if self.numericalizer._tokenizer.tgt_lang: + tgt_lang = self.numericalizer._tokenizer.tgt_lang + else: + tgt_lang = self.model.orig_tgt_lang + + if self.numericalizer._tokenizer.src_lang: + src_lang = self.numericalizer._tokenizer.src_lang + else: + src_lang = self.model.orig_src_lang + + date_parser = default_loader.get_locale(src_lang[:2]) + + translate_return_raw_outputs = getattr(self.args, 'translate_return_raw_outputs', False) + + for batch in progress_bar(data_iterator, desc='Generating', disable=disable_progbar): + batch_size = len(batch.example_id) + batch_prediction = [[] for _ in range(batch_size)] + batch_raw_prediction = [[] for _ in range(batch_size)] + batch_confidence_features = [[] for _ in range(batch_size)] + batch_example_ids = batch.example_id + + example_ids += batch_example_ids + if not output_predictions_only: + batch_answer = self.numericalizer.reverse(batch.answer.value.data, 'answer') + batch_answer = [ + task.postprocess_prediction(batch_example_ids[i], batch_answer[i]) for i in range(len(batch_answer)) + ] + answers += batch_answer + batch_context = self.numericalizer.reverse(batch.context.value.data, 'context') + contexts += batch_context + elif output_confidence_features: + # need gold answer for confidence estimation + batch_answer = self.numericalizer.reverse(batch.answer.value.data, 'answer') + answers += batch_answer + + if total_loss is not None: + loss = self.forward(batch, train=True).loss.item() + total_loss += loss + + for hyperparameter_idx in range(len(self.args.temperature)): + generated = self.generate( + batch, + max_output_length=self.args.max_output_length, + min_output_length=self.args.min_output_length, + num_outputs=self.args.num_outputs[hyperparameter_idx], + temperature=self.args.temperature[hyperparameter_idx] + if self.args.temperature[hyperparameter_idx] > 0 + else 1.0, + repetition_penalty=self.args.repetition_penalty[hyperparameter_idx], + top_k=self.args.top_k[hyperparameter_idx], + top_p=self.args.top_p[hyperparameter_idx], + num_beams=self.args.num_beams[hyperparameter_idx], + num_beam_groups=self.args.num_beam_groups[hyperparameter_idx], + diversity_penalty=self.args.diversity_penalty[hyperparameter_idx], + no_repeat_ngram_size=self.args.no_repeat_ngram_size[hyperparameter_idx], + do_sample=self.args.temperature[hyperparameter_idx] != 0, # if temperature==0, we do not sample + ) + partial_batch_prediction_ids = generated.sequences + partial_batch_words = None + + if getattr(task, 'need_attention_scores', False): + cross_attentions = generated.cross_attentions + + # stack tensors to shape (max_output_length, num_layers, batch_size, num_heads, 1, max_input_length) + cross_attentions = torch.stack(([torch.stack(tuple) for tuple in cross_attentions])).cpu() + + # reshape to (num_layers, batch_size, num_heads, max_output_length, max_input_length) + cross_attentions = cross_attentions.squeeze(4) + cross_attentions = cross_attentions.permute(1, 2, 3, 0, 
4).contiguous() + + # choose only last layer attentions + # cross_attentions = torch.mean(cross_attentions[-3:, ...], dim=0) + cross_attentions = cross_attentions[-1, ...] + + # postprocess prediction ids + kwargs = { + 'self.numericalizer': self.numericalizer, + 'cross_attentions': cross_attentions, + 'tgt_lang': tgt_lang, + 'date_parser': date_parser, + } + + if translate_return_raw_outputs: + partial_batch_raw_prediction_ids = partial_batch_prediction_ids + + partial_batch_prediction_ids, partial_batch_words = task.batch_postprocess_prediction_ids( + batch_example_ids, batch.context.value.data, partial_batch_prediction_ids, **kwargs + ) + + # MarianTokenizer uses two different spm models for encoding source and target languages. + # in almond_translate we postprocess text with alignment which produces code-switched sentences. + # encoding a code-switched sentence with either spm will omit tokens from the other language + # so we have to return both the processed and encoded text. + # we need to return encoded text too since confidence_features requires ids + if isinstance(self.numericalizer._tokenizer, MarianTokenizer) and partial_batch_words: + partial_batch_prediction = partial_batch_words + else: + if output_confidence_features or output_confidence_scores: + partial_batch_confidence_features = self.model.confidence_features( + batch=batch, predictions=partial_batch_prediction_ids, mc_dropout_num=self.args.mc_dropout_num + ) + partial_batch_prediction = self.numericalizer.reverse(partial_batch_prediction_ids, 'answer') + + def get_example_index(i): + return (i // self.args.num_outputs[hyperparameter_idx]) % batch_size + + if translate_return_raw_outputs: + partial_batch_raw_prediction = self.numericalizer.reverse(partial_batch_raw_prediction_ids, 'answer') + for i in range(len(partial_batch_prediction)): + partial_batch_raw_prediction[i] = task.postprocess_prediction( + batch_example_ids[get_example_index(i)], partial_batch_raw_prediction[i] + ) + for i in range(len(partial_batch_prediction)): + batch_raw_prediction[get_example_index(i)].append(partial_batch_raw_prediction[i]) + + # post-process predictions + for i in range(len(partial_batch_prediction)): + partial_batch_prediction[i] = task.postprocess_prediction( + batch_example_ids[get_example_index(i)], partial_batch_prediction[i] + ) + + # put them into the right array + for i in range(len(partial_batch_prediction)): + batch_prediction[get_example_index(i)].append(partial_batch_prediction[i]) + if output_confidence_features or output_confidence_scores: + batch_confidence_features[get_example_index(i)].append(partial_batch_confidence_features[i]) + + predictions += batch_prediction + confidence_features += batch_confidence_features + raw_predictions += batch_raw_prediction + + if total_loss is not None: + total_loss /= len(example_ids) + + if original_order is not None: + # sort back to the original order + original_order, example_ids, predictions, raw_predictions, answers, contexts, confidence_features = [ + list(a) + for a in tuple( + zip( + *sorted( + list( + zip( + original_order, + example_ids, + predictions, + raw_predictions, + answers, + contexts, + confidence_features, + ) + ) + ) + ) + ) + ] + + if getattr(self.args, 'translate_example_split', False): + # stitch sentences back together + example_ids, predictions, raw_predictions, answers, contexts, confidence_features = merge_translated_sentences( + example_ids, + predictions, + raw_predictions, + answers, + contexts, + confidence_features, + 
self.numericalizer._tokenizer.src_lang, + self.numericalizer._tokenizer.tgt_lang, + ) + + output = GenerationOutput(loss=total_loss) + + if output_predictions_only: + output.predictions = predictions + else: + output.example_ids, output.predictions, output.answers, output.contexts = ( + example_ids, + predictions, + answers, + contexts, + ) + if output_confidence_features: + output.confidence_features = confidence_features + if self.args.override_confidence_labels: + for i, example in enumerate(confidence_features): + for confidence in example: + confidence.label = answers[i] == self.args.override_confidence_labels + if output_confidence_scores: + output.confidence_scores = [] + for estimator in confidence_estimators: + confidence_scores = estimator.estimate(confidence_features) + output.confidence_scores.append(confidence_scores) + if translate_return_raw_outputs: + output.raw_predictions = raw_predictions + + return output + + def validate_e2e_dialogues( + self, data_iterator, task, eval_dir=None, output_predictions_only=False, original_order=None, disable_progbar=True + ): + """ + Inputs: + original_order: List of indices. If provided, we will sort the results according to this order + confidence_estimator: if provided, will use it to calculate and output confidence scores + Outputs: predictions if `output_predictions_only` == True, (loss, predictions, answers, contexts) otherwise + loss + predictions: a List of Lists of strings + answers + contexts + """ + + dataset = Bitod() + e2e_dialogue_preds = dict() + + predictions = [] + example_ids = [] + answers = [] + contexts = [] + + # TODO: handle multiple responses + hyperparameter_idx = 0 + + cur_dial_id = '' + knowledge = None + + device = self.device + args = self.args + + special_tokens = self.numericalizer._tokenizer.all_special_tokens + + for k, turn in enumerate(progress_bar(data_iterator, desc='Generating', disable=disable_progbar)): + batch_size = len(turn.example_id) + assert batch_size == 1 + batch_prediction = [] + batch_example_ids = turn.example_id + + example_ids += batch_example_ids + + task_name, dial_id, turn_id, train_target = example_ids[-1].split('/') + turn_id = int(turn_id) + + if cur_dial_id != dial_id: + # new dialogue + cur_dial_id = dial_id + dialogue_state = {} + # new_state_text = 'null' + knowledge = defaultdict(dict) + new_knowledge_text = 'null' + new_actions_text = 'null' + active_api = None + e2e_dialogue_preds[dial_id] = {"turns": defaultdict(dict), "API": defaultdict(dict)} + + batch_context = [] + batch_tokens = self.numericalizer.convert_ids_to_tokens(turn.context.value.data, skip_special_tokens=False) + + # remove only beginning and trailing special tokens + # otherwise the sep_token added between context and question will be lost + for text in batch_tokens: + i = 0 + while text[i] in special_tokens: + i += 1 + j = len(text) - 1 + while text[j] in special_tokens: + j -= 1 + text = text[i : j + 1] + + batch_context.append(self.numericalizer._tokenizer.convert_tokens_to_string(text)) + + contexts += batch_context + + if not output_predictions_only: + batch_answer = self.numericalizer.reverse(turn.answer.value.data, 'answer') + batch_answer = [ + task.postprocess_prediction(batch_example_ids[i], batch_answer[i]) for i in range(len(batch_answer)) + ] + answers += batch_answer + + new_state_text = dataset.state2span(dialogue_state) + + if train_target == 'dst': + input_text = replace_capturing_group(contexts[-1], dataset.state_re, new_state_text) + + ## we always use gold history following common practice 
+ ## if you want to use predicted response instead of gold uncomment the following + # last_sys_pred = predictions[-1][0].strip() + # input_text = replace_match(input_text, last_system_re, last_sys_pred) + + elif train_target == 'api': + + # replace state + input_text = replace_capturing_group(contexts[-1], dataset.state_re, new_state_text) + + elif train_target == 'da': + # replace state + input_text = replace_capturing_group(contexts[-1], dataset.state_re, new_state_text) + + # replace knowledge + input_text = replace_capturing_group(input_text, dataset.knowledge_re, new_knowledge_text) + + elif train_target == 'rg': + + # replace actions + input_text = replace_capturing_group(contexts[-1], dataset.actions_re, new_actions_text) + + else: + raise ValueError(f'Invalid train_target: {train_target}') + + # replace old context with updated + contexts[-1] = input_text + + tokenized_contexts = self.numericalizer.encode_batch([input_text], field_name='context', features=None)[0] + + numericalized_turn = NumericalizedExamples( + example_id=[turn.example_id[0]], + context=SequentialField( + value=torch.tensor([tokenized_contexts.value], device=device), + length=torch.tensor([tokenized_contexts.length], device=device), + limited=torch.tensor([tokenized_contexts.limited], device=device), + feature=None, + ), + answer=SequentialField(value=None, length=None, limited=None, feature=None), + ) + + generated = self.generate( + numericalized_turn, + max_output_length=args.max_output_length, + min_output_length=args.min_output_length, + num_outputs=args.num_outputs[hyperparameter_idx], + temperature=args.temperature[hyperparameter_idx] if args.temperature[hyperparameter_idx] > 0 else 1.0, + repetition_penalty=args.repetition_penalty[hyperparameter_idx], + top_k=args.top_k[hyperparameter_idx], + top_p=args.top_p[hyperparameter_idx], + num_beams=args.num_beams[hyperparameter_idx], + num_beam_groups=args.num_beam_groups[hyperparameter_idx], + diversity_penalty=args.diversity_penalty[hyperparameter_idx], + no_repeat_ngram_size=args.no_repeat_ngram_size[hyperparameter_idx], + do_sample=args.temperature[hyperparameter_idx] != 0, + ) + + partial_batch_prediction_ids = generated.sequences + + partial_batch_prediction = self.numericalizer.reverse(partial_batch_prediction_ids, 'answer')[0] + + if train_target == 'da': + partial_batch_prediction = dataset.postprocess_prediction( + partial_batch_prediction, knowledge, lang=self.numericalizer._tokenizer.src_lang[:2] + ) + + partial_batch_prediction = task.postprocess_prediction(batch_example_ids[0], partial_batch_prediction) + + # put them into the right array + batch_prediction.append([partial_batch_prediction]) + + predictions += batch_prediction + + if train_target == 'dst': + # update dialogue_state + lev = predictions[-1][0].strip() + state_update = dataset.span2state(lev) + if state_update: + active_api = list(state_update.keys())[-1] + dataset.update_state(state_update, dialogue_state) + + #### save latest state + state_to_record = copy.deepcopy(dialogue_state) + state_to_record = {dataset.domain2api_name(k): v for k, v in state_to_record.items()} + e2e_dialogue_preds[dial_id]["turns"][str(turn_id)]["state"] = state_to_record + #### + + elif train_target == 'api': + if dataset.do_knowledge_reset(active_api): + new_knowledge_text = "null" + knowledge = defaultdict(dict) + + do_api_call = predictions[-1][0].strip() + + if do_api_call == 'yes': + # make api call + api_name = active_api + if api_name in dialogue_state: + constraints, new_knowledge_text = 
dataset.make_api_call( + dialogue_state, knowledge, api_name, self.numericalizer._tokenizer.src_lang, dial_id, turn_id + ) + #### save latest api constraints + e2e_dialogue_preds[dial_id]["API"][dataset.domain2api_name(api_name)] = copy.deepcopy(constraints) + #### + + elif do_api_call == 'no': + # do nothing + pass + else: + logger.error( + f'API call should be either yes or no but got {do_api_call}. Seems model is not trained for enough steps. For now we assume it\'s a no' + ) + + #### save latest api results + e2e_dialogue_preds[dial_id]["turns"][str(turn_id)]["api"] = new_knowledge_text + #### + + elif train_target == 'da': + new_actions_text = predictions[-1][0] + #### save latest actions + e2e_dialogue_preds[dial_id]["turns"][str(turn_id)]["actions"] = predictions[-1][0] + #### + + elif train_target == 'rg': + #### save latest response + e2e_dialogue_preds[dial_id]["turns"][str(turn_id)]["response"] = predictions[-1] + #### + + if eval_dir: + with open(os.path.join(eval_dir, 'e2e_dialogue_preds.json'), 'w') as fout: + ujson.dump(e2e_dialogue_preds, fout, indent=2, ensure_ascii=False) + + if original_order is not None: + # sort back to the original order + original_order, example_ids, predictions, answers, contexts = [ + list(a) for a in tuple(zip(*sorted(list(zip(original_order, example_ids, predictions, answers, contexts))))) + ] + + # TODO calculate and return loss + loss = None + output = GenerationOutput(loss=loss) + + if output_predictions_only: + output.predictions = predictions + else: + output.example_ids, output.predictions, output.answers, output.contexts = ( + example_ids, + predictions, + answers, + contexts, + ) + + return output + + +# TransformerForSequenceClassification and TransformerForTokenClassification will inherit from this model +class GenieModelForClassification(GenieModel): + def _init_common(self, args, tasks, **kwargs): + self.args = args + num_labels = 0 + if args.num_labels is not None: + num_labels = args.num_labels + else: + for task in tasks: + # if having multiple tasks choose max num_labels + if hasattr(task, 'num_labels'): + num_labels = max(num_labels, task.num_labels) + + config = AutoConfig.from_pretrained( + args.pretrained_model, cache_dir=args.embeddings, num_labels=num_labels, finetuning_task='ned' + ) + super().__init__(config) + + if hasattr(config, 'd_model'): + args.dimension = config.d_model + else: + args.dimension = config.hidden_size + + self.src_lang, self.tgt_lang = adjust_language_code( + config, args.pretrained_model, kwargs.get('src_lang', 'en'), kwargs.get('tgt_lang', 'en') + ) + + def add_new_vocab_from_data(self, tasks, resize_decoder=False): + super().add_new_vocab_from_data(tasks, resize_decoder) + self.model.resize_token_embeddings(self.numericalizer.num_tokens) + + def forward(self, *input, **kwargs): + if self.training: + batch = input[0] + outputs = self.model( + batch.context.value, + labels=batch.answer.value, + attention_mask=(batch.context.value != self.numericalizer.pad_id), + ) + return outputs + else: + return self.model(**kwargs) + + def validate(self, data_iterator, task, original_order=None, disable_progbar=True, **kwargs): + total_loss = 0.0 + all_example_ids = [] + all_answers = [] + all_contexts = [] + all_predictions = [] + + for batch in progress_bar(data_iterator, desc='Generating', disable=disable_progbar): + batch_example_ids = batch.example_id + + batch_context = self.numericalizer.reverse(batch.context.value.data, 'context') + + all_example_ids += batch_example_ids + + # pass labels to get loss + 
output = self.forward( + input_ids=batch.context.value, + attention_mask=(batch.context.value != self.numericalizer.pad_id), + labels=batch.answer.value, + ) + + labels = batch.answer.value.tolist() + + logits = output.logits + predictions = torch.argmax(logits, dim=-1).tolist() + + # logits for sequence classification is 2 dimensional + if logits.dim() == 2: + predictions = [[p] for p in predictions] + + # Remove ignored index (special tokens) + processed_preds = [] + processed_labels = [] + for pred, label in zip(predictions, labels): + preds_list = [] + labels_list = [] + for p_, l_ in zip(pred, label): + if l_ == self.numericalizer.answer_pad_id: + continue + preds_list.append(task.id2label[p_]) + labels_list.append(task.id2label[l_]) + + processed_preds.append([" ".join(preds_list)]) + processed_labels.append(" ".join(labels_list)) + + all_contexts += batch_context + all_answers += processed_labels + all_predictions += processed_preds + + total_loss += output.loss + + total_loss /= len(all_example_ids) + + if original_order is not None: + # sort back to the original order + original_order, all_example_ids, all_predictions, all_answers, all_contexts = [ + list(a) + for a in tuple( + zip(*sorted(list(zip(original_order, all_example_ids, all_predictions, all_answers, all_contexts)))) + ) + ] + + output = GenerationOutput( + loss=total_loss, + example_ids=all_example_ids, + contexts=all_contexts, + answers=all_answers, + predictions=all_predictions, + ) + + return output diff --git a/genienlp/models/transformer_lstm.py b/genienlp/models/transformer_lstm.py index 8792b087..326cbf0e 100644 --- a/genienlp/models/transformer_lstm.py +++ b/genienlp/models/transformer_lstm.py @@ -36,14 +36,14 @@ from ..data_utils.numericalizer import TransformerNumericalizer from ..model_utils.transformers_utils import BertModelForNER, XLMRobertaModelForNER from ..util import adjust_language_code -from .base import GenieModel +from .base import GenieModelForGeneration from .identity_encoder import IdentityEncoder from .mqan_decoder import MQANDecoder logger = logging.getLogger(__name__) -class TransformerLSTM(GenieModel): +class TransformerLSTM(GenieModelForGeneration): def __init__(self, config=None, *inputs, args, vocab_sets, tasks, save_directory=None, **kwargs): """ Relevant inputs should be provided using kwargs. This method is defined this way to match parent's and siblings' method signatures. diff --git a/genienlp/models/transformer_seq2seq.py b/genienlp/models/transformer_seq2seq.py index 174a03ae..0e0530fc 100644 --- a/genienlp/models/transformer_seq2seq.py +++ b/genienlp/models/transformer_seq2seq.py @@ -26,36 +26,22 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
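Taken together, the hunks above and below this point move the validation loops out of the individual transformer models and into shared base classes. As a rough sketch of the resulting class hierarchy — the class and method names are taken from the diffs, but the bodies below are illustrative stubs rather than the real implementations:

    class GenieModel:
        """Existing shared base class."""

    class GenieModelForGeneration(GenieModel):
        def validate(self, data_iterator, task, **kwargs):
            # batched decoding loop shared by the seq2seq and LSTM models
            ...

        def validate_e2e_dialogues(self, data_iterator, task, **kwargs):
            # turn-by-turn dialogue loop: dst -> api -> da -> rg
            ...

    class GenieModelForClassification(GenieModel):
        def validate(self, data_iterator, task, **kwargs):
            # token- and sequence-classification loop
            ...

    class TransformerSeq2Seq(GenieModelForGeneration): ...
    class TransformerLSTM(GenieModelForGeneration): ...
    class TransformerForTokenClassification(GenieModelForClassification): ...
    class TransformerForSequenceClassification(GenieModelForClassification): ...

With this split, TransformerForSequenceClassification no longer has to inherit from TransformerForTokenClassification just to reuse its validation code, as the later hunks in this patch show.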
-import copy import logging -import os -from collections import defaultdict from typing import List import torch -import ujson -from dateparser.languages import default_loader -from dialogues import Bitod -from transformers import AutoConfig, AutoModelForSeq2SeqLM, MarianTokenizer, MBartTokenizer, MBartTokenizerFast +from transformers import AutoConfig, AutoModelForSeq2SeqLM, MBartTokenizer, MBartTokenizerFast -from ..data_utils.example import NumericalizedExamples, SequentialField from ..data_utils.numericalizer import TransformerNumericalizer -from ..data_utils.progbar import progress_bar from ..model_utils.transformers_utils import MULTILINGUAL_TOKENIZERS -from ..util import ( - ConfidenceFeatures, - GenerationOutput, - adjust_language_code, - merge_translated_sentences, - replace_capturing_group, -) -from .base import GenieModel +from ..util import ConfidenceFeatures, adjust_language_code +from .base import GenieModelForGeneration from .common import LabelSmoothingCrossEntropy logger = logging.getLogger(__name__) -class TransformerSeq2Seq(GenieModel): +class TransformerSeq2Seq(GenieModelForGeneration): def __init__(self, config=None, *inputs, args, tasks, vocab_sets, save_directory=None, **kwargs): """ If `save_directory` is None, will initialize a new model and numericalizer, otherwise, will load them from `save_directory` @@ -223,471 +209,6 @@ def generate( return generated - def validate( - self, - data_iterator, - task, - output_predictions_only=False, - output_confidence_features=False, - original_order=None, - confidence_estimators=None, - disable_progbar=True, - ): - """ - Inputs: - original_order: List of indices. If provided, we will sort the results according to this order - confidence_estimator: if provided, will use it to calculate and output confidence scores - Outputs: predictions if `output_predictions_only` == True, (loss, predictions, answers, contexts) otherwise - loss - predictions: a List of Lists of strings - answers - contexts - """ - total_loss = 0.0 if 'loss' in task.metrics else None - output_confidence_scores = confidence_estimators is not None - predictions = [] - raw_predictions = [] - confidence_features = [] - example_ids = [] - answers = [] - contexts = [] - - if self.numericalizer._tokenizer.tgt_lang: - tgt_lang = self.numericalizer._tokenizer.tgt_lang - else: - tgt_lang = self.model.orig_tgt_lang - - if self.numericalizer._tokenizer.src_lang: - src_lang = self.numericalizer._tokenizer.src_lang - else: - src_lang = self.model.orig_src_lang - - date_parser = default_loader.get_locale(src_lang[:2]) - - translate_return_raw_outputs = getattr(self.args, 'translate_return_raw_outputs', False) - - for batch in progress_bar(data_iterator, desc='Generating', disable=disable_progbar): - batch_size = len(batch.example_id) - batch_prediction = [[] for _ in range(batch_size)] - batch_raw_prediction = [[] for _ in range(batch_size)] - batch_confidence_features = [[] for _ in range(batch_size)] - batch_example_ids = batch.example_id - - example_ids += batch_example_ids - if not output_predictions_only: - batch_answer = self.numericalizer.reverse(batch.answer.value.data, 'answer') - batch_answer = [ - task.postprocess_prediction(batch_example_ids[i], batch_answer[i]) for i in range(len(batch_answer)) - ] - answers += batch_answer - batch_context = self.numericalizer.reverse(batch.context.value.data, 'context') - contexts += batch_context - elif output_confidence_features: - # need gold answer for confidence estimation - batch_answer = 
self.numericalizer.reverse(batch.answer.value.data, 'answer') - answers += batch_answer - - if total_loss is not None: - loss = self.forward(batch, train=True).loss.item() - total_loss += loss - - for hyperparameter_idx in range(len(self.args.temperature)): - generated = self.generate( - batch, - max_output_length=self.args.max_output_length, - min_output_length=self.args.min_output_length, - num_outputs=self.args.num_outputs[hyperparameter_idx], - temperature=self.args.temperature[hyperparameter_idx] - if self.args.temperature[hyperparameter_idx] > 0 - else 1.0, - repetition_penalty=self.args.repetition_penalty[hyperparameter_idx], - top_k=self.args.top_k[hyperparameter_idx], - top_p=self.args.top_p[hyperparameter_idx], - num_beams=self.args.num_beams[hyperparameter_idx], - num_beam_groups=self.args.num_beam_groups[hyperparameter_idx], - diversity_penalty=self.args.diversity_penalty[hyperparameter_idx], - no_repeat_ngram_size=self.args.no_repeat_ngram_size[hyperparameter_idx], - do_sample=self.args.temperature[hyperparameter_idx] != 0, # if temperature==0, we do not sample - ) - partial_batch_prediction_ids = generated.sequences - partial_batch_words = None - - if getattr(task, 'need_attention_scores', False): - cross_attentions = generated.cross_attentions - - # stack tensors to shape (max_output_length, num_layers, batch_size, num_heads, 1, max_input_length) - cross_attentions = torch.stack(([torch.stack(tuple) for tuple in cross_attentions])).cpu() - - # reshape to (num_layers, batch_size, num_heads, max_output_length, max_input_length) - cross_attentions = cross_attentions.squeeze(4) - cross_attentions = cross_attentions.permute(1, 2, 3, 0, 4).contiguous() - - # choose only last layer attentions - # cross_attentions = torch.mean(cross_attentions[-3:, ...], dim=0) - cross_attentions = cross_attentions[-1, ...] - - # postprocess prediction ids - kwargs = { - 'self.numericalizer': self.numericalizer, - 'cross_attentions': cross_attentions, - 'tgt_lang': tgt_lang, - 'date_parser': date_parser, - } - - if translate_return_raw_outputs: - partial_batch_raw_prediction_ids = partial_batch_prediction_ids - - partial_batch_prediction_ids, partial_batch_words = task.batch_postprocess_prediction_ids( - batch_example_ids, batch.context.value.data, partial_batch_prediction_ids, **kwargs - ) - - # MarianTokenizer uses two different spm models for encoding source and target languages. - # in almond_translate we postprocess text with alignment which produces code-switched sentences. - # encoding a code-switched sentence with either spm will omit tokens from the other language - # so we have to return both the processed and encoded text. 
- # we need to return encoded text too since confidence_features requires ids - if isinstance(self.numericalizer._tokenizer, MarianTokenizer) and partial_batch_words: - partial_batch_prediction = partial_batch_words - else: - if output_confidence_features or output_confidence_scores: - partial_batch_confidence_features = self.model.confidence_features( - batch=batch, predictions=partial_batch_prediction_ids, mc_dropout_num=self.args.mc_dropout_num - ) - partial_batch_prediction = self.numericalizer.reverse(partial_batch_prediction_ids, 'answer') - - def get_example_index(i): - return (i // self.args.num_outputs[hyperparameter_idx]) % batch_size - - if translate_return_raw_outputs: - partial_batch_raw_prediction = self.numericalizer.reverse(partial_batch_raw_prediction_ids, 'answer') - for i in range(len(partial_batch_prediction)): - partial_batch_raw_prediction[i] = task.postprocess_prediction( - batch_example_ids[get_example_index(i)], partial_batch_raw_prediction[i] - ) - for i in range(len(partial_batch_prediction)): - batch_raw_prediction[get_example_index(i)].append(partial_batch_raw_prediction[i]) - - # post-process predictions - for i in range(len(partial_batch_prediction)): - partial_batch_prediction[i] = task.postprocess_prediction( - batch_example_ids[get_example_index(i)], partial_batch_prediction[i] - ) - - # put them into the right array - for i in range(len(partial_batch_prediction)): - batch_prediction[get_example_index(i)].append(partial_batch_prediction[i]) - if output_confidence_features or output_confidence_scores: - batch_confidence_features[get_example_index(i)].append(partial_batch_confidence_features[i]) - - predictions += batch_prediction - confidence_features += batch_confidence_features - raw_predictions += batch_raw_prediction - - if total_loss is not None: - total_loss /= len(example_ids) - - if original_order is not None: - # sort back to the original order - original_order, example_ids, predictions, raw_predictions, answers, contexts, confidence_features = [ - list(a) - for a in tuple( - zip( - *sorted( - list( - zip( - original_order, - example_ids, - predictions, - raw_predictions, - answers, - contexts, - confidence_features, - ) - ) - ) - ) - ) - ] - - if getattr(self.args, 'translate_example_split', False): - # stitch sentences back together - example_ids, predictions, raw_predictions, answers, contexts, confidence_features = merge_translated_sentences( - example_ids, - predictions, - raw_predictions, - answers, - contexts, - confidence_features, - self.numericalizer._tokenizer.src_lang, - self.numericalizer._tokenizer.tgt_lang, - ) - - output = GenerationOutput(loss=total_loss) - - if output_predictions_only: - output.predictions = predictions - else: - output.example_ids, output.predictions, output.answers, output.contexts = ( - example_ids, - predictions, - answers, - contexts, - ) - if output_confidence_features: - output.confidence_features = confidence_features - if self.args.override_confidence_labels: - for i, example in enumerate(confidence_features): - for confidence in example: - confidence.label = answers[i] == self.args.override_confidence_labels - if output_confidence_scores: - output.confidence_scores = [] - for estimator in confidence_estimators: - confidence_scores = estimator.estimate(confidence_features) - output.confidence_scores.append(confidence_scores) - if translate_return_raw_outputs: - output.raw_predictions = raw_predictions - - return output - - def validate_e2e_dialogues( - self, data_iterator, task, eval_dir=None, 
output_predictions_only=False, original_order=None, disable_progbar=True - ): - """ - Inputs: - original_order: List of indices. If provided, we will sort the results according to this order - confidence_estimator: if provided, will use it to calculate and output confidence scores - Outputs: predictions if `output_predictions_only` == True, (loss, predictions, answers, contexts) otherwise - loss - predictions: a List of Lists of strings - answers - contexts - """ - - dataset = Bitod() - e2e_dialogue_preds = dict() - - predictions = [] - example_ids = [] - answers = [] - contexts = [] - - # TODO: handle multiple responses - hyperparameter_idx = 0 - - cur_dial_id = '' - knowledge = None - - device = self.device - args = self.args - - special_tokens = self.numericalizer._tokenizer.all_special_tokens - - for k, turn in enumerate(progress_bar(data_iterator, desc='Generating', disable=disable_progbar)): - batch_size = len(turn.example_id) - assert batch_size == 1 - batch_prediction = [] - batch_example_ids = turn.example_id - - example_ids += batch_example_ids - - task_name, dial_id, turn_id, train_target = example_ids[-1].split('/') - turn_id = int(turn_id) - - if cur_dial_id != dial_id: - # new dialogue - cur_dial_id = dial_id - dialogue_state = {} - # new_state_text = 'null' - knowledge = defaultdict(dict) - new_knowledge_text = 'null' - new_actions_text = 'null' - active_api = None - e2e_dialogue_preds[dial_id] = {"turns": defaultdict(dict), "API": defaultdict(dict)} - - batch_context = [] - batch_tokens = self.numericalizer.convert_ids_to_tokens(turn.context.value.data, skip_special_tokens=False) - - # remove only beginning and trailing special tokens - # otherwise the sep_token added between context and question will be lost - for text in batch_tokens: - i = 0 - while text[i] in special_tokens: - i += 1 - j = len(text) - 1 - while text[j] in special_tokens: - j -= 1 - text = text[i : j + 1] - - batch_context.append(self.numericalizer._tokenizer.convert_tokens_to_string(text)) - - contexts += batch_context - - if not output_predictions_only: - batch_answer = self.numericalizer.reverse(turn.answer.value.data, 'answer') - batch_answer = [ - task.postprocess_prediction(batch_example_ids[i], batch_answer[i]) for i in range(len(batch_answer)) - ] - answers += batch_answer - - new_state_text = dataset.state2span(dialogue_state) - - if train_target == 'dst': - input_text = replace_capturing_group(contexts[-1], dataset.state_re, new_state_text) - - ## we always use gold history following common practice - ## if you want to use predicted response instead of gold uncomment the following - # last_sys_pred = predictions[-1][0].strip() - # input_text = replace_match(input_text, last_system_re, last_sys_pred) - - elif train_target == 'api': - - # replace state - input_text = replace_capturing_group(contexts[-1], dataset.state_re, new_state_text) - - elif train_target == 'da': - # replace state - input_text = replace_capturing_group(contexts[-1], dataset.state_re, new_state_text) - - # replace knowledge - input_text = replace_capturing_group(input_text, dataset.knowledge_re, new_knowledge_text) - - elif train_target == 'rg': - - # replace actions - input_text = replace_capturing_group(contexts[-1], dataset.actions_re, new_actions_text) - - else: - raise ValueError(f'Invalid train_target: {train_target}') - - # replace old context with updated - contexts[-1] = input_text - - tokenized_contexts = self.numericalizer.encode_batch([input_text], field_name='context', features=None)[0] - - numericalized_turn = 
NumericalizedExamples( - example_id=[turn.example_id[0]], - context=SequentialField( - value=torch.tensor([tokenized_contexts.value], device=device), - length=torch.tensor([tokenized_contexts.length], device=device), - limited=torch.tensor([tokenized_contexts.limited], device=device), - feature=None, - ), - answer=SequentialField(value=None, length=None, limited=None, feature=None), - ) - - generated = self.generate( - numericalized_turn, - max_output_length=args.max_output_length, - min_output_length=args.min_output_length, - num_outputs=args.num_outputs[hyperparameter_idx], - temperature=args.temperature[hyperparameter_idx] if args.temperature[hyperparameter_idx] > 0 else 1.0, - repetition_penalty=args.repetition_penalty[hyperparameter_idx], - top_k=args.top_k[hyperparameter_idx], - top_p=args.top_p[hyperparameter_idx], - num_beams=args.num_beams[hyperparameter_idx], - num_beam_groups=args.num_beam_groups[hyperparameter_idx], - diversity_penalty=args.diversity_penalty[hyperparameter_idx], - no_repeat_ngram_size=args.no_repeat_ngram_size[hyperparameter_idx], - do_sample=args.temperature[hyperparameter_idx] != 0, - ) - - partial_batch_prediction_ids = generated.sequences - - partial_batch_prediction = self.numericalizer.reverse(partial_batch_prediction_ids, 'answer')[0] - - if train_target == 'da': - partial_batch_prediction = dataset.postprocess_prediction( - partial_batch_prediction, knowledge, lang=self.numericalizer._tokenizer.src_lang[:2] - ) - - partial_batch_prediction = task.postprocess_prediction(batch_example_ids[0], partial_batch_prediction) - - # put them into the right array - batch_prediction.append([partial_batch_prediction]) - - predictions += batch_prediction - - if train_target == 'dst': - # update dialogue_state - lev = predictions[-1][0].strip() - state_update = dataset.span2state(lev) - if state_update: - active_api = list(state_update.keys())[-1] - dataset.update_state(state_update, dialogue_state) - - #### save latest state - state_to_record = copy.deepcopy(dialogue_state) - state_to_record = {dataset.domain2api_name(k): v for k, v in state_to_record.items()} - e2e_dialogue_preds[dial_id]["turns"][str(turn_id)]["state"] = state_to_record - #### - - elif train_target == 'api': - if dataset.do_knowledge_reset(active_api): - new_knowledge_text = "null" - knowledge = defaultdict(dict) - - do_api_call = predictions[-1][0].strip() - - if do_api_call == 'yes': - # make api call - api_name = active_api - if api_name in dialogue_state: - constraints, new_knowledge_text = dataset.make_api_call( - dialogue_state, knowledge, api_name, self.numericalizer._tokenizer.src_lang, dial_id, turn_id - ) - #### save latest api constraints - e2e_dialogue_preds[dial_id]["API"][dataset.domain2api_name(api_name)] = copy.deepcopy(constraints) - #### - - elif do_api_call == 'no': - # do nothing - pass - else: - logger.error( - f'API call should be either yes or no but got {do_api_call}. Seems model is not trained for enough steps. 
For now we assume it\'s a no' - ) - - #### save latest api results - e2e_dialogue_preds[dial_id]["turns"][str(turn_id)]["api"] = new_knowledge_text - #### - - elif train_target == 'da': - new_actions_text = predictions[-1][0] - #### save latest actions - e2e_dialogue_preds[dial_id]["turns"][str(turn_id)]["actions"] = predictions[-1][0] - #### - - elif train_target == 'rg': - #### save latest response - e2e_dialogue_preds[dial_id]["turns"][str(turn_id)]["response"] = predictions[-1] - #### - - if eval_dir: - with open(os.path.join(eval_dir, 'e2e_dialogue_preds.json'), 'w') as fout: - ujson.dump(e2e_dialogue_preds, fout, indent=2, ensure_ascii=False) - - if original_order is not None: - # sort back to the original order - original_order, example_ids, predictions, answers, contexts = [ - list(a) for a in tuple(zip(*sorted(list(zip(original_order, example_ids, predictions, answers, contexts))))) - ] - - # TODO calculate and return loss - loss = None - output = GenerationOutput(loss=loss) - - if output_predictions_only: - output.predictions = predictions - else: - output.example_ids, output.predictions, output.answers, output.contexts = ( - example_ids, - predictions, - answers, - contexts, - ) - - return output - def confidence_features(self, batch, predictions, mc_dropout_num=0) -> List[ConfidenceFeatures]: """ predictions: Tensor of shape (batch_size, output_length) diff --git a/genienlp/models/transformer_sequence_classification.py b/genienlp/models/transformer_sequence_classification.py index 1ddca860..550ba81a 100644 --- a/genienlp/models/transformer_sequence_classification.py +++ b/genienlp/models/transformer_sequence_classification.py @@ -32,12 +32,12 @@ from transformers import AutoModelForSequenceClassification from ..data_utils.numericalizer import TransformerNumericalizer -from .transformer_token_classification import TransformerForTokenClassification +from ..models.base import GenieModelForClassification logger = logging.getLogger(__name__) -class TransformerForSequenceClassification(TransformerForTokenClassification): +class TransformerForSequenceClassification(GenieModelForClassification): def __init__(self, config=None, *inputs, args, tasks, vocab_sets, save_directory=None, **kwargs): self._init_common(args, tasks, **kwargs) @@ -62,5 +62,4 @@ def __init__(self, config=None, *inputs, args, tasks, vocab_sets, save_directory ) self.model.resize_token_embeddings(self.numericalizer.num_tokens) - self.numericalizer.answer_pad_id = -100 diff --git a/genienlp/models/transformer_token_classification.py b/genienlp/models/transformer_token_classification.py index 491392ab..a5396414 100644 --- a/genienlp/models/transformer_token_classification.py +++ b/genienlp/models/transformer_token_classification.py @@ -29,20 +29,19 @@ import logging -import torch -from transformers import AutoConfig, AutoModelForTokenClassification +from transformers import AutoModelForTokenClassification from ..data_utils.numericalizer import TransformerNumericalizer -from ..data_utils.progbar import progress_bar -from ..models.base import GenieModel -from ..util import GenerationOutput, adjust_language_code +from ..models.base import GenieModelForClassification logger = logging.getLogger(__name__) -class TransformerForTokenClassification(GenieModel): +class TransformerForTokenClassification(GenieModelForClassification): def __init__(self, config=None, *inputs, args, tasks, vocab_sets, save_directory=None, **kwargs): + self._init_common(args, tasks, **kwargs) + if save_directory is not None: self.model = 
AutoModelForTokenClassification.from_config(self.config) else: @@ -64,116 +63,3 @@ def __init__(self, config=None, *inputs, args, tasks, vocab_sets, save_directory self.model.resize_token_embeddings(self.numericalizer.num_tokens) self.numericalizer.answer_pad_id = -100 - - def _init_common(self, args, tasks, **kwargs): - self.args = args - num_labels = 0 - if args.num_labels is not None: - num_labels = args.num_labels - else: - for task in tasks: - # if having multiple tasks choose max num_labels - if hasattr(task, 'num_labels'): - num_labels = max(num_labels, task.num_labels) - - config = AutoConfig.from_pretrained( - args.pretrained_model, cache_dir=args.embeddings, num_labels=num_labels, finetuning_task='ned' - ) - super().__init__(config) - - if hasattr(config, 'd_model'): - args.dimension = config.d_model - else: - args.dimension = config.hidden_size - - self.src_lang, self.tgt_lang = adjust_language_code( - config, args.pretrained_model, kwargs.get('src_lang', 'en'), kwargs.get('tgt_lang', 'en') - ) - - def add_new_vocab_from_data(self, tasks, resize_decoder=False): - super().add_new_vocab_from_data(tasks, resize_decoder) - self.model.resize_token_embeddings(self.numericalizer.num_tokens) - - def forward(self, *input, **kwargs): - if self.training: - batch = input[0] - outputs = self.model( - batch.context.value, - labels=batch.answer.value, - attention_mask=(batch.context.value != self.numericalizer.pad_id), - ) - return outputs - else: - return self.model(**kwargs) - - def validate(self, data_iterator, task, original_order=None, disable_progbar=True, **kwargs): - total_loss = 0.0 - all_example_ids = [] - all_answers = [] - all_contexts = [] - all_predictions = [] - - for batch in progress_bar(data_iterator, desc='Generating', disable=disable_progbar): - batch_example_ids = batch.example_id - - batch_context = self.numericalizer.reverse(batch.context.value.data, 'context') - - all_example_ids += batch_example_ids - - # pass labels to get loss - output = self.forward( - input_ids=batch.context.value, - attention_mask=(batch.context.value != self.numericalizer.pad_id), - labels=batch.answer.value, - ) - - labels = batch.answer.value.tolist() - - logits = output.logits - predictions = torch.argmax(logits, dim=-1).tolist() - - # logits for sequence classification is 2 dimensional - if logits.dim() == 2: - predictions = [[p] for p in predictions] - - # Remove ignored index (special tokens) - processed_preds = [] - processed_labels = [] - for pred, label in zip(predictions, labels): - preds_list = [] - labels_list = [] - for p_, l_ in zip(pred, label): - if l_ == self.numericalizer.answer_pad_id: - continue - preds_list.append(task.id2label[p_]) - labels_list.append(task.id2label[l_]) - - processed_preds.append([" ".join(preds_list)]) - processed_labels.append(" ".join(labels_list)) - - all_contexts += batch_context - all_answers += processed_labels - all_predictions += processed_preds - - total_loss += output.loss - - total_loss /= len(all_example_ids) - - if original_order is not None: - # sort back to the original order - original_order, all_example_ids, all_predictions, all_answers, all_contexts = [ - list(a) - for a in tuple( - zip(*sorted(list(zip(original_order, all_example_ids, all_predictions, all_answers, all_contexts)))) - ) - ] - - output = GenerationOutput( - loss=total_loss, - example_ids=all_example_ids, - contexts=all_contexts, - answers=all_answers, - predictions=all_predictions, - ) - - return output From f26cd1278729616c45c747b4f0a2108dc00fc950 Mon Sep 17 00:00:00 2001 
From: mehrad Date: Mon, 28 Feb 2022 14:01:55 -0800 Subject: [PATCH 14/19] base: bug fixes --- genienlp/models/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/genienlp/models/base.py b/genienlp/models/base.py index be2301c1..2f4ee597 100644 --- a/genienlp/models/base.py +++ b/genienlp/models/base.py @@ -130,12 +130,12 @@ def validate( if self.numericalizer._tokenizer.tgt_lang: tgt_lang = self.numericalizer._tokenizer.tgt_lang else: - tgt_lang = self.model.orig_tgt_lang + tgt_lang = self.orig_tgt_lang if self.numericalizer._tokenizer.src_lang: src_lang = self.numericalizer._tokenizer.src_lang else: - src_lang = self.model.orig_src_lang + src_lang = self.orig_src_lang date_parser = default_loader.get_locale(src_lang[:2]) @@ -203,7 +203,7 @@ def validate( # postprocess prediction ids kwargs = { - 'self.numericalizer': self.numericalizer, + 'numericalizer': self.numericalizer, 'cross_attentions': cross_attentions, 'tgt_lang': tgt_lang, 'date_parser': date_parser, @@ -225,7 +225,7 @@ def validate( partial_batch_prediction = partial_batch_words else: if output_confidence_features or output_confidence_scores: - partial_batch_confidence_features = self.model.confidence_features( + partial_batch_confidence_features = self.confidence_features( batch=batch, predictions=partial_batch_prediction_ids, mc_dropout_num=self.args.mc_dropout_num ) partial_batch_prediction = self.numericalizer.reverse(partial_batch_prediction_ids, 'answer') From c4ac2cd191f76ff0fa6d044697628e8569d0a0af Mon Sep 17 00:00:00 2001 From: mehrad Date: Mon, 28 Feb 2022 14:03:46 -0800 Subject: [PATCH 15/19] Drop removed nf1 metric --- genienlp/tasks/almond_task.py | 2 +- genienlp/tasks/base_task.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/genienlp/tasks/almond_task.py b/genienlp/tasks/almond_task.py index 1c05bef8..dbc16890 100644 --- a/genienlp/tasks/almond_task.py +++ b/genienlp/tasks/almond_task.py @@ -211,7 +211,7 @@ class NaturalSeq2Seq(BaseAlmondTask): def __init__(self, name, args): super().__init__(name, args) - self._metrics = ['bleu', 'em', 'nf1'] + self._metrics = ['bleu', 'em'] def _is_program_field(self, field_name): return False diff --git a/genienlp/tasks/base_task.py b/genienlp/tasks/base_task.py index b88ac2cf..1714318d 100644 --- a/genienlp/tasks/base_task.py +++ b/genienlp/tasks/base_task.py @@ -42,7 +42,7 @@ class BaseTask(object): def __init__(self, name, args): self.name = name self.args = args - self._metrics = ['em', 'nem', 'nf1'] + self._metrics = ['em', 'nem'] # special task-specific tokens that should not be subword tokenized self.special_tokens = set() self.override_context = args.override_context From 939f9d08045be0be88e3e673bc297d30cba3e728 Mon Sep 17 00:00:00 2001 From: mehrad Date: Tue, 1 Mar 2022 14:48:49 -0800 Subject: [PATCH 16/19] metrics: add rouge score --- genienlp/metrics.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/genienlp/metrics.py b/genienlp/metrics.py index 63636651..a186c8b3 100644 --- a/genienlp/metrics.py +++ b/genienlp/metrics.py @@ -29,7 +29,7 @@ import logging from collections import Counter, OrderedDict, defaultdict -from typing import Iterable, Union +from typing import List, Union import sacrebleu from datasets import load_metric @@ -84,6 +84,12 @@ def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): return max(scores_for_ground_truths) +def computeROUGE(outputs, targets, rouge_types): + targets = [target[0] for target in targets] + rouge_metric = 
load_metric('rouge') + return rouge_metric.compute(references=targets, predictions=outputs, rouge_types=rouge_types) + + def computeSequenceClassificationPrecision(outputs, targets): targets = [target[0] for target in targets] precision_metric = load_metric('precision') @@ -267,12 +273,12 @@ def compute_ner_f1(predictions, answers, schema='IOB2'): def compute_metrics( - predictions: Iterable[str], - answers: Union[Iterable[str], Iterable[Iterable[str]]], - requested_metrics: Iterable, + predictions: List[str], + answers: Union[List[str], List[List[str]]], + requested_metrics: List, lang: str, args, - example_ids: Iterable[str] = None, + example_ids: List[str] = None, ): """ Inputs: @@ -359,6 +365,13 @@ def compute_metrics( ner_f1 = compute_ner_f1(predictions, answers) metric_keys.append('ner_f1') metric_values.append(ner_f1) + for m in ['rouge1', 'rouge2', 'rougeL']: + if m in requested_metrics: + rouge = computeROUGE(predictions, answers, rouge_types=[m])[m] + requested_metrics.remove(m) + requested_metrics += [f'{m}_low', f'{m}_mid', f'{m}_high'] + metric_keys += [f'{m}_low', f'{m}_mid', f'{m}_high'] + metric_values += [rouge.low.fmeasure, rouge.mid.fmeasure, rouge.high.fmeasure] metric_dict = dict(zip(metric_keys, metric_values)) metric_dict = OrderedDict((key, metric_dict[key]) for key in requested_metrics) From 8833a1b4b515de31f2882872bf08431c975b8078 Mon Sep 17 00:00:00 2001 From: mehrad Date: Tue, 1 Mar 2022 20:28:09 -0800 Subject: [PATCH 17/19] Simplify validation code --- genienlp/metrics.py | 2 +- genienlp/models/base.py | 27 +++++++++ genienlp/predict.py | 9 +-- genienlp/server.py | 11 ++-- genienlp/train.py | 28 ++++++++- genienlp/util.py | 31 ++++++++++ genienlp/validate.py | 125 ---------------------------------------- 7 files changed, 95 insertions(+), 138 deletions(-) delete mode 100644 genienlp/validate.py diff --git a/genienlp/metrics.py b/genienlp/metrics.py index a186c8b3..17fba9bb 100644 --- a/genienlp/metrics.py +++ b/genienlp/metrics.py @@ -287,7 +287,7 @@ def compute_metrics( requested_metrics: contains a subset of the following metrics em (exact match) sm (structure match): valid if the output is ThingTalk code. Whether the gold answer and prediction are identical if we ignore parameter values of ThingTalk programs - #TODO add all + # TODO add all lang: the language of the predictions and answers. Used for BERTScore. 
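As an aside, here is a minimal standalone sketch of what the new ROUGE path computes. It mirrors the computeROUGE helper and the low/mid/high expansion added in this patch, but the example strings are made up and the snippet itself is not part of the diff:

    from datasets import load_metric

    # genienlp answers are lists of references; computeROUGE keeps only the first one
    predictions = ['the cat sat on the mat']
    answers = [['the cat is sitting on the mat']]

    rouge_metric = load_metric('rouge')
    result = rouge_metric.compute(
        references=[target[0] for target in answers],
        predictions=predictions,
        rouge_types=['rougeL'],
    )

    # each entry is an AggregateScore with bootstrapped low/mid/high bounds;
    # the patch reports the f-measure of each bound as rougeL_low / rougeL_mid / rougeL_high
    score = result['rougeL']
    print(score.low.fmeasure, score.mid.fmeasure, score.high.fmeasure)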
args: arguments example_ids: used to calculate some of e2e dialogue metrics that need to know span of each dialogue such as JGA diff --git a/genienlp/models/base.py b/genienlp/models/base.py index 2f4ee597..d97f662f 100644 --- a/genienlp/models/base.py +++ b/genienlp/models/base.py @@ -99,6 +99,33 @@ def set_generation_output_options(self, tasks): # TransformerSeq2Seq and TransformerLSTM will inherit from this model class GenieModelForGeneration(GenieModel): def validate( + self, + data_iterator, + task, + eval_dir=None, + output_predictions_only=False, + output_confidence_features=False, + original_order=None, + confidence_estimators=None, + disable_progbar=True, + **kwargs, + ): + if self.args.e2e_dialogue_evaluation: + return self.validate_e2e_dialogues( + data_iterator, task, eval_dir, output_predictions_only, original_order, disable_progbar + ) + else: + return self.validate_batch( + data_iterator, + task, + output_predictions_only, + output_confidence_features, + original_order, + confidence_estimators, + disable_progbar, + ) + + def validate_batch( self, data_iterator, task, diff --git a/genienlp/predict.py b/genienlp/predict.py index c372770b..5d8e7254 100644 --- a/genienlp/predict.py +++ b/genienlp/predict.py @@ -62,7 +62,6 @@ set_seed, split_folder_on_disk, ) -from .validate import generate_with_model logger = logging.getLogger(__name__) @@ -472,16 +471,14 @@ def run(args, device): confidence_estimators = None with torch.no_grad(), torch.cuda.amp.autocast(enabled=args.mixed_precision): - generation_output = generate_with_model( - model, + generation_output = model.validate( it, task, - args, - original_order=original_order, + eval_dir=eval_dir, output_confidence_features=args.save_confidence_features, + original_order=original_order, confidence_estimators=confidence_estimators, disable_progbar=False, - eval_dir=eval_dir, ) if args.save_confidence_features: diff --git a/genienlp/server.py b/genienlp/server.py index a348a0e7..d1c679fc 100644 --- a/genienlp/server.py +++ b/genienlp/server.py @@ -46,7 +46,6 @@ from .ned.ned_utils import init_ned_model from .tasks.registry import get_tasks from .util import adjust_language_code, get_devices, load_config_json, log_model_size, set_seed -from .validate import generate_with_model logger = logging.getLogger(__name__) @@ -213,11 +212,9 @@ def _numericalize_request(self, request, task, args): def _predict_batch(self, batch, task, args): if args.calibrator_paths is not None: - output = generate_with_model( - self.model, + output = self.model.validate( [batch], task, - args, output_predictions_only=True, confidence_estimators=self.confidence_estimators, ) @@ -238,7 +235,11 @@ def _predict_batch(self, batch, task, args): instance['score'][self.estimator_filenames[e_idx]] = float(estimator_scores[idx]) response.append(instance) else: - output = generate_with_model(self.model, [batch], task, args, output_predictions_only=True) + output = self.model.validate( + [batch], + task, + output_predictions_only=True, + ) if sum(args.num_outputs) > 1: response = [] for idx, predictions in enumerate(output.predictions): diff --git a/genienlp/train.py b/genienlp/train.py index c71ac23b..2fbfe9f1 100644 --- a/genienlp/train.py +++ b/genienlp/train.py @@ -43,6 +43,7 @@ from . 
import arguments, models from .arguments import save_args +from .metrics import calculate_and_reduce_metrics from .model_utils.optimizer import init_opt from .model_utils.parallel_utils import NamedTupleCompatibleDataParallel from .model_utils.saver import Saver @@ -54,9 +55,9 @@ log_model_size, make_data_loader, ned_dump_entity_type_pairs, + print_results, set_seed, ) -from .validate import print_results, validate def initialize_logger(args): @@ -221,6 +222,31 @@ def should_log(iteration, log_every): return iteration % log_every == 0 +def validate(task, val_iter, model, args, num_print=10): + with torch.no_grad(): + model.eval() + if isinstance(model, torch.nn.DataParallel): + # get rid of the DataParallel wrapper + model = model.module + + generation_output = model.validate(val_iter, task) + + # loss is already calculated + metrics_to_return = [metric for metric in task.metrics if metric != 'loss'] + + metrics = calculate_and_reduce_metrics(args, generation_output, metrics_to_return, model.tgt_lang) + + results = { + 'model prediction': generation_output.predictions, + 'gold answer': generation_output.answers, + 'context': generation_output.contexts, + } + + print_results(results, num_print) + + return generation_output, metrics + + def do_validate(iteration, args, model, val_iters, *, train_task, round_progress, task_progress, writer, logger): deca_score = 0 for val_task_idx, (val_task, val_iter) in enumerate(val_iters): diff --git a/genienlp/util.py b/genienlp/util.py index bce36453..551ae0eb 100644 --- a/genienlp/util.py +++ b/genienlp/util.py @@ -34,6 +34,7 @@ import random import re import shutil +import sys import time from json.decoder import JSONDecodeError from typing import List, Optional @@ -1013,3 +1014,33 @@ def replace_capturing_group(input, re_pattern, replacement): else: new_input = input return new_input + + +def print_results(results, num_print): + print() + + values = list(results.values()) + num_examples = len(values[0]) + + # examples are sorted by length + # to get good diversity, get half of examples from second quartile + start = int(num_examples / 4) + end = start + int(num_print / 2) + first_list = [val[start:end] for val in values] + + # and the other half from fourth quartile + start = int(3 * num_examples / 4) + end = start + num_print - int(num_print / 2) + second_list = [val[start:end] for val in values] + + # join examples + processed_values = [first + second for first, second in zip(first_list, second_list)] + + for ex_idx in range(len(processed_values[0])): + for key_idx, key in enumerate(results.keys()): + value = processed_values[key_idx][ex_idx] + v = value[0] if isinstance(value, list) else value + key_width = max(len(key) for key in results) + print(f'{key:>{key_width}}: {repr(v)}') + print() + sys.stdout.flush() diff --git a/genienlp/validate.py b/genienlp/validate.py deleted file mode 100644 index 1ced3b73..00000000 --- a/genienlp/validate.py +++ /dev/null @@ -1,125 +0,0 @@ -# -# Copyright (c) 2018, Salesforce, Inc. -# The Board of Trustees of the Leland Stanford Junior University -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. 
-# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import logging -import sys - -import torch - -from .metrics import calculate_and_reduce_metrics - -logger = logging.getLogger(__name__) - - -def generate_with_model( - model, - data_iterator, - task, - args, - output_predictions_only=False, - output_confidence_features=False, - original_order=None, - confidence_estimators=None, - disable_progbar=True, - eval_dir=None, -): - if args.e2e_dialogue_evaluation: - return model.validate_e2e_dialogues( - data_iterator, - task, - eval_dir=eval_dir, - output_predictions_only=output_predictions_only, - original_order=original_order, - disable_progbar=disable_progbar, - ) - else: - return model.validate( - data_iterator, - task, - output_predictions_only=output_predictions_only, - output_confidence_features=output_confidence_features, - original_order=original_order, - confidence_estimators=confidence_estimators, - disable_progbar=disable_progbar, - ) - - -def print_results(results, num_print): - print() - - values = list(results.values()) - num_examples = len(values[0]) - - # examples are sorted by length - # to get good diversity, get half of examples from second quartile - start = int(num_examples / 4) - end = start + int(num_print / 2) - first_list = [val[start:end] for val in values] - - # and the other half from fourth quartile - start = int(3 * num_examples / 4) - end = start + num_print - int(num_print / 2) - second_list = [val[start:end] for val in values] - - # join examples - processed_values = [first + second for first, second in zip(first_list, second_list)] - - for ex_idx in range(len(processed_values[0])): - for key_idx, key in enumerate(results.keys()): - value = processed_values[key_idx][ex_idx] - v = value[0] if isinstance(value, list) else value - key_width = max(len(key) for key in results) - print(f'{key:>{key_width}}: {repr(v)}') - print() - sys.stdout.flush() - - -def validate(task, val_iter, model, args, num_print=10): - with torch.no_grad(): - model.eval() - if isinstance(model, torch.nn.DataParallel): - # get rid of the DataParallel wrapper - model = model.module - - generation_output = generate_with_model(model, val_iter, task, args) - - # loss is already calculated - metrics_to_return = [metric for metric in task.metrics if metric != 'loss'] - - metrics = calculate_and_reduce_metrics(args, generation_output, metrics_to_return, 
model.tgt_lang) - - results = { - 'model prediction': generation_output.predictions, - 'gold answer': generation_output.answers, - 'context': generation_output.contexts, - } - - print_results(results, num_print) - - return generation_output, metrics From e2cad6bb0292102eec366b0a9c178ab2d65fb51e Mon Sep 17 00:00:00 2001 From: mehrad Date: Wed, 2 Mar 2022 12:49:16 -0800 Subject: [PATCH 18/19] train: validate --> validate_while_training --- genienlp/train.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/genienlp/train.py b/genienlp/train.py index 2fbfe9f1..a465f9b2 100644 --- a/genienlp/train.py +++ b/genienlp/train.py @@ -204,7 +204,7 @@ def update_fraction(args, task_iteration): return fraction -def should_validate(iteration, val_every, resume, start_iteration): +def should_validate_while_training(iteration, val_every, resume, start_iteration): if val_every is None: return False return (iteration % val_every == 0) or (resume and iteration == start_iteration) @@ -222,7 +222,7 @@ def should_log(iteration, log_every): return iteration % log_every == 0 -def validate(task, val_iter, model, args, num_print=10): +def validate_while_training(task, val_iter, model, args, num_print=10): with torch.no_grad(): model.eval() if isinstance(model, torch.nn.DataParallel): @@ -247,10 +247,12 @@ def validate(task, val_iter, model, args, num_print=10): return generation_output, metrics -def do_validate(iteration, args, model, val_iters, *, train_task, round_progress, task_progress, writer, logger): +def do_validate_while_training( + iteration, args, model, val_iters, *, train_task, round_progress, task_progress, writer, logger +): deca_score = 0 for val_task_idx, (val_task, val_iter) in enumerate(val_iters): - output, metric_dict = validate(val_task, val_iter, model, args, num_print=args.num_print) + output, metric_dict = validate_while_training(val_task, val_iter, model, args, num_print=args.num_print) val_loss = output.loss if val_loss is not None: log_entry = f'{args.timestamp}:{elapsed_time(logger)}:iteration_{iteration}:{round_progress}train_{train_task.name}:{task_progress}val_{val_task.name}:val_loss_{val_loss:.4f}:' @@ -576,7 +578,7 @@ def train( local_loss = 0 # validate - if should_validate(iteration, val_every, resume=args.resume, start_iteration=start_iteration): + if should_validate_while_training(iteration, val_every, resume=args.resume, start_iteration=start_iteration): if args.print_train_examples_too: results = { 'answer': numericalizer.reverse(batch.answer.value.data, 'answer'), @@ -585,7 +587,7 @@ def train( num_print = min(len(results['answer']), args.num_print) print_results(results, num_print) - deca_score = do_validate( + deca_score = do_validate_while_training( iteration, args, model, From 91c30319ad125b90f55ab62aba0bc10d19cd8eea Mon Sep 17 00:00:00 2001 From: mehrad Date: Wed, 2 Mar 2022 12:52:49 -0800 Subject: [PATCH 19/19] generation_output --> validation_output --- genienlp/metrics.py | 8 ++++---- genienlp/models/base.py | 35 +++++++++++++++++++++++++++++++---- genienlp/predict.py | 40 ++++++++++++++++++++-------------------- genienlp/train.py | 12 ++++++------ genienlp/util.py | 28 +--------------------------- 5 files changed, 62 insertions(+), 61 deletions(-) diff --git a/genienlp/metrics.py b/genienlp/metrics.py index 17fba9bb..b8680e78 100644 --- a/genienlp/metrics.py +++ b/genienlp/metrics.py @@ -378,11 +378,11 @@ def compute_metrics( return metric_dict -def calculate_and_reduce_metrics(args, generation_output, metrics_to_compute, lang): +def 
calculate_and_reduce_metrics(args, validation_output, metrics_to_compute, lang): metrics = OrderedDict() - example_ids = generation_output.example_ids - predictions = generation_output.predictions - answers = generation_output.answers + example_ids = validation_output.example_ids + predictions = validation_output.predictions + answers = validation_output.answers if args.reduce_metrics == 'max': for i in range(len(predictions[0])): # for each output (in case of multiple outputs) diff --git a/genienlp/models/base.py b/genienlp/models/base.py index d97f662f..e91c0e16 100644 --- a/genienlp/models/base.py +++ b/genienlp/models/base.py @@ -31,6 +31,7 @@ import logging import os from collections import defaultdict +from typing import List, Optional import torch import ujson @@ -41,7 +42,7 @@ from ..data_utils.example import NumericalizedExamples, SequentialField from ..data_utils.numericalizer import TransformerNumericalizer from ..data_utils.progbar import progress_bar -from ..util import GenerationOutput, adjust_language_code, merge_translated_sentences, replace_capturing_group +from ..util import adjust_language_code, merge_translated_sentences, replace_capturing_group logger = logging.getLogger(__name__) @@ -96,6 +97,32 @@ def set_generation_output_options(self, tasks): self._output_hidden_states = False +class ValidationOutput(object): + """ + Contains all the information that model's validate() method may output + """ + + def __init__( + self, + loss: Optional[float] = None, + example_ids: Optional[List] = None, + predictions: Optional[List] = None, + raw_predictions: Optional[List] = None, + answers: Optional[List] = None, + contexts: Optional[List] = None, + confidence_features: Optional[List] = None, + confidence_scores: Optional[List] = None, + ): + self.loss = loss + self.example_ids = example_ids + self.predictions = predictions + self.raw_predictions = raw_predictions + self.answers = answers + self.contexts = contexts + self.confidence_features = confidence_features + self.confidence_scores = confidence_scores + + # TransformerSeq2Seq and TransformerLSTM will inherit from this model class GenieModelForGeneration(GenieModel): def validate( @@ -324,7 +351,7 @@ def get_example_index(i): self.numericalizer._tokenizer.tgt_lang, ) - output = GenerationOutput(loss=total_loss) + output = ValidationOutput(loss=total_loss) if output_predictions_only: output.predictions = predictions @@ -576,7 +603,7 @@ def validate_e2e_dialogues( # TODO calculate and return loss loss = None - output = GenerationOutput(loss=loss) + output = ValidationOutput(loss=loss) if output_predictions_only: output.predictions = predictions @@ -696,7 +723,7 @@ def validate(self, data_iterator, task, original_order=None, disable_progbar=Tru ) ] - output = GenerationOutput( + output = ValidationOutput( loss=total_loss, example_ids=all_example_ids, contexts=all_contexts, diff --git a/genienlp/predict.py b/genienlp/predict.py index 5d8e7254..0793da34 100644 --- a/genienlp/predict.py +++ b/genienlp/predict.py @@ -379,16 +379,16 @@ def prepare_data_iterators(args, val_sets, numericalizer, device): return iters -def create_output_lines(args, index, generation_output): - predictions = generation_output.raw_predictions if args.translate_return_raw_outputs else generation_output.predictions +def create_output_lines(args, index, validation_output): + predictions = validation_output.raw_predictions if args.translate_return_raw_outputs else validation_output.predictions if args.one_output_per_line: lines = [ '\t'.join( [ - 
generation_output.example_ids[index], + validation_output.example_ids[index], prediction, - generation_output.answers[index], - generation_output.contexts[index], + validation_output.answers[index], + validation_output.contexts[index], ] ) for prediction in predictions[index] @@ -397,15 +397,15 @@ def create_output_lines(args, index, generation_output): lines = [ '\t'.join( [ - generation_output.example_ids[index], + validation_output.example_ids[index], *predictions[index], - generation_output.answers[index], - generation_output.contexts[index], + validation_output.answers[index], + validation_output.contexts[index], ] ) ] # one line with all generation outputs separated by '\t' if args.calibrator_paths is not None: - for score in generation_output.confidence_scores: + for score in validation_output.confidence_scores: lines = [line + '\t' + str(score[index]) for line in lines] # append score to all lines return lines @@ -471,7 +471,7 @@ def run(args, device): confidence_estimators = None with torch.no_grad(), torch.cuda.amp.autocast(enabled=args.mixed_precision): - generation_output = model.validate( + validation_output = model.validate( it, task, eval_dir=eval_dir, @@ -482,45 +482,45 @@ def run(args, device): ) if args.save_confidence_features: - torch.save(generation_output.confidence_features, args.confidence_feature_path) + torch.save(validation_output.confidence_features, args.confidence_feature_path) # write into file # TODO change to jsonl format with open(prediction_file_name, 'w' + ('' if args.overwrite else '+')) as prediction_file: - for i in range(len(generation_output.example_ids)): - lines = create_output_lines(args, i, generation_output) + for i in range(len(validation_output.example_ids)): + lines = create_output_lines(args, i, validation_output) prediction_file.write('\n'.join(lines) + '\n') if args.translate_return_raw_outputs: with open(raw_prediction_file_name, 'w' + ('' if args.overwrite else '+')) as prediction_file: - for i in range(len(generation_output.example_ids)): - lines = create_output_lines(args, i, generation_output) + for i in range(len(validation_output.example_ids)): + lines = create_output_lines(args, i, validation_output) prediction_file.write('\n'.join(lines) + '\n') - if len(generation_output.answers) > 0: + if len(validation_output.answers) > 0: metrics_to_compute = get_metrics_to_compute(args, task) - metrics = calculate_and_reduce_metrics(args, generation_output, metrics_to_compute, tgt_lang) + metrics = calculate_and_reduce_metrics(args, validation_output, metrics_to_compute, tgt_lang) with open(results_file_name, 'w' + ('' if args.overwrite else '+')) as results_file: results_file.write(json.dumps(metrics) + '\n') if not args.silent: for i, (c, p, a) in enumerate( - zip(generation_output.contexts, generation_output.predictions, generation_output.answers) + zip(validation_output.contexts, validation_output.predictions, validation_output.answers) ): log_string = '\n'.join( [f'Context {i + 1}: {c}', f'Prediction {i + 1} ({len(p)} outputs): {p}', f'Answer {i + 1}: {a}'] ) if args.calibrator_paths is not None: log_string += f'Confidence {i + 1} : ' - for score in generation_output.confidence_scores: + for score in validation_output.confidence_scores: log_string += f'{score[i]:.3f}, ' log_string += '\n' logger.info(log_string) logger.info(metrics) - task_scores[task].append((len(generation_output.answers), metrics[task.metrics[0]])) + task_scores[task].append((len(validation_output.answers), metrics[task.metrics[0]])) decaScore = [] for task in 
task_scores.keys(): diff --git a/genienlp/train.py b/genienlp/train.py index a465f9b2..d0c6f5ac 100644 --- a/genienlp/train.py +++ b/genienlp/train.py @@ -229,22 +229,22 @@ def validate_while_training(task, val_iter, model, args, num_print=10): # get rid of the DataParallel wrapper model = model.module - generation_output = model.validate(val_iter, task) + validation_output = model.validate(val_iter, task) # loss is already calculated metrics_to_return = [metric for metric in task.metrics if metric != 'loss'] - metrics = calculate_and_reduce_metrics(args, generation_output, metrics_to_return, model.tgt_lang) + metrics = calculate_and_reduce_metrics(args, validation_output, metrics_to_return, model.tgt_lang) results = { - 'model prediction': generation_output.predictions, - 'gold answer': generation_output.answers, - 'context': generation_output.contexts, + 'model prediction': validation_output.predictions, + 'gold answer': validation_output.answers, + 'context': validation_output.contexts, } print_results(results, num_print) - return generation_output, metrics + return validation_output, metrics def do_validate_while_training( diff --git a/genienlp/util.py b/genienlp/util.py index 551ae0eb..9dafe694 100644 --- a/genienlp/util.py +++ b/genienlp/util.py @@ -37,7 +37,7 @@ import sys import time from json.decoder import JSONDecodeError -from typing import List, Optional +from typing import List import numpy as np import torch @@ -233,32 +233,6 @@ def __repr__(self) -> str: ) -class GenerationOutput(object): - """ - Contains all the information that the generation function may need to output - """ - - def __init__( - self, - loss: Optional[float] = None, - example_ids: Optional[List] = None, - predictions: Optional[List] = None, - raw_predictions: Optional[List] = None, - answers: Optional[List] = None, - contexts: Optional[List] = None, - confidence_features: Optional[List] = None, - confidence_scores: Optional[List] = None, - ): - self.loss = loss - self.example_ids = example_ids - self.predictions = predictions - self.raw_predictions = raw_predictions - self.answers = answers - self.contexts = contexts - self.confidence_features = confidence_features - self.confidence_scores = confidence_scores - - def remove_thingtalk_quotes(thingtalk): quote_values = [] while True:
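After the last patch in the series, prediction, training, and the server all drive validation through the same entry point: the model's own validate() method, which returns a ValidationOutput. The free-standing generate_with_model helper (and genienlp/validate.py as a whole) is gone. A short sketch of the resulting flow — the wrapper function below is made up for demonstration and is not code from the patches, but the calls it makes follow the diffs above:

    import torch

    from genienlp.metrics import calculate_and_reduce_metrics


    def evaluate_task(model, task, val_iter, args, tgt_lang):
        # illustrative only: mirrors the flow now used in train.py and predict.py
        model.eval()
        with torch.no_grad():
            # for generation models this dispatches to validate_batch() or validate_e2e_dialogues()
            validation_output = model.validate(val_iter, task, disable_progbar=False)

        # loss is computed inside validate(); compute the remaining metrics here
        metrics_to_compute = [m for m in task.metrics if m != 'loss']
        return calculate_and_reduce_metrics(args, validation_output, metrics_to_compute, tgt_lang)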