diff --git a/config/base.yml b/config/base.yml
new file mode 100644
index 0000000..5e413dc
--- /dev/null
+++ b/config/base.yml
@@ -0,0 +1,11 @@
+max_seq: 2048
+l_r: 0.001
+embedding_dim: 256
+num_attention_layer: 6
+batch_size: 10
+loss_type: 'categorical_crossentropy'
+event_dim: 388
+#pad_token: event_dim
+##token_sos: event_dim + 1
+##token_eos: event_dim + 2
+##vocab_size: event_dim + 3
\ No newline at end of file
diff --git a/custom/callback.py b/custom/criterion.py
similarity index 59%
rename from custom/callback.py
rename to custom/criterion.py
index 5866fa9..6ab6b2c 100644
--- a/custom/callback.py
+++ b/custom/criterion.py
@@ -1,8 +1,12 @@
-from tensorflow.python import keras
-import tensorflow as tf
+from typing import Optional, Any
+
 import params as par
 import sys
-from tensorflow.python.keras.optimizer_v2.learning_rate_schedule import LearningRateSchedule
+
+from torch import Tensor
+import torch
+from torch.nn.modules.loss import CrossEntropyLoss
+# from tensorflow.python.keras.optimizer_v2.learning_rate_schedule import LearningRateSchedule
 
 
 class MTFitCallback(keras.callbacks.Callback):
@@ -15,24 +19,20 @@ def on_epoch_end(self, epoch, logs=None):
         self.model.save(self.save_path)
 
 
-class TransformerLoss(keras.losses.SparseCategoricalCrossentropy):
-    def __init__(self, from_logits=False, reduction='none', debug=False, **kwargs):
-        super(TransformerLoss, self).__init__(from_logits, reduction, **kwargs)
-        self.debug = debug
-        pass
+class TransformerLoss(CrossEntropyLoss):
+    def __init__(self, weight: Optional[Any] = None, ignore_index: int = -100) -> None:
+        # reduction is fixed to 'none' so the pad mask can be applied per position
+        super().__init__(weight=weight, ignore_index=ignore_index, reduction='none')
 
-    def call(self, y_true, y_pred):
-        y_true = tf.cast(y_true, tf.int32)
-        mask = tf.math.logical_not(tf.math.equal(y_true, par.pad_token))
-        mask = tf.cast(mask, tf.float32)
-        _loss = super(TransformerLoss, self).call(y_true, y_pred)
+    def forward(self, input: Tensor, target: Tensor) -> Tensor:
+        mask = target != par.pad_token
+        not_masked_length = mask.to(torch.int).sum()
+        _loss = super().forward(input, target)
         _loss *= mask
-        if self.debug:
-            tf.print('loss shape:', _loss.shape, output_stream=sys.stdout)
-            tf.print('output:', tf.argmax(y_pred,-1), output_stream=sys.stdout)
-            tf.print(mask, output_stream=sys.stdout)
-            tf.print(_loss, output_stream=sys.stdout)
-        return _loss
+        return _loss.sum() / not_masked_length
+
+    def __call__(self, input: Tensor, target: Tensor) -> Tensor:
+        return self.forward(input, target)
 
 
 def transformer_dist_train_loss(y_true, y_pred):
diff --git a/custom/layers.py b/custom/layers.py
index eda3bd0..e3fd503 100644
--- a/custom/layers.py
+++ b/custom/layers.py
@@ -118,12 +118,11 @@ def _skewing(self, tensor: torch.Tensor):
         padded = F.pad(tensor, [0, 0, 0, 0, 0, 0, 1, 0])
         reshaped = torch.reshape(padded, shape=[-1, padded.size(1), padded.size(-1), padded.size(-2)])
         Srel = reshaped[:, :, 1:, :]
-        # print('Sre: {}'.format(Srel))
 
         if self.len_k > self.len_q:
             Srel = F.pad(Srel, [0, 0, 0, 0, 0, 0, 0, self.len_k-self.len_q])
         elif self.len_k < self.len_q:
-            Srel = Srel[:,:,:,:self.len_k]
+            Srel = Srel[:, :, :, :self.len_k]
 
         return Srel
 
@@ -224,4 +223,4 @@ def call(self, x, mask=None):
         for i in range(self.num_layers):
            x, w = self.enc_layers[i](x, mask)
            weights.append(w)
-        return x, weights  # (batch_size, input_seq_len, d_model)
\ No newline at end of file
+        return x, weights  # (batch_size, input_seq_len, d_model)
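Reviewer note: a minimal usage sketch of the masked loss above, assuming params.pad_token and params.vocab_size exist as elsewhere in the repo. CrossEntropyLoss wants logits laid out as (batch, classes, seq):

    import torch
    import params as par
    from custom.criterion import TransformerLoss

    criterion = TransformerLoss()

    batch, seq = 2, 4
    logits = torch.randn(batch, par.vocab_size, seq, requires_grad=True)
    target = torch.randint(0, par.vocab_size, (batch, seq))
    target[:, -1] = par.pad_token       # padded positions contribute nothing

    loss = criterion(logits, target)    # masked mean over non-pad positions
    loss.backward()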
diff --git a/custom/metrics.py b/custom/metrics.py
new file mode 100644
index 0000000..3e78406
--- /dev/null
+++ b/custom/metrics.py
@@ -0,0 +1,35 @@
+import torch
+from typing import List
+
+
+class _Metric(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, input: torch.Tensor, target: torch.Tensor):
+        raise NotImplementedError()
+
+
+class CategoricalAccuracy(_Metric):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, input: torch.Tensor, target: torch.Tensor):
+        # input: logits with shape [..., num_classes]; target: integer class ids
+        pred = input.argmax(-1)
+        return (pred == target).to(torch.float).mean()
+
+
+class Accuracy(_Metric):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, input: torch.Tensor, target: torch.Tensor):
+        # element-wise match between already-decoded predictions and targets
+        return (input == target).to(torch.float).mean()
+
+
+class MetricsSet(_Metric):
+    def __init__(self, metrics: List[_Metric]):
+        super().__init__()
+        self.metrics = metrics
+
+    def forward(self, input: torch.Tensor, target: torch.Tensor):
+        return [metric(input, target) for metric in self.metrics]
\ No newline at end of file
diff --git a/deprecated/train.py b/deprecated/train.py
index 1a4338c..93c9410 100644
--- a/deprecated/train.py
+++ b/deprecated/train.py
@@ -1,6 +1,6 @@
 from model import MusicTransformer
 from custom.layers import *
-from custom import callback
+from custom import criterion
 import params as par
 from tensorflow.python.keras.optimizer_v2.adam import Adam
 from data import Data
@@ -44,7 +44,7 @@
 
 
 # load model
-learning_rate = callback.CustomSchedule(par.embedding_dim) if l_r is None else l_r
+learning_rate = criterion.CustomSchedule(par.embedding_dim) if l_r is None else l_r
 opt = Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
 
 
@@ -56,7 +56,7 @@
     max_seq=max_seq,
     dropout=0.2,
     debug=False, loader_path=load_path)
-mt.compile(optimizer=opt, loss=callback.transformer_dist_train_loss)
+mt.compile(optimizer=opt, loss=criterion.transformer_dist_train_loss)
 
 
 # define tensorboard writer
diff --git a/dist_train.py b/dist_train.py
index 758219c..7861c95 100644
--- a/dist_train.py
+++ b/dist_train.py
@@ -1,6 +1,6 @@
 from model import MusicTransformer
 from custom.layers import *
-from custom import callback
+from custom import criterion
 import params as par
 from tensorflow.python.keras.optimizer_v2.adam import Adam
 from data import Data
@@ -43,7 +43,7 @@
 
 
 # load model
-learning_rate = callback.CustomSchedule(par.embedding_dim)
+learning_rate = criterion.CustomSchedule(par.embedding_dim)
 opt = Adam(l_r, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
 
 strategy = tf.distribute.MirroredStrategy()
@@ -58,7 +58,7 @@
         max_seq=max_seq,
         dropout=0.2,
         debug=False, loader_path=load_path)
-    mt.compile(optimizer=opt, loss=callback.transformer_dist_train_loss)
+    mt.compile(optimizer=opt, loss=criterion.transformer_dist_train_loss)
 
     # Train Start
     for e in range(epochs):
diff --git a/generate.py b/generate.py
index 825d355..27f6a84 100644
--- a/generate.py
+++ b/generate.py
@@ -1,6 +1,6 @@
 from model import MusicTransformer, MusicTransformerDecoder
 from custom.layers import *
-from custom import callback
+from custom import criterion
 import params as par
 from tensorflow.python.keras.optimizer_v2.adam import Adam
 from data import Data
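Reviewer note: a sketch of how the metric set is meant to be consumed; shapes mirror the logits produced by model.py, i.e. (batch, seq, vocab) against integer targets:

    import torch
    from custom.metrics import MetricsSet, CategoricalAccuracy

    metric_set = MetricsSet([CategoricalAccuracy()])

    logits = torch.randn(2, 8, 390)
    target = torch.randint(0, 390, (2, 8))

    accuracy, = metric_set(logits, target)  # one entry per metric in the set
    print(accuracy.item())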
diff --git a/model.py b/model.py
index e0b8349..27ed330 100644
--- a/model.py
+++ b/model.py
@@ -1,5 +1,5 @@
 from custom.layers import *
-from custom.callback import *
+from custom.criterion import *
 from custom.layers import Encoder
 
 import params as par
@@ -16,7 +16,7 @@ class MusicTransformer(torch.nn.Module):
     def __init__(self, embedding_dim=256, vocab_size=388+2, num_layer=6,
-                 max_seq=2048, dropout=0.2, debug=False, loader_path=None, dist=False):
+                 max_seq=2048, dropout=0.2, debug=False, loader_path=None, dist=False, writer=None):
         super().__init__()
 
         if loader_path is not None:
@@ -29,6 +29,7 @@ def __init__(self, embedding_dim=256, vocab_size=388+2, num_layer=6,
         self.vocab_size = vocab_size
         self.dist = dist
+        self.writer = writer
 
         self.Decoder = Encoder(
             num_layers=self.num_layer, d_model=self.embedding_dim,
             input_vocab_size=self.vocab_size, rate=dropout, max_len=max_seq)
@@ -36,15 +37,16 @@ def __init__(self, embedding_dim=256, vocab_size=388+2, num_layer=6,
 
         self._set_metrics()
 
-    def forward(self, x):
+    def forward(self, x, lookup_mask=None):
         decoder, w = self.Decoder(x, mask=lookup_mask)
         fc = self.fc(decoder)
-        if self.training:
-            return fc
-        elif eval:
-            return fc, w
-        else:
-            return F.softmax(fc)
+        return fc, w
+        # if self.training:
+        #     return fc
+        # elif eval:
+        #     return fc, w
+        # else:
+        #     return F.softmax(fc)
 
     def generate(self, prior: list, length=2048, tf_board=False):
         decode_array = np.array([prior])
@@ -58,25 +60,33 @@ def generate(self, prior: list, length=2048, tf_board=False):
             _, _, look_ahead_mask = \
                 utils.get_masked_with_pad_tensor(decode_array.shape[1], decode_array, decode_array)
 
-            result = self.call(decode_array, lookup_mask=look_ahead_mask, training=False)
-            if tf_board:
-                tf.summary.image('generate_vector', tf.expand_dims(result, -1), i)
-            # import sys
-            # tf.print('[debug out:]', result, sys.stdout )
+            result, _ = self.forward(decode_array, lookup_mask=look_ahead_mask)
+
             u = random.uniform(0, 1)
             if u > 1:
-                result = F.argmax(result[:, -1], -1).to(torch.int32)
-                decode_array = tf.concat([decode_array, tf.expand_dims(result, -1)], -1)
+                result = torch.argmax(result[:, -1], -1).to(torch.int32)
+                decode_array = torch.cat([decode_array, result.unsqueeze(-1)], -1)
             else:
-                pdf = dist.OneHotCategorical(probs=result[:, -1])
-                result = pdf.sample(1)
-                result = torch.transpose(result, (1, 0)).to(torch.int32)
+                # forward() now returns raw logits, so build the distribution from logits
+                pdf = dist.OneHotCategorical(logits=result[:, -1])
+                result = pdf.sample((1,))
+                result = result.argmax(-1)  # one-hot -> integer event ids
+                result = torch.transpose(result, 1, 0).to(torch.int32)
                 decode_array = torch.cat((decode_array, result), dim=-1)
-            # decode_array = tf.concat([decode_array, tf.expand_dims(result[:, -1], 0)], -1)
             del look_ahead_mask
         decode_array = decode_array[0]
         return decode_array
 
+    def train_forward(self, x):
+        x, _ = self.__prepare_train_data(x, x)
+        _, _, look_ahead_mask = utils.get_masked_with_pad_tensor(self.max_seq, x, x)
+
+        predictions, _ = self.forward(
+            x, lookup_mask=look_ahead_mask,
+        )
+
+        if self._debug:
+            print('train step finished')
+        return predictions
+
 
 class MusicTransformerDecoder(torch.nn.Module):
     def __init__(self, embedding_dim=256, vocab_size=388+2, num_layer=6,
@@ -141,12 +151,10 @@ def train_on_batch(self, x, y=None, sample_weight=None, class_weight=None, reset
 
         return [loss.numpy()]+result_metric
 
-    # @tf.function
     def __dist_train_step(self, inp_tar, out_tar, lookup_mask, training):
         return self._distribution_strategy.experimental_run_v2(
             self.__train_step, args=(inp_tar, out_tar, lookup_mask, training))
 
-    # @tf.function
     def __train_step(self, inp_tar, out_tar, lookup_mask, training):
         with tf.GradientTape() as tape:
             predictions = self.call(
@@ -326,22 +334,22 @@ def __prepare_train_data(x, y):
         # x = data.add_noise(x, rate=0.01)
         return x, y
 
-
-if __name__ == '__main__':
-    # import utils
-    print(tf.executing_eagerly())
-
-    src = tf.constant([utils.fill_with_placeholder([1,2,3,4],max_len=2048)])
-    trg = tf.constant([utils.fill_with_placeholder([1,2,3,4],max_len=2048)])
-    src_mask, trg_mask, lookup_mask = utils.get_masked_with_pad_tensor(2048, src,trg)
-    print(lookup_mask)
-    print(src_mask)
-    mt = MusicTransformer(debug=True, embedding_dim=par.embedding_dim, vocab_size=par.vocab_size)
-    mt.save_weights('my_model.h5', save_format='h5')
-    mt.load_weights('my_model.h5')
-    result = mt.generate([27, 186, 43, 213, 115, 131], length=100)
-    print(result)
-    from deprecated import sequence
-
-    sequence.EventSeq.from_array(result[0]).to_note_seq().to_midi_file('result.midi')
-    pass
\ No newline at end of file
+#
+# if __name__ == '__main__':
+#     # import utils
+#     print(tf.executing_eagerly())
+#
+#     src = tf.constant([utils.fill_with_placeholder([1,2,3,4],max_len=2048)])
+#     trg = tf.constant([utils.fill_with_placeholder([1,2,3,4],max_len=2048)])
+#     src_mask, trg_mask, lookup_mask = utils.get_masked_with_pad_tensor(2048, src,trg)
+#     print(lookup_mask)
+#     print(src_mask)
+#     mt = MusicTransformer(debug=True, embedding_dim=par.embedding_dim, vocab_size=par.vocab_size)
+#     mt.save_weights('my_model.h5', save_format='h5')
+#     mt.load_weights('my_model.h5')
+#     result = mt.generate([27, 186, 43, 213, 115, 131], length=100)
+#     print(result)
+#     from deprecated import sequence
+#
+#     sequence.EventSeq.from_array(result[0]).to_note_seq().to_midi_file('result.midi')
+#     pass
\ No newline at end of file
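Reviewer note: the stochastic branch of generate() boils down to the following standalone sketch (illustrative names and sizes; the one-hot sample must be collapsed back to integer event ids before it can be appended to the running sequence):

    import torch
    import torch.distributions as dist

    seq = torch.zeros(1, 5, dtype=torch.long)   # running (batch, seq) event ids
    logits = torch.randn(1, 390)                # last-step logits
    pdf = dist.OneHotCategorical(logits=logits)
    sample = pdf.sample((1,))                   # (1, batch, vocab) one-hot
    token = sample.argmax(-1).transpose(1, 0)   # (batch, 1) event ids
    seq = torch.cat((seq, token), dim=-1)       # now (batch, 6)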
diff --git a/requirements.txt b/requirements.txt
index bad9ca4..789aa42 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
+torch
+tensorboardX
 absl-py==0.7.1
 alembic==1.0.11
 appdirs==1.4.3
@@ -89,10 +91,7 @@ SQLAlchemy==1.3.5
 sqlparse==0.3.0
 ssh-import-id==5.7
 tabulate==0.8.3
-tb-nightly==1.14.0a20190603
-tensorflow-gpu==2.0.0b1
 termcolor==1.1.0
-tf-estimator-nightly==1.14.0.dev2019060501
 tfp-nightly==0.8.0.dev20190807
 treelib==1.5.5
 urllib3==1.22
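Reviewer note: with tb-nightly and tensorflow-gpu dropped, all logging goes through tensorboardX. For reference, the writer API the new train.py relies on:

    from tensorboardX import SummaryWriter

    writer = SummaryWriter('logs/demo')
    for step in range(3):
        writer.add_scalar('loss', 1.0 / (step + 1), global_step=step)
    writer.close()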
diff --git a/train.py b/train.py
index 767586b..dea03f0 100644
--- a/train.py
+++ b/train.py
@@ -1,15 +1,16 @@
-from model import MusicTransformerDecoder
+from model import MusicTransformer
+from custom.metrics import *
 from custom.layers import *
-from custom import callback
+from custom.criterion import TransformerLoss
 import params as par
-from tensorflow.python.keras.optimizer_v2.adam import Adam
 from data import Data
 import utils
 import argparse
 import datetime
+import torch
+import torch.optim as optim
+from tensorboardX import SummaryWriter
 import sys
 
-tf.executing_eagerly()
 
 parser = argparse.ArgumentParser()
@@ -46,67 +47,63 @@
 
 
 # load model
-learning_rate = callback.CustomSchedule(par.embedding_dim) if l_r is None else l_r
-opt = Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
-
+learning_rate = l_r
 
 # define model
-mt = MusicTransformerDecoder(
+mt = MusicTransformer(
     embedding_dim=256,
     vocab_size=par.vocab_size,
     num_layer=num_layer,
     max_seq=max_seq,
     dropout=0.2,
-    debug=False, loader_path=load_path)
-mt.compile(optimizer=opt, loss=callback.transformer_dist_train_loss)
-
+    debug=False, loader_path=load_path
+)
+criterion = TransformerLoss()
+opt = optim.Adam(mt.parameters(), lr=l_r)
+metric_set = MetricsSet([CategoricalAccuracy()])
 
 # define tensorboard writer
 current_time = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
 train_log_dir = 'logs/mt_decoder/'+current_time+'/train'
 eval_log_dir = 'logs/mt_decoder/'+current_time+'/eval'
-train_summary_writer = tf.summary.create_file_writer(train_log_dir)
-eval_summary_writer = tf.summary.create_file_writer(eval_log_dir)
+
+train_summary_writer = SummaryWriter(train_log_dir)
+eval_summary_writer = SummaryWriter(eval_log_dir)
 
 
 # Train Start
 idx = 0
 for e in range(epochs):
-    mt.reset_metrics()
     for b in range(len(dataset.files) // batch_size):
         try:
             batch_x, batch_y = dataset.slide_seq2seq_batch(batch_size, max_seq)
+            batch_x = torch.from_numpy(batch_x)
+            batch_y = torch.from_numpy(batch_y)
         except:
             continue
-        result_metrics = mt.train_on_batch(batch_x, batch_y)
+
+        opt.zero_grad()
+        sample = mt.train_forward(batch_x)
+        # CrossEntropyLoss expects (batch, classes, seq), so move vocab to dim 1
+        loss = criterion(sample.permute(0, 2, 1), batch_y.to(torch.long))
+        loss.backward()
+        opt.step()
+
+        result_metrics = [loss.item()] + metric_set(sample, batch_y)
         if b % 100 == 0:
             eval_x, eval_y = dataset.slide_seq2seq_batch(batch_size, max_seq, 'eval')
             eval_result_metrics, weights = mt.evaluate(eval_x, eval_y)
-            mt.save(save_path)
+            torch.save(mt.state_dict(), save_path)
-        with train_summary_writer.as_default():
-            if b == 0:
-                tf.summary.histogram("target_analysis", batch_y, step=e)
-                tf.summary.histogram("source_analysis", batch_x, step=e)
-
-            tf.summary.scalar('loss', result_metrics[0], step=idx)
-            tf.summary.scalar('accuracy', result_metrics[1], step=idx)
-
-        with eval_summary_writer.as_default():
-            if b == 0:
-                mt.sanity_check(eval_x, eval_y, step=e)
-
-            tf.summary.scalar('loss', eval_result_metrics[0], step=idx)
-            tf.summary.scalar('accuracy', eval_result_metrics[1], step=idx)
-            for i, weight in enumerate(weights):
-                with tf.name_scope("layer_%d" % i):
-                    with tf.name_scope("w"):
-                        utils.attention_image_summary(weight, step=idx)
-        # for i, weight in enumerate(weights):
-        #     with tf.name_scope("layer_%d" % i):
-        #         with tf.name_scope("_w0"):
-        #             utils.attention_image_summary(weight[0])
-        #         with tf.name_scope("_w1"):
-        #             utils.attention_image_summary(weight[1])
+        if b == 0:
+            train_summary_writer.add_histogram("target_analysis", batch_y, global_step=e)
+            train_summary_writer.add_histogram("source_analysis", batch_x, global_step=e)
+
+        train_summary_writer.add_scalar('loss', result_metrics[0], global_step=idx)
+        train_summary_writer.add_scalar('accuracy', result_metrics[1], global_step=idx)
+
+        eval_summary_writer.add_scalar('loss', eval_result_metrics[0], global_step=idx)
+        eval_summary_writer.add_scalar('accuracy', eval_result_metrics[1], global_step=idx)
+        for i, weight in enumerate(weights):
+            attn_log_name = "attn/layer-{}".format(i)
+            utils.attention_image_summary(attn_log_name, weight, step=idx, writer=eval_summary_writer)
 
         idx += 1
         print('\n====================================================')
         print('Epoch/Batch: {}/{}'.format(e, b))
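Reviewer note: the b % 100 == 0 branch still calls mt.evaluate(), which the torch MusicTransformer does not define in this diff. A possible torch equivalent, purely as a sketch (it assumes train_forward() runs end-to-end and reuses the criterion and metric_set defined above):

    mt.eval()
    with torch.no_grad():
        preds = mt.train_forward(torch.from_numpy(eval_x))
        eval_loss = criterion(preds.permute(0, 2, 1), torch.from_numpy(eval_y).long())
        eval_acc, = metric_set(preds, torch.from_numpy(eval_y))
    mt.train()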
diff --git a/utils.py b/utils.py
index baa5e9d..1693b7d 100644
--- a/utils.py
+++ b/utils.py
@@ -2,6 +2,7 @@
 import numpy as np
 from deprecated.sequence import EventSeq, ControlSeq
 import torch
+import torch.nn.functional as F
 import params as par
 
 
@@ -146,13 +147,6 @@ def append_token(data: torch.Tensor):
     return torch.cat([start_token, data, end_token], -1)
 
 
-def weights2boards(weights, dir, step):  # weights stored weight[layer][w1,w2]
-    for weight in weights:
-        w1, w2 = weight
-        tf.summary.histogram()
-    pass
-
-
 def shape_list(x):
     """Shape list"""
     x_shape = x.size()
@@ -167,9 +161,9 @@ def shape_list(x):
     return res
 
 
-def attention_image_summary(attn, step=0):
-  """Compute color image summary.
-  Args:
+def attention_image_summary(tag, attn, step=0, writer=None):
+    """Compute color image summary.
+    Args:
     attn: a Tensor with shape [batch, num_heads, query_length, memory_length]
     image_shapes: optional tuple of integer scalars.
       If the query positions and memory positions represent the
@@ -179,33 +173,33 @@ def attention_image_summary(attn, step=0):
      pixels of flattened images, then pass in their dimensions:
        (query_rows, query_cols, query_channels,
         memory_rows, memory_cols, memory_channels).
-  """
-  num_heads = shape_list(attn)[1]
-  # [batch, query_length, memory_length, num_heads]
-  image = attn.view([0, 2, 3, 1])
-  image = torch.pow(image, 0.2)  # for high-dynamic-range
-  # Each head will correspond to one of RGB.
-  # pad the heads to be a multiple of 3
-  image = tf.pad(image, [[0, 0], [0, 0], [0, 0], [0, tf.math.mod(-num_heads, 3)]])
-  image = split_last_dimension(image, 3)
-  image = torch.max(image, dim=4)
-  tf.summary.image("attention", image, max_outputs=1, step=step)
+    """
+    num_heads = shape_list(attn)[1]
+    # [batch, query_length, memory_length, num_heads]
+    image = attn.permute(0, 2, 3, 1)
+    image = torch.pow(image, 0.2)  # for high-dynamic-range
+    # Each head will correspond to one of RGB.
+    # pad the heads to be a multiple of 3; F.pad pads the last dim first
+    image = F.pad(image, [0, -num_heads % 3])
+    image = split_last_dimension(image, 3)
+    image = torch.max(image, dim=4).values
+    writer.add_image(tag, image[0], global_step=step, dataformats='HWC')
 
 
 def split_last_dimension(x, n):
-  """Reshape x so that the last dimension becomes two dimensions.
-  The first of these two dimensions is n.
-  Args:
+    """Reshape x so that the last dimension becomes two dimensions.
+    The first of these two dimensions is n.
+    Args:
     x: a Tensor with shape [..., m]
     n: an integer.
-  Returns:
+    Returns:
     a Tensor with shape [..., n, m/n]
-  """
-  x_shape = shape_list(x)
-  m = x_shape[-1]
-  if isinstance(m, int) and isinstance(n, int):
-    assert m % n == 0
-  return torch.reshape(x, x_shape[:-1] + [n, m // n])
+    """
+    x_shape = shape_list(x)
+    m = x_shape[-1]
+    if isinstance(m, int) and isinstance(n, int):
+        assert m % n == 0
+    return torch.reshape(x, x_shape[:-1] + [n, m // n])
 
 
 def subsequent_mask(size):
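Reviewer note: a shape walk-through of the attention-image path above (illustrative sizes; -4 % 3 == 2 in Python, so 4 heads pad up to 6):

    import torch
    import torch.nn.functional as F
    from utils import split_last_dimension

    attn = torch.rand(1, 4, 16, 16)          # (batch, heads, q_len, m_len)
    image = attn.permute(0, 2, 3, 1)         # heads last: (1, 16, 16, 4)
    image = F.pad(image, [0, -4 % 3])        # pad head dim: (1, 16, 16, 6)
    image = split_last_dimension(image, 3)   # (1, 16, 16, 3, 2)
    image = torch.max(image, dim=4).values   # RGB image: (1, 16, 16, 3)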