diff --git a/config/base.yml b/config/base.yml
new file mode 100644
index 0000000..5e413dc
--- /dev/null
+++ b/config/base.yml
@@ -0,0 +1,11 @@
+max_seq: 2048
+l_r: 0.001
+embedding_dim: 256
+num_attention_layer: 6
+batch_size: 10
+loss_type: 'categorical_crossentropy'
+event_dim: 388
+#pad_token: event_dim
+##token_sos: event_dim + 1
+##token_eos: event_dim + 2
+##vocab_size: event_dim + 3
\ No newline at end of file
diff --git a/custom/callback.py b/custom/criterion.py
similarity index 59%
rename from custom/callback.py
rename to custom/criterion.py
index 5866fa9..6ab6b2c 100644
--- a/custom/callback.py
+++ b/custom/criterion.py
@@ -1,8 +1,12 @@
-from tensorflow.python import keras
-import tensorflow as tf
+from typing import Optional, Any
+
 import params as par
 import sys
-from tensorflow.python.keras.optimizer_v2.learning_rate_schedule import LearningRateSchedule
+
+from torch import Tensor
+import torch
+from torch.nn.modules.loss import CrossEntropyLoss
+# from tensorflow.python.keras.optimizer_v2.learning_rate_schedule import LearningRateSchedule
 
 
 class MTFitCallback(keras.callbacks.Callback):
@@ -15,24 +19,20 @@ def on_epoch_end(self, epoch, logs=None):
         self.model.save(self.save_path)
 
 
-class TransformerLoss(keras.losses.SparseCategoricalCrossentropy):
-    def __init__(self, from_logits=False, reduction='none', debug=False, **kwargs):
-        super(TransformerLoss, self).__init__(from_logits, reduction, **kwargs)
-        self.debug = debug
-        pass
+class TransformerLoss(CrossEntropyLoss):
+    def __init__(self, weight: Optional[Any] = None, ignore_index: int = -100) -> None:
+        # reduction is fixed to 'none' so the pad mask can be applied per position
+        super().__init__(weight=weight, ignore_index=ignore_index, reduction='none')
 
-    def call(self, y_true, y_pred):
-        y_true = tf.cast(y_true, tf.int32)
-        mask = tf.math.logical_not(tf.math.equal(y_true, par.pad_token))
-        mask = tf.cast(mask, tf.float32)
-        _loss = super(TransformerLoss, self).call(y_true, y_pred)
+    def forward(self, input: Tensor, target: Tensor) -> Tensor:
+        mask = target != par.pad_token
+        not_masked_length = mask.to(torch.int).sum()
+        _loss = super().forward(input, target)
         _loss *= mask
-        if self.debug:
-            tf.print('loss shape:', _loss.shape, output_stream=sys.stdout)
-            tf.print('output:', tf.argmax(y_pred,-1), output_stream=sys.stdout)
-            tf.print(mask, output_stream=sys.stdout)
-            tf.print(_loss, output_stream=sys.stdout)
-        return _loss
+        return _loss.sum() / not_masked_length
+
+    def __call__(self, input: Tensor, target: Tensor) -> Tensor:
+        return self.forward(input, target)
 
 
 def transformer_dist_train_loss(y_true, y_pred):
diff --git a/custom/layers.py b/custom/layers.py
index eda3bd0..e3fd503 100644
--- a/custom/layers.py
+++ b/custom/layers.py
@@ -118,12 +118,11 @@ def _skewing(self, tensor: torch.Tensor):
         padded = F.pad(tensor, [0, 0, 0, 0, 0, 0, 1, 0])
         reshaped = torch.reshape(padded, shape=[-1, padded.size(1), padded.size(-1), padded.size(-2)])
         Srel = reshaped[:, :, 1:, :]
-        # print('Sre: {}'.format(Srel))
 
         if self.len_k > self.len_q:
             Srel = F.pad(Srel, [0, 0, 0, 0, 0, 0, 0, self.len_k-self.len_q])
         elif self.len_k < self.len_q:
-            Srel = Srel[:,:,:,:self.len_k]
+            Srel = Srel[:, :, :, :self.len_k]
 
         return Srel
 
@@ -224,4 +223,4 @@ def call(self, x, mask=None):
         for i in range(self.num_layers):
            x, w = self.enc_layers[i](x, mask)
            weights.append(w)
-        return x, weights  # (batch_size, input_seq_len, d_model)
\ No newline at end of file
+        return x, weights  # (batch_size, input_seq_len, d_model)
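Reviewer note: a minimal usage sketch of the masked loss above, assuming params.pad_token and params.vocab_size exist as elsewhere in the repo. CrossEntropyLoss wants logits laid out as (batch, classes, seq):

    import torch
    import params as par
    from custom.criterion import TransformerLoss

    criterion = TransformerLoss()

    batch, seq = 2, 4
    logits = torch.randn(batch, par.vocab_size, seq, requires_grad=True)
    target = torch.randint(0, par.vocab_size, (batch, seq))
    target[:, -1] = par.pad_token       # padded positions contribute nothing

    loss = criterion(logits, target)    # masked mean over non-pad positions
    loss.backward()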
diff --git a/custom/metrics.py b/custom/metrics.py
new file mode 100644
index 0000000..3e78406
--- /dev/null
+++ b/custom/metrics.py
@@ -0,0 +1,35 @@
+import torch
+from typing import List
+
+
+class _Metric(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, input: torch.Tensor, target: torch.Tensor):
+        raise NotImplementedError()
+
+
+class CategoricalAccuracy(_Metric):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, input: torch.Tensor, target: torch.Tensor):
+        # input: logits with shape [..., num_classes]; target: integer class ids
+        pred = input.argmax(-1)
+        return (pred == target).to(torch.float).mean()
+
+
+class Accuracy(_Metric):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, input: torch.Tensor, target: torch.Tensor):
+        # element-wise match between already-decoded predictions and targets
+        return (input == target).to(torch.float).mean()
+
+
+class MetricsSet(_Metric):
+    def __init__(self, metrics: List[_Metric]):
+        super().__init__()
+        self.metrics = metrics
+
+    def forward(self, input: torch.Tensor, target: torch.Tensor):
+        return [metric(input, target) for metric in self.metrics]
\ No newline at end of file
diff --git a/deprecated/train.py b/deprecated/train.py
index 1a4338c..93c9410 100644
--- a/deprecated/train.py
+++ b/deprecated/train.py
@@ -1,6 +1,6 @@
 from model import MusicTransformer
 from custom.layers import *
-from custom import callback
+from custom import criterion
 import params as par
 from tensorflow.python.keras.optimizer_v2.adam import Adam
 from data import Data
@@ -44,7 +44,7 @@
 
 
 # load model
-learning_rate = callback.CustomSchedule(par.embedding_dim) if l_r is None else l_r
+learning_rate = criterion.CustomSchedule(par.embedding_dim) if l_r is None else l_r
 opt = Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
 
 
@@ -56,7 +56,7 @@
     max_seq=max_seq,
     dropout=0.2,
     debug=False, loader_path=load_path)
-mt.compile(optimizer=opt, loss=callback.transformer_dist_train_loss)
+mt.compile(optimizer=opt, loss=criterion.transformer_dist_train_loss)
 
 
 # define tensorboard writer
diff --git a/dist_train.py b/dist_train.py
index 758219c..7861c95 100644
--- a/dist_train.py
+++ b/dist_train.py
@@ -1,6 +1,6 @@
 from model import MusicTransformer
 from custom.layers import *
-from custom import callback
+from custom import criterion
 import params as par
 from tensorflow.python.keras.optimizer_v2.adam import Adam
 from data import Data
@@ -43,7 +43,7 @@
 
 
 # load model
-learning_rate = callback.CustomSchedule(par.embedding_dim)
+learning_rate = criterion.CustomSchedule(par.embedding_dim)
 opt = Adam(l_r, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
 
 strategy = tf.distribute.MirroredStrategy()
@@ -58,7 +58,7 @@
         max_seq=max_seq,
         dropout=0.2,
         debug=False, loader_path=load_path)
-    mt.compile(optimizer=opt, loss=callback.transformer_dist_train_loss)
+    mt.compile(optimizer=opt, loss=criterion.transformer_dist_train_loss)
 
     # Train Start
     for e in range(epochs):
diff --git a/generate.py b/generate.py
index 825d355..27f6a84 100644
--- a/generate.py
+++ b/generate.py
@@ -1,6 +1,6 @@
 from model import MusicTransformer, MusicTransformerDecoder
 from custom.layers import *
-from custom import callback
+from custom import criterion
 import params as par
 from tensorflow.python.keras.optimizer_v2.adam import Adam
 from data import Data
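Reviewer note: a sketch of how the metric set is meant to be consumed; shapes mirror the logits produced by model.py, i.e. (batch, seq, vocab) against integer targets:

    import torch
    from custom.metrics import MetricsSet, CategoricalAccuracy

    metric_set = MetricsSet([CategoricalAccuracy()])

    logits = torch.randn(2, 8, 390)
    target = torch.randint(0, 390, (2, 8))

    accuracy, = metric_set(logits, target)  # one entry per metric in the set
    print(accuracy.item())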
diff --git a/model.py b/model.py
index e0b8349..27ed330 100644
--- a/model.py
+++ b/model.py
@@ -1,5 +1,5 @@
 from custom.layers import *
-from custom.callback import *
+from custom.criterion import *
 from custom.layers import Encoder
 
 import params as par
@@ -16,7 +16,7 @@ class MusicTransformer(torch.nn.Module):
     def __init__(self, embedding_dim=256, vocab_size=388+2, num_layer=6,
-                 max_seq=2048, dropout=0.2, debug=False, loader_path=None, dist=False):
+                 max_seq=2048, dropout=0.2, debug=False, loader_path=None, dist=False, writer=None):
         super().__init__()
 
         if loader_path is not None:
@@ -29,6 +29,7 @@ def __init__(self, embedding_dim=256, vocab_size=388+2, num_layer=6,
         self.vocab_size = vocab_size
         self.dist = dist
+        self.writer = writer
 
         self.Decoder = Encoder(
             num_layers=self.num_layer, d_model=self.embedding_dim,
             input_vocab_size=self.vocab_size, rate=dropout, max_len=max_seq)
@@ -36,15 +37,16 @@ def __init__(self, embedding_dim=256, vocab_size=388+2, num_layer=6,
 
         self._set_metrics()
 
-    def forward(self, x):
+    def forward(self, x, lookup_mask=None):
         decoder, w = self.Decoder(x, mask=lookup_mask)
         fc = self.fc(decoder)
-        if self.training:
-            return fc
-        elif eval:
-            return fc, w
-        else:
-            return F.softmax(fc)
+        return fc, w
+        # if self.training:
+        #     return fc
+        # elif eval:
+        #     return fc, w
+        # else:
+        #     return F.softmax(fc)
 
     def generate(self, prior: list, length=2048, tf_board=False):
         decode_array = np.array([prior])
@@ -58,25 +60,33 @@ def generate(self, prior: list, length=2048, tf_board=False):
             _, _, look_ahead_mask = \
                 utils.get_masked_with_pad_tensor(decode_array.shape[1], decode_array, decode_array)
 
-            result = self.call(decode_array, lookup_mask=look_ahead_mask, training=False)
-            if tf_board:
-                tf.summary.image('generate_vector', tf.expand_dims(result, -1), i)
-            # import sys
-            # tf.print('[debug out:]', result, sys.stdout )
+            result, _ = self.forward(decode_array, lookup_mask=look_ahead_mask)
+
             u = random.uniform(0, 1)
             if u > 1:
-                result = F.argmax(result[:, -1], -1).to(torch.int32)
-                decode_array = tf.concat([decode_array, tf.expand_dims(result, -1)], -1)
+                result = torch.argmax(result[:, -1], -1).to(torch.int32)
+                decode_array = torch.cat([decode_array, result.unsqueeze(-1)], -1)
             else:
-                pdf = dist.OneHotCategorical(probs=result[:, -1])
-                result = pdf.sample(1)
-                result = torch.transpose(result, (1, 0)).to(torch.int32)
+                # forward() now returns raw logits, so build the distribution from logits
+                pdf = dist.OneHotCategorical(logits=result[:, -1])
+                result = pdf.sample((1,))
+                result = result.argmax(-1)  # one-hot -> integer event ids
+                result = torch.transpose(result, 1, 0).to(torch.int32)
                 decode_array = torch.cat((decode_array, result), dim=-1)
-            # decode_array = tf.concat([decode_array, tf.expand_dims(result[:, -1], 0)], -1)
             del look_ahead_mask
         decode_array = decode_array[0]
         return decode_array
 
+    def train_forward(self, x):
+        x, _ = self.__prepare_train_data(x, x)
+        _, _, look_ahead_mask = utils.get_masked_with_pad_tensor(self.max_seq, x, x)
+
+        predictions, _ = self.forward(
+            x, lookup_mask=look_ahead_mask,
+        )
+
+        if self._debug:
+            print('train step finished')
+        return predictions
+
 
 class MusicTransformerDecoder(torch.nn.Module):
     def __init__(self, embedding_dim=256, vocab_size=388+2, num_layer=6,
@@ -141,12 +151,10 @@ def train_on_batch(self, x, y=None, sample_weight=None, class_weight=None, reset
 
         return [loss.numpy()]+result_metric
 
-    # @tf.function
     def __dist_train_step(self, inp_tar, out_tar, lookup_mask, training):
         return self._distribution_strategy.experimental_run_v2(
             self.__train_step, args=(inp_tar, out_tar, lookup_mask, training))
 
-    # @tf.function
     def __train_step(self, inp_tar, out_tar, lookup_mask, training):
         with tf.GradientTape() as tape:
             predictions = self.call(
@@ -326,22 +334,22 @@ def __prepare_train_data(x, y):
         # x = data.add_noise(x, rate=0.01)
         return x, y
 
-
-if __name__ == '__main__':
-    # import utils
-    print(tf.executing_eagerly())
-
-    src = tf.constant([utils.fill_with_placeholder([1,2,3,4],max_len=2048)])
-    trg = tf.constant([utils.fill_with_placeholder([1,2,3,4],max_len=2048)])
-    src_mask, trg_mask, lookup_mask = utils.get_masked_with_pad_tensor(2048, src,trg)
-    print(lookup_mask)
-    print(src_mask)
-    mt = MusicTransformer(debug=True, embedding_dim=par.embedding_dim, vocab_size=par.vocab_size)
-    mt.save_weights('my_model.h5', save_format='h5')
-    mt.load_weights('my_model.h5')
-    result = mt.generate([27, 186, 43, 213, 115, 131], length=100)
-    print(result)
-    from deprecated import sequence
-
-    sequence.EventSeq.from_array(result[0]).to_note_seq().to_midi_file('result.midi')
-    pass
\ No newline at end of file
+#
+# if __name__ == '__main__':
+#     # import utils
+#     print(tf.executing_eagerly())
+#
+#     src = tf.constant([utils.fill_with_placeholder([1,2,3,4],max_len=2048)])
+#     trg = tf.constant([utils.fill_with_placeholder([1,2,3,4],max_len=2048)])
+#     src_mask, trg_mask, lookup_mask = utils.get_masked_with_pad_tensor(2048, src,trg)
+#     print(lookup_mask)
+#     print(src_mask)
+#     mt = MusicTransformer(debug=True, embedding_dim=par.embedding_dim, vocab_size=par.vocab_size)
+#     mt.save_weights('my_model.h5', save_format='h5')
+#     mt.load_weights('my_model.h5')
+#     result = mt.generate([27, 186, 43, 213, 115, 131], length=100)
+#     print(result)
+#     from deprecated import sequence
+#
+#     sequence.EventSeq.from_array(result[0]).to_note_seq().to_midi_file('result.midi')
+#     pass
\ No newline at end of file
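Reviewer note: the stochastic branch of generate() boils down to the following standalone sketch (illustrative names and sizes; the one-hot sample must be collapsed back to integer event ids before it can be appended to the running sequence):

    import torch
    import torch.distributions as dist

    seq = torch.zeros(1, 5, dtype=torch.long)   # running (batch, seq) event ids
    logits = torch.randn(1, 390)                # last-step logits
    pdf = dist.OneHotCategorical(logits=logits)
    sample = pdf.sample((1,))                   # (1, batch, vocab) one-hot
    token = sample.argmax(-1).transpose(1, 0)   # (batch, 1) event ids
    seq = torch.cat((seq, token), dim=-1)       # now (batch, 6)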
diff --git a/requirements.txt b/requirements.txt
index bad9ca4..789aa42 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
+torch
+tensorboardX
 absl-py==0.7.1
 alembic==1.0.11
 appdirs==1.4.3
@@ -89,10 +91,7 @@ SQLAlchemy==1.3.5
 sqlparse==0.3.0
 ssh-import-id==5.7
 tabulate==0.8.3
-tb-nightly==1.14.0a20190603
-tensorflow-gpu==2.0.0b1
 termcolor==1.1.0
-tf-estimator-nightly==1.14.0.dev2019060501
 tfp-nightly==0.8.0.dev20190807
 treelib==1.5.5
 urllib3==1.22
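Reviewer note: with tb-nightly and tensorflow-gpu dropped, all logging goes through tensorboardX. For reference, the writer API the new train.py relies on:

    from tensorboardX import SummaryWriter

    writer = SummaryWriter('logs/demo')
    for step in range(3):
        writer.add_scalar('loss', 1.0 / (step + 1), global_step=step)
    writer.close()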
diff --git a/train.py b/train.py
index 767586b..dea03f0 100644
--- a/train.py
+++ b/train.py
@@ -1,15 +1,16 @@
-from model import MusicTransformerDecoder
+from model import MusicTransformer
+from custom.metrics import *
 from custom.layers import *
-from custom import callback
+from custom.criterion import TransformerLoss
 import params as par
-from tensorflow.python.keras.optimizer_v2.adam import Adam
 from data import Data
 import utils
 import argparse
 import datetime
+import torch
+import torch.optim as optim
+from tensorboardX import SummaryWriter
 import sys
 
-tf.executing_eagerly()
 
 parser = argparse.ArgumentParser()
@@ -46,67 +47,63 @@
 
 
 # load model
-learning_rate = callback.CustomSchedule(par.embedding_dim) if l_r is None else l_r
-opt = Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
-
+learning_rate = l_r
 
 # define model
-mt = MusicTransformerDecoder(
+mt = MusicTransformer(
     embedding_dim=256,
     vocab_size=par.vocab_size,
     num_layer=num_layer,
     max_seq=max_seq,
     dropout=0.2,
-    debug=False, loader_path=load_path)
-mt.compile(optimizer=opt, loss=callback.transformer_dist_train_loss)
-
+    debug=False, loader_path=load_path
+)
+criterion = TransformerLoss()
+opt = optim.Adam(mt.parameters(), lr=l_r)
+metric_set = MetricsSet([CategoricalAccuracy()])
 
 # define tensorboard writer
 current_time = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
 train_log_dir = 'logs/mt_decoder/'+current_time+'/train'
 eval_log_dir = 'logs/mt_decoder/'+current_time+'/eval'
-train_summary_writer = tf.summary.create_file_writer(train_log_dir)
-eval_summary_writer = tf.summary.create_file_writer(eval_log_dir)
+
+train_summary_writer = SummaryWriter(train_log_dir)
+eval_summary_writer = SummaryWriter(eval_log_dir)
 
 
 # Train Start
 idx = 0
 for e in range(epochs):
-    mt.reset_metrics()
     for b in range(len(dataset.files) // batch_size):
         try:
             batch_x, batch_y = dataset.slide_seq2seq_batch(batch_size, max_seq)
+            batch_x = torch.from_numpy(batch_x)
+            batch_y = torch.from_numpy(batch_y)
         except:
             continue
-        result_metrics = mt.train_on_batch(batch_x, batch_y)
+
+        opt.zero_grad()
+        sample = mt.train_forward(batch_x)
+        # CrossEntropyLoss expects (batch, classes, seq), so move vocab to dim 1
+        loss = criterion(sample.permute(0, 2, 1), batch_y.to(torch.long))
+        loss.backward()
+        opt.step()
+
+        result_metrics = [loss.item()] + metric_set(sample, batch_y)
         if b % 100 == 0:
             eval_x, eval_y = dataset.slide_seq2seq_batch(batch_size, max_seq, 'eval')
             eval_result_metrics, weights = mt.evaluate(eval_x, eval_y)
-            mt.save(save_path)
+            torch.save(mt.state_dict(), save_path)
-        with train_summary_writer.as_default():
-            if b == 0:
-                tf.summary.histogram("target_analysis", batch_y, step=e)
-                tf.summary.histogram("source_analysis", batch_x, step=e)
-
-            tf.summary.scalar('loss', result_metrics[0], step=idx)
-            tf.summary.scalar('accuracy', result_metrics[1], step=idx)
-
-        with eval_summary_writer.as_default():
-            if b == 0:
-                mt.sanity_check(eval_x, eval_y, step=e)
-
-            tf.summary.scalar('loss', eval_result_metrics[0], step=idx)
-            tf.summary.scalar('accuracy', eval_result_metrics[1], step=idx)
-            for i, weight in enumerate(weights):
-                with tf.name_scope("layer_%d" % i):
-                    with tf.name_scope("w"):
-                        utils.attention_image_summary(weight, step=idx)
-        # for i, weight in enumerate(weights):
-        #     with tf.name_scope("layer_%d" % i):
-        #         with tf.name_scope("_w0"):
-        #             utils.attention_image_summary(weight[0])
-        #         with tf.name_scope("_w1"):
-        #             utils.attention_image_summary(weight[1])
+        if b == 0:
+            train_summary_writer.add_histogram("target_analysis", batch_y, global_step=e)
+            train_summary_writer.add_histogram("source_analysis", batch_x, global_step=e)
+
+        train_summary_writer.add_scalar('loss', result_metrics[0], global_step=idx)
+        train_summary_writer.add_scalar('accuracy', result_metrics[1], global_step=idx)
+
+        eval_summary_writer.add_scalar('loss', eval_result_metrics[0], global_step=idx)
+        eval_summary_writer.add_scalar('accuracy', eval_result_metrics[1], global_step=idx)
+        for i, weight in enumerate(weights):
+            attn_log_name = "attn/layer-{}".format(i)
+            utils.attention_image_summary(attn_log_name, weight, step=idx, writer=eval_summary_writer)
 
         idx += 1
         print('\n====================================================')
         print('Epoch/Batch: {}/{}'.format(e, b))
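Reviewer note: the b % 100 == 0 branch still calls mt.evaluate(), which the torch MusicTransformer does not define in this diff. A possible torch equivalent, purely as a sketch (it assumes train_forward() runs end-to-end and reuses the criterion and metric_set defined above):

    mt.eval()
    with torch.no_grad():
        preds = mt.train_forward(torch.from_numpy(eval_x))
        eval_loss = criterion(preds.permute(0, 2, 1), torch.from_numpy(eval_y).long())
        eval_acc, = metric_set(preds, torch.from_numpy(eval_y))
    mt.train()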
diff --git a/utils.py b/utils.py
index baa5e9d..1693b7d 100644
--- a/utils.py
+++ b/utils.py
@@ -2,6 +2,7 @@
 import numpy as np
 from deprecated.sequence import EventSeq, ControlSeq
 import torch
+import torch.nn.functional as F
 import params as par
 
 
@@ -146,13 +147,6 @@ def append_token(data: torch.Tensor):
     return torch.cat([start_token, data, end_token], -1)
 
 
-def weights2boards(weights, dir, step):  # weights stored weight[layer][w1,w2]
-    for weight in weights:
-        w1, w2 = weight
-        tf.summary.histogram()
-    pass
-
-
 def shape_list(x):
     """Shape list"""
     x_shape = x.size()
@@ -167,9 +161,9 @@ def shape_list(x):
     return res
 
 
-def attention_image_summary(attn, step=0):
-  """Compute color image summary.
-  Args:
+def attention_image_summary(tag, attn, step=0, writer=None):
+    """Compute color image summary.
+    Args:
     attn: a Tensor with shape [batch, num_heads, query_length, memory_length]
     image_shapes: optional tuple of integer scalars.
       If the query positions and memory positions represent the
@@ -179,33 +173,33 @@ def attention_image_summary(attn, step=0):
      pixels of flattened images, then pass in their dimensions:
        (query_rows, query_cols, query_channels,
         memory_rows, memory_cols, memory_channels).
-  """
-  num_heads = shape_list(attn)[1]
-  # [batch, query_length, memory_length, num_heads]
-  image = attn.view([0, 2, 3, 1])
-  image = torch.pow(image, 0.2)  # for high-dynamic-range
-  # Each head will correspond to one of RGB.
-  # pad the heads to be a multiple of 3
-  image = tf.pad(image, [[0, 0], [0, 0], [0, 0], [0, tf.math.mod(-num_heads, 3)]])
-  image = split_last_dimension(image, 3)
-  image = torch.max(image, dim=4)
-  tf.summary.image("attention", image, max_outputs=1, step=step)
+    """
+    num_heads = shape_list(attn)[1]
+    # [batch, query_length, memory_length, num_heads]
+    image = attn.permute(0, 2, 3, 1)
+    image = torch.pow(image, 0.2)  # for high-dynamic-range
+    # Each head will correspond to one of RGB.
+    # pad the heads to be a multiple of 3; F.pad pads the last dim first
+    image = F.pad(image, [0, -num_heads % 3])
+    image = split_last_dimension(image, 3)
+    image = torch.max(image, dim=4).values
+    writer.add_image(tag, image[0], global_step=step, dataformats='HWC')
 
 
 def split_last_dimension(x, n):
-  """Reshape x so that the last dimension becomes two dimensions.
-  The first of these two dimensions is n.
-  Args:
+    """Reshape x so that the last dimension becomes two dimensions.
+    The first of these two dimensions is n.
+    Args:
     x: a Tensor with shape [..., m]
     n: an integer.
-  Returns:
+    Returns:
     a Tensor with shape [..., n, m/n]
-  """
-  x_shape = shape_list(x)
-  m = x_shape[-1]
-  if isinstance(m, int) and isinstance(n, int):
-    assert m % n == 0
-  return torch.reshape(x, x_shape[:-1] + [n, m // n])
+    """
+    x_shape = shape_list(x)
+    m = x_shape[-1]
+    if isinstance(m, int) and isinstance(n, int):
+        assert m % n == 0
+    return torch.reshape(x, x_shape[:-1] + [n, m // n])
 
 
 def subsequent_mask(size):
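Reviewer note: a shape walk-through of the attention-image path above (illustrative sizes; -4 % 3 == 2 in Python, so 4 heads pad up to 6):

    import torch
    import torch.nn.functional as F
    from utils import split_last_dimension

    attn = torch.rand(1, 4, 16, 16)          # (batch, heads, q_len, m_len)
    image = attn.permute(0, 2, 3, 1)         # heads last: (1, 16, 16, 4)
    image = F.pad(image, [0, -4 % 3])        # pad head dim: (1, 16, 16, 6)
    image = split_last_dimension(image, 3)   # (1, 16, 16, 3, 2)
    image = torch.max(image, dim=4).values   # RGB image: (1, 16, 16, 3)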